diff options
Diffstat (limited to 'fs')
503 files changed, 14597 insertions, 10642 deletions
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig index 09fd4a185fd2..d7bc93447c85 100644 --- a/fs/9p/Kconfig +++ b/fs/9p/Kconfig @@ -2,6 +2,7 @@ config 9P_FS tristate "Plan 9 Resource Sharing Support (9P2000)" depends on INET && NET_9P + select NETFS_SUPPORT help If you say Y here, you will get experimental support for Plan 9 resource sharing via the 9P2000 protocol. diff --git a/fs/9p/acl.c b/fs/9p/acl.c index c381499f5416..4dac4a0dc5f4 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -1,15 +1,7 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * Copyright IBM Corporation, 2010 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2.1 of the GNU Lesser General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * */ #include <linux/module.h> @@ -123,6 +115,7 @@ static int v9fs_set_acl(struct p9_fid *fid, int type, struct posix_acl *acl) char *name; size_t size; void *buffer; + if (!acl) return 0; diff --git a/fs/9p/acl.h b/fs/9p/acl.h index d43c8949e807..ce5175d463dd 100644 --- a/fs/9p/acl.h +++ b/fs/9p/acl.h @@ -1,28 +1,21 @@ +/* SPDX-License-Identifier: LGPL-2.1 */ /* * Copyright IBM Corporation, 2010 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2.1 of the GNU Lesser General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * */ #ifndef FS_9P_ACL_H #define FS_9P_ACL_H #ifdef CONFIG_9P_FS_POSIX_ACL -extern int v9fs_get_acl(struct inode *, struct p9_fid *); -extern struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type, bool rcu); -extern int v9fs_acl_chmod(struct inode *, struct p9_fid *); -extern int v9fs_set_create_acl(struct inode *, struct p9_fid *, - struct posix_acl *, struct posix_acl *); -extern int v9fs_acl_mode(struct inode *dir, umode_t *modep, - struct posix_acl **dpacl, struct posix_acl **pacl); -extern void v9fs_put_acl(struct posix_acl *dacl, struct posix_acl *acl); +int v9fs_get_acl(struct inode *inode, struct p9_fid *fid); +struct posix_acl *v9fs_iop_get_acl(struct inode *inode, int type, + bool rcu); +int v9fs_acl_chmod(struct inode *inode, struct p9_fid *fid); +int v9fs_set_create_acl(struct inode *inode, struct p9_fid *fid, + struct posix_acl *dacl, struct posix_acl *acl); +int v9fs_acl_mode(struct inode *dir, umode_t *modep, + struct posix_acl **dpacl, struct posix_acl **pacl); +void v9fs_put_acl(struct posix_acl *dacl, struct posix_acl *acl); #else #define v9fs_iop_get_acl NULL static inline int v9fs_get_acl(struct inode *inode, struct p9_fid *fid) diff --git a/fs/9p/cache.c b/fs/9p/cache.c index eb2151fb6049..f2ba131cede1 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c @@ -19,11 +19,11 @@ #define CACHETAG_LEN 11 struct fscache_netfs v9fs_cache_netfs = { - .name = "9p", - .version = 0, + .name = "9p", + .version = 0, }; -/** +/* * v9fs_random_cachetag - Generate a random tag to be associated * with a new cache session. * @@ -199,140 +199,3 @@ void v9fs_cache_inode_reset_cookie(struct inode *inode) mutex_unlock(&v9inode->fscache_lock); } - -int __v9fs_fscache_release_page(struct page *page, gfp_t gfp) -{ - struct inode *inode = page->mapping->host; - struct v9fs_inode *v9inode = V9FS_I(inode); - - BUG_ON(!v9inode->fscache); - - return fscache_maybe_release_page(v9inode->fscache, page, gfp); -} - -void __v9fs_fscache_invalidate_page(struct page *page) -{ - struct inode *inode = page->mapping->host; - struct v9fs_inode *v9inode = V9FS_I(inode); - - BUG_ON(!v9inode->fscache); - - if (PageFsCache(page)) { - fscache_wait_on_page_write(v9inode->fscache, page); - BUG_ON(!PageLocked(page)); - fscache_uncache_page(v9inode->fscache, page); - } -} - -static void v9fs_vfs_readpage_complete(struct page *page, void *data, - int error) -{ - if (!error) - SetPageUptodate(page); - - unlock_page(page); -} - -/** - * __v9fs_readpage_from_fscache - read a page from cache - * - * Returns 0 if the pages are in cache and a BIO is submitted, - * 1 if the pages are not in cache and -error otherwise. - */ - -int __v9fs_readpage_from_fscache(struct inode *inode, struct page *page) -{ - int ret; - const struct v9fs_inode *v9inode = V9FS_I(inode); - - p9_debug(P9_DEBUG_FSC, "inode %p page %p\n", inode, page); - if (!v9inode->fscache) - return -ENOBUFS; - - ret = fscache_read_or_alloc_page(v9inode->fscache, - page, - v9fs_vfs_readpage_complete, - NULL, - GFP_KERNEL); - switch (ret) { - case -ENOBUFS: - case -ENODATA: - p9_debug(P9_DEBUG_FSC, "page/inode not in cache %d\n", ret); - return 1; - case 0: - p9_debug(P9_DEBUG_FSC, "BIO submitted\n"); - return ret; - default: - p9_debug(P9_DEBUG_FSC, "ret %d\n", ret); - return ret; - } -} - -/** - * __v9fs_readpages_from_fscache - read multiple pages from cache - * - * Returns 0 if the pages are in cache and a BIO is submitted, - * 1 if the pages are not in cache and -error otherwise. - */ - -int __v9fs_readpages_from_fscache(struct inode *inode, - struct address_space *mapping, - struct list_head *pages, - unsigned *nr_pages) -{ - int ret; - const struct v9fs_inode *v9inode = V9FS_I(inode); - - p9_debug(P9_DEBUG_FSC, "inode %p pages %u\n", inode, *nr_pages); - if (!v9inode->fscache) - return -ENOBUFS; - - ret = fscache_read_or_alloc_pages(v9inode->fscache, - mapping, pages, nr_pages, - v9fs_vfs_readpage_complete, - NULL, - mapping_gfp_mask(mapping)); - switch (ret) { - case -ENOBUFS: - case -ENODATA: - p9_debug(P9_DEBUG_FSC, "pages/inodes not in cache %d\n", ret); - return 1; - case 0: - BUG_ON(!list_empty(pages)); - BUG_ON(*nr_pages != 0); - p9_debug(P9_DEBUG_FSC, "BIO submitted\n"); - return ret; - default: - p9_debug(P9_DEBUG_FSC, "ret %d\n", ret); - return ret; - } -} - -/** - * __v9fs_readpage_to_fscache - write a page to the cache - * - */ - -void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page) -{ - int ret; - const struct v9fs_inode *v9inode = V9FS_I(inode); - - p9_debug(P9_DEBUG_FSC, "inode %p page %p\n", inode, page); - ret = fscache_write_page(v9inode->fscache, page, - i_size_read(&v9inode->vfs_inode), GFP_KERNEL); - p9_debug(P9_DEBUG_FSC, "ret = %d\n", ret); - if (ret != 0) - v9fs_uncache_page(inode, page); -} - -/* - * wait for a page to complete writing to the cache - */ -void __v9fs_fscache_wait_on_page_write(struct inode *inode, struct page *page) -{ - const struct v9fs_inode *v9inode = V9FS_I(inode); - p9_debug(P9_DEBUG_FSC, "inode %p page %p\n", inode, page); - if (PageFsCache(page)) - fscache_wait_on_page_write(v9inode->fscache, page); -} diff --git a/fs/9p/cache.h b/fs/9p/cache.h index 00f107af443e..7480b4b49fea 100644 --- a/fs/9p/cache.h +++ b/fs/9p/cache.h @@ -7,9 +7,10 @@ #ifndef _9P_CACHE_H #define _9P_CACHE_H -#ifdef CONFIG_9P_FSCACHE +#define FSCACHE_USE_NEW_IO_API #include <linux/fscache.h> -#include <linux/spinlock.h> + +#ifdef CONFIG_9P_FSCACHE extern struct fscache_netfs v9fs_cache_netfs; extern const struct fscache_cookie_def v9fs_cache_session_index_def; @@ -27,64 +28,6 @@ extern void v9fs_cache_inode_reset_cookie(struct inode *inode); extern int __v9fs_cache_register(void); extern void __v9fs_cache_unregister(void); -extern int __v9fs_fscache_release_page(struct page *page, gfp_t gfp); -extern void __v9fs_fscache_invalidate_page(struct page *page); -extern int __v9fs_readpage_from_fscache(struct inode *inode, - struct page *page); -extern int __v9fs_readpages_from_fscache(struct inode *inode, - struct address_space *mapping, - struct list_head *pages, - unsigned *nr_pages); -extern void __v9fs_readpage_to_fscache(struct inode *inode, struct page *page); -extern void __v9fs_fscache_wait_on_page_write(struct inode *inode, - struct page *page); - -static inline int v9fs_fscache_release_page(struct page *page, - gfp_t gfp) -{ - return __v9fs_fscache_release_page(page, gfp); -} - -static inline void v9fs_fscache_invalidate_page(struct page *page) -{ - __v9fs_fscache_invalidate_page(page); -} - -static inline int v9fs_readpage_from_fscache(struct inode *inode, - struct page *page) -{ - return __v9fs_readpage_from_fscache(inode, page); -} - -static inline int v9fs_readpages_from_fscache(struct inode *inode, - struct address_space *mapping, - struct list_head *pages, - unsigned *nr_pages) -{ - return __v9fs_readpages_from_fscache(inode, mapping, pages, - nr_pages); -} - -static inline void v9fs_readpage_to_fscache(struct inode *inode, - struct page *page) -{ - if (PageFsCache(page)) - __v9fs_readpage_to_fscache(inode, page); -} - -static inline void v9fs_uncache_page(struct inode *inode, struct page *page) -{ - struct v9fs_inode *v9inode = V9FS_I(inode); - fscache_uncache_page(v9inode->fscache, page); - BUG_ON(PageFsCache(page)); -} - -static inline void v9fs_fscache_wait_on_page_write(struct inode *inode, - struct page *page) -{ - return __v9fs_fscache_wait_on_page_write(inode, page); -} - #else /* CONFIG_9P_FSCACHE */ static inline void v9fs_cache_inode_get_cookie(struct inode *inode) @@ -99,39 +42,5 @@ static inline void v9fs_cache_inode_set_cookie(struct inode *inode, struct file { } -static inline int v9fs_fscache_release_page(struct page *page, - gfp_t gfp) { - return 1; -} - -static inline void v9fs_fscache_invalidate_page(struct page *page) {} - -static inline int v9fs_readpage_from_fscache(struct inode *inode, - struct page *page) -{ - return -ENOBUFS; -} - -static inline int v9fs_readpages_from_fscache(struct inode *inode, - struct address_space *mapping, - struct list_head *pages, - unsigned *nr_pages) -{ - return -ENOBUFS; -} - -static inline void v9fs_readpage_to_fscache(struct inode *inode, - struct page *page) -{} - -static inline void v9fs_uncache_page(struct inode *inode, struct page *page) -{} - -static inline void v9fs_fscache_wait_on_page_write(struct inode *inode, - struct page *page) -{ - return; -} - #endif /* CONFIG_9P_FSCACHE */ #endif /* _9P_CACHE_H */ diff --git a/fs/9p/fid.c b/fs/9p/fid.c index 9d9de62592be..6aab046c98e2 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -19,18 +19,18 @@ #include "v9fs_vfs.h" #include "fid.h" +static inline void __add_fid(struct dentry *dentry, struct p9_fid *fid) +{ + hlist_add_head(&fid->dlist, (struct hlist_head *)&dentry->d_fsdata); +} + + /** * v9fs_fid_add - add a fid to a dentry * @dentry: dentry that the fid is being added to * @fid: fid to add * */ - -static inline void __add_fid(struct dentry *dentry, struct p9_fid *fid) -{ - hlist_add_head(&fid->dlist, (struct hlist_head *)&dentry->d_fsdata); -} - void v9fs_fid_add(struct dentry *dentry, struct p9_fid *fid) { spin_lock(&dentry->d_lock); @@ -67,7 +67,7 @@ static struct p9_fid *v9fs_fid_find_inode(struct inode *inode, kuid_t uid) /** * v9fs_open_fid_add - add an open fid to an inode - * @dentry: inode that the fid is being added to + * @inode: inode that the fid is being added to * @fid: fid to add * */ @@ -103,6 +103,7 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, kuid_t uid, int any) /* we'll recheck under lock if there's anything to look in */ if (!ret && dentry->d_fsdata) { struct hlist_head *h = (struct hlist_head *)&dentry->d_fsdata; + spin_lock(&dentry->d_lock); hlist_for_each_entry(fid, h, dlist) { if (any || uid_eq(fid->uid, uid)) { @@ -185,7 +186,7 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry, return ERR_PTR(-EPERM); if (v9fs_proto_dotu(v9ses) || v9fs_proto_dotl(v9ses)) - uname = NULL; + uname = NULL; else uname = v9ses->uname; diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index cdb99507ef33..e32dd5f7721b 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * linux/fs/9p/v9fs.c - * * This file contains functions assisting in mapping VFS to 9P2000 * * Copyright (C) 2004-2008 by Eric Van Hensbergen <ericvh@gmail.com> @@ -155,6 +153,7 @@ int v9fs_show_options(struct seq_file *m, struct dentry *root) /** * v9fs_parse_options - parse mount options into session structure * @v9ses: existing v9fs session information + * @opts: The mount option string * * Return 0 upon success, -ERRNO upon failure. */ @@ -165,7 +164,7 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) substring_t args[MAX_OPT_ARGS]; char *p; int option = 0; - char *s, *e; + char *s; int ret = 0; /* setup defaults */ @@ -189,8 +188,10 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) while ((p = strsep(&options, ",")) != NULL) { int token, r; + if (!*p) continue; + token = match_token(p, tokens, args); switch (token) { case Opt_debug: @@ -320,12 +321,13 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) v9ses->flags |= V9FS_ACCESS_CLIENT; } else { uid_t uid; + v9ses->flags |= V9FS_ACCESS_SINGLE; - uid = simple_strtoul(s, &e, 10); - if (*e != '\0') { - ret = -EINVAL; - pr_info("Unknown access argument %s\n", - s); + r = kstrtouint(s, 10, &uid); + if (r) { + ret = r; + pr_info("Unknown access argument %s: %d\n", + s, r); kfree(s); continue; } @@ -519,7 +521,8 @@ void v9fs_session_close(struct v9fs_session_info *v9ses) * mark transport as disconnected and cancel all pending requests. */ -void v9fs_session_cancel(struct v9fs_session_info *v9ses) { +void v9fs_session_cancel(struct v9fs_session_info *v9ses) +{ p9_debug(P9_DEBUG_ERROR, "cancel session %p\n", v9ses); p9_client_disconnect(v9ses->clnt); } @@ -542,12 +545,9 @@ extern int v9fs_error_init(void); static struct kobject *v9fs_kobj; #ifdef CONFIG_9P_FSCACHE -/** - * caches_show - list caches associated with a session - * - * Returns the size of buffer written. +/* + * List caches associated with a session */ - static ssize_t caches_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -661,6 +661,7 @@ static void v9fs_destroy_inode_cache(void) static int v9fs_cache_register(void) { int ret; + ret = v9fs_init_inode_cache(); if (ret < 0) return ret; @@ -688,6 +689,7 @@ static void v9fs_cache_unregister(void) static int __init init_v9fs(void) { int err; + pr_info("Installing v9fs 9p2000 file system support\n"); /* TODO: Setup list of registered trasnport modules */ diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index 4ca56c5dd637..1647a8e63671 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -124,15 +124,24 @@ static inline struct v9fs_inode *V9FS_I(const struct inode *inode) return container_of(inode, struct v9fs_inode, vfs_inode); } +static inline struct fscache_cookie *v9fs_inode_cookie(struct v9fs_inode *v9inode) +{ +#ifdef CONFIG_9P_FSCACHE + return v9inode->fscache; +#else + return NULL; +#endif +} + extern int v9fs_show_options(struct seq_file *m, struct dentry *root); -struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *, - char *); +struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, + const char *dev_name, char *data); extern void v9fs_session_close(struct v9fs_session_info *v9ses); extern void v9fs_session_cancel(struct v9fs_session_info *v9ses); extern void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses); extern struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, - unsigned int flags); + unsigned int flags); extern int v9fs_vfs_unlink(struct inode *i, struct dentry *d); extern int v9fs_vfs_rmdir(struct inode *i, struct dentry *d); extern int v9fs_vfs_rename(struct user_namespace *mnt_userns, @@ -158,7 +167,7 @@ extern struct inode *v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, static inline struct v9fs_session_info *v9fs_inode2v9ses(struct inode *inode) { - return (inode->i_sb->s_fs_info); + return inode->i_sb->s_fs_info; } static inline struct v9fs_session_info *v9fs_dentry2v9ses(struct dentry *dentry) diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index d44ade76966a..bc417da7e9c1 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -44,9 +44,10 @@ extern struct kmem_cache *v9fs_inode_cache; struct inode *v9fs_alloc_inode(struct super_block *sb); void v9fs_free_inode(struct inode *inode); -struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t); +struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, + dev_t rdev); int v9fs_init_inode(struct v9fs_session_info *v9ses, - struct inode *inode, umode_t mode, dev_t); + struct inode *inode, umode_t mode, dev_t rdev); void v9fs_evict_inode(struct inode *inode); ino_t v9fs_qid2ino(struct p9_qid *qid); void v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, @@ -59,8 +60,8 @@ void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat); int v9fs_uflags2omode(int uflags, int extended); void v9fs_blank_wstat(struct p9_wstat *wstat); -int v9fs_vfs_setattr_dotl(struct user_namespace *, struct dentry *, - struct iattr *); +int v9fs_vfs_setattr_dotl(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *iattr); int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end, int datasync); int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode); @@ -68,9 +69,9 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode); static inline void v9fs_invalidate_inode_attr(struct inode *inode) { struct v9fs_inode *v9inode; + v9inode = V9FS_I(inode); v9inode->cache_validity |= V9FS_INO_INVALID_ATTR; - return; } int v9fs_open_to_dotl_flags(int flags); diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index cce9ace651a2..adafdf86f42f 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * linux/fs/9p/vfs_addr.c - * * This file contians vfs address (mmap) ops for 9P2000. * * Copyright (C) 2005 by Eric Van Hensbergen <ericvh@gmail.com> @@ -19,7 +17,7 @@ #include <linux/idr.h> #include <linux/sched.h> #include <linux/uio.h> -#include <linux/bvec.h> +#include <linux/netfs.h> #include <net/9p/9p.h> #include <net/9p/client.h> @@ -29,93 +27,103 @@ #include "fid.h" /** - * v9fs_fid_readpage - read an entire page in from 9P - * - * @fid: fid being read - * @page: structure to page - * + * v9fs_req_issue_op - Issue a read from 9P + * @subreq: The read to make */ -static int v9fs_fid_readpage(void *data, struct page *page) +static void v9fs_req_issue_op(struct netfs_read_subrequest *subreq) { - struct p9_fid *fid = data; - struct inode *inode = page->mapping->host; - struct bio_vec bvec = {.bv_page = page, .bv_len = PAGE_SIZE}; + struct netfs_read_request *rreq = subreq->rreq; + struct p9_fid *fid = rreq->netfs_priv; struct iov_iter to; - int retval, err; + loff_t pos = subreq->start + subreq->transferred; + size_t len = subreq->len - subreq->transferred; + int total, err; - p9_debug(P9_DEBUG_VFS, "\n"); + iov_iter_xarray(&to, READ, &rreq->mapping->i_pages, pos, len); - BUG_ON(!PageLocked(page)); + total = p9_client_read(fid, pos, &to, &err); + netfs_subreq_terminated(subreq, err ?: total, false); +} - retval = v9fs_readpage_from_fscache(inode, page); - if (retval == 0) - return retval; +/** + * v9fs_init_rreq - Initialise a read request + * @rreq: The read request + * @file: The file being read from + */ +static void v9fs_init_rreq(struct netfs_read_request *rreq, struct file *file) +{ + struct p9_fid *fid = file->private_data; - iov_iter_bvec(&to, READ, &bvec, 1, PAGE_SIZE); + refcount_inc(&fid->count); + rreq->netfs_priv = fid; +} - retval = p9_client_read(fid, page_offset(page), &to, &err); - if (err) { - v9fs_uncache_page(inode, page); - retval = err; - goto done; - } +/** + * v9fs_req_cleanup - Cleanup request initialized by v9fs_init_rreq + * @mapping: unused mapping of request to cleanup + * @priv: private data to cleanup, a fid, guaranted non-null. + */ +static void v9fs_req_cleanup(struct address_space *mapping, void *priv) +{ + struct p9_fid *fid = priv; - zero_user(page, retval, PAGE_SIZE - retval); - flush_dcache_page(page); - SetPageUptodate(page); + p9_client_clunk(fid); +} - v9fs_readpage_to_fscache(inode, page); - retval = 0; +/** + * v9fs_is_cache_enabled - Determine if caching is enabled for an inode + * @inode: The inode to check + */ +static bool v9fs_is_cache_enabled(struct inode *inode) +{ + struct fscache_cookie *cookie = v9fs_inode_cookie(V9FS_I(inode)); -done: - unlock_page(page); - return retval; + return fscache_cookie_enabled(cookie) && !hlist_empty(&cookie->backing_objects); } /** + * v9fs_begin_cache_operation - Begin a cache operation for a read + * @rreq: The read request + */ +static int v9fs_begin_cache_operation(struct netfs_read_request *rreq) +{ + struct fscache_cookie *cookie = v9fs_inode_cookie(V9FS_I(rreq->inode)); + + return fscache_begin_read_operation(rreq, cookie); +} + +static const struct netfs_read_request_ops v9fs_req_ops = { + .init_rreq = v9fs_init_rreq, + .is_cache_enabled = v9fs_is_cache_enabled, + .begin_cache_operation = v9fs_begin_cache_operation, + .issue_op = v9fs_req_issue_op, + .cleanup = v9fs_req_cleanup, +}; + +/** * v9fs_vfs_readpage - read an entire page in from 9P - * - * @filp: file being read + * @file: file being read * @page: structure to page * */ - -static int v9fs_vfs_readpage(struct file *filp, struct page *page) +static int v9fs_vfs_readpage(struct file *file, struct page *page) { - return v9fs_fid_readpage(filp->private_data, page); + return netfs_readpage(file, page, &v9fs_req_ops, NULL); } /** - * v9fs_vfs_readpages - read a set of pages from 9P - * - * @filp: file being read - * @mapping: the address space - * @pages: list of pages to read - * @nr_pages: count of pages to read - * + * v9fs_vfs_readahead - read a set of pages from 9P + * @ractl: The readahead parameters */ - -static int v9fs_vfs_readpages(struct file *filp, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static void v9fs_vfs_readahead(struct readahead_control *ractl) { - int ret = 0; - struct inode *inode; - - inode = mapping->host; - p9_debug(P9_DEBUG_VFS, "inode: %p file: %p\n", inode, filp); - - ret = v9fs_readpages_from_fscache(inode, mapping, pages, &nr_pages); - if (ret == 0) - return ret; - - ret = read_cache_pages(mapping, pages, v9fs_fid_readpage, - filp->private_data); - p9_debug(P9_DEBUG_VFS, " = %d\n", ret); - return ret; + netfs_readahead(ractl, &v9fs_req_ops, NULL); } /** * v9fs_release_page - release the private state associated with a page + * @page: The page to be released + * @gfp: The caller's allocation restrictions * * Returns 1 if the page can be released, false otherwise. */ @@ -124,34 +132,36 @@ static int v9fs_release_page(struct page *page, gfp_t gfp) { if (PagePrivate(page)) return 0; - return v9fs_fscache_release_page(page, gfp); +#ifdef CONFIG_9P_FSCACHE + if (PageFsCache(page)) { + if (!(gfp & __GFP_DIRECT_RECLAIM) || !(gfp & __GFP_FS)) + return 0; + wait_on_page_fscache(page); + } +#endif + return 1; } /** * v9fs_invalidate_page - Invalidate a page completely or partially - * - * @page: structure to page - * @offset: offset in the page + * @page: The page to be invalidated + * @offset: offset of the invalidated region + * @length: length of the invalidated region */ static void v9fs_invalidate_page(struct page *page, unsigned int offset, unsigned int length) { - /* - * If called with zero offset, we should release - * the private state assocated with the page - */ - if (offset == 0 && length == PAGE_SIZE) - v9fs_fscache_invalidate_page(page); + wait_on_page_fscache(page); } static int v9fs_vfs_writepage_locked(struct page *page) { struct inode *inode = page->mapping->host; struct v9fs_inode *v9inode = V9FS_I(inode); + loff_t start = page_offset(page); loff_t size = i_size_read(inode); struct iov_iter from; - struct bio_vec bvec; int err, len; if (page->index == size >> PAGE_SHIFT) @@ -159,17 +169,14 @@ static int v9fs_vfs_writepage_locked(struct page *page) else len = PAGE_SIZE; - bvec.bv_page = page; - bvec.bv_offset = 0; - bvec.bv_len = len; - iov_iter_bvec(&from, WRITE, &bvec, 1, len); + iov_iter_xarray(&from, WRITE, &page->mapping->i_pages, start, len); /* We should have writeback_fid always set */ BUG_ON(!v9inode->writeback_fid); set_page_writeback(page); - p9_client_write(v9inode->writeback_fid, page_offset(page), &from, &err); + p9_client_write(v9inode->writeback_fid, start, &from, &err); end_page_writeback(page); return err; @@ -199,26 +206,28 @@ static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc) /** * v9fs_launder_page - Writeback a dirty page + * @page: The page to be cleaned up + * * Returns 0 on success. */ static int v9fs_launder_page(struct page *page) { int retval; - struct inode *inode = page->mapping->host; - v9fs_fscache_wait_on_page_write(inode, page); if (clear_page_dirty_for_io(page)) { retval = v9fs_vfs_writepage_locked(page); if (retval) return retval; } + wait_on_page_fscache(page); return 0; } /** * v9fs_direct_IO - 9P address space operation for direct I/O * @iocb: target I/O control block + * @iter: The data/buffer to use * * The presence of v9fs_direct_IO() in the address space ops vector * allowes open() O_DIRECT flags which would have failed otherwise. @@ -238,11 +247,13 @@ v9fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) loff_t pos = iocb->ki_pos; ssize_t n; int err = 0; + if (iov_iter_rw(iter) == WRITE) { n = p9_client_write(file->private_data, pos, iter, &err); if (n) { struct inode *inode = file_inode(file); loff_t i_size = i_size_read(inode); + if (pos + n > i_size) inode_add_bytes(inode, pos + n - i_size); } @@ -253,43 +264,32 @@ v9fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) } static int v9fs_write_begin(struct file *filp, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, + loff_t pos, unsigned int len, unsigned int flags, struct page **pagep, void **fsdata) { - int retval = 0; + int retval; struct page *page; - struct v9fs_inode *v9inode; - pgoff_t index = pos >> PAGE_SHIFT; - struct inode *inode = mapping->host; - + struct v9fs_inode *v9inode = V9FS_I(mapping->host); p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping); - v9inode = V9FS_I(inode); -start: - page = grab_cache_page_write_begin(mapping, index, flags); - if (!page) { - retval = -ENOMEM; - goto out; - } BUG_ON(!v9inode->writeback_fid); - if (PageUptodate(page)) - goto out; - if (len == PAGE_SIZE) - goto out; + /* Prefetch area to be written into the cache if we're caching this + * file. We need to do this before we get a lock on the page in case + * there's more than one writer competing for the same cache block. + */ + retval = netfs_write_begin(filp, mapping, pos, len, flags, &page, fsdata, + &v9fs_req_ops, NULL); + if (retval < 0) + return retval; - retval = v9fs_fid_readpage(v9inode->writeback_fid, page); - put_page(page); - if (!retval) - goto start; -out: - *pagep = page; + *pagep = find_subpage(page, pos / PAGE_SIZE); return retval; } static int v9fs_write_end(struct file *filp, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, + loff_t pos, unsigned int len, unsigned int copied, struct page *page, void *fsdata) { loff_t last_pos = pos + copied; @@ -301,10 +301,11 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping, if (unlikely(copied < len)) { copied = 0; goto out; - } else if (len == PAGE_SIZE) { - SetPageUptodate(page); } + + SetPageUptodate(page); } + /* * No need to use i_size_read() here, the i_size * cannot change under us because we hold the i_mutex. @@ -324,7 +325,7 @@ out: const struct address_space_operations v9fs_addr_operations = { .readpage = v9fs_vfs_readpage, - .readpages = v9fs_vfs_readpages, + .readahead = v9fs_vfs_readahead, .set_page_dirty = __set_page_dirty_nobuffers, .writepage = v9fs_vfs_writepage, .write_begin = v9fs_write_begin, diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c index 4b4292123b3d..1c609e99d280 100644 --- a/fs/9p/vfs_dentry.c +++ b/fs/9p/vfs_dentry.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * linux/fs/9p/vfs_dentry.c - * * This file contians vfs dentry ops for the 9P2000 protocol. * * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com> @@ -52,6 +50,7 @@ static int v9fs_cached_dentry_delete(const struct dentry *dentry) static void v9fs_dentry_release(struct dentry *dentry) { struct hlist_node *p, *n; + p9_debug(P9_DEBUG_VFS, " dentry: %pd (%p)\n", dentry, dentry); hlist_for_each_safe(p, n, (struct hlist_head *)&dentry->d_fsdata) @@ -76,6 +75,7 @@ static int v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags) if (v9inode->cache_validity & V9FS_INO_INVALID_ATTR) { int retval; struct v9fs_session_info *v9ses; + fid = v9fs_fid_lookup(dentry); if (IS_ERR(fid)) return PTR_ERR(fid); diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index b6a5a0be444d..8c854d8cb0cd 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * linux/fs/9p/vfs_dir.c - * * This file contains vfs directory ops for the 9P2000 protocol. * * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com> @@ -71,6 +69,7 @@ static inline int dt_type(struct p9_wstat *mistat) static struct p9_rdir *v9fs_alloc_rdir_buf(struct file *filp, int buflen) { struct p9_fid *fid = filp->private_data; + if (!fid->rdir) fid->rdir = kzalloc(sizeof(struct p9_rdir) + buflen, GFP_KERNEL); return fid->rdir; @@ -108,6 +107,7 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx) if (rdir->tail == rdir->head) { struct iov_iter to; int n; + iov_iter_kvec(&to, READ, &kvec, 1, buflen); n = p9_client_read(file->private_data, ctx->pos, &to, &err); @@ -233,5 +233,5 @@ const struct file_operations v9fs_dir_operations_dotl = { .iterate_shared = v9fs_dir_readdir_dotl, .open = v9fs_file_open, .release = v9fs_dir_release, - .fsync = v9fs_file_fsync_dotl, + .fsync = v9fs_file_fsync_dotl, }; diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index aab5e6538660..4244d48398ef 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * linux/fs/9p/vfs_file.c - * * This file contians vfs file ops for 9P2000. * * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com> @@ -359,14 +357,11 @@ out_err: } /** - * v9fs_file_read - read from a file - * @filp: file pointer to read - * @udata: user data buffer to read data into - * @count: size of buffer - * @offset: offset at which to read data + * v9fs_file_read_iter - read from a file + * @iocb: The operation parameters + * @to: The buffer to read into * */ - static ssize_t v9fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { @@ -388,11 +383,9 @@ v9fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) } /** - * v9fs_file_write - write to a file - * @filp: file pointer to write - * @data: data buffer to write data from - * @count: size of buffer - * @offset: offset at which to write data + * v9fs_file_write_iter - write to a file + * @iocb: The operation parameters + * @from: The data to write * */ static ssize_t @@ -413,6 +406,7 @@ v9fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = file_inode(file); loff_t i_size; unsigned long pg_start, pg_end; + pg_start = origin >> PAGE_SHIFT; pg_end = (origin + retval - 1) >> PAGE_SHIFT; if (inode->i_mapping && inode->i_mapping->nrpages) @@ -542,14 +536,23 @@ v9fs_vm_page_mkwrite(struct vm_fault *vmf) p9_debug(P9_DEBUG_VFS, "page %p fid %lx\n", page, (unsigned long)filp->private_data); + v9inode = V9FS_I(inode); + + /* Wait for the page to be written to the cache before we allow it to + * be modified. We then assume the entire page will need writing back. + */ +#ifdef CONFIG_9P_FSCACHE + if (PageFsCache(page) && + wait_on_page_fscache_killable(page) < 0) + return VM_FAULT_RETRY; +#endif + /* Update file times before taking page lock */ file_update_time(filp); - v9inode = V9FS_I(inode); - /* make sure the cache has finished storing the page */ - v9fs_fscache_wait_on_page_write(inode, page); BUG_ON(!v9inode->writeback_fid); - lock_page(page); + if (lock_page_killable(page) < 0) + return VM_FAULT_RETRY; if (page->mapping != inode->i_mapping) goto out_unlock; wait_for_stable_page(page); @@ -561,11 +564,9 @@ out_unlock: } /** - * v9fs_mmap_file_read - read from a file - * @filp: file pointer to read - * @data: user data buffer to read data into - * @count: size of buffer - * @offset: offset at which to read data + * v9fs_mmap_file_read_iter - read from a file + * @iocb: The operation parameters + * @to: The buffer to read into * */ static ssize_t @@ -576,11 +577,9 @@ v9fs_mmap_file_read_iter(struct kiocb *iocb, struct iov_iter *to) } /** - * v9fs_mmap_file_write - write to a file - * @filp: file pointer to write - * @data: data buffer to write data from - * @count: size of buffer - * @offset: offset at which to write data + * v9fs_mmap_file_write_iter - write to a file + * @iocb: The operation parameters + * @from: The data to write * */ static ssize_t diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 795706520b5e..328c338ff304 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * linux/fs/9p/vfs_inode.c - * * This file contains vfs inode ops for the 9P2000 protocol. * * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com> @@ -49,6 +47,7 @@ static const struct inode_operations v9fs_symlink_inode_operations; static u32 unixmode2p9mode(struct v9fs_session_info *v9ses, umode_t mode) { int res; + res = mode & 0777; if (S_ISDIR(mode)) res |= P9_DMDIR; @@ -110,7 +109,7 @@ static int p9mode2perm(struct v9fs_session_info *v9ses, static umode_t p9mode2unixmode(struct v9fs_session_info *v9ses, struct p9_wstat *stat, dev_t *rdev) { - int res; + int res, r; u32 mode = stat->mode; *rdev = 0; @@ -128,11 +127,16 @@ static umode_t p9mode2unixmode(struct v9fs_session_info *v9ses, res |= S_IFIFO; else if ((mode & P9_DMDEVICE) && (v9fs_proto_dotu(v9ses)) && (v9ses->nodev == 0)) { - char type = 0, ext[32]; + char type = 0; int major = -1, minor = -1; - strlcpy(ext, stat->extension, sizeof(ext)); - sscanf(ext, "%c %i %i", &type, &major, &minor); + r = sscanf(stat->extension, "%c %i %i", &type, &major, &minor); + if (r != 3) { + p9_debug(P9_DEBUG_ERROR, + "invalid device string, umode will be bogus: %s\n", + stat->extension); + return res; + } switch (type) { case 'c': res |= S_IFCHR; @@ -218,11 +222,12 @@ v9fs_blank_wstat(struct p9_wstat *wstat) /** * v9fs_alloc_inode - helper function to allocate an inode - * + * @sb: The superblock to allocate the inode from */ struct inode *v9fs_alloc_inode(struct super_block *sb) { struct v9fs_inode *v9inode; + v9inode = kmem_cache_alloc(v9fs_inode_cache, GFP_KERNEL); if (!v9inode) return NULL; @@ -238,7 +243,7 @@ struct inode *v9fs_alloc_inode(struct super_block *sb) /** * v9fs_free_inode - destroy an inode - * + * @inode: The inode to be freed */ void v9fs_free_inode(struct inode *inode) @@ -251,7 +256,7 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, { int err = 0; - inode_init_owner(&init_user_ns,inode, NULL, mode); + inode_init_owner(&init_user_ns, inode, NULL, mode); inode->i_blocks = 0; inode->i_rdev = rdev; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); @@ -343,7 +348,7 @@ error: * v9fs_get_inode - helper function to setup an inode * @sb: superblock * @mode: mode to setup inode with - * + * @rdev: The device numbers to set */ struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev) @@ -369,7 +374,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev) } /** - * v9fs_clear_inode - release an inode + * v9fs_evict_inode - Remove an inode from the inode cache * @inode: inode to release * */ @@ -440,7 +445,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb, unsigned long i_ino; struct inode *inode; struct v9fs_session_info *v9ses = sb->s_fs_info; - int (*test)(struct inode *, void *); + int (*test)(struct inode *inode, void *data); if (new) test = v9fs_test_new_inode; @@ -499,8 +504,10 @@ v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, static int v9fs_at_to_dotl_flags(int flags) { int rflags = 0; + if (flags & AT_REMOVEDIR) rflags |= P9_DOTL_AT_REMOVEDIR; + return rflags; } @@ -665,14 +672,15 @@ error: /** * v9fs_vfs_create - VFS hook to create a regular file + * @mnt_userns: The user namespace of the mount + * @dir: The parent directory + * @dentry: The name of file to be created + * @mode: The UNIX file mode to set + * @excl: True if the file must not yet exist * * open(.., O_CREAT) is handled in v9fs_vfs_atomic_open(). This is only called * for mknod(2). * - * @dir: directory inode that is being created - * @dentry: dentry that is being deleted - * @mode: create permissions - * */ static int @@ -696,6 +704,7 @@ v9fs_vfs_create(struct user_namespace *mnt_userns, struct inode *dir, /** * v9fs_vfs_mkdir - VFS mkdir hook to create a directory + * @mnt_userns: The user namespace of the mount * @dir: inode that is being unlinked * @dentry: dentry that is being unlinked * @mode: mode for new directory @@ -795,7 +804,7 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, static int v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, - struct file *file, unsigned flags, umode_t mode) + struct file *file, unsigned int flags, umode_t mode) { int err; u32 perm; @@ -900,10 +909,12 @@ int v9fs_vfs_rmdir(struct inode *i, struct dentry *d) /** * v9fs_vfs_rename - VFS hook to rename an inode + * @mnt_userns: The user namespace of the mount * @old_dir: old dir inode * @old_dentry: old dentry * @new_dir: new dir inode * @new_dentry: new dentry + * @flags: RENAME_* flags * */ @@ -1009,6 +1020,7 @@ done: /** * v9fs_vfs_getattr - retrieve file metadata + * @mnt_userns: The user namespace of the mount * @path: Object to query * @stat: metadata structure to populate * @request_mask: Mask of STATX_xxx flags indicating the caller's interests @@ -1050,6 +1062,7 @@ v9fs_vfs_getattr(struct user_namespace *mnt_userns, const struct path *path, /** * v9fs_vfs_setattr - set file metadata + * @mnt_userns: The user namespace of the mount * @dentry: file whose metadata to set * @iattr: metadata assignment structure * @@ -1078,7 +1091,7 @@ static int v9fs_vfs_setattr(struct user_namespace *mnt_userns, fid = v9fs_fid_lookup(dentry); use_dentry = 1; } - if(IS_ERR(fid)) + if (IS_ERR(fid)) return PTR_ERR(fid); v9fs_blank_wstat(&wstat); @@ -1285,6 +1298,7 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, /** * v9fs_vfs_symlink - helper function to create symlinks + * @mnt_userns: The user namespace of the mount * @dir: directory inode containing symlink * @dentry: dentry for symlink * @symname: symlink data @@ -1340,6 +1354,7 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, /** * v9fs_vfs_mknod - create a special file + * @mnt_userns: The user namespace of the mount * @dir: inode destination for new link * @dentry: dentry for file * @mode: mode for creation @@ -1356,7 +1371,7 @@ v9fs_vfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, char name[2 + U32_MAX_DIGITS + 1 + U32_MAX_DIGITS + 1]; u32 perm; - p9_debug(P9_DEBUG_VFS, " %lu,%pd mode: %hx MAJOR: %u MINOR: %u\n", + p9_debug(P9_DEBUG_VFS, " %lu,%pd mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino, dentry, mode, MAJOR(rdev), MINOR(rdev)); diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index e1c0240b51c0..7dee89ba32e7 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * linux/fs/9p/vfs_inode_dotl.c - * * This file contains vfs inode ops for the 9P2000.L protocol. * * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com> @@ -37,7 +35,10 @@ v9fs_vfs_mknod_dotl(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t omode, dev_t rdev); /** - * v9fs_get_fsgid_for_create - Helper function to get the gid for creating a + * v9fs_get_fsgid_for_create - Helper function to get the gid for a new object + * @dir_inode: The directory inode + * + * Helper function to get the gid for creating a * new file system object. This checks the S_ISGID to determine the owning * group of the new file system object. */ @@ -104,7 +105,7 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb, unsigned long i_ino; struct inode *inode; struct v9fs_session_info *v9ses = sb->s_fs_info; - int (*test)(struct inode *, void *); + int (*test)(struct inode *inode, void *data); if (new) test = v9fs_test_new_inode_dotl; @@ -211,12 +212,13 @@ int v9fs_open_to_dotl_flags(int flags) /** * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol. + * @mnt_userns: The user namespace of the mount * @dir: directory inode that is being created * @dentry: dentry that is being deleted * @omode: create permissions + * @excl: True if the file must not yet exist * */ - static int v9fs_vfs_create_dotl(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t omode, bool excl) @@ -226,7 +228,7 @@ v9fs_vfs_create_dotl(struct user_namespace *mnt_userns, struct inode *dir, static int v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, - struct file *file, unsigned flags, umode_t omode) + struct file *file, unsigned int flags, umode_t omode) { int err = 0; kgid_t gid; @@ -257,7 +259,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, v9ses = v9fs_inode2v9ses(dir); name = dentry->d_name.name; - p9_debug(P9_DEBUG_VFS, "name:%s flags:0x%x mode:0x%hx\n", + p9_debug(P9_DEBUG_VFS, "name:%s flags:0x%x mode:0x%x\n", name, flags, omode); dfid = v9fs_parent_fid(dentry); @@ -361,6 +363,7 @@ err_clunk_old_fid: /** * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory + * @mnt_userns: The user namespace of the mount * @dir: inode that is being unlinked * @dentry: dentry that is being unlinked * @omode: mode for new directory @@ -537,6 +540,7 @@ static int v9fs_mapped_iattr_valid(int iattr_valid) /** * v9fs_vfs_setattr_dotl - set file metadata + * @mnt_userns: The user namespace of the mount * @dentry: file whose metadata to set * @iattr: metadata assignment structure * @@ -801,6 +805,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { /* Get the latest stat info from server. */ struct p9_fid *fid; + fid = v9fs_fid_lookup(old_dentry); if (IS_ERR(fid)) return PTR_ERR(fid); @@ -816,6 +821,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, /** * v9fs_vfs_mknod_dotl - create a special file + * @mnt_userns: The user namespace of the mount * @dir: inode destination for new link * @dentry: dentry for file * @omode: mode for creation @@ -836,7 +842,7 @@ v9fs_vfs_mknod_dotl(struct user_namespace *mnt_userns, struct inode *dir, struct p9_qid qid; struct posix_acl *dacl = NULL, *pacl = NULL; - p9_debug(P9_DEBUG_VFS, " %lu,%pd mode: %hx MAJOR: %u MINOR: %u\n", + p9_debug(P9_DEBUG_VFS, " %lu,%pd mode: %x MAJOR: %u MINOR: %u\n", dir->i_ino, dentry, omode, MAJOR(rdev), MINOR(rdev)); diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 5fce6e30bc5a..b739e02f5ef7 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -1,9 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * linux/fs/9p/vfs_super.c - * - * This file contians superblock ops for 9P2000. It is intended that - * you mount this file system on directories. * * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com> * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov> @@ -83,6 +79,9 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, if (!v9ses->cache) { sb->s_bdi->ra_pages = 0; sb->s_bdi->io_pages = 0; + } else { + sb->s_bdi->ra_pages = v9ses->maxdata >> PAGE_SHIFT; + sb->s_bdi->io_pages = v9ses->maxdata >> PAGE_SHIFT; } sb->s_flags |= SB_ACTIVE | SB_DIRSYNC; @@ -113,7 +112,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, struct inode *inode = NULL; struct dentry *root = NULL; struct v9fs_session_info *v9ses = NULL; - umode_t mode = S_IRWXUGO | S_ISVTX; + umode_t mode = 0777 | S_ISVTX; struct p9_fid *fid; int retval = 0; @@ -157,6 +156,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, sb->s_root = root; if (v9fs_proto_dotl(v9ses)) { struct p9_stat_dotl *st = NULL; + st = p9_client_getattr_dotl(fid, P9_STATS_BASIC); if (IS_ERR(st)) { retval = PTR_ERR(st); @@ -167,6 +167,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, kfree(st); } else { struct p9_wstat *st = NULL; + st = p9_client_stat(fid); if (IS_ERR(st)) { retval = PTR_ERR(st); @@ -275,12 +276,13 @@ done: static int v9fs_drop_inode(struct inode *inode) { struct v9fs_session_info *v9ses; + v9ses = v9fs_inode2v9ses(inode); if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) return generic_drop_inode(inode); /* * in case of non cached mode always drop the - * the inode because we want the inode attribute + * inode because we want the inode attribute * to always match that on the server. */ return 1; diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c index ee331845e2c7..a824441b95a2 100644 --- a/fs/9p/xattr.c +++ b/fs/9p/xattr.c @@ -1,15 +1,7 @@ +// SPDX-License-Identifier: LGPL-2.1 /* * Copyright IBM Corporation, 2010 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2.1 of the GNU Lesser General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * */ #include <linux/module.h> diff --git a/fs/9p/xattr.h b/fs/9p/xattr.h index c63c3bea5de5..3e11fc3331eb 100644 --- a/fs/9p/xattr.h +++ b/fs/9p/xattr.h @@ -1,15 +1,7 @@ +/* SPDX-License-Identifier: LGPL-2.1 */ /* * Copyright IBM Corporation, 2010 * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2.1 of the GNU Lesser General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * */ #ifndef FS_9P_XATTR_H #define FS_9P_XATTR_H @@ -22,13 +14,14 @@ extern const struct xattr_handler *v9fs_xattr_handlers[]; extern const struct xattr_handler v9fs_xattr_acl_access_handler; extern const struct xattr_handler v9fs_xattr_acl_default_handler; -extern ssize_t v9fs_fid_xattr_get(struct p9_fid *, const char *, - void *, size_t); -extern ssize_t v9fs_xattr_get(struct dentry *, const char *, - void *, size_t); -extern int v9fs_fid_xattr_set(struct p9_fid *, const char *, - const void *, size_t, int); -extern int v9fs_xattr_set(struct dentry *, const char *, - const void *, size_t, int); -extern ssize_t v9fs_listxattr(struct dentry *, char *, size_t); +ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name, + void *buffer, size_t buffer_size); +ssize_t v9fs_xattr_get(struct dentry *dentry, const char *name, + void *buffer, size_t buffer_size); +int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name, + const void *value, size_t value_len, int flags); +int v9fs_xattr_set(struct dentry *dentry, const char *name, + const void *value, size_t value_len, int flags); +ssize_t v9fs_listxattr(struct dentry *dentry, char *buffer, + size_t buffer_size); #endif /* FS_9P_XATTR_H */ diff --git a/fs/affs/super.c b/fs/affs/super.c index c6c2a513ec92..c609005a9eaa 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -389,7 +389,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) * blocks, we will have to change it. */ - size = i_size_read(sb->s_bdev->bd_inode) >> 9; + size = bdev_nr_sectors(sb->s_bdev); pr_debug("initial blocksize=%d, #blocks=%d\n", 512, size); affs_set_blocksize(sb, PAGE_SIZE); diff --git a/fs/afs/callback.c b/fs/afs/callback.c index 7d9b23d981bf..1b4d5809808d 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -21,6 +21,37 @@ #include "internal.h" /* + * Handle invalidation of an mmap'd file. We invalidate all the PTEs referring + * to the pages in this file's pagecache, forcing the kernel to go through + * ->fault() or ->page_mkwrite() - at which point we can handle invalidation + * more fully. + */ +void afs_invalidate_mmap_work(struct work_struct *work) +{ + struct afs_vnode *vnode = container_of(work, struct afs_vnode, cb_work); + + unmap_mapping_pages(vnode->vfs_inode.i_mapping, 0, 0, false); +} + +void afs_server_init_callback_work(struct work_struct *work) +{ + struct afs_server *server = container_of(work, struct afs_server, initcb_work); + struct afs_vnode *vnode; + struct afs_cell *cell = server->cell; + + down_read(&cell->fs_open_mmaps_lock); + + list_for_each_entry(vnode, &cell->fs_open_mmaps, cb_mmap_link) { + if (vnode->cb_server == server) { + clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); + queue_work(system_unbound_wq, &vnode->cb_work); + } + } + + up_read(&cell->fs_open_mmaps_lock); +} + +/* * Allow the fileserver to request callback state (re-)initialisation. * Unfortunately, UUIDs are not guaranteed unique. */ @@ -29,8 +60,11 @@ void afs_init_callback_state(struct afs_server *server) rcu_read_lock(); do { server->cb_s_break++; - server = rcu_dereference(server->uuid_next); - } while (0); + atomic_inc(&server->cell->fs_s_break); + if (!list_empty(&server->cell->fs_open_mmaps)) + queue_work(system_unbound_wq, &server->initcb_work); + + } while ((server = rcu_dereference(server->uuid_next))); rcu_read_unlock(); } @@ -44,11 +78,17 @@ void __afs_break_callback(struct afs_vnode *vnode, enum afs_cb_break_reason reas clear_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) { vnode->cb_break++; + vnode->cb_v_break = vnode->volume->cb_v_break; afs_clear_permits(vnode); if (vnode->lock_state == AFS_VNODE_LOCK_WAITING_FOR_CB) afs_lock_may_be_available(vnode); + if (reason != afs_cb_break_for_deleted && + vnode->status.type == AFS_FTYPE_FILE && + atomic_read(&vnode->cb_nr_mmap)) + queue_work(system_unbound_wq, &vnode->cb_work); + trace_afs_cb_break(&vnode->fid, vnode->cb_break, reason, true); } else { trace_afs_cb_break(&vnode->fid, vnode->cb_break, reason, false); diff --git a/fs/afs/cell.c b/fs/afs/cell.c index 887b673f6223..d88407fb9bc0 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -166,6 +166,8 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net, seqlock_init(&cell->volume_lock); cell->fs_servers = RB_ROOT; seqlock_init(&cell->fs_lock); + INIT_LIST_HEAD(&cell->fs_open_mmaps); + init_rwsem(&cell->fs_open_mmaps_lock); rwlock_init(&cell->vl_servers_lock); cell->flags = (1 << AFS_CELL_FL_CHECK_ALIAS); diff --git a/fs/afs/dir.c b/fs/afs/dir.c index ac829e63c570..4579bbda4634 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -1077,9 +1077,9 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, */ static int afs_d_revalidate_rcu(struct dentry *dentry) { - struct afs_vnode *dvnode, *vnode; + struct afs_vnode *dvnode; struct dentry *parent; - struct inode *dir, *inode; + struct inode *dir; long dir_version, de_version; _enter("%p", dentry); @@ -1109,18 +1109,6 @@ static int afs_d_revalidate_rcu(struct dentry *dentry) return -ECHILD; } - /* Check to see if the vnode referred to by the dentry still - * has a callback. - */ - if (d_really_is_positive(dentry)) { - inode = d_inode_rcu(dentry); - if (inode) { - vnode = AFS_FS_I(inode); - if (!afs_check_validity(vnode)) - return -ECHILD; - } - } - return 1; /* Still valid */ } @@ -1156,17 +1144,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) if (IS_ERR(key)) key = NULL; - if (d_really_is_positive(dentry)) { - inode = d_inode(dentry); - if (inode) { - vnode = AFS_FS_I(inode); - afs_validate(vnode, key); - if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) - goto out_bad; - } - } - - /* lock down the parent dentry so we can peer at it */ + /* Hold the parent dentry so we can peer at it */ parent = dget_parent(dentry); dir = AFS_FS_I(d_inode(parent)); @@ -1175,7 +1153,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) if (test_bit(AFS_VNODE_DELETED, &dir->flags)) { _debug("%pd: parent dir deleted", dentry); - goto out_bad_parent; + goto not_found; } /* We only need to invalidate a dentry if the server's copy changed @@ -1201,12 +1179,12 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) case 0: /* the filename maps to something */ if (d_really_is_negative(dentry)) - goto out_bad_parent; + goto not_found; inode = d_inode(dentry); if (is_bad_inode(inode)) { printk("kAFS: afs_d_revalidate: %pd2 has bad inode\n", dentry); - goto out_bad_parent; + goto not_found; } vnode = AFS_FS_I(inode); @@ -1228,9 +1206,6 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) dentry, fid.unique, vnode->fid.unique, vnode->vfs_inode.i_generation); - write_seqlock(&vnode->cb_lock); - set_bit(AFS_VNODE_DELETED, &vnode->flags); - write_sequnlock(&vnode->cb_lock); goto not_found; } goto out_valid; @@ -1245,7 +1220,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) default: _debug("failed to iterate dir %pd: %d", parent, ret); - goto out_bad_parent; + goto not_found; } out_valid: @@ -1256,16 +1231,9 @@ out_valid_noupdate: _leave(" = 1 [valid]"); return 1; - /* the dirent, if it exists, now points to a different vnode */ not_found: - spin_lock(&dentry->d_lock); - dentry->d_flags |= DCACHE_NFSFS_RENAMED; - spin_unlock(&dentry->d_lock); - -out_bad_parent: _debug("dropping dentry %pd2", dentry); dput(parent); -out_bad: key_put(key); _leave(" = 0 [bad]"); @@ -1792,6 +1760,10 @@ static int afs_link(struct dentry *from, struct inode *dir, goto error; } + ret = afs_validate(vnode, op->key); + if (ret < 0) + goto error_op; + afs_op_set_vnode(op, 0, dvnode); afs_op_set_vnode(op, 1, vnode); op->file[0].dv_delta = 1; @@ -1805,6 +1777,8 @@ static int afs_link(struct dentry *from, struct inode *dir, op->create.reason = afs_edit_dir_for_link; return afs_do_sync_operation(op); +error_op: + afs_put_operation(op); error: d_drop(dentry); _leave(" = %d", ret); @@ -1989,6 +1963,11 @@ static int afs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, if (IS_ERR(op)) return PTR_ERR(op); + ret = afs_validate(vnode, op->key); + op->error = ret; + if (ret < 0) + goto error; + afs_op_set_vnode(op, 0, orig_dvnode); afs_op_set_vnode(op, 1, new_dvnode); /* May be same as orig_dvnode */ op->file[0].dv_delta = 1; diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c index f4600c1353ad..540b9fc96824 100644 --- a/fs/afs/dir_edit.c +++ b/fs/afs/dir_edit.c @@ -263,7 +263,7 @@ void afs_edit_dir_add(struct afs_vnode *vnode, if (b == nr_blocks) { _debug("init %u", b); afs_edit_init_block(meta, block, b); - i_size_write(&vnode->vfs_inode, (b + 1) * AFS_DIR_BLOCK_SIZE); + afs_set_i_size(vnode, (b + 1) * AFS_DIR_BLOCK_SIZE); } /* Only lower dir pages have a counter in the header. */ @@ -296,7 +296,7 @@ void afs_edit_dir_add(struct afs_vnode *vnode, new_directory: afs_edit_init_block(meta, meta, 0); i_size = AFS_DIR_BLOCK_SIZE; - i_size_write(&vnode->vfs_inode, i_size); + afs_set_i_size(vnode, i_size); slot = AFS_DIR_RESV_BLOCKS0; page = page0; block = meta; diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c index dae9a57d7ec0..45cfd50a9521 100644 --- a/fs/afs/dir_silly.c +++ b/fs/afs/dir_silly.c @@ -86,8 +86,8 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode return afs_do_sync_operation(op); } -/** - * afs_sillyrename - Perform a silly-rename of a dentry +/* + * Perform silly-rename of a dentry. * * AFS is stateless and the server doesn't know when the client is holding a * file open. To prevent application problems when a file is unlinked while diff --git a/fs/afs/file.c b/fs/afs/file.c index db035ae2a134..eb11d047c0ae 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -19,17 +19,22 @@ static int afs_file_mmap(struct file *file, struct vm_area_struct *vma); static int afs_readpage(struct file *file, struct page *page); +static int afs_symlink_readpage(struct file *file, struct page *page); static void afs_invalidatepage(struct page *page, unsigned int offset, unsigned int length); static int afs_releasepage(struct page *page, gfp_t gfp_flags); static void afs_readahead(struct readahead_control *ractl); +static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter); +static void afs_vm_open(struct vm_area_struct *area); +static void afs_vm_close(struct vm_area_struct *area); +static vm_fault_t afs_vm_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); const struct file_operations afs_file_operations = { .open = afs_open, .release = afs_release, .llseek = generic_file_llseek, - .read_iter = generic_file_read_iter, + .read_iter = afs_file_read_iter, .write_iter = afs_file_write, .mmap = afs_file_mmap, .splice_read = generic_file_splice_read, @@ -45,7 +50,7 @@ const struct inode_operations afs_file_inode_operations = { .permission = afs_permission, }; -const struct address_space_operations afs_fs_aops = { +const struct address_space_operations afs_file_aops = { .readpage = afs_readpage, .readahead = afs_readahead, .set_page_dirty = afs_set_page_dirty, @@ -58,9 +63,17 @@ const struct address_space_operations afs_fs_aops = { .writepages = afs_writepages, }; +const struct address_space_operations afs_symlink_aops = { + .readpage = afs_symlink_readpage, + .releasepage = afs_releasepage, + .invalidatepage = afs_invalidatepage, +}; + static const struct vm_operations_struct afs_vm_ops = { + .open = afs_vm_open, + .close = afs_vm_close, .fault = filemap_fault, - .map_pages = filemap_map_pages, + .map_pages = afs_vm_map_pages, .page_mkwrite = afs_page_mkwrite, }; @@ -295,7 +308,7 @@ static void afs_req_issue_op(struct netfs_read_subrequest *subreq) fsreq->subreq = subreq; fsreq->pos = subreq->start + subreq->transferred; fsreq->len = subreq->len - subreq->transferred; - fsreq->key = subreq->rreq->netfs_priv; + fsreq->key = key_get(subreq->rreq->netfs_priv); fsreq->vnode = vnode; fsreq->iter = &fsreq->def_iter; @@ -304,9 +317,10 @@ static void afs_req_issue_op(struct netfs_read_subrequest *subreq) fsreq->pos, fsreq->len); afs_fetch_data(fsreq->vnode, fsreq); + afs_put_read(fsreq); } -static int afs_symlink_readpage(struct page *page) +static int afs_symlink_readpage(struct file *file, struct page *page) { struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); struct afs_read *fsreq; @@ -371,9 +385,6 @@ const struct netfs_read_request_ops afs_req_ops = { static int afs_readpage(struct file *file, struct page *page) { - if (!file) - return afs_symlink_readpage(page); - return netfs_readpage(file, page, &afs_req_ops, NULL); } @@ -490,15 +501,88 @@ static int afs_releasepage(struct page *page, gfp_t gfp_flags) return 1; } +static void afs_add_open_mmap(struct afs_vnode *vnode) +{ + if (atomic_inc_return(&vnode->cb_nr_mmap) == 1) { + down_write(&vnode->volume->cell->fs_open_mmaps_lock); + + list_add_tail(&vnode->cb_mmap_link, + &vnode->volume->cell->fs_open_mmaps); + + up_write(&vnode->volume->cell->fs_open_mmaps_lock); + } +} + +static void afs_drop_open_mmap(struct afs_vnode *vnode) +{ + if (!atomic_dec_and_test(&vnode->cb_nr_mmap)) + return; + + down_write(&vnode->volume->cell->fs_open_mmaps_lock); + + if (atomic_read(&vnode->cb_nr_mmap) == 0) + list_del_init(&vnode->cb_mmap_link); + + up_write(&vnode->volume->cell->fs_open_mmaps_lock); + flush_work(&vnode->cb_work); +} + /* * Handle setting up a memory mapping on an AFS file. */ static int afs_file_mmap(struct file *file, struct vm_area_struct *vma) { + struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); int ret; + afs_add_open_mmap(vnode); + ret = generic_file_mmap(file, vma); if (ret == 0) vma->vm_ops = &afs_vm_ops; + else + afs_drop_open_mmap(vnode); return ret; } + +static void afs_vm_open(struct vm_area_struct *vma) +{ + afs_add_open_mmap(AFS_FS_I(file_inode(vma->vm_file))); +} + +static void afs_vm_close(struct vm_area_struct *vma) +{ + afs_drop_open_mmap(AFS_FS_I(file_inode(vma->vm_file))); +} + +static vm_fault_t afs_vm_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff) +{ + struct afs_vnode *vnode = AFS_FS_I(file_inode(vmf->vma->vm_file)); + struct afs_file *af = vmf->vma->vm_file->private_data; + + switch (afs_validate(vnode, af->key)) { + case 0: + return filemap_map_pages(vmf, start_pgoff, end_pgoff); + case -ENOMEM: + return VM_FAULT_OOM; + case -EINTR: + case -ERESTARTSYS: + return VM_FAULT_RETRY; + case -ESTALE: + default: + return VM_FAULT_SIGBUS; + } +} + +static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct afs_vnode *vnode = AFS_FS_I(file_inode(iocb->ki_filp)); + struct afs_file *af = iocb->ki_filp->private_data; + int ret; + + ret = afs_validate(vnode, af->key); + if (ret < 0) + return ret; + + return generic_file_read_iter(iocb, iter); +} diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c index e7e98ad63a91..c0031a3ab42f 100644 --- a/fs/afs/fs_probe.c +++ b/fs/afs/fs_probe.c @@ -9,6 +9,7 @@ #include <linux/slab.h> #include "afs_fs.h" #include "internal.h" +#include "protocol_afs.h" #include "protocol_yfs.h" static unsigned int afs_fs_probe_fast_poll_interval = 30 * HZ; @@ -102,7 +103,7 @@ void afs_fileserver_probe_result(struct afs_call *call) struct afs_addr_list *alist = call->alist; struct afs_server *server = call->server; unsigned int index = call->addr_ix; - unsigned int rtt_us = 0; + unsigned int rtt_us = 0, cap0; int ret = call->error; _enter("%pU,%u", &server->uuid, index); @@ -159,6 +160,11 @@ responded: clear_bit(AFS_SERVER_FL_IS_YFS, &server->flags); alist->addrs[index].srx_service = call->service_id; } + cap0 = ntohl(call->tmp); + if (cap0 & AFS3_VICED_CAPABILITY_64BITFILES) + set_bit(AFS_SERVER_FL_HAS_FS64, &server->flags); + else + clear_bit(AFS_SERVER_FL_HAS_FS64, &server->flags); } if (rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us) && diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index dd3f45d906d2..4943413d9c5f 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -456,9 +456,7 @@ void afs_fs_fetch_data(struct afs_operation *op) struct afs_read *req = op->fetch.req; __be32 *bp; - if (upper_32_bits(req->pos) || - upper_32_bits(req->len) || - upper_32_bits(req->pos + req->len)) + if (test_bit(AFS_SERVER_FL_HAS_FS64, &op->server->flags)) return afs_fs_fetch_data64(op); _enter(""); @@ -1113,9 +1111,7 @@ void afs_fs_store_data(struct afs_operation *op) (unsigned long long)op->store.pos, (unsigned long long)op->store.i_size); - if (upper_32_bits(op->store.pos) || - upper_32_bits(op->store.size) || - upper_32_bits(op->store.i_size)) + if (test_bit(AFS_SERVER_FL_HAS_FS64, &op->server->flags)) return afs_fs_store_data64(op); call = afs_alloc_flat_call(op->net, &afs_RXFSStoreData, @@ -1229,7 +1225,7 @@ static void afs_fs_setattr_size(struct afs_operation *op) key_serial(op->key), vp->fid.vid, vp->fid.vnode); ASSERT(attr->ia_valid & ATTR_SIZE); - if (upper_32_bits(attr->ia_size)) + if (test_bit(AFS_SERVER_FL_HAS_FS64, &op->server->flags)) return afs_fs_setattr_size64(op); call = afs_alloc_flat_call(op->net, &afs_RXFSStoreData_as_Status, @@ -1657,20 +1653,33 @@ static int afs_deliver_fs_get_capabilities(struct afs_call *call) return ret; count = ntohl(call->tmp); - call->count = count; call->count2 = count; - afs_extract_discard(call, count * sizeof(__be32)); + if (count == 0) { + call->unmarshall = 4; + call->tmp = 0; + break; + } + + /* Extract the first word of the capabilities to call->tmp */ + afs_extract_to_tmp(call); call->unmarshall++; fallthrough; - /* Extract capabilities words */ case 2: ret = afs_extract_data(call, false); if (ret < 0) return ret; - /* TODO: Examine capabilities */ + afs_extract_discard(call, (count - 1) * sizeof(__be32)); + call->unmarshall++; + fallthrough; + + /* Extract remaining capabilities words */ + case 3: + ret = afs_extract_data(call, false); + if (ret < 0) + return ret; call->unmarshall++; break; diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 80b6c8d967d5..16906eb592d9 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -54,16 +54,6 @@ static noinline void dump_vnode(struct afs_vnode *vnode, struct afs_vnode *paren } /* - * Set the file size and block count. Estimate the number of 512 bytes blocks - * used, rounded up to nearest 1K for consistency with other AFS clients. - */ -static void afs_set_i_size(struct afs_vnode *vnode, u64 size) -{ - i_size_write(&vnode->vfs_inode, size); - vnode->vfs_inode.i_blocks = ((size + 1023) >> 10) << 1; -} - -/* * Initialise an inode from the vnode status. */ static int afs_inode_init_from_status(struct afs_operation *op, @@ -105,7 +95,7 @@ static int afs_inode_init_from_status(struct afs_operation *op, inode->i_mode = S_IFREG | (status->mode & S_IALLUGO); inode->i_op = &afs_file_inode_operations; inode->i_fop = &afs_file_operations; - inode->i_mapping->a_ops = &afs_fs_aops; + inode->i_mapping->a_ops = &afs_file_aops; break; case AFS_FTYPE_DIR: inode->i_mode = S_IFDIR | (status->mode & S_IALLUGO); @@ -123,11 +113,11 @@ static int afs_inode_init_from_status(struct afs_operation *op, inode->i_mode = S_IFDIR | 0555; inode->i_op = &afs_mntpt_inode_operations; inode->i_fop = &afs_mntpt_file_operations; - inode->i_mapping->a_ops = &afs_fs_aops; + inode->i_mapping->a_ops = &afs_symlink_aops; } else { inode->i_mode = S_IFLNK | status->mode; inode->i_op = &afs_symlink_inode_operations; - inode->i_mapping->a_ops = &afs_fs_aops; + inode->i_mapping->a_ops = &afs_symlink_aops; } inode_nohighmem(inode); break; @@ -587,22 +577,32 @@ static void afs_zap_data(struct afs_vnode *vnode) } /* - * Get the server reinit counter for a vnode's current server. + * Check to see if we have a server currently serving this volume and that it + * hasn't been reinitialised or dropped from the list. */ -static bool afs_get_s_break_rcu(struct afs_vnode *vnode, unsigned int *_s_break) +static bool afs_check_server_good(struct afs_vnode *vnode) { - struct afs_server_list *slist = rcu_dereference(vnode->volume->servers); + struct afs_server_list *slist; struct afs_server *server; + bool good; int i; + if (vnode->cb_fs_s_break == atomic_read(&vnode->volume->cell->fs_s_break)) + return true; + + rcu_read_lock(); + + slist = rcu_dereference(vnode->volume->servers); for (i = 0; i < slist->nr_servers; i++) { server = slist->servers[i].server; if (server == vnode->cb_server) { - *_s_break = READ_ONCE(server->cb_s_break); - return true; + good = (vnode->cb_s_break == server->cb_s_break); + rcu_read_unlock(); + return good; } } + rcu_read_unlock(); return false; } @@ -611,57 +611,46 @@ static bool afs_get_s_break_rcu(struct afs_vnode *vnode, unsigned int *_s_break) */ bool afs_check_validity(struct afs_vnode *vnode) { - struct afs_volume *volume = vnode->volume; enum afs_cb_break_reason need_clear = afs_cb_break_no_break; time64_t now = ktime_get_real_seconds(); - bool valid; - unsigned int cb_break, cb_s_break, cb_v_break; + unsigned int cb_break; int seq = 0; do { read_seqbegin_or_lock(&vnode->cb_lock, &seq); - cb_v_break = READ_ONCE(volume->cb_v_break); cb_break = vnode->cb_break; - if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags) && - afs_get_s_break_rcu(vnode, &cb_s_break)) { - if (vnode->cb_s_break != cb_s_break || - vnode->cb_v_break != cb_v_break) { - vnode->cb_s_break = cb_s_break; - vnode->cb_v_break = cb_v_break; - need_clear = afs_cb_break_for_vsbreak; - valid = false; - } else if (test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) { + if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) { + if (vnode->cb_v_break != vnode->volume->cb_v_break) + need_clear = afs_cb_break_for_v_break; + else if (!afs_check_server_good(vnode)) + need_clear = afs_cb_break_for_s_reinit; + else if (test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) need_clear = afs_cb_break_for_zap; - valid = false; - } else if (vnode->cb_expires_at - 10 <= now) { + else if (vnode->cb_expires_at - 10 <= now) need_clear = afs_cb_break_for_lapsed; - valid = false; - } else { - valid = true; - } } else if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { - valid = true; + ; } else { - vnode->cb_v_break = cb_v_break; - valid = false; + need_clear = afs_cb_break_no_promise; } } while (need_seqretry(&vnode->cb_lock, seq)); done_seqretry(&vnode->cb_lock, seq); - if (need_clear != afs_cb_break_no_break) { - write_seqlock(&vnode->cb_lock); - if (cb_break == vnode->cb_break) - __afs_break_callback(vnode, need_clear); - else - trace_afs_cb_miss(&vnode->fid, need_clear); - write_sequnlock(&vnode->cb_lock); - valid = false; - } + if (need_clear == afs_cb_break_no_break) + return true; - return valid; + write_seqlock(&vnode->cb_lock); + if (need_clear == afs_cb_break_no_promise) + vnode->cb_v_break = vnode->volume->cb_v_break; + else if (cb_break == vnode->cb_break) + __afs_break_callback(vnode, need_clear); + else + trace_afs_cb_miss(&vnode->fid, need_clear); + write_sequnlock(&vnode->cb_lock); + return false; } /* @@ -675,21 +664,20 @@ bool afs_check_validity(struct afs_vnode *vnode) */ int afs_validate(struct afs_vnode *vnode, struct key *key) { - bool valid; int ret; _enter("{v={%llx:%llu} fl=%lx},%x", vnode->fid.vid, vnode->fid.vnode, vnode->flags, key_serial(key)); - rcu_read_lock(); - valid = afs_check_validity(vnode); - rcu_read_unlock(); - - if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) - clear_nlink(&vnode->vfs_inode); + if (unlikely(test_bit(AFS_VNODE_DELETED, &vnode->flags))) { + if (vnode->vfs_inode.i_nlink) + clear_nlink(&vnode->vfs_inode); + goto valid; + } - if (valid) + if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags) && + afs_check_validity(vnode)) goto valid; down_write(&vnode->validate_lock); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 5ed416f4ff33..9357c53faa69 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -390,6 +390,9 @@ struct afs_cell { /* Active fileserver interaction state. */ struct rb_root fs_servers; /* afs_server (by server UUID) */ seqlock_t fs_lock; /* For fs_servers */ + struct rw_semaphore fs_open_mmaps_lock; + struct list_head fs_open_mmaps; /* List of vnodes that are mmapped */ + atomic_t fs_s_break; /* Counter of CB.InitCallBackState messages */ /* VL server list. */ rwlock_t vl_servers_lock; /* Lock on vl_servers */ @@ -503,6 +506,7 @@ struct afs_server { struct hlist_node addr4_link; /* Link in net->fs_addresses4 */ struct hlist_node addr6_link; /* Link in net->fs_addresses6 */ struct hlist_node proc_link; /* Link in net->fs_proc */ + struct work_struct initcb_work; /* Work for CB.InitCallBackState* */ struct afs_server *gc_next; /* Next server in manager's list */ time64_t unuse_time; /* Time at which last unused */ unsigned long flags; @@ -516,6 +520,7 @@ struct afs_server { #define AFS_SERVER_FL_IS_YFS 16 /* Server is YFS not AFS */ #define AFS_SERVER_FL_NO_IBULK 17 /* Fileserver doesn't support FS.InlineBulkStatus */ #define AFS_SERVER_FL_NO_RM2 18 /* Fileserver doesn't support YFS.RemoveFile2 */ +#define AFS_SERVER_FL_HAS_FS64 19 /* Fileserver supports FS.{Fetch,Store}Data64 */ atomic_t ref; /* Object refcount */ atomic_t active; /* Active user count */ u32 addr_version; /* Address list version */ @@ -657,7 +662,11 @@ struct afs_vnode { afs_lock_type_t lock_type : 8; /* outstanding callback notification on this file */ + struct work_struct cb_work; /* Work for mmap'd files */ + struct list_head cb_mmap_link; /* Link in cell->fs_open_mmaps */ void *cb_server; /* Server with callback/filelock */ + atomic_t cb_nr_mmap; /* Number of mmaps */ + unsigned int cb_fs_s_break; /* Mass server break counter (cell->fs_s_break) */ unsigned int cb_s_break; /* Mass break counter on ->server */ unsigned int cb_v_break; /* Mass break counter on ->volume */ unsigned int cb_break; /* Break counter on vnode */ @@ -965,6 +974,8 @@ extern struct fscache_cookie_def afs_vnode_cache_index_def; /* * callback.c */ +extern void afs_invalidate_mmap_work(struct work_struct *); +extern void afs_server_init_callback_work(struct work_struct *work); extern void afs_init_callback_state(struct afs_server *); extern void __afs_break_callback(struct afs_vnode *, enum afs_cb_break_reason); extern void afs_break_callback(struct afs_vnode *, enum afs_cb_break_reason); @@ -1044,7 +1055,8 @@ extern void afs_dynroot_depopulate(struct super_block *); /* * file.c */ -extern const struct address_space_operations afs_fs_aops; +extern const struct address_space_operations afs_file_aops; +extern const struct address_space_operations afs_symlink_aops; extern const struct inode_operations afs_file_inode_operations; extern const struct file_operations afs_file_operations; extern const struct netfs_read_request_ops afs_req_ops; @@ -1586,6 +1598,16 @@ static inline void afs_update_dentry_version(struct afs_operation *op, } /* + * Set the file size and block count. Estimate the number of 512 bytes blocks + * used, rounded up to nearest 1K for consistency with other AFS clients. + */ +static inline void afs_set_i_size(struct afs_vnode *vnode, u64 size) +{ + i_size_write(&vnode->vfs_inode, size); + vnode->vfs_inode.i_blocks = ((size + 1023) >> 10) << 1; +} + +/* * Check for a conflicting operation on a directory that we just unlinked from. * If someone managed to sneak a link or an unlink in on the file we just * unlinked, we won't be able to trust nlink on an AFS file (but not YFS). diff --git a/fs/afs/protocol_afs.h b/fs/afs/protocol_afs.h new file mode 100644 index 000000000000..0c39358c8b70 --- /dev/null +++ b/fs/afs/protocol_afs.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* AFS protocol bits + * + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + + +#define AFSCAPABILITIESMAX 196 /* Maximum number of words in a capability set */ + +/* AFS3 Fileserver capabilities word 0 */ +#define AFS3_VICED_CAPABILITY_ERRORTRANS 0x0001 /* Uses UAE errors */ +#define AFS3_VICED_CAPABILITY_64BITFILES 0x0002 /* FetchData64 & StoreData64 supported */ +#define AFS3_VICED_CAPABILITY_WRITELOCKACL 0x0004 /* Can lock a file even without lock perm */ +#define AFS3_VICED_CAPABILITY_SANEACLS 0x0008 /* ACLs reviewed for sanity - don't use */ diff --git a/fs/afs/protocol_yfs.h b/fs/afs/protocol_yfs.h index b5bd03b1d3c7..e4cd89c44c46 100644 --- a/fs/afs/protocol_yfs.h +++ b/fs/afs/protocol_yfs.h @@ -168,3 +168,9 @@ enum yfs_lock_type { yfs_LockMandatoryWrite = 0x101, yfs_LockMandatoryExtend = 0x102, }; + +/* RXYFS Viced Capability Flags */ +#define YFS_VICED_CAPABILITY_ERRORTRANS 0x0001 /* Deprecated v0.195 */ +#define YFS_VICED_CAPABILITY_64BITFILES 0x0002 /* Deprecated v0.195 */ +#define YFS_VICED_CAPABILITY_WRITELOCKACL 0x0004 /* Can lock a file even without lock perm */ +#define YFS_VICED_CAPABILITY_SANEACLS 0x0008 /* Deprecated v0.195 */ diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c index d83f13c44b92..79e1a5f6701b 100644 --- a/fs/afs/rotate.c +++ b/fs/afs/rotate.c @@ -374,6 +374,7 @@ selected_server: if (vnode->cb_server != server) { vnode->cb_server = server; vnode->cb_s_break = server->cb_s_break; + vnode->cb_fs_s_break = atomic_read(&server->cell->fs_s_break); vnode->cb_v_break = vnode->volume->cb_v_break; clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); } diff --git a/fs/afs/server.c b/fs/afs/server.c index 684a2b02b9ff..6e5b9a19b234 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -235,6 +235,7 @@ static struct afs_server *afs_alloc_server(struct afs_cell *cell, server->addr_version = alist->version; server->uuid = *uuid; rwlock_init(&server->fs_lock); + INIT_WORK(&server->initcb_work, afs_server_init_callback_work); init_waitqueue_head(&server->probe_wq); INIT_LIST_HEAD(&server->probe_link); spin_lock_init(&server->probe_lock); @@ -467,6 +468,7 @@ static void afs_destroy_server(struct afs_net *net, struct afs_server *server) if (test_bit(AFS_SERVER_FL_MAY_HAVE_CB, &server->flags)) afs_give_up_callbacks(net, server); + flush_work(&server->initcb_work); afs_put_server(net, server, afs_server_trace_destroy); } diff --git a/fs/afs/super.c b/fs/afs/super.c index e38bb1e7a4d2..d110def8aa8e 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -698,6 +698,7 @@ static struct inode *afs_alloc_inode(struct super_block *sb) vnode->lock_state = AFS_VNODE_LOCK_NONE; init_rwsem(&vnode->rmdir_lock); + INIT_WORK(&vnode->cb_work, afs_invalidate_mmap_work); _leave(" = %p", &vnode->vfs_inode); return &vnode->vfs_inode; diff --git a/fs/afs/write.c b/fs/afs/write.c index c0534697268e..8b1d9c2f6bec 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -137,7 +137,7 @@ int afs_write_end(struct file *file, struct address_space *mapping, write_seqlock(&vnode->cb_lock); i_size = i_size_read(&vnode->vfs_inode); if (maybe_i_size > i_size) - i_size_write(&vnode->vfs_inode, maybe_i_size); + afs_set_i_size(vnode, maybe_i_size); write_sequnlock(&vnode->cb_lock); } @@ -471,13 +471,18 @@ static void afs_extend_writeback(struct address_space *mapping, } /* Has the page moved or been split? */ - if (unlikely(page != xas_reload(&xas))) + if (unlikely(page != xas_reload(&xas))) { + put_page(page); break; + } - if (!trylock_page(page)) + if (!trylock_page(page)) { + put_page(page); break; + } if (!PageDirty(page) || PageWriteback(page)) { unlock_page(page); + put_page(page); break; } @@ -487,6 +492,7 @@ static void afs_extend_writeback(struct address_space *mapping, t = afs_page_dirty_to(page, priv); if (f != 0 && !new_content) { unlock_page(page); + put_page(page); break; } @@ -801,6 +807,7 @@ int afs_writepages(struct address_space *mapping, ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from) { struct afs_vnode *vnode = AFS_FS_I(file_inode(iocb->ki_filp)); + struct afs_file *af = iocb->ki_filp->private_data; ssize_t result; size_t count = iov_iter_count(from); @@ -816,6 +823,10 @@ ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from) if (!count) return 0; + result = afs_validate(vnode, af->key); + if (result < 0) + return result; + result = generic_file_write_iter(iocb, from); _leave(" = %zd", result); @@ -829,13 +840,18 @@ ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from) */ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync) { - struct inode *inode = file_inode(file); - struct afs_vnode *vnode = AFS_FS_I(inode); + struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); + struct afs_file *af = file->private_data; + int ret; _enter("{%llx:%llu},{n=%pD},%d", vnode->fid.vid, vnode->fid.vnode, file, datasync); + ret = afs_validate(vnode, af->key); + if (ret < 0) + return ret; + return file_write_and_wait_range(file, start, end); } @@ -845,15 +861,19 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync) */ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf) { - struct page *page = thp_head(vmf->page); + struct folio *folio = page_folio(vmf->page); + struct page *page = &folio->page; struct file *file = vmf->vma->vm_file; struct inode *inode = file_inode(file); struct afs_vnode *vnode = AFS_FS_I(inode); + struct afs_file *af = file->private_data; unsigned long priv; vm_fault_t ret = VM_FAULT_RETRY; _enter("{{%llx:%llu}},{%lx}", vnode->fid.vid, vnode->fid.vnode, page->index); + afs_validate(vnode, af->key); + sb_start_pagefault(inode->i_sb); /* Wait for the page to be written to the cache before we allow it to @@ -865,7 +885,7 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf) goto out; #endif - if (wait_on_page_writeback_killable(page)) + if (folio_wait_writeback_killable(folio)) goto out; if (lock_page_killable(page) < 0) @@ -875,8 +895,8 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf) * details the portion of the page we need to write back and we might * need to redirty the page if there's a problem. */ - if (wait_on_page_writeback_killable(page) < 0) { - unlock_page(page); + if (folio_wait_writeback_killable(folio) < 0) { + folio_unlock(folio); goto out; } @@ -955,8 +975,7 @@ int afs_launder_page(struct page *page) iov_iter_bvec(&iter, WRITE, bv, 1, bv[0].bv_len); trace_afs_page_dirty(vnode, tracepoint_string("launder"), page); - ret = afs_store_data(vnode, &iter, (loff_t)page->index * PAGE_SIZE, - true); + ret = afs_store_data(vnode, &iter, page_offset(page) + f, true); } trace_afs_page_dirty(vnode, tracepoint_string("laundered"), page); diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c index 2b35cba8ad62..fdc7d675b4b0 100644 --- a/fs/afs/yfsclient.c +++ b/fs/afs/yfsclient.c @@ -83,25 +83,18 @@ static s64 linux_to_yfs_time(const struct timespec64 *t) return (u64)t->tv_sec * 10000000 + t->tv_nsec/100; } -static __be32 *xdr_encode_YFSStoreStatus_mode(__be32 *bp, mode_t mode) -{ - struct yfs_xdr_YFSStoreStatus *x = (void *)bp; - - x->mask = htonl(AFS_SET_MODE); - x->mode = htonl(mode & S_IALLUGO); - x->mtime_client = u64_to_xdr(0); - x->owner = u64_to_xdr(0); - x->group = u64_to_xdr(0); - return bp + xdr_size(x); -} - -static __be32 *xdr_encode_YFSStoreStatus_mtime(__be32 *bp, const struct timespec64 *t) +static __be32 *xdr_encode_YFSStoreStatus(__be32 *bp, mode_t *mode, + const struct timespec64 *t) { struct yfs_xdr_YFSStoreStatus *x = (void *)bp; + mode_t masked_mode = mode ? *mode & S_IALLUGO : 0; s64 mtime = linux_to_yfs_time(t); + u32 mask = AFS_SET_MTIME; - x->mask = htonl(AFS_SET_MTIME); - x->mode = htonl(0); + mask |= mode ? AFS_SET_MODE : 0; + + x->mask = htonl(mask); + x->mode = htonl(masked_mode); x->mtime_client = u64_to_xdr(mtime); x->owner = u64_to_xdr(0); x->group = u64_to_xdr(0); @@ -576,7 +569,7 @@ void yfs_fs_create_file(struct afs_operation *op) bp = xdr_encode_u32(bp, 0); /* RPC flags */ bp = xdr_encode_YFSFid(bp, &dvp->fid); bp = xdr_encode_name(bp, name); - bp = xdr_encode_YFSStoreStatus_mode(bp, op->create.mode); + bp = xdr_encode_YFSStoreStatus(bp, &op->create.mode, &op->mtime); bp = xdr_encode_u32(bp, yfs_LockNone); /* ViceLockType */ yfs_check_req(call, bp); @@ -625,7 +618,7 @@ void yfs_fs_make_dir(struct afs_operation *op) bp = xdr_encode_u32(bp, 0); /* RPC flags */ bp = xdr_encode_YFSFid(bp, &dvp->fid); bp = xdr_encode_name(bp, name); - bp = xdr_encode_YFSStoreStatus_mode(bp, op->create.mode); + bp = xdr_encode_YFSStoreStatus(bp, &op->create.mode, &op->mtime); yfs_check_req(call, bp); trace_afs_make_fs_call1(call, &dvp->fid, name); @@ -946,6 +939,7 @@ void yfs_fs_symlink(struct afs_operation *op) struct afs_vnode_param *dvp = &op->file[0]; struct afs_call *call; size_t contents_sz; + mode_t mode = 0777; __be32 *bp; _enter(""); @@ -972,7 +966,7 @@ void yfs_fs_symlink(struct afs_operation *op) bp = xdr_encode_YFSFid(bp, &dvp->fid); bp = xdr_encode_name(bp, name); bp = xdr_encode_string(bp, op->create.symlink, contents_sz); - bp = xdr_encode_YFSStoreStatus_mode(bp, S_IRWXUGO); + bp = xdr_encode_YFSStoreStatus(bp, &mode, &op->mtime); yfs_check_req(call, bp); trace_afs_make_fs_call1(call, &dvp->fid, name); @@ -1103,7 +1097,7 @@ void yfs_fs_store_data(struct afs_operation *op) bp = xdr_encode_u32(bp, YFSSTOREDATA64); bp = xdr_encode_u32(bp, 0); /* RPC flags */ bp = xdr_encode_YFSFid(bp, &vp->fid); - bp = xdr_encode_YFSStoreStatus_mtime(bp, &op->mtime); + bp = xdr_encode_YFSStoreStatus(bp, NULL, &op->mtime); bp = xdr_encode_u64(bp, op->store.pos); bp = xdr_encode_u64(bp, op->store.size); bp = xdr_encode_u64(bp, op->store.i_size); @@ -659,8 +659,7 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) new_nr = (table ? table->nr : 1) * 4; spin_unlock(&mm->ioctx_lock); - table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) * - new_nr, GFP_KERNEL); + table = kzalloc(struct_size(table, table, new_nr), GFP_KERNEL); if (!table) return -ENOMEM; @@ -1417,7 +1416,7 @@ static void aio_remove_iocb(struct aio_kiocb *iocb) spin_unlock_irqrestore(&ctx->ctx_lock, flags); } -static void aio_complete_rw(struct kiocb *kiocb, long res, long res2) +static void aio_complete_rw(struct kiocb *kiocb, long res) { struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, rw); @@ -1437,7 +1436,7 @@ static void aio_complete_rw(struct kiocb *kiocb, long res, long res2) } iocb->ki_res.res = res; - iocb->ki_res.res2 = res2; + iocb->ki_res.res2 = 0; iocb_put(iocb); } @@ -1508,7 +1507,7 @@ static inline void aio_rw_done(struct kiocb *req, ssize_t ret) ret = -EINTR; fallthrough; default: - req->ki_complete(req, ret, 0); + req->ki_complete(req, ret); } } diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index a280156138ed..e0c3e33c4177 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -148,6 +148,35 @@ struct file *anon_inode_getfile(const char *name, } EXPORT_SYMBOL_GPL(anon_inode_getfile); +/** + * anon_inode_getfile_secure - Like anon_inode_getfile(), but creates a new + * !S_PRIVATE anon inode rather than reuse the + * singleton anon inode and calls the + * inode_init_security_anon() LSM hook. This + * allows for both the inode to have its own + * security context and for the LSM to enforce + * policy on the inode's creation. + * + * @name: [in] name of the "class" of the new file + * @fops: [in] file operations for the new file + * @priv: [in] private data for the new file (will be file's private_data) + * @flags: [in] flags + * @context_inode: + * [in] the logical relationship with the new inode (optional) + * + * The LSM may use @context_inode in inode_init_security_anon(), but a + * reference to it is not held. Returns the newly created file* or an error + * pointer. See the anon_inode_getfile() documentation for more information. + */ +struct file *anon_inode_getfile_secure(const char *name, + const struct file_operations *fops, + void *priv, int flags, + const struct inode *context_inode) +{ + return __anon_inode_getfile(name, fops, priv, flags, + context_inode, true); +} + static int __anon_inode_getfd(const char *name, const struct file_operations *fops, void *priv, int flags, diff --git a/fs/autofs/waitq.c b/fs/autofs/waitq.c index 16b5fca0626e..54c1f8b8b075 100644 --- a/fs/autofs/waitq.c +++ b/fs/autofs/waitq.c @@ -358,7 +358,7 @@ int autofs_wait(struct autofs_sb_info *sbi, qstr.len = strlen(p); offset = p - name; } - qstr.hash = full_name_hash(dentry, name, qstr.len); + qstr.hash = full_name_hash(dentry, qstr.name, qstr.len); if (mutex_lock_interruptible(&sbi->wq_mutex)) { kfree(name); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 69d900a8473d..f8c7f26f1fbb 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -156,7 +156,7 @@ static int padzero(unsigned long elf_bss) #define STACK_ADD(sp, items) ((elf_addr_t __user *)(sp) - (items)) #define STACK_ROUND(sp, items) \ (((unsigned long) (sp - items)) &~ 15UL) -#define STACK_ALLOC(sp, len) ({ sp -= len ; sp; }) +#define STACK_ALLOC(sp, len) (sp -= len) #endif #ifndef ELF_BASE_PLATFORM @@ -630,7 +630,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, vaddr = eppnt->p_vaddr; if (interp_elf_ex->e_type == ET_EXEC || load_addr_set) - elf_type |= MAP_FIXED_NOREPLACE; + elf_type |= MAP_FIXED; else if (no_base && interp_elf_ex->e_type == ET_DYN) load_addr = -vaddr; @@ -1074,20 +1074,26 @@ out_free_interp: vaddr = elf_ppnt->p_vaddr; /* - * If we are loading ET_EXEC or we have already performed - * the ET_DYN load_addr calculations, proceed normally. + * The first time through the loop, load_addr_set is false: + * layout will be calculated. Once set, use MAP_FIXED since + * we know we've already safely mapped the entire region with + * MAP_FIXED_NOREPLACE in the once-per-binary logic following. */ - if (elf_ex->e_type == ET_EXEC || load_addr_set) { + if (load_addr_set) { elf_flags |= MAP_FIXED; + } else if (elf_ex->e_type == ET_EXEC) { + /* + * This logic is run once for the first LOAD Program + * Header for ET_EXEC binaries. No special handling + * is needed. + */ + elf_flags |= MAP_FIXED_NOREPLACE; } else if (elf_ex->e_type == ET_DYN) { /* * This logic is run once for the first LOAD Program * Header for ET_DYN binaries to calculate the * randomization (load_bias) for all the LOAD - * Program Headers, and to calculate the entire - * size of the ELF mapping (total_size). (Note that - * load_addr_set is set to true later once the - * initial mapping is performed.) + * Program Headers. * * There are effectively two types of ET_DYN * binaries: programs (i.e. PIE: ET_DYN with INTERP) @@ -1108,7 +1114,7 @@ out_free_interp: * Therefore, programs are loaded offset from * ELF_ET_DYN_BASE and loaders are loaded into the * independently randomized mmap region (0 load_bias - * without MAP_FIXED). + * without MAP_FIXED nor MAP_FIXED_NOREPLACE). */ if (interpreter) { load_bias = ELF_ET_DYN_BASE; @@ -1117,7 +1123,7 @@ out_free_interp: alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum); if (alignment) load_bias &= ~(alignment - 1); - elf_flags |= MAP_FIXED; + elf_flags |= MAP_FIXED_NOREPLACE; } else load_bias = 0; @@ -1129,7 +1135,14 @@ out_free_interp: * is then page aligned. */ load_bias = ELF_PAGESTART(load_bias - vaddr); + } + /* + * Calculate the entire size of the ELF mapping (total_size). + * (Note that load_addr_set is set to true later once the + * initial mapping is performed.) + */ + if (!load_addr_set) { total_size = total_mapping_size(elf_phdata, elf_ex->e_phnum); if (!total_size) { @@ -1834,7 +1847,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, /* * Allocate a structure for each thread. */ - for (ct = &dump_task->mm->core_state->dumper; ct; ct = ct->next) { + for (ct = &dump_task->signal->core_state->dumper; ct; ct = ct->next) { t = kzalloc(offsetof(struct elf_thread_core_info, notes[info->thread_notes]), GFP_KERNEL); @@ -2024,7 +2037,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, if (!elf_note_info_init(info)) return 0; - for (ct = current->mm->core_state->dumper.next; + for (ct = current->signal->core_state->dumper.next; ct; ct = ct->next) { ets = kzalloc(sizeof(*ets), GFP_KERNEL); if (!ets) diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 6d8fd6030cbb..c6f588dc4a9d 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1494,7 +1494,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size)) goto end_coredump; - for (ct = current->mm->core_state->dumper.next; + for (ct = current->signal->core_state->dumper.next; ct; ct = ct->next) { tmp = elf_dump_thread_status(cprm->siginfo->si_signo, ct->task, &thread_status_size); diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index a3b830b8410a..444e9c89ff3e 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/list_sort.h> #include "misc.h" #include "ctree.h" #include "block-group.h" @@ -144,6 +145,7 @@ void btrfs_put_block_group(struct btrfs_block_group *cache) */ WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root)); kfree(cache->free_space_ctl); + kfree(cache->physical_map); kfree(cache); } } @@ -902,6 +904,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_unlock(&cluster->refill_lock); btrfs_clear_treelog_bg(block_group); + btrfs_clear_data_reloc_bg(block_group); path = btrfs_alloc_path(); if (!path) { @@ -1484,6 +1487,21 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg) spin_unlock(&fs_info->unused_bgs_lock); } +/* + * We want block groups with a low number of used bytes to be in the beginning + * of the list, so they will get reclaimed first. + */ +static int reclaim_bgs_cmp(void *unused, const struct list_head *a, + const struct list_head *b) +{ + const struct btrfs_block_group *bg1, *bg2; + + bg1 = list_entry(a, struct btrfs_block_group, bg_list); + bg2 = list_entry(b, struct btrfs_block_group, bg_list); + + return bg1->used > bg2->used; +} + void btrfs_reclaim_bgs_work(struct work_struct *work) { struct btrfs_fs_info *fs_info = @@ -1508,6 +1526,12 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) } spin_lock(&fs_info->unused_bgs_lock); + /* + * Sort happens under lock because we can't simply splice it and sort. + * The block groups might still be in use and reachable via bg_list, + * and their presence in the reclaim_bgs list must be preserved. + */ + list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp); while (!list_empty(&fs_info->reclaim_bgs)) { u64 zone_unusable; int ret = 0; @@ -1895,6 +1919,7 @@ static struct btrfs_block_group *btrfs_create_block_group_cache( INIT_LIST_HEAD(&cache->discard_list); INIT_LIST_HEAD(&cache->dirty_list); INIT_LIST_HEAD(&cache->io_list); + INIT_LIST_HEAD(&cache->active_bg_list); btrfs_init_free_space_ctl(cache, cache->free_space_ctl); atomic_set(&cache->frozen, 0); mutex_init(&cache->free_space_lock); @@ -2035,6 +2060,8 @@ static int read_one_block_group(struct btrfs_fs_info *info, */ if (btrfs_is_zoned(info)) { btrfs_calc_zone_unusable(cache); + /* Should not have any excluded extents. Just in case, though. */ + btrfs_free_excluded_extents(cache); } else if (cache->length == cache->used) { cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; @@ -2062,15 +2089,18 @@ static int read_one_block_group(struct btrfs_fs_info *info, link_block_group(cache); set_avail_alloc_bits(info, cache->flags); - if (btrfs_chunk_readonly(info, cache->start)) { + if (btrfs_chunk_writeable(info, cache->start)) { + if (cache->used == 0) { + ASSERT(list_empty(&cache->bg_list)); + if (btrfs_test_opt(info, DISCARD_ASYNC)) + btrfs_discard_queue_work(&info->discard_ctl, cache); + else + btrfs_mark_bg_unused(cache); + } + } else { inc_block_group_ro(cache, 1); - } else if (cache->used == 0) { - ASSERT(list_empty(&cache->bg_list)); - if (btrfs_test_opt(info, DISCARD_ASYNC)) - btrfs_discard_queue_work(&info->discard_ctl, cache); - else - btrfs_mark_bg_unused(cache); } + return 0; error: btrfs_put_block_group(cache); @@ -2438,6 +2468,12 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran return ERR_PTR(ret); } + /* + * New block group is likely to be used soon. Try to activate it now. + * Failure is OK for now. + */ + btrfs_zone_activate(cache); + ret = exclude_super_stripes(cache); if (ret) { /* We may have excluded something, so call this just in case */ @@ -2479,7 +2515,8 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran */ trace_btrfs_add_block_group(fs_info, cache, 1); btrfs_update_space_info(fs_info, cache->flags, size, bytes_used, - cache->bytes_super, 0, &cache->space_info); + cache->bytes_super, cache->zone_unusable, + &cache->space_info); btrfs_update_global_block_rsv(fs_info); link_block_group(cache); @@ -2594,7 +2631,9 @@ void btrfs_dec_block_group_ro(struct btrfs_block_group *cache) if (!--cache->ro) { if (btrfs_is_zoned(cache->fs_info)) { /* Migrate zone_unusable bytes back */ - cache->zone_unusable = cache->alloc_offset - cache->used; + cache->zone_unusable = + (cache->alloc_offset - cache->used) + + (cache->length - cache->zone_capacity); sinfo->bytes_zone_unusable += cache->zone_unusable; sinfo->bytes_readonly -= cache->zone_unusable; } @@ -3143,7 +3182,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) } int btrfs_update_block_group(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, int alloc) + u64 bytenr, u64 num_bytes, bool alloc) { struct btrfs_fs_info *info = trans->fs_info; struct btrfs_block_group *cache = NULL; @@ -3380,36 +3419,17 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags) */ check_system_chunk(trans, flags); - bg = btrfs_alloc_chunk(trans, flags); + bg = btrfs_create_chunk(trans, flags); if (IS_ERR(bg)) { ret = PTR_ERR(bg); goto out; } - /* - * If this is a system chunk allocation then stop right here and do not - * add the chunk item to the chunk btree. This is to prevent a deadlock - * because this system chunk allocation can be triggered while COWing - * some extent buffer of the chunk btree and while holding a lock on a - * parent extent buffer, in which case attempting to insert the chunk - * item (or update the device item) would result in a deadlock on that - * parent extent buffer. In this case defer the chunk btree updates to - * the second phase of chunk allocation and keep our reservation until - * the second phase completes. - * - * This is a rare case and can only be triggered by the very few cases - * we have where we need to touch the chunk btree outside chunk allocation - * and chunk removal. These cases are basically adding a device, removing - * a device or resizing a device. - */ - if (flags & BTRFS_BLOCK_GROUP_SYSTEM) - return 0; - ret = btrfs_chunk_alloc_add_chunk_item(trans, bg); /* * Normally we are not expected to fail with -ENOSPC here, since we have * previously reserved space in the system space_info and allocated one - * new system chunk if necessary. However there are two exceptions: + * new system chunk if necessary. However there are three exceptions: * * 1) We may have enough free space in the system space_info but all the * existing system block groups have a profile which can not be used @@ -3435,13 +3455,20 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags) * with enough free space got turned into RO mode by a running scrub, * and in this case we have to allocate a new one and retry. We only * need do this allocate and retry once, since we have a transaction - * handle and scrub uses the commit root to search for block groups. + * handle and scrub uses the commit root to search for block groups; + * + * 3) We had one system block group with enough free space when we called + * check_system_chunk(), but after that, right before we tried to + * allocate the last extent buffer we needed, a discard operation came + * in and it temporarily removed the last free space entry from the + * block group (discard removes a free space entry, discards it, and + * then adds back the entry to the block group cache). */ if (ret == -ENOSPC) { const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info); struct btrfs_block_group *sys_bg; - sys_bg = btrfs_alloc_chunk(trans, sys_flags); + sys_bg = btrfs_create_chunk(trans, sys_flags); if (IS_ERR(sys_bg)) { ret = PTR_ERR(sys_bg); btrfs_abort_transaction(trans, ret); @@ -3519,7 +3546,15 @@ out: * properly, either intentionally or as a bug. One example where this is * done intentionally is fsync, as it does not reserve any transaction units * and ends up allocating a variable number of metadata extents for log - * tree extent buffers. + * tree extent buffers; + * + * 4) The task has reserved enough transaction units / metadata space, but right + * before it tries to allocate the last extent buffer it needs, a discard + * operation comes in and, temporarily, removes the last free space entry from + * the only metadata block group that had free space (discard starts by + * removing a free space entry from a block group, then does the discard + * operation and, once it's done, it adds back the free space entry to the + * block group). * * We also need this 2 phases setup when adding a device to a filesystem with * a seed device - we must create new metadata and system chunks without adding @@ -3537,14 +3572,14 @@ out: * This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of * the system chunk array due to concurrent allocations") provides more details. * - * For allocation of system chunks, we defer the updates and insertions into the - * chunk btree to phase 2. This is to prevent deadlocks on extent buffers because - * if the chunk allocation is triggered while COWing an extent buffer of the - * chunk btree, we are holding a lock on the parent of that extent buffer and - * doing the chunk btree updates and insertions can require locking that parent. - * This is for the very few and rare cases where we update the chunk btree that - * are not chunk allocation or chunk removal: adding a device, removing a device - * or resizing a device. + * Allocation of system chunks does not happen through this function. A task that + * needs to update the chunk btree (the only btree that uses system chunks), must + * preallocate chunk space by calling either check_system_chunk() or + * btrfs_reserve_chunk_metadata() - the former is used when allocating a data or + * metadata chunk or when removing a chunk, while the later is used before doing + * a modification to the chunk btree - use cases for the later are adding, + * removing and resizing a device as well as relocation of a system chunk. + * See the comment below for more details. * * The reservation of system space, done through check_system_chunk(), as well * as all the updates and insertions into the chunk btree must be done while @@ -3581,11 +3616,27 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, if (trans->allocating_chunk) return -ENOSPC; /* - * If we are removing a chunk, don't re-enter or we would deadlock. - * System space reservation and system chunk allocation is done by the - * chunk remove operation (btrfs_remove_chunk()). + * Allocation of system chunks can not happen through this path, as we + * could end up in a deadlock if we are allocating a data or metadata + * chunk and there is another task modifying the chunk btree. + * + * This is because while we are holding the chunk mutex, we will attempt + * to add the new chunk item to the chunk btree or update an existing + * device item in the chunk btree, while the other task that is modifying + * the chunk btree is attempting to COW an extent buffer while holding a + * lock on it and on its parent - if the COW operation triggers a system + * chunk allocation, then we can deadlock because we are holding the + * chunk mutex and we may need to access that extent buffer or its parent + * in order to add the chunk item or update a device item. + * + * Tasks that want to modify the chunk tree should reserve system space + * before updating the chunk btree, by calling either + * btrfs_reserve_chunk_metadata() or check_system_chunk(). + * It's possible that after a task reserves the space, it still ends up + * here - this happens in the cases described above at do_chunk_alloc(). + * The task will have to either retry or fail. */ - if (trans->removing_chunk) + if (flags & BTRFS_BLOCK_GROUP_SYSTEM) return -ENOSPC; space_info = btrfs_find_space_info(fs_info, flags); @@ -3684,17 +3735,14 @@ static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type) return num_dev; } -/* - * Reserve space in the system space for allocating or removing a chunk - */ -void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) +static void reserve_chunk_space(struct btrfs_trans_handle *trans, + u64 bytes, + u64 type) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_space_info *info; u64 left; - u64 thresh; int ret = 0; - u64 num_devs; /* * Needed because we can end up allocating a system chunk and for an @@ -3707,19 +3755,13 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) left = info->total_bytes - btrfs_space_info_used(info, true); spin_unlock(&info->lock); - num_devs = get_profile_num_devs(fs_info, type); - - /* num_devs device items to update and 1 chunk item to add or remove */ - thresh = btrfs_calc_metadata_size(fs_info, num_devs) + - btrfs_calc_insert_metadata_size(fs_info, 1); - - if (left < thresh && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { + if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) { btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu", - left, thresh, type); + left, bytes, type); btrfs_dump_space_info(fs_info, info, 0, 0); } - if (left < thresh) { + if (left < bytes) { u64 flags = btrfs_system_alloc_profile(fs_info); struct btrfs_block_group *bg; @@ -3728,21 +3770,20 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) * needing it, as we might not need to COW all nodes/leafs from * the paths we visit in the chunk tree (they were already COWed * or created in the current transaction for example). - * - * Also, if our caller is allocating a system chunk, do not - * attempt to insert the chunk item in the chunk btree, as we - * could deadlock on an extent buffer since our caller may be - * COWing an extent buffer from the chunk btree. */ - bg = btrfs_alloc_chunk(trans, flags); + bg = btrfs_create_chunk(trans, flags); if (IS_ERR(bg)) { ret = PTR_ERR(bg); - } else if (!(type & BTRFS_BLOCK_GROUP_SYSTEM)) { + } else { /* * If we fail to add the chunk item here, we end up * trying again at phase 2 of chunk allocation, at * btrfs_create_pending_block_groups(). So ignore - * any error here. + * any error here. An ENOSPC here could happen, due to + * the cases described at do_chunk_alloc() - the system + * block group we just created was just turned into RO + * mode by a scrub for example, or a running discard + * temporarily removed its free space entries, etc. */ btrfs_chunk_alloc_add_chunk_item(trans, bg); } @@ -3751,12 +3792,61 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) if (!ret) { ret = btrfs_block_rsv_add(fs_info->chunk_root, &fs_info->chunk_block_rsv, - thresh, BTRFS_RESERVE_NO_FLUSH); + bytes, BTRFS_RESERVE_NO_FLUSH); if (!ret) - trans->chunk_bytes_reserved += thresh; + trans->chunk_bytes_reserved += bytes; } } +/* + * Reserve space in the system space for allocating or removing a chunk. + * The caller must be holding fs_info->chunk_mutex. + */ +void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + const u64 num_devs = get_profile_num_devs(fs_info, type); + u64 bytes; + + /* num_devs device items to update and 1 chunk item to add or remove. */ + bytes = btrfs_calc_metadata_size(fs_info, num_devs) + + btrfs_calc_insert_metadata_size(fs_info, 1); + + reserve_chunk_space(trans, bytes, type); +} + +/* + * Reserve space in the system space, if needed, for doing a modification to the + * chunk btree. + * + * @trans: A transaction handle. + * @is_item_insertion: Indicate if the modification is for inserting a new item + * in the chunk btree or if it's for the deletion or update + * of an existing item. + * + * This is used in a context where we need to update the chunk btree outside + * block group allocation and removal, to avoid a deadlock with a concurrent + * task that is allocating a metadata or data block group and therefore needs to + * update the chunk btree while holding the chunk mutex. After the update to the + * chunk btree is done, btrfs_trans_release_chunk_metadata() should be called. + * + */ +void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans, + bool is_item_insertion) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + u64 bytes; + + if (is_item_insertion) + bytes = btrfs_calc_insert_metadata_size(fs_info, 1); + else + bytes = btrfs_calc_metadata_size(fs_info, 1); + + mutex_lock(&fs_info->chunk_mutex); + reserve_chunk_space(trans, bytes, BTRFS_BLOCK_GROUP_SYSTEM); + mutex_unlock(&fs_info->chunk_mutex); +} + void btrfs_put_block_group_cache(struct btrfs_fs_info *info) { struct btrfs_block_group *block_group; @@ -3833,6 +3923,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) } spin_unlock(&info->unused_bgs_lock); + spin_lock(&info->zone_active_bgs_lock); + while (!list_empty(&info->zone_active_bgs)) { + block_group = list_first_entry(&info->zone_active_bgs, + struct btrfs_block_group, + active_bg_list); + list_del_init(&block_group->active_bg_list); + btrfs_put_block_group(block_group); + } + spin_unlock(&info->zone_active_bgs_lock); + spin_lock(&info->block_group_cache_lock); while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { block_group = rb_entry(n, struct btrfs_block_group, diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index c72a71efcb18..5878b7ce3b78 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -98,6 +98,7 @@ struct btrfs_block_group { unsigned int to_copy:1; unsigned int relocating_repair:1; unsigned int chunk_item_inserted:1; + unsigned int zone_is_active:1; int disk_cache_state; @@ -202,7 +203,10 @@ struct btrfs_block_group { */ u64 alloc_offset; u64 zone_unusable; + u64 zone_capacity; u64 meta_write_pointer; + struct map_lookup *physical_map; + struct list_head active_bg_list; }; static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group) @@ -280,7 +284,7 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans); int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans); int btrfs_setup_space_cache(struct btrfs_trans_handle *trans); int btrfs_update_block_group(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes, int alloc); + u64 bytenr, u64 num_bytes, bool alloc); int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, u64 ram_bytes, u64 num_bytes, int delalloc); void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, @@ -289,6 +293,8 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, enum btrfs_chunk_alloc_enum force); int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type); void check_system_chunk(struct btrfs_trans_handle *trans, const u64 type); +void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans, + bool is_item_insertion); u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags); void btrfs_put_block_group_cache(struct btrfs_fs_info *info); int btrfs_free_block_groups(struct btrfs_fs_info *info); diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 76ee1452c57b..ab2a4a52e0bb 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -138,17 +138,34 @@ struct btrfs_inode { /* a local copy of root's last_log_commit */ int last_log_commit; - /* total number of bytes pending delalloc, used by stat to calc the - * real block usage of the file - */ - u64 delalloc_bytes; - - /* - * Total number of bytes pending delalloc that fall within a file - * range that is either a hole or beyond EOF (and no prealloc extent - * exists in the range). This is always <= delalloc_bytes. - */ - u64 new_delalloc_bytes; + union { + /* + * Total number of bytes pending delalloc, used by stat to + * calculate the real block usage of the file. This is used + * only for files. + */ + u64 delalloc_bytes; + /* + * The offset of the last dir item key that was logged. + * This is used only for directories. + */ + u64 last_dir_item_offset; + }; + + union { + /* + * Total number of bytes pending delalloc that fall within a file + * range that is either a hole or beyond EOF (and no prealloc extent + * exists in the range). This is always <= delalloc_bytes and this + * is used only for files. + */ + u64 new_delalloc_bytes; + /* + * The offset of the last dir index key that was logged. + * This is used only for directories. + */ + u64 last_dir_index_offset; + }; /* * total number of bytes pending defrag, used by stat to check whether @@ -339,7 +356,12 @@ static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation) struct btrfs_dio_private { struct inode *inode; - u64 logical_offset; + + /* + * Since DIO can use anonymous page, we cannot use page_offset() to + * grab the file offset, thus need a dedicated member for file offset. + */ + u64 file_offset; u64 disk_bytenr; /* Used for bio::bi_size */ u32 bytes; diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 86816088927f..7e9f90fa0388 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -186,7 +186,6 @@ struct btrfsic_dev_state { struct list_head collision_resolving_node; /* list node */ struct btrfsic_block dummy_block_for_bio_bh_flush; u64 last_flush_gen; - char name[BDEVNAME_SIZE]; }; struct btrfsic_block_hashtable { @@ -403,7 +402,6 @@ static void btrfsic_dev_state_init(struct btrfsic_dev_state *ds) ds->magic_num = BTRFSIC_DEV2STATE_MAGIC_NUMBER; ds->bdev = NULL; ds->state = NULL; - ds->name[0] = '\0'; INIT_LIST_HEAD(&ds->collision_resolving_node); ds->last_flush_gen = 0; btrfsic_block_init(&ds->dummy_block_for_bio_bh_flush); @@ -756,10 +754,10 @@ static int btrfsic_process_superblock_dev_mirror( superblock_tmp->mirror_num = 1 + superblock_mirror_num; if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) btrfs_info_in_rcu(fs_info, - "new initial S-block (bdev %p, %s) @%llu (%s/%llu/%d)", + "new initial S-block (bdev %p, %s) @%llu (%pg/%llu/%d)", superblock_bdev, rcu_str_deref(device->name), dev_bytenr, - dev_state->name, dev_bytenr, + dev_state->bdev, dev_bytenr, superblock_mirror_num); list_add(&superblock_tmp->all_blocks_node, &state->all_blocks_list); @@ -938,9 +936,10 @@ continue_with_current_leaf_stack_frame: if (disk_item_offset + sizeof(struct btrfs_item) > sf->block_ctx->len) { leaf_item_out_of_bounce_error: - pr_info("btrfsic: leaf item out of bounce at logical %llu, dev %s\n", + pr_info( + "btrfsic: leaf item out of bounce at logical %llu, dev %pg\n", sf->block_ctx->start, - sf->block_ctx->dev->name); + sf->block_ctx->dev->bdev); goto one_stack_frame_backwards; } btrfsic_read_from_block_data(sf->block_ctx, @@ -1058,9 +1057,10 @@ continue_with_current_node_stack_frame: (uintptr_t)nodehdr; if (key_ptr_offset + sizeof(struct btrfs_key_ptr) > sf->block_ctx->len) { - pr_info("btrfsic: node item out of bounce at logical %llu, dev %s\n", + pr_info( + "btrfsic: node item out of bounce at logical %llu, dev %pg\n", sf->block_ctx->start, - sf->block_ctx->dev->name); + sf->block_ctx->dev->bdev); goto one_stack_frame_backwards; } btrfsic_read_from_block_data( @@ -1228,15 +1228,17 @@ static int btrfsic_create_link_to_next_block( if (next_block->logical_bytenr != next_bytenr && !(!next_block->is_metadata && 0 == next_block->logical_bytenr)) - pr_info("Referenced block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n", - next_bytenr, next_block_ctx->dev->name, + pr_info( +"referenced block @%llu (%pg/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu)\n", + next_bytenr, next_block_ctx->dev->bdev, next_block_ctx->dev_bytenr, *mirror_nump, btrfsic_get_block_type(state, next_block), next_block->logical_bytenr); else - pr_info("Referenced block @%llu (%s/%llu/%d) found in hash table, %c.\n", - next_bytenr, next_block_ctx->dev->name, + pr_info( + "referenced block @%llu (%pg/%llu/%d) found in hash table, %c\n", + next_bytenr, next_block_ctx->dev->bdev, next_block_ctx->dev_bytenr, *mirror_nump, btrfsic_get_block_type(state, next_block)); @@ -1324,8 +1326,8 @@ static int btrfsic_handle_extent_data( if (file_extent_item_offset + offsetof(struct btrfs_file_extent_item, disk_num_bytes) > block_ctx->len) { - pr_info("btrfsic: file item out of bounce at logical %llu, dev %s\n", - block_ctx->start, block_ctx->dev->name); + pr_info("btrfsic: file item out of bounce at logical %llu, dev %pg\n", + block_ctx->start, block_ctx->dev->bdev); return -1; } @@ -1344,8 +1346,8 @@ static int btrfsic_handle_extent_data( if (file_extent_item_offset + sizeof(struct btrfs_file_extent_item) > block_ctx->len) { - pr_info("btrfsic: file item out of bounce at logical %llu, dev %s\n", - block_ctx->start, block_ctx->dev->name); + pr_info("btrfsic: file item out of bounce at logical %llu, dev %pg\n", + block_ctx->start, block_ctx->dev->bdev); return -1; } btrfsic_read_from_block_data(block_ctx, &file_extent_item, @@ -1421,9 +1423,10 @@ static int btrfsic_handle_extent_data( next_block->logical_bytenr != next_bytenr && !(!next_block->is_metadata && 0 == next_block->logical_bytenr)) { - pr_info("Referenced block @%llu (%s/%llu/%d) found in hash table, D, bytenr mismatch (!= stored %llu).\n", + pr_info( +"referenced block @%llu (%pg/%llu/%d) found in hash table, D, bytenr mismatch (!= stored %llu)\n", next_bytenr, - next_block_ctx.dev->name, + next_block_ctx.dev->bdev, next_block_ctx.dev_bytenr, mirror_num, next_block->logical_bytenr); @@ -1455,7 +1458,7 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len, struct btrfs_fs_info *fs_info = state->fs_info; int ret; u64 length; - struct btrfs_bio *multi = NULL; + struct btrfs_io_context *multi = NULL; struct btrfs_device *device; length = len; @@ -1561,7 +1564,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, struct bio *bio; unsigned int j; - bio = btrfs_io_bio_alloc(num_pages - i); + bio = btrfs_bio_alloc(num_pages - i); bio_set_dev(bio, block_ctx->dev->bdev); bio->bi_iter.bi_sector = dev_bytenr >> 9; bio->bi_opf = REQ_OP_READ; @@ -1577,8 +1580,8 @@ static int btrfsic_read_block(struct btrfsic_state *state, return -1; } if (submit_bio_wait(bio)) { - pr_info("btrfsic: read error at logical %llu dev %s!\n", - block_ctx->start, block_ctx->dev->name); + pr_info("btrfsic: read error at logical %llu dev %pg!\n", + block_ctx->start, block_ctx->dev->bdev); bio_put(bio); return -1; } @@ -1602,33 +1605,35 @@ static void btrfsic_dump_database(struct btrfsic_state *state) list_for_each_entry(b_all, &state->all_blocks_list, all_blocks_node) { const struct btrfsic_block_link *l; - pr_info("%c-block @%llu (%s/%llu/%d)\n", + pr_info("%c-block @%llu (%pg/%llu/%d)\n", btrfsic_get_block_type(state, b_all), - b_all->logical_bytenr, b_all->dev_state->name, + b_all->logical_bytenr, b_all->dev_state->bdev, b_all->dev_bytenr, b_all->mirror_num); list_for_each_entry(l, &b_all->ref_to_list, node_ref_to) { - pr_info(" %c @%llu (%s/%llu/%d) refers %u* to %c @%llu (%s/%llu/%d)\n", + pr_info( + " %c @%llu (%pg/%llu/%d) refers %u* to %c @%llu (%pg/%llu/%d)\n", btrfsic_get_block_type(state, b_all), - b_all->logical_bytenr, b_all->dev_state->name, + b_all->logical_bytenr, b_all->dev_state->bdev, b_all->dev_bytenr, b_all->mirror_num, l->ref_cnt, btrfsic_get_block_type(state, l->block_ref_to), l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, + l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr, l->block_ref_to->mirror_num); } list_for_each_entry(l, &b_all->ref_from_list, node_ref_from) { - pr_info(" %c @%llu (%s/%llu/%d) is ref %u* from %c @%llu (%s/%llu/%d)\n", + pr_info( + " %c @%llu (%pg/%llu/%d) is ref %u* from %c @%llu (%pg/%llu/%d)\n", btrfsic_get_block_type(state, b_all), - b_all->logical_bytenr, b_all->dev_state->name, + b_all->logical_bytenr, b_all->dev_state->bdev, b_all->dev_bytenr, b_all->mirror_num, l->ref_cnt, btrfsic_get_block_type(state, l->block_ref_from), l->block_ref_from->logical_bytenr, - l->block_ref_from->dev_state->name, + l->block_ref_from->dev_state->bdev, l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num); } @@ -1743,16 +1748,18 @@ again: if (block->logical_bytenr != bytenr && !(!block->is_metadata && block->logical_bytenr == 0)) - pr_info("Written block @%llu (%s/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu).\n", - bytenr, dev_state->name, + pr_info( +"written block @%llu (%pg/%llu/%d) found in hash table, %c, bytenr mismatch (!= stored %llu)\n", + bytenr, dev_state->bdev, dev_bytenr, block->mirror_num, btrfsic_get_block_type(state, block), block->logical_bytenr); else - pr_info("Written block @%llu (%s/%llu/%d) found in hash table, %c.\n", - bytenr, dev_state->name, + pr_info( + "written block @%llu (%pg/%llu/%d) found in hash table, %c\n", + bytenr, dev_state->bdev, dev_bytenr, block->mirror_num, btrfsic_get_block_type(state, block)); @@ -1767,8 +1774,9 @@ again: processed_len = state->datablock_size; bytenr = block->logical_bytenr; if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("Written block @%llu (%s/%llu/%d) found in hash table, %c.\n", - bytenr, dev_state->name, dev_bytenr, + pr_info( + "written block @%llu (%pg/%llu/%d) found in hash table, %c\n", + bytenr, dev_state->bdev, dev_bytenr, block->mirror_num, btrfsic_get_block_type(state, block)); } @@ -1778,9 +1786,10 @@ again: list_empty(&block->ref_to_list) ? ' ' : '!', list_empty(&block->ref_from_list) ? ' ' : '!'); if (btrfsic_is_block_ref_by_superblock(state, block, 0)) { - pr_info("btrfs: attempt to overwrite %c-block @%llu (%s/%llu/%d), old(gen=%llu, objectid=%llu, type=%d, offset=%llu), new(gen=%llu), which is referenced by most recent superblock (superblockgen=%llu)!\n", + pr_info( +"btrfs: attempt to overwrite %c-block @%llu (%pg/%llu/%d), old(gen=%llu, objectid=%llu, type=%d, offset=%llu), new(gen=%llu), which is referenced by most recent superblock (superblockgen=%llu)!\n", btrfsic_get_block_type(state, block), bytenr, - dev_state->name, dev_bytenr, block->mirror_num, + dev_state->bdev, dev_bytenr, block->mirror_num, block->generation, btrfs_disk_key_objectid(&block->disk_key), block->disk_key.type, @@ -1792,9 +1801,10 @@ again: } if (!block->is_iodone && !block->never_written) { - pr_info("btrfs: attempt to overwrite %c-block @%llu (%s/%llu/%d), oldgen=%llu, newgen=%llu, which is not yet iodone!\n", + pr_info( +"btrfs: attempt to overwrite %c-block @%llu (%pg/%llu/%d), oldgen=%llu, newgen=%llu, which is not yet iodone!\n", btrfsic_get_block_type(state, block), bytenr, - dev_state->name, dev_bytenr, block->mirror_num, + dev_state->bdev, dev_bytenr, block->mirror_num, block->generation, btrfs_stack_header_generation( (struct btrfs_header *) @@ -1921,8 +1931,9 @@ again: if (!is_metadata) { processed_len = state->datablock_size; if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("Written block (%s/%llu/?) !found in hash table, D.\n", - dev_state->name, dev_bytenr); + pr_info( + "written block (%pg/%llu/?) !found in hash table, D\n", + dev_state->bdev, dev_bytenr); if (!state->include_extent_data) { /* ignore that written D block */ goto continue_loop; @@ -1939,8 +1950,9 @@ again: btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state, dev_bytenr); if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("Written block @%llu (%s/%llu/?) !found in hash table, M.\n", - bytenr, dev_state->name, dev_bytenr); + pr_info( + "written block @%llu (%pg/%llu/?) !found in hash table, M\n", + bytenr, dev_state->bdev, dev_bytenr); } block_ctx.dev = dev_state; @@ -1995,9 +2007,9 @@ again: block->next_in_same_bio = NULL; } if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("New written %c-block @%llu (%s/%llu/%d)\n", + pr_info("new written %c-block @%llu (%pg/%llu/%d)\n", is_metadata ? 'M' : 'D', - block->logical_bytenr, block->dev_state->name, + block->logical_bytenr, block->dev_state->bdev, block->dev_bytenr, block->mirror_num); list_add(&block->all_blocks_node, &state->all_blocks_list); btrfsic_block_hashtable_add(block, &state->block_hashtable); @@ -2041,10 +2053,10 @@ static void btrfsic_bio_end_io(struct bio *bp) if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) - pr_info("bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n", + pr_info("bio_end_io(err=%d) for %c @%llu (%pg/%llu/%d)\n", bp->bi_status, btrfsic_get_block_type(dev_state->state, block), - block->logical_bytenr, dev_state->name, + block->logical_bytenr, dev_state->bdev, block->dev_bytenr, block->mirror_num); next_block = block->next_in_same_bio; block->iodone_w_error = iodone_w_error; @@ -2052,8 +2064,8 @@ static void btrfsic_bio_end_io(struct bio *bp) dev_state->last_flush_gen++; if ((dev_state->state->print_mask & BTRFSIC_PRINT_MASK_END_IO_BIO_BH)) - pr_info("bio_end_io() new %s flush_gen=%llu\n", - dev_state->name, + pr_info("bio_end_io() new %pg flush_gen=%llu\n", + dev_state->bdev, dev_state->last_flush_gen); } if (block->submit_bio_bh_rw & REQ_FUA) @@ -2078,17 +2090,19 @@ static int btrfsic_process_written_superblock( if (!(superblock->generation > state->max_superblock_generation || 0 == state->max_superblock_generation)) { if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) - pr_info("btrfsic: superblock @%llu (%s/%llu/%d) with old gen %llu <= %llu\n", + pr_info( + "btrfsic: superblock @%llu (%pg/%llu/%d) with old gen %llu <= %llu\n", superblock->logical_bytenr, - superblock->dev_state->name, + superblock->dev_state->bdev, superblock->dev_bytenr, superblock->mirror_num, btrfs_super_generation(super_hdr), state->max_superblock_generation); } else { if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE) - pr_info("btrfsic: got new superblock @%llu (%s/%llu/%d) with new gen %llu > %llu\n", + pr_info( + "btrfsic: got new superblock @%llu (%pg/%llu/%d) with new gen %llu > %llu\n", superblock->logical_bytenr, - superblock->dev_state->name, + superblock->dev_state->bdev, superblock->dev_bytenr, superblock->mirror_num, btrfs_super_generation(super_hdr), state->max_superblock_generation); @@ -2232,38 +2246,42 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, */ list_for_each_entry(l, &block->ref_to_list, node_ref_to) { if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("rl=%d, %c @%llu (%s/%llu/%d) %u* refers to %c @%llu (%s/%llu/%d)\n", + pr_info( + "rl=%d, %c @%llu (%pg/%llu/%d) %u* refers to %c @%llu (%pg/%llu/%d)\n", recursion_level, btrfsic_get_block_type(state, block), - block->logical_bytenr, block->dev_state->name, + block->logical_bytenr, block->dev_state->bdev, block->dev_bytenr, block->mirror_num, l->ref_cnt, btrfsic_get_block_type(state, l->block_ref_to), l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, + l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr, l->block_ref_to->mirror_num); if (l->block_ref_to->never_written) { - pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) which is never written!\n", + pr_info( +"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is never written!\n", btrfsic_get_block_type(state, l->block_ref_to), l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, + l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr, l->block_ref_to->mirror_num); ret = -1; } else if (!l->block_ref_to->is_iodone) { - pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) which is not yet iodone!\n", + pr_info( +"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is not yet iodone!\n", btrfsic_get_block_type(state, l->block_ref_to), l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, + l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr, l->block_ref_to->mirror_num); ret = -1; } else if (l->block_ref_to->iodone_w_error) { - pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) which has write error!\n", + pr_info( +"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which has write error!\n", btrfsic_get_block_type(state, l->block_ref_to), l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, + l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr, l->block_ref_to->mirror_num); ret = -1; @@ -2273,10 +2291,11 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, l->parent_generation && BTRFSIC_GENERATION_UNKNOWN != l->block_ref_to->generation) { - pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) with generation %llu != parent generation %llu!\n", + pr_info( +"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) with generation %llu != parent generation %llu!\n", btrfsic_get_block_type(state, l->block_ref_to), l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, + l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr, l->block_ref_to->mirror_num, l->block_ref_to->generation, @@ -2284,10 +2303,11 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, ret = -1; } else if (l->block_ref_to->flush_gen > l->block_ref_to->dev_state->last_flush_gen) { - pr_info("btrfs: attempt to write superblock which references block %c @%llu (%s/%llu/%d) which is not flushed out of disk's write cache (block flush_gen=%llu, dev->flush_gen=%llu)!\n", + pr_info( +"btrfs: attempt to write superblock which references block %c @%llu (%pg/%llu/%d) which is not flushed out of disk's write cache (block flush_gen=%llu, dev->flush_gen=%llu)!\n", btrfsic_get_block_type(state, l->block_ref_to), l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, + l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr, l->block_ref_to->mirror_num, block->flush_gen, l->block_ref_to->dev_state->last_flush_gen); @@ -2324,15 +2344,16 @@ static int btrfsic_is_block_ref_by_superblock( */ list_for_each_entry(l, &block->ref_from_list, node_ref_from) { if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("rl=%d, %c @%llu (%s/%llu/%d) is ref %u* from %c @%llu (%s/%llu/%d)\n", + pr_info( + "rl=%d, %c @%llu (%pg/%llu/%d) is ref %u* from %c @%llu (%pg/%llu/%d)\n", recursion_level, btrfsic_get_block_type(state, block), - block->logical_bytenr, block->dev_state->name, + block->logical_bytenr, block->dev_state->bdev, block->dev_bytenr, block->mirror_num, l->ref_cnt, btrfsic_get_block_type(state, l->block_ref_from), l->block_ref_from->logical_bytenr, - l->block_ref_from->dev_state->name, + l->block_ref_from->dev_state->bdev, l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num); if (l->block_ref_from->is_superblock && @@ -2354,30 +2375,30 @@ static int btrfsic_is_block_ref_by_superblock( static void btrfsic_print_add_link(const struct btrfsic_state *state, const struct btrfsic_block_link *l) { - pr_info("Add %u* link from %c @%llu (%s/%llu/%d) to %c @%llu (%s/%llu/%d).\n", + pr_info("add %u* link from %c @%llu (%pg/%llu/%d) to %c @%llu (%pg/%llu/%d)\n", l->ref_cnt, btrfsic_get_block_type(state, l->block_ref_from), l->block_ref_from->logical_bytenr, - l->block_ref_from->dev_state->name, + l->block_ref_from->dev_state->bdev, l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num, btrfsic_get_block_type(state, l->block_ref_to), l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, l->block_ref_to->dev_bytenr, + l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr, l->block_ref_to->mirror_num); } static void btrfsic_print_rem_link(const struct btrfsic_state *state, const struct btrfsic_block_link *l) { - pr_info("Rem %u* link from %c @%llu (%s/%llu/%d) to %c @%llu (%s/%llu/%d).\n", + pr_info("rem %u* link from %c @%llu (%pg/%llu/%d) to %c @%llu (%pg/%llu/%d)\n", l->ref_cnt, btrfsic_get_block_type(state, l->block_ref_from), l->block_ref_from->logical_bytenr, - l->block_ref_from->dev_state->name, + l->block_ref_from->dev_state->bdev, l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num, btrfsic_get_block_type(state, l->block_ref_to), l->block_ref_to->logical_bytenr, - l->block_ref_to->dev_state->name, l->block_ref_to->dev_bytenr, + l->block_ref_to->dev_state->bdev, l->block_ref_to->dev_bytenr, l->block_ref_to->mirror_num); } @@ -2419,9 +2440,9 @@ static void btrfsic_dump_tree_sub(const struct btrfsic_state *state, * This algorithm is recursive because the amount of used stack space * is very small and the max recursion depth is limited. */ - indent_add = sprintf(buf, "%c-%llu(%s/%llu/%u)", + indent_add = sprintf(buf, "%c-%llu(%pg/%llu/%u)", btrfsic_get_block_type(state, block), - block->logical_bytenr, block->dev_state->name, + block->logical_bytenr, block->dev_state->bdev, block->dev_bytenr, block->mirror_num); if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) { printk("[...]\n"); @@ -2542,10 +2563,10 @@ static struct btrfsic_block *btrfsic_block_lookup_or_add( block->never_written = never_written; block->mirror_num = mirror_num; if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) - pr_info("New %s%c-block @%llu (%s/%llu/%d)\n", + pr_info("New %s%c-block @%llu (%pg/%llu/%d)\n", additional_string, btrfsic_get_block_type(state, block), - block->logical_bytenr, dev_state->name, + block->logical_bytenr, dev_state->bdev, block->dev_bytenr, mirror_num); list_add(&block->all_blocks_node, &state->all_blocks_list); btrfsic_block_hashtable_add(block, &state->block_hashtable); @@ -2592,8 +2613,9 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, } if (WARN_ON(!match)) { - pr_info("btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio, buffer->log_bytenr=%llu, submit_bio(bdev=%s, phys_bytenr=%llu)!\n", - bytenr, dev_state->name, dev_bytenr); + pr_info( +"btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio, buffer->log_bytenr=%llu, submit_bio(bdev=%pg, phys_bytenr=%llu)!\n", + bytenr, dev_state->bdev, dev_bytenr); for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) { ret = btrfsic_map_block(state, bytenr, state->metablock_size, @@ -2601,8 +2623,8 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state, if (ret) continue; - pr_info("Read logical bytenr @%llu maps to (%s/%llu/%d)\n", - bytenr, block_ctx.dev->name, + pr_info("read logical bytenr @%llu maps to (%pg/%llu/%d)\n", + bytenr, block_ctx.dev->bdev, block_ctx.dev_bytenr, mirror_num); } } @@ -2675,8 +2697,9 @@ static void __btrfsic_submit_bio(struct bio *bio) if ((dev_state->state->print_mask & (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | BTRFSIC_PRINT_MASK_VERBOSE))) - pr_info("btrfsic_submit_bio(%s) with FLUSH but dummy block already in use (ignored)!\n", - dev_state->name); + pr_info( +"btrfsic_submit_bio(%pg) with FLUSH but dummy block already in use (ignored)!\n", + dev_state->bdev); } else { struct btrfsic_block *const block = &dev_state->dummy_block_for_bio_bh_flush; @@ -2751,7 +2774,6 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info, list_for_each_entry(device, dev_head, dev_list) { struct btrfsic_dev_state *ds; - const char *p; if (!device->bdev || !device->name) continue; @@ -2763,10 +2785,6 @@ int btrfsic_mount(struct btrfs_fs_info *fs_info, } ds->bdev = device->bdev; ds->state = state; - bdevname(ds->bdev, ds->name); - ds->name[BDEVNAME_SIZE - 1] = '\0'; - p = kbasename(ds->name); - strlcpy(ds->name, p, sizeof(ds->name)); btrfsic_dev_state_hashtable_add(ds, &btrfsic_dev_state_hashtable); } @@ -2844,9 +2862,10 @@ void btrfsic_unmount(struct btrfs_fs_devices *fs_devices) if (b_all->is_iodone || b_all->never_written) btrfsic_block_free(b_all); else - pr_info("btrfs: attempt to free %c-block @%llu (%s/%llu/%d) on umount which is not yet iodone!\n", + pr_info( +"btrfs: attempt to free %c-block @%llu (%pg/%llu/%d) on umount which is not yet iodone!\n", btrfsic_get_block_type(state, b_all), - b_all->logical_bytenr, b_all->dev_state->name, + b_all->logical_bytenr, b_all->dev_state->bdev, b_all->dev_bytenr, b_all->mirror_num); } diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 7869ad12bc6e..32da97c3c19d 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -9,6 +9,7 @@ #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/highmem.h> +#include <linux/kthread.h> #include <linux/time.h> #include <linux/init.h> #include <linux/string.h> @@ -28,6 +29,7 @@ #include "compression.h" #include "extent_io.h" #include "extent_map.h" +#include "subpage.h" #include "zoned.h" static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" }; @@ -172,16 +174,17 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio, /* Hash through the page sector by sector */ for (pg_offset = 0; pg_offset < bytes_left; pg_offset += sectorsize) { - kaddr = page_address(page); + kaddr = kmap_atomic(page); crypto_shash_digest(shash, kaddr + pg_offset, sectorsize, csum); + kunmap_atomic(kaddr); if (memcmp(&csum, cb_sum, csum_size) != 0) { btrfs_print_data_csum_error(inode, disk_start, csum, cb_sum, cb->mirror_num); - if (btrfs_io_bio(bio)->device) + if (btrfs_bio(bio)->device) btrfs_dev_stat_inc_and_print( - btrfs_io_bio(bio)->device, + btrfs_bio(bio)->device, BTRFS_DEV_STAT_CORRUPTION_ERRS); return -EIO; } @@ -192,6 +195,87 @@ static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio, return 0; } +/* + * Reduce bio and io accounting for a compressed_bio with its corresponding bio. + * + * Return true if there is no pending bio nor io. + * Return false otherwise. + */ +static bool dec_and_test_compressed_bio(struct compressed_bio *cb, struct bio *bio) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); + unsigned int bi_size = 0; + bool last_io = false; + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + /* + * At endio time, bi_iter.bi_size doesn't represent the real bio size. + * Thus here we have to iterate through all segments to grab correct + * bio size. + */ + bio_for_each_segment_all(bvec, bio, iter_all) + bi_size += bvec->bv_len; + + if (bio->bi_status) + cb->errors = 1; + + ASSERT(bi_size && bi_size <= cb->compressed_len); + last_io = refcount_sub_and_test(bi_size >> fs_info->sectorsize_bits, + &cb->pending_sectors); + /* + * Here we must wake up the possible error handler after all other + * operations on @cb finished, or we can race with + * finish_compressed_bio_*() which may free @cb. + */ + wake_up_var(cb); + + return last_io; +} + +static void finish_compressed_bio_read(struct compressed_bio *cb, struct bio *bio) +{ + unsigned int index; + struct page *page; + + /* Release the compressed pages */ + for (index = 0; index < cb->nr_pages; index++) { + page = cb->compressed_pages[index]; + page->mapping = NULL; + put_page(page); + } + + /* Do io completion on the original bio */ + if (cb->errors) { + bio_io_error(cb->orig_bio); + } else { + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + ASSERT(bio); + ASSERT(!bio->bi_status); + /* + * We have verified the checksum already, set page checked so + * the end_io handlers know about it + */ + ASSERT(!bio_flagged(bio, BIO_CLONED)); + bio_for_each_segment_all(bvec, cb->orig_bio, iter_all) { + u64 bvec_start = page_offset(bvec->bv_page) + + bvec->bv_offset; + + btrfs_page_set_checked(btrfs_sb(cb->inode->i_sb), + bvec->bv_page, bvec_start, + bvec->bv_len); + } + + bio_endio(cb->orig_bio); + } + + /* Finally free the cb struct */ + kfree(cb->compressed_pages); + kfree(cb); +} + /* when we finish reading compressed pages from the disk, we * decompress them and then run the bio end_io routines on the * decompressed pages (in the inode address space). @@ -206,25 +290,17 @@ static void end_compressed_bio_read(struct bio *bio) { struct compressed_bio *cb = bio->bi_private; struct inode *inode; - struct page *page; - unsigned int index; - unsigned int mirror = btrfs_io_bio(bio)->mirror_num; + unsigned int mirror = btrfs_bio(bio)->mirror_num; int ret = 0; - if (bio->bi_status) - cb->errors = 1; - - /* if there are more bios still pending for this compressed - * extent, just exit - */ - if (!refcount_dec_and_test(&cb->pending_bios)) + if (!dec_and_test_compressed_bio(cb, bio)) goto out; /* * Record the correct mirror_num in cb->orig_bio so that * read-repair can work properly. */ - btrfs_io_bio(cb->orig_bio)->mirror_num = mirror; + btrfs_bio(cb->orig_bio)->mirror_num = mirror; cb->mirror_num = mirror; /* @@ -248,36 +324,7 @@ static void end_compressed_bio_read(struct bio *bio) csum_failed: if (ret) cb->errors = 1; - - /* release the compressed pages */ - index = 0; - for (index = 0; index < cb->nr_pages; index++) { - page = cb->compressed_pages[index]; - page->mapping = NULL; - put_page(page); - } - - /* do io completion on the original bio */ - if (cb->errors) { - bio_io_error(cb->orig_bio); - } else { - struct bio_vec *bvec; - struct bvec_iter_all iter_all; - - /* - * we have verified the checksum already, set page - * checked so the end_io handlers know about it - */ - ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, cb->orig_bio, iter_all) - SetPageChecked(bvec->bv_page); - - bio_endio(cb->orig_bio); - } - - /* finally free the cb struct */ - kfree(cb->compressed_pages); - kfree(cb); + finish_compressed_bio_read(cb, bio); out: bio_put(bio); } @@ -289,6 +336,7 @@ out: static noinline void end_compressed_writeback(struct inode *inode, const struct compressed_bio *cb) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); unsigned long index = cb->start >> PAGE_SHIFT; unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT; struct page *pages[16]; @@ -311,7 +359,8 @@ static noinline void end_compressed_writeback(struct inode *inode, for (i = 0; i < ret; i++) { if (cb->errors) SetPageError(pages[i]); - end_page_writeback(pages[i]); + btrfs_page_clamp_clear_writeback(fs_info, pages[i], + cb->start, cb->len); put_page(pages[i]); } nr_pages -= ret; @@ -320,60 +369,127 @@ static noinline void end_compressed_writeback(struct inode *inode, /* the inode may be gone now */ } -/* - * do the cleanup once all the compressed pages hit the disk. - * This will clear writeback on the file pages and free the compressed - * pages. - * - * This also calls the writeback end hooks for the file pages so that - * metadata and checksums can be updated in the file. - */ -static void end_compressed_bio_write(struct bio *bio) +static void finish_compressed_bio_write(struct compressed_bio *cb) { - struct compressed_bio *cb = bio->bi_private; - struct inode *inode; - struct page *page; + struct inode *inode = cb->inode; unsigned int index; - if (bio->bi_status) - cb->errors = 1; - - /* if there are more bios still pending for this compressed - * extent, just exit - */ - if (!refcount_dec_and_test(&cb->pending_bios)) - goto out; - - /* ok, we're the last bio for this extent, step one is to - * call back into the FS and do all the end_io operations + /* + * Ok, we're the last bio for this extent, step one is to call back + * into the FS and do all the end_io operations. */ - inode = cb->inode; - btrfs_record_physical_zoned(inode, cb->start, bio); btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL, cb->start, cb->start + cb->len - 1, !cb->errors); end_compressed_writeback(inode, cb); - /* note, our inode could be gone now */ + /* Note, our inode could be gone now */ /* - * release the compressed pages, these came from alloc_page and + * Release the compressed pages, these came from alloc_page and * are not attached to the inode at all */ - index = 0; for (index = 0; index < cb->nr_pages; index++) { - page = cb->compressed_pages[index]; + struct page *page = cb->compressed_pages[index]; + page->mapping = NULL; put_page(page); } - /* finally free the cb struct */ + /* Finally free the cb struct */ kfree(cb->compressed_pages); kfree(cb); +} + +/* + * Do the cleanup once all the compressed pages hit the disk. This will clear + * writeback on the file pages and free the compressed pages. + * + * This also calls the writeback end hooks for the file pages so that metadata + * and checksums can be updated in the file. + */ +static void end_compressed_bio_write(struct bio *bio) +{ + struct compressed_bio *cb = bio->bi_private; + + if (!dec_and_test_compressed_bio(cb, bio)) + goto out; + + btrfs_record_physical_zoned(cb->inode, cb->start, bio); + + finish_compressed_bio_write(cb); out: bio_put(bio); } +static blk_status_t submit_compressed_bio(struct btrfs_fs_info *fs_info, + struct compressed_bio *cb, + struct bio *bio, int mirror_num) +{ + blk_status_t ret; + + ASSERT(bio->bi_iter.bi_size); + ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); + if (ret) + return ret; + ret = btrfs_map_bio(fs_info, bio, mirror_num); + return ret; +} + +/* + * Allocate a compressed_bio, which will be used to read/write on-disk + * (aka, compressed) * data. + * + * @cb: The compressed_bio structure, which records all the needed + * information to bind the compressed data to the uncompressed + * page cache. + * @disk_byten: The logical bytenr where the compressed data will be read + * from or written to. + * @endio_func: The endio function to call after the IO for compressed data + * is finished. + * @next_stripe_start: Return value of logical bytenr of where next stripe starts. + * Let the caller know to only fill the bio up to the stripe + * boundary. + */ + + +static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_bytenr, + unsigned int opf, bio_end_io_t endio_func, + u64 *next_stripe_start) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); + struct btrfs_io_geometry geom; + struct extent_map *em; + struct bio *bio; + int ret; + + bio = btrfs_bio_alloc(BIO_MAX_VECS); + + bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; + bio->bi_opf = opf; + bio->bi_private = cb; + bio->bi_end_io = endio_func; + + em = btrfs_get_chunk_map(fs_info, disk_bytenr, fs_info->sectorsize); + if (IS_ERR(em)) { + bio_put(bio); + return ERR_CAST(em); + } + + if (bio_op(bio) == REQ_OP_ZONE_APPEND) + bio_set_dev(bio, em->map_lookup->stripes[0].dev->bdev); + + ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), disk_bytenr, &geom); + free_extent_map(em); + if (ret < 0) { + bio_put(bio); + return ERR_PTR(ret); + } + *next_stripe_start = disk_bytenr + geom.len; + + return bio; +} + /* * worker function to build and submit bios for previously compressed pages. * The corresponding pages in the inode should be marked for writeback @@ -394,20 +510,19 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, struct btrfs_fs_info *fs_info = inode->root->fs_info; struct bio *bio = NULL; struct compressed_bio *cb; - unsigned long bytes_left; - int pg_index = 0; - struct page *page; - u64 first_byte = disk_start; + u64 cur_disk_bytenr = disk_start; + u64 next_stripe_start; blk_status_t ret; int skip_sum = inode->flags & BTRFS_INODE_NODATASUM; const bool use_append = btrfs_use_zone_append(inode, disk_start); const unsigned int bio_op = use_append ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE; - WARN_ON(!PAGE_ALIGNED(start)); + ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && + IS_ALIGNED(len, fs_info->sectorsize)); cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); if (!cb) return BLK_STS_RESOURCE; - refcount_set(&cb->pending_bios, 0); + refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits); cb->errors = 0; cb->inode = &inode->vfs_inode; cb->start = start; @@ -418,118 +533,100 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, cb->orig_bio = NULL; cb->nr_pages = nr_pages; - bio = btrfs_bio_alloc(first_byte); - bio->bi_opf = bio_op | write_flags; - bio->bi_private = cb; - bio->bi_end_io = end_compressed_bio_write; - - if (use_append) { - struct btrfs_device *device; - - device = btrfs_zoned_get_device(fs_info, disk_start, PAGE_SIZE); - if (IS_ERR(device)) { - kfree(cb); - bio_put(bio); - return BLK_STS_NOTSUPP; + while (cur_disk_bytenr < disk_start + compressed_len) { + u64 offset = cur_disk_bytenr - disk_start; + unsigned int index = offset >> PAGE_SHIFT; + unsigned int real_size; + unsigned int added; + struct page *page = compressed_pages[index]; + bool submit = false; + + /* Allocate new bio if submitted or not yet allocated */ + if (!bio) { + bio = alloc_compressed_bio(cb, cur_disk_bytenr, + bio_op | write_flags, end_compressed_bio_write, + &next_stripe_start); + if (IS_ERR(bio)) { + ret = errno_to_blk_status(PTR_ERR(bio)); + bio = NULL; + goto finish_cb; + } } - - bio_set_dev(bio, device->bdev); - } - - if (blkcg_css) { - bio->bi_opf |= REQ_CGROUP_PUNT; - kthread_associate_blkcg(blkcg_css); - } - refcount_set(&cb->pending_bios, 1); - - /* create and submit bios for the compressed pages */ - bytes_left = compressed_len; - for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) { - int submit = 0; - int len = 0; - - page = compressed_pages[pg_index]; - page->mapping = inode->vfs_inode.i_mapping; - if (bio->bi_iter.bi_size) - submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, bio, - 0); - /* - * Page can only be added to bio if the current bio fits in - * stripe. + * We should never reach next_stripe_start start as we will + * submit comp_bio when reach the boundary immediately. */ - if (!submit) { - if (pg_index == 0 && use_append) - len = bio_add_zone_append_page(bio, page, - PAGE_SIZE, 0); - else - len = bio_add_page(bio, page, PAGE_SIZE, 0); - } - - page->mapping = NULL; - if (submit || len < PAGE_SIZE) { - /* - * inc the count before we submit the bio so - * we know the end IO handler won't happen before - * we inc the count. Otherwise, the cb might get - * freed before we're done setting it up - */ - refcount_inc(&cb->pending_bios); - ret = btrfs_bio_wq_end_io(fs_info, bio, - BTRFS_WQ_ENDIO_DATA); - BUG_ON(ret); /* -ENOMEM */ + ASSERT(cur_disk_bytenr != next_stripe_start); + /* + * We have various limits on the real read size: + * - stripe boundary + * - page boundary + * - compressed length boundary + */ + real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_bytenr); + real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset)); + real_size = min_t(u64, real_size, compressed_len - offset); + ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize)); + + if (use_append) + added = bio_add_zone_append_page(bio, page, real_size, + offset_in_page(offset)); + else + added = bio_add_page(bio, page, real_size, + offset_in_page(offset)); + /* Reached zoned boundary */ + if (added == 0) + submit = true; + + cur_disk_bytenr += added; + /* Reached stripe boundary */ + if (cur_disk_bytenr == next_stripe_start) + submit = true; + + /* Finished the range */ + if (cur_disk_bytenr == disk_start + compressed_len) + submit = true; + + if (submit) { if (!skip_sum) { ret = btrfs_csum_one_bio(inode, bio, start, 1); - BUG_ON(ret); /* -ENOMEM */ - } - - ret = btrfs_map_bio(fs_info, bio, 0); - if (ret) { - bio->bi_status = ret; - bio_endio(bio); + if (ret) + goto finish_cb; } - bio = btrfs_bio_alloc(first_byte); - bio->bi_opf = bio_op | write_flags; - bio->bi_private = cb; - bio->bi_end_io = end_compressed_bio_write; - if (blkcg_css) - bio->bi_opf |= REQ_CGROUP_PUNT; - /* - * Use bio_add_page() to ensure the bio has at least one - * page. - */ - bio_add_page(bio, page, PAGE_SIZE, 0); + ret = submit_compressed_bio(fs_info, cb, bio, 0); + if (ret) + goto finish_cb; + bio = NULL; } - if (bytes_left < PAGE_SIZE) { - btrfs_info(fs_info, - "bytes left %lu compress len %u nr %u", - bytes_left, cb->compressed_len, cb->nr_pages); - } - bytes_left -= PAGE_SIZE; - first_byte += PAGE_SIZE; cond_resched(); } + if (blkcg_css) + kthread_associate_blkcg(NULL); - ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); - BUG_ON(ret); /* -ENOMEM */ - - if (!skip_sum) { - ret = btrfs_csum_one_bio(inode, bio, start, 1); - BUG_ON(ret); /* -ENOMEM */ - } + return 0; - ret = btrfs_map_bio(fs_info, bio, 0); - if (ret) { +finish_cb: + if (bio) { bio->bi_status = ret; bio_endio(bio); } + /* Last byte of @cb is submitted, endio will free @cb */ + if (cur_disk_bytenr == disk_start + compressed_len) + return ret; - if (blkcg_css) - kthread_associate_blkcg(NULL); - - return 0; + wait_var_event(cb, refcount_read(&cb->pending_sectors) == + (disk_start + compressed_len - cur_disk_bytenr) >> + fs_info->sectorsize_bits); + /* + * Even with previous bio ended, we should still have io not yet + * submitted, thus need to finish manually. + */ + ASSERT(refcount_read(&cb->pending_sectors)); + /* Now we are the only one referring @cb, can finish it safely. */ + finish_compressed_bio_write(cb); + return ret; } static u64 bio_end_offset(struct bio *bio) @@ -539,25 +636,33 @@ static u64 bio_end_offset(struct bio *bio) return page_offset(last->bv_page) + last->bv_len + last->bv_offset; } +/* + * Add extra pages in the same compressed file extent so that we don't need to + * re-read the same extent again and again. + * + * NOTE: this won't work well for subpage, as for subpage read, we lock the + * full page then submit bio for each compressed/regular extents. + * + * This means, if we have several sectors in the same page points to the same + * on-disk compressed data, we will re-read the same extent many times and + * this function can only help for the next page. + */ static noinline int add_ra_bio_pages(struct inode *inode, u64 compressed_end, struct compressed_bio *cb) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); unsigned long end_index; - unsigned long pg_index; - u64 last_offset; + u64 cur = bio_end_offset(cb->orig_bio); u64 isize = i_size_read(inode); int ret; struct page *page; - unsigned long nr_pages = 0; struct extent_map *em; struct address_space *mapping = inode->i_mapping; struct extent_map_tree *em_tree; struct extent_io_tree *tree; - u64 end; - int misses = 0; + int sectors_missed = 0; - last_offset = bio_end_offset(cb->orig_bio); em_tree = &BTRFS_I(inode)->extent_tree; tree = &BTRFS_I(inode)->io_tree; @@ -576,18 +681,29 @@ static noinline int add_ra_bio_pages(struct inode *inode, end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT; - while (last_offset < compressed_end) { - pg_index = last_offset >> PAGE_SHIFT; + while (cur < compressed_end) { + u64 page_end; + u64 pg_index = cur >> PAGE_SHIFT; + u32 add_size; if (pg_index > end_index) break; page = xa_load(&mapping->i_pages, pg_index); if (page && !xa_is_value(page)) { - misses++; - if (misses > 4) + sectors_missed += (PAGE_SIZE - offset_in_page(cur)) >> + fs_info->sectorsize_bits; + + /* Beyond threshold, no need to continue */ + if (sectors_missed > 4) break; - goto next; + + /* + * Jump to next page start as we already have page for + * current offset. + */ + cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE; + continue; } page = __page_cache_alloc(mapping_gfp_constraint(mapping, @@ -597,14 +713,11 @@ static noinline int add_ra_bio_pages(struct inode *inode, if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) { put_page(page); - goto next; + /* There is already a page, skip to page end */ + cur = (pg_index << PAGE_SHIFT) + PAGE_SIZE; + continue; } - /* - * at this point, we have a locked page in the page cache - * for these bytes in the file. But, we have to make - * sure they map to this compressed extent on disk. - */ ret = set_page_extent_mapped(page); if (ret < 0) { unlock_page(page); @@ -612,18 +725,22 @@ static noinline int add_ra_bio_pages(struct inode *inode, break; } - end = last_offset + PAGE_SIZE - 1; - lock_extent(tree, last_offset, end); + page_end = (pg_index << PAGE_SHIFT) + PAGE_SIZE - 1; + lock_extent(tree, cur, page_end); read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, last_offset, - PAGE_SIZE); + em = lookup_extent_mapping(em_tree, cur, page_end + 1 - cur); read_unlock(&em_tree->lock); - if (!em || last_offset < em->start || - (last_offset + PAGE_SIZE > extent_map_end(em)) || + /* + * At this point, we have a locked page in the page cache for + * these bytes in the file. But, we have to make sure they map + * to this compressed extent on disk. + */ + if (!em || cur < em->start || + (cur + fs_info->sectorsize > extent_map_end(em)) || (em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) { free_extent_map(em); - unlock_extent(tree, last_offset, end); + unlock_extent(tree, cur, page_end); unlock_page(page); put_page(page); break; @@ -641,20 +758,23 @@ static noinline int add_ra_bio_pages(struct inode *inode, } } - ret = bio_add_page(cb->orig_bio, page, - PAGE_SIZE, 0); - - if (ret == PAGE_SIZE) { - nr_pages++; - put_page(page); - } else { - unlock_extent(tree, last_offset, end); + add_size = min(em->start + em->len, page_end + 1) - cur; + ret = bio_add_page(cb->orig_bio, page, add_size, offset_in_page(cur)); + if (ret != add_size) { + unlock_extent(tree, cur, page_end); unlock_page(page); put_page(page); break; } -next: - last_offset += PAGE_SIZE; + /* + * If it's subpage, we also need to increase its + * subpage::readers number, as at endio we will decrease + * subpage::readers and to unlock the page. + */ + if (fs_info->sectorsize < PAGE_SIZE) + btrfs_subpage_start_reader(fs_info, page, cur, add_size); + put_page(page); + cur += add_size; } return 0; } @@ -679,9 +799,10 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, unsigned int compressed_len; unsigned int nr_pages; unsigned int pg_index; - struct page *page; - struct bio *comp_bio; - u64 cur_disk_byte = bio->bi_iter.bi_sector << 9; + struct bio *comp_bio = NULL; + const u64 disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT; + u64 cur_disk_byte = disk_bytenr; + u64 next_stripe_start; u64 file_offset; u64 em_len; u64 em_start; @@ -708,7 +829,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, if (!cb) goto out; - refcount_set(&cb->pending_bios, 0); + refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits); cb->errors = 0; cb->inode = inode; cb->mirror_num = mirror_num; @@ -748,86 +869,74 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, /* include any pages we added in add_ra-bio_pages */ cb->len = bio->bi_iter.bi_size; - comp_bio = btrfs_bio_alloc(cur_disk_byte); - comp_bio->bi_opf = REQ_OP_READ; - comp_bio->bi_private = cb; - comp_bio->bi_end_io = end_compressed_bio_read; - refcount_set(&cb->pending_bios, 1); - - for (pg_index = 0; pg_index < nr_pages; pg_index++) { - u32 pg_len = PAGE_SIZE; - int submit = 0; + while (cur_disk_byte < disk_bytenr + compressed_len) { + u64 offset = cur_disk_byte - disk_bytenr; + unsigned int index = offset >> PAGE_SHIFT; + unsigned int real_size; + unsigned int added; + struct page *page = cb->compressed_pages[index]; + bool submit = false; + + /* Allocate new bio if submitted or not yet allocated */ + if (!comp_bio) { + comp_bio = alloc_compressed_bio(cb, cur_disk_byte, + REQ_OP_READ, end_compressed_bio_read, + &next_stripe_start); + if (IS_ERR(comp_bio)) { + ret = errno_to_blk_status(PTR_ERR(comp_bio)); + comp_bio = NULL; + goto finish_cb; + } + } + /* + * We should never reach next_stripe_start start as we will + * submit comp_bio when reach the boundary immediately. + */ + ASSERT(cur_disk_byte != next_stripe_start); + /* + * We have various limit on the real read size: + * - stripe boundary + * - page boundary + * - compressed length boundary + */ + real_size = min_t(u64, U32_MAX, next_stripe_start - cur_disk_byte); + real_size = min_t(u64, real_size, PAGE_SIZE - offset_in_page(offset)); + real_size = min_t(u64, real_size, compressed_len - offset); + ASSERT(IS_ALIGNED(real_size, fs_info->sectorsize)); + added = bio_add_page(comp_bio, page, real_size, offset_in_page(offset)); /* - * To handle subpage case, we need to make sure the bio only - * covers the range we need. - * - * If we're at the last page, truncate the length to only cover - * the remaining part. + * Maximum compressed extent is smaller than bio size limit, + * thus bio_add_page() should always success. */ - if (pg_index == nr_pages - 1) - pg_len = min_t(u32, PAGE_SIZE, - compressed_len - pg_index * PAGE_SIZE); + ASSERT(added == real_size); + cur_disk_byte += added; - page = cb->compressed_pages[pg_index]; - page->mapping = inode->i_mapping; - page->index = em_start >> PAGE_SHIFT; + /* Reached stripe boundary, need to submit */ + if (cur_disk_byte == next_stripe_start) + submit = true; - if (comp_bio->bi_iter.bi_size) - submit = btrfs_bio_fits_in_stripe(page, pg_len, - comp_bio, 0); + /* Has finished the range, need to submit */ + if (cur_disk_byte == disk_bytenr + compressed_len) + submit = true; - page->mapping = NULL; - if (submit || bio_add_page(comp_bio, page, pg_len, 0) < pg_len) { + if (submit) { unsigned int nr_sectors; - ret = btrfs_bio_wq_end_io(fs_info, comp_bio, - BTRFS_WQ_ENDIO_DATA); - BUG_ON(ret); /* -ENOMEM */ - - /* - * inc the count before we submit the bio so - * we know the end IO handler won't happen before - * we inc the count. Otherwise, the cb might get - * freed before we're done setting it up - */ - refcount_inc(&cb->pending_bios); - ret = btrfs_lookup_bio_sums(inode, comp_bio, sums); - BUG_ON(ret); /* -ENOMEM */ + if (ret) + goto finish_cb; nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size, fs_info->sectorsize); sums += fs_info->csum_size * nr_sectors; - ret = btrfs_map_bio(fs_info, comp_bio, mirror_num); - if (ret) { - comp_bio->bi_status = ret; - bio_endio(comp_bio); - } - - comp_bio = btrfs_bio_alloc(cur_disk_byte); - comp_bio->bi_opf = REQ_OP_READ; - comp_bio->bi_private = cb; - comp_bio->bi_end_io = end_compressed_bio_read; - - bio_add_page(comp_bio, page, pg_len, 0); + ret = submit_compressed_bio(fs_info, cb, comp_bio, mirror_num); + if (ret) + goto finish_cb; + comp_bio = NULL; } - cur_disk_byte += pg_len; } - - ret = btrfs_bio_wq_end_io(fs_info, comp_bio, BTRFS_WQ_ENDIO_DATA); - BUG_ON(ret); /* -ENOMEM */ - - ret = btrfs_lookup_bio_sums(inode, comp_bio, sums); - BUG_ON(ret); /* -ENOMEM */ - - ret = btrfs_map_bio(fs_info, comp_bio, mirror_num); - if (ret) { - comp_bio->bi_status = ret; - bio_endio(comp_bio); - } - return 0; fail2: @@ -842,6 +951,26 @@ fail1: out: free_extent_map(em); return ret; +finish_cb: + if (comp_bio) { + comp_bio->bi_status = ret; + bio_endio(comp_bio); + } + /* All bytes of @cb is submitted, endio will free @cb */ + if (cur_disk_byte == disk_bytenr + compressed_len) + return ret; + + wait_var_event(cb, refcount_read(&cb->pending_sectors) == + (disk_bytenr + compressed_len - cur_disk_byte) >> + fs_info->sectorsize_bits); + /* + * Even with previous bio ended, we should still have io not yet + * submitted, thus need to finish @cb manually. + */ + ASSERT(refcount_read(&cb->pending_sectors)); + /* Now we are the only one referring @cb, can finish it safely. */ + finish_compressed_bio_read(cb, NULL); + return ret; } /* diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 399be0b435bf..56eef0821e3e 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -28,8 +28,8 @@ struct btrfs_inode; #define BTRFS_ZLIB_DEFAULT_LEVEL 3 struct compressed_bio { - /* number of bios pending for this compressed extent */ - refcount_t pending_bios; + /* Number of sectors with unfinished IO (unsubmitted or unfinished) */ + refcount_t pending_sectors; /* Number of compressed pages in the array */ unsigned int nr_pages; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 84627cbd5b5b..c3983bdaf4b8 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -7,6 +7,7 @@ #include <linux/slab.h> #include <linux/rbtree.h> #include <linux/mm.h> +#include <linux/error-injection.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -395,7 +396,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, if (*cow_ret == buf) unlock_orig = 1; - btrfs_assert_tree_locked(buf); + btrfs_assert_tree_write_locked(buf); WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && trans->transid != fs_info->running_transaction->transid); @@ -2487,7 +2488,7 @@ static void insert_ptr(struct btrfs_trans_handle *trans, int ret; BUG_ON(!path->nodes[level]); - btrfs_assert_tree_locked(path->nodes[level]); + btrfs_assert_tree_write_locked(path->nodes[level]); lower = path->nodes[level]; nritems = btrfs_header_nritems(lower); BUG_ON(slot > nritems); @@ -2827,7 +2828,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (slot >= btrfs_header_nritems(upper) - 1) return 1; - btrfs_assert_tree_locked(path->nodes[1]); + btrfs_assert_tree_write_locked(path->nodes[1]); right = btrfs_read_node_slot(upper, slot + 1); /* @@ -3065,7 +3066,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root if (right_nritems == 0) return 1; - btrfs_assert_tree_locked(path->nodes[1]); + btrfs_assert_tree_write_locked(path->nodes[1]); left = btrfs_read_node_slot(path->nodes[1], slot - 1); /* @@ -3581,40 +3582,6 @@ int btrfs_split_item(struct btrfs_trans_handle *trans, } /* - * This function duplicate a item, giving 'new_key' to the new item. - * It guarantees both items live in the same tree leaf and the new item - * is contiguous with the original item. - * - * This allows us to split file extent in place, keeping a lock on the - * leaf the entire time. - */ -int btrfs_duplicate_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - const struct btrfs_key *new_key) -{ - struct extent_buffer *leaf; - int ret; - u32 item_size; - - leaf = path->nodes[0]; - item_size = btrfs_item_size_nr(leaf, path->slots[0]); - ret = setup_leaf_for_split(trans, root, path, - item_size + sizeof(struct btrfs_item)); - if (ret) - return ret; - - path->slots[0]++; - setup_items_for_insert(root, path, new_key, &item_size, 1); - leaf = path->nodes[0]; - memcpy_extent_buffer(leaf, - btrfs_item_ptr_offset(leaf, path->slots[0]), - btrfs_item_ptr_offset(leaf, path->slots[0] - 1), - item_size); - return 0; -} - -/* * make the item pointed to by the path smaller. new_size indicates * how small to make it, and from_end tells us if we just chop bytes * off the end of the item or if we shift the item to chop bytes off @@ -3785,13 +3752,10 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) * * @root: root we are inserting items to * @path: points to the leaf/slot where we are going to insert new items - * @cpu_key: array of keys for items to be inserted - * @data_size: size of the body of each item we are going to insert - * @nr: size of @cpu_key/@data_size arrays + * @batch: information about the batch of items to insert */ -void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, - const struct btrfs_key *cpu_key, u32 *data_size, - int nr) +static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, + const struct btrfs_item_batch *batch) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_item *item; @@ -3803,14 +3767,14 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, int slot; struct btrfs_map_token token; u32 total_size; - u32 total_data = 0; - - for (i = 0; i < nr; i++) - total_data += data_size[i]; - total_size = total_data + (nr * sizeof(struct btrfs_item)); + /* + * Before anything else, update keys in the parent and other ancestors + * if needed, then release the write locks on them, so that other tasks + * can use them while we modify the leaf. + */ if (path->slots[0] == 0) { - btrfs_cpu_key_to_disk(&disk_key, cpu_key); + btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]); fixup_low_keys(path, &disk_key, 1); } btrfs_unlock_up_safe(path, 1); @@ -3820,6 +3784,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, nritems = btrfs_header_nritems(leaf); data_end = leaf_data_end(leaf); + total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item)); if (btrfs_leaf_free_space(leaf) < total_size) { btrfs_print_leaf(leaf); @@ -3849,31 +3814,32 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, item = btrfs_item_nr(i); ioff = btrfs_token_item_offset(&token, item); btrfs_set_token_item_offset(&token, item, - ioff - total_data); + ioff - batch->total_data_size); } /* shift the items */ - memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), + memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + batch->nr), btrfs_item_nr_offset(slot), (nritems - slot) * sizeof(struct btrfs_item)); /* shift the data */ memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + - data_end - total_data, BTRFS_LEAF_DATA_OFFSET + - data_end, old_data - data_end); + data_end - batch->total_data_size, + BTRFS_LEAF_DATA_OFFSET + data_end, + old_data - data_end); data_end = old_data; } /* setup the item for the new data */ - for (i = 0; i < nr; i++) { - btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); + for (i = 0; i < batch->nr; i++) { + btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]); btrfs_set_item_key(leaf, &disk_key, slot + i); item = btrfs_item_nr(slot + i); - data_end -= data_size[i]; + data_end -= batch->data_sizes[i]; btrfs_set_token_item_offset(&token, item, data_end); - btrfs_set_token_item_size(&token, item, data_size[i]); + btrfs_set_token_item_size(&token, item, batch->data_sizes[i]); } - btrfs_set_header_nritems(leaf, nritems + nr); + btrfs_set_header_nritems(leaf, nritems + batch->nr); btrfs_mark_buffer_dirty(leaf); if (btrfs_leaf_free_space(leaf) < 0) { @@ -3883,26 +3849,43 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, } /* + * Insert a new item into a leaf. + * + * @root: The root of the btree. + * @path: A path pointing to the target leaf and slot. + * @key: The key of the new item. + * @data_size: The size of the data associated with the new key. + */ +void btrfs_setup_item_for_insert(struct btrfs_root *root, + struct btrfs_path *path, + const struct btrfs_key *key, + u32 data_size) +{ + struct btrfs_item_batch batch; + + batch.keys = key; + batch.data_sizes = &data_size; + batch.total_data_size = data_size; + batch.nr = 1; + + setup_items_for_insert(root, path, &batch); +} + +/* * Given a key and some data, insert items into the tree. * This does all the path init required, making room in the tree if needed. */ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - const struct btrfs_key *cpu_key, u32 *data_size, - int nr) + const struct btrfs_item_batch *batch) { int ret = 0; int slot; - int i; - u32 total_size = 0; - u32 total_data = 0; - - for (i = 0; i < nr; i++) - total_data += data_size[i]; + u32 total_size; - total_size = total_data + (nr * sizeof(struct btrfs_item)); - ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1); + total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item)); + ret = btrfs_search_slot(trans, root, &batch->keys[0], path, total_size, 1); if (ret == 0) return -EEXIST; if (ret < 0) @@ -3911,7 +3894,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, slot = path->slots[0]; BUG_ON(slot < 0); - setup_items_for_insert(root, path, cpu_key, data_size, nr); + setup_items_for_insert(root, path, batch); return 0; } @@ -3943,6 +3926,40 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, } /* + * This function duplicates an item, giving 'new_key' to the new item. + * It guarantees both items live in the same tree leaf and the new item is + * contiguous with the original item. + * + * This allows us to split a file extent in place, keeping a lock on the leaf + * the entire time. + */ +int btrfs_duplicate_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + const struct btrfs_key *new_key) +{ + struct extent_buffer *leaf; + int ret; + u32 item_size; + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + ret = setup_leaf_for_split(trans, root, path, + item_size + sizeof(struct btrfs_item)); + if (ret) + return ret; + + path->slots[0]++; + btrfs_setup_item_for_insert(root, path, new_key, item_size); + leaf = path->nodes[0]; + memcpy_extent_buffer(leaf, + btrfs_item_ptr_offset(leaf, path->slots[0]), + btrfs_item_ptr_offset(leaf, path->slots[0] - 1), + item_size); + return 0; +} + +/* * delete the pointer from a given node. * * the tree should have been previously balanced so the deletion does not diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index dff2c8a3e059..7553e9dc5f93 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -48,6 +48,7 @@ extern struct kmem_cache *btrfs_free_space_cachep; extern struct kmem_cache *btrfs_free_space_bitmap_cachep; struct btrfs_ordered_sum; struct btrfs_ref; +struct btrfs_bio; #define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */ @@ -217,6 +218,9 @@ struct btrfs_root_backup { u8 unused_8[10]; } __attribute__ ((__packed__)); +#define BTRFS_SUPER_INFO_OFFSET SZ_64K +#define BTRFS_SUPER_INFO_SIZE 4096 + /* * the super block basically lists the main trees of the FS * it currently lacks any block count etc etc @@ -269,7 +273,11 @@ struct btrfs_super_block { __le64 reserved[28]; u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS]; + + /* Padded to 4096 bytes */ + u8 padding[565]; } __attribute__ ((__packed__)); +static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); /* * Compat flags that we support. If any incompat flags are set other than the @@ -899,6 +907,7 @@ struct btrfs_fs_info { struct btrfs_workqueue *scrub_workers; struct btrfs_workqueue *scrub_wr_completion_workers; struct btrfs_workqueue *scrub_parity_workers; + struct btrfs_subpage_info *subpage_info; struct btrfs_discard_ctl discard_ctl; @@ -1017,6 +1026,16 @@ struct btrfs_fs_info { spinlock_t treelog_bg_lock; u64 treelog_bg; + /* + * Start of the dedicated data relocation block group, protected by + * relocation_bg_lock. + */ + spinlock_t relocation_bg_lock; + u64 data_reloc_bg; + + spinlock_t zone_active_bgs_lock; + struct list_head zone_active_bgs; + #ifdef CONFIG_BTRFS_FS_REF_VERIFY spinlock_t ref_verify_lock; struct rb_root block_tree; @@ -2885,16 +2904,42 @@ static inline int btrfs_del_item(struct btrfs_trans_handle *trans, return btrfs_del_items(trans, root, path, path->slots[0], 1); } -void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, - const struct btrfs_key *cpu_key, u32 *data_size, - int nr); +/* + * Describes a batch of items to insert in a btree. This is used by + * btrfs_insert_empty_items(). + */ +struct btrfs_item_batch { + /* + * Pointer to an array containing the keys of the items to insert (in + * sorted order). + */ + const struct btrfs_key *keys; + /* Pointer to an array containing the data size for each item to insert. */ + const u32 *data_sizes; + /* + * The sum of data sizes for all items. The caller can compute this while + * setting up the data_sizes array, so it ends up being more efficient + * than having btrfs_insert_empty_items() or setup_item_for_insert() + * doing it, as it would avoid an extra loop over a potentially large + * array, and in the case of setup_item_for_insert(), we would be doing + * it while holding a write lock on a leaf and often on upper level nodes + * too, unnecessarily increasing the size of a critical section. + */ + u32 total_data_size; + /* Size of the keys and data_sizes arrays (number of items in the batch). */ + int nr; +}; + +void btrfs_setup_item_for_insert(struct btrfs_root *root, + struct btrfs_path *path, + const struct btrfs_key *key, + u32 data_size); int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *key, void *data, u32 data_size); int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - const struct btrfs_key *cpu_key, u32 *data_size, - int nr); + const struct btrfs_item_batch *batch); static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -2902,7 +2947,14 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, const struct btrfs_key *key, u32 data_size) { - return btrfs_insert_empty_items(trans, root, path, key, &data_size, 1); + struct btrfs_item_batch batch; + + batch.keys = key; + batch.data_sizes = &data_size; + batch.total_data_size = data_size; + batch.nr = 1; + + return btrfs_insert_empty_items(trans, root, path, &batch); } int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); @@ -3030,7 +3082,7 @@ struct btrfs_dir_item * btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, - u64 objectid, const char *name, int name_len, + u64 index, const char *name, int name_len, int mod); struct btrfs_dir_item * btrfs_search_dir_index_item(struct btrfs_root *root, @@ -3129,8 +3181,9 @@ u64 btrfs_file_extent_end(const struct btrfs_path *path); /* inode.c */ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags); -unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, - struct page *page, u64 start, u64 end); +unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, + u32 bio_offset, struct page *page, + u64 start, u64 end); struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, u64 start, u64 len); noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, @@ -3142,7 +3195,6 @@ void __btrfs_del_delalloc_inode(struct btrfs_root *root, struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index); int btrfs_unlink_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *dir, struct btrfs_inode *inode, const char *name, int name_len); int btrfs_add_link(struct btrfs_trans_handle *trans, @@ -3174,8 +3226,6 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, struct extent_state *other); void btrfs_split_delalloc_extent(struct inode *inode, struct extent_state *orig, u64 split); -int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, - unsigned long bio_flags); void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end); vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); @@ -3242,9 +3292,9 @@ int btrfs_fileattr_set(struct user_namespace *mnt_userns, int btrfs_ioctl_get_supported_features(void __user *arg); void btrfs_sync_inode_flags_to_i_flags(struct inode *inode); int __pure btrfs_is_empty_uuid(u8 *uuid); -int btrfs_defrag_file(struct inode *inode, struct file *file, +int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, struct btrfs_ioctl_defrag_range_args *range, - u64 newer_than, unsigned long max_pages); + u64 newer_than, unsigned long max_to_defrag); void btrfs_get_block_group_info(struct list_head *groups_list, struct btrfs_ioctl_space_info *space); void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, @@ -3563,6 +3613,9 @@ do { \ (errno), fmt, ##args); \ } while (0) +#define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \ + &(fs_info)->fs_state))) + __printf(5, 6) __cold void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, @@ -3842,6 +3895,11 @@ static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info) return fs_info->zoned != 0; } +static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root) +{ + return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID; +} + /* * We use page status Private2 to indicate there is an ordered extent with * unfinished IO. diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 1e08eb2b27f0..e164766dcc38 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -679,19 +679,18 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_delayed_item *first_item) { - LIST_HEAD(batch); + LIST_HEAD(item_list); struct btrfs_delayed_item *curr; struct btrfs_delayed_item *next; const int max_size = BTRFS_LEAF_DATA_SIZE(root->fs_info); + struct btrfs_item_batch batch; int total_size; - int nitems; char *ins_data = NULL; - struct btrfs_key *ins_keys; - u32 *ins_sizes; int ret; - list_add_tail(&first_item->tree_list, &batch); - nitems = 1; + list_add_tail(&first_item->tree_list, &item_list); + batch.total_data_size = first_item->data_len; + batch.nr = 1; total_size = first_item->data_len + sizeof(struct btrfs_item); curr = first_item; @@ -706,39 +705,43 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, if (total_size + next_size > max_size) break; - list_add_tail(&next->tree_list, &batch); - nitems++; + list_add_tail(&next->tree_list, &item_list); + batch.nr++; total_size += next_size; + batch.total_data_size += next->data_len; curr = next; } - if (nitems == 1) { - ins_keys = &first_item->key; - ins_sizes = &first_item->data_len; + if (batch.nr == 1) { + batch.keys = &first_item->key; + batch.data_sizes = &first_item->data_len; } else { + struct btrfs_key *ins_keys; + u32 *ins_sizes; int i = 0; - ins_data = kmalloc(nitems * sizeof(u32) + - nitems * sizeof(struct btrfs_key), GFP_NOFS); + ins_data = kmalloc(batch.nr * sizeof(u32) + + batch.nr * sizeof(struct btrfs_key), GFP_NOFS); if (!ins_data) { ret = -ENOMEM; goto out; } ins_sizes = (u32 *)ins_data; - ins_keys = (struct btrfs_key *)(ins_data + nitems * sizeof(u32)); - list_for_each_entry(curr, &batch, tree_list) { + ins_keys = (struct btrfs_key *)(ins_data + batch.nr * sizeof(u32)); + batch.keys = ins_keys; + batch.data_sizes = ins_sizes; + list_for_each_entry(curr, &item_list, tree_list) { ins_keys[i] = curr->key; ins_sizes[i] = curr->data_len; i++; } } - ret = btrfs_insert_empty_items(trans, root, path, ins_keys, ins_sizes, - nitems); + ret = btrfs_insert_empty_items(trans, root, path, &batch); if (ret) goto out; - list_for_each_entry(curr, &batch, tree_list) { + list_for_each_entry(curr, &item_list, tree_list) { char *data_ptr; data_ptr = btrfs_item_ptr(path->nodes[0], path->slots[0], char); @@ -754,7 +757,7 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, */ btrfs_release_path(path); - list_for_each_entry_safe(curr, next, &batch, tree_list) { + list_for_each_entry_safe(curr, next, &item_list, tree_list) { list_del(&curr->tree_list); btrfs_delayed_item_release_metadata(root, curr); btrfs_release_delayed_item(curr); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index ca848b183474..cca7e85e32dd 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -906,7 +906,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, u64 parent = generic_ref->parent; u8 ref_type; - is_system = (generic_ref->real_root == BTRFS_CHUNK_TREE_OBJECTID); + is_system = (generic_ref->tree_ref.owning_root == BTRFS_CHUNK_TREE_OBJECTID); ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action); BUG_ON(extent_op && extent_op->is_data); @@ -921,8 +921,6 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, } if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) && - is_fstree(generic_ref->real_root) && - is_fstree(generic_ref->tree_ref.root) && !generic_ref->skip_qgroup) { record = kzalloc(sizeof(*record), GFP_NOFS); if (!record) { @@ -938,14 +936,15 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, ref_type = BTRFS_TREE_BLOCK_REF_KEY; init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes, - generic_ref->tree_ref.root, action, ref_type); - ref->root = generic_ref->tree_ref.root; + generic_ref->tree_ref.owning_root, action, + ref_type); + ref->root = generic_ref->tree_ref.owning_root; ref->parent = parent; ref->level = level; init_delayed_ref_head(head_ref, record, bytenr, num_bytes, - generic_ref->tree_ref.root, 0, action, false, - is_system); + generic_ref->tree_ref.owning_root, 0, action, + false, is_system); head_ref->extent_op = extent_op; delayed_refs = &trans->transaction->delayed_refs; @@ -997,7 +996,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, u64 bytenr = generic_ref->bytenr; u64 num_bytes = generic_ref->len; u64 parent = generic_ref->parent; - u64 ref_root = generic_ref->data_ref.ref_root; + u64 ref_root = generic_ref->data_ref.owning_root; u64 owner = generic_ref->data_ref.ino; u64 offset = generic_ref->data_ref.offset; u8 ref_type; @@ -1026,8 +1025,6 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans, } if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) && - is_fstree(ref_root) && - is_fstree(generic_ref->real_root) && !generic_ref->skip_qgroup) { record = kzalloc(sizeof(*record), GFP_NOFS); if (!record) { diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index e22fba272e4f..91a3aabad150 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -186,8 +186,8 @@ enum btrfs_ref_type { struct btrfs_data_ref { /* For EXTENT_DATA_REF */ - /* Root which refers to this data extent */ - u64 ref_root; + /* Original root this data extent belongs to */ + u64 owning_root; /* Inode which refers to this data extent */ u64 ino; @@ -210,11 +210,11 @@ struct btrfs_tree_ref { int level; /* - * Root which refers to this tree block. + * Root which owns this tree block. * * For TREE_BLOCK_REF (skinny metadata, either inline or keyed) */ - u64 root; + u64 owning_root; /* For non-skinny metadata, no special member needed */ }; @@ -231,17 +231,10 @@ struct btrfs_ref { */ bool skip_qgroup; - /* - * Optional. For which root is this modification. - * Mostly used for qgroup optimization. - * - * When unset, data/tree ref init code will populate it. - * In certain cases, we're modifying reference for a different root. - * E.g. COW fs tree blocks for balance. - * In that case, tree_ref::root will be fs tree, but we're doing this - * for reloc tree, then we should set @real_root to reloc tree. - */ +#ifdef CONFIG_BTRFS_FS_REF_VERIFY + /* Through which root is this modification. */ u64 real_root; +#endif u64 bytenr; u64 len; @@ -271,26 +264,40 @@ static inline void btrfs_init_generic_ref(struct btrfs_ref *generic_ref, } static inline void btrfs_init_tree_ref(struct btrfs_ref *generic_ref, - int level, u64 root) + int level, u64 root, u64 mod_root, bool skip_qgroup) { +#ifdef CONFIG_BTRFS_FS_REF_VERIFY /* If @real_root not set, use @root as fallback */ - if (!generic_ref->real_root) - generic_ref->real_root = root; + generic_ref->real_root = mod_root ?: root; +#endif generic_ref->tree_ref.level = level; - generic_ref->tree_ref.root = root; + generic_ref->tree_ref.owning_root = root; generic_ref->type = BTRFS_REF_METADATA; + if (skip_qgroup || !(is_fstree(root) && + (!mod_root || is_fstree(mod_root)))) + generic_ref->skip_qgroup = true; + else + generic_ref->skip_qgroup = false; + } static inline void btrfs_init_data_ref(struct btrfs_ref *generic_ref, - u64 ref_root, u64 ino, u64 offset) + u64 ref_root, u64 ino, u64 offset, u64 mod_root, + bool skip_qgroup) { +#ifdef CONFIG_BTRFS_FS_REF_VERIFY /* If @real_root not set, use @root as fallback */ - if (!generic_ref->real_root) - generic_ref->real_root = ref_root; - generic_ref->data_ref.ref_root = ref_root; + generic_ref->real_root = mod_root ?: ref_root; +#endif + generic_ref->data_ref.owning_root = ref_root; generic_ref->data_ref.ino = ino; generic_ref->data_ref.offset = offset; generic_ref->type = BTRFS_REF_DATA; + if (skip_qgroup || !(is_fstree(ref_root) && + (!mod_root || is_fstree(mod_root)))) + generic_ref->skip_qgroup = true; + else + generic_ref->skip_qgroup = false; } static inline struct btrfs_delayed_extent_op * diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index d029be40ea6f..c85a7d44da79 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -70,6 +70,7 @@ static int btrfs_dev_replace_kthread(void *data); int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) { + struct btrfs_dev_lookup_args args = { .devid = BTRFS_DEV_REPLACE_DEVID }; struct btrfs_key key; struct btrfs_root *dev_root = fs_info->dev_root; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; @@ -100,8 +101,7 @@ no_valid_dev_replace_entry_found: * We don't have a replace item or it's corrupted. If there is * a replace target, fail the mount. */ - if (btrfs_find_device(fs_info->fs_devices, - BTRFS_DEV_REPLACE_DEVID, NULL, NULL)) { + if (btrfs_find_device(fs_info->fs_devices, &args)) { btrfs_err(fs_info, "found replace target device without a valid replace item"); ret = -EUCLEAN; @@ -163,8 +163,7 @@ no_valid_dev_replace_entry_found: * We don't have an active replace item but if there is a * replace target, fail the mount. */ - if (btrfs_find_device(fs_info->fs_devices, - BTRFS_DEV_REPLACE_DEVID, NULL, NULL)) { + if (btrfs_find_device(fs_info->fs_devices, &args)) { btrfs_err(fs_info, "replace devid present without an active replace item"); ret = -EUCLEAN; @@ -175,11 +174,10 @@ no_valid_dev_replace_entry_found: break; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: - dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices, - src_devid, NULL, NULL); - dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices, - BTRFS_DEV_REPLACE_DEVID, - NULL, NULL); + dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices, &args); + args.devid = src_devid; + dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices, &args); + /* * allow 'btrfs dev replace_cancel' if src/tgt device is * missing @@ -283,8 +281,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, } - if (i_size_read(bdev->bd_inode) < - btrfs_device_get_total_bytes(srcdev)) { + if (bdev_nr_bytes(bdev) < btrfs_device_get_total_bytes(srcdev)) { btrfs_err(fs_info, "target device is smaller than source device!"); ret = -EINVAL; diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index f1274d5c3805..7721ce0c0604 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -190,9 +190,20 @@ static struct btrfs_dir_item *btrfs_lookup_match_dir( } /* - * lookup a directory item based on name. 'dir' is the objectid - * we're searching in, and 'mod' tells us if you plan on deleting the - * item (use mod < 0) or changing the options (use mod > 0) + * Lookup for a directory item by name. + * + * @trans: The transaction handle to use. Can be NULL if @mod is 0. + * @root: The root of the target tree. + * @path: Path to use for the search. + * @dir: The inode number (objectid) of the directory. + * @name: The name associated to the directory entry we are looking for. + * @name_len: The length of the name. + * @mod: Used to indicate if the tree search is meant for a read only + * lookup, for a modification lookup or for a deletion lookup, so + * its value should be 0, 1 or -1, respectively. + * + * Returns: NULL if the dir item does not exists, an error pointer if an error + * happened, or a pointer to a dir item if a dir item exists for the given name. */ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -273,27 +284,42 @@ out: } /* - * lookup a directory item based on index. 'dir' is the objectid - * we're searching in, and 'mod' tells us if you plan on deleting the - * item (use mod < 0) or changing the options (use mod > 0) + * Lookup for a directory index item by name and index number. * - * The name is used to make sure the index really points to the name you were - * looking for. + * @trans: The transaction handle to use. Can be NULL if @mod is 0. + * @root: The root of the target tree. + * @path: Path to use for the search. + * @dir: The inode number (objectid) of the directory. + * @index: The index number. + * @name: The name associated to the directory entry we are looking for. + * @name_len: The length of the name. + * @mod: Used to indicate if the tree search is meant for a read only + * lookup, for a modification lookup or for a deletion lookup, so + * its value should be 0, 1 or -1, respectively. + * + * Returns: NULL if the dir index item does not exists, an error pointer if an + * error happened, or a pointer to a dir item if the dir index item exists and + * matches the criteria (name and index number). */ struct btrfs_dir_item * btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, - u64 objectid, const char *name, int name_len, + u64 index, const char *name, int name_len, int mod) { + struct btrfs_dir_item *di; struct btrfs_key key; key.objectid = dir; key.type = BTRFS_DIR_INDEX_KEY; - key.offset = objectid; + key.offset = index; - return btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod); + di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod); + if (di == ERR_PTR(-ENOENT)) + return NULL; + + return di; } struct btrfs_dir_item * diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 355ea88d5c5f..59c3be8c1f4c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -683,7 +683,7 @@ err: return ret; } -int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, +int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, struct page *page, u64 start, u64 end, int mirror) { @@ -1036,7 +1036,7 @@ static int btree_set_page_dirty(struct page *page) BUG_ON(!eb); BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); BUG_ON(!atomic_read(&eb->refs)); - btrfs_assert_tree_locked(eb); + btrfs_assert_tree_write_locked(eb); return __set_page_dirty_nobuffers(page); } ASSERT(PagePrivate(page) && page->private); @@ -1061,7 +1061,7 @@ static int btree_set_page_dirty(struct page *page) ASSERT(eb); ASSERT(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); ASSERT(atomic_read(&eb->refs)); - btrfs_assert_tree_locked(eb); + btrfs_assert_tree_write_locked(eb); free_extent_buffer(eb); cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits); @@ -1125,7 +1125,7 @@ void btrfs_clean_tree_block(struct extent_buffer *buf) struct btrfs_fs_info *fs_info = buf->fs_info; if (btrfs_header_generation(buf) == fs_info->running_transaction->transid) { - btrfs_assert_tree_locked(buf); + btrfs_assert_tree_write_locked(buf); if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) { percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, @@ -1500,7 +1500,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) goto fail; if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID && - root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { + !btrfs_is_data_reloc_root(root)) { set_bit(BTRFS_ROOT_SHAREABLE, &root->state); btrfs_check_and_init_root_item(&root->root_item); } @@ -1644,6 +1644,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) btrfs_extent_buffer_leak_debug_check(fs_info); kfree(fs_info->super_copy); kfree(fs_info->super_for_commit); + kfree(fs_info->subpage_info); kvfree(fs_info); } @@ -1953,8 +1954,7 @@ sleep: wake_up_process(fs_info->cleaner_kthread); mutex_unlock(&fs_info->transaction_kthread_mutex); - if (unlikely(test_bit(BTRFS_FS_STATE_ERROR, - &fs_info->fs_state))) + if (BTRFS_FS_ERROR(fs_info)) btrfs_cleanup_transaction(fs_info); if (!kthread_should_stop() && (!btrfs_transaction_blocked(fs_info) || @@ -2592,8 +2592,7 @@ static int validate_super(struct btrfs_fs_info *fs_info, /* * For 4K page size, we only support 4K sector size. - * For 64K page size, we support read-write for 64K sector size, and - * read-only for 4K sector size. + * For 64K page size, we support 64K and 4K sector sizes. */ if ((PAGE_SIZE == SZ_4K && sectorsize != PAGE_SIZE) || (PAGE_SIZE == SZ_64K && (sectorsize != SZ_4K && @@ -2883,6 +2882,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) spin_lock_init(&fs_info->buffer_lock); spin_lock_init(&fs_info->unused_bgs_lock); spin_lock_init(&fs_info->treelog_bg_lock); + spin_lock_init(&fs_info->zone_active_bgs_lock); + spin_lock_init(&fs_info->relocation_bg_lock); rwlock_init(&fs_info->tree_mod_log_lock); mutex_init(&fs_info->unused_bg_unpin_mutex); mutex_init(&fs_info->reclaim_bgs_lock); @@ -2896,6 +2897,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); INIT_LIST_HEAD(&fs_info->unused_bgs); INIT_LIST_HEAD(&fs_info->reclaim_bgs); + INIT_LIST_HEAD(&fs_info->zone_active_bgs); #ifdef CONFIG_BTRFS_DEBUG INIT_LIST_HEAD(&fs_info->allocated_roots); INIT_LIST_HEAD(&fs_info->allocated_ebs); @@ -3228,12 +3230,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); btrfs_init_btree_inode(fs_info); - invalidate_bdev(fs_devices->latest_bdev); + invalidate_bdev(fs_devices->latest_dev->bdev); /* * Read super block and check the signature bytes only */ - disk_super = btrfs_read_dev_super(fs_devices->latest_bdev); + disk_super = btrfs_read_dev_super(fs_devices->latest_dev->bdev); if (IS_ERR(disk_super)) { err = PTR_ERR(disk_super); goto fail_alloc; @@ -3392,12 +3394,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_alloc; } - if (sectorsize != PAGE_SIZE) { + if (sectorsize < PAGE_SIZE) { + struct btrfs_subpage_info *subpage_info; + btrfs_warn(fs_info, "read-write for sector size %u with page size %lu is experimental", sectorsize, PAGE_SIZE); - } - if (sectorsize != PAGE_SIZE) { if (btrfs_super_incompat_flags(fs_info->super_copy) & BTRFS_FEATURE_INCOMPAT_RAID56) { btrfs_err(fs_info, @@ -3406,6 +3408,11 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device err = -EINVAL; goto fail_alloc; } + subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL); + if (!subpage_info) + goto fail_alloc; + btrfs_init_subpage_info(subpage_info, sectorsize); + fs_info->subpage_info = subpage_info; } ret = btrfs_init_workqueues(fs_info, fs_devices); @@ -3465,7 +3472,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device * below in btrfs_init_dev_replace(). */ btrfs_free_extra_devids(fs_devices); - if (!fs_devices->latest_bdev) { + if (!fs_devices->latest_dev->bdev) { btrfs_err(fs_info, "failed to read devices"); goto fail_tree_roots; } @@ -3556,7 +3563,8 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_sysfs; } - if (!sb_rdonly(sb) && !btrfs_check_rw_degradable(fs_info, NULL)) { + if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices && + !btrfs_check_rw_degradable(fs_info, NULL)) { btrfs_warn(fs_info, "writable mount is not allowed due to too many missing devices"); goto fail_sysfs; @@ -3740,7 +3748,7 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, else if (ret) return ERR_PTR(ret); - if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode)) + if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev)) return ERR_PTR(-EINVAL); page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS); @@ -3881,7 +3889,9 @@ static int write_dev_supers(struct btrfs_device *device, bio->bi_opf |= REQ_FUA; btrfsic_submit_bio(bio); - btrfs_advance_sb_log(device, i); + + if (btrfs_advance_sb_log(device, i)) + errors++; } return errors < i ? 0 : -1; } @@ -4221,7 +4231,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, drop_ref = true; spin_unlock(&fs_info->fs_roots_radix_lock); - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + if (BTRFS_FS_ERROR(fs_info)) { ASSERT(root->log_root == NULL); if (root->reloc_root) { btrfs_put_root(root->reloc_root); @@ -4372,8 +4382,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_err(fs_info, "commit super ret %d", ret); } - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state) || - test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state)) + if (BTRFS_FS_ERROR(fs_info)) btrfs_error_commit_super(fs_info); kthread_stop(fs_info->transaction_kthread); @@ -4470,7 +4479,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags))) return; #endif - btrfs_assert_tree_locked(buf); + btrfs_assert_tree_write_locked(buf); if (transid != fs_info->generation) WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, found %llu running %llu\n", buf->start, transid, fs_info->generation); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 0e7e9526b6a8..a2b5db4ba262 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -6,9 +6,6 @@ #ifndef BTRFS_DISK_IO_H #define BTRFS_DISK_IO_H -#define BTRFS_SUPER_INFO_OFFSET SZ_64K -#define BTRFS_SUPER_INFO_SIZE 4096 - #define BTRFS_SUPER_MIRROR_MAX 3 #define BTRFS_SUPER_MIRROR_SHIFT 12 @@ -81,7 +78,7 @@ void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info); void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info); void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); -int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, +int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, struct page *page, u64 start, u64 end, int mirror); blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index fc3da7585fb7..3fd736a02c1e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1266,7 +1266,7 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, return ret; } -static int do_discard_extent(struct btrfs_bio_stripe *stripe, u64 *bytes) +static int do_discard_extent(struct btrfs_io_stripe *stripe, u64 *bytes) { struct btrfs_device *dev = stripe->dev; struct btrfs_fs_info *fs_info = dev->fs_info; @@ -1313,22 +1313,21 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 discarded_bytes = 0; u64 end = bytenr + num_bytes; u64 cur = bytenr; - struct btrfs_bio *bbio = NULL; - + struct btrfs_io_context *bioc = NULL; /* - * Avoid races with device replace and make sure our bbio has devices + * Avoid races with device replace and make sure our bioc has devices * associated to its stripes that don't go away while we are discarding. */ btrfs_bio_counter_inc_blocked(fs_info); while (cur < end) { - struct btrfs_bio_stripe *stripe; + struct btrfs_io_stripe *stripe; int i; num_bytes = end - cur; /* Tell the block device(s) that the sectors can be discarded */ ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, cur, - &num_bytes, &bbio, 0); + &num_bytes, &bioc, 0); /* * Error can be -ENOMEM, -ENOENT (no such chunk mapping) or * -EOPNOTSUPP. For any such error, @num_bytes is not updated, @@ -1337,8 +1336,8 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, if (ret < 0) goto out; - stripe = bbio->stripes; - for (i = 0; i < bbio->num_stripes; i++, stripe++) { + stripe = bioc->stripes; + for (i = 0; i < bioc->num_stripes; i++, stripe++) { u64 bytes; struct btrfs_device *device = stripe->dev; @@ -1361,7 +1360,7 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, * And since there are two loops, explicitly * go to out to avoid confusion. */ - btrfs_put_bbio(bbio); + btrfs_put_bioc(bioc); goto out; } @@ -1372,7 +1371,7 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, */ ret = 0; } - btrfs_put_bbio(bbio); + btrfs_put_bioc(bioc); cur += num_bytes; } out: @@ -1397,7 +1396,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, ASSERT(generic_ref->type != BTRFS_REF_NOT_SET && generic_ref->action); BUG_ON(generic_ref->type == BTRFS_REF_METADATA && - generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID); + generic_ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID); if (generic_ref->type == BTRFS_REF_METADATA) ret = btrfs_add_delayed_tree_ref(trans, generic_ref, NULL); @@ -2376,7 +2375,7 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, out: btrfs_free_path(path); - if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) + if (btrfs_is_data_reloc_root(root)) WARN_ON(ret > 0); return ret; } @@ -2438,10 +2437,9 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, key.offset -= btrfs_file_extent_offset(buf, fi); btrfs_init_generic_ref(&generic_ref, action, bytenr, num_bytes, parent); - generic_ref.real_root = root->root_key.objectid; btrfs_init_data_ref(&generic_ref, ref_root, key.objectid, - key.offset); - generic_ref.skip_qgroup = for_reloc; + key.offset, root->root_key.objectid, + for_reloc); if (inc) ret = btrfs_inc_extent_ref(trans, &generic_ref); else @@ -2453,9 +2451,8 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, num_bytes = fs_info->nodesize; btrfs_init_generic_ref(&generic_ref, action, bytenr, num_bytes, parent); - generic_ref.real_root = root->root_key.objectid; - btrfs_init_tree_ref(&generic_ref, level - 1, ref_root); - generic_ref.skip_qgroup = for_reloc; + btrfs_init_tree_ref(&generic_ref, level - 1, ref_root, + root->root_key.objectid, for_reloc); if (inc) ret = btrfs_inc_extent_ref(trans, &generic_ref); else @@ -3196,7 +3193,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, goto out; } - ret = btrfs_update_block_group(trans, bytenr, num_bytes, 0); + ret = btrfs_update_block_group(trans, bytenr, num_bytes, false); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -3289,7 +3286,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF, buf->start, buf->len, parent); btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf), - root->root_key.objectid); + root->root_key.objectid, 0, false); if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { btrfs_ref_tree_mod(fs_info, &generic_ref); @@ -3373,9 +3370,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) * tree, just update pinning info and exit early. */ if ((ref->type == BTRFS_REF_METADATA && - ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) || + ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) || (ref->type == BTRFS_REF_DATA && - ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) { + ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID)) { /* unlocks the pinned mutex */ btrfs_pin_extent(trans, ref->bytenr, ref->len, 1); ret = 0; @@ -3386,9 +3383,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref) } if (!((ref->type == BTRFS_REF_METADATA && - ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) || + ref->tree_ref.owning_root == BTRFS_TREE_LOG_OBJECTID) || (ref->type == BTRFS_REF_DATA && - ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID))) + ref->data_ref.owning_root == BTRFS_TREE_LOG_OBJECTID))) btrfs_ref_tree_mod(fs_info, ref); return ret; @@ -3476,7 +3473,9 @@ enum btrfs_extent_allocation_policy { */ struct find_free_extent_ctl { /* Basic allocation info */ + u64 ram_bytes; u64 num_bytes; + u64 min_alloc_size; u64 empty_size; u64 flags; int delalloc; @@ -3495,6 +3494,9 @@ struct find_free_extent_ctl { /* Allocation is called for tree-log */ bool for_treelog; + /* Allocation is called for data relocation */ + bool for_data_reloc; + /* RAID index, converted from flags */ int index; @@ -3756,8 +3758,9 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, u64 avail; u64 bytenr = block_group->start; u64 log_bytenr; + u64 data_reloc_bytenr; int ret = 0; - bool skip; + bool skip = false; ASSERT(btrfs_is_zoned(block_group->fs_info)); @@ -3767,19 +3770,49 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, */ spin_lock(&fs_info->treelog_bg_lock); log_bytenr = fs_info->treelog_bg; - skip = log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) || - (!ffe_ctl->for_treelog && bytenr == log_bytenr)); + if (log_bytenr && ((ffe_ctl->for_treelog && bytenr != log_bytenr) || + (!ffe_ctl->for_treelog && bytenr == log_bytenr))) + skip = true; spin_unlock(&fs_info->treelog_bg_lock); if (skip) return 1; + /* + * Do not allow non-relocation blocks in the dedicated relocation block + * group, and vice versa. + */ + spin_lock(&fs_info->relocation_bg_lock); + data_reloc_bytenr = fs_info->data_reloc_bg; + if (data_reloc_bytenr && + ((ffe_ctl->for_data_reloc && bytenr != data_reloc_bytenr) || + (!ffe_ctl->for_data_reloc && bytenr == data_reloc_bytenr))) + skip = true; + spin_unlock(&fs_info->relocation_bg_lock); + if (skip) + return 1; + /* Check RO and no space case before trying to activate it */ + spin_lock(&block_group->lock); + if (block_group->ro || + block_group->alloc_offset == block_group->zone_capacity) { + spin_unlock(&block_group->lock); + return 1; + } + spin_unlock(&block_group->lock); + + if (!btrfs_zone_activate(block_group)) + return 1; + spin_lock(&space_info->lock); spin_lock(&block_group->lock); spin_lock(&fs_info->treelog_bg_lock); + spin_lock(&fs_info->relocation_bg_lock); ASSERT(!ffe_ctl->for_treelog || block_group->start == fs_info->treelog_bg || fs_info->treelog_bg == 0); + ASSERT(!ffe_ctl->for_data_reloc || + block_group->start == fs_info->data_reloc_bg || + fs_info->data_reloc_bg == 0); if (block_group->ro) { ret = 1; @@ -3796,7 +3829,18 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, goto out; } - avail = block_group->length - block_group->alloc_offset; + /* + * Do not allow currently used block group to be the data relocation + * dedicated block group. + */ + if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg && + (block_group->used || block_group->reserved)) { + ret = 1; + goto out; + } + + WARN_ON_ONCE(block_group->alloc_offset > block_group->zone_capacity); + avail = block_group->zone_capacity - block_group->alloc_offset; if (avail < num_bytes) { if (ffe_ctl->max_extent_size < avail) { /* @@ -3813,6 +3857,9 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, if (ffe_ctl->for_treelog && !fs_info->treelog_bg) fs_info->treelog_bg = block_group->start; + if (ffe_ctl->for_data_reloc && !fs_info->data_reloc_bg) + fs_info->data_reloc_bg = block_group->start; + ffe_ctl->found_offset = start + block_group->alloc_offset; block_group->alloc_offset += num_bytes; spin_lock(&ctl->tree_lock); @@ -3829,6 +3876,9 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, out: if (ret && ffe_ctl->for_treelog) fs_info->treelog_bg = 0; + if (ret && ffe_ctl->for_data_reloc) + fs_info->data_reloc_bg = 0; + spin_unlock(&fs_info->relocation_bg_lock); spin_unlock(&fs_info->treelog_bg_lock); spin_unlock(&block_group->lock); spin_unlock(&space_info->lock); @@ -3932,18 +3982,30 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg) ffe_ctl->orig_have_caching_bg = true; - if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT && - ffe_ctl->have_caching_bg) - return 1; - - if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES) - return 1; - if (ins->objectid) { found_extent(ffe_ctl, ins); return 0; } + if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size && + !btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->index)) { + /* + * If we have enough free space left in an already active block + * group and we can't activate any other zone now, retry the + * active ones with a smaller allocation size. Returning early + * from here will tell btrfs_reserve_extent() to haven the + * size. + */ + return -ENOSPC; + } + + if (ffe_ctl->loop >= LOOP_CACHING_WAIT && ffe_ctl->have_caching_bg) + return 1; + + ffe_ctl->index++; + if (ffe_ctl->index < BTRFS_NR_RAID_TYPES) + return 1; + /* * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking * caching kthreads as we move along @@ -4085,6 +4147,12 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info, ffe_ctl->hint_byte = fs_info->treelog_bg; spin_unlock(&fs_info->treelog_bg_lock); } + if (ffe_ctl->for_data_reloc) { + spin_lock(&fs_info->relocation_bg_lock); + if (fs_info->data_reloc_bg) + ffe_ctl->hint_byte = fs_info->data_reloc_bg; + spin_unlock(&fs_info->relocation_bg_lock); + } return 0; default: BUG(); @@ -4117,65 +4185,62 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info, * |- If not found, re-iterate all block groups */ static noinline int find_free_extent(struct btrfs_root *root, - u64 ram_bytes, u64 num_bytes, u64 empty_size, - u64 hint_byte_orig, struct btrfs_key *ins, - u64 flags, int delalloc) + struct btrfs_key *ins, + struct find_free_extent_ctl *ffe_ctl) { struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; int cache_block_group_error = 0; struct btrfs_block_group *block_group = NULL; - struct find_free_extent_ctl ffe_ctl = {0}; struct btrfs_space_info *space_info; bool full_search = false; - bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); - WARN_ON(num_bytes < fs_info->sectorsize); - - ffe_ctl.num_bytes = num_bytes; - ffe_ctl.empty_size = empty_size; - ffe_ctl.flags = flags; - ffe_ctl.search_start = 0; - ffe_ctl.delalloc = delalloc; - ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags); - ffe_ctl.have_caching_bg = false; - ffe_ctl.orig_have_caching_bg = false; - ffe_ctl.found_offset = 0; - ffe_ctl.hint_byte = hint_byte_orig; - ffe_ctl.for_treelog = for_treelog; - ffe_ctl.policy = BTRFS_EXTENT_ALLOC_CLUSTERED; + WARN_ON(ffe_ctl->num_bytes < fs_info->sectorsize); + ffe_ctl->search_start = 0; /* For clustered allocation */ - ffe_ctl.retry_clustered = false; - ffe_ctl.retry_unclustered = false; - ffe_ctl.last_ptr = NULL; - ffe_ctl.use_cluster = true; + ffe_ctl->empty_cluster = 0; + ffe_ctl->last_ptr = NULL; + ffe_ctl->use_cluster = true; + ffe_ctl->have_caching_bg = false; + ffe_ctl->orig_have_caching_bg = false; + ffe_ctl->index = btrfs_bg_flags_to_raid_index(ffe_ctl->flags); + ffe_ctl->loop = 0; + /* For clustered allocation */ + ffe_ctl->retry_clustered = false; + ffe_ctl->retry_unclustered = false; + ffe_ctl->cached = 0; + ffe_ctl->max_extent_size = 0; + ffe_ctl->total_free_space = 0; + ffe_ctl->found_offset = 0; + ffe_ctl->policy = BTRFS_EXTENT_ALLOC_CLUSTERED; if (btrfs_is_zoned(fs_info)) - ffe_ctl.policy = BTRFS_EXTENT_ALLOC_ZONED; + ffe_ctl->policy = BTRFS_EXTENT_ALLOC_ZONED; ins->type = BTRFS_EXTENT_ITEM_KEY; ins->objectid = 0; ins->offset = 0; - trace_find_free_extent(root, num_bytes, empty_size, flags); + trace_find_free_extent(root, ffe_ctl->num_bytes, ffe_ctl->empty_size, + ffe_ctl->flags); - space_info = btrfs_find_space_info(fs_info, flags); + space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags); if (!space_info) { - btrfs_err(fs_info, "No space info for %llu", flags); + btrfs_err(fs_info, "No space info for %llu", ffe_ctl->flags); return -ENOSPC; } - ret = prepare_allocation(fs_info, &ffe_ctl, space_info, ins); + ret = prepare_allocation(fs_info, ffe_ctl, space_info, ins); if (ret < 0) return ret; - ffe_ctl.search_start = max(ffe_ctl.search_start, - first_logical_byte(fs_info, 0)); - ffe_ctl.search_start = max(ffe_ctl.search_start, ffe_ctl.hint_byte); - if (ffe_ctl.search_start == ffe_ctl.hint_byte) { + ffe_ctl->search_start = max(ffe_ctl->search_start, + first_logical_byte(fs_info, 0)); + ffe_ctl->search_start = max(ffe_ctl->search_start, ffe_ctl->hint_byte); + if (ffe_ctl->search_start == ffe_ctl->hint_byte) { block_group = btrfs_lookup_block_group(fs_info, - ffe_ctl.search_start); + ffe_ctl->search_start); /* * we don't want to use the block group if it doesn't match our * allocation bits, or if its not cached. @@ -4183,7 +4248,7 @@ static noinline int find_free_extent(struct btrfs_root *root, * However if we are re-searching with an ideal block group * picked out then we don't care that the block group is cached. */ - if (block_group && block_group_bits(block_group, flags) && + if (block_group && block_group_bits(block_group, ffe_ctl->flags) && block_group->cached != BTRFS_CACHE_NO) { down_read(&space_info->groups_sem); if (list_empty(&block_group->list) || @@ -4197,9 +4262,10 @@ static noinline int find_free_extent(struct btrfs_root *root, btrfs_put_block_group(block_group); up_read(&space_info->groups_sem); } else { - ffe_ctl.index = btrfs_bg_flags_to_raid_index( - block_group->flags); - btrfs_lock_block_group(block_group, delalloc); + ffe_ctl->index = btrfs_bg_flags_to_raid_index( + block_group->flags); + btrfs_lock_block_group(block_group, + ffe_ctl->delalloc); goto have_block_group; } } else if (block_group) { @@ -4207,31 +4273,33 @@ static noinline int find_free_extent(struct btrfs_root *root, } } search: - ffe_ctl.have_caching_bg = false; - if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) || - ffe_ctl.index == 0) + ffe_ctl->have_caching_bg = false; + if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) || + ffe_ctl->index == 0) full_search = true; down_read(&space_info->groups_sem); list_for_each_entry(block_group, - &space_info->block_groups[ffe_ctl.index], list) { + &space_info->block_groups[ffe_ctl->index], list) { struct btrfs_block_group *bg_ret; /* If the block group is read-only, we can skip it entirely. */ if (unlikely(block_group->ro)) { - if (for_treelog) + if (ffe_ctl->for_treelog) btrfs_clear_treelog_bg(block_group); + if (ffe_ctl->for_data_reloc) + btrfs_clear_data_reloc_bg(block_group); continue; } - btrfs_grab_block_group(block_group, delalloc); - ffe_ctl.search_start = block_group->start; + btrfs_grab_block_group(block_group, ffe_ctl->delalloc); + ffe_ctl->search_start = block_group->start; /* * this can happen if we end up cycling through all the * raid types, but we want to make sure we only allocate * for the proper type. */ - if (!block_group_bits(block_group, flags)) { + if (!block_group_bits(block_group, ffe_ctl->flags)) { u64 extra = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID56_MASK | @@ -4242,7 +4310,7 @@ search: * doesn't provide them, bail. This does allow us to * fill raid0 from raid1. */ - if ((flags & extra) && !(block_group->flags & extra)) + if ((ffe_ctl->flags & extra) && !(block_group->flags & extra)) goto loop; /* @@ -4250,14 +4318,14 @@ search: * It's possible that we have MIXED_GROUP flag but no * block group is mixed. Just skip such block group. */ - btrfs_release_block_group(block_group, delalloc); + btrfs_release_block_group(block_group, ffe_ctl->delalloc); continue; } have_block_group: - ffe_ctl.cached = btrfs_block_group_done(block_group); - if (unlikely(!ffe_ctl.cached)) { - ffe_ctl.have_caching_bg = true; + ffe_ctl->cached = btrfs_block_group_done(block_group); + if (unlikely(!ffe_ctl->cached)) { + ffe_ctl->have_caching_bg = true; ret = btrfs_cache_block_group(block_group, 0); /* @@ -4280,10 +4348,11 @@ have_block_group: goto loop; bg_ret = NULL; - ret = do_allocation(block_group, &ffe_ctl, &bg_ret); + ret = do_allocation(block_group, ffe_ctl, &bg_ret); if (ret == 0) { if (bg_ret && bg_ret != block_group) { - btrfs_release_block_group(block_group, delalloc); + btrfs_release_block_group(block_group, + ffe_ctl->delalloc); block_group = bg_ret; } } else if (ret == -EAGAIN) { @@ -4293,46 +4362,49 @@ have_block_group: } /* Checks */ - ffe_ctl.search_start = round_up(ffe_ctl.found_offset, - fs_info->stripesize); + ffe_ctl->search_start = round_up(ffe_ctl->found_offset, + fs_info->stripesize); /* move on to the next group */ - if (ffe_ctl.search_start + num_bytes > + if (ffe_ctl->search_start + ffe_ctl->num_bytes > block_group->start + block_group->length) { btrfs_add_free_space_unused(block_group, - ffe_ctl.found_offset, num_bytes); + ffe_ctl->found_offset, + ffe_ctl->num_bytes); goto loop; } - if (ffe_ctl.found_offset < ffe_ctl.search_start) + if (ffe_ctl->found_offset < ffe_ctl->search_start) btrfs_add_free_space_unused(block_group, - ffe_ctl.found_offset, - ffe_ctl.search_start - ffe_ctl.found_offset); + ffe_ctl->found_offset, + ffe_ctl->search_start - ffe_ctl->found_offset); - ret = btrfs_add_reserved_bytes(block_group, ram_bytes, - num_bytes, delalloc); + ret = btrfs_add_reserved_bytes(block_group, ffe_ctl->ram_bytes, + ffe_ctl->num_bytes, + ffe_ctl->delalloc); if (ret == -EAGAIN) { btrfs_add_free_space_unused(block_group, - ffe_ctl.found_offset, num_bytes); + ffe_ctl->found_offset, + ffe_ctl->num_bytes); goto loop; } btrfs_inc_block_group_reservations(block_group); /* we are all good, lets return */ - ins->objectid = ffe_ctl.search_start; - ins->offset = num_bytes; + ins->objectid = ffe_ctl->search_start; + ins->offset = ffe_ctl->num_bytes; - trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start, - num_bytes); - btrfs_release_block_group(block_group, delalloc); + trace_btrfs_reserve_extent(block_group, ffe_ctl->search_start, + ffe_ctl->num_bytes); + btrfs_release_block_group(block_group, ffe_ctl->delalloc); break; loop: - release_block_group(block_group, &ffe_ctl, delalloc); + release_block_group(block_group, ffe_ctl, ffe_ctl->delalloc); cond_resched(); } up_read(&space_info->groups_sem); - ret = find_free_extent_update_loop(fs_info, ins, &ffe_ctl, full_search); + ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, full_search); if (ret > 0) goto search; @@ -4341,12 +4413,12 @@ loop: * Use ffe_ctl->total_free_space as fallback if we can't find * any contiguous hole. */ - if (!ffe_ctl.max_extent_size) - ffe_ctl.max_extent_size = ffe_ctl.total_free_space; + if (!ffe_ctl->max_extent_size) + ffe_ctl->max_extent_size = ffe_ctl->total_free_space; spin_lock(&space_info->lock); - space_info->max_extent_size = ffe_ctl.max_extent_size; + space_info->max_extent_size = ffe_ctl->max_extent_size; spin_unlock(&space_info->lock); - ins->offset = ffe_ctl.max_extent_size; + ins->offset = ffe_ctl->max_extent_size; } else if (ret == -ENOSPC) { ret = cache_block_group_error; } @@ -4404,16 +4476,28 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, struct btrfs_key *ins, int is_data, int delalloc) { struct btrfs_fs_info *fs_info = root->fs_info; + struct find_free_extent_ctl ffe_ctl = {}; bool final_tried = num_bytes == min_alloc_size; u64 flags; int ret; bool for_treelog = (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); + bool for_data_reloc = (btrfs_is_data_reloc_root(root) && is_data); flags = get_alloc_profile_by_root(root, is_data); again: WARN_ON(num_bytes < fs_info->sectorsize); - ret = find_free_extent(root, ram_bytes, num_bytes, empty_size, - hint_byte, ins, flags, delalloc); + + ffe_ctl.ram_bytes = ram_bytes; + ffe_ctl.num_bytes = num_bytes; + ffe_ctl.min_alloc_size = min_alloc_size; + ffe_ctl.empty_size = empty_size; + ffe_ctl.flags = flags; + ffe_ctl.delalloc = delalloc; + ffe_ctl.hint_byte = hint_byte; + ffe_ctl.for_treelog = for_treelog; + ffe_ctl.for_data_reloc = for_data_reloc; + + ret = find_free_extent(root, ins, &ffe_ctl); if (!ret && !is_data) { btrfs_dec_block_group_reservations(fs_info, ins->objectid); } else if (ret == -ENOSPC) { @@ -4431,8 +4515,8 @@ again: sinfo = btrfs_find_space_info(fs_info, flags); btrfs_err(fs_info, - "allocation failed flags %llu, wanted %llu tree-log %d", - flags, num_bytes, for_treelog); + "allocation failed flags %llu, wanted %llu tree-log %d, relocation: %d", + flags, num_bytes, for_treelog, for_data_reloc); if (sinfo) btrfs_dump_space_info(fs_info, sinfo, num_bytes, 1); @@ -4543,7 +4627,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, if (ret) return ret; - ret = btrfs_update_block_group(trans, ins->objectid, ins->offset, 1); + ret = btrfs_update_block_group(trans, ins->objectid, ins->offset, true); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", ins->objectid, ins->offset); @@ -4632,7 +4716,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, return ret; ret = btrfs_update_block_group(trans, extent_key.objectid, - fs_info->nodesize, 1); + fs_info->nodesize, true); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", extent_key.objectid, extent_key.offset); @@ -4655,7 +4739,8 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, ins->objectid, ins->offset, 0); - btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset); + btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, + offset, 0, false); btrfs_ref_tree_mod(root->fs_info, &generic_ref); return btrfs_add_delayed_data_ref(trans, &generic_ref, ram_bytes); @@ -4847,8 +4932,8 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, ins.objectid, ins.offset, parent); - generic_ref.real_root = root->root_key.objectid; - btrfs_init_tree_ref(&generic_ref, level, root_objectid); + btrfs_init_tree_ref(&generic_ref, level, root_objectid, + root->root_key.objectid, false); btrfs_ref_tree_mod(fs_info, &generic_ref); ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, extent_op); if (ret) @@ -4859,6 +4944,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, out_free_delayed: btrfs_free_delayed_extent_op(extent_op); out_free_buf: + btrfs_tree_unlock(buf); free_extent_buffer(buf); out_free_reserved: btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); @@ -5264,7 +5350,8 @@ skip: btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, fs_info->nodesize, parent); - btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid); + btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid, + 0, false); ret = btrfs_free_extent(trans, &ref); if (ret) goto out_unlock; @@ -5749,13 +5836,13 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, return -ENOMEM; } - btrfs_assert_tree_locked(parent); + btrfs_assert_tree_write_locked(parent); parent_level = btrfs_header_level(parent); atomic_inc(&parent->refs); path->nodes[parent_level] = parent; path->slots[parent_level] = btrfs_header_nritems(parent); - btrfs_assert_tree_locked(node); + btrfs_assert_tree_write_locked(node); level = btrfs_header_level(node); path->nodes[level] = node; path->slots[level] = 0; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index aaddd7225348..4e03a6d3aa32 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -241,7 +241,7 @@ int __init extent_io_init(void) return -ENOMEM; if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE, - offsetof(struct btrfs_io_bio, bio), + offsetof(struct btrfs_bio, bio), BIOSET_NEED_BVECS)) goto free_buffer_cache; @@ -1975,10 +1975,18 @@ static noinline int lock_delalloc_pages(struct inode *inode, /* * Find and lock a contiguous range of bytes in the file marked as delalloc, no - * more than @max_bytes. @Start and @end are used to return the range, + * more than @max_bytes. * - * Return: true if we find something - * false if nothing was in the tree + * @start: The original start bytenr to search. + * Will store the extent range start bytenr. + * @end: The original end bytenr of the search range + * Will store the extent range end bytenr. + * + * Return true if we find a delalloc range which starts inside the original + * range, and @start/@end will store the delalloc range start/end. + * + * Return false if we can't find any delalloc range which starts inside the + * original range, and @start/@end will be the non-delalloc range start/end. */ EXPORT_FOR_TESTS noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, @@ -1986,6 +1994,8 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, u64 *end) { struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; + const u64 orig_start = *start; + const u64 orig_end = *end; u64 max_bytes = BTRFS_MAX_EXTENT_SIZE; u64 delalloc_start; u64 delalloc_end; @@ -1994,15 +2004,23 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, int ret; int loops = 0; + /* Caller should pass a valid @end to indicate the search range end */ + ASSERT(orig_end > orig_start); + + /* The range should at least cover part of the page */ + ASSERT(!(orig_start >= page_offset(locked_page) + PAGE_SIZE || + orig_end <= page_offset(locked_page))); again: /* step one, find a bunch of delalloc bytes starting at start */ delalloc_start = *start; delalloc_end = 0; found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end, max_bytes, &cached_state); - if (!found || delalloc_end <= *start) { + if (!found || delalloc_end <= *start || delalloc_start > orig_end) { *start = delalloc_start; - *end = delalloc_end; + + /* @delalloc_end can be -1, never go beyond @orig_end */ + *end = min(delalloc_end, orig_end); free_extent_state(cached_state); return false; } @@ -2282,15 +2300,15 @@ int free_io_failure(struct extent_io_tree *failure_tree, * currently, there can be no more than two copies of every data bit. thus, * exactly one rewrite is required. */ -int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, - u64 length, u64 logical, struct page *page, - unsigned int pg_offset, int mirror_num) +static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, + u64 length, u64 logical, struct page *page, + unsigned int pg_offset, int mirror_num) { struct bio *bio; struct btrfs_device *dev; u64 map_length = 0; u64 sector; - struct btrfs_bio *bbio = NULL; + struct btrfs_io_context *bioc = NULL; int ret; ASSERT(!(fs_info->sb->s_flags & SB_RDONLY)); @@ -2299,12 +2317,12 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, if (btrfs_is_zoned(fs_info)) return btrfs_repair_one_zone(fs_info, logical); - bio = btrfs_io_bio_alloc(1); + bio = btrfs_bio_alloc(1); bio->bi_iter.bi_size = 0; map_length = length; /* - * Avoid races with device replace and make sure our bbio has devices + * Avoid races with device replace and make sure our bioc has devices * associated to its stripes that don't go away while we are doing the * read repair operation. */ @@ -2317,28 +2335,28 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, * stripe's dev and sector. */ ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, - &map_length, &bbio, 0); + &map_length, &bioc, 0); if (ret) { btrfs_bio_counter_dec(fs_info); bio_put(bio); return -EIO; } - ASSERT(bbio->mirror_num == 1); + ASSERT(bioc->mirror_num == 1); } else { ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, - &map_length, &bbio, mirror_num); + &map_length, &bioc, mirror_num); if (ret) { btrfs_bio_counter_dec(fs_info); bio_put(bio); return -EIO; } - BUG_ON(mirror_num != bbio->mirror_num); + BUG_ON(mirror_num != bioc->mirror_num); } - sector = bbio->stripes[bbio->mirror_num - 1].physical >> 9; + sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9; bio->bi_iter.bi_sector = sector; - dev = bbio->stripes[bbio->mirror_num - 1].dev; - btrfs_put_bbio(bbio); + dev = bioc->stripes[bioc->mirror_num - 1].dev; + btrfs_put_bioc(bioc); if (!dev || !dev->bdev || !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { btrfs_bio_counter_dec(fs_info); @@ -2618,10 +2636,10 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; - struct btrfs_io_bio *failed_io_bio = btrfs_io_bio(failed_bio); + struct btrfs_bio *failed_bbio = btrfs_bio(failed_bio); const int icsum = bio_offset >> fs_info->sectorsize_bits; struct bio *repair_bio; - struct btrfs_io_bio *repair_io_bio; + struct btrfs_bio *repair_bbio; blk_status_t status; btrfs_debug(fs_info, @@ -2639,24 +2657,23 @@ int btrfs_repair_one_sector(struct inode *inode, return -EIO; } - repair_bio = btrfs_io_bio_alloc(1); - repair_io_bio = btrfs_io_bio(repair_bio); + repair_bio = btrfs_bio_alloc(1); + repair_bbio = btrfs_bio(repair_bio); repair_bio->bi_opf = REQ_OP_READ; repair_bio->bi_end_io = failed_bio->bi_end_io; repair_bio->bi_iter.bi_sector = failrec->logical >> 9; repair_bio->bi_private = failed_bio->bi_private; - if (failed_io_bio->csum) { + if (failed_bbio->csum) { const u32 csum_size = fs_info->csum_size; - repair_io_bio->csum = repair_io_bio->csum_inline; - memcpy(repair_io_bio->csum, - failed_io_bio->csum + csum_size * icsum, csum_size); + repair_bbio->csum = repair_bbio->csum_inline; + memcpy(repair_bbio->csum, + failed_bbio->csum + csum_size * icsum, csum_size); } bio_add_page(repair_bio, page, failrec->len, pgoff); - repair_io_bio->logical = failrec->start; - repair_io_bio->iter = repair_bio->bi_iter; + repair_bbio->iter = repair_bio->bi_iter; btrfs_debug(btrfs_sb(inode->i_sb), "repair read error: submitting new read to mirror %d", @@ -2976,7 +2993,7 @@ static struct extent_buffer *find_extent_buffer_readpage( static void end_bio_extent_readpage(struct bio *bio) { struct bio_vec *bvec; - struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); + struct btrfs_bio *bbio = btrfs_bio(bio); struct extent_io_tree *tree, *failure_tree; struct processed_extent processed = { 0 }; /* @@ -3003,7 +3020,7 @@ static void end_bio_extent_readpage(struct bio *bio) btrfs_debug(fs_info, "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u", bio->bi_iter.bi_sector, bio->bi_status, - io_bio->mirror_num); + bbio->mirror_num); tree = &BTRFS_I(inode)->io_tree; failure_tree = &BTRFS_I(inode)->io_failure_tree; @@ -3028,14 +3045,14 @@ static void end_bio_extent_readpage(struct bio *bio) end = start + bvec->bv_len - 1; len = bvec->bv_len; - mirror = io_bio->mirror_num; + mirror = bbio->mirror_num; if (likely(uptodate)) { if (is_data_inode(inode)) { - error_bitmap = btrfs_verify_data_csum(io_bio, + error_bitmap = btrfs_verify_data_csum(bbio, bio_offset, page, start, end); ret = error_bitmap; } else { - ret = btrfs_validate_metadata_buffer(io_bio, + ret = btrfs_validate_metadata_buffer(bbio, page, start, end, mirror); } if (ret) @@ -3106,7 +3123,7 @@ readpage_ok: } /* Release the last extent */ endio_readpage_release_extent(&processed, NULL, 0, 0, false); - btrfs_io_bio_free_csum(io_bio); + btrfs_bio_free_csum(bbio); bio_put(bio); } @@ -3115,53 +3132,43 @@ readpage_ok: * new bio by bio_alloc_bioset as it does not initialize the bytes outside of * 'bio' because use of __GFP_ZERO is not supported. */ -static inline void btrfs_io_bio_init(struct btrfs_io_bio *btrfs_bio) +static inline void btrfs_bio_init(struct btrfs_bio *bbio) { - memset(btrfs_bio, 0, offsetof(struct btrfs_io_bio, bio)); + memset(bbio, 0, offsetof(struct btrfs_bio, bio)); } /* - * The following helpers allocate a bio. As it's backed by a bioset, it'll - * never fail. We're returning a bio right now but you can call btrfs_io_bio - * for the appropriate container_of magic + * Allocate a btrfs_io_bio, with @nr_iovecs as maximum number of iovecs. + * + * The bio allocation is backed by bioset and does not fail. */ -struct bio *btrfs_bio_alloc(u64 first_byte) +struct bio *btrfs_bio_alloc(unsigned int nr_iovecs) { struct bio *bio; - bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_VECS, &btrfs_bioset); - bio->bi_iter.bi_sector = first_byte >> 9; - btrfs_io_bio_init(btrfs_io_bio(bio)); + ASSERT(0 < nr_iovecs && nr_iovecs <= BIO_MAX_VECS); + bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset); + btrfs_bio_init(btrfs_bio(bio)); return bio; } struct bio *btrfs_bio_clone(struct bio *bio) { - struct btrfs_io_bio *btrfs_bio; + struct btrfs_bio *bbio; struct bio *new; /* Bio allocation backed by a bioset does not fail */ new = bio_clone_fast(bio, GFP_NOFS, &btrfs_bioset); - btrfs_bio = btrfs_io_bio(new); - btrfs_io_bio_init(btrfs_bio); - btrfs_bio->iter = bio->bi_iter; + bbio = btrfs_bio(new); + btrfs_bio_init(bbio); + bbio->iter = bio->bi_iter; return new; } -struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs) -{ - struct bio *bio; - - /* Bio allocation backed by a bioset does not fail */ - bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset); - btrfs_io_bio_init(btrfs_io_bio(bio)); - return bio; -} - struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size) { struct bio *bio; - struct btrfs_io_bio *btrfs_bio; + struct btrfs_bio *bbio; ASSERT(offset <= UINT_MAX && size <= UINT_MAX); @@ -3169,11 +3176,11 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size) bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset); ASSERT(bio); - btrfs_bio = btrfs_io_bio(bio); - btrfs_io_bio_init(btrfs_bio); + bbio = btrfs_bio(bio); + btrfs_bio_init(bbio); bio_trim(bio, offset >> 9, size >> 9); - btrfs_bio->iter = bio->bi_iter; + bbio->iter = bio->bi_iter; return bio; } @@ -3307,14 +3314,15 @@ static int alloc_new_bio(struct btrfs_inode *inode, struct bio *bio; int ret; + bio = btrfs_bio_alloc(BIO_MAX_VECS); /* * For compressed page range, its disk_bytenr is always @disk_bytenr * passed in, no matter if we have added any range into previous bio. */ if (bio_flags & EXTENT_BIO_COMPRESSED) - bio = btrfs_bio_alloc(disk_bytenr); + bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; else - bio = btrfs_bio_alloc(disk_bytenr + offset); + bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT; bio_ctrl->bio = bio; bio_ctrl->bio_flags = bio_flags; bio->bi_end_io = end_io_func; @@ -3327,7 +3335,7 @@ static int alloc_new_bio(struct btrfs_inode *inode, if (wbc) { struct block_device *bdev; - bdev = fs_info->fs_devices->latest_bdev; + bdev = fs_info->fs_devices->latest_dev->bdev; bio_set_dev(bio, bdev); wbc_init_bio(wbc, bio); } @@ -3341,7 +3349,7 @@ static int alloc_new_bio(struct btrfs_inode *inode, goto error; } - btrfs_io_bio(bio)->device = device; + btrfs_bio(bio)->device = device; } return 0; error: @@ -3599,6 +3607,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, bool force_bio_submit = false; u64 disk_bytenr; + ASSERT(IS_ALIGNED(cur, fs_info->sectorsize)); if (cur >= last_byte) { struct extent_state *cached = NULL; @@ -3777,17 +3786,18 @@ static void update_nr_written(struct writeback_control *wbc, */ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, struct page *page, struct writeback_control *wbc, - u64 delalloc_start, unsigned long *nr_written) + unsigned long *nr_written) { - u64 page_end = delalloc_start + PAGE_SIZE - 1; - bool found; + const u64 page_end = page_offset(page) + PAGE_SIZE - 1; + u64 delalloc_start = page_offset(page); u64 delalloc_to_write = 0; - u64 delalloc_end = 0; int ret; int page_started = 0; + while (delalloc_start < page_end) { + u64 delalloc_end = page_end; + bool found; - while (delalloc_end < page_end) { found = find_lock_delalloc_range(&inode->vfs_inode, page, &delalloc_start, &delalloc_end); @@ -3854,12 +3864,11 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info, struct page *page, u64 *start, u64 *end) { struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + struct btrfs_subpage_info *spi = fs_info->subpage_info; u64 orig_start = *start; /* Declare as unsigned long so we can use bitmap ops */ - unsigned long dirty_bitmap; unsigned long flags; - int nbits = (orig_start - page_offset(page)) >> fs_info->sectorsize_bits; - int range_start_bit = nbits; + int range_start_bit; int range_end_bit; /* @@ -3872,13 +3881,18 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info, return; } + range_start_bit = spi->dirty_offset + + (offset_in_page(orig_start) >> fs_info->sectorsize_bits); + /* We should have the page locked, but just in case */ spin_lock_irqsave(&subpage->lock, flags); - dirty_bitmap = subpage->dirty_bitmap; + bitmap_next_set_region(subpage->bitmaps, &range_start_bit, &range_end_bit, + spi->dirty_offset + spi->bitmap_nr_bits); spin_unlock_irqrestore(&subpage->lock, flags); - bitmap_next_set_region(&dirty_bitmap, &range_start_bit, &range_end_bit, - BTRFS_SUBPAGE_BITMAP_SIZE); + range_start_bit -= spi->dirty_offset; + range_end_bit -= spi->dirty_offset; + *start = page_offset(page) + range_start_bit * fs_info->sectorsize; *end = page_offset(page) + range_end_bit * fs_info->sectorsize; } @@ -4054,8 +4068,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, struct extent_page_data *epd) { struct inode *inode = page->mapping->host; - u64 start = page_offset(page); - u64 page_end = start + PAGE_SIZE - 1; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + const u64 page_start = page_offset(page); + const u64 page_end = page_start + PAGE_SIZE - 1; int ret; int nr = 0; size_t pg_offset; @@ -4090,8 +4105,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, } if (!epd->extent_locked) { - ret = writepage_delalloc(BTRFS_I(inode), page, wbc, start, - &nr_written); + ret = writepage_delalloc(BTRFS_I(inode), page, wbc, &nr_written); if (ret == 1) return 0; if (ret) @@ -4141,8 +4155,20 @@ done: * capable of that. */ if (PageError(page)) - end_extent_writepage(page, ret, start, page_end); - unlock_page(page); + end_extent_writepage(page, ret, page_start, page_end); + if (epd->extent_locked) { + /* + * If epd->extent_locked, it's from extent_write_locked_range(), + * the page can either be locked by lock_page() or + * process_one_page(). + * Let btrfs_page_unlock_writer() handle both cases. + */ + ASSERT(wbc); + btrfs_page_unlock_writer(fs_info, page, wbc->range_start, + wbc->range_end + 1 - wbc->range_start); + } else { + unlock_page(page); + } ASSERT(ret <= 0); return ret; } @@ -4155,6 +4181,9 @@ void wait_on_extent_buffer_writeback(struct extent_buffer *eb) static void end_extent_buffer_writeback(struct extent_buffer *eb) { + if (test_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags)) + btrfs_zone_finish_endio(eb->fs_info, eb->start, eb->len); + clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); smp_mb__after_atomic(); wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); @@ -4602,12 +4631,11 @@ static int submit_eb_subpage(struct page *page, int submitted = 0; u64 page_start = page_offset(page); int bit_start = 0; - const int nbits = BTRFS_SUBPAGE_BITMAP_SIZE; int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits; int ret; /* Lock and write each dirty extent buffers in the range */ - while (bit_start < nbits) { + while (bit_start < fs_info->subpage_info->bitmap_nr_bits) { struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; struct extent_buffer *eb; unsigned long flags; @@ -4623,7 +4651,8 @@ static int submit_eb_subpage(struct page *page, break; } spin_lock_irqsave(&subpage->lock, flags); - if (!((1 << bit_start) & subpage->dirty_bitmap)) { + if (!test_bit(bit_start + fs_info->subpage_info->dirty_offset, + subpage->bitmaps)) { spin_unlock_irqrestore(&subpage->lock, flags); spin_unlock(&page->mapping->private_lock); bit_start++; @@ -4756,8 +4785,13 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, free_extent_buffer(eb); return ret; } - if (cache) + if (cache) { + /* Impiles write in zoned mode */ btrfs_put_block_group(cache); + /* Mark the last eb in a block group */ + if (cache->seq_zone && eb->start + eb->len == cache->zone_capacity) + set_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags); + } ret = write_one_eb(eb, wbc, epd); free_extent_buffer(eb); if (ret < 0) @@ -4873,7 +4907,7 @@ retry: * extent io tree. Thus we don't want to submit such wild eb * if the fs already has error. */ - if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + if (!BTRFS_FS_ERROR(fs_info)) { ret = flush_write_bio(&epd); } else { ret = -EROFS; @@ -5069,23 +5103,28 @@ int extent_write_full_page(struct page *page, struct writeback_control *wbc) return ret; } -int extent_write_locked_range(struct inode *inode, u64 start, u64 end, - int mode) +/* + * Submit the pages in the range to bio for call sites which delalloc range has + * already been ran (aka, ordered extent inserted) and all pages are still + * locked. + */ +int extent_write_locked_range(struct inode *inode, u64 start, u64 end) { + bool found_error = false; + int first_error = 0; int ret = 0; struct address_space *mapping = inode->i_mapping; struct page *page; - unsigned long nr_pages = (end - start + PAGE_SIZE) >> - PAGE_SHIFT; - + u64 cur = start; + unsigned long nr_pages; + const u32 sectorsize = btrfs_sb(inode->i_sb)->sectorsize; struct extent_page_data epd = { .bio_ctrl = { 0 }, .extent_locked = 1, - .sync_io = mode == WB_SYNC_ALL, + .sync_io = 1, }; struct writeback_control wbc_writepages = { - .sync_mode = mode, - .nr_to_write = nr_pages * 2, + .sync_mode = WB_SYNC_ALL, .range_start = start, .range_end = end + 1, /* We're called from an async helper function */ @@ -5093,33 +5132,51 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end, .no_cgroup_owner = 1, }; + ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize)); + nr_pages = (round_up(end, PAGE_SIZE) - round_down(start, PAGE_SIZE)) >> + PAGE_SHIFT; + wbc_writepages.nr_to_write = nr_pages * 2; + wbc_attach_fdatawrite_inode(&wbc_writepages, inode); - while (start <= end) { - page = find_get_page(mapping, start >> PAGE_SHIFT); - if (clear_page_dirty_for_io(page)) - ret = __extent_writepage(page, &wbc_writepages, &epd); - else { - btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), - page, start, start + PAGE_SIZE - 1, true); - unlock_page(page); + while (cur <= end) { + u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end); + + page = find_get_page(mapping, cur >> PAGE_SHIFT); + /* + * All pages in the range are locked since + * btrfs_run_delalloc_range(), thus there is no way to clear + * the page dirty flag. + */ + ASSERT(PageLocked(page)); + ASSERT(PageDirty(page)); + clear_page_dirty_for_io(page); + ret = __extent_writepage(page, &wbc_writepages, &epd); + ASSERT(ret <= 0); + if (ret < 0) { + found_error = true; + first_error = ret; } put_page(page); - start += PAGE_SIZE; + cur = cur_end + 1; } - ASSERT(ret <= 0); - if (ret == 0) + if (!found_error) ret = flush_write_bio(&epd); else end_write_bio(&epd, ret); wbc_detach_inode(&wbc_writepages); + if (found_error) + return first_error; return ret; } int extent_writepages(struct address_space *mapping, struct writeback_control *wbc) { + struct inode *inode = mapping->host; + const bool data_reloc = btrfs_is_data_reloc_root(BTRFS_I(inode)->root); + const bool zoned = btrfs_is_zoned(BTRFS_I(inode)->root->fs_info); int ret = 0; struct extent_page_data epd = { .bio_ctrl = { 0 }, @@ -5127,7 +5184,15 @@ int extent_writepages(struct address_space *mapping, .sync_io = wbc->sync_mode == WB_SYNC_ALL, }; + /* + * Allow only a single thread to do the reloc work in zoned mode to + * protect the write pointer updates. + */ + if (data_reloc && zoned) + btrfs_inode_lock(inode, 0); ret = extent_write_cache_pages(mapping, wbc, &epd); + if (data_reloc && zoned) + btrfs_inode_unlock(inode, 0); ASSERT(ret <= 0); if (ret < 0) { end_write_bio(&epd, ret); @@ -6137,13 +6202,15 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, * page, but it may change in the future for 16K page size * support, so we still preallocate the memory in the loop. */ - ret = btrfs_alloc_subpage(fs_info, &prealloc, - BTRFS_SUBPAGE_METADATA); - if (ret < 0) { - unlock_page(p); - put_page(p); - exists = ERR_PTR(ret); - goto free_eb; + if (fs_info->sectorsize < PAGE_SIZE) { + prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA); + if (IS_ERR(prealloc)) { + ret = PTR_ERR(prealloc); + unlock_page(p); + put_page(p); + exists = ERR_PTR(ret); + goto free_eb; + } } spin_lock(&mapping->private_lock); @@ -7167,32 +7234,41 @@ void memmove_extent_buffer(const struct extent_buffer *dst, } } +#define GANG_LOOKUP_SIZE 16 static struct extent_buffer *get_next_extent_buffer( struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr) { - struct extent_buffer *gang[BTRFS_SUBPAGE_BITMAP_SIZE]; + struct extent_buffer *gang[GANG_LOOKUP_SIZE]; struct extent_buffer *found = NULL; u64 page_start = page_offset(page); - int ret; - int i; + u64 cur = page_start; ASSERT(in_range(bytenr, page_start, PAGE_SIZE)); - ASSERT(PAGE_SIZE / fs_info->nodesize <= BTRFS_SUBPAGE_BITMAP_SIZE); lockdep_assert_held(&fs_info->buffer_lock); - ret = radix_tree_gang_lookup(&fs_info->buffer_radix, (void **)gang, - bytenr >> fs_info->sectorsize_bits, - PAGE_SIZE / fs_info->nodesize); - for (i = 0; i < ret; i++) { - /* Already beyond page end */ - if (gang[i]->start >= page_start + PAGE_SIZE) - break; - /* Found one */ - if (gang[i]->start >= bytenr) { - found = gang[i]; - break; + while (cur < page_start + PAGE_SIZE) { + int ret; + int i; + + ret = radix_tree_gang_lookup(&fs_info->buffer_radix, + (void **)gang, cur >> fs_info->sectorsize_bits, + min_t(unsigned int, GANG_LOOKUP_SIZE, + PAGE_SIZE / fs_info->nodesize)); + if (ret == 0) + goto out; + for (i = 0; i < ret; i++) { + /* Already beyond page end */ + if (gang[i]->start >= page_start + PAGE_SIZE) + goto out; + /* Found one */ + if (gang[i]->start >= bytenr) { + found = gang[i]; + goto out; + } } + cur = gang[ret - 1]->start + gang[ret - 1]->len; } +out: return found; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 53abdc280451..0399cf8e3c32 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -32,6 +32,7 @@ enum { /* write IO error */ EXTENT_BUFFER_WRITE_ERR, EXTENT_BUFFER_NO_CHECK, + EXTENT_BUFFER_ZONE_FINISH, }; /* these are flags for __process_pages_contig */ @@ -183,8 +184,7 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, struct btrfs_bio_ctrl *bio_ctrl, unsigned int read_flags, u64 *prev_em_start); int extent_write_full_page(struct page *page, struct writeback_control *wbc); -int extent_write_locked_range(struct inode *inode, u64 start, u64 end, - int mode); +int extent_write_locked_range(struct inode *inode, u64 start, u64 end); int extent_writepages(struct address_space *mapping, struct writeback_control *wbc); int btree_write_cache_pages(struct address_space *mapping, @@ -277,14 +277,10 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, struct page *locked_page, u32 bits_to_clear, unsigned long page_ops); -struct bio *btrfs_bio_alloc(u64 first_byte); -struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs); +struct bio *btrfs_bio_alloc(unsigned int nr_iovecs); struct bio *btrfs_bio_clone(struct bio *bio); struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size); -int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, - u64 length, u64 logical, struct page *page, - unsigned int pg_offset, int mirror_num); void end_extent_writepage(struct page *page, int err, u64 start, u64 end); int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num); diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 4a8e02f7b6c7..5a36add21305 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -360,7 +360,7 @@ static void extent_map_device_set_bits(struct extent_map *em, unsigned bits) int i; for (i = 0; i < map->num_stripes; i++) { - struct btrfs_bio_stripe *stripe = &map->stripes[i]; + struct btrfs_io_stripe *stripe = &map->stripes[i]; struct btrfs_device *device = stripe->dev; set_extent_bits_nowait(&device->alloc_state, stripe->physical, @@ -375,7 +375,7 @@ static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits) int i; for (i = 0; i < map->num_stripes; i++) { - struct btrfs_bio_stripe *stripe = &map->stripes[i]; + struct btrfs_io_stripe *stripe = &map->stripes[i]; struct btrfs_device *device = stripe->dev; __clear_extent_bit(&device->alloc_state, stripe->physical, diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 2673c6ba7a4e..d1cbb64a78f3 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -358,7 +358,7 @@ static int search_file_offset_in_bio(struct bio *bio, struct inode *inode, * @dst: Buffer of size nblocks * btrfs_super_csum_size() used to return * checksum (nblocks = bio->bi_iter.bi_size / fs_info->sectorsize). If * NULL, the checksum buffer is allocated and returned in - * btrfs_io_bio(bio)->csum instead. + * btrfs_bio(bio)->csum instead. * * Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise. */ @@ -397,19 +397,18 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst return BLK_STS_RESOURCE; if (!dst) { - struct btrfs_io_bio *btrfs_bio = btrfs_io_bio(bio); + struct btrfs_bio *bbio = btrfs_bio(bio); if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { - btrfs_bio->csum = kmalloc_array(nblocks, csum_size, - GFP_NOFS); - if (!btrfs_bio->csum) { + bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS); + if (!bbio->csum) { btrfs_free_path(path); return BLK_STS_RESOURCE; } } else { - btrfs_bio->csum = btrfs_bio->csum_inline; + bbio->csum = bbio->csum_inline; } - csum = btrfs_bio->csum; + csum = bbio->csum; } else { csum = dst; } @@ -665,7 +664,18 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, if (!ordered) { ordered = btrfs_lookup_ordered_extent(inode, offset); - BUG_ON(!ordered); /* Logic error */ + /* + * The bio range is not covered by any ordered extent, + * must be a code logic error. + */ + if (unlikely(!ordered)) { + WARN(1, KERN_WARNING + "no ordered extent for root %llu ino %llu offset %llu\n", + inode->root->root_key.objectid, + btrfs_ino(inode), offset); + kvfree(sums); + return BLK_STS_IOERR; + } } nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, @@ -698,12 +708,12 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, index = 0; } - data = kmap_atomic(bvec.bv_page); - crypto_shash_digest(shash, data + bvec.bv_offset - + (i * fs_info->sectorsize), + data = bvec_kmap_local(&bvec); + crypto_shash_digest(shash, + data + (i * fs_info->sectorsize), fs_info->sectorsize, sums->sums + index); - kunmap_atomic(data); + kunmap_local(data); index += fs_info->csum_size; offset += fs_info->sectorsize; this_sum_bytes += fs_info->sectorsize; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 7ff577005d0f..581662d16b72 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -437,9 +437,15 @@ static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes, /* * unlocks pages after btrfs_file_write is done with them */ -static void btrfs_drop_pages(struct page **pages, size_t num_pages) +static void btrfs_drop_pages(struct btrfs_fs_info *fs_info, + struct page **pages, size_t num_pages, + u64 pos, u64 copied) { size_t i; + u64 block_start = round_down(pos, fs_info->sectorsize); + u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start; + + ASSERT(block_len <= U32_MAX); for (i = 0; i < num_pages; i++) { /* page checked is some magic around finding pages that * have been modified without going through btrfs_set_page_dirty @@ -447,7 +453,8 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages) * accessed as prepare_pages should have marked them accessed * in prepare_pages via find_or_create_page() */ - ClearPageChecked(pages[i]); + btrfs_page_clamp_clear_checked(fs_info, pages[i], block_start, + block_len); unlock_page(pages[i]); put_page(pages[i]); } @@ -504,7 +511,7 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, struct page *p = pages[i]; btrfs_page_clamp_set_uptodate(fs_info, p, start_pos, num_bytes); - ClearPageChecked(p); + btrfs_page_clamp_clear_checked(fs_info, p, start_pos, num_bytes); btrfs_page_clamp_set_dirty(fs_info, p, start_pos, num_bytes); } @@ -734,8 +741,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, if (args->start >= inode->disk_i_size && !args->replace_extent) modify_tree = 0; - update_refs = (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) || - root == fs_info->tree_root); + update_refs = (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID); while (1) { recow = 0; ret = btrfs_lookup_file_extent(trans, root, path, ino, @@ -870,7 +876,8 @@ next_slot: btrfs_init_data_ref(&ref, root->root_key.objectid, new_key.objectid, - args->start - extent_offset); + args->start - extent_offset, + 0, false); ret = btrfs_inc_extent_ref(trans, &ref); BUG_ON(ret); /* -ENOMEM */ } @@ -956,7 +963,8 @@ delete_extent_item: btrfs_init_data_ref(&ref, root->root_key.objectid, key.objectid, - key.offset - extent_offset); + key.offset - extent_offset, 0, + false); ret = btrfs_free_extent(trans, &ref); BUG_ON(ret); /* -ENOMEM */ args->bytes_found += extent_end - key.offset; @@ -1021,8 +1029,7 @@ delete_extent_item: if (btrfs_comp_cpu_keys(&key, &slot_key) > 0) path->slots[0]++; } - setup_items_for_insert(root, path, &key, - &args->extent_item_size, 1); + btrfs_setup_item_for_insert(root, path, &key, args->extent_item_size); args->extent_inserted = true; } @@ -1233,7 +1240,7 @@ again: btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr, num_bytes, 0); btrfs_init_data_ref(&ref, root->root_key.objectid, ino, - orig_offset); + orig_offset, 0, false); ret = btrfs_inc_extent_ref(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -1258,7 +1265,8 @@ again: other_end = 0; btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, num_bytes, 0); - btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset); + btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset, + 0, false); if (extent_mergeable(leaf, path->slots[0] + 1, ino, bytenr, orig_offset, &other_start, &other_end)) { @@ -1710,7 +1718,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, * Fault pages before locking them in prepare_pages * to avoid recursive lock */ - if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) { + if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) { ret = -EFAULT; break; } @@ -1845,7 +1853,7 @@ again: btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); if (ret) { - btrfs_drop_pages(pages, num_pages); + btrfs_drop_pages(fs_info, pages, num_pages, pos, copied); break; } @@ -1853,7 +1861,7 @@ again: if (only_release_metadata) btrfs_check_nocow_unlock(BTRFS_I(inode)); - btrfs_drop_pages(pages, num_pages); + btrfs_drop_pages(fs_info, pages, num_pages, pos, copied); cond_resched(); @@ -1957,7 +1965,7 @@ relock: } dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops, - 0); + 0, 0); btrfs_inode_unlock(inode, ilock_flags); @@ -2013,7 +2021,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, * have opened a file as writable, we have to stop this write operation * to ensure consistency. */ - if (test_bit(BTRFS_FS_STATE_ERROR, &inode->root->fs_info->fs_state)) + if (BTRFS_FS_ERROR(inode->root->fs_info)) return -EROFS; if (!(iocb->ki_flags & IOCB_DIRECT) && @@ -2621,7 +2629,7 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, extent_info->disk_len, 0); ref_offset = extent_info->file_offset - extent_info->data_offset; btrfs_init_data_ref(&ref, root->root_key.objectid, - btrfs_ino(inode), ref_offset); + btrfs_ino(inode), ref_offset, 0, false); ret = btrfs_inc_extent_ref(trans, &ref); } @@ -2704,14 +2712,16 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, drop_args.bytes_found); if (ret != -ENOSPC) { /* - * When cloning we want to avoid transaction aborts when - * nothing was done and we are attempting to clone parts - * of inline extents, in such cases -EOPNOTSUPP is - * returned by __btrfs_drop_extents() without having - * changed anything in the file. + * The only time we don't want to abort is if we are + * attempting to clone a partial inline extent, in which + * case we'll get EOPNOTSUPP. However if we aren't + * clone we need to abort no matter what, because if we + * got EOPNOTSUPP via prealloc then we messed up and + * need to abort. */ - if (extent_info && !extent_info->is_new_extent && - ret && ret != -EOPNOTSUPP) + if (ret && + (ret != -EOPNOTSUPP || + (extent_info && extent_info->is_new_extent))) btrfs_abort_transaction(trans, ret); break; } @@ -3658,7 +3668,8 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to) return 0; btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); - ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops, 0); + ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops, + 0, 0); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); return ret; } diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index da0eee7c9e5f..f3fee88c8ee0 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -22,6 +22,7 @@ #include "delalloc-space.h" #include "block-group.h" #include "discard.h" +#include "subpage.h" #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) #define MAX_CACHE_BYTES_PER_GIG SZ_64K @@ -411,7 +412,10 @@ static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl) for (i = 0; i < io_ctl->num_pages; i++) { if (io_ctl->pages[i]) { - ClearPageChecked(io_ctl->pages[i]); + btrfs_page_clear_checked(io_ctl->fs_info, + io_ctl->pages[i], + page_offset(io_ctl->pages[i]), + PAGE_SIZE); unlock_page(io_ctl->pages[i]); put_page(io_ctl->pages[i]); } @@ -2539,10 +2543,16 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, u64 offset = bytenr - block_group->start; u64 to_free, to_unusable; const int bg_reclaim_threshold = READ_ONCE(fs_info->bg_reclaim_threshold); + bool initial = (size == block_group->length); + u64 reclaimable_unusable; + + WARN_ON(!initial && offset + size > block_group->zone_capacity); spin_lock(&ctl->tree_lock); if (!used) to_free = size; + else if (initial) + to_free = block_group->zone_capacity; else if (offset >= block_group->alloc_offset) to_free = size; else if (offset + size <= block_group->alloc_offset) @@ -2565,12 +2575,15 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, spin_unlock(&block_group->lock); } + reclaimable_unusable = block_group->zone_unusable - + (block_group->length - block_group->zone_capacity); /* All the region is now unusable. Mark it as unused and reclaim */ if (block_group->zone_unusable == block_group->length) { btrfs_mark_bg_unused(block_group); } else if (bg_reclaim_threshold && - block_group->zone_unusable >= - div_factor_fine(block_group->length, bg_reclaim_threshold)) { + reclaimable_unusable >= + div_factor_fine(block_group->zone_capacity, + bg_reclaim_threshold)) { btrfs_mark_bg_to_reclaim(block_group); } @@ -2754,8 +2767,9 @@ void btrfs_dump_free_space(struct btrfs_block_group *block_group, * out the free space after the allocation offset. */ if (btrfs_is_zoned(fs_info)) { - btrfs_info(fs_info, "free space %llu", - block_group->length - block_group->alloc_offset); + btrfs_info(fs_info, "free space %llu active %d", + block_group->zone_capacity - block_group->alloc_offset, + block_group->zone_is_active); return; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 487533c35ddb..b8c911a4a320 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6,6 +6,7 @@ #include <crypto/hash.h> #include <linux/kernel.h> #include <linux/bio.h> +#include <linux/blk-cgroup.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/pagemap.h> @@ -287,8 +288,9 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, cur_size = min_t(unsigned long, compressed_size, PAGE_SIZE); - kaddr = page_address(cpage); + kaddr = kmap_atomic(cpage); write_extent_buffer(leaf, kaddr, ptr, cur_size); + kunmap_atomic(kaddr); i++; ptr += cur_size; @@ -455,11 +457,10 @@ struct async_chunk { struct list_head extents; struct cgroup_subsys_state *blkcg_css; struct btrfs_work work; - atomic_t *pending; + struct async_cow *async_cow; }; struct async_cow { - /* Number of chunks in flight; must be first in the structure */ atomic_t num_chunks; struct async_chunk chunks[]; }; @@ -490,9 +491,6 @@ static noinline int add_async_extent(struct async_chunk *cow, */ static inline bool inode_can_compress(struct btrfs_inode *inode) { - /* Subpage doesn't support compression yet */ - if (inode->root->fs_info->sectorsize < PAGE_SIZE) - return false; if (inode->flags & BTRFS_INODE_NODATACOW || inode->flags & BTRFS_INODE_NODATASUM) return false; @@ -514,6 +512,38 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, btrfs_ino(inode)); return 0; } + /* + * Special check for subpage. + * + * We lock the full page then run each delalloc range in the page, thus + * for the following case, we will hit some subpage specific corner case: + * + * 0 32K 64K + * | |///////| |///////| + * \- A \- B + * + * In above case, both range A and range B will try to unlock the full + * page [0, 64K), causing the one finished later will have page + * unlocked already, triggering various page lock requirement BUG_ON()s. + * + * So here we add an artificial limit that subpage compression can only + * if the range is fully page aligned. + * + * In theory we only need to ensure the first page is fully covered, but + * the tailing partial page will be locked until the full compression + * finishes, delaying the write of other range. + * + * TODO: Make btrfs_run_delalloc_range() to lock all delalloc range + * first to prevent any submitted async extent to unlock the full page. + * By this, we can ensure for subpage case that only the last async_cow + * will unlock the full page. + */ + if (fs_info->sectorsize < PAGE_SIZE) { + if (!IS_ALIGNED(start, PAGE_SIZE) || + !IS_ALIGNED(end + 1, PAGE_SIZE)) + return 0; + } + /* force compress */ if (btrfs_test_opt(fs_info, FORCE_COMPRESS)) return 1; @@ -615,13 +645,24 @@ again: total_compressed = actual_end - start; /* - * skip compression for a small file range(<=blocksize) that + * Skip compression for a small file range(<=blocksize) that * isn't an inline extent, since it doesn't save disk space at all. */ if (total_compressed <= blocksize && (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) goto cleanup_and_bail_uncompressed; + /* + * For subpage case, we require full page alignment for the sector + * aligned range. + * Thus we must also check against @actual_end, not just @end. + */ + if (blocksize < PAGE_SIZE) { + if (!IS_ALIGNED(start, PAGE_SIZE) || + !IS_ALIGNED(round_up(actual_end, blocksize), PAGE_SIZE)) + goto cleanup_and_bail_uncompressed; + } + total_compressed = min_t(unsigned long, total_compressed, BTRFS_MAX_UNCOMPRESSED); total_in = 0; @@ -759,7 +800,7 @@ cont: * win, compare the page count read with the blocks on disk, * compression must free at least one sector size */ - total_in = ALIGN(total_in, PAGE_SIZE); + total_in = round_up(total_in, fs_info->sectorsize); if (total_compressed + blocksize <= total_in) { compressed_extents++; @@ -840,166 +881,148 @@ static void free_async_extent_pages(struct async_extent *async_extent) async_extent->pages = NULL; } -/* - * phase two of compressed writeback. This is the ordered portion - * of the code, which only gets called in the order the work was - * queued. We walk all the async extents created by compress_file_range - * and send them down to the disk. - */ -static noinline void submit_compressed_extents(struct async_chunk *async_chunk) +static int submit_uncompressed_range(struct btrfs_inode *inode, + struct async_extent *async_extent, + struct page *locked_page) { - struct btrfs_inode *inode = BTRFS_I(async_chunk->inode); - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct async_extent *async_extent; - u64 alloc_hint = 0; - struct btrfs_key ins; - struct extent_map *em; - struct btrfs_root *root = inode->root; - struct extent_io_tree *io_tree = &inode->io_tree; - int ret = 0; - -again: - while (!list_empty(&async_chunk->extents)) { - async_extent = list_entry(async_chunk->extents.next, - struct async_extent, list); - list_del(&async_extent->list); - -retry: - lock_extent(io_tree, async_extent->start, - async_extent->start + async_extent->ram_size - 1); - /* did the compression code fall back to uncompressed IO? */ - if (!async_extent->pages) { - int page_started = 0; - unsigned long nr_written = 0; + u64 start = async_extent->start; + u64 end = async_extent->start + async_extent->ram_size - 1; + unsigned long nr_written = 0; + int page_started = 0; + int ret; - /* allocate blocks */ - ret = cow_file_range(inode, async_chunk->locked_page, - async_extent->start, - async_extent->start + - async_extent->ram_size - 1, - &page_started, &nr_written, 0); + /* + * Call cow_file_range() to run the delalloc range directly, since we + * won't go to NOCOW or async path again. + * + * Also we call cow_file_range() with @unlock_page == 0, so that we + * can directly submit them without interruption. + */ + ret = cow_file_range(inode, locked_page, start, end, &page_started, + &nr_written, 0); + /* Inline extent inserted, page gets unlocked and everything is done */ + if (page_started) { + ret = 0; + goto out; + } + if (ret < 0) { + if (locked_page) + unlock_page(locked_page); + goto out; + } - /* JDM XXX */ + ret = extent_write_locked_range(&inode->vfs_inode, start, end); + /* All pages will be unlocked, including @locked_page */ +out: + kfree(async_extent); + return ret; +} - /* - * if page_started, cow_file_range inserted an - * inline extent and took care of all the unlocking - * and IO for us. Otherwise, we need to submit - * all those pages down to the drive. - */ - if (!page_started && !ret) - extent_write_locked_range(&inode->vfs_inode, - async_extent->start, - async_extent->start + - async_extent->ram_size - 1, - WB_SYNC_ALL); - else if (ret && async_chunk->locked_page) - unlock_page(async_chunk->locked_page); - kfree(async_extent); - cond_resched(); - continue; - } +static int submit_one_async_extent(struct btrfs_inode *inode, + struct async_chunk *async_chunk, + struct async_extent *async_extent, + u64 *alloc_hint) +{ + struct extent_io_tree *io_tree = &inode->io_tree; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_key ins; + struct page *locked_page = NULL; + struct extent_map *em; + int ret = 0; + u64 start = async_extent->start; + u64 end = async_extent->start + async_extent->ram_size - 1; - ret = btrfs_reserve_extent(root, async_extent->ram_size, - async_extent->compressed_size, - async_extent->compressed_size, - 0, alloc_hint, &ins, 1, 1); - if (ret) { - free_async_extent_pages(async_extent); + /* + * If async_chunk->locked_page is in the async_extent range, we need to + * handle it. + */ + if (async_chunk->locked_page) { + u64 locked_page_start = page_offset(async_chunk->locked_page); + u64 locked_page_end = locked_page_start + PAGE_SIZE - 1; - if (ret == -ENOSPC) { - unlock_extent(io_tree, async_extent->start, - async_extent->start + - async_extent->ram_size - 1); + if (!(start >= locked_page_end || end <= locked_page_start)) + locked_page = async_chunk->locked_page; + } + lock_extent(io_tree, start, end); - /* - * we need to redirty the pages if we decide to - * fallback to uncompressed IO, otherwise we - * will not submit these pages down to lower - * layers. - */ - extent_range_redirty_for_io(&inode->vfs_inode, - async_extent->start, - async_extent->start + - async_extent->ram_size - 1); + /* We have fall back to uncompressed write */ + if (!async_extent->pages) + return submit_uncompressed_range(inode, async_extent, locked_page); - goto retry; - } - goto out_free; - } + ret = btrfs_reserve_extent(root, async_extent->ram_size, + async_extent->compressed_size, + async_extent->compressed_size, + 0, *alloc_hint, &ins, 1, 1); + if (ret) { + free_async_extent_pages(async_extent); /* - * here we're doing allocation and writeback of the - * compressed pages + * Here we used to try again by going back to non-compressed + * path for ENOSPC. But we can't reserve space even for + * compressed size, how could it work for uncompressed size + * which requires larger size? So here we directly go error + * path. */ - em = create_io_em(inode, async_extent->start, - async_extent->ram_size, /* len */ - async_extent->start, /* orig_start */ - ins.objectid, /* block_start */ - ins.offset, /* block_len */ - ins.offset, /* orig_block_len */ - async_extent->ram_size, /* ram_bytes */ - async_extent->compress_type, - BTRFS_ORDERED_COMPRESSED); - if (IS_ERR(em)) - /* ret value is not necessary due to void function */ - goto out_free_reserve; - free_extent_map(em); - - ret = btrfs_add_ordered_extent_compress(inode, - async_extent->start, - ins.objectid, - async_extent->ram_size, - ins.offset, - async_extent->compress_type); - if (ret) { - btrfs_drop_extent_cache(inode, async_extent->start, - async_extent->start + - async_extent->ram_size - 1, 0); - goto out_free_reserve; - } - btrfs_dec_block_group_reservations(fs_info, ins.objectid); + goto out_free; + } + + /* Here we're doing allocation and writeback of the compressed pages */ + em = create_io_em(inode, start, + async_extent->ram_size, /* len */ + start, /* orig_start */ + ins.objectid, /* block_start */ + ins.offset, /* block_len */ + ins.offset, /* orig_block_len */ + async_extent->ram_size, /* ram_bytes */ + async_extent->compress_type, + BTRFS_ORDERED_COMPRESSED); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out_free_reserve; + } + free_extent_map(em); - /* - * clear dirty, set writeback and unlock the pages. - */ - extent_clear_unlock_delalloc(inode, async_extent->start, - async_extent->start + - async_extent->ram_size - 1, - NULL, EXTENT_LOCKED | EXTENT_DELALLOC, - PAGE_UNLOCK | PAGE_START_WRITEBACK); - if (btrfs_submit_compressed_write(inode, async_extent->start, - async_extent->ram_size, - ins.objectid, - ins.offset, async_extent->pages, - async_extent->nr_pages, - async_chunk->write_flags, - async_chunk->blkcg_css)) { - struct page *p = async_extent->pages[0]; - const u64 start = async_extent->start; - const u64 end = start + async_extent->ram_size - 1; - - p->mapping = inode->vfs_inode.i_mapping; - btrfs_writepage_endio_finish_ordered(inode, p, start, - end, false); - - p->mapping = NULL; - extent_clear_unlock_delalloc(inode, start, end, NULL, 0, - PAGE_END_WRITEBACK | - PAGE_SET_ERROR); - free_async_extent_pages(async_extent); - } - alloc_hint = ins.objectid + ins.offset; - kfree(async_extent); - cond_resched(); + ret = btrfs_add_ordered_extent_compress(inode, start, /* file_offset */ + ins.objectid, /* disk_bytenr */ + async_extent->ram_size, /* num_bytes */ + ins.offset, /* disk_num_bytes */ + async_extent->compress_type); + if (ret) { + btrfs_drop_extent_cache(inode, start, end, 0); + goto out_free_reserve; } - return; + btrfs_dec_block_group_reservations(fs_info, ins.objectid); + + /* Clear dirty, set writeback and unlock the pages. */ + extent_clear_unlock_delalloc(inode, start, end, + NULL, EXTENT_LOCKED | EXTENT_DELALLOC, + PAGE_UNLOCK | PAGE_START_WRITEBACK); + if (btrfs_submit_compressed_write(inode, start, /* file_offset */ + async_extent->ram_size, /* num_bytes */ + ins.objectid, /* disk_bytenr */ + ins.offset, /* compressed_len */ + async_extent->pages, /* compressed_pages */ + async_extent->nr_pages, + async_chunk->write_flags, + async_chunk->blkcg_css)) { + const u64 start = async_extent->start; + const u64 end = start + async_extent->ram_size - 1; + + btrfs_writepage_endio_finish_ordered(inode, NULL, start, end, 0); + + extent_clear_unlock_delalloc(inode, start, end, NULL, 0, + PAGE_END_WRITEBACK | PAGE_SET_ERROR); + free_async_extent_pages(async_extent); + } + *alloc_hint = ins.objectid + ins.offset; + kfree(async_extent); + return ret; + out_free_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); out_free: - extent_clear_unlock_delalloc(inode, async_extent->start, - async_extent->start + - async_extent->ram_size - 1, + extent_clear_unlock_delalloc(inode, start, end, NULL, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, @@ -1007,7 +1030,39 @@ out_free: PAGE_END_WRITEBACK | PAGE_SET_ERROR); free_async_extent_pages(async_extent); kfree(async_extent); - goto again; + return ret; +} + +/* + * Phase two of compressed writeback. This is the ordered portion of the code, + * which only gets called in the order the work was queued. We walk all the + * async extents created by compress_file_range and send them down to the disk. + */ +static noinline void submit_compressed_extents(struct async_chunk *async_chunk) +{ + struct btrfs_inode *inode = BTRFS_I(async_chunk->inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct async_extent *async_extent; + u64 alloc_hint = 0; + int ret = 0; + + while (!list_empty(&async_chunk->extents)) { + u64 extent_start; + u64 ram_size; + + async_extent = list_entry(async_chunk->extents.next, + struct async_extent, list); + list_del(&async_extent->list); + extent_start = async_extent->start; + ram_size = async_extent->ram_size; + + ret = submit_one_async_extent(inode, async_chunk, async_extent, + &alloc_hint); + btrfs_debug(fs_info, +"async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d", + inode->root->root_key.objectid, + btrfs_ino(inode), extent_start, ram_size, ret); + } } static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, @@ -1150,7 +1205,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, * fails during the stage where it updates the bytenr of file extent * items. */ - if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) + if (btrfs_is_data_reloc_root(root)) min_alloc_size = num_bytes; else min_alloc_size = fs_info->sectorsize; @@ -1186,8 +1241,7 @@ static noinline int cow_file_range(struct btrfs_inode *inode, if (ret) goto out_drop_extent_cache; - if (root->root_key.objectid == - BTRFS_DATA_RELOC_TREE_OBJECTID) { + if (btrfs_is_data_reloc_root(root)) { ret = btrfs_reloc_clone_csums(inode, start, cur_alloc_size); /* @@ -1325,18 +1379,17 @@ static noinline void async_cow_submit(struct btrfs_work *work) static noinline void async_cow_free(struct btrfs_work *work) { struct async_chunk *async_chunk; + struct async_cow *async_cow; async_chunk = container_of(work, struct async_chunk, work); if (async_chunk->inode) btrfs_add_delayed_iput(async_chunk->inode); if (async_chunk->blkcg_css) css_put(async_chunk->blkcg_css); - /* - * Since the pointer to 'pending' is at the beginning of the array of - * async_chunk's, freeing it ensures the whole array has been freed. - */ - if (atomic_dec_and_test(async_chunk->pending)) - kvfree(async_chunk->pending); + + async_cow = async_chunk->async_cow; + if (atomic_dec_and_test(&async_cow->num_chunks)) + kvfree(async_cow); } static int cow_file_range_async(struct btrfs_inode *inode, @@ -1397,7 +1450,7 @@ static int cow_file_range_async(struct btrfs_inode *inode, * lightweight reference for the callback lifetime */ ihold(&inode->vfs_inode); - async_chunk[i].pending = &ctx->num_chunks; + async_chunk[i].async_cow = ctx; async_chunk[i].inode = &inode->vfs_inode; async_chunk[i].start = start; async_chunk[i].end = cur_end; @@ -1470,7 +1523,7 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode, __set_page_dirty_nobuffers(locked_page); account_page_redirty(locked_page); - extent_write_locked_range(&inode->vfs_inode, start, end, WB_SYNC_ALL); + extent_write_locked_range(&inode->vfs_inode, start, end); *page_started = 1; return 0; @@ -1503,8 +1556,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, int *page_started, unsigned long *nr_written) { const bool is_space_ino = btrfs_is_free_space_inode(inode); - const bool is_reloc_ino = (inode->root->root_key.objectid == - BTRFS_DATA_RELOC_TREE_OBJECTID); + const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root); const u64 range_bytes = end + 1 - start; struct extent_io_tree *io_tree = &inode->io_tree; u64 range_start = start; @@ -1866,8 +1918,7 @@ out_check: btrfs_dec_nocow_writers(fs_info, disk_bytenr); nocow = false; - if (root->root_key.objectid == - BTRFS_DATA_RELOC_TREE_OBJECTID) + if (btrfs_is_data_reloc_root(root)) /* * Error handled later, as we must prevent * extent_clear_unlock_delalloc() in error handler @@ -1946,8 +1997,23 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page int ret; const bool zoned = btrfs_is_zoned(inode->root->fs_info); + /* + * The range must cover part of the @locked_page, or the returned + * @page_started can confuse the caller. + */ + ASSERT(!(end <= page_offset(locked_page) || + start >= page_offset(locked_page) + PAGE_SIZE)); + if (should_nocow(inode, start, end)) { - ASSERT(!zoned); + /* + * Normally on a zoned device we're only doing COW writes, but + * in case of relocation on a zoned filesystem we have taken + * precaution, that we're only writing sequentially. It's safe + * to use run_delalloc_nocow() here, like for regular + * preallocated inodes. + */ + ASSERT(!zoned || + (zoned && btrfs_is_data_reloc_root(inode->root))); ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, nr_written); } else if (!inode_can_compress(inode) || @@ -2206,7 +2272,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, if (btrfs_is_testing(fs_info)) return; - if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID && + if (!btrfs_is_data_reloc_root(root) && do_list && !(state->state & EXTENT_NORESERVE) && (*bits & EXTENT_CLEAR_DATA_RESV)) btrfs_free_reserved_data_space_noquota(fs_info, len); @@ -2234,48 +2300,6 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, } /* - * btrfs_bio_fits_in_stripe - Checks whether the size of the given bio will fit - * in a chunk's stripe. This function ensures that bios do not span a - * stripe/chunk - * - * @page - The page we are about to add to the bio - * @size - size we want to add to the bio - * @bio - bio we want to ensure is smaller than a stripe - * @bio_flags - flags of the bio - * - * return 1 if page cannot be added to the bio - * return 0 if page can be added to the bio - * return error otherwise - */ -int btrfs_bio_fits_in_stripe(struct page *page, size_t size, struct bio *bio, - unsigned long bio_flags) -{ - struct inode *inode = page->mapping->host; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - u64 logical = bio->bi_iter.bi_sector << 9; - u32 bio_len = bio->bi_iter.bi_size; - struct extent_map *em; - int ret = 0; - struct btrfs_io_geometry geom; - - if (bio_flags & EXTENT_BIO_COMPRESSED) - return 0; - - em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize); - if (IS_ERR(em)) - return PTR_ERR(em); - ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio), logical, &geom); - if (ret < 0) - goto out; - - if (geom.len < bio_len + size) - ret = 1; -out: - free_extent_map(em); - return ret; -} - -/* * in order to insert checksums into the metadata in large chunks, * we wait until bio submission time. All the pages in the bio are * checksummed and sums are attached onto the ordered extent record. @@ -2531,7 +2555,7 @@ blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, goto mapit; } else if (async && !skip_sum) { /* csum items have already been cloned */ - if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) + if (btrfs_is_data_reloc_root(root)) goto mapit; /* we're doing a write, do the async checksumming */ ret = btrfs_wq_submit_bio(inode, bio, mirror_num, bio_flags, @@ -2764,7 +2788,7 @@ out_page: clear_page_dirty_for_io(page); SetPageError(page); } - ClearPageChecked(page); + btrfs_page_clear_checked(inode->root->fs_info, page, page_start, PAGE_SIZE); unlock_page(page); put_page(page); kfree(fixup); @@ -2819,7 +2843,7 @@ int btrfs_writepage_cow_fixup(struct page *page) * page->mapping outside of the page lock. */ ihold(inode); - SetPageChecked(page); + btrfs_page_set_checked(fs_info, page, page_offset(page), PAGE_SIZE); get_page(page); btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL); fixup->page = page; @@ -3010,8 +3034,12 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } - if (ordered_extent->bdev) + /* A valid bdev implies a write on a sequential zone */ + if (ordered_extent->bdev) { btrfs_rewrite_logical_zoned(ordered_extent); + btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr, + ordered_extent->disk_num_bytes); + } btrfs_free_io_failure_record(inode, start, end); @@ -3208,7 +3236,7 @@ void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, * * The length of such check is always one sector size. */ -static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio, +static int check_data_csum(struct inode *inode, struct btrfs_bio *bbio, u32 bio_offset, struct page *page, u32 pgoff, u64 start) { @@ -3224,7 +3252,7 @@ static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio, ASSERT(pgoff + len <= PAGE_SIZE); offset_sectors = bio_offset >> fs_info->sectorsize_bits; - csum_expected = ((u8 *)io_bio->csum) + offset_sectors * csum_size; + csum_expected = ((u8 *)bbio->csum) + offset_sectors * csum_size; kaddr = kmap_atomic(page); shash->tfm = fs_info->csum_shash; @@ -3238,9 +3266,9 @@ static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio, return 0; zeroit: btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected, - io_bio->mirror_num); - if (io_bio->device) - btrfs_dev_stat_inc_and_print(io_bio->device, + bbio->mirror_num); + if (bbio->device) + btrfs_dev_stat_inc_and_print(bbio->device, BTRFS_DEV_STAT_CORRUPTION_ERRS); memset(kaddr + pgoff, 1, len); flush_dcache_page(page); @@ -3260,33 +3288,29 @@ zeroit: * Return a bitmap where bit set means a csum mismatch, and bit not set means * csum match. */ -unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, - struct page *page, u64 start, u64 end) +unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, + u32 bio_offset, struct page *page, + u64 start, u64 end) { struct inode *inode = page->mapping->host; + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_root *root = BTRFS_I(inode)->root; const u32 sectorsize = root->fs_info->sectorsize; u32 pg_off; unsigned int result = 0; - if (PageChecked(page)) { - ClearPageChecked(page); + if (btrfs_page_test_checked(fs_info, page, start, end + 1 - start)) { + btrfs_page_clear_checked(fs_info, page, start, end + 1 - start); return 0; } /* - * For subpage case, above PageChecked is not safe as it's not subpage - * compatible. - * But for now only cow fixup and compressed read utilize PageChecked - * flag, while in this context we can easily use io_bio->csum to - * determine if we really need to do csum verification. - * - * So for now, just exit if io_bio->csum is NULL, as it means it's - * compressed read, and its compressed data csum has already been - * verified. + * This only happens for NODATASUM or compressed read. + * Normally this should be covered by above check for compressed read + * or the next check for NODATASUM. Just do a quicker exit here. */ - if (io_bio->csum == NULL) + if (bbio->csum == NULL) return 0; if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) @@ -3303,7 +3327,7 @@ unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, u64 file_offset = pg_off + page_offset(page); int ret; - if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && + if (btrfs_is_data_reloc_root(root) && test_range_bit(io_tree, file_offset, file_offset + sectorsize - 1, EXTENT_NODATASUM, 1, NULL)) { @@ -3313,7 +3337,7 @@ unsigned int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, EXTENT_NODATASUM); continue; } - ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off, + ret = check_data_csum(inode, bbio, bio_offset, page, pg_off, page_offset(page) + pg_off); if (ret < 0) { const int nr_bit = (pg_off - offset_in_page(start)) >> @@ -4004,7 +4028,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, * without delay */ if (!btrfs_is_free_space_inode(inode) - && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID + && !btrfs_is_data_reloc_root(root) && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) { btrfs_update_root_times(trans, root); @@ -4034,11 +4058,11 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, * also drops the back refs in the inode to the directory */ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *dir, struct btrfs_inode *inode, const char *name, int name_len) { + struct btrfs_root *root = dir->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; int ret = 0; @@ -4098,19 +4122,9 @@ skip_backref: goto err; } - ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, - dir_ino); - if (ret != 0 && ret != -ENOENT) { - btrfs_abort_transaction(trans, ret); - goto err; - } - - ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, - index); - if (ret == -ENOENT) - ret = 0; - else if (ret) - btrfs_abort_transaction(trans, ret); + btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, + dir_ino); + btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, index); /* * If we have a pending delayed iput we could end up with the final iput @@ -4138,15 +4152,14 @@ out: } int btrfs_unlink_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *dir, struct btrfs_inode *inode, const char *name, int name_len) { int ret; - ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len); + ret = __btrfs_unlink_inode(trans, dir, inode, name, name_len); if (!ret) { drop_nlink(&inode->vfs_inode); - ret = btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, inode->root, inode); } return ret; } @@ -4175,7 +4188,6 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir) static int btrfs_unlink(struct inode *dir, struct dentry *dentry) { - struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_trans_handle *trans; struct inode *inode = d_inode(dentry); int ret; @@ -4187,7 +4199,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), 0); - ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), + ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), dentry->d_name.name, dentry->d_name.len); if (ret) @@ -4201,7 +4213,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) out: btrfs_end_transaction(trans); - btrfs_btree_balance_dirty(root->fs_info); + btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info); return ret; } @@ -4368,7 +4380,7 @@ static void btrfs_prune_dentries(struct btrfs_root *root) struct inode *inode; u64 objectid = 0; - if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) + if (!BTRFS_FS_ERROR(fs_info)) WARN_ON(btrfs_root_refs(&root->root_item) != 0); spin_lock(&root->inode_lock); @@ -4552,7 +4564,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); int err = 0; - struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_trans_handle *trans; u64 last_unlink_trans; @@ -4577,7 +4588,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) last_unlink_trans = BTRFS_I(inode)->last_unlink_trans; /* now the directory is empty */ - err = btrfs_unlink_inode(trans, root, BTRFS_I(dir), + err = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), dentry->d_name.name, dentry->d_name.len); if (!err) { @@ -4598,7 +4609,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) } out: btrfs_end_transaction(trans); - btrfs_btree_balance_dirty(root->fs_info); + btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info); return err; } @@ -4907,9 +4918,9 @@ delete: btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, extent_start, extent_num_bytes, 0); - ref.real_root = root->root_key.objectid; btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), - ino, extent_offset); + ino, extent_offset, + root->root_key.objectid, false); ret = btrfs_free_extent(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -5105,7 +5116,8 @@ again: len); flush_dcache_page(page); } - ClearPageChecked(page); + btrfs_page_clear_checked(fs_info, page, block_start, + block_end + 1 - block_start); btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start); unlock_extent_cached(io_tree, block_start, block_end, &cached_state); @@ -6435,7 +6447,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, struct btrfs_inode_ref *ref; struct btrfs_key key[2]; u32 sizes[2]; - int nitems = name ? 2 : 1; + struct btrfs_item_batch batch; unsigned long ptr; unsigned int nofs_flag; int ret; @@ -6527,7 +6539,11 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, goto fail; } - ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems); + batch.keys = &key[0]; + batch.data_sizes = &sizes[0]; + batch.total_data_size = sizes[0] + (name ? sizes[1] : 0); + batch.nr = name ? 2 : 1; + ret = btrfs_insert_empty_items(trans, root, path, &batch); if (ret != 0) goto fail_unlock; @@ -7961,7 +7977,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, iomap->type = IOMAP_MAPPED; } iomap->offset = start; - iomap->bdev = fs_info->fs_devices->latest_bdev; + iomap->bdev = fs_info->fs_devices->latest_dev->bdev; iomap->length = len; if (write && btrfs_use_zone_append(BTRFS_I(inode), em->block_start)) @@ -8038,13 +8054,13 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip) if (btrfs_op(dip->dio_bio) == BTRFS_MAP_WRITE) { __endio_write_update_ordered(BTRFS_I(dip->inode), - dip->logical_offset, + dip->file_offset, dip->bytes, !dip->dio_bio->bi_status); } else { unlock_extent(&BTRFS_I(dip->inode)->io_tree, - dip->logical_offset, - dip->logical_offset + dip->bytes - 1); + dip->file_offset, + dip->file_offset + dip->bytes - 1); } bio_endio(dip->dio_bio); @@ -8072,10 +8088,11 @@ static blk_status_t submit_dio_repair_bio(struct inode *inode, struct bio *bio, return ret; } -static blk_status_t btrfs_check_read_dio_bio(struct inode *inode, - struct btrfs_io_bio *io_bio, +static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, + struct btrfs_bio *bbio, const bool uptodate) { + struct inode *inode = dip->inode; struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; const u32 sectorsize = fs_info->sectorsize; struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; @@ -8083,11 +8100,12 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode, const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); struct bio_vec bvec; struct bvec_iter iter; - u64 start = io_bio->logical; + const u64 orig_file_offset = dip->file_offset; + u64 start = orig_file_offset; u32 bio_offset = 0; blk_status_t err = BLK_STS_OK; - __bio_for_each_segment(bvec, &io_bio->bio, iter, io_bio->iter) { + __bio_for_each_segment(bvec, &bbio->bio, iter, bbio->iter) { unsigned int i, nr_sectors, pgoff; nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len); @@ -8095,7 +8113,7 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode, for (i = 0; i < nr_sectors; i++) { ASSERT(pgoff < PAGE_SIZE); if (uptodate && - (!csum || !check_data_csum(inode, io_bio, + (!csum || !check_data_csum(inode, bbio, bio_offset, bvec.bv_page, pgoff, start))) { clean_io_failure(fs_info, failure_tree, io_tree, @@ -8105,12 +8123,12 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode, } else { int ret; - ASSERT((start - io_bio->logical) < UINT_MAX); + ASSERT((start - orig_file_offset) < UINT_MAX); ret = btrfs_repair_one_sector(inode, - &io_bio->bio, - start - io_bio->logical, + &bbio->bio, + start - orig_file_offset, bvec.bv_page, pgoff, - start, io_bio->mirror_num, + start, bbio->mirror_num, submit_dio_repair_bio); if (ret) err = errno_to_blk_status(ret); @@ -8151,15 +8169,13 @@ static void btrfs_end_dio_bio(struct bio *bio) bio->bi_opf, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, err); - if (bio_op(bio) == REQ_OP_READ) { - err = btrfs_check_read_dio_bio(dip->inode, btrfs_io_bio(bio), - !err); - } + if (bio_op(bio) == REQ_OP_READ) + err = btrfs_check_read_dio_bio(dip, btrfs_bio(bio), !err); if (err) dip->dio_bio->bi_status = err; - btrfs_record_physical_zoned(dip->inode, dip->logical_offset, bio); + btrfs_record_physical_zoned(dip->inode, dip->file_offset, bio); bio_put(bio); btrfs_dio_private_put(dip); @@ -8201,10 +8217,10 @@ static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio, } else { u64 csum_offset; - csum_offset = file_offset - dip->logical_offset; + csum_offset = file_offset - dip->file_offset; csum_offset >>= fs_info->sectorsize_bits; csum_offset *= fs_info->csum_size; - btrfs_io_bio(bio)->csum = dip->csums + csum_offset; + btrfs_bio(bio)->csum = dip->csums + csum_offset; } map: ret = btrfs_map_bio(fs_info, bio, 0); @@ -8239,7 +8255,7 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio, return NULL; dip->inode = inode; - dip->logical_offset = file_offset; + dip->file_offset = file_offset; dip->bytes = dio_bio->bi_iter.bi_size; dip->disk_bytenr = dio_bio->bi_iter.bi_sector << 9; dip->dio_bio = dio_bio; @@ -8247,7 +8263,7 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio, return dip; } -static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter, +static void btrfs_submit_direct(const struct iomap_iter *iter, struct bio *dio_bio, loff_t file_offset) { struct inode *inode = iter->inode; @@ -8277,7 +8293,7 @@ static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter, } dio_bio->bi_status = BLK_STS_RESOURCE; bio_endio(dio_bio); - return BLK_QC_T_NONE; + return; } if (!write) { @@ -8320,7 +8336,6 @@ static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter, bio = btrfs_bio_clone_partial(dio_bio, clone_offset, clone_len); bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; - btrfs_io_bio(bio)->logical = file_offset; if (bio_op(bio) == REQ_OP_ZONE_APPEND) { status = extract_ordered_extent(BTRFS_I(inode), bio, @@ -8371,15 +8386,13 @@ static blk_qc_t btrfs_submit_direct(const struct iomap_iter *iter, free_extent_map(em); } while (submit_len > 0); - return BLK_QC_T_NONE; + return; out_err_em: free_extent_map(em); out_err: dip->dio_bio->bi_status = status; btrfs_dio_private_put(dip); - - return BLK_QC_T_NONE; } const struct iomap_ops btrfs_dio_iomap_ops = { @@ -8696,9 +8709,9 @@ next: * did something wrong. */ ASSERT(!PageOrdered(page)); + btrfs_page_clear_checked(fs_info, page, page_offset(page), PAGE_SIZE); if (!inode_evicting) __btrfs_releasepage(page, GFP_NOFS); - ClearPageChecked(page); clear_page_extent_mapped(page); } @@ -8842,7 +8855,7 @@ again: memzero_page(page, zero_start, PAGE_SIZE - zero_start); flush_dcache_page(page); } - ClearPageChecked(page); + btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE); btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start); btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start); @@ -9152,8 +9165,10 @@ void btrfs_destroy_inode(struct inode *vfs_inode) WARN_ON(inode->block_rsv.reserved); WARN_ON(inode->block_rsv.size); WARN_ON(inode->outstanding_extents); - WARN_ON(inode->delalloc_bytes); - WARN_ON(inode->new_delalloc_bytes); + if (!S_ISDIR(vfs_inode->i_mode)) { + WARN_ON(inode->delalloc_bytes); + WARN_ON(inode->new_delalloc_bytes); + } WARN_ON(inode->csum_bytes); WARN_ON(inode->defrag_bytes); @@ -9450,7 +9465,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); } else { /* src is an inode */ - ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir), + ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(old_dentry->d_inode), old_dentry->d_name.name, old_dentry->d_name.len); @@ -9466,7 +9481,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, new_dir, new_dentry); } else { /* dest is an inode */ - ret = __btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir), + ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(new_dentry->d_inode), new_dentry->d_name.name, new_dentry->d_name.len); @@ -9741,7 +9756,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, */ btrfs_pin_log_trans(root); log_pinned = true; - ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir), + ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(d_inode(old_dentry)), old_dentry->d_name.name, old_dentry->d_name.len); @@ -9761,7 +9776,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, ret = btrfs_unlink_subvol(trans, new_dir, new_dentry); BUG_ON(new_inode->i_nlink == 0); } else { - ret = btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir), + ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(d_inode(new_dentry)), new_dentry->d_name.name, new_dentry->d_name.len); @@ -9979,7 +9994,7 @@ int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_conte }; struct btrfs_fs_info *fs_info = root->fs_info; - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) + if (BTRFS_FS_ERROR(fs_info)) return -EROFS; return start_delalloc_inodes(root, &wbc, true, in_reclaim_context); @@ -9998,7 +10013,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, struct list_head splice; int ret; - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) + if (BTRFS_FS_ERROR(fs_info)) return -EROFS; INIT_LIST_HEAD(&splice); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index cc61813213d8..fb8cc9642ac4 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -48,6 +48,7 @@ #include "space-info.h" #include "delalloc-space.h" #include "block-group.h" +#include "subpage.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI @@ -81,7 +82,8 @@ struct btrfs_ioctl_send_args_32 { compat_uptr_t clone_sources; /* in */ __u64 parent_root; /* in */ __u64 flags; /* in */ - __u64 reserved[4]; /* in */ + __u32 version; /* in */ + __u8 reserved[28]; /* in */ } __attribute__ ((__packed__)); #define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \ @@ -985,129 +987,32 @@ out: return ret; } -/* - * When we're defragging a range, we don't want to kick it off again - * if it is really just waiting for delalloc to send it down. - * If we find a nice big extent or delalloc range for the bytes in the - * file you want to defrag, we return 0 to let you know to skip this - * part of the file - */ -static int check_defrag_in_cache(struct inode *inode, u64 offset, u32 thresh) -{ - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct extent_map *em = NULL; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; - u64 end; - - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, offset, PAGE_SIZE); - read_unlock(&em_tree->lock); - - if (em) { - end = extent_map_end(em); - free_extent_map(em); - if (end - offset > thresh) - return 0; - } - /* if we already have a nice delalloc here, just stop */ - thresh /= 2; - end = count_range_bits(io_tree, &offset, offset + thresh, - thresh, EXTENT_DELALLOC, 1); - if (end >= thresh) - return 0; - return 1; -} - -/* - * helper function to walk through a file and find extents - * newer than a specific transid, and smaller than thresh. - * - * This is used by the defragging code to find new and small - * extents - */ -static int find_new_extents(struct btrfs_root *root, - struct inode *inode, u64 newer_than, - u64 *off, u32 thresh) -{ - struct btrfs_path *path; - struct btrfs_key min_key; - struct extent_buffer *leaf; - struct btrfs_file_extent_item *extent; - int type; - int ret; - u64 ino = btrfs_ino(BTRFS_I(inode)); - - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - - min_key.objectid = ino; - min_key.type = BTRFS_EXTENT_DATA_KEY; - min_key.offset = *off; - - while (1) { - ret = btrfs_search_forward(root, &min_key, path, newer_than); - if (ret != 0) - goto none; -process_slot: - if (min_key.objectid != ino) - goto none; - if (min_key.type != BTRFS_EXTENT_DATA_KEY) - goto none; - - leaf = path->nodes[0]; - extent = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - - type = btrfs_file_extent_type(leaf, extent); - if (type == BTRFS_FILE_EXTENT_REG && - btrfs_file_extent_num_bytes(leaf, extent) < thresh && - check_defrag_in_cache(inode, min_key.offset, thresh)) { - *off = min_key.offset; - btrfs_free_path(path); - return 0; - } - - path->slots[0]++; - if (path->slots[0] < btrfs_header_nritems(leaf)) { - btrfs_item_key_to_cpu(leaf, &min_key, path->slots[0]); - goto process_slot; - } - - if (min_key.offset == (u64)-1) - goto none; - - min_key.offset++; - btrfs_release_path(path); - } -none: - btrfs_free_path(path); - return -ENOENT; -} - -static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start) +static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, + bool locked) { struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_map *em; - u64 len = PAGE_SIZE; + const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize; /* * hopefully we have this extent in the tree already, try without * the full extent lock */ read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, len); + em = lookup_extent_mapping(em_tree, start, sectorsize); read_unlock(&em_tree->lock); if (!em) { struct extent_state *cached = NULL; - u64 end = start + len - 1; + u64 end = start + sectorsize - 1; /* get the big lock and read metadata off disk */ - lock_extent_bits(io_tree, start, end, &cached); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); - unlock_extent_cached(io_tree, start, end, &cached); + if (!locked) + lock_extent_bits(io_tree, start, end, &cached); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, sectorsize); + if (!locked) + unlock_extent_cached(io_tree, start, end, &cached); if (IS_ERR(em)) return NULL; @@ -1116,7 +1021,8 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start) return em; } -static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em) +static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, + bool locked) { struct extent_map *next; bool ret = true; @@ -1125,7 +1031,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em) if (em->start + em->len >= i_size_read(inode)) return false; - next = defrag_lookup_extent(inode, em->start + em->len); + next = defrag_lookup_extent(inode, em->start + em->len, locked); if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) ret = false; else if ((em->block_start + em->block_len == next->block_start) && @@ -1136,297 +1042,435 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em) return ret; } -static int should_defrag_range(struct inode *inode, u64 start, u32 thresh, - u64 *last_len, u64 *skip, u64 *defrag_end, - int compress) +/* + * Prepare one page to be defragged. + * + * This will ensure: + * + * - Returned page is locked and has been set up properly. + * - No ordered extent exists in the page. + * - The page is uptodate. + * + * NOTE: Caller should also wait for page writeback after the cluster is + * prepared, here we don't do writeback wait for each page. + */ +static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, + pgoff_t index) { - struct extent_map *em; - int ret = 1; - bool next_mergeable = true; - bool prev_mergeable = true; + struct address_space *mapping = inode->vfs_inode.i_mapping; + gfp_t mask = btrfs_alloc_write_mask(mapping); + u64 page_start = (u64)index << PAGE_SHIFT; + u64 page_end = page_start + PAGE_SIZE - 1; + struct extent_state *cached_state = NULL; + struct page *page; + int ret; + +again: + page = find_or_create_page(mapping, index, mask); + if (!page) + return ERR_PTR(-ENOMEM); /* - * make sure that once we start defragging an extent, we keep on - * defragging it + * Since we can defragment files opened read-only, we can encounter + * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We + * can't do I/O using huge pages yet, so return an error for now. + * Filesystem transparent huge pages are typically only used for + * executables that explicitly enable them, so this isn't very + * restrictive. */ - if (start < *defrag_end) - return 1; + if (PageCompound(page)) { + unlock_page(page); + put_page(page); + return ERR_PTR(-ETXTBSY); + } - *skip = 0; + ret = set_page_extent_mapped(page); + if (ret < 0) { + unlock_page(page); + put_page(page); + return ERR_PTR(ret); + } - em = defrag_lookup_extent(inode, start); - if (!em) - return 0; + /* Wait for any existing ordered extent in the range */ + while (1) { + struct btrfs_ordered_extent *ordered; - /* this will cover holes, and inline extents */ - if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - ret = 0; - goto out; - } + lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state); + ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); + unlock_extent_cached(&inode->io_tree, page_start, page_end, + &cached_state); + if (!ordered) + break; - if (!*defrag_end) - prev_mergeable = false; + unlock_page(page); + btrfs_start_ordered_extent(ordered, 1); + btrfs_put_ordered_extent(ordered); + lock_page(page); + /* + * We unlocked the page above, so we need check if it was + * released or not. + */ + if (page->mapping != mapping || !PagePrivate(page)) { + unlock_page(page); + put_page(page); + goto again; + } + } - next_mergeable = defrag_check_next_extent(inode, em); - /* - * we hit a real extent, if it is big or the next extent is not a - * real extent, don't bother defragging it - */ - if (!compress && (*last_len == 0 || *last_len >= thresh) && - (em->len >= thresh || (!next_mergeable && !prev_mergeable))) - ret = 0; -out: /* - * last_len ends up being a counter of how many bytes we've defragged. - * every time we choose not to defrag an extent, we reset *last_len - * so that the next tiny extent will force a defrag. - * - * The end result of this is that tiny extents before a single big - * extent will force at least part of that big extent to be defragged. + * Now the page range has no ordered extent any more. Read the page to + * make it uptodate. */ - if (ret) { - *defrag_end = extent_map_end(em); - } else { - *last_len = 0; - *skip = extent_map_end(em); - *defrag_end = 0; + if (!PageUptodate(page)) { + btrfs_readpage(NULL, page); + lock_page(page); + if (page->mapping != mapping || !PagePrivate(page)) { + unlock_page(page); + put_page(page); + goto again; + } + if (!PageUptodate(page)) { + unlock_page(page); + put_page(page); + return ERR_PTR(-EIO); + } } - - free_extent_map(em); - return ret; + return page; } +struct defrag_target_range { + struct list_head list; + u64 start; + u64 len; +}; + /* - * it doesn't do much good to defrag one or two pages - * at a time. This pulls in a nice chunk of pages - * to COW and defrag. - * - * It also makes sure the delalloc code has enough - * dirty data to avoid making new small extents as part - * of the defrag + * Collect all valid target extents. * - * It's a good idea to start RA on this range - * before calling this. + * @start: file offset to lookup + * @len: length to lookup + * @extent_thresh: file extent size threshold, any extent size >= this value + * will be ignored + * @newer_than: only defrag extents newer than this value + * @do_compress: whether the defrag is doing compression + * if true, @extent_thresh will be ignored and all regular + * file extents meeting @newer_than will be targets. + * @locked: if the range has already held extent lock + * @target_list: list of targets file extents */ -static int cluster_pages_for_defrag(struct inode *inode, - struct page **pages, - unsigned long start_index, - unsigned long num_pages) +static int defrag_collect_targets(struct btrfs_inode *inode, + u64 start, u64 len, u32 extent_thresh, + u64 newer_than, bool do_compress, + bool locked, struct list_head *target_list) { - unsigned long file_end; - u64 isize = i_size_read(inode); - u64 page_start; - u64 page_end; - u64 page_cnt; - u64 start = (u64)start_index << PAGE_SHIFT; - u64 search_start; - int ret; - int i; - int i_done; - struct btrfs_ordered_extent *ordered; - struct extent_state *cached_state = NULL; - struct extent_io_tree *tree; - struct extent_changeset *data_reserved = NULL; - gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); + u64 cur = start; + int ret = 0; - file_end = (isize - 1) >> PAGE_SHIFT; - if (!isize || start_index > file_end) - return 0; + while (cur < start + len) { + struct extent_map *em; + struct defrag_target_range *new; + bool next_mergeable = true; + u64 range_len; - page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1); + em = defrag_lookup_extent(&inode->vfs_inode, cur, locked); + if (!em) + break; - ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, - start, page_cnt << PAGE_SHIFT); - if (ret) - return ret; - i_done = 0; - tree = &BTRFS_I(inode)->io_tree; + /* Skip hole/inline/preallocated extents */ + if (em->block_start >= EXTENT_MAP_LAST_BYTE || + test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + goto next; - /* step one, lock all the pages */ - for (i = 0; i < page_cnt; i++) { - struct page *page; -again: - page = find_or_create_page(inode->i_mapping, - start_index + i, mask); - if (!page) - break; + /* Skip older extent */ + if (em->generation < newer_than) + goto next; - ret = set_page_extent_mapped(page); - if (ret < 0) { - unlock_page(page); - put_page(page); - break; + /* + * For do_compress case, we want to compress all valid file + * extents, thus no @extent_thresh or mergeable check. + */ + if (do_compress) + goto add; + + /* Skip too large extent */ + if (em->len >= extent_thresh) + goto next; + + next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em, + locked); + if (!next_mergeable) { + struct defrag_target_range *last; + + /* Empty target list, no way to merge with last entry */ + if (list_empty(target_list)) + goto next; + last = list_entry(target_list->prev, + struct defrag_target_range, list); + /* Not mergeable with last entry */ + if (last->start + last->len != cur) + goto next; + + /* Mergeable, fall through to add it to @target_list. */ } - page_start = page_offset(page); - page_end = page_start + PAGE_SIZE - 1; - while (1) { - lock_extent_bits(tree, page_start, page_end, - &cached_state); - ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), - page_start); - unlock_extent_cached(tree, page_start, page_end, - &cached_state); - if (!ordered) - break; - - unlock_page(page); - btrfs_start_ordered_extent(ordered, 1); - btrfs_put_ordered_extent(ordered); - lock_page(page); - /* - * we unlocked the page above, so we need check if - * it was released or not. - */ - if (page->mapping != inode->i_mapping) { - unlock_page(page); - put_page(page); - goto again; +add: + range_len = min(extent_map_end(em), start + len) - cur; + /* + * This one is a good target, check if it can be merged into + * last range of the target list. + */ + if (!list_empty(target_list)) { + struct defrag_target_range *last; + + last = list_entry(target_list->prev, + struct defrag_target_range, list); + ASSERT(last->start + last->len <= cur); + if (last->start + last->len == cur) { + /* Mergeable, enlarge the last entry */ + last->len += range_len; + goto next; } + /* Fall through to allocate a new entry */ } - if (!PageUptodate(page)) { - btrfs_readpage(NULL, page); - lock_page(page); - if (!PageUptodate(page)) { - unlock_page(page); - put_page(page); - ret = -EIO; - break; - } + /* Allocate new defrag_target_range */ + new = kmalloc(sizeof(*new), GFP_NOFS); + if (!new) { + free_extent_map(em); + ret = -ENOMEM; + break; } + new->start = cur; + new->len = range_len; + list_add_tail(&new->list, target_list); - if (page->mapping != inode->i_mapping) { - unlock_page(page); - put_page(page); - goto again; +next: + cur = extent_map_end(em); + free_extent_map(em); + } + if (ret < 0) { + struct defrag_target_range *entry; + struct defrag_target_range *tmp; + + list_for_each_entry_safe(entry, tmp, target_list, list) { + list_del_init(&entry->list); + kfree(entry); } + } + return ret; +} + +#define CLUSTER_SIZE (SZ_256K) + +/* + * Defrag one contiguous target range. + * + * @inode: target inode + * @target: target range to defrag + * @pages: locked pages covering the defrag range + * @nr_pages: number of locked pages + * + * Caller should ensure: + * + * - Pages are prepared + * Pages should be locked, no ordered extent in the pages range, + * no writeback. + * + * - Extent bits are locked + */ +static int defrag_one_locked_target(struct btrfs_inode *inode, + struct defrag_target_range *target, + struct page **pages, int nr_pages, + struct extent_state **cached_state) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_changeset *data_reserved = NULL; + const u64 start = target->start; + const u64 len = target->len; + unsigned long last_index = (start + len - 1) >> PAGE_SHIFT; + unsigned long start_index = start >> PAGE_SHIFT; + unsigned long first_index = page_index(pages[0]); + int ret = 0; + int i; + + ASSERT(last_index - first_index + 1 <= nr_pages); + + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len); + if (ret < 0) + return ret; + clear_extent_bit(&inode->io_tree, start, start + len - 1, + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, 0, 0, cached_state); + set_extent_defrag(&inode->io_tree, start, start + len - 1, cached_state); - pages[i] = page; - i_done++; + /* Update the page status */ + for (i = start_index - first_index; i <= last_index - first_index; i++) { + ClearPageChecked(pages[i]); + btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len); } - if (!i_done || ret) - goto out; + btrfs_delalloc_release_extents(inode, len); + extent_changeset_free(data_reserved); - if (!(inode->i_sb->s_flags & SB_ACTIVE)) - goto out; + return ret; +} - /* - * so now we have a nice long stream of locked - * and up to date pages, lets wait on them - */ - for (i = 0; i < i_done; i++) - wait_on_page_writeback(pages[i]); +static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, + u32 extent_thresh, u64 newer_than, bool do_compress) +{ + struct extent_state *cached_state = NULL; + struct defrag_target_range *entry; + struct defrag_target_range *tmp; + LIST_HEAD(target_list); + struct page **pages; + const u32 sectorsize = inode->root->fs_info->sectorsize; + u64 last_index = (start + len - 1) >> PAGE_SHIFT; + u64 start_index = start >> PAGE_SHIFT; + unsigned int nr_pages = last_index - start_index + 1; + int ret = 0; + int i; - page_start = page_offset(pages[0]); - page_end = page_offset(pages[i_done - 1]) + PAGE_SIZE; + ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE); + ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize)); + + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); + if (!pages) + return -ENOMEM; - lock_extent_bits(&BTRFS_I(inode)->io_tree, - page_start, page_end - 1, &cached_state); + /* Prepare all pages */ + for (i = 0; i < nr_pages; i++) { + pages[i] = defrag_prepare_one_page(inode, start_index + i); + if (IS_ERR(pages[i])) { + ret = PTR_ERR(pages[i]); + pages[i] = NULL; + goto free_pages; + } + } + for (i = 0; i < nr_pages; i++) + wait_on_page_writeback(pages[i]); + /* Lock the pages range */ + lock_extent_bits(&inode->io_tree, start_index << PAGE_SHIFT, + (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, + &cached_state); /* - * When defragmenting we skip ranges that have holes or inline extents, - * (check should_defrag_range()), to avoid unnecessary IO and wasting - * space. At btrfs_defrag_file(), we check if a range should be defragged - * before locking the inode and then, if it should, we trigger a sync - * page cache readahead - we lock the inode only after that to avoid - * blocking for too long other tasks that possibly want to operate on - * other file ranges. But before we were able to get the inode lock, - * some other task may have punched a hole in the range, or we may have - * now an inline extent, in which case we should not defrag. So check - * for that here, where we have the inode and the range locked, and bail - * out if that happened. + * Now we have a consistent view about the extent map, re-check + * which range really needs to be defragged. + * + * And this time we have extent locked already, pass @locked = true + * so that we won't relock the extent range and cause deadlock. */ - search_start = page_start; - while (search_start < page_end) { - struct extent_map *em; + ret = defrag_collect_targets(inode, start, len, extent_thresh, + newer_than, do_compress, true, + &target_list); + if (ret < 0) + goto unlock_extent; - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, search_start, - page_end - search_start); - if (IS_ERR(em)) { - ret = PTR_ERR(em); - goto out_unlock_range; - } - if (em->block_start >= EXTENT_MAP_LAST_BYTE) { - free_extent_map(em); - /* Ok, 0 means we did not defrag anything */ - ret = 0; - goto out_unlock_range; + list_for_each_entry(entry, &target_list, list) { + ret = defrag_one_locked_target(inode, entry, pages, nr_pages, + &cached_state); + if (ret < 0) + break; + } + + list_for_each_entry_safe(entry, tmp, &target_list, list) { + list_del_init(&entry->list); + kfree(entry); + } +unlock_extent: + unlock_extent_cached(&inode->io_tree, start_index << PAGE_SHIFT, + (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, + &cached_state); +free_pages: + for (i = 0; i < nr_pages; i++) { + if (pages[i]) { + unlock_page(pages[i]); + put_page(pages[i]); } - search_start = extent_map_end(em); - free_extent_map(em); } + kfree(pages); + return ret; +} - clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, - page_end - 1, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, 0, 0, &cached_state); +static int defrag_one_cluster(struct btrfs_inode *inode, + struct file_ra_state *ra, + u64 start, u32 len, u32 extent_thresh, + u64 newer_than, bool do_compress, + unsigned long *sectors_defragged, + unsigned long max_sectors) +{ + const u32 sectorsize = inode->root->fs_info->sectorsize; + struct defrag_target_range *entry; + struct defrag_target_range *tmp; + LIST_HEAD(target_list); + int ret; - if (i_done != page_cnt) { - spin_lock(&BTRFS_I(inode)->lock); - btrfs_mod_outstanding_extents(BTRFS_I(inode), 1); - spin_unlock(&BTRFS_I(inode)->lock); - btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, - start, (page_cnt - i_done) << PAGE_SHIFT, true); - } + BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); + ret = defrag_collect_targets(inode, start, len, extent_thresh, + newer_than, do_compress, false, + &target_list); + if (ret < 0) + goto out; + list_for_each_entry(entry, &target_list, list) { + u32 range_len = entry->len; - set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, - &cached_state); + /* Reached the limit */ + if (max_sectors && max_sectors == *sectors_defragged) + break; - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - page_start, page_end - 1, &cached_state); + if (max_sectors) + range_len = min_t(u32, range_len, + (max_sectors - *sectors_defragged) * sectorsize); - for (i = 0; i < i_done; i++) { - clear_page_dirty_for_io(pages[i]); - ClearPageChecked(pages[i]); - set_page_dirty(pages[i]); - unlock_page(pages[i]); - put_page(pages[i]); + if (ra) + page_cache_sync_readahead(inode->vfs_inode.i_mapping, + ra, NULL, entry->start >> PAGE_SHIFT, + ((entry->start + range_len - 1) >> PAGE_SHIFT) - + (entry->start >> PAGE_SHIFT) + 1); + /* + * Here we may not defrag any range if holes are punched before + * we locked the pages. + * But that's fine, it only affects the @sectors_defragged + * accounting. + */ + ret = defrag_one_range(inode, entry->start, range_len, + extent_thresh, newer_than, do_compress); + if (ret < 0) + break; + *sectors_defragged += range_len; } - btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT); - extent_changeset_free(data_reserved); - return i_done; - -out_unlock_range: - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - page_start, page_end - 1, &cached_state); out: - for (i = 0; i < i_done; i++) { - unlock_page(pages[i]); - put_page(pages[i]); + list_for_each_entry_safe(entry, tmp, &target_list, list) { + list_del_init(&entry->list); + kfree(entry); } - btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, - start, page_cnt << PAGE_SHIFT, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT); - extent_changeset_free(data_reserved); return ret; - } -int btrfs_defrag_file(struct inode *inode, struct file *file, +/* + * Entry point to file defragmentation. + * + * @inode: inode to be defragged + * @ra: readahead state (can be NUL) + * @range: defrag options including range and flags + * @newer_than: minimum transid to defrag + * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode + * will be defragged. + */ +int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, struct btrfs_ioctl_defrag_range_args *range, u64 newer_than, unsigned long max_to_defrag) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; - struct file_ra_state *ra = NULL; - unsigned long last_index; + unsigned long sectors_defragged = 0; u64 isize = i_size_read(inode); - u64 last_len = 0; - u64 skip = 0; - u64 defrag_end = 0; - u64 newer_off = range->start; - unsigned long i; - unsigned long ra_index = 0; - int ret; - int defrag_count = 0; + u64 cur; + u64 last_byte; + bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS; + bool ra_allocated = false; int compress_type = BTRFS_COMPRESS_ZLIB; + int ret = 0; u32 extent_thresh = range->extent_thresh; - unsigned long max_cluster = SZ_256K >> PAGE_SHIFT; - unsigned long cluster = max_cluster; - u64 new_align = ~((u64)SZ_128K - 1); - struct page **pages = NULL; - bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS; if (isize == 0) return 0; @@ -1444,172 +1488,87 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, if (extent_thresh == 0) extent_thresh = SZ_256K; + if (range->start + range->len > range->start) { + /* Got a specific range */ + last_byte = min(isize, range->start + range->len) - 1; + } else { + /* Defrag until file end */ + last_byte = isize - 1; + } + /* - * If we were not given a file, allocate a readahead context. As + * If we were not given a ra, allocate a readahead context. As * readahead is just an optimization, defrag will work without it so * we don't error out. */ - if (!file) { + if (!ra) { + ra_allocated = true; ra = kzalloc(sizeof(*ra), GFP_KERNEL); if (ra) file_ra_state_init(ra, inode->i_mapping); - } else { - ra = &file->f_ra; - } - - pages = kmalloc_array(max_cluster, sizeof(struct page *), GFP_KERNEL); - if (!pages) { - ret = -ENOMEM; - goto out_ra; - } - - /* find the last page to defrag */ - if (range->start + range->len > range->start) { - last_index = min_t(u64, isize - 1, - range->start + range->len - 1) >> PAGE_SHIFT; - } else { - last_index = (isize - 1) >> PAGE_SHIFT; - } - - if (newer_than) { - ret = find_new_extents(root, inode, newer_than, - &newer_off, SZ_64K); - if (!ret) { - range->start = newer_off; - /* - * we always align our defrag to help keep - * the extents in the file evenly spaced - */ - i = (newer_off & new_align) >> PAGE_SHIFT; - } else - goto out_ra; - } else { - i = range->start >> PAGE_SHIFT; } - if (!max_to_defrag) - max_to_defrag = last_index - i + 1; - - /* - * make writeback starts from i, so the defrag range can be - * written sequentially. - */ - if (i < inode->i_mapping->writeback_index) - inode->i_mapping->writeback_index = i; - - while (i <= last_index && defrag_count < max_to_defrag && - (i < DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE))) { - /* - * make sure we stop running if someone unmounts - * the FS - */ - if (!(inode->i_sb->s_flags & SB_ACTIVE)) - break; - if (btrfs_defrag_cancelled(fs_info)) { - btrfs_debug(fs_info, "defrag_file cancelled"); - ret = -EAGAIN; - goto error; - } + /* Align the range */ + cur = round_down(range->start, fs_info->sectorsize); + last_byte = round_up(last_byte, fs_info->sectorsize) - 1; - if (!should_defrag_range(inode, (u64)i << PAGE_SHIFT, - extent_thresh, &last_len, &skip, - &defrag_end, do_compress)){ - unsigned long next; - /* - * the should_defrag function tells us how much to skip - * bump our counter by the suggested amount - */ - next = DIV_ROUND_UP(skip, PAGE_SIZE); - i = max(i + 1, next); - continue; - } + while (cur < last_byte) { + u64 cluster_end; - if (!newer_than) { - cluster = (PAGE_ALIGN(defrag_end) >> - PAGE_SHIFT) - i; - cluster = min(cluster, max_cluster); - } else { - cluster = max_cluster; - } + /* The cluster size 256K should always be page aligned */ + BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); - if (i + cluster > ra_index) { - ra_index = max(i, ra_index); - if (ra) - page_cache_sync_readahead(inode->i_mapping, ra, - file, ra_index, cluster); - ra_index += cluster; - } + /* We want the cluster end at page boundary when possible */ + cluster_end = (((cur >> PAGE_SHIFT) + + (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1; + cluster_end = min(cluster_end, last_byte); btrfs_inode_lock(inode, 0); if (IS_SWAPFILE(inode)) { ret = -ETXTBSY; - } else { - if (do_compress) - BTRFS_I(inode)->defrag_compress = compress_type; - ret = cluster_pages_for_defrag(inode, pages, i, cluster); + btrfs_inode_unlock(inode, 0); + break; } - if (ret < 0) { + if (!(inode->i_sb->s_flags & SB_ACTIVE)) { btrfs_inode_unlock(inode, 0); - goto out_ra; + break; } - - defrag_count += ret; - balance_dirty_pages_ratelimited(inode->i_mapping); + if (do_compress) + BTRFS_I(inode)->defrag_compress = compress_type; + ret = defrag_one_cluster(BTRFS_I(inode), ra, cur, + cluster_end + 1 - cur, extent_thresh, + newer_than, do_compress, + §ors_defragged, max_to_defrag); btrfs_inode_unlock(inode, 0); - - if (newer_than) { - if (newer_off == (u64)-1) - break; - - if (ret > 0) - i += ret; - - newer_off = max(newer_off + 1, - (u64)i << PAGE_SHIFT); - - ret = find_new_extents(root, inode, newer_than, - &newer_off, SZ_64K); - if (!ret) { - range->start = newer_off; - i = (newer_off & new_align) >> PAGE_SHIFT; - } else { - break; - } - } else { - if (ret > 0) { - i += ret; - last_len += ret << PAGE_SHIFT; - } else { - i++; - last_len = 0; - } - } + if (ret < 0) + break; + cur = cluster_end + 1; } - ret = defrag_count; -error: - if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) { - filemap_flush(inode->i_mapping); - if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) + if (ra_allocated) + kfree(ra); + if (sectors_defragged) { + /* + * We have defragged some sectors, for compression case they + * need to be written back immediately. + */ + if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) { filemap_flush(inode->i_mapping); + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) + filemap_flush(inode->i_mapping); + } + if (range->compress_type == BTRFS_COMPRESS_LZO) + btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); + else if (range->compress_type == BTRFS_COMPRESS_ZSTD) + btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); + ret = sectors_defragged; } - - if (range->compress_type == BTRFS_COMPRESS_LZO) { - btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); - } else if (range->compress_type == BTRFS_COMPRESS_ZSTD) { - btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); - } - -out_ra: if (do_compress) { btrfs_inode_lock(inode, 0); BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE; btrfs_inode_unlock(inode, 0); } - if (!file) - kfree(ra); - kfree(pages); return ret; } @@ -1658,6 +1617,7 @@ static int exclop_start_or_cancel_reloc(struct btrfs_fs_info *fs_info, static noinline int btrfs_ioctl_resize(struct file *file, void __user *arg) { + BTRFS_DEV_LOOKUP_ARGS(args); struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 new_size; @@ -1713,7 +1673,8 @@ static noinline int btrfs_ioctl_resize(struct file *file, btrfs_info(fs_info, "resizing devid %llu", devid); } - device = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL); + args.devid = devid; + device = btrfs_find_device(fs_info->fs_devices, &args); if (!device) { btrfs_info(fs_info, "resizer unable to find device %llu", devid); @@ -1730,7 +1691,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, } if (!strcmp(sizestr, "max")) - new_size = device->bdev->bd_inode->i_size; + new_size = bdev_nr_bytes(device->bdev); else { if (sizestr[0] == '-') { mod = -1; @@ -1771,7 +1732,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, ret = -EINVAL; goto out_finish; } - if (new_size > device->bdev->bd_inode->i_size) { + if (new_size > bdev_nr_bytes(device->bdev)) { ret = -EFBIG; goto out_finish; } @@ -2261,9 +2222,8 @@ static noinline int search_ioctl(struct inode *inode, key.offset = sk->min_offset; while (1) { - ret = fault_in_pages_writeable(ubuf + sk_offset, - *buf_size - sk_offset); - if (ret) + ret = -EFAULT; + if (fault_in_writeable(ubuf + sk_offset, *buf_size - sk_offset)) break; ret = btrfs_search_forward(root, &key, path, sk->min_transid); @@ -3136,12 +3096,6 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) goto out; } - /* Subpage defrag will be supported in later commits */ - if (root->fs_info->sectorsize < PAGE_SIZE) { - ret = -ENOTTY; - goto out; - } - switch (inode->i_mode & S_IFMT) { case S_IFDIR: if (!capable(CAP_SYS_ADMIN)) { @@ -3176,7 +3130,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) /* the rest are all set to zero by kzalloc */ range.len = (u64)-1; } - ret = btrfs_defrag_file(file_inode(file), file, + ret = btrfs_defrag_file(file_inode(file), &file->f_ra, &range, BTRFS_OLDEST_GENERATION, 0); if (ret > 0) ret = 0; @@ -3220,6 +3174,7 @@ out: static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) { + BTRFS_DEV_LOOKUP_ARGS(args); struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ioctl_vol_args_v2 *vol_args; @@ -3231,35 +3186,39 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - ret = mnt_want_write_file(file); - if (ret) - return ret; - vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) { ret = PTR_ERR(vol_args); - goto err_drop; + goto out; } if (vol_args->flags & ~BTRFS_DEVICE_REMOVE_ARGS_MASK) { ret = -EOPNOTSUPP; goto out; } + vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; - if (!(vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) && - strcmp("cancel", vol_args->name) == 0) + if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) { + args.devid = vol_args->devid; + } else if (!strcmp("cancel", vol_args->name)) { cancel = true; + } else { + ret = btrfs_get_dev_args_from_path(fs_info, &args, vol_args->name); + if (ret) + goto out; + } + + ret = mnt_want_write_file(file); + if (ret) + goto out; ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE, cancel); if (ret) - goto out; - /* Exclusive operation is now claimed */ + goto err_drop; - if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) - ret = btrfs_rm_device(fs_info, NULL, vol_args->devid, &bdev, &mode); - else - ret = btrfs_rm_device(fs_info, vol_args->name, 0, &bdev, &mode); + /* Exclusive operation is now claimed */ + ret = btrfs_rm_device(fs_info, &args, &bdev, &mode); btrfs_exclop_finish(fs_info); @@ -3271,17 +3230,19 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) btrfs_info(fs_info, "device deleted: %s", vol_args->name); } -out: - kfree(vol_args); err_drop: mnt_drop_write_file(file); if (bdev) blkdev_put(bdev, mode); +out: + btrfs_put_dev_args_from_path(&args); + kfree(vol_args); return ret; } static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) { + BTRFS_DEV_LOOKUP_ARGS(args); struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ioctl_vol_args *vol_args; @@ -3293,32 +3254,38 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - ret = mnt_want_write_file(file); - if (ret) - return ret; - vol_args = memdup_user(arg, sizeof(*vol_args)); - if (IS_ERR(vol_args)) { - ret = PTR_ERR(vol_args); - goto out_drop_write; - } + if (IS_ERR(vol_args)) + return PTR_ERR(vol_args); + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; - cancel = (strcmp("cancel", vol_args->name) == 0); + if (!strcmp("cancel", vol_args->name)) { + cancel = true; + } else { + ret = btrfs_get_dev_args_from_path(fs_info, &args, vol_args->name); + if (ret) + goto out; + } + + ret = mnt_want_write_file(file); + if (ret) + goto out; ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE, cancel); if (ret == 0) { - ret = btrfs_rm_device(fs_info, vol_args->name, 0, &bdev, &mode); + ret = btrfs_rm_device(fs_info, &args, &bdev, &mode); if (!ret) btrfs_info(fs_info, "disk deleted %s", vol_args->name); btrfs_exclop_finish(fs_info); } - kfree(vol_args); -out_drop_write: mnt_drop_write_file(file); if (bdev) blkdev_put(bdev, mode); +out: + btrfs_put_dev_args_from_path(&args); + kfree(vol_args); return ret; } @@ -3379,22 +3346,21 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info, static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info, void __user *arg) { + BTRFS_DEV_LOOKUP_ARGS(args); struct btrfs_ioctl_dev_info_args *di_args; struct btrfs_device *dev; int ret = 0; - char *s_uuid = NULL; di_args = memdup_user(arg, sizeof(*di_args)); if (IS_ERR(di_args)) return PTR_ERR(di_args); + args.devid = di_args->devid; if (!btrfs_is_empty_uuid(di_args->uuid)) - s_uuid = di_args->uuid; + args.uuid = di_args->uuid; rcu_read_lock(); - dev = btrfs_find_device(fs_info->fs_devices, di_args->devid, s_uuid, - NULL); - + dev = btrfs_find_device(fs_info->fs_devices, &args); if (!dev) { ret = -ENODEV; goto out; @@ -4430,7 +4396,6 @@ static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info, void __user *arg) { struct btrfs_ioctl_quota_rescan_args qsa = {0}; - int ret = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -4441,9 +4406,9 @@ static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info, } if (copy_to_user(arg, &qsa, sizeof(qsa))) - ret = -EFAULT; + return -EFAULT; - return ret; + return 0; } static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index a2e1f1f5c6e3..bbc45534ae9a 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -96,11 +96,12 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root); #ifdef CONFIG_BTRFS_DEBUG -static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) { - lockdep_assert_held(&eb->lock); +static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb) +{ + lockdep_assert_held_write(&eb->lock); } #else -static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) { } +static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb) { } #endif void btrfs_unlock_up_safe(struct btrfs_path *path, int level); diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index c25dfd1a8a54..65cb0766e62d 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -32,19 +32,19 @@ * payload. * One regular LZO compressed extent can have one or more segments. * For inlined LZO compressed extent, only one segment is allowed. - * One segment represents at most one page of uncompressed data. + * One segment represents at most one sector of uncompressed data. * * 2.1 Segment header * Fixed size. LZO_LEN (4) bytes long, LE32. * Records the total size of the segment (not including the header). - * Segment header never crosses page boundary, thus it's possible to - * have at most 3 padding zeros at the end of the page. + * Segment header never crosses sector boundary, thus it's possible to + * have at most 3 padding zeros at the end of the sector. * * 2.2 Data Payload - * Variable size. Size up limit should be lzo1x_worst_compress(PAGE_SIZE) - * which is 4419 for a 4KiB page. + * Variable size. Size up limit should be lzo1x_worst_compress(sectorsize) + * which is 4419 for a 4KiB sectorsize. * - * Example: + * Example with 4K sectorsize: * Page 1: * 0 0x2 0x4 0x6 0x8 0xa 0xc 0xe 0x10 * 0x0000 | Header | SegHdr 01 | Data payload 01 ... | @@ -112,163 +112,174 @@ static inline size_t read_compress_length(const char *buf) return le32_to_cpu(dlen); } +/* + * Will do: + * + * - Write a segment header into the destination + * - Copy the compressed buffer into the destination + * - Make sure we have enough space in the last sector to fit a segment header + * If not, we will pad at most (LZO_LEN (4)) - 1 bytes of zeros. + * + * Will allocate new pages when needed. + */ +static int copy_compressed_data_to_page(char *compressed_data, + size_t compressed_size, + struct page **out_pages, + u32 *cur_out, + const u32 sectorsize) +{ + u32 sector_bytes_left; + u32 orig_out; + struct page *cur_page; + char *kaddr; + + /* + * We never allow a segment header crossing sector boundary, previous + * run should ensure we have enough space left inside the sector. + */ + ASSERT((*cur_out / sectorsize) == (*cur_out + LZO_LEN - 1) / sectorsize); + + cur_page = out_pages[*cur_out / PAGE_SIZE]; + /* Allocate a new page */ + if (!cur_page) { + cur_page = alloc_page(GFP_NOFS); + if (!cur_page) + return -ENOMEM; + out_pages[*cur_out / PAGE_SIZE] = cur_page; + } + + kaddr = kmap(cur_page); + write_compress_length(kaddr + offset_in_page(*cur_out), + compressed_size); + *cur_out += LZO_LEN; + + orig_out = *cur_out; + + /* Copy compressed data */ + while (*cur_out - orig_out < compressed_size) { + u32 copy_len = min_t(u32, sectorsize - *cur_out % sectorsize, + orig_out + compressed_size - *cur_out); + + kunmap(cur_page); + cur_page = out_pages[*cur_out / PAGE_SIZE]; + /* Allocate a new page */ + if (!cur_page) { + cur_page = alloc_page(GFP_NOFS); + if (!cur_page) + return -ENOMEM; + out_pages[*cur_out / PAGE_SIZE] = cur_page; + } + kaddr = kmap(cur_page); + + memcpy(kaddr + offset_in_page(*cur_out), + compressed_data + *cur_out - orig_out, copy_len); + + *cur_out += copy_len; + } + + /* + * Check if we can fit the next segment header into the remaining space + * of the sector. + */ + sector_bytes_left = round_up(*cur_out, sectorsize) - *cur_out; + if (sector_bytes_left >= LZO_LEN || sector_bytes_left == 0) + goto out; + + /* The remaining size is not enough, pad it with zeros */ + memset(kaddr + offset_in_page(*cur_out), 0, + sector_bytes_left); + *cur_out += sector_bytes_left; + +out: + kunmap(cur_page); + return 0; +} + int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, u64 start, struct page **pages, unsigned long *out_pages, unsigned long *total_in, unsigned long *total_out) { struct workspace *workspace = list_entry(ws, struct workspace, list); + const u32 sectorsize = btrfs_sb(mapping->host->i_sb)->sectorsize; + struct page *page_in = NULL; + char *sizes_ptr; int ret = 0; - char *data_in; - char *cpage_out, *sizes_ptr; - int nr_pages = 0; - struct page *in_page = NULL; - struct page *out_page = NULL; - unsigned long bytes_left; - unsigned long len = *total_out; - unsigned long nr_dest_pages = *out_pages; - const unsigned long max_out = nr_dest_pages * PAGE_SIZE; - size_t in_len; - size_t out_len; - char *buf; - unsigned long tot_in = 0; - unsigned long tot_out = 0; - unsigned long pg_bytes_left; - unsigned long out_offset; - unsigned long bytes; + /* Points to the file offset of input data */ + u64 cur_in = start; + /* Points to the current output byte */ + u32 cur_out = 0; + u32 len = *total_out; *out_pages = 0; *total_out = 0; *total_in = 0; - in_page = find_get_page(mapping, start >> PAGE_SHIFT); - data_in = page_address(in_page); - /* - * store the size of all chunks of compressed data in - * the first 4 bytes + * Skip the header for now, we will later come back and write the total + * compressed size */ - out_page = alloc_page(GFP_NOFS); - if (out_page == NULL) { - ret = -ENOMEM; - goto out; - } - cpage_out = page_address(out_page); - out_offset = LZO_LEN; - tot_out = LZO_LEN; - pages[0] = out_page; - nr_pages = 1; - pg_bytes_left = PAGE_SIZE - LZO_LEN; - - /* compress at most one page of data each time */ - in_len = min(len, PAGE_SIZE); - while (tot_in < len) { - ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf, - &out_len, workspace->mem); - if (ret != LZO_E_OK) { - pr_debug("BTRFS: lzo in loop returned %d\n", - ret); + cur_out += LZO_LEN; + while (cur_in < start + len) { + char *data_in; + const u32 sectorsize_mask = sectorsize - 1; + u32 sector_off = (cur_in - start) & sectorsize_mask; + u32 in_len; + size_t out_len; + + /* Get the input page first */ + if (!page_in) { + page_in = find_get_page(mapping, cur_in >> PAGE_SHIFT); + ASSERT(page_in); + } + + /* Compress at most one sector of data each time */ + in_len = min_t(u32, start + len - cur_in, sectorsize - sector_off); + ASSERT(in_len); + data_in = kmap(page_in); + ret = lzo1x_1_compress(data_in + + offset_in_page(cur_in), in_len, + workspace->cbuf, &out_len, + workspace->mem); + kunmap(page_in); + if (ret < 0) { + pr_debug("BTRFS: lzo in loop returned %d\n", ret); ret = -EIO; goto out; } - /* store the size of this chunk of compressed data */ - write_compress_length(cpage_out + out_offset, out_len); - tot_out += LZO_LEN; - out_offset += LZO_LEN; - pg_bytes_left -= LZO_LEN; - - tot_in += in_len; - tot_out += out_len; - - /* copy bytes from the working buffer into the pages */ - buf = workspace->cbuf; - while (out_len) { - bytes = min_t(unsigned long, pg_bytes_left, out_len); - - memcpy(cpage_out + out_offset, buf, bytes); - - out_len -= bytes; - pg_bytes_left -= bytes; - buf += bytes; - out_offset += bytes; - - /* - * we need another page for writing out. - * - * Note if there's less than 4 bytes left, we just - * skip to a new page. - */ - if ((out_len == 0 && pg_bytes_left < LZO_LEN) || - pg_bytes_left == 0) { - if (pg_bytes_left) { - memset(cpage_out + out_offset, 0, - pg_bytes_left); - tot_out += pg_bytes_left; - } - - /* we're done, don't allocate new page */ - if (out_len == 0 && tot_in >= len) - break; - - if (nr_pages == nr_dest_pages) { - out_page = NULL; - ret = -E2BIG; - goto out; - } - - out_page = alloc_page(GFP_NOFS); - if (out_page == NULL) { - ret = -ENOMEM; - goto out; - } - cpage_out = page_address(out_page); - pages[nr_pages++] = out_page; - - pg_bytes_left = PAGE_SIZE; - out_offset = 0; - } - } + ret = copy_compressed_data_to_page(workspace->cbuf, out_len, + pages, &cur_out, sectorsize); + if (ret < 0) + goto out; + + cur_in += in_len; - /* we're making it bigger, give up */ - if (tot_in > 8192 && tot_in < tot_out) { + /* + * Check if we're making it bigger after two sectors. And if + * it is so, give up. + */ + if (cur_in - start > sectorsize * 2 && cur_in - start < cur_out) { ret = -E2BIG; goto out; } - /* we're all done */ - if (tot_in >= len) - break; - - if (tot_out > max_out) - break; - - bytes_left = len - tot_in; - put_page(in_page); - - start += PAGE_SIZE; - in_page = find_get_page(mapping, start >> PAGE_SHIFT); - data_in = page_address(in_page); - in_len = min(bytes_left, PAGE_SIZE); - } - - if (tot_out >= tot_in) { - ret = -E2BIG; - goto out; + /* Check if we have reached page boundary */ + if (IS_ALIGNED(cur_in, PAGE_SIZE)) { + put_page(page_in); + page_in = NULL; + } } - /* store the size of all chunks of compressed data */ - sizes_ptr = page_address(pages[0]); - write_compress_length(sizes_ptr, tot_out); + /* Store the size of all chunks of compressed data */ + sizes_ptr = kmap_local_page(pages[0]); + write_compress_length(sizes_ptr, cur_out); + kunmap_local(sizes_ptr); ret = 0; - *total_out = tot_out; - *total_in = tot_in; + *total_out = cur_out; + *total_in = cur_in - start; out: - *out_pages = nr_pages; - - if (in_page) - put_page(in_page); - + *out_pages = DIV_ROUND_UP(cur_out, PAGE_SIZE); return ret; } @@ -283,6 +294,7 @@ static void copy_compressed_segment(struct compressed_bio *cb, u32 orig_in = *cur_in; while (*cur_in < orig_in + len) { + char *kaddr; struct page *cur_page; u32 copy_len = min_t(u32, PAGE_SIZE - offset_in_page(*cur_in), orig_in + len - *cur_in); @@ -290,9 +302,11 @@ static void copy_compressed_segment(struct compressed_bio *cb, ASSERT(copy_len); cur_page = cb->compressed_pages[*cur_in / PAGE_SIZE]; + kaddr = kmap(cur_page); memcpy(dest + *cur_in - orig_in, - page_address(cur_page) + offset_in_page(*cur_in), + kaddr + offset_in_page(*cur_in), copy_len); + kunmap(cur_page); *cur_in += copy_len; } @@ -303,6 +317,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) struct workspace *workspace = list_entry(ws, struct workspace, list); const struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb); const u32 sectorsize = fs_info->sectorsize; + char *kaddr; int ret; /* Compressed data length, can be unaligned */ u32 len_in; @@ -311,7 +326,9 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) /* Bytes decompressed so far */ u32 cur_out = 0; - len_in = read_compress_length(page_address(cb->compressed_pages[0])); + kaddr = kmap(cb->compressed_pages[0]); + len_in = read_compress_length(kaddr); + kunmap(cb->compressed_pages[0]); cur_in += LZO_LEN; /* @@ -345,8 +362,9 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) (cur_in + LZO_LEN - 1) / sectorsize); cur_page = cb->compressed_pages[cur_in / PAGE_SIZE]; ASSERT(cur_page); - seg_len = read_compress_length(page_address(cur_page) + - offset_in_page(cur_in)); + kaddr = kmap(cur_page); + seg_len = read_compress_length(kaddr + offset_in_page(cur_in)); + kunmap(cur_page); cur_in += LZO_LEN; /* Copy the compressed segment payload into workspace */ @@ -431,7 +449,7 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in, destlen = min_t(unsigned long, destlen, PAGE_SIZE); bytes = min_t(unsigned long, destlen, out_len - start_byte); - kaddr = page_address(dest_page); + kaddr = kmap_local_page(dest_page); memcpy(kaddr, workspace->buf + start_byte, bytes); /* @@ -441,6 +459,7 @@ int lzo_decompress(struct list_head *ws, unsigned char *data_in, */ if (bytes < destlen) memset(kaddr+bytes, 0, destlen-bytes); + kunmap_local(kaddr); out: return ret; } diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index d8d268ca8aa7..0e239a4c3b26 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -60,8 +60,7 @@ enum btrfs_rbio_ops { }; struct btrfs_raid_bio { - struct btrfs_fs_info *fs_info; - struct btrfs_bio *bbio; + struct btrfs_io_context *bioc; /* while we're doing rmw on a stripe * we put it into a hash table so we can @@ -192,7 +191,7 @@ static void scrub_parity_work(struct btrfs_work *work); static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func) { btrfs_init_work(&rbio->work, work_func, NULL, NULL); - btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work); + btrfs_queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work); } /* @@ -271,7 +270,7 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio) */ static int rbio_bucket(struct btrfs_raid_bio *rbio) { - u64 num = rbio->bbio->raid_map[0]; + u64 num = rbio->bioc->raid_map[0]; /* * we shift down quite a bit. We're using byte @@ -345,7 +344,7 @@ static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) return; - table = rbio->fs_info->stripe_hash_table; + table = rbio->bioc->fs_info->stripe_hash_table; h = table->table + bucket; /* hold the lock for the bucket because we may be @@ -400,7 +399,7 @@ static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio) if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) return; - table = rbio->fs_info->stripe_hash_table; + table = rbio->bioc->fs_info->stripe_hash_table; spin_lock_irqsave(&table->cache_lock, flags); __remove_rbio_from_cache(rbio); @@ -460,7 +459,7 @@ static void cache_rbio(struct btrfs_raid_bio *rbio) if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags)) return; - table = rbio->fs_info->stripe_hash_table; + table = rbio->bioc->fs_info->stripe_hash_table; spin_lock_irqsave(&table->cache_lock, flags); spin_lock(&rbio->bio_list_lock); @@ -559,8 +558,7 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, test_bit(RBIO_CACHE_BIT, &cur->flags)) return 0; - if (last->bbio->raid_map[0] != - cur->bbio->raid_map[0]) + if (last->bioc->raid_map[0] != cur->bioc->raid_map[0]) return 0; /* we can't merge with different operations */ @@ -669,11 +667,11 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) struct btrfs_raid_bio *cache_drop = NULL; int ret = 0; - h = rbio->fs_info->stripe_hash_table->table + rbio_bucket(rbio); + h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio); spin_lock_irqsave(&h->lock, flags); list_for_each_entry(cur, &h->hash_list, hash_list) { - if (cur->bbio->raid_map[0] != rbio->bbio->raid_map[0]) + if (cur->bioc->raid_map[0] != rbio->bioc->raid_map[0]) continue; spin_lock(&cur->bio_list_lock); @@ -751,7 +749,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) int keep_cache = 0; bucket = rbio_bucket(rbio); - h = rbio->fs_info->stripe_hash_table->table + bucket; + h = rbio->bioc->fs_info->stripe_hash_table->table + bucket; if (list_empty(&rbio->plug_list)) cache_rbio(rbio); @@ -838,7 +836,7 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio) } } - btrfs_put_bbio(rbio->bbio); + btrfs_put_bioc(rbio->bioc); kfree(rbio); } @@ -865,7 +863,7 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) struct bio *extra; if (rbio->generic_bio_cnt) - btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt); + btrfs_bio_counter_sub(rbio->bioc->fs_info, rbio->generic_bio_cnt); /* * At this moment, rbio->bio_list is empty, however since rbio does not @@ -906,7 +904,7 @@ static void raid_write_end_io(struct bio *bio) /* OK, we have read all the stripes we need to. */ max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ? - 0 : rbio->bbio->max_errors; + 0 : rbio->bioc->max_errors; if (atomic_read(&rbio->error) > max_errors) err = BLK_STS_IOERR; @@ -961,12 +959,12 @@ static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) * this does not allocate any pages for rbio->pages. */ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, - struct btrfs_bio *bbio, + struct btrfs_io_context *bioc, u64 stripe_len) { struct btrfs_raid_bio *rbio; int nr_data = 0; - int real_stripes = bbio->num_stripes - bbio->num_tgtdevs; + int real_stripes = bioc->num_stripes - bioc->num_tgtdevs; int num_pages = rbio_nr_pages(stripe_len, real_stripes); int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE); void *p; @@ -987,8 +985,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, spin_lock_init(&rbio->bio_list_lock); INIT_LIST_HEAD(&rbio->stripe_cache); INIT_LIST_HEAD(&rbio->hash_list); - rbio->bbio = bbio; - rbio->fs_info = fs_info; + rbio->bioc = bioc; rbio->stripe_len = stripe_len; rbio->nr_pages = num_pages; rbio->real_stripes = real_stripes; @@ -1015,9 +1012,9 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_npages)); #undef CONSUME_ALLOC - if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5) + if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5) nr_data = real_stripes - 1; - else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) + else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) nr_data = real_stripes - 2; else BUG(); @@ -1077,10 +1074,10 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, struct bio *last = bio_list->tail; int ret; struct bio *bio; - struct btrfs_bio_stripe *stripe; + struct btrfs_io_stripe *stripe; u64 disk_start; - stripe = &rbio->bbio->stripes[stripe_nr]; + stripe = &rbio->bioc->stripes[stripe_nr]; disk_start = stripe->physical + (page_index << PAGE_SHIFT); /* if the device is missing, just fail this stripe */ @@ -1105,8 +1102,8 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, } /* put a new bio on the list */ - bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1); - btrfs_io_bio(bio)->device = stripe->dev; + bio = btrfs_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1); + btrfs_bio(bio)->device = stripe->dev; bio->bi_iter.bi_size = 0; bio_set_dev(bio, stripe->dev->bdev); bio->bi_iter.bi_sector = disk_start >> 9; @@ -1155,11 +1152,11 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio) int i = 0; start = bio->bi_iter.bi_sector << 9; - stripe_offset = start - rbio->bbio->raid_map[0]; + stripe_offset = start - rbio->bioc->raid_map[0]; page_index = stripe_offset >> PAGE_SHIFT; if (bio_flagged(bio, BIO_CLONED)) - bio->bi_iter = btrfs_io_bio(bio)->iter; + bio->bi_iter = btrfs_bio(bio)->iter; bio_for_each_segment(bvec, bio, iter) { rbio->bio_pages[page_index + i] = bvec.bv_page; @@ -1179,7 +1176,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio) */ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) { - struct btrfs_bio *bbio = rbio->bbio; + struct btrfs_io_context *bioc = rbio->bioc; void **pointers = rbio->finish_pointers; int nr_data = rbio->nr_data; int stripe; @@ -1284,11 +1281,11 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) } } - if (likely(!bbio->num_tgtdevs)) + if (likely(!bioc->num_tgtdevs)) goto write_data; for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - if (!bbio->tgtdev_map[stripe]) + if (!bioc->tgtdev_map[stripe]) continue; for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { @@ -1302,7 +1299,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) } ret = rbio_add_io_page(rbio, &bio_list, page, - rbio->bbio->tgtdev_map[stripe], + rbio->bioc->tgtdev_map[stripe], pagenr, rbio->stripe_len); if (ret) goto cleanup; @@ -1339,12 +1336,12 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio, { u64 physical = bio->bi_iter.bi_sector; int i; - struct btrfs_bio_stripe *stripe; + struct btrfs_io_stripe *stripe; physical <<= 9; - for (i = 0; i < rbio->bbio->num_stripes; i++) { - stripe = &rbio->bbio->stripes[i]; + for (i = 0; i < rbio->bioc->num_stripes; i++) { + stripe = &rbio->bioc->stripes[i]; if (in_range(physical, stripe->physical, rbio->stripe_len) && stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) { return i; @@ -1365,7 +1362,7 @@ static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, int i; for (i = 0; i < rbio->nr_data; i++) { - u64 stripe_start = rbio->bbio->raid_map[i]; + u64 stripe_start = rbio->bioc->raid_map[i]; if (in_range(logical, stripe_start, rbio->stripe_len)) return i; @@ -1456,7 +1453,7 @@ static void raid_rmw_end_io(struct bio *bio) if (!atomic_dec_and_test(&rbio->stripes_pending)) return; - if (atomic_read(&rbio->error) > rbio->bbio->max_errors) + if (atomic_read(&rbio->error) > rbio->bioc->max_errors) goto cleanup; /* @@ -1538,8 +1535,8 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) } /* - * the bbio may be freed once we submit the last bio. Make sure - * not to touch it after that + * The bioc may be freed once we submit the last bio. Make sure not to + * touch it after that. */ atomic_set(&rbio->stripes_pending, bios_to_read); while ((bio = bio_list_pop(&bio_list))) { @@ -1547,7 +1544,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) bio->bi_end_io = raid_rmw_end_io; bio->bi_opf = REQ_OP_READ; - btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); + btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); submit_bio(bio); } @@ -1719,17 +1716,18 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) /* * our main entry point for writes from the rest of the FS. */ -int raid56_parity_write(struct btrfs_fs_info *fs_info, struct bio *bio, - struct btrfs_bio *bbio, u64 stripe_len) +int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, + u64 stripe_len) { + struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; struct btrfs_plug_cb *plug = NULL; struct blk_plug_cb *cb; int ret; - rbio = alloc_rbio(fs_info, bbio, stripe_len); + rbio = alloc_rbio(fs_info, bioc, stripe_len); if (IS_ERR(rbio)) { - btrfs_put_bbio(bbio); + btrfs_put_bioc(bioc); return PTR_ERR(rbio); } bio_list_add(&rbio->bio_list, bio); @@ -1842,7 +1840,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) } /* all raid6 handling here */ - if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) { + if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) { /* * single failure, rebuild from parity raid5 * style @@ -1874,8 +1872,8 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) * here due to a crc mismatch and we can't give them the * data they want */ - if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) { - if (rbio->bbio->raid_map[faila] == + if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) { + if (rbio->bioc->raid_map[faila] == RAID5_P_STRIPE) { err = BLK_STS_IOERR; goto cleanup; @@ -1887,7 +1885,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) goto pstripe; } - if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) { + if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) { raid6_datap_recov(rbio->real_stripes, PAGE_SIZE, faila, pointers); } else { @@ -2006,7 +2004,7 @@ static void raid_recover_end_io(struct bio *bio) if (!atomic_dec_and_test(&rbio->stripes_pending)) return; - if (atomic_read(&rbio->error) > rbio->bbio->max_errors) + if (atomic_read(&rbio->error) > rbio->bioc->max_errors) rbio_orig_end_io(rbio, BLK_STS_IOERR); else __raid_recover_end_io(rbio); @@ -2074,7 +2072,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) * were up to date, or we might have no bios to read because * the devices were gone. */ - if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) { + if (atomic_read(&rbio->error) <= rbio->bioc->max_errors) { __raid_recover_end_io(rbio); return 0; } else { @@ -2083,8 +2081,8 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) } /* - * the bbio may be freed once we submit the last bio. Make sure - * not to touch it after that + * The bioc may be freed once we submit the last bio. Make sure not to + * touch it after that. */ atomic_set(&rbio->stripes_pending, bios_to_read); while ((bio = bio_list_pop(&bio_list))) { @@ -2092,7 +2090,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) bio->bi_end_io = raid_recover_end_io; bio->bi_opf = REQ_OP_READ; - btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); + btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); submit_bio(bio); } @@ -2116,22 +2114,22 @@ cleanup: * so we assume the bio they send down corresponds to a failed part * of the drive. */ -int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio, - struct btrfs_bio *bbio, u64 stripe_len, - int mirror_num, int generic_io) +int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, + u64 stripe_len, int mirror_num, int generic_io) { + struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; int ret; if (generic_io) { - ASSERT(bbio->mirror_num == mirror_num); - btrfs_io_bio(bio)->mirror_num = mirror_num; + ASSERT(bioc->mirror_num == mirror_num); + btrfs_bio(bio)->mirror_num = mirror_num; } - rbio = alloc_rbio(fs_info, bbio, stripe_len); + rbio = alloc_rbio(fs_info, bioc, stripe_len); if (IS_ERR(rbio)) { if (generic_io) - btrfs_put_bbio(bbio); + btrfs_put_bioc(bioc); return PTR_ERR(rbio); } @@ -2142,11 +2140,11 @@ int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio, rbio->faila = find_logical_bio_stripe(rbio, bio); if (rbio->faila == -1) { btrfs_warn(fs_info, - "%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bbio has map_type %llu)", +"%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)", __func__, bio->bi_iter.bi_sector << 9, - (u64)bio->bi_iter.bi_size, bbio->map_type); + (u64)bio->bi_iter.bi_size, bioc->map_type); if (generic_io) - btrfs_put_bbio(bbio); + btrfs_put_bioc(bioc); kfree(rbio); return -EIO; } @@ -2155,7 +2153,7 @@ int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio, btrfs_bio_counter_inc_noblocked(fs_info); rbio->generic_bio_cnt = 1; } else { - btrfs_get_bbio(bbio); + btrfs_get_bioc(bioc); } /* @@ -2214,23 +2212,23 @@ static void read_rebuild_work(struct btrfs_work *work) /* * The following code is used to scrub/replace the parity stripe * - * Caller must have already increased bio_counter for getting @bbio. + * Caller must have already increased bio_counter for getting @bioc. * * Note: We need make sure all the pages that add into the scrub/replace * raid bio are correct and not be changed during the scrub/replace. That * is those pages just hold metadata or file data with checksum. */ -struct btrfs_raid_bio * -raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, - struct btrfs_bio *bbio, u64 stripe_len, - struct btrfs_device *scrub_dev, - unsigned long *dbitmap, int stripe_nsectors) +struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, + struct btrfs_io_context *bioc, + u64 stripe_len, struct btrfs_device *scrub_dev, + unsigned long *dbitmap, int stripe_nsectors) { + struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; int i; - rbio = alloc_rbio(fs_info, bbio, stripe_len); + rbio = alloc_rbio(fs_info, bioc, stripe_len); if (IS_ERR(rbio)) return NULL; bio_list_add(&rbio->bio_list, bio); @@ -2242,12 +2240,12 @@ raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, rbio->operation = BTRFS_RBIO_PARITY_SCRUB; /* - * After mapping bbio with BTRFS_MAP_WRITE, parities have been sorted + * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted * to the end position, so this search can start from the first parity * stripe. */ for (i = rbio->nr_data; i < rbio->real_stripes; i++) { - if (bbio->stripes[i].dev == scrub_dev) { + if (bioc->stripes[i].dev == scrub_dev) { rbio->scrubp = i; break; } @@ -2260,7 +2258,7 @@ raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors); /* - * We have already increased bio_counter when getting bbio, record it + * We have already increased bio_counter when getting bioc, record it * so we can free it at rbio_orig_end_io(). */ rbio->generic_bio_cnt = 1; @@ -2275,10 +2273,10 @@ void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, int stripe_offset; int index; - ASSERT(logical >= rbio->bbio->raid_map[0]); - ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] + + ASSERT(logical >= rbio->bioc->raid_map[0]); + ASSERT(logical + PAGE_SIZE <= rbio->bioc->raid_map[0] + rbio->stripe_len * rbio->nr_data); - stripe_offset = (int)(logical - rbio->bbio->raid_map[0]); + stripe_offset = (int)(logical - rbio->bioc->raid_map[0]); index = stripe_offset >> PAGE_SHIFT; rbio->bio_pages[index] = page; } @@ -2312,7 +2310,7 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) { - struct btrfs_bio *bbio = rbio->bbio; + struct btrfs_io_context *bioc = rbio->bioc; void **pointers = rbio->finish_pointers; unsigned long *pbitmap = rbio->finish_pbitmap; int nr_data = rbio->nr_data; @@ -2335,7 +2333,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, else BUG(); - if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) { + if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) { is_replace = 1; bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages); } @@ -2435,7 +2433,7 @@ writeback: page = rbio_stripe_page(rbio, rbio->scrubp, pagenr); ret = rbio_add_io_page(rbio, &bio_list, page, - bbio->tgtdev_map[rbio->scrubp], + bioc->tgtdev_map[rbio->scrubp], pagenr, rbio->stripe_len); if (ret) goto cleanup; @@ -2483,7 +2481,7 @@ static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) */ static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio) { - if (atomic_read(&rbio->error) > rbio->bbio->max_errors) + if (atomic_read(&rbio->error) > rbio->bioc->max_errors) goto cleanup; if (rbio->faila >= 0 || rbio->failb >= 0) { @@ -2504,7 +2502,7 @@ static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio) * the data, so the capability of the repair is declined. * (In the case of RAID5, we can not repair anything) */ - if (dfail > rbio->bbio->max_errors - 1) + if (dfail > rbio->bioc->max_errors - 1) goto cleanup; /* @@ -2625,8 +2623,8 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) } /* - * the bbio may be freed once we submit the last bio. Make sure - * not to touch it after that + * The bioc may be freed once we submit the last bio. Make sure not to + * touch it after that. */ atomic_set(&rbio->stripes_pending, bios_to_read); while ((bio = bio_list_pop(&bio_list))) { @@ -2634,7 +2632,7 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) bio->bi_end_io = raid56_parity_scrub_end_io; bio->bi_opf = REQ_OP_READ; - btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); + btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); submit_bio(bio); } @@ -2670,12 +2668,13 @@ void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) /* The following code is used for dev replace of a missing RAID 5/6 device. */ struct btrfs_raid_bio * -raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, - struct btrfs_bio *bbio, u64 length) +raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc, + u64 length) { + struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; - rbio = alloc_rbio(fs_info, bbio, length); + rbio = alloc_rbio(fs_info, bioc, length); if (IS_ERR(rbio)) return NULL; @@ -2695,7 +2694,7 @@ raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, } /* - * When we get bbio, we have already increased bio_counter, record it + * When we get bioc, we have already increased bio_counter, record it * so we can free it at rbio_orig_end_io() */ rbio->generic_bio_cnt = 1; diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 2503485db859..72c00fc284b5 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -30,25 +30,23 @@ static inline int nr_data_stripes(const struct map_lookup *map) struct btrfs_raid_bio; struct btrfs_device; -int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio, - struct btrfs_bio *bbio, u64 stripe_len, - int mirror_num, int generic_io); -int raid56_parity_write(struct btrfs_fs_info *fs_info, struct bio *bio, - struct btrfs_bio *bbio, u64 stripe_len); +int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, + u64 stripe_len, int mirror_num, int generic_io); +int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, + u64 stripe_len); void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, u64 logical); -struct btrfs_raid_bio * -raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, - struct btrfs_bio *bbio, u64 stripe_len, - struct btrfs_device *scrub_dev, - unsigned long *dbitmap, int stripe_nsectors); +struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, + struct btrfs_io_context *bioc, u64 stripe_len, + struct btrfs_device *scrub_dev, + unsigned long *dbitmap, int stripe_nsectors); void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio); struct btrfs_raid_bio * -raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, - struct btrfs_bio *bbio, u64 length); +raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc, + u64 length); void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio); int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 06713a8fe26b..eb96fdc3be25 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -227,7 +227,7 @@ start_machine: } static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical, - struct btrfs_bio *bbio) + struct btrfs_io_context *bioc) { struct btrfs_fs_info *fs_info = dev->fs_info; int ret; @@ -275,11 +275,11 @@ static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical, kref_init(&zone->refcnt); zone->elems = 0; zone->device = dev; /* our device always sits at index 0 */ - for (i = 0; i < bbio->num_stripes; ++i) { + for (i = 0; i < bioc->num_stripes; ++i) { /* bounds have already been checked */ - zone->devs[i] = bbio->stripes[i].dev; + zone->devs[i] = bioc->stripes[i].dev; } - zone->ndevs = bbio->num_stripes; + zone->ndevs = bioc->num_stripes; spin_lock(&fs_info->reada_lock); ret = radix_tree_insert(&dev->reada_zones, @@ -309,7 +309,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, int ret; struct reada_extent *re = NULL; struct reada_extent *re_exist = NULL; - struct btrfs_bio *bbio = NULL; + struct btrfs_io_context *bioc = NULL; struct btrfs_device *dev; struct btrfs_device *prev_dev; u64 length; @@ -345,28 +345,28 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, */ length = fs_info->nodesize; ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, - &length, &bbio, 0); - if (ret || !bbio || length < fs_info->nodesize) + &length, &bioc, 0); + if (ret || !bioc || length < fs_info->nodesize) goto error; - if (bbio->num_stripes > BTRFS_MAX_MIRRORS) { + if (bioc->num_stripes > BTRFS_MAX_MIRRORS) { btrfs_err(fs_info, "readahead: more than %d copies not supported", BTRFS_MAX_MIRRORS); goto error; } - real_stripes = bbio->num_stripes - bbio->num_tgtdevs; + real_stripes = bioc->num_stripes - bioc->num_tgtdevs; for (nzones = 0; nzones < real_stripes; ++nzones) { struct reada_zone *zone; - dev = bbio->stripes[nzones].dev; + dev = bioc->stripes[nzones].dev; /* cannot read ahead on missing device. */ if (!dev->bdev) continue; - zone = reada_find_zone(dev, logical, bbio); + zone = reada_find_zone(dev, logical, bioc); if (!zone) continue; @@ -464,7 +464,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, if (!have_zone) goto error; - btrfs_put_bbio(bbio); + btrfs_put_bioc(bioc); return re; error: @@ -488,7 +488,7 @@ error: kref_put(&zone->refcnt, reada_zone_release); spin_unlock(&fs_info->reada_lock); } - btrfs_put_bbio(bbio); + btrfs_put_bioc(bioc); kfree(re); return re_exist; } diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index d2062d5f71dd..e2b9f8616501 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -678,10 +678,10 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, if (generic_ref->type == BTRFS_REF_METADATA) { if (!parent) - ref_root = generic_ref->tree_ref.root; + ref_root = generic_ref->tree_ref.owning_root; owner = generic_ref->tree_ref.level; } else if (!parent) { - ref_root = generic_ref->data_ref.ref_root; + ref_root = generic_ref->data_ref.owning_root; owner = generic_ref->data_ref.ino; offset = generic_ref->data_ref.offset; } diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 9b0814318e72..e0f93b357548 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -138,7 +138,7 @@ static int copy_inline_to_page(struct btrfs_inode *inode, } btrfs_page_set_uptodate(fs_info, page, file_offset, block_size); - ClearPageChecked(page); + btrfs_page_clear_checked(fs_info, page, file_offset, block_size); btrfs_page_set_dirty(fs_info, page, file_offset, block_size); out_unlock: if (page) { @@ -649,7 +649,7 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, struct inode *dst, u64 dst_loff) { - int ret; + int ret = 0; u64 i, tail_len, chunk_count; struct btrfs_root *root_dst = BTRFS_I(dst)->root; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 914d403b4415..33a0ee7ac590 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -25,6 +25,7 @@ #include "backref.h" #include "misc.h" #include "subpage.h" +#include "zoned.h" /* * Relocation overview @@ -1145,9 +1146,9 @@ int replace_file_extents(struct btrfs_trans_handle *trans, key.offset -= btrfs_file_extent_offset(leaf, fi); btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr, num_bytes, parent); - ref.real_root = root->root_key.objectid; btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), - key.objectid, key.offset); + key.objectid, key.offset, + root->root_key.objectid, false); ret = btrfs_inc_extent_ref(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -1156,9 +1157,9 @@ int replace_file_extents(struct btrfs_trans_handle *trans, btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, num_bytes, parent); - ref.real_root = root->root_key.objectid; btrfs_init_data_ref(&ref, btrfs_header_owner(leaf), - key.objectid, key.offset); + key.objectid, key.offset, + root->root_key.objectid, false); ret = btrfs_free_extent(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -1367,8 +1368,8 @@ again: btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr, blocksize, path->nodes[level]->start); - ref.skip_qgroup = true; - btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid); + btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid, + 0, true); ret = btrfs_inc_extent_ref(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -1376,8 +1377,8 @@ again: } btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr, blocksize, 0); - ref.skip_qgroup = true; - btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid); + btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid, 0, + true); ret = btrfs_inc_extent_ref(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -1386,8 +1387,8 @@ again: btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr, blocksize, path->nodes[level]->start); - btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid); - ref.skip_qgroup = true; + btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid, + 0, true); ret = btrfs_free_extent(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -1396,8 +1397,8 @@ again: btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr, blocksize, 0); - btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid); - ref.skip_qgroup = true; + btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid, + 0, true); ret = btrfs_free_extent(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); @@ -2473,9 +2474,9 @@ static int do_relocation(struct btrfs_trans_handle *trans, btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, node->eb->start, blocksize, upper->eb->start); - ref.real_root = root->root_key.objectid; btrfs_init_tree_ref(&ref, node->level, - btrfs_header_owner(upper->eb)); + btrfs_header_owner(upper->eb), + root->root_key.objectid, false); ret = btrfs_inc_extent_ref(trans, &ref); if (!ret) ret = btrfs_drop_subtree(trans, root, eb, @@ -2691,8 +2692,12 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, list_add_tail(&node->list, &rc->backref_cache.changed); } else { path->lowest_level = node->level; + if (root == root->fs_info->chunk_root) + btrfs_reserve_chunk_metadata(trans, false); ret = btrfs_search_slot(trans, root, key, path, 0, 1); btrfs_release_path(path); + if (root == root->fs_info->chunk_root) + btrfs_trans_release_chunk_metadata(trans); if (ret > 0) ret = 0; } @@ -2852,31 +2857,6 @@ static noinline_for_stack int prealloc_file_extent_cluster( if (ret) return ret; - /* - * On a zoned filesystem, we cannot preallocate the file region. - * Instead, we dirty and fiemap_write the region. - */ - if (btrfs_is_zoned(inode->root->fs_info)) { - struct btrfs_root *root = inode->root; - struct btrfs_trans_handle *trans; - - end = cluster->end - offset + 1; - trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - inode->vfs_inode.i_ctime = current_time(&inode->vfs_inode); - i_size_write(&inode->vfs_inode, end); - ret = btrfs_update_inode(trans, root, inode); - if (ret) { - btrfs_abort_transaction(trans, ret); - btrfs_end_transaction(trans); - return ret; - } - - return btrfs_end_transaction(trans); - } - btrfs_inode_lock(&inode->vfs_inode, 0); for (nr = 0; nr < cluster->nr; nr++) { start = cluster->boundary[nr] - offset; @@ -2903,9 +2883,8 @@ static noinline_for_stack int prealloc_file_extent_cluster( return ret; } -static noinline_for_stack -int setup_extent_mapping(struct inode *inode, u64 start, u64 end, - u64 block_start) +static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inode, + u64 start, u64 end, u64 block_start) { struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; struct extent_map *em; @@ -3084,7 +3063,6 @@ release_page: static int relocate_file_extent_cluster(struct inode *inode, struct file_extent_cluster *cluster) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 offset = BTRFS_I(inode)->index_cnt; unsigned long index; unsigned long last_index; @@ -3105,7 +3083,7 @@ static int relocate_file_extent_cluster(struct inode *inode, file_ra_state_init(ra, inode->i_mapping); - ret = setup_extent_mapping(inode, cluster->start - offset, + ret = setup_relocation_extent_mapping(inode, cluster->start - offset, cluster->end - offset, cluster->start); if (ret) goto out; @@ -3114,8 +3092,6 @@ static int relocate_file_extent_cluster(struct inode *inode, for (index = (cluster->start - offset) >> PAGE_SHIFT; index <= last_index && !ret; index++) ret = relocate_one_page(inode, ra, cluster, &cluster_nr, index); - if (btrfs_is_zoned(fs_info) && !ret) - ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); if (ret == 0) WARN_ON(cluster_nr != cluster->nr); out: @@ -3770,12 +3746,8 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, struct btrfs_path *path; struct btrfs_inode_item *item; struct extent_buffer *leaf; - u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC; int ret; - if (btrfs_is_zoned(trans->fs_info)) - flags &= ~BTRFS_INODE_PREALLOC; - path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -3790,7 +3762,8 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, btrfs_set_inode_generation(leaf, item, 1); btrfs_set_inode_size(leaf, item, 0); btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); - btrfs_set_inode_flags(leaf, item, flags); + btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | + BTRFS_INODE_PREALLOC); btrfs_mark_buffer_dirty(leaf); out: btrfs_free_path(path); @@ -4063,6 +4036,9 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) rc->block_group->start, rc->block_group->length); + ret = btrfs_zone_finish(rc->block_group); + WARN_ON(ret && ret != -EAGAIN); + while (1) { int finishes_stage; @@ -4386,8 +4362,7 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, if (!rc) return 0; - BUG_ON(rc->stage == UPDATE_DATA_PTRS && - root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID); + BUG_ON(rc->stage == UPDATE_DATA_PTRS && btrfs_is_data_reloc_root(root)); level = btrfs_header_level(buf); if (btrfs_header_generation(buf) <= diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 702dc5441f03..12ceb14a1141 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -39,10 +39,8 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot, need_reset = 1; } if (need_reset) { - memset(&item->generation_v2, 0, - sizeof(*item) - offsetof(struct btrfs_root_item, - generation_v2)); - + /* Clear all members from generation_v2 onwards. */ + memset_startat(item, 0, generation_v2); generate_random_guid(item->uuid); } } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 088641ba7a8e..cf82ea6f54fb 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -57,7 +57,7 @@ struct scrub_ctx; struct scrub_recover { refcount_t refs; - struct btrfs_bio *bbio; + struct btrfs_io_context *bioc; u64 map_length; }; @@ -254,7 +254,7 @@ static void scrub_put_ctx(struct scrub_ctx *sctx); static inline int scrub_is_page_on_raid56(struct scrub_page *spage) { return spage->recover && - (spage->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK); + (spage->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK); } static void scrub_pending_bio_inc(struct scrub_ctx *sctx) @@ -798,7 +798,7 @@ static inline void scrub_put_recover(struct btrfs_fs_info *fs_info, { if (refcount_dec_and_test(&recover->refs)) { btrfs_bio_counter_dec(fs_info); - btrfs_put_bbio(recover->bbio); + btrfs_put_bioc(recover->bioc); kfree(recover); } } @@ -1027,8 +1027,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) sblock_other = sblocks_for_recheck + mirror_index; } else { struct scrub_recover *r = sblock_bad->pagev[0]->recover; - int max_allowed = r->bbio->num_stripes - - r->bbio->num_tgtdevs; + int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs; if (mirror_index >= max_allowed) break; @@ -1218,14 +1217,14 @@ out: return 0; } -static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio) +static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc) { - if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5) + if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5) return 2; - else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) + else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) return 3; else - return (int)bbio->num_stripes; + return (int)bioc->num_stripes; } static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type, @@ -1269,7 +1268,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, u64 flags = original_sblock->pagev[0]->flags; u64 have_csum = original_sblock->pagev[0]->have_csum; struct scrub_recover *recover; - struct btrfs_bio *bbio; + struct btrfs_io_context *bioc; u64 sublen; u64 mapped_length; u64 stripe_offset; @@ -1288,7 +1287,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, while (length > 0) { sublen = min_t(u64, length, fs_info->sectorsize); mapped_length = sublen; - bbio = NULL; + bioc = NULL; /* * With a length of sectorsize, each returned stripe represents @@ -1296,27 +1295,27 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, */ btrfs_bio_counter_inc_blocked(fs_info); ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, - logical, &mapped_length, &bbio); - if (ret || !bbio || mapped_length < sublen) { - btrfs_put_bbio(bbio); + logical, &mapped_length, &bioc); + if (ret || !bioc || mapped_length < sublen) { + btrfs_put_bioc(bioc); btrfs_bio_counter_dec(fs_info); return -EIO; } recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS); if (!recover) { - btrfs_put_bbio(bbio); + btrfs_put_bioc(bioc); btrfs_bio_counter_dec(fs_info); return -ENOMEM; } refcount_set(&recover->refs, 1); - recover->bbio = bbio; + recover->bioc = bioc; recover->map_length = mapped_length; BUG_ON(page_index >= SCRUB_MAX_PAGES_PER_BLOCK); - nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS); + nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS); for (mirror_index = 0; mirror_index < nmirrors; mirror_index++) { @@ -1348,17 +1347,17 @@ leave_nomem: sctx->fs_info->csum_size); scrub_stripe_index_and_offset(logical, - bbio->map_type, - bbio->raid_map, + bioc->map_type, + bioc->raid_map, mapped_length, - bbio->num_stripes - - bbio->num_tgtdevs, + bioc->num_stripes - + bioc->num_tgtdevs, mirror_index, &stripe_index, &stripe_offset); - spage->physical = bbio->stripes[stripe_index].physical + + spage->physical = bioc->stripes[stripe_index].physical + stripe_offset; - spage->dev = bbio->stripes[stripe_index].dev; + spage->dev = bioc->stripes[stripe_index].dev; BUG_ON(page_index >= original_sblock->page_count); spage->physical_for_dev_replace = @@ -1401,7 +1400,7 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, bio->bi_end_io = scrub_bio_wait_endio; mirror_num = spage->sblock->pagev[0]->mirror_num; - ret = raid56_parity_recover(fs_info, bio, spage->recover->bbio, + ret = raid56_parity_recover(bio, spage->recover->bioc, spage->recover->map_length, mirror_num, 0); if (ret) @@ -1423,7 +1422,7 @@ static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info, if (!first_page->dev->bdev) goto out; - bio = btrfs_io_bio_alloc(BIO_MAX_VECS); + bio = btrfs_bio_alloc(BIO_MAX_VECS); bio_set_dev(bio, first_page->dev->bdev); for (page_num = 0; page_num < sblock->page_count; page_num++) { @@ -1480,7 +1479,7 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, } WARN_ON(!spage->page); - bio = btrfs_io_bio_alloc(1); + bio = btrfs_bio_alloc(1); bio_set_dev(bio, spage->dev->bdev); bio_add_page(bio, spage->page, fs_info->sectorsize, 0); @@ -1562,7 +1561,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, return -EIO; } - bio = btrfs_io_bio_alloc(1); + bio = btrfs_bio_alloc(1); bio_set_dev(bio, spage_bad->dev->bdev); bio->bi_iter.bi_sector = spage_bad->physical >> 9; bio->bi_opf = REQ_OP_WRITE; @@ -1676,7 +1675,7 @@ again: sbio->dev = sctx->wr_tgtdev; bio = sbio->bio; if (!bio) { - bio = btrfs_io_bio_alloc(sctx->pages_per_wr_bio); + bio = btrfs_bio_alloc(sctx->pages_per_wr_bio); sbio->bio = bio; } @@ -2102,7 +2101,7 @@ again: sbio->dev = spage->dev; bio = sbio->bio; if (!bio) { - bio = btrfs_io_bio_alloc(sctx->pages_per_rd_bio); + bio = btrfs_bio_alloc(sctx->pages_per_rd_bio); sbio->bio = bio; } @@ -2203,7 +2202,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) struct btrfs_fs_info *fs_info = sctx->fs_info; u64 length = sblock->page_count * PAGE_SIZE; u64 logical = sblock->pagev[0]->logical; - struct btrfs_bio *bbio = NULL; + struct btrfs_io_context *bioc = NULL; struct bio *bio; struct btrfs_raid_bio *rbio; int ret; @@ -2211,27 +2210,27 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) btrfs_bio_counter_inc_blocked(fs_info); ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, - &length, &bbio); - if (ret || !bbio || !bbio->raid_map) - goto bbio_out; + &length, &bioc); + if (ret || !bioc || !bioc->raid_map) + goto bioc_out; if (WARN_ON(!sctx->is_dev_replace || - !(bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) { + !(bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) { /* * We shouldn't be scrubbing a missing device. Even for dev * replace, we should only get here for RAID 5/6. We either * managed to mount something with no mirrors remaining or * there's a bug in scrub_remap_extent()/btrfs_map_block(). */ - goto bbio_out; + goto bioc_out; } - bio = btrfs_io_bio_alloc(0); + bio = btrfs_bio_alloc(BIO_MAX_VECS); bio->bi_iter.bi_sector = logical >> 9; bio->bi_private = sblock; bio->bi_end_io = scrub_missing_raid56_end_io; - rbio = raid56_alloc_missing_rbio(fs_info, bio, bbio, length); + rbio = raid56_alloc_missing_rbio(bio, bioc, length); if (!rbio) goto rbio_out; @@ -2249,9 +2248,9 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) rbio_out: bio_put(bio); -bbio_out: +bioc_out: btrfs_bio_counter_dec(fs_info); - btrfs_put_bbio(bbio); + btrfs_put_bioc(bioc); spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; spin_unlock(&sctx->stat_lock); @@ -2826,7 +2825,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) struct btrfs_fs_info *fs_info = sctx->fs_info; struct bio *bio; struct btrfs_raid_bio *rbio; - struct btrfs_bio *bbio = NULL; + struct btrfs_io_context *bioc = NULL; u64 length; int ret; @@ -2838,17 +2837,17 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) btrfs_bio_counter_inc_blocked(fs_info); ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start, - &length, &bbio); - if (ret || !bbio || !bbio->raid_map) - goto bbio_out; + &length, &bioc); + if (ret || !bioc || !bioc->raid_map) + goto bioc_out; - bio = btrfs_io_bio_alloc(0); + bio = btrfs_bio_alloc(BIO_MAX_VECS); bio->bi_iter.bi_sector = sparity->logic_start >> 9; bio->bi_private = sparity; bio->bi_end_io = scrub_parity_bio_endio; - rbio = raid56_parity_alloc_scrub_rbio(fs_info, bio, bbio, - length, sparity->scrub_dev, + rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, length, + sparity->scrub_dev, sparity->dbitmap, sparity->nsectors); if (!rbio) @@ -2860,9 +2859,9 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) rbio_out: bio_put(bio); -bbio_out: +bioc_out: btrfs_bio_counter_dec(fs_info); - btrfs_put_bbio(bbio); + btrfs_put_bioc(bioc); bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, sparity->nsectors); spin_lock(&sctx->stat_lock); @@ -2901,7 +2900,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, struct btrfs_root *root = fs_info->extent_root; struct btrfs_root *csum_root = fs_info->csum_root; struct btrfs_extent_item *extent; - struct btrfs_bio *bbio = NULL; + struct btrfs_io_context *bioc = NULL; u64 flags; int ret; int slot; @@ -3044,22 +3043,22 @@ again: extent_len); mapped_length = extent_len; - bbio = NULL; + bioc = NULL; ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, - extent_logical, &mapped_length, &bbio, + extent_logical, &mapped_length, &bioc, 0); if (!ret) { - if (!bbio || mapped_length < extent_len) + if (!bioc || mapped_length < extent_len) ret = -EIO; } if (ret) { - btrfs_put_bbio(bbio); + btrfs_put_bioc(bioc); goto out; } - extent_physical = bbio->stripes[0].physical; - extent_mirror_num = bbio->mirror_num; - extent_dev = bbio->stripes[0].dev; - btrfs_put_bbio(bbio); + extent_physical = bioc->stripes[0].physical; + extent_mirror_num = bioc->mirror_num; + extent_dev = bioc->stripes[0].dev; + btrfs_put_bioc(bioc); ret = btrfs_lookup_csums_range(csum_root, extent_logical, @@ -3956,7 +3955,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, int ret; struct btrfs_fs_info *fs_info = sctx->fs_info; - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) + if (BTRFS_FS_ERROR(fs_info)) return -EROFS; /* Seed devices of a new filesystem has their own generation. */ @@ -4068,6 +4067,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, u64 end, struct btrfs_scrub_progress *progress, int readonly, int is_dev_replace) { + struct btrfs_dev_lookup_args args = { .devid = devid }; struct scrub_ctx *sctx; int ret; struct btrfs_device *dev; @@ -4115,7 +4115,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, goto out_free_ctx; mutex_lock(&fs_info->fs_devices->device_list_mutex); - dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL); + dev = btrfs_find_device(fs_info->fs_devices, &args); if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) && !is_dev_replace)) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); @@ -4288,11 +4288,12 @@ int btrfs_scrub_cancel_dev(struct btrfs_device *dev) int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, struct btrfs_scrub_progress *progress) { + struct btrfs_dev_lookup_args args = { .devid = devid }; struct btrfs_device *dev; struct scrub_ctx *sctx = NULL; mutex_lock(&fs_info->fs_devices->device_list_mutex); - dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL); + dev = btrfs_find_device(fs_info->fs_devices, &args); if (dev) sctx = dev->scrub_ctx; if (sctx) @@ -4309,20 +4310,20 @@ static void scrub_remap_extent(struct btrfs_fs_info *fs_info, int *extent_mirror_num) { u64 mapped_length; - struct btrfs_bio *bbio = NULL; + struct btrfs_io_context *bioc = NULL; int ret; mapped_length = extent_len; ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical, - &mapped_length, &bbio, 0); - if (ret || !bbio || mapped_length < extent_len || - !bbio->stripes[0].dev->bdev) { - btrfs_put_bbio(bbio); + &mapped_length, &bioc, 0); + if (ret || !bioc || mapped_length < extent_len || + !bioc->stripes[0].dev->bdev) { + btrfs_put_bioc(bioc); return; } - *extent_physical = bbio->stripes[0].physical; - *extent_mirror_num = bbio->mirror_num; - *extent_dev = bbio->stripes[0].dev; - btrfs_put_bbio(bbio); + *extent_physical = bioc->stripes[0].physical; + *extent_mirror_num = bioc->mirror_num; + *extent_dev = bioc->stripes[0].dev; + btrfs_put_bioc(bioc); } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 72f9b865e847..040324d71118 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -84,6 +84,8 @@ struct send_ctx { u64 total_send_size; u64 cmd_send_size[BTRFS_SEND_C_MAX + 1]; u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */ + /* Protocol version compatibility requested */ + u32 proto; struct btrfs_root *send_root; struct btrfs_root *parent_root; @@ -312,6 +314,16 @@ static void inconsistent_snapshot_error(struct send_ctx *sctx, sctx->parent_root->root_key.objectid : 0)); } +__maybe_unused +static bool proto_cmd_ok(const struct send_ctx *sctx, int cmd) +{ + switch (sctx->proto) { + case 1: return cmd < __BTRFS_SEND_C_MAX_V1; + case 2: return cmd < __BTRFS_SEND_C_MAX_V2; + default: return false; + } +} + static int is_waiting_for_move(struct send_ctx *sctx, u64 ino); static struct waiting_dir_move * @@ -2720,19 +2732,12 @@ static int send_create_inode_if_needed(struct send_ctx *sctx) if (S_ISDIR(sctx->cur_inode_mode)) { ret = did_create_dir(sctx, sctx->cur_ino); if (ret < 0) - goto out; - if (ret) { - ret = 0; - goto out; - } + return ret; + else if (ret > 0) + return 0; } - ret = send_create_inode(sctx, sctx->cur_ino); - if (ret < 0) - goto out; - -out: - return ret; + return send_create_inode(sctx, sctx->cur_ino); } struct recorded_ref { @@ -7276,6 +7281,17 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) sctx->flags = arg->flags; + if (arg->flags & BTRFS_SEND_FLAG_VERSION) { + if (arg->version > BTRFS_SEND_STREAM_VERSION) { + ret = -EPROTO; + goto out; + } + /* Zero means "use the highest version" */ + sctx->proto = arg->version ?: BTRFS_SEND_STREAM_VERSION; + } else { + sctx->proto = 1; + } + sctx->send_filp = fget(arg->send_fd); if (!sctx->send_filp) { ret = -EBADF; diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index de91488b7cd0..23bcefc84e49 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -48,6 +48,7 @@ struct btrfs_tlv_header { enum btrfs_send_cmd { BTRFS_SEND_C_UNSPEC, + /* Version 1 */ BTRFS_SEND_C_SUBVOL, BTRFS_SEND_C_SNAPSHOT, @@ -76,6 +77,12 @@ enum btrfs_send_cmd { BTRFS_SEND_C_END, BTRFS_SEND_C_UPDATE_EXTENT, + __BTRFS_SEND_C_MAX_V1, + + /* Version 2 */ + __BTRFS_SEND_C_MAX_V2, + + /* End */ __BTRFS_SEND_C_MAX, }; #define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 5ada02e0e629..48d77f360a24 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -414,9 +414,10 @@ static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info, { lockdep_assert_held(&info->lock); - btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull", + /* The free space could be negative in case of overcommit */ + btrfs_info(fs_info, "space_info %llu has %lld free, is %sfull", info->flags, - info->total_bytes - btrfs_space_info_used(info, true), + (s64)(info->total_bytes - btrfs_space_info_used(info, true)), info->full ? "" : "not "); btrfs_info(fs_info, "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu zone_unusable=%llu", @@ -884,6 +885,7 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info, { struct reserve_ticket *ticket; u64 tickets_id = space_info->tickets_id; + const bool aborted = BTRFS_FS_ERROR(fs_info); trace_btrfs_fail_all_tickets(fs_info, space_info); @@ -897,16 +899,19 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info, ticket = list_first_entry(&space_info->tickets, struct reserve_ticket, list); - if (ticket->steal && + if (!aborted && ticket->steal && steal_from_global_rsv(fs_info, space_info, ticket)) return true; - if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) + if (!aborted && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) btrfs_info(fs_info, "failing ticket with %llu bytes", ticket->bytes); remove_ticket(space_info, ticket); - ticket->error = -ENOSPC; + if (aborted) + ticket->error = -EIO; + else + ticket->error = -ENOSPC; wake_up(&ticket->wait); /* @@ -915,7 +920,8 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info, * here to see if we can make progress with the next ticket in * the list. */ - btrfs_try_granting_tickets(fs_info, space_info); + if (!aborted) + btrfs_try_granting_tickets(fs_info, space_info); } return (tickets_id != space_info->tickets_id); } @@ -1171,6 +1177,10 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work) spin_unlock(&space_info->lock); return; } + + /* Something happened, fail everything and bail. */ + if (BTRFS_FS_ERROR(fs_info)) + goto aborted_fs; last_tickets_id = space_info->tickets_id; spin_unlock(&space_info->lock); } @@ -1201,9 +1211,20 @@ static void btrfs_async_reclaim_data_space(struct work_struct *work) } else { flush_state = 0; } + + /* Something happened, fail everything and bail. */ + if (BTRFS_FS_ERROR(fs_info)) + goto aborted_fs; + } spin_unlock(&space_info->lock); } + return; + +aborted_fs: + maybe_fail_all_tickets(fs_info, space_info); + space_info->flush = 0; + spin_unlock(&space_info->lock); } void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info) diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index cb10e56ee31e..29bd8c7a7706 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -63,11 +63,41 @@ * This means a slightly higher tree locking latency. */ +void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize) +{ + unsigned int cur = 0; + unsigned int nr_bits; + + ASSERT(IS_ALIGNED(PAGE_SIZE, sectorsize)); + + nr_bits = PAGE_SIZE / sectorsize; + subpage_info->bitmap_nr_bits = nr_bits; + + subpage_info->uptodate_offset = cur; + cur += nr_bits; + + subpage_info->error_offset = cur; + cur += nr_bits; + + subpage_info->dirty_offset = cur; + cur += nr_bits; + + subpage_info->writeback_offset = cur; + cur += nr_bits; + + subpage_info->ordered_offset = cur; + cur += nr_bits; + + subpage_info->checked_offset = cur; + cur += nr_bits; + + subpage_info->total_nr_bits = cur; +} + int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, struct page *page, enum btrfs_subpage_type type) { - struct btrfs_subpage *subpage = NULL; - int ret; + struct btrfs_subpage *subpage; /* * We have cases like a dummy extent buffer page, which is not mappped @@ -75,13 +105,15 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, */ if (page->mapping) ASSERT(PageLocked(page)); + /* Either not subpage, or the page already has private attached */ if (fs_info->sectorsize == PAGE_SIZE || PagePrivate(page)) return 0; - ret = btrfs_alloc_subpage(fs_info, &subpage, type); - if (ret < 0) - return ret; + subpage = btrfs_alloc_subpage(fs_info, type); + if (IS_ERR(subpage)) + return PTR_ERR(subpage); + attach_page_private(page, subpage); return 0; } @@ -100,24 +132,28 @@ void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, btrfs_free_subpage(subpage); } -int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, - struct btrfs_subpage **ret, - enum btrfs_subpage_type type) +struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, + enum btrfs_subpage_type type) { - if (fs_info->sectorsize == PAGE_SIZE) - return 0; + struct btrfs_subpage *ret; + unsigned int real_size; + + ASSERT(fs_info->sectorsize < PAGE_SIZE); + + real_size = struct_size(ret, bitmaps, + BITS_TO_LONGS(fs_info->subpage_info->total_nr_bits)); + ret = kzalloc(real_size, GFP_NOFS); + if (!ret) + return ERR_PTR(-ENOMEM); - *ret = kzalloc(sizeof(struct btrfs_subpage), GFP_NOFS); - if (!*ret) - return -ENOMEM; - spin_lock_init(&(*ret)->lock); + spin_lock_init(&ret->lock); if (type == BTRFS_SUBPAGE_METADATA) { - atomic_set(&(*ret)->eb_refs, 0); + atomic_set(&ret->eb_refs, 0); } else { - atomic_set(&(*ret)->readers, 0); - atomic_set(&(*ret)->writers, 0); + atomic_set(&ret->readers, 0); + atomic_set(&ret->writers, 0); } - return 0; + return ret; } void btrfs_free_subpage(struct btrfs_subpage *subpage) @@ -222,8 +258,16 @@ static void btrfs_subpage_clamp_range(struct page *page, u64 *start, u32 *len) u32 orig_len = *len; *start = max_t(u64, page_offset(page), orig_start); - *len = min_t(u64, page_offset(page) + PAGE_SIZE, - orig_start + orig_len) - *start; + /* + * For certain call sites like btrfs_drop_pages(), we may have pages + * beyond the target range. In that case, just set @len to 0, subpage + * helpers can handle @len == 0 without any problem. + */ + if (page_offset(page) >= orig_start + orig_len) + *len = 0; + else + *len = min_t(u64, page_offset(page) + PAGE_SIZE, + orig_start + orig_len) - *start; } void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info, @@ -248,6 +292,16 @@ bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, btrfs_subpage_assert(fs_info, page, start, len); + /* + * We have call sites passing @lock_page into + * extent_clear_unlock_delalloc() for compression path. + * + * This @locked_page is locked by plain lock_page(), thus its + * subpage::writers is 0. Handle them in a special way. + */ + if (atomic_read(&subpage->writers) == 0) + return true; + ASSERT(atomic_read(&subpage->writers) >= nbits); return atomic_sub_and_test(nbits, &subpage->writers); } @@ -289,37 +343,59 @@ void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info, unlock_page(page); } -/* - * Convert the [start, start + len) range into a u16 bitmap - * - * For example: if start == page_offset() + 16K, len = 16K, we get 0x00f0. - */ -static u16 btrfs_subpage_calc_bitmap(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) +static bool bitmap_test_range_all_set(unsigned long *addr, unsigned int start, + unsigned int nbits) { - const int bit_start = offset_in_page(start) >> fs_info->sectorsize_bits; - const int nbits = len >> fs_info->sectorsize_bits; + unsigned int found_zero; - btrfs_subpage_assert(fs_info, page, start, len); + found_zero = find_next_zero_bit(addr, start + nbits, start); + if (found_zero == start + nbits) + return true; + return false; +} - /* - * Here nbits can be 16, thus can go beyond u16 range. We make the - * first left shift to be calculate in unsigned long (at least u32), - * then truncate the result to u16. - */ - return (u16)(((1UL << nbits) - 1) << bit_start); +static bool bitmap_test_range_all_zero(unsigned long *addr, unsigned int start, + unsigned int nbits) +{ + unsigned int found_set; + + found_set = find_next_bit(addr, start + nbits, start); + if (found_set == start + nbits) + return true; + return false; } +#define subpage_calc_start_bit(fs_info, page, name, start, len) \ +({ \ + unsigned int start_bit; \ + \ + btrfs_subpage_assert(fs_info, page, start, len); \ + start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \ + start_bit += fs_info->subpage_info->name##_offset; \ + start_bit; \ +}) + +#define subpage_test_bitmap_all_set(fs_info, subpage, name) \ + bitmap_test_range_all_set(subpage->bitmaps, \ + fs_info->subpage_info->name##_offset, \ + fs_info->subpage_info->bitmap_nr_bits) + +#define subpage_test_bitmap_all_zero(fs_info, subpage, name) \ + bitmap_test_range_all_zero(subpage->bitmaps, \ + fs_info->subpage_info->name##_offset, \ + fs_info->subpage_info->bitmap_nr_bits) + void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + uptodate, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); - subpage->uptodate_bitmap |= tmp; - if (subpage->uptodate_bitmap == U16_MAX) + bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + if (subpage_test_bitmap_all_set(fs_info, subpage, uptodate)) SetPageUptodate(page); spin_unlock_irqrestore(&subpage->lock, flags); } @@ -328,11 +404,12 @@ void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + uptodate, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); - subpage->uptodate_bitmap &= ~tmp; + bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); ClearPageUptodate(page); spin_unlock_irqrestore(&subpage->lock, flags); } @@ -341,11 +418,12 @@ void btrfs_subpage_set_error(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + error, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); - subpage->error_bitmap |= tmp; + bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); SetPageError(page); spin_unlock_irqrestore(&subpage->lock, flags); } @@ -354,12 +432,13 @@ void btrfs_subpage_clear_error(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + error, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); - subpage->error_bitmap &= ~tmp; - if (subpage->error_bitmap == 0) + bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + if (subpage_test_bitmap_all_zero(fs_info, subpage, error)) ClearPageError(page); spin_unlock_irqrestore(&subpage->lock, flags); } @@ -368,11 +447,12 @@ void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + dirty, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); - subpage->dirty_bitmap |= tmp; + bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); spin_unlock_irqrestore(&subpage->lock, flags); set_page_dirty(page); } @@ -391,13 +471,14 @@ bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + dirty, start, len); unsigned long flags; bool last = false; spin_lock_irqsave(&subpage->lock, flags); - subpage->dirty_bitmap &= ~tmp; - if (subpage->dirty_bitmap == 0) + bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + if (subpage_test_bitmap_all_zero(fs_info, subpage, dirty)) last = true; spin_unlock_irqrestore(&subpage->lock, flags); return last; @@ -417,11 +498,12 @@ void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + writeback, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); - subpage->writeback_bitmap |= tmp; + bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); set_page_writeback(page); spin_unlock_irqrestore(&subpage->lock, flags); } @@ -430,12 +512,13 @@ void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + writeback, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); - subpage->writeback_bitmap &= ~tmp; - if (subpage->writeback_bitmap == 0) { + bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + if (subpage_test_bitmap_all_zero(fs_info, subpage, writeback)) { ASSERT(PageWriteback(page)); end_page_writeback(page); } @@ -446,11 +529,12 @@ void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + ordered, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); - subpage->ordered_bitmap |= tmp; + bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); SetPageOrdered(page); spin_unlock_irqrestore(&subpage->lock, flags); } @@ -459,15 +543,46 @@ void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + ordered, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); - subpage->ordered_bitmap &= ~tmp; - if (subpage->ordered_bitmap == 0) + bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + if (subpage_test_bitmap_all_zero(fs_info, subpage, ordered)) ClearPageOrdered(page); spin_unlock_irqrestore(&subpage->lock, flags); } + +void btrfs_subpage_set_checked(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + checked, start, len); + unsigned long flags; + + spin_lock_irqsave(&subpage->lock, flags); + bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + if (subpage_test_bitmap_all_set(fs_info, subpage, checked)) + SetPageChecked(page); + spin_unlock_irqrestore(&subpage->lock, flags); +} + +void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + checked, start, len); + unsigned long flags; + + spin_lock_irqsave(&subpage->lock, flags); + bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); + ClearPageChecked(page); + spin_unlock_irqrestore(&subpage->lock, flags); +} + /* * Unlike set/clear which is dependent on each page status, for test all bits * are tested in the same way. @@ -477,12 +592,14 @@ bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \ struct page *page, u64 start, u32 len) \ { \ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; \ - const u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); \ + unsigned int start_bit = subpage_calc_start_bit(fs_info, page, \ + name, start, len); \ unsigned long flags; \ bool ret; \ \ spin_lock_irqsave(&subpage->lock, flags); \ - ret = ((subpage->name##_bitmap & tmp) == tmp); \ + ret = bitmap_test_range_all_set(subpage->bitmaps, start_bit, \ + len >> fs_info->sectorsize_bits); \ spin_unlock_irqrestore(&subpage->lock, flags); \ return ret; \ } @@ -491,6 +608,7 @@ IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(error); IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty); IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(writeback); IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(ordered); +IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(checked); /* * Note that, in selftests (extent-io-tests), we can have empty fs_info passed @@ -561,6 +679,7 @@ IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback, PageWriteback); IMPLEMENT_BTRFS_PAGE_OPS(ordered, SetPageOrdered, ClearPageOrdered, PageOrdered); +IMPLEMENT_BTRFS_PAGE_OPS(checked, SetPageChecked, ClearPageChecked, PageChecked); /* * Make sure not only the page dirty bit is cleared, but also subpage dirty bit @@ -579,5 +698,48 @@ void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info, return; ASSERT(PagePrivate(page) && page->private); - ASSERT(subpage->dirty_bitmap == 0); + ASSERT(subpage_test_bitmap_all_zero(fs_info, subpage, dirty)); +} + +/* + * Handle different locked pages with different page sizes: + * + * - Page locked by plain lock_page() + * It should not have any subpage::writers count. + * Can be unlocked by unlock_page(). + * This is the most common locked page for __extent_writepage() called + * inside extent_write_cache_pages() or extent_write_full_page(). + * Rarer cases include the @locked_page from extent_write_locked_range(). + * + * - Page locked by lock_delalloc_pages() + * There is only one caller, all pages except @locked_page for + * extent_write_locked_range(). + * In this case, we have to call subpage helper to handle the case. + */ +void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page, + u64 start, u32 len) +{ + struct btrfs_subpage *subpage; + + ASSERT(PageLocked(page)); + /* For regular page size case, we just unlock the page */ + if (fs_info->sectorsize == PAGE_SIZE) + return unlock_page(page); + + ASSERT(PagePrivate(page) && page->private); + subpage = (struct btrfs_subpage *)page->private; + + /* + * For subpage case, there are two types of locked page. With or + * without writers number. + * + * Since we own the page lock, no one else could touch subpage::writers + * and we are safe to do several atomic operations without spinlock. + */ + if (atomic_read(&subpage->writers)) + /* No writers, locked by plain lock_page() */ + return unlock_page(page); + + /* Have writers, use proper subpage helper to end it */ + btrfs_page_end_writer_lock(fs_info, page, start, len); } diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 0120948f37a1..7accb5c40d33 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -6,10 +6,38 @@ #include <linux/spinlock.h> /* - * Maximum page size we support is 64K, minimum sector size is 4K, u16 bitmap - * is sufficient. Regular bitmap_* is not used due to size reasons. + * Extra info for subpapge bitmap. + * + * For subpage we pack all uptodate/error/dirty/writeback/ordered bitmaps into + * one larger bitmap. + * + * This structure records how they are organized in the bitmap: + * + * /- uptodate_offset /- error_offset /- dirty_offset + * | | | + * v v v + * |u|u|u|u|........|u|u|e|e|.......|e|e| ... |o|o| + * |<- bitmap_nr_bits ->| + * |<--------------- total_nr_bits ---------------->| */ -#define BTRFS_SUBPAGE_BITMAP_SIZE 16 +struct btrfs_subpage_info { + /* Number of bits for each bitmap */ + unsigned int bitmap_nr_bits; + + /* Total number of bits for the whole bitmap */ + unsigned int total_nr_bits; + + /* + * *_start indicates where the bitmap starts, the length is always + * @bitmap_size, which is calculated from PAGE_SIZE / sectorsize. + */ + unsigned int uptodate_offset; + unsigned int error_offset; + unsigned int dirty_offset; + unsigned int writeback_offset; + unsigned int ordered_offset; + unsigned int checked_offset; +}; /* * Structure to trace status of each sector inside a page, attached to @@ -18,10 +46,6 @@ struct btrfs_subpage { /* Common members for both data and metadata pages */ spinlock_t lock; - u16 uptodate_bitmap; - u16 error_bitmap; - u16 dirty_bitmap; - u16 writeback_bitmap; /* * Both data and metadata needs to track how many readers are for the * page. @@ -38,14 +62,11 @@ struct btrfs_subpage { * manages whether the subpage can be detached. */ atomic_t eb_refs; - /* Structures only used by data */ - struct { - atomic_t writers; - /* Tracke pending ordered extent in this sector */ - u16 ordered_bitmap; - }; + /* Structures only used by data */ + atomic_t writers; }; + unsigned long bitmaps[]; }; enum btrfs_subpage_type { @@ -53,15 +74,15 @@ enum btrfs_subpage_type { BTRFS_SUBPAGE_DATA, }; +void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize); int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, struct page *page, enum btrfs_subpage_type type); void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct page *page); /* Allocate additional data where page represents more than one sector */ -int btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, - struct btrfs_subpage **ret, - enum btrfs_subpage_type type); +struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, + enum btrfs_subpage_type type); void btrfs_free_subpage(struct btrfs_subpage *subpage); void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info, @@ -122,11 +143,14 @@ DECLARE_BTRFS_SUBPAGE_OPS(error); DECLARE_BTRFS_SUBPAGE_OPS(dirty); DECLARE_BTRFS_SUBPAGE_OPS(writeback); DECLARE_BTRFS_SUBPAGE_OPS(ordered); +DECLARE_BTRFS_SUBPAGE_OPS(checked); bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len); void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info, struct page *page); +void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page, + u64 start, u32 len); #endif diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 537d90bf5d84..a1c54a2c787c 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1705,7 +1705,7 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, goto error_close_devices; } - bdev = fs_devices->latest_bdev; + bdev = fs_devices->latest_dev->bdev; s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | SB_NOSEC, fs_info); if (IS_ERR(s)) { @@ -2006,7 +2006,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (ret) goto restore; } else { - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + if (BTRFS_FS_ERROR(fs_info)) { btrfs_err(fs_info, "Remounting read-write after error is not allowed"); ret = -EINVAL; @@ -2463,30 +2463,16 @@ static int btrfs_unfreeze(struct super_block *sb) static int btrfs_show_devname(struct seq_file *m, struct dentry *root) { struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb); - struct btrfs_device *dev, *first_dev = NULL; /* - * Lightweight locking of the devices. We should not need - * device_list_mutex here as we only read the device data and the list - * is protected by RCU. Even if a device is deleted during the list - * traversals, we'll get valid data, the freeing callback will wait at - * least until the rcu_read_unlock. + * There should be always a valid pointer in latest_dev, it may be stale + * for a short moment in case it's being deleted but still valid until + * the end of RCU grace period. */ rcu_read_lock(); - list_for_each_entry_rcu(dev, &fs_info->fs_devices->devices, dev_list) { - if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) - continue; - if (!dev->name) - continue; - if (!first_dev || dev->devid < first_dev->devid) - first_dev = dev; - } - - if (first_dev) - seq_escape(m, rcu_str_deref(first_dev->name), " \t\n\\"); - else - WARN_ON(1); + seq_escape(m, rcu_str_deref(fs_info->fs_devices->latest_dev->name), " \t\n\\"); rcu_read_unlock(); + return 0; } diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 25a6f587852b..f9eff3b0f77c 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -177,7 +177,7 @@ static ssize_t btrfs_feature_attr_show(struct kobject *kobj, } else val = can_modify_feature(fa); - return scnprintf(buf, PAGE_SIZE, "%d\n", val); + return sysfs_emit(buf, "%d\n", val); } static ssize_t btrfs_feature_attr_store(struct kobject *kobj, @@ -330,7 +330,7 @@ static const struct attribute_group btrfs_feature_attr_group = { static ssize_t rmdir_subvol_show(struct kobject *kobj, struct kobj_attribute *ka, char *buf) { - return scnprintf(buf, PAGE_SIZE, "0\n"); + return sysfs_emit(buf, "0\n"); } BTRFS_ATTR(static_feature, rmdir_subvol, rmdir_subvol_show); @@ -345,12 +345,12 @@ static ssize_t supported_checksums_show(struct kobject *kobj, * This "trick" only works as long as 'enum btrfs_csum_type' has * no holes in it */ - ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s", - (i == 0 ? "" : " "), btrfs_super_csum_name(i)); + ret += sysfs_emit_at(buf, ret, "%s%s", (i == 0 ? "" : " "), + btrfs_super_csum_name(i)); } - ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n"); + ret += sysfs_emit_at(buf, ret, "\n"); return ret; } BTRFS_ATTR(static_feature, supported_checksums, supported_checksums_show); @@ -358,7 +358,7 @@ BTRFS_ATTR(static_feature, supported_checksums, supported_checksums_show); static ssize_t send_stream_version_show(struct kobject *kobj, struct kobj_attribute *ka, char *buf) { - return snprintf(buf, PAGE_SIZE, "%d\n", BTRFS_SEND_STREAM_VERSION); + return sysfs_emit(buf, "%d\n", BTRFS_SEND_STREAM_VERSION); } BTRFS_ATTR(static_feature, send_stream_version, send_stream_version_show); @@ -378,9 +378,8 @@ static ssize_t supported_rescue_options_show(struct kobject *kobj, int i; for (i = 0; i < ARRAY_SIZE(rescue_opts); i++) - ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s", - (i ? " " : ""), rescue_opts[i]); - ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n"); + ret += sysfs_emit_at(buf, ret, "%s%s", (i ? " " : ""), rescue_opts[i]); + ret += sysfs_emit_at(buf, ret, "\n"); return ret; } BTRFS_ATTR(static_feature, supported_rescue_options, @@ -394,10 +393,10 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj, /* 4K sector size is also supported with 64K page size */ if (PAGE_SIZE == SZ_64K) - ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%u ", SZ_4K); + ret += sysfs_emit_at(buf, ret, "%u ", SZ_4K); /* Only sectorsize == PAGE_SIZE is now supported */ - ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%lu\n", PAGE_SIZE); + ret += sysfs_emit_at(buf, ret, "%lu\n", PAGE_SIZE); return ret; } @@ -437,7 +436,7 @@ static ssize_t btrfs_discardable_bytes_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return scnprintf(buf, PAGE_SIZE, "%lld\n", + return sysfs_emit(buf, "%lld\n", atomic64_read(&fs_info->discard_ctl.discardable_bytes)); } BTRFS_ATTR(discard, discardable_bytes, btrfs_discardable_bytes_show); @@ -448,7 +447,7 @@ static ssize_t btrfs_discardable_extents_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return scnprintf(buf, PAGE_SIZE, "%d\n", + return sysfs_emit(buf, "%d\n", atomic_read(&fs_info->discard_ctl.discardable_extents)); } BTRFS_ATTR(discard, discardable_extents, btrfs_discardable_extents_show); @@ -459,8 +458,8 @@ static ssize_t btrfs_discard_bitmap_bytes_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return scnprintf(buf, PAGE_SIZE, "%llu\n", - fs_info->discard_ctl.discard_bitmap_bytes); + return sysfs_emit(buf, "%llu\n", + fs_info->discard_ctl.discard_bitmap_bytes); } BTRFS_ATTR(discard, discard_bitmap_bytes, btrfs_discard_bitmap_bytes_show); @@ -470,7 +469,7 @@ static ssize_t btrfs_discard_bytes_saved_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return scnprintf(buf, PAGE_SIZE, "%lld\n", + return sysfs_emit(buf, "%lld\n", atomic64_read(&fs_info->discard_ctl.discard_bytes_saved)); } BTRFS_ATTR(discard, discard_bytes_saved, btrfs_discard_bytes_saved_show); @@ -481,8 +480,8 @@ static ssize_t btrfs_discard_extent_bytes_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return scnprintf(buf, PAGE_SIZE, "%llu\n", - fs_info->discard_ctl.discard_extent_bytes); + return sysfs_emit(buf, "%llu\n", + fs_info->discard_ctl.discard_extent_bytes); } BTRFS_ATTR(discard, discard_extent_bytes, btrfs_discard_extent_bytes_show); @@ -492,8 +491,8 @@ static ssize_t btrfs_discard_iops_limit_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return scnprintf(buf, PAGE_SIZE, "%u\n", - READ_ONCE(fs_info->discard_ctl.iops_limit)); + return sysfs_emit(buf, "%u\n", + READ_ONCE(fs_info->discard_ctl.iops_limit)); } static ssize_t btrfs_discard_iops_limit_store(struct kobject *kobj, @@ -523,8 +522,8 @@ static ssize_t btrfs_discard_kbps_limit_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return scnprintf(buf, PAGE_SIZE, "%u\n", - READ_ONCE(fs_info->discard_ctl.kbps_limit)); + return sysfs_emit(buf, "%u\n", + READ_ONCE(fs_info->discard_ctl.kbps_limit)); } static ssize_t btrfs_discard_kbps_limit_store(struct kobject *kobj, @@ -553,8 +552,8 @@ static ssize_t btrfs_discard_max_discard_size_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); - return scnprintf(buf, PAGE_SIZE, "%llu\n", - READ_ONCE(fs_info->discard_ctl.max_discard_size)); + return sysfs_emit(buf, "%llu\n", + READ_ONCE(fs_info->discard_ctl.max_discard_size)); } static ssize_t btrfs_discard_max_discard_size_store(struct kobject *kobj, @@ -627,7 +626,7 @@ static ssize_t btrfs_show_u64(u64 *value_ptr, spinlock_t *lock, char *buf) val = *value_ptr; if (lock) spin_unlock(lock); - return scnprintf(buf, PAGE_SIZE, "%llu\n", val); + return sysfs_emit(buf, "%llu\n", val); } static ssize_t global_rsv_size_show(struct kobject *kobj, @@ -673,7 +672,7 @@ static ssize_t raid_bytes_show(struct kobject *kobj, val += block_group->used; } up_read(&sinfo->groups_sem); - return scnprintf(buf, PAGE_SIZE, "%llu\n", val); + return sysfs_emit(buf, "%llu\n", val); } /* @@ -771,7 +770,7 @@ static ssize_t btrfs_label_show(struct kobject *kobj, ssize_t ret; spin_lock(&fs_info->super_lock); - ret = scnprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label); + ret = sysfs_emit(buf, label[0] ? "%s\n" : "%s", label); spin_unlock(&fs_info->super_lock); return ret; @@ -819,7 +818,7 @@ static ssize_t btrfs_nodesize_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return scnprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize); + return sysfs_emit(buf, "%u\n", fs_info->super_copy->nodesize); } BTRFS_ATTR(, nodesize, btrfs_nodesize_show); @@ -829,8 +828,7 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return scnprintf(buf, PAGE_SIZE, "%u\n", - fs_info->super_copy->sectorsize); + return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize); } BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show); @@ -840,7 +838,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return scnprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize); + return sysfs_emit(buf, "%u\n", fs_info->super_copy->sectorsize); } BTRFS_ATTR(, clone_alignment, btrfs_clone_alignment_show); @@ -852,7 +850,7 @@ static ssize_t quota_override_show(struct kobject *kobj, int quota_override; quota_override = test_bit(BTRFS_FS_QUOTA_OVERRIDE, &fs_info->flags); - return scnprintf(buf, PAGE_SIZE, "%d\n", quota_override); + return sysfs_emit(buf, "%d\n", quota_override); } static ssize_t quota_override_store(struct kobject *kobj, @@ -890,8 +888,7 @@ static ssize_t btrfs_metadata_uuid_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return scnprintf(buf, PAGE_SIZE, "%pU\n", - fs_info->fs_devices->metadata_uuid); + return sysfs_emit(buf, "%pU\n", fs_info->fs_devices->metadata_uuid); } BTRFS_ATTR(, metadata_uuid, btrfs_metadata_uuid_show); @@ -902,9 +899,9 @@ static ssize_t btrfs_checksum_show(struct kobject *kobj, struct btrfs_fs_info *fs_info = to_fs_info(kobj); u16 csum_type = btrfs_super_csum_type(fs_info->super_copy); - return scnprintf(buf, PAGE_SIZE, "%s (%s)\n", - btrfs_super_csum_name(csum_type), - crypto_shash_driver_name(fs_info->csum_shash)); + return sysfs_emit(buf, "%s (%s)\n", + btrfs_super_csum_name(csum_type), + crypto_shash_driver_name(fs_info->csum_shash)); } BTRFS_ATTR(, checksum, btrfs_checksum_show); @@ -941,7 +938,7 @@ static ssize_t btrfs_exclusive_operation_show(struct kobject *kobj, str = "UNKNOWN\n"; break; } - return scnprintf(buf, PAGE_SIZE, "%s", str); + return sysfs_emit(buf, "%s", str); } BTRFS_ATTR(, exclusive_operation, btrfs_exclusive_operation_show); @@ -950,7 +947,7 @@ static ssize_t btrfs_generation_show(struct kobject *kobj, { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return scnprintf(buf, PAGE_SIZE, "%llu\n", fs_info->generation); + return sysfs_emit(buf, "%llu\n", fs_info->generation); } BTRFS_ATTR(, generation, btrfs_generation_show); @@ -1028,8 +1025,7 @@ static ssize_t btrfs_bg_reclaim_threshold_show(struct kobject *kobj, struct btrfs_fs_info *fs_info = to_fs_info(kobj); ssize_t ret; - ret = scnprintf(buf, PAGE_SIZE, "%d\n", - READ_ONCE(fs_info->bg_reclaim_threshold)); + ret = sysfs_emit(buf, "%d\n", READ_ONCE(fs_info->bg_reclaim_threshold)); return ret; } @@ -1471,7 +1467,7 @@ static ssize_t btrfs_devinfo_in_fs_metadata_show(struct kobject *kobj, val = !!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); - return scnprintf(buf, PAGE_SIZE, "%d\n", val); + return sysfs_emit(buf, "%d\n", val); } BTRFS_ATTR(devid, in_fs_metadata, btrfs_devinfo_in_fs_metadata_show); @@ -1484,7 +1480,7 @@ static ssize_t btrfs_devinfo_missing_show(struct kobject *kobj, val = !!test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); - return scnprintf(buf, PAGE_SIZE, "%d\n", val); + return sysfs_emit(buf, "%d\n", val); } BTRFS_ATTR(devid, missing, btrfs_devinfo_missing_show); @@ -1498,7 +1494,7 @@ static ssize_t btrfs_devinfo_replace_target_show(struct kobject *kobj, val = !!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); - return scnprintf(buf, PAGE_SIZE, "%d\n", val); + return sysfs_emit(buf, "%d\n", val); } BTRFS_ATTR(devid, replace_target, btrfs_devinfo_replace_target_show); @@ -1509,8 +1505,7 @@ static ssize_t btrfs_devinfo_scrub_speed_max_show(struct kobject *kobj, struct btrfs_device *device = container_of(kobj, struct btrfs_device, devid_kobj); - return scnprintf(buf, PAGE_SIZE, "%llu\n", - READ_ONCE(device->scrub_speed_max)); + return sysfs_emit(buf, "%llu\n", READ_ONCE(device->scrub_speed_max)); } static ssize_t btrfs_devinfo_scrub_speed_max_store(struct kobject *kobj, @@ -1538,7 +1533,7 @@ static ssize_t btrfs_devinfo_writeable_show(struct kobject *kobj, val = !!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); - return scnprintf(buf, PAGE_SIZE, "%d\n", val); + return sysfs_emit(buf, "%d\n", val); } BTRFS_ATTR(devid, writeable, btrfs_devinfo_writeable_show); @@ -1549,14 +1544,14 @@ static ssize_t btrfs_devinfo_error_stats_show(struct kobject *kobj, devid_kobj); if (!device->dev_stats_valid) - return scnprintf(buf, PAGE_SIZE, "invalid\n"); + return sysfs_emit(buf, "invalid\n"); /* * Print all at once so we get a snapshot of all values from the same * time. Keep them in sync and in order of definition of * btrfs_dev_stat_values. */ - return scnprintf(buf, PAGE_SIZE, + return sysfs_emit(buf, "write_errs %d\n" "read_errs %d\n" "flush_errs %d\n" diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c index df54cdfdc250..2a95f7224e18 100644 --- a/fs/btrfs/tests/extent-buffer-tests.c +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -60,7 +60,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) key.type = BTRFS_EXTENT_CSUM_KEY; key.offset = 0; - setup_items_for_insert(root, path, &key, &value_len, 1); + btrfs_setup_item_for_insert(root, path, &key, value_len); item = btrfs_item_nr(0); write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0), value_len); diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 73e96d505f4f..c2e72e7a8ff0 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -112,7 +112,7 @@ static int test_find_delalloc(u32 sectorsize) */ set_extent_delalloc(tmp, 0, sectorsize - 1, 0, NULL); start = 0; - end = 0; + end = start + PAGE_SIZE - 1; found = find_lock_delalloc_range(inode, locked_page, &start, &end); if (!found) { @@ -143,7 +143,7 @@ static int test_find_delalloc(u32 sectorsize) } set_extent_delalloc(tmp, sectorsize, max_bytes - 1, 0, NULL); start = test_start; - end = 0; + end = start + PAGE_SIZE - 1; found = find_lock_delalloc_range(inode, locked_page, &start, &end); if (!found) { @@ -177,14 +177,14 @@ static int test_find_delalloc(u32 sectorsize) goto out_bits; } start = test_start; - end = 0; + end = start + PAGE_SIZE - 1; found = find_lock_delalloc_range(inode, locked_page, &start, &end); if (found) { test_err("found range when we shouldn't have"); goto out_bits; } - if (end != (u64)-1) { + if (end != test_start + PAGE_SIZE - 1) { test_err("did not return the proper end offset"); goto out_bits; } @@ -198,7 +198,7 @@ static int test_find_delalloc(u32 sectorsize) */ set_extent_delalloc(tmp, max_bytes, total_dirty - 1, 0, NULL); start = test_start; - end = 0; + end = start + PAGE_SIZE - 1; found = find_lock_delalloc_range(inode, locked_page, &start, &end); if (!found) { @@ -233,7 +233,7 @@ static int test_find_delalloc(u32 sectorsize) /* We unlocked it in the previous test */ lock_page(locked_page); start = test_start; - end = 0; + end = start + PAGE_SIZE - 1; /* * Currently if we fail to find dirty pages in the delalloc range we * will adjust max_bytes down to PAGE_SIZE and then re-search. If diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index c9874b12d337..cac89c388131 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -33,7 +33,7 @@ static void insert_extent(struct btrfs_root *root, u64 start, u64 len, key.type = BTRFS_EXTENT_DATA_KEY; key.offset = start; - setup_items_for_insert(root, &path, &key, &value_len, 1); + btrfs_setup_item_for_insert(root, &path, &key, value_len); fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); btrfs_set_file_extent_generation(leaf, fi, 1); btrfs_set_file_extent_type(leaf, fi, type); @@ -63,7 +63,7 @@ static void insert_inode_item_key(struct btrfs_root *root) key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - setup_items_for_insert(root, &path, &key, &value_len, 1); + btrfs_setup_item_for_insert(root, &path, &key, value_len); } /* diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 14b9fdc8aaa9..1c3a1189c0bd 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -283,7 +283,7 @@ static noinline int join_transaction(struct btrfs_fs_info *fs_info, spin_lock(&fs_info->trans_lock); loop: /* The file system has been taken offline. No new transactions. */ - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + if (BTRFS_FS_ERROR(fs_info)) { spin_unlock(&fs_info->trans_lock); return -EROFS; } @@ -331,7 +331,7 @@ loop: */ kfree(cur_trans); goto loop; - } else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + } else if (BTRFS_FS_ERROR(fs_info)) { spin_unlock(&fs_info->trans_lock); kfree(cur_trans); return -EROFS; @@ -579,7 +579,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, bool do_chunk_alloc = false; int ret; - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) + if (BTRFS_FS_ERROR(fs_info)) return ERR_PTR(-EROFS); if (current->journal_info) { @@ -991,8 +991,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (throttle) btrfs_run_delayed_iputs(info); - if (TRANS_ABORTED(trans) || - test_bit(BTRFS_FS_STATE_ERROR, &info->fs_state)) { + if (TRANS_ABORTED(trans) || BTRFS_FS_ERROR(info)) { wake_up_process(info->transaction_kthread); if (TRANS_ABORTED(trans)) err = trans->aborted; @@ -2155,7 +2154,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * abort to prevent writing a new superblock that reflects a * corrupt state (pointing to trees with unwritten nodes/leafs). */ - if (test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state)) { + if (BTRFS_FS_ERROR(fs_info)) { ret = -EROFS; goto cleanup_transaction; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index f7efc26aa82a..8ab33caf016f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -94,7 +94,7 @@ enum { }; static int btrfs_log_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode, + struct btrfs_inode *inode, int inode_only, struct btrfs_log_ctx *ctx); static int link_to_fixup_dir(struct btrfs_trans_handle *trans, @@ -207,7 +207,7 @@ again: } atomic_inc(&root->log_writers); - if (ctx && !ctx->logging_new_name) { + if (!ctx->logging_new_name) { int index = root->log_transid % 2; list_add_tail(&ctx->list, &root->log_ctxs[index]); ctx->log_transid = root->log_transid; @@ -368,25 +368,11 @@ static int process_one_buffer(struct btrfs_root *log, return ret; } -/* - * Item overwrite used by replay and tree logging. eb, slot and key all refer - * to the src data we are copying out. - * - * root is the tree we are copying into, and path is a scratch - * path for use in this function (it should be released on entry and - * will be released on exit). - * - * If the key is already in the destination tree the existing item is - * overwritten. If the existing item isn't big enough, it is extended. - * If it is too large, it is truncated. - * - * If the key isn't in the destination yet, a new item is inserted. - */ -static noinline int overwrite_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *eb, int slot, - struct btrfs_key *key) +static int do_overwrite_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) { int ret; u32 item_size; @@ -403,10 +389,22 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans, item_size = btrfs_item_size_nr(eb, slot); src_ptr = btrfs_item_ptr_offset(eb, slot); - /* look for the key in the destination tree */ - ret = btrfs_search_slot(NULL, root, key, path, 0, 0); - if (ret < 0) - return ret; + /* Our caller must have done a search for the key for us. */ + ASSERT(path->nodes[0] != NULL); + + /* + * And the slot must point to the exact key or the slot where the key + * should be at (the first item with a key greater than 'key') + */ + if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { + struct btrfs_key found_key; + + btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); + ret = btrfs_comp_cpu_keys(&found_key, key); + ASSERT(ret >= 0); + } else { + ret = 1; + } if (ret == 0) { char *src_copy; @@ -585,6 +583,36 @@ no_copy: } /* + * Item overwrite used by replay and tree logging. eb, slot and key all refer + * to the src data we are copying out. + * + * root is the tree we are copying into, and path is a scratch + * path for use in this function (it should be released on entry and + * will be released on exit). + * + * If the key is already in the destination tree the existing item is + * overwritten. If the existing item isn't big enough, it is extended. + * If it is too large, it is truncated. + * + * If the key isn't in the destination yet, a new item is inserted. + */ +static int overwrite_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) +{ + int ret; + + /* Look for the key in the destination tree. */ + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret < 0) + return ret; + + return do_overwrite_item(trans, root, path, eb, slot, key); +} + +/* * simple helper to read an inode off the disk from a given root * This can only be called for subvolume roots and not for the log */ @@ -761,7 +789,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, ins.objectid, ins.offset, 0); btrfs_init_data_ref(&ref, root->root_key.objectid, - key->objectid, offset); + key->objectid, offset, 0, false); ret = btrfs_inc_extent_ref(trans, &ref); if (ret) goto out; @@ -893,11 +921,11 @@ out: * item */ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_path *path, struct btrfs_inode *dir, struct btrfs_dir_item *di) { + struct btrfs_root *root = dir->root; struct inode *inode; char *name; int name_len; @@ -926,7 +954,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, if (ret) goto out; - ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name, + ret = btrfs_unlink_inode(trans, dir, BTRFS_I(inode), name, name_len); if (ret) goto out; @@ -939,9 +967,11 @@ out: } /* - * helper function to see if a given name and sequence number found - * in an inode back reference are already in a directory and correctly - * point to this inode + * See if a given name and sequence number found in an inode back reference are + * already in a directory and correctly point to this inode. + * + * Returns: < 0 on error, 0 if the directory entry does not exists and 1 if it + * exists. */ static noinline int inode_in_dir(struct btrfs_root *root, struct btrfs_path *path, @@ -950,29 +980,34 @@ static noinline int inode_in_dir(struct btrfs_root *root, { struct btrfs_dir_item *di; struct btrfs_key location; - int match = 0; + int ret = 0; di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, index, name, name_len, 0); - if (di && !IS_ERR(di)) { + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } else if (di) { btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); if (location.objectid != objectid) goto out; - } else + } else { goto out; - btrfs_release_path(path); + } + btrfs_release_path(path); di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0); - if (di && !IS_ERR(di)) { - btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); - if (location.objectid != objectid) - goto out; - } else + if (IS_ERR(di)) { + ret = PTR_ERR(di); goto out; - match = 1; + } else if (di) { + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); + if (location.objectid == objectid) + ret = 1; + } out: btrfs_release_path(path); - return match; + return ret; } /* @@ -1084,7 +1119,7 @@ again: inc_nlink(&inode->vfs_inode); btrfs_release_path(path); - ret = btrfs_unlink_inode(trans, root, dir, inode, + ret = btrfs_unlink_inode(trans, dir, inode, victim_name, victim_name_len); kfree(victim_name); if (ret) @@ -1155,7 +1190,7 @@ again: inc_nlink(&inode->vfs_inode); btrfs_release_path(path); - ret = btrfs_unlink_inode(trans, root, + ret = btrfs_unlink_inode(trans, BTRFS_I(victim_parent), inode, victim_name, @@ -1182,8 +1217,10 @@ next: /* look for a conflicting sequence number */ di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), ref_index, name, namelen, 0); - if (di && !IS_ERR(di)) { - ret = drop_one_dir_item(trans, root, path, dir, di); + if (IS_ERR(di)) { + return PTR_ERR(di); + } else if (di) { + ret = drop_one_dir_item(trans, path, dir, di); if (ret) return ret; } @@ -1192,8 +1229,10 @@ next: /* look for a conflicting name */ di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), name, namelen, 0); - if (di && !IS_ERR(di)) { - ret = drop_one_dir_item(trans, root, path, dir, di); + if (IS_ERR(di)) { + return PTR_ERR(di); + } else if (di) { + ret = drop_one_dir_item(trans, path, dir, di); if (ret) return ret; } @@ -1313,7 +1352,7 @@ again: kfree(name); goto out; } - ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), + ret = btrfs_unlink_inode(trans, BTRFS_I(dir), inode, name, namelen); kfree(name); iput(dir); @@ -1374,10 +1413,11 @@ out: return ret; } -static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root, +static int add_link(struct btrfs_trans_handle *trans, struct inode *dir, struct inode *inode, const char *name, int namelen, u64 ref_index) { + struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_dir_item *dir_item; struct btrfs_key key; struct btrfs_path *path; @@ -1411,7 +1451,7 @@ static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root, ret = -ENOENT; goto out; } - ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), BTRFS_I(other_inode), + ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(other_inode), name, namelen); if (ret) goto out; @@ -1517,10 +1557,12 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, if (ret) goto out; - /* if we already have a perfect match, we're done */ - if (!inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)), - btrfs_ino(BTRFS_I(inode)), ref_index, - name, namelen)) { + ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)), + btrfs_ino(BTRFS_I(inode)), ref_index, + name, namelen); + if (ret < 0) { + goto out; + } else if (ret == 0) { /* * look for a conflicting back reference in the * metadata. if we find one we have to unlink that name @@ -1555,7 +1597,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, ret = btrfs_inode_ref_exists(inode, dir, key->type, name, namelen); if (ret > 0) { - ret = btrfs_unlink_inode(trans, root, + ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(inode), name, namelen); @@ -1571,7 +1613,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, goto out; /* insert our name */ - ret = add_link(trans, root, dir, inode, name, namelen, + ret = add_link(trans, dir, inode, name, namelen, ref_index); if (ret) goto out; @@ -1580,6 +1622,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, if (ret) goto out; } + /* Else, ret == 1, we already have a perfect match, we're done. */ ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen; kfree(name); @@ -1936,8 +1979,8 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, struct btrfs_key log_key; struct inode *dir; u8 log_type; - int exists; - int ret = 0; + bool exists; + int ret; bool update_size = (key->type == BTRFS_DIR_INDEX_KEY); bool name_added = false; @@ -1957,12 +2000,12 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, name_len); btrfs_dir_item_key_to_cpu(eb, di, &log_key); - exists = btrfs_lookup_inode(trans, root, path, &log_key, 0); - if (exists == 0) - exists = 1; - else - exists = 0; + ret = btrfs_lookup_inode(trans, root, path, &log_key, 0); btrfs_release_path(path); + if (ret < 0) + goto out; + exists = (ret == 0); + ret = 0; if (key->type == BTRFS_DIR_ITEM_KEY) { dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, @@ -1977,7 +2020,11 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, ret = -EINVAL; goto out; } - if (IS_ERR_OR_NULL(dst_di)) { + + if (IS_ERR(dst_di)) { + ret = PTR_ERR(dst_di); + goto out; + } else if (!dst_di) { /* we need a sequence number to insert, so we only * do inserts for the BTRFS_DIR_INDEX_KEY types */ @@ -2003,7 +2050,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, if (!exists) goto out; - ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di); + ret = drop_one_dir_item(trans, path, BTRFS_I(dir), dst_di); if (ret) goto out; @@ -2233,13 +2280,13 @@ out: * to is unlinked */ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_root *log, struct btrfs_path *path, struct btrfs_path *log_path, struct inode *dir, struct btrfs_key *dir_key) { + struct btrfs_root *root = BTRFS_I(dir)->root; int ret; struct extent_buffer *eb; int slot; @@ -2281,7 +2328,7 @@ again: dir_key->offset, name, name_len, 0); } - if (!log_di || log_di == ERR_PTR(-ENOENT)) { + if (!log_di) { btrfs_dir_item_key_to_cpu(eb, di, &location); btrfs_release_path(path); btrfs_release_path(log_path); @@ -2300,7 +2347,7 @@ again: } inc_nlink(inode); - ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), + ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(inode), name, name_len); if (!ret) ret = btrfs_run_delayed_items(trans); @@ -2482,7 +2529,9 @@ again: else { ret = find_dir_range(log, path, dirid, key_type, &range_start, &range_end); - if (ret != 0) + if (ret < 0) + goto out; + else if (ret > 0) break; } @@ -2511,7 +2560,7 @@ again: if (found_key.offset > range_end) break; - ret = check_item_in_log(trans, root, log, path, + ret = check_item_in_log(trans, log, path, log_path, dir, &found_key); if (ret) @@ -3019,9 +3068,6 @@ static void wait_for_writer(struct btrfs_root *root) static inline void btrfs_remove_log_ctx(struct btrfs_root *root, struct btrfs_log_ctx *ctx) { - if (!ctx) - return; - mutex_lock(&root->log_mutex); list_del_init(&ctx->list); mutex_unlock(&root->log_mutex); @@ -3310,7 +3356,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * writing the super here would result in transid mismatches. If there * is an error here just bail. */ - if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { + if (BTRFS_FS_ERROR(fs_info)) { ret = -EIO; btrfs_set_log_full_commit(trans); btrfs_abort_transaction(trans, ret); @@ -3434,6 +3480,9 @@ static bool inode_logged(struct btrfs_trans_handle *trans, if (inode->logged_trans == trans->transid) return true; + if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &inode->root->state)) + return false; + /* * The inode's logged_trans is always 0 when we load it (because it is * not persisted in the inode item or elsewhere). So if it is 0, the @@ -3472,10 +3521,10 @@ static bool inode_logged(struct btrfs_trans_handle *trans, * This optimizations allows us to avoid relogging the entire inode * or the entire directory. */ -int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, - struct btrfs_inode *dir, u64 index) +void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct btrfs_inode *dir, u64 index) { struct btrfs_root *log; struct btrfs_dir_item *di; @@ -3485,11 +3534,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, u64 dir_ino = btrfs_ino(dir); if (!inode_logged(trans, dir)) - return 0; + return; ret = join_running_log_trans(root); if (ret) - return 0; + return; mutex_lock(&dir->log_mutex); @@ -3537,49 +3586,36 @@ fail: btrfs_free_path(path); out_unlock: mutex_unlock(&dir->log_mutex); - if (err == -ENOSPC) { + if (err < 0) btrfs_set_log_full_commit(trans); - err = 0; - } else if (err < 0 && err != -ENOENT) { - /* ENOENT can be returned if the entry hasn't been fsynced yet */ - btrfs_abort_transaction(trans, err); - } - btrfs_end_log_trans(root); - - return err; } /* see comments for btrfs_del_dir_entries_in_log */ -int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, - struct btrfs_inode *inode, u64 dirid) +void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct btrfs_inode *inode, u64 dirid) { struct btrfs_root *log; u64 index; int ret; if (!inode_logged(trans, inode)) - return 0; + return; ret = join_running_log_trans(root); if (ret) - return 0; + return; log = root->log_root; mutex_lock(&inode->log_mutex); ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), dirid, &index); mutex_unlock(&inode->log_mutex); - if (ret == -ENOSPC) { + if (ret < 0 && ret != -ENOENT) btrfs_set_log_full_commit(trans); - ret = 0; - } else if (ret < 0 && ret != -ENOENT) - btrfs_abort_transaction(trans, ret); btrfs_end_log_trans(root); - - return ret; } /* @@ -3615,31 +3651,231 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, return 0; } +static int flush_dir_items_batch(struct btrfs_trans_handle *trans, + struct btrfs_root *log, + struct extent_buffer *src, + struct btrfs_path *dst_path, + int start_slot, + int count) +{ + char *ins_data = NULL; + struct btrfs_item_batch batch; + struct extent_buffer *dst; + unsigned long src_offset; + unsigned long dst_offset; + struct btrfs_key key; + u32 item_size; + int ret; + int i; + + ASSERT(count > 0); + batch.nr = count; + + if (count == 1) { + btrfs_item_key_to_cpu(src, &key, start_slot); + item_size = btrfs_item_size_nr(src, start_slot); + batch.keys = &key; + batch.data_sizes = &item_size; + batch.total_data_size = item_size; + } else { + struct btrfs_key *ins_keys; + u32 *ins_sizes; + + ins_data = kmalloc(count * sizeof(u32) + + count * sizeof(struct btrfs_key), GFP_NOFS); + if (!ins_data) + return -ENOMEM; + + ins_sizes = (u32 *)ins_data; + ins_keys = (struct btrfs_key *)(ins_data + count * sizeof(u32)); + batch.keys = ins_keys; + batch.data_sizes = ins_sizes; + batch.total_data_size = 0; + + for (i = 0; i < count; i++) { + const int slot = start_slot + i; + + btrfs_item_key_to_cpu(src, &ins_keys[i], slot); + ins_sizes[i] = btrfs_item_size_nr(src, slot); + batch.total_data_size += ins_sizes[i]; + } + } + + ret = btrfs_insert_empty_items(trans, log, dst_path, &batch); + if (ret) + goto out; + + dst = dst_path->nodes[0]; + /* + * Copy all the items in bulk, in a single copy operation. Item data is + * organized such that it's placed at the end of a leaf and from right + * to left. For example, the data for the second item ends at an offset + * that matches the offset where the data for the first item starts, the + * data for the third item ends at an offset that matches the offset + * where the data of the second items starts, and so on. + * Therefore our source and destination start offsets for copy match the + * offsets of the last items (highest slots). + */ + dst_offset = btrfs_item_ptr_offset(dst, dst_path->slots[0] + count - 1); + src_offset = btrfs_item_ptr_offset(src, start_slot + count - 1); + copy_extent_buffer(dst, src, dst_offset, src_offset, batch.total_data_size); + btrfs_release_path(dst_path); +out: + kfree(ins_data); + + return ret; +} + +static int process_dir_items_leaf(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, + struct btrfs_path *path, + struct btrfs_path *dst_path, + int key_type, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_root *log = inode->root->log_root; + struct extent_buffer *src = path->nodes[0]; + const int nritems = btrfs_header_nritems(src); + const u64 ino = btrfs_ino(inode); + const bool inode_logged_before = inode_logged(trans, inode); + u64 last_logged_key_offset; + bool last_found = false; + int batch_start = 0; + int batch_size = 0; + int i; + + if (key_type == BTRFS_DIR_ITEM_KEY) + last_logged_key_offset = inode->last_dir_item_offset; + else + last_logged_key_offset = inode->last_dir_index_offset; + + for (i = path->slots[0]; i < nritems; i++) { + struct btrfs_key key; + int ret; + + btrfs_item_key_to_cpu(src, &key, i); + + if (key.objectid != ino || key.type != key_type) { + last_found = true; + break; + } + + ctx->last_dir_item_offset = key.offset; + /* + * We must make sure that when we log a directory entry, the + * corresponding inode, after log replay, has a matching link + * count. For example: + * + * touch foo + * mkdir mydir + * sync + * ln foo mydir/bar + * xfs_io -c "fsync" mydir + * <crash> + * <mount fs and log replay> + * + * Would result in a fsync log that when replayed, our file inode + * would have a link count of 1, but we get two directory entries + * pointing to the same inode. After removing one of the names, + * it would not be possible to remove the other name, which + * resulted always in stale file handle errors, and would not be + * possible to rmdir the parent directory, since its i_size could + * never be decremented to the value BTRFS_EMPTY_DIR_SIZE, + * resulting in -ENOTEMPTY errors. + */ + if (!ctx->log_new_dentries) { + struct btrfs_dir_item *di; + struct btrfs_key di_key; + + di = btrfs_item_ptr(src, i, struct btrfs_dir_item); + btrfs_dir_item_key_to_cpu(src, di, &di_key); + if ((btrfs_dir_transid(src, di) == trans->transid || + btrfs_dir_type(src, di) == BTRFS_FT_DIR) && + di_key.type != BTRFS_ROOT_ITEM_KEY) + ctx->log_new_dentries = true; + } + + if (!inode_logged_before) + goto add_to_batch; + + /* + * If we were logged before and have logged dir items, we can skip + * checking if any item with a key offset larger than the last one + * we logged is in the log tree, saving time and avoiding adding + * contention on the log tree. + */ + if (key.offset > last_logged_key_offset) + goto add_to_batch; + /* + * Check if the key was already logged before. If not we can add + * it to a batch for bulk insertion. + */ + ret = btrfs_search_slot(NULL, log, &key, dst_path, 0, 0); + if (ret < 0) { + return ret; + } else if (ret > 0) { + btrfs_release_path(dst_path); + goto add_to_batch; + } + + /* + * Item exists in the log. Overwrite the item in the log if it + * has different content or do nothing if it has exactly the same + * content. And then flush the current batch if any - do it after + * overwriting the current item, or we would deadlock otherwise, + * since we are holding a path for the existing item. + */ + ret = do_overwrite_item(trans, log, dst_path, src, i, &key); + if (ret < 0) + return ret; + + if (batch_size > 0) { + ret = flush_dir_items_batch(trans, log, src, dst_path, + batch_start, batch_size); + if (ret < 0) + return ret; + batch_size = 0; + } + continue; +add_to_batch: + if (batch_size == 0) + batch_start = i; + batch_size++; + } + + if (batch_size > 0) { + int ret; + + ret = flush_dir_items_batch(trans, log, src, dst_path, + batch_start, batch_size); + if (ret < 0) + return ret; + } + + return last_found ? 1 : 0; +} + /* * log all the items included in the current transaction for a given * directory. This also creates the range items in the log tree required * to replay anything deleted before the fsync */ static noinline int log_dir_items(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode, + struct btrfs_inode *inode, struct btrfs_path *path, struct btrfs_path *dst_path, int key_type, struct btrfs_log_ctx *ctx, u64 min_offset, u64 *last_offset_ret) { struct btrfs_key min_key; + struct btrfs_root *root = inode->root; struct btrfs_root *log = root->log_root; - struct extent_buffer *src; int err = 0; int ret; - int i; - int nritems; u64 first_offset = min_offset; u64 last_offset = (u64)-1; u64 ino = btrfs_ino(inode); - log = root->log_root; - min_key.objectid = ino; min_key.type = key_type; min_key.offset = min_offset; @@ -3713,62 +3949,14 @@ search: * from our directory */ while (1) { - struct btrfs_key tmp; - src = path->nodes[0]; - nritems = btrfs_header_nritems(src); - for (i = path->slots[0]; i < nritems; i++) { - struct btrfs_dir_item *di; - - btrfs_item_key_to_cpu(src, &min_key, i); - - if (min_key.objectid != ino || min_key.type != key_type) - goto done; - - if (need_resched()) { - btrfs_release_path(path); - cond_resched(); - goto search; - } - - ret = overwrite_item(trans, log, dst_path, src, i, - &min_key); - if (ret) { + ret = process_dir_items_leaf(trans, inode, path, dst_path, + key_type, ctx); + if (ret != 0) { + if (ret < 0) err = ret; - goto done; - } - - /* - * We must make sure that when we log a directory entry, - * the corresponding inode, after log replay, has a - * matching link count. For example: - * - * touch foo - * mkdir mydir - * sync - * ln foo mydir/bar - * xfs_io -c "fsync" mydir - * <crash> - * <mount fs and log replay> - * - * Would result in a fsync log that when replayed, our - * file inode would have a link count of 1, but we get - * two directory entries pointing to the same inode. - * After removing one of the names, it would not be - * possible to remove the other name, which resulted - * always in stale file handle errors, and would not - * be possible to rmdir the parent directory, since - * its i_size could never decrement to the value - * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors. - */ - di = btrfs_item_ptr(src, i, struct btrfs_dir_item); - btrfs_dir_item_key_to_cpu(src, di, &tmp); - if (ctx && - (btrfs_dir_transid(src, di) == trans->transid || - btrfs_dir_type(src, di) == BTRFS_FT_DIR) && - tmp.type != BTRFS_ROOT_ITEM_KEY) - ctx->log_new_dentries = true; + goto done; } - path->slots[0] = nritems; + path->slots[0] = btrfs_header_nritems(path->nodes[0]); /* * look ahead to the next item and see if it is also @@ -3782,21 +3970,26 @@ search: err = ret; goto done; } - btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); - if (tmp.objectid != ino || tmp.type != key_type) { + btrfs_item_key_to_cpu(path->nodes[0], &min_key, path->slots[0]); + if (min_key.objectid != ino || min_key.type != key_type) { last_offset = (u64)-1; goto done; } if (btrfs_header_generation(path->nodes[0]) != trans->transid) { ret = overwrite_item(trans, log, dst_path, path->nodes[0], path->slots[0], - &tmp); + &min_key); if (ret) err = ret; else - last_offset = tmp.offset; + last_offset = min_key.offset; goto done; } + if (need_resched()) { + btrfs_release_path(path); + cond_resched(); + goto search; + } } done: btrfs_release_path(path); @@ -3829,7 +4022,7 @@ done: * key logged by this transaction. */ static noinline int log_directory_changes(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode, + struct btrfs_inode *inode, struct btrfs_path *path, struct btrfs_path *dst_path, struct btrfs_log_ctx *ctx) @@ -3839,11 +4032,33 @@ static noinline int log_directory_changes(struct btrfs_trans_handle *trans, int ret; int key_type = BTRFS_DIR_ITEM_KEY; + /* + * If this is the first time we are being logged in the current + * transaction, or we were logged before but the inode was evicted and + * reloaded later, in which case its logged_trans is 0, reset the values + * of the last logged key offsets. Note that we don't use the helper + * function inode_logged() here - that is because the function returns + * true after an inode eviction, assuming the worst case as it can not + * know for sure if the inode was logged before. So we can not skip key + * searches in the case the inode was evicted, because it may not have + * been logged in this transaction and may have been logged in a past + * transaction, so we need to reset the last dir item and index offsets + * to (u64)-1. + */ + if (inode->logged_trans != trans->transid) { + inode->last_dir_item_offset = (u64)-1; + inode->last_dir_index_offset = (u64)-1; + } again: min_key = 0; max_key = 0; + if (key_type == BTRFS_DIR_ITEM_KEY) + ctx->last_dir_item_offset = inode->last_dir_item_offset; + else + ctx->last_dir_item_offset = inode->last_dir_index_offset; + while (1) { - ret = log_dir_items(trans, root, inode, path, dst_path, key_type, + ret = log_dir_items(trans, inode, path, dst_path, key_type, ctx, min_key, &max_key); if (ret) return ret; @@ -3853,8 +4068,11 @@ again: } if (key_type == BTRFS_DIR_ITEM_KEY) { + inode->last_dir_item_offset = ctx->last_dir_item_offset; key_type = BTRFS_DIR_INDEX_KEY; goto again; + } else { + inode->last_dir_index_offset = ctx->last_dir_item_offset; } return 0; } @@ -3865,17 +4083,21 @@ again: * This cannot be run for file data extents because it does not * free the extents they point to. */ -static int drop_objectid_items(struct btrfs_trans_handle *trans, +static int drop_inode_items(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct btrfs_path *path, - u64 objectid, int max_key_type) + struct btrfs_inode *inode, + int max_key_type) { int ret; struct btrfs_key key; struct btrfs_key found_key; int start_slot; - key.objectid = objectid; + if (!inode_logged(trans, inode)) + return 0; + + key.objectid = btrfs_ino(inode); key.type = max_key_type; key.offset = (u64)-1; @@ -3892,7 +4114,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans, btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); - if (found_key.objectid != objectid) + if (found_key.objectid != key.objectid) break; found_key.offset = 0; @@ -3917,6 +4139,21 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans, return ret; } +static int truncate_inode_items(struct btrfs_trans_handle *trans, + struct btrfs_root *log_root, + struct btrfs_inode *inode, + u64 new_size, u32 min_type) +{ + int ret; + + do { + ret = btrfs_truncate_inode_items(trans, log_root, inode, + new_size, min_type, NULL); + } while (ret == -EAGAIN); + + return ret; +} + static void fill_inode_item(struct btrfs_trans_handle *trans, struct extent_buffer *leaf, struct btrfs_inode_item *item, @@ -4089,6 +4326,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, int ret; struct btrfs_key *ins_keys; u32 *ins_sizes; + struct btrfs_item_batch batch; char *ins_data; int i; struct list_head ordered_sums; @@ -4103,13 +4341,17 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, ins_sizes = (u32 *)ins_data; ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); + batch.keys = ins_keys; + batch.data_sizes = ins_sizes; + batch.total_data_size = 0; + batch.nr = nr; for (i = 0; i < nr; i++) { ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); + batch.total_data_size += ins_sizes[i]; btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); } - ret = btrfs_insert_empty_items(trans, log, dst_path, - ins_keys, ins_sizes, nr); + ret = btrfs_insert_empty_items(trans, log, dst_path, &batch); if (ret) { kfree(ins_data); return ret; @@ -4321,13 +4563,13 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, } static int log_one_extent(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode, struct btrfs_root *root, + struct btrfs_inode *inode, const struct extent_map *em, struct btrfs_path *path, struct btrfs_log_ctx *ctx) { struct btrfs_drop_extents_args drop_args = { 0 }; - struct btrfs_root *log = root->log_root; + struct btrfs_root *log = inode->root->log_root; struct btrfs_file_extent_item *fi; struct extent_buffer *leaf; struct btrfs_map_token token; @@ -4340,14 +4582,25 @@ static int log_one_extent(struct btrfs_trans_handle *trans, if (ret) return ret; - drop_args.path = path; - drop_args.start = em->start; - drop_args.end = em->start + em->len; - drop_args.replace_extent = true; - drop_args.extent_item_size = sizeof(*fi); - ret = btrfs_drop_extents(trans, log, inode, &drop_args); - if (ret) - return ret; + /* + * If this is the first time we are logging the inode in the current + * transaction, we can avoid btrfs_drop_extents(), which is expensive + * because it does a deletion search, which always acquires write locks + * for extent buffers at levels 2, 1 and 0. This not only wastes time + * but also adds significant contention in a log tree, since log trees + * are small, with a root at level 2 or 3 at most, due to their short + * life span. + */ + if (inode_logged(trans, inode)) { + drop_args.path = path; + drop_args.start = em->start; + drop_args.end = em->start + em->len; + drop_args.replace_extent = true; + drop_args.extent_item_size = sizeof(*fi); + ret = btrfs_drop_extents(trans, log, inode, &drop_args); + if (ret) + return ret; + } if (!drop_args.extent_inserted) { key.objectid = btrfs_ino(inode); @@ -4505,13 +4758,9 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, * Avoid logging extent items logged in past fsync calls * and leading to duplicate keys in the log tree. */ - do { - ret = btrfs_truncate_inode_items(trans, - root->log_root, - inode, truncate_offset, - BTRFS_EXTENT_DATA_KEY, - NULL); - } while (ret == -EAGAIN); + ret = truncate_inode_items(trans, root->log_root, inode, + truncate_offset, + BTRFS_EXTENT_DATA_KEY); if (ret) goto out; dropped_extents = true; @@ -4538,7 +4787,6 @@ out: } static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode, struct btrfs_path *path, struct btrfs_log_ctx *ctx) @@ -4603,7 +4851,7 @@ process: write_unlock(&tree->lock); - ret = log_one_extent(trans, inode, root, em, path, ctx); + ret = log_one_extent(trans, inode, em, path, ctx); write_lock(&tree->lock); clear_em_logging(tree, em); free_extent_map(em); @@ -4692,11 +4940,11 @@ static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode, * with a journal, ext3/4, xfs, f2fs, etc). */ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode, struct btrfs_path *path, struct btrfs_path *dst_path) { + struct btrfs_root *root = inode->root; int ret; struct btrfs_key key; const u64 ino = btrfs_ino(inode); @@ -4770,10 +5018,10 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, * truncate operation that changes the inode's size. */ static int btrfs_log_holes(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode, struct btrfs_path *path) { + struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key key; const u64 ino = btrfs_ino(inode); @@ -5050,7 +5298,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, if (IS_ERR(inode)) { ret = PTR_ERR(inode); } else { - ret = btrfs_log_inode(trans, root, + ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_OTHER_INODE_ALL, ctx); @@ -5110,8 +5358,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * well because during a rename we pin the log and update the * log with the new name before we unpin it. */ - ret = btrfs_log_inode(trans, root, BTRFS_I(inode), - LOG_OTHER_INODE, ctx); + ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_OTHER_INODE, ctx); if (ret) { btrfs_add_delayed_iput(inode); continue; @@ -5222,7 +5469,7 @@ again: &other_ino, &other_parent); if (ret < 0) { return ret; - } else if (ret > 0 && ctx && + } else if (ret > 0 && other_ino != btrfs_ino(BTRFS_I(ctx->inode))) { if (ins_nr > 0) { ins_nr++; @@ -5322,7 +5569,7 @@ next_key: * This handles both files and directories. */ static int btrfs_log_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode, + struct btrfs_inode *inode, int inode_only, struct btrfs_log_ctx *ctx) { @@ -5330,7 +5577,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_path *dst_path; struct btrfs_key min_key; struct btrfs_key max_key; - struct btrfs_root *log = root->log_root; + struct btrfs_root *log = inode->root->log_root; int err = 0; int ret = 0; bool fast_search = false; @@ -5372,22 +5619,11 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, * Only run delayed items if we are a directory. We want to make sure * all directory indexes hit the fs/subvolume tree so we can find them * and figure out which index ranges have to be logged. - * - * Otherwise commit the delayed inode only if the full sync flag is set, - * as we want to make sure an up to date version is in the subvolume - * tree so copy_inode_items_to_log() / copy_items() can find it and copy - * it to the log tree. For a non full sync, we always log the inode item - * based on the in-memory struct btrfs_inode which is always up to date. */ - if (S_ISDIR(inode->vfs_inode.i_mode)) - ret = btrfs_commit_inode_delayed_items(trans, inode); - else if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags)) - ret = btrfs_commit_inode_delayed_inode(inode); - - if (ret) { - btrfs_free_path(path); - btrfs_free_path(dst_path); - return ret; + if (S_ISDIR(inode->vfs_inode.i_mode)) { + err = btrfs_commit_inode_delayed_items(trans, inode); + if (err) + goto out; } if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) { @@ -5426,9 +5662,9 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags); if (inode_only == LOG_INODE_EXISTS) max_key_type = BTRFS_XATTR_ITEM_KEY; - ret = drop_objectid_items(trans, log, path, ino, max_key_type); + ret = drop_inode_items(trans, log, path, inode, max_key_type); } else { - if (inode_only == LOG_INODE_EXISTS) { + if (inode_only == LOG_INODE_EXISTS && inode_logged(trans, inode)) { /* * Make sure the new inode item we write to the log has * the same isize as the current one (if it exists). @@ -5450,19 +5686,16 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, &inode->runtime_flags)) { if (inode_only == LOG_INODE_EXISTS) { max_key.type = BTRFS_XATTR_ITEM_KEY; - ret = drop_objectid_items(trans, log, path, ino, - max_key.type); + ret = drop_inode_items(trans, log, path, inode, + max_key.type); } else { clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags); - while(1) { - ret = btrfs_truncate_inode_items(trans, - log, inode, 0, 0, NULL); - if (ret != -EAGAIN) - break; - } + if (inode_logged(trans, inode)) + ret = truncate_inode_items(trans, log, + inode, 0, 0); } } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, &inode->runtime_flags) || @@ -5470,8 +5703,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, if (inode_only == LOG_INODE_ALL) fast_search = true; max_key.type = BTRFS_XATTR_ITEM_KEY; - ret = drop_objectid_items(trans, log, path, ino, - max_key.type); + ret = drop_inode_items(trans, log, path, inode, + max_key.type); } else { if (inode_only == LOG_INODE_ALL) fast_search = true; @@ -5494,14 +5727,14 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, btrfs_release_path(path); btrfs_release_path(dst_path); - err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path); + err = btrfs_log_all_xattrs(trans, inode, path, dst_path); if (err) goto out_unlock; xattrs_logged = true; if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { btrfs_release_path(path); btrfs_release_path(dst_path); - err = btrfs_log_holes(trans, root, inode, path); + err = btrfs_log_holes(trans, inode, path); if (err) goto out_unlock; } @@ -5521,16 +5754,14 @@ log_extents: * BTRFS_INODE_COPY_EVERYTHING set. */ if (!xattrs_logged && inode->logged_trans < trans->transid) { - err = btrfs_log_all_xattrs(trans, root, inode, path, - dst_path); + err = btrfs_log_all_xattrs(trans, inode, path, dst_path); if (err) goto out_unlock; btrfs_release_path(path); } } if (fast_search) { - ret = btrfs_log_changed_extents(trans, root, inode, dst_path, - ctx); + ret = btrfs_log_changed_extents(trans, inode, dst_path, ctx); if (ret) { err = ret; goto out_unlock; @@ -5545,59 +5776,52 @@ log_extents: } if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) { - ret = log_directory_changes(trans, root, inode, path, dst_path, - ctx); + ret = log_directory_changes(trans, inode, path, dst_path, ctx); if (ret) { err = ret; goto out_unlock; } } + spin_lock(&inode->lock); + inode->logged_trans = trans->transid; /* - * If we are logging that an ancestor inode exists as part of logging a - * new name from a link or rename operation, don't mark the inode as - * logged - otherwise if an explicit fsync is made against an ancestor, - * the fsync considers the inode in the log and doesn't sync the log, - * resulting in the ancestor missing after a power failure unless the - * log was synced as part of an fsync against any other unrelated inode. - * So keep it simple for this case and just don't flag the ancestors as - * logged. + * Don't update last_log_commit if we logged that an inode exists. + * We do this for three reasons: + * + * 1) We might have had buffered writes to this inode that were + * flushed and had their ordered extents completed in this + * transaction, but we did not previously log the inode with + * LOG_INODE_ALL. Later the inode was evicted and after that + * it was loaded again and this LOG_INODE_EXISTS log operation + * happened. We must make sure that if an explicit fsync against + * the inode is performed later, it logs the new extents, an + * updated inode item, etc, and syncs the log. The same logic + * applies to direct IO writes instead of buffered writes. + * + * 2) When we log the inode with LOG_INODE_EXISTS, its inode item + * is logged with an i_size of 0 or whatever value was logged + * before. If later the i_size of the inode is increased by a + * truncate operation, the log is synced through an fsync of + * some other inode and then finally an explicit fsync against + * this inode is made, we must make sure this fsync logs the + * inode with the new i_size, the hole between old i_size and + * the new i_size, and syncs the log. + * + * 3) If we are logging that an ancestor inode exists as part of + * logging a new name from a link or rename operation, don't update + * its last_log_commit - otherwise if an explicit fsync is made + * against an ancestor, the fsync considers the inode in the log + * and doesn't sync the log, resulting in the ancestor missing after + * a power failure unless the log was synced as part of an fsync + * against any other unrelated inode. */ - if (!ctx || - !(S_ISDIR(inode->vfs_inode.i_mode) && ctx->logging_new_name && - &inode->vfs_inode != ctx->inode)) { - spin_lock(&inode->lock); - inode->logged_trans = trans->transid; - /* - * Don't update last_log_commit if we logged that an inode exists. - * We do this for two reasons: - * - * 1) We might have had buffered writes to this inode that were - * flushed and had their ordered extents completed in this - * transaction, but we did not previously log the inode with - * LOG_INODE_ALL. Later the inode was evicted and after that - * it was loaded again and this LOG_INODE_EXISTS log operation - * happened. We must make sure that if an explicit fsync against - * the inode is performed later, it logs the new extents, an - * updated inode item, etc, and syncs the log. The same logic - * applies to direct IO writes instead of buffered writes. - * - * 2) When we log the inode with LOG_INODE_EXISTS, its inode item - * is logged with an i_size of 0 or whatever value was logged - * before. If later the i_size of the inode is increased by a - * truncate operation, the log is synced through an fsync of - * some other inode and then finally an explicit fsync against - * this inode is made, we must make sure this fsync logs the - * inode with the new i_size, the hole between old i_size and - * the new i_size, and syncs the log. - */ - if (inode_only != LOG_INODE_EXISTS) - inode->last_log_commit = inode->last_sub_trans; - spin_unlock(&inode->lock); - } + if (inode_only != LOG_INODE_EXISTS) + inode->last_log_commit = inode->last_sub_trans; + spin_unlock(&inode->lock); out_unlock: mutex_unlock(&inode->log_mutex); - +out: btrfs_free_path(path); btrfs_free_path(dst_path); return err; @@ -5697,6 +5921,14 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, struct btrfs_dir_list *dir_elem; int ret = 0; + /* + * If we are logging a new name, as part of a link or rename operation, + * don't bother logging new dentries, as we just want to log the names + * of an inode and that any new parents exist. + */ + if (ctx->logging_new_name) + return 0; + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -5773,7 +6005,7 @@ process_leaf: ctx->log_new_dentries = false; if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK) log_mode = LOG_INODE_ALL; - ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode), + ret = btrfs_log_inode(trans, BTRFS_I(di_inode), log_mode, ctx); btrfs_add_delayed_iput(di_inode); if (ret) @@ -5917,11 +6149,10 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, continue; } - if (ctx) - ctx->log_new_dentries = false; - ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode), + ctx->log_new_dentries = false; + ret = btrfs_log_inode(trans, BTRFS_I(dir_inode), LOG_INODE_ALL, ctx); - if (!ret && ctx && ctx->log_new_dentries) + if (!ret && ctx->log_new_dentries) ret = log_new_dir_dentries(trans, root, BTRFS_I(dir_inode), ctx); btrfs_add_delayed_iput(dir_inode); @@ -5967,7 +6198,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, if (BTRFS_I(inode)->generation >= trans->transid && need_log_inode(trans, BTRFS_I(inode))) - ret = btrfs_log_inode(trans, root, BTRFS_I(inode), + ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx); btrfs_add_delayed_iput(inode); if (ret) @@ -6022,7 +6253,7 @@ static int log_new_ancestors_fast(struct btrfs_trans_handle *trans, if (inode->generation >= trans->transid && need_log_inode(trans, inode)) { - ret = btrfs_log_inode(trans, root, inode, + ret = btrfs_log_inode(trans, inode, LOG_INODE_EXISTS, ctx); if (ret) break; @@ -6165,7 +6396,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (ret) goto end_no_trans; - ret = btrfs_log_inode(trans, root, inode, inode_only, ctx); + ret = btrfs_log_inode(trans, inode, inode_only, ctx); if (ret) goto end_trans; @@ -6182,7 +6413,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_trans; } - if (S_ISDIR(inode->vfs_inode.i_mode) && ctx && ctx->log_new_dentries) + if (S_ISDIR(inode->vfs_inode.i_mode) && ctx->log_new_dentries) log_dentries = true; /* @@ -6308,8 +6539,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) ret = walk_log_tree(trans, log_root_tree, &wc); if (ret) { - btrfs_handle_fs_error(fs_info, ret, - "Failed to pin buffers while recovering log root tree."); + btrfs_abort_transaction(trans, ret); goto error; } @@ -6322,8 +6552,7 @@ again: ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); if (ret < 0) { - btrfs_handle_fs_error(fs_info, ret, - "Couldn't find tree log root."); + btrfs_abort_transaction(trans, ret); goto error; } if (ret > 0) { @@ -6340,8 +6569,7 @@ again: log = btrfs_read_tree_root(log_root_tree, &found_key); if (IS_ERR(log)) { ret = PTR_ERR(log); - btrfs_handle_fs_error(fs_info, ret, - "Couldn't read tree log root."); + btrfs_abort_transaction(trans, ret); goto error; } @@ -6369,8 +6597,7 @@ again: if (!ret) goto next; - btrfs_handle_fs_error(fs_info, ret, - "Couldn't read target root for tree log recovery."); + btrfs_abort_transaction(trans, ret); goto error; } @@ -6378,14 +6605,15 @@ again: ret = btrfs_record_root_in_trans(trans, wc.replay_dest); if (ret) /* The loop needs to continue due to the root refs */ - btrfs_handle_fs_error(fs_info, ret, - "failed to record the log root in transaction"); + btrfs_abort_transaction(trans, ret); else ret = walk_log_tree(trans, log, &wc); if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { ret = fixup_inode_link_counts(trans, wc.replay_dest, path); + if (ret) + btrfs_abort_transaction(trans, ret); } if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { @@ -6402,6 +6630,8 @@ again: * could only happen during mount. */ ret = btrfs_init_root_free_objectid(root); + if (ret) + btrfs_abort_transaction(trans, ret); } wc.replay_dest->log_root = NULL; diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 731bd9c029f5..f6811c3df38a 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -17,6 +17,8 @@ struct btrfs_log_ctx { int log_transid; bool log_new_dentries; bool logging_new_name; + /* Tracks the last logged dir item/index key offset. */ + u64 last_dir_item_offset; struct inode *inode; struct list_head list; /* Only used for fast fsyncs. */ @@ -68,14 +70,14 @@ int btrfs_recover_log_trees(struct btrfs_root *tree_root); int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct dentry *dentry, struct btrfs_log_ctx *ctx); -int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, - struct btrfs_inode *dir, u64 index); -int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, - struct btrfs_inode *inode, u64 dirid); +void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct btrfs_inode *dir, u64 index); +void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct btrfs_inode *inode, u64 dirid); void btrfs_end_log_trans(struct btrfs_root *root); void btrfs_pin_log_trans(struct btrfs_root *root); void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index 28d443d3ef93..4968535dfff0 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -451,7 +451,7 @@ static int del_orphan(struct btrfs_trans_handle *trans, struct btrfs_inode *inod */ static int rollback_verity(struct btrfs_inode *inode) { - struct btrfs_trans_handle *trans; + struct btrfs_trans_handle *trans = NULL; struct btrfs_root *root = inode->root; int ret; @@ -473,6 +473,7 @@ static int rollback_verity(struct btrfs_inode *inode) trans = btrfs_start_transaction(root, 2); if (IS_ERR(trans)) { ret = PTR_ERR(trans); + trans = NULL; btrfs_handle_fs_error(root->fs_info, ret, "failed to start transaction in verity rollback %llu", (u64)inode->vfs_inode.i_ino); @@ -490,8 +491,9 @@ static int rollback_verity(struct btrfs_inode *inode) btrfs_abort_transaction(trans, ret); goto out; } - btrfs_end_transaction(trans); out: + if (trans) + btrfs_end_transaction(trans); return ret; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 464485aa7318..61ac57bcbf1a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -14,6 +14,7 @@ #include <linux/semaphore.h> #include <linux/uuid.h> #include <linux/list_sort.h> +#include <linux/namei.h> #include "misc.h" #include "ctree.h" #include "extent_map.h" @@ -250,7 +251,7 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); static int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, - struct btrfs_bio **bbio_ret, + struct btrfs_io_context **bioc_ret, int mirror_num, int need_raid_map); /* @@ -508,7 +509,7 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, } if (flush) - filemap_write_and_wait((*bdev)->bd_inode->i_mapping); + sync_blockdev(*bdev); ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE); if (ret) { blkdev_put(*bdev, flags); @@ -812,9 +813,13 @@ static noinline struct btrfs_device *device_list_add(const char *path, device = NULL; } else { + struct btrfs_dev_lookup_args args = { + .devid = devid, + .uuid = disk_super->dev_item.uuid, + }; + mutex_lock(&fs_devices->device_list_mutex); - device = btrfs_find_device(fs_devices, devid, - disk_super->dev_item.uuid, NULL); + device = btrfs_find_device(fs_devices, &args); /* * If this disk has been pulled into an fs devices created by @@ -1091,7 +1096,7 @@ void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices) list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list) __btrfs_free_extra_devids(seed_dev, &latest_dev); - fs_devices->latest_bdev = latest_dev->bdev; + fs_devices->latest_dev = latest_dev; mutex_unlock(&uuid_mutex); } @@ -1122,8 +1127,10 @@ static void btrfs_close_one_device(struct btrfs_device *device) if (device->devid == BTRFS_DEV_REPLACE_DEVID) clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); - if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) + if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) { + clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); fs_devices->missing_devices--; + } btrfs_close_bdev(device); if (device->bdev) { @@ -1137,6 +1144,19 @@ static void btrfs_close_one_device(struct btrfs_device *device) atomic_set(&device->dev_stats_ccnt, 0); extent_io_tree_release(&device->alloc_state); + /* + * Reset the flush error record. We might have a transient flush error + * in this mount, and if so we aborted the current transaction and set + * the fs to an error state, guaranteeing no super blocks can be further + * committed. However that error might be transient and if we unmount the + * filesystem and mount it again, we should allow the mount to succeed + * (btrfs_check_rw_degradable() should not fail) - if after mounting the + * filesystem again we still get flush errors, then we will again abort + * any transaction and set the error state, guaranteeing no commits of + * unsafe super blocks. + */ + device->last_flush_error = 0; + /* Verify the device is back in a pristine state */ ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)); ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); @@ -1209,7 +1229,7 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, return -EINVAL; fs_devices->opened = 1; - fs_devices->latest_bdev = latest_dev->bdev; + fs_devices->latest_dev = latest_dev; fs_devices->total_rw_bytes = 0; fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR; fs_devices->read_policy = BTRFS_READ_POLICY_PID; @@ -1273,7 +1293,7 @@ static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev pgoff_t index; /* make sure our super fits in the device */ - if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode)) + if (bytenr + PAGE_SIZE >= bdev_nr_bytes(bdev)) return ERR_PTR(-EINVAL); /* make sure our super fits in the page */ @@ -1830,8 +1850,10 @@ static int btrfs_add_dev_item(struct btrfs_trans_handle *trans, key.type = BTRFS_DEV_ITEM_KEY; key.offset = device->devid; + btrfs_reserve_chunk_metadata(trans, true); ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path, &key, sizeof(*dev_item)); + btrfs_trans_release_chunk_metadata(trans); if (ret) goto out; @@ -1869,18 +1891,22 @@ out: /* * Function to update ctime/mtime for a given device path. * Mainly used for ctime/mtime based probe like libblkid. + * + * We don't care about errors here, this is just to be kind to userspace. */ -static void update_dev_time(struct block_device *bdev) +static void update_dev_time(const char *device_path) { - struct inode *inode = bdev->bd_inode; + struct path path; struct timespec64 now; + int ret; - /* Shouldn't happen but just in case. */ - if (!inode) + ret = kern_path(device_path, LOOKUP_FOLLOW, &path); + if (ret) return; - now = current_time(inode); - generic_update_time(inode, &now, S_MTIME | S_CTIME); + now = current_time(d_inode(path.dentry)); + inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME); + path_put(&path); } static int btrfs_rm_dev_item(struct btrfs_device *device) @@ -1904,7 +1930,9 @@ static int btrfs_rm_dev_item(struct btrfs_device *device) key.type = BTRFS_DEV_ITEM_KEY; key.offset = device->devid; + btrfs_reserve_chunk_metadata(trans, false); ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + btrfs_trans_release_chunk_metadata(trans); if (ret) { if (ret > 0) ret = -ENOENT; @@ -1973,7 +2001,7 @@ static struct btrfs_device * btrfs_find_next_active_device( } /* - * Helper function to check if the given device is part of s_bdev / latest_bdev + * Helper function to check if the given device is part of s_bdev / latest_dev * and replace it with the provided or the next active device, in the context * where this function called, there should be always be another device (or * this_dev) which is active. @@ -1992,8 +2020,8 @@ void __cold btrfs_assign_next_active_device(struct btrfs_device *device, (fs_info->sb->s_bdev == device->bdev)) fs_info->sb->s_bdev = next_device->bdev; - if (fs_info->fs_devices->latest_bdev == device->bdev) - fs_info->fs_devices->latest_bdev = next_device->bdev; + if (fs_info->fs_devices->latest_dev->bdev == device->bdev) + fs_info->fs_devices->latest_dev = next_device; } /* @@ -2056,11 +2084,12 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, btrfs_kobject_uevent(bdev, KOBJ_CHANGE); /* Update ctime/mtime for device path for libblkid */ - update_dev_time(bdev); + update_dev_time(device_path); } -int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, - u64 devid, struct block_device **bdev, fmode_t *mode) +int btrfs_rm_device(struct btrfs_fs_info *fs_info, + struct btrfs_dev_lookup_args *args, + struct block_device **bdev, fmode_t *mode) { struct btrfs_device *device; struct btrfs_fs_devices *cur_devices; @@ -2068,22 +2097,23 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, u64 num_devices; int ret = 0; - mutex_lock(&uuid_mutex); - + /* + * The device list in fs_devices is accessed without locks (neither + * uuid_mutex nor device_list_mutex) as it won't change on a mounted + * filesystem and another device rm cannot run. + */ num_devices = btrfs_num_devices(fs_info); ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1); if (ret) goto out; - device = btrfs_find_device_by_devspec(fs_info, devid, device_path); - - if (IS_ERR(device)) { - if (PTR_ERR(device) == -ENOENT && - device_path && strcmp(device_path, "missing") == 0) + device = btrfs_find_device(fs_info->fs_devices, args); + if (!device) { + if (args->missing) ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND; else - ret = PTR_ERR(device); + ret = -ENOENT; goto out; } @@ -2113,11 +2143,9 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, mutex_unlock(&fs_info->chunk_mutex); } - mutex_unlock(&uuid_mutex); ret = btrfs_shrink_device(device, 0); if (!ret) btrfs_reada_remove_dev(device); - mutex_lock(&uuid_mutex); if (ret) goto error_undo; @@ -2146,7 +2174,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, /* * In normal cases the cur_devices == fs_devices. But in case * of deleting a seed device, the cur_devices should point to - * its own fs_devices listed under the fs_devices->seed. + * its own fs_devices listed under the fs_devices->seed_list. */ cur_devices = device->fs_devices; mutex_lock(&fs_devices->device_list_mutex); @@ -2197,14 +2225,21 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, synchronize_rcu(); btrfs_free_device(device); - if (cur_devices->open_devices == 0) { + /* + * This can happen if cur_devices is the private seed devices list. We + * cannot call close_fs_devices() here because it expects the uuid_mutex + * to be held, but in fact we don't need that for the private + * seed_devices, we can simply decrement cur_devices->opened and then + * remove it from our list and free the fs_devices. + */ + if (cur_devices->num_devices == 0) { list_del_init(&cur_devices->seed_list); - close_fs_devices(cur_devices); + ASSERT(cur_devices->opened == 1); + cur_devices->opened--; free_fs_devices(cur_devices); } out: - mutex_unlock(&uuid_mutex); return ret; error_undo: @@ -2292,13 +2327,6 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) mutex_unlock(&fs_devices->device_list_mutex); - /* - * The update_dev_time() with in btrfs_scratch_superblocks() - * may lead to a call to btrfs_show_devname() which will try - * to hold device_list_mutex. And here this device - * is already out of device list, so we don't have to hold - * the device_list_mutex lock. - */ btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev, tgtdev->name->str); @@ -2307,69 +2335,98 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) btrfs_free_device(tgtdev); } -static struct btrfs_device *btrfs_find_device_by_path( - struct btrfs_fs_info *fs_info, const char *device_path) +/** + * Populate args from device at path + * + * @fs_info: the filesystem + * @args: the args to populate + * @path: the path to the device + * + * This will read the super block of the device at @path and populate @args with + * the devid, fsid, and uuid. This is meant to be used for ioctls that need to + * lookup a device to operate on, but need to do it before we take any locks. + * This properly handles the special case of "missing" that a user may pass in, + * and does some basic sanity checks. The caller must make sure that @path is + * properly NUL terminated before calling in, and must call + * btrfs_put_dev_args_from_path() in order to free up the temporary fsid and + * uuid buffers. + * + * Return: 0 for success, -errno for failure + */ +int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, + struct btrfs_dev_lookup_args *args, + const char *path) { - int ret = 0; struct btrfs_super_block *disk_super; - u64 devid; - u8 *dev_uuid; struct block_device *bdev; - struct btrfs_device *device; + int ret; - ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ, - fs_info->bdev_holder, 0, &bdev, &disk_super); - if (ret) - return ERR_PTR(ret); + if (!path || !path[0]) + return -EINVAL; + if (!strcmp(path, "missing")) { + args->missing = true; + return 0; + } - devid = btrfs_stack_device_id(&disk_super->dev_item); - dev_uuid = disk_super->dev_item.uuid; + args->uuid = kzalloc(BTRFS_UUID_SIZE, GFP_KERNEL); + args->fsid = kzalloc(BTRFS_FSID_SIZE, GFP_KERNEL); + if (!args->uuid || !args->fsid) { + btrfs_put_dev_args_from_path(args); + return -ENOMEM; + } + + ret = btrfs_get_bdev_and_sb(path, FMODE_READ, fs_info->bdev_holder, 0, + &bdev, &disk_super); + if (ret) + return ret; + args->devid = btrfs_stack_device_id(&disk_super->dev_item); + memcpy(args->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE); if (btrfs_fs_incompat(fs_info, METADATA_UUID)) - device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, - disk_super->metadata_uuid); + memcpy(args->fsid, disk_super->metadata_uuid, BTRFS_FSID_SIZE); else - device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, - disk_super->fsid); - + memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE); btrfs_release_disk_super(disk_super); - if (!device) - device = ERR_PTR(-ENOENT); blkdev_put(bdev, FMODE_READ); - return device; + return 0; } /* - * Lookup a device given by device id, or the path if the id is 0. + * Only use this jointly with btrfs_get_dev_args_from_path() because we will + * allocate our ->uuid and ->fsid pointers, everybody else uses local variables + * that don't need to be freed. */ +void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args) +{ + kfree(args->uuid); + kfree(args->fsid); + args->uuid = NULL; + args->fsid = NULL; +} + struct btrfs_device *btrfs_find_device_by_devspec( struct btrfs_fs_info *fs_info, u64 devid, const char *device_path) { + BTRFS_DEV_LOOKUP_ARGS(args); struct btrfs_device *device; + int ret; if (devid) { - device = btrfs_find_device(fs_info->fs_devices, devid, NULL, - NULL); + args.devid = devid; + device = btrfs_find_device(fs_info->fs_devices, &args); if (!device) return ERR_PTR(-ENOENT); return device; } - if (!device_path || !device_path[0]) - return ERR_PTR(-EINVAL); - - if (strcmp(device_path, "missing") == 0) { - /* Find first missing device */ - list_for_each_entry(device, &fs_info->fs_devices->devices, - dev_list) { - if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, - &device->dev_state) && !device->bdev) - return device; - } + ret = btrfs_get_dev_args_from_path(fs_info, &args, device_path); + if (ret) + return ERR_PTR(ret); + device = btrfs_find_device(fs_info->fs_devices, &args); + btrfs_put_dev_args_from_path(&args); + if (!device) return ERR_PTR(-ENOENT); - } - - return btrfs_find_device_by_path(fs_info, device_path); + return device; } /* @@ -2446,6 +2503,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) */ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans) { + BTRFS_DEV_LOOKUP_ARGS(args); struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root = fs_info->chunk_root; struct btrfs_path *path; @@ -2455,7 +2513,6 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans) struct btrfs_key key; u8 fs_uuid[BTRFS_FSID_SIZE]; u8 dev_uuid[BTRFS_UUID_SIZE]; - u64 devid; int ret; path = btrfs_alloc_path(); @@ -2467,7 +2524,9 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans) key.type = BTRFS_DEV_ITEM_KEY; while (1) { + btrfs_reserve_chunk_metadata(trans, false); ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + btrfs_trans_release_chunk_metadata(trans); if (ret < 0) goto error; @@ -2492,13 +2551,14 @@ next_slot: dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); - devid = btrfs_device_id(leaf, dev_item); + args.devid = btrfs_device_id(leaf, dev_item); read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), BTRFS_UUID_SIZE); read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), BTRFS_FSID_SIZE); - device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, - fs_uuid); + args.uuid = dev_uuid; + args.fsid = fs_uuid; + device = btrfs_find_device(fs_info->fs_devices, &args); BUG_ON(!device); /* Logic error */ if (device->fs_devices->seeding) { @@ -2597,8 +2657,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path device->io_width = fs_info->sectorsize; device->io_align = fs_info->sectorsize; device->sector_size = fs_info->sectorsize; - device->total_bytes = round_down(i_size_read(bdev->bd_inode), - fs_info->sectorsize); + device->total_bytes = + round_down(bdev_nr_bytes(bdev), fs_info->sectorsize); device->disk_total_bytes = device->total_bytes; device->commit_total_bytes = device->total_bytes; set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); @@ -2614,6 +2674,8 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path btrfs_abort_transaction(trans, ret); goto error_trans; } + btrfs_assign_next_active_device(fs_info->fs_devices->latest_dev, + device); } device->fs_devices = fs_devices; @@ -2720,7 +2782,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path btrfs_forget_devices(device_path); /* Update ctime/mtime for blkid or udev */ - update_dev_time(bdev); + update_dev_time(device_path); return ret; @@ -2813,6 +2875,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, struct btrfs_super_block *super_copy = fs_info->super_copy; u64 old_total; u64 diff; + int ret; if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) return -EACCES; @@ -2841,7 +2904,11 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, &trans->transaction->dev_update_list); mutex_unlock(&fs_info->chunk_mutex); - return btrfs_update_device(trans, device); + btrfs_reserve_chunk_metadata(trans, false); + ret = btrfs_update_device(trans, device); + btrfs_trans_release_chunk_metadata(trans); + + return ret; } static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) @@ -3083,7 +3150,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) const u64 sys_flags = btrfs_system_alloc_profile(fs_info); struct btrfs_block_group *sys_bg; - sys_bg = btrfs_alloc_chunk(trans, sys_flags); + sys_bg = btrfs_create_chunk(trans, sys_flags); if (IS_ERR(sys_bg)) { ret = PTR_ERR(sys_bg); btrfs_abort_transaction(trans, ret); @@ -4876,8 +4943,10 @@ again: round_down(old_total - diff, fs_info->sectorsize)); mutex_unlock(&fs_info->chunk_mutex); + btrfs_reserve_chunk_metadata(trans, false); /* Now btrfs_update_device() will change the on-disk size. */ ret = btrfs_update_device(trans, device); + btrfs_trans_release_chunk_metadata(trans); if (ret < 0) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); @@ -4960,7 +5029,7 @@ static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type) } /* - * Structure used internally for __btrfs_alloc_chunk() function. + * Structure used internally for btrfs_create_chunk() function. * Wraps needed parameters. */ struct alloc_chunk_ctl { @@ -5364,7 +5433,7 @@ error_del_extent: return block_group; } -struct btrfs_block_group *btrfs_alloc_chunk(struct btrfs_trans_handle *trans, +struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, u64 type) { struct btrfs_fs_info *info = trans->fs_info; @@ -5565,12 +5634,12 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans) */ alloc_profile = btrfs_metadata_alloc_profile(fs_info); - meta_bg = btrfs_alloc_chunk(trans, alloc_profile); + meta_bg = btrfs_create_chunk(trans, alloc_profile); if (IS_ERR(meta_bg)) return PTR_ERR(meta_bg); alloc_profile = btrfs_system_alloc_profile(fs_info); - sys_bg = btrfs_alloc_chunk(trans, alloc_profile); + sys_bg = btrfs_create_chunk(trans, alloc_profile); if (IS_ERR(sys_bg)) return PTR_ERR(sys_bg); @@ -5584,17 +5653,17 @@ static inline int btrfs_chunk_max_errors(struct map_lookup *map) return btrfs_raid_array[index].tolerated_failures; } -int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset) +bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset) { struct extent_map *em; struct map_lookup *map; - int readonly = 0; int miss_ndevs = 0; int i; + bool ret = true; em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); if (IS_ERR(em)) - return 1; + return false; map = em->map_lookup; for (i = 0; i < map->num_stripes; i++) { @@ -5605,21 +5674,20 @@ int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset) } if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &map->stripes[i].dev->dev_state)) { - readonly = 1; + ret = false; goto end; } } /* - * If the number of missing devices is larger than max errors, - * we can not write the data into that chunk successfully, so - * set it readonly. + * If the number of missing devices is larger than max errors, we can + * not write the data into that chunk successfully. */ if (miss_ndevs > btrfs_chunk_max_errors(map)) - readonly = 1; + ret = false; end: free_extent_map(em); - return readonly; + return ret; } void btrfs_mapping_tree_free(struct extent_map_tree *tree) @@ -5782,7 +5850,7 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, } /* Bubble-sort the stripe set to put the parity/syndrome stripes last */ -static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes) +static void sort_parity_stripes(struct btrfs_io_context *bioc, int num_stripes) { int i; int again = 1; @@ -5791,52 +5859,55 @@ static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes) again = 0; for (i = 0; i < num_stripes - 1; i++) { /* Swap if parity is on a smaller index */ - if (bbio->raid_map[i] > bbio->raid_map[i + 1]) { - swap(bbio->stripes[i], bbio->stripes[i + 1]); - swap(bbio->raid_map[i], bbio->raid_map[i + 1]); + if (bioc->raid_map[i] > bioc->raid_map[i + 1]) { + swap(bioc->stripes[i], bioc->stripes[i + 1]); + swap(bioc->raid_map[i], bioc->raid_map[i + 1]); again = 1; } } } } -static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes) +static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info, + int total_stripes, + int real_stripes) { - struct btrfs_bio *bbio = kzalloc( - /* the size of the btrfs_bio */ - sizeof(struct btrfs_bio) + - /* plus the variable array for the stripes */ - sizeof(struct btrfs_bio_stripe) * (total_stripes) + - /* plus the variable array for the tgt dev */ + struct btrfs_io_context *bioc = kzalloc( + /* The size of btrfs_io_context */ + sizeof(struct btrfs_io_context) + + /* Plus the variable array for the stripes */ + sizeof(struct btrfs_io_stripe) * (total_stripes) + + /* Plus the variable array for the tgt dev */ sizeof(int) * (real_stripes) + /* - * plus the raid_map, which includes both the tgt dev - * and the stripes + * Plus the raid_map, which includes both the tgt dev + * and the stripes. */ sizeof(u64) * (total_stripes), GFP_NOFS|__GFP_NOFAIL); - atomic_set(&bbio->error, 0); - refcount_set(&bbio->refs, 1); + atomic_set(&bioc->error, 0); + refcount_set(&bioc->refs, 1); - bbio->tgtdev_map = (int *)(bbio->stripes + total_stripes); - bbio->raid_map = (u64 *)(bbio->tgtdev_map + real_stripes); + bioc->fs_info = fs_info; + bioc->tgtdev_map = (int *)(bioc->stripes + total_stripes); + bioc->raid_map = (u64 *)(bioc->tgtdev_map + real_stripes); - return bbio; + return bioc; } -void btrfs_get_bbio(struct btrfs_bio *bbio) +void btrfs_get_bioc(struct btrfs_io_context *bioc) { - WARN_ON(!refcount_read(&bbio->refs)); - refcount_inc(&bbio->refs); + WARN_ON(!refcount_read(&bioc->refs)); + refcount_inc(&bioc->refs); } -void btrfs_put_bbio(struct btrfs_bio *bbio) +void btrfs_put_bioc(struct btrfs_io_context *bioc) { - if (!bbio) + if (!bioc) return; - if (refcount_dec_and_test(&bbio->refs)) - kfree(bbio); + if (refcount_dec_and_test(&bioc->refs)) + kfree(bioc); } /* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */ @@ -5846,11 +5917,11 @@ void btrfs_put_bbio(struct btrfs_bio *bbio) */ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, u64 logical, u64 *length_ret, - struct btrfs_bio **bbio_ret) + struct btrfs_io_context **bioc_ret) { struct extent_map *em; struct map_lookup *map; - struct btrfs_bio *bbio; + struct btrfs_io_context *bioc; u64 length = *length_ret; u64 offset; u64 stripe_nr; @@ -5869,8 +5940,8 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, int ret = 0; int i; - /* discard always return a bbio */ - ASSERT(bbio_ret); + /* Discard always returns a bioc. */ + ASSERT(bioc_ret); em = btrfs_get_chunk_map(fs_info, logical, length); if (IS_ERR(em)) @@ -5933,26 +6004,25 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, &stripe_index); } - bbio = alloc_btrfs_bio(num_stripes, 0); - if (!bbio) { + bioc = alloc_btrfs_io_context(fs_info, num_stripes, 0); + if (!bioc) { ret = -ENOMEM; goto out; } for (i = 0; i < num_stripes; i++) { - bbio->stripes[i].physical = + bioc->stripes[i].physical = map->stripes[stripe_index].physical + stripe_offset + stripe_nr * map->stripe_len; - bbio->stripes[i].dev = map->stripes[stripe_index].dev; + bioc->stripes[i].dev = map->stripes[stripe_index].dev; if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { - bbio->stripes[i].length = stripes_per_dev * + bioc->stripes[i].length = stripes_per_dev * map->stripe_len; if (i / sub_stripes < remaining_stripes) - bbio->stripes[i].length += - map->stripe_len; + bioc->stripes[i].length += map->stripe_len; /* * Special for the first stripe and @@ -5963,19 +6033,17 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, * off end_off */ if (i < sub_stripes) - bbio->stripes[i].length -= - stripe_offset; + bioc->stripes[i].length -= stripe_offset; if (stripe_index >= last_stripe && stripe_index <= (last_stripe + sub_stripes - 1)) - bbio->stripes[i].length -= - stripe_end_offset; + bioc->stripes[i].length -= stripe_end_offset; if (i == sub_stripes - 1) stripe_offset = 0; } else { - bbio->stripes[i].length = length; + bioc->stripes[i].length = length; } stripe_index++; @@ -5985,9 +6053,9 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, } } - *bbio_ret = bbio; - bbio->map_type = map->type; - bbio->num_stripes = num_stripes; + *bioc_ret = bioc; + bioc->map_type = map->type; + bioc->num_stripes = num_stripes; out: free_extent_map(em); return ret; @@ -6011,7 +6079,7 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info, u64 srcdev_devid, int *mirror_num, u64 *physical) { - struct btrfs_bio *bbio = NULL; + struct btrfs_io_context *bioc = NULL; int num_stripes; int index_srcdev = 0; int found = 0; @@ -6020,20 +6088,20 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info, int ret = 0; ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, - logical, &length, &bbio, 0, 0); + logical, &length, &bioc, 0, 0); if (ret) { - ASSERT(bbio == NULL); + ASSERT(bioc == NULL); return ret; } - num_stripes = bbio->num_stripes; + num_stripes = bioc->num_stripes; if (*mirror_num > num_stripes) { /* * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror, * that means that the requested area is not left of the left * cursor */ - btrfs_put_bbio(bbio); + btrfs_put_bioc(bioc); return -EIO; } @@ -6043,7 +6111,7 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info, * pointer to the one of the target drive. */ for (i = 0; i < num_stripes; i++) { - if (bbio->stripes[i].dev->devid != srcdev_devid) + if (bioc->stripes[i].dev->devid != srcdev_devid) continue; /* @@ -6051,15 +6119,15 @@ static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info, * mirror with the lowest physical address */ if (found && - physical_of_found <= bbio->stripes[i].physical) + physical_of_found <= bioc->stripes[i].physical) continue; index_srcdev = i; found = 1; - physical_of_found = bbio->stripes[i].physical; + physical_of_found = bioc->stripes[i].physical; } - btrfs_put_bbio(bbio); + btrfs_put_bioc(bioc); ASSERT(found); if (!found) @@ -6090,12 +6158,12 @@ static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical) } static void handle_ops_on_dev_replace(enum btrfs_map_op op, - struct btrfs_bio **bbio_ret, + struct btrfs_io_context **bioc_ret, struct btrfs_dev_replace *dev_replace, u64 logical, int *num_stripes_ret, int *max_errors_ret) { - struct btrfs_bio *bbio = *bbio_ret; + struct btrfs_io_context *bioc = *bioc_ret; u64 srcdev_devid = dev_replace->srcdev->devid; int tgtdev_indexes = 0; int num_stripes = *num_stripes_ret; @@ -6125,17 +6193,17 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op, */ index_where_to_add = num_stripes; for (i = 0; i < num_stripes; i++) { - if (bbio->stripes[i].dev->devid == srcdev_devid) { + if (bioc->stripes[i].dev->devid == srcdev_devid) { /* write to new disk, too */ - struct btrfs_bio_stripe *new = - bbio->stripes + index_where_to_add; - struct btrfs_bio_stripe *old = - bbio->stripes + i; + struct btrfs_io_stripe *new = + bioc->stripes + index_where_to_add; + struct btrfs_io_stripe *old = + bioc->stripes + i; new->physical = old->physical; new->length = old->length; new->dev = dev_replace->tgtdev; - bbio->tgtdev_map[i] = index_where_to_add; + bioc->tgtdev_map[i] = index_where_to_add; index_where_to_add++; max_errors++; tgtdev_indexes++; @@ -6155,30 +6223,29 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op, * full copy of the source drive. */ for (i = 0; i < num_stripes; i++) { - if (bbio->stripes[i].dev->devid == srcdev_devid) { + if (bioc->stripes[i].dev->devid == srcdev_devid) { /* * In case of DUP, in order to keep it simple, * only add the mirror with the lowest physical * address */ if (found && - physical_of_found <= - bbio->stripes[i].physical) + physical_of_found <= bioc->stripes[i].physical) continue; index_srcdev = i; found = 1; - physical_of_found = bbio->stripes[i].physical; + physical_of_found = bioc->stripes[i].physical; } } if (found) { - struct btrfs_bio_stripe *tgtdev_stripe = - bbio->stripes + num_stripes; + struct btrfs_io_stripe *tgtdev_stripe = + bioc->stripes + num_stripes; tgtdev_stripe->physical = physical_of_found; tgtdev_stripe->length = - bbio->stripes[index_srcdev].length; + bioc->stripes[index_srcdev].length; tgtdev_stripe->dev = dev_replace->tgtdev; - bbio->tgtdev_map[index_srcdev] = num_stripes; + bioc->tgtdev_map[index_srcdev] = num_stripes; tgtdev_indexes++; num_stripes++; @@ -6187,8 +6254,8 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op, *num_stripes_ret = num_stripes; *max_errors_ret = max_errors; - bbio->num_tgtdevs = tgtdev_indexes; - *bbio_ret = bbio; + bioc->num_tgtdevs = tgtdev_indexes; + *bioc_ret = bioc; } static bool need_full_stripe(enum btrfs_map_op op) @@ -6291,7 +6358,7 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em, static int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, - struct btrfs_bio **bbio_ret, + struct btrfs_io_context **bioc_ret, int mirror_num, int need_raid_map) { struct extent_map *em; @@ -6306,7 +6373,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int num_stripes; int max_errors = 0; int tgtdev_indexes = 0; - struct btrfs_bio *bbio = NULL; + struct btrfs_io_context *bioc = NULL; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; int dev_replace_is_ongoing = 0; int num_alloc_stripes; @@ -6315,7 +6382,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, u64 raid56_full_stripe_start = (u64)-1; struct btrfs_io_geometry geom; - ASSERT(bbio_ret); + ASSERT(bioc_ret); ASSERT(op != BTRFS_MAP_DISCARD); em = btrfs_get_chunk_map(fs_info, logical, *length); @@ -6459,20 +6526,20 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, tgtdev_indexes = num_stripes; } - bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes); - if (!bbio) { + bioc = alloc_btrfs_io_context(fs_info, num_alloc_stripes, tgtdev_indexes); + if (!bioc) { ret = -ENOMEM; goto out; } for (i = 0; i < num_stripes; i++) { - bbio->stripes[i].physical = map->stripes[stripe_index].physical + + bioc->stripes[i].physical = map->stripes[stripe_index].physical + stripe_offset + stripe_nr * map->stripe_len; - bbio->stripes[i].dev = map->stripes[stripe_index].dev; + bioc->stripes[i].dev = map->stripes[stripe_index].dev; stripe_index++; } - /* build raid_map */ + /* Build raid_map */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { u64 tmp; @@ -6484,15 +6551,15 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, /* Fill in the logical address of each stripe */ tmp = stripe_nr * data_stripes; for (i = 0; i < data_stripes; i++) - bbio->raid_map[(i+rot) % num_stripes] = + bioc->raid_map[(i + rot) % num_stripes] = em->start + (tmp + i) * map->stripe_len; - bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE; + bioc->raid_map[(i + rot) % map->num_stripes] = RAID5_P_STRIPE; if (map->type & BTRFS_BLOCK_GROUP_RAID6) - bbio->raid_map[(i+rot+1) % num_stripes] = + bioc->raid_map[(i + rot + 1) % num_stripes] = RAID6_Q_STRIPE; - sort_parity_stripes(bbio, num_stripes); + sort_parity_stripes(bioc, num_stripes); } if (need_full_stripe(op)) @@ -6500,15 +6567,15 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && need_full_stripe(op)) { - handle_ops_on_dev_replace(op, &bbio, dev_replace, logical, + handle_ops_on_dev_replace(op, &bioc, dev_replace, logical, &num_stripes, &max_errors); } - *bbio_ret = bbio; - bbio->map_type = map->type; - bbio->num_stripes = num_stripes; - bbio->max_errors = max_errors; - bbio->mirror_num = mirror_num; + *bioc_ret = bioc; + bioc->map_type = map->type; + bioc->num_stripes = num_stripes; + bioc->max_errors = max_errors; + bioc->mirror_num = mirror_num; /* * this is the case that REQ_READ && dev_replace_is_ongoing && @@ -6517,9 +6584,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, */ if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) { WARN_ON(num_stripes > 1); - bbio->stripes[0].dev = dev_replace->tgtdev; - bbio->stripes[0].physical = physical_to_patch_in_first_stripe; - bbio->mirror_num = map->num_stripes + 1; + bioc->stripes[0].dev = dev_replace->tgtdev; + bioc->stripes[0].physical = physical_to_patch_in_first_stripe; + bioc->mirror_num = map->num_stripes + 1; } out: if (dev_replace_is_ongoing) { @@ -6533,43 +6600,43 @@ out: int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, - struct btrfs_bio **bbio_ret, int mirror_num) + struct btrfs_io_context **bioc_ret, int mirror_num) { if (op == BTRFS_MAP_DISCARD) return __btrfs_map_block_for_discard(fs_info, logical, - length, bbio_ret); + length, bioc_ret); - return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, + return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, mirror_num, 0); } /* For Scrub/replace */ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, - struct btrfs_bio **bbio_ret) + struct btrfs_io_context **bioc_ret) { - return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1); + return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 0, 1); } -static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio) +static inline void btrfs_end_bioc(struct btrfs_io_context *bioc, struct bio *bio) { - bio->bi_private = bbio->private; - bio->bi_end_io = bbio->end_io; + bio->bi_private = bioc->private; + bio->bi_end_io = bioc->end_io; bio_endio(bio); - btrfs_put_bbio(bbio); + btrfs_put_bioc(bioc); } static void btrfs_end_bio(struct bio *bio) { - struct btrfs_bio *bbio = bio->bi_private; + struct btrfs_io_context *bioc = bio->bi_private; int is_orig_bio = 0; if (bio->bi_status) { - atomic_inc(&bbio->error); + atomic_inc(&bioc->error); if (bio->bi_status == BLK_STS_IOERR || bio->bi_status == BLK_STS_TARGET) { - struct btrfs_device *dev = btrfs_io_bio(bio)->device; + struct btrfs_device *dev = btrfs_bio(bio)->device; ASSERT(dev->bdev); if (btrfs_op(bio) == BTRFS_MAP_WRITE) @@ -6584,22 +6651,22 @@ static void btrfs_end_bio(struct bio *bio) } } - if (bio == bbio->orig_bio) + if (bio == bioc->orig_bio) is_orig_bio = 1; - btrfs_bio_counter_dec(bbio->fs_info); + btrfs_bio_counter_dec(bioc->fs_info); - if (atomic_dec_and_test(&bbio->stripes_pending)) { + if (atomic_dec_and_test(&bioc->stripes_pending)) { if (!is_orig_bio) { bio_put(bio); - bio = bbio->orig_bio; + bio = bioc->orig_bio; } - btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; + btrfs_bio(bio)->mirror_num = bioc->mirror_num; /* only send an error to the higher layers if it is * beyond the tolerance of the btrfs bio */ - if (atomic_read(&bbio->error) > bbio->max_errors) { + if (atomic_read(&bioc->error) > bioc->max_errors) { bio->bi_status = BLK_STS_IOERR; } else { /* @@ -6609,19 +6676,19 @@ static void btrfs_end_bio(struct bio *bio) bio->bi_status = BLK_STS_OK; } - btrfs_end_bbio(bbio, bio); + btrfs_end_bioc(bioc, bio); } else if (!is_orig_bio) { bio_put(bio); } } -static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio, +static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio, u64 physical, struct btrfs_device *dev) { - struct btrfs_fs_info *fs_info = bbio->fs_info; + struct btrfs_fs_info *fs_info = bioc->fs_info; - bio->bi_private = bbio; - btrfs_io_bio(bio)->device = dev; + bio->bi_private = bioc; + btrfs_bio(bio)->device = dev; bio->bi_end_io = btrfs_end_bio; bio->bi_iter.bi_sector = physical >> 9; /* @@ -6650,20 +6717,20 @@ static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio, btrfsic_submit_bio(bio); } -static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) +static void bioc_error(struct btrfs_io_context *bioc, struct bio *bio, u64 logical) { - atomic_inc(&bbio->error); - if (atomic_dec_and_test(&bbio->stripes_pending)) { + atomic_inc(&bioc->error); + if (atomic_dec_and_test(&bioc->stripes_pending)) { /* Should be the original bio. */ - WARN_ON(bio != bbio->orig_bio); + WARN_ON(bio != bioc->orig_bio); - btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; + btrfs_bio(bio)->mirror_num = bioc->mirror_num; bio->bi_iter.bi_sector = logical >> 9; - if (atomic_read(&bbio->error) > bbio->max_errors) + if (atomic_read(&bioc->error) > bioc->max_errors) bio->bi_status = BLK_STS_IOERR; else bio->bi_status = BLK_STS_OK; - btrfs_end_bbio(bbio, bio); + btrfs_end_bioc(bioc, bio); } } @@ -6678,36 +6745,34 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int ret; int dev_nr; int total_devs; - struct btrfs_bio *bbio = NULL; + struct btrfs_io_context *bioc = NULL; length = bio->bi_iter.bi_size; map_length = length; btrfs_bio_counter_inc_blocked(fs_info); ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, - &map_length, &bbio, mirror_num, 1); + &map_length, &bioc, mirror_num, 1); if (ret) { btrfs_bio_counter_dec(fs_info); return errno_to_blk_status(ret); } - total_devs = bbio->num_stripes; - bbio->orig_bio = first_bio; - bbio->private = first_bio->bi_private; - bbio->end_io = first_bio->bi_end_io; - bbio->fs_info = fs_info; - atomic_set(&bbio->stripes_pending, bbio->num_stripes); + total_devs = bioc->num_stripes; + bioc->orig_bio = first_bio; + bioc->private = first_bio->bi_private; + bioc->end_io = first_bio->bi_end_io; + atomic_set(&bioc->stripes_pending, bioc->num_stripes); - if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && + if ((bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) { /* In this case, map_length has been set to the length of a single stripe; not the whole write */ if (btrfs_op(bio) == BTRFS_MAP_WRITE) { - ret = raid56_parity_write(fs_info, bio, bbio, - map_length); + ret = raid56_parity_write(bio, bioc, map_length); } else { - ret = raid56_parity_recover(fs_info, bio, bbio, - map_length, mirror_num, 1); + ret = raid56_parity_recover(bio, bioc, map_length, + mirror_num, 1); } btrfs_bio_counter_dec(fs_info); @@ -6722,12 +6787,12 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, } for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { - dev = bbio->stripes[dev_nr].dev; + dev = bioc->stripes[dev_nr].dev; if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || (btrfs_op(first_bio) == BTRFS_MAP_WRITE && !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { - bbio_error(bbio, first_bio, logical); + bioc_error(bioc, first_bio, logical); continue; } @@ -6736,12 +6801,39 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, else bio = first_bio; - submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, dev); + submit_stripe_bio(bioc, bio, bioc->stripes[dev_nr].physical, dev); } btrfs_bio_counter_dec(fs_info); return BLK_STS_OK; } +static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args, + const struct btrfs_fs_devices *fs_devices) +{ + if (args->fsid == NULL) + return true; + if (memcmp(fs_devices->metadata_uuid, args->fsid, BTRFS_FSID_SIZE) == 0) + return true; + return false; +} + +static bool dev_args_match_device(const struct btrfs_dev_lookup_args *args, + const struct btrfs_device *device) +{ + ASSERT((args->devid != (u64)-1) || args->missing); + + if ((args->devid != (u64)-1) && device->devid != args->devid) + return false; + if (args->uuid && memcmp(device->uuid, args->uuid, BTRFS_UUID_SIZE) != 0) + return false; + if (!args->missing) + return true; + if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) && + !device->bdev) + return true; + return false; +} + /* * Find a device specified by @devid or @uuid in the list of @fs_devices, or * return NULL. @@ -6749,31 +6841,25 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, * If devid and uuid are both specified, the match must be exact, otherwise * only devid is used. */ -struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices, - u64 devid, u8 *uuid, u8 *fsid) +struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices, + const struct btrfs_dev_lookup_args *args) { struct btrfs_device *device; struct btrfs_fs_devices *seed_devs; - if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { + if (dev_args_match_fs_devices(args, fs_devices)) { list_for_each_entry(device, &fs_devices->devices, dev_list) { - if (device->devid == devid && - (!uuid || memcmp(device->uuid, uuid, - BTRFS_UUID_SIZE) == 0)) + if (dev_args_match_device(args, device)) return device; } } list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { - if (!fsid || - !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { - list_for_each_entry(device, &seed_devs->devices, - dev_list) { - if (device->devid == devid && - (!uuid || memcmp(device->uuid, uuid, - BTRFS_UUID_SIZE) == 0)) - return device; - } + if (!dev_args_match_fs_devices(args, seed_devs)) + continue; + list_for_each_entry(device, &seed_devs->devices, dev_list) { + if (dev_args_match_device(args, device)) + return device; } } @@ -6939,6 +7025,7 @@ static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info, static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, struct btrfs_chunk *chunk) { + BTRFS_DEV_LOOKUP_ARGS(args); struct btrfs_fs_info *fs_info = leaf->fs_info; struct extent_map_tree *map_tree = &fs_info->mapping_tree; struct map_lookup *map; @@ -7016,11 +7103,12 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, map->stripes[i].physical = btrfs_stripe_offset_nr(leaf, chunk, i); devid = btrfs_stripe_devid_nr(leaf, chunk, i); + args.devid = devid; read_extent_buffer(leaf, uuid, (unsigned long) btrfs_stripe_dev_uuid_nr(chunk, i), BTRFS_UUID_SIZE); - map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, - devid, uuid, NULL); + args.uuid = uuid; + map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, &args); if (!map->stripes[i].dev && !btrfs_test_opt(fs_info, DEGRADED)) { free_extent_map(em); @@ -7138,6 +7226,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, static int read_one_dev(struct extent_buffer *leaf, struct btrfs_dev_item *dev_item) { + BTRFS_DEV_LOOKUP_ARGS(args); struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; @@ -7146,11 +7235,13 @@ static int read_one_dev(struct extent_buffer *leaf, u8 fs_uuid[BTRFS_FSID_SIZE]; u8 dev_uuid[BTRFS_UUID_SIZE]; - devid = btrfs_device_id(leaf, dev_item); + devid = args.devid = btrfs_device_id(leaf, dev_item); read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), BTRFS_UUID_SIZE); read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), BTRFS_FSID_SIZE); + args.uuid = dev_uuid; + args.fsid = fs_uuid; if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) { fs_devices = open_seed_devices(fs_info, fs_uuid); @@ -7158,8 +7249,7 @@ static int read_one_dev(struct extent_buffer *leaf, return PTR_ERR(fs_devices); } - device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid, - fs_uuid); + device = btrfs_find_device(fs_info->fs_devices, &args); if (!device) { if (!btrfs_test_opt(fs_info, DEGRADED)) { btrfs_report_missing_device(fs_info, devid, @@ -7223,7 +7313,7 @@ static int read_one_dev(struct extent_buffer *leaf, fill_device_from_item(leaf, dev_item, device); if (device->bdev) { - u64 max_total_bytes = i_size_read(device->bdev->bd_inode); + u64 max_total_bytes = bdev_nr_bytes(device->bdev); if (device->total_bytes > max_total_bytes) { btrfs_err(fs_info, @@ -7828,12 +7918,14 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_get_dev_stats *stats) { + BTRFS_DEV_LOOKUP_ARGS(args); struct btrfs_device *dev; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; int i; mutex_lock(&fs_devices->device_list_mutex); - dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL); + args.devid = stats->devid; + dev = btrfs_find_device(fs_info->fs_devices, &args); mutex_unlock(&fs_devices->device_list_mutex); if (!dev) { @@ -7909,6 +8001,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, u64 chunk_offset, u64 devid, u64 physical_offset, u64 physical_len) { + struct btrfs_dev_lookup_args args = { .devid = devid }; struct extent_map_tree *em_tree = &fs_info->mapping_tree; struct extent_map *em; struct map_lookup *map; @@ -7964,7 +8057,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, } /* Make sure no dev extent is beyond device boundary */ - dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL); + dev = btrfs_find_device(fs_info->fs_devices, &args); if (!dev) { btrfs_err(fs_info, "failed to find devid %llu", devid); ret = -EUCLEAN; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 2183361db614..3b8130680749 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -236,17 +236,40 @@ struct btrfs_fs_devices { bool fsid_change; struct list_head fs_list; + /* + * Number of devices under this fsid including missing and + * replace-target device and excludes seed devices. + */ u64 num_devices; + + /* + * The number of devices that successfully opened, including + * replace-target, excludes seed devices. + */ u64 open_devices; + + /* The number of devices that are under the chunk allocation list. */ u64 rw_devices; + + /* Count of missing devices under this fsid excluding seed device. */ u64 missing_devices; u64 total_rw_bytes; + + /* + * Count of devices from btrfs_super_block::num_devices for this fsid, + * which includes the seed device, excludes the transient replace-target + * device. + */ u64 total_devices; /* Highest generation number of seen devices */ u64 latest_generation; - struct block_device *latest_bdev; + /* + * The mount device or a device with highest generation after removal + * or replace. + */ + struct btrfs_device *latest_dev; /* all of the devices in the FS, protected by a mutex * so we can safely walk it to write out the supers without @@ -300,48 +323,62 @@ struct btrfs_fs_devices { / sizeof(struct btrfs_stripe) + 1) /* - * we need the mirror number and stripe index to be passed around - * the call chain while we are processing end_io (especially errors). - * Really, what we need is a btrfs_bio structure that has this info - * and is properly sized with its stripe array, but we're not there - * quite yet. We have our own btrfs bioset, and all of the bios - * we allocate are actually btrfs_io_bios. We'll cram as much of - * struct btrfs_bio as we can into this over time. + * Additional info to pass along bio. + * + * Mostly for btrfs specific features like csum and mirror_num. */ -struct btrfs_io_bio { +struct btrfs_bio { unsigned int mirror_num; + + /* @device is for stripe IO submission. */ struct btrfs_device *device; - u64 logical; u8 *csum; u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; struct bvec_iter iter; + /* * This member must come last, bio_alloc_bioset will allocate enough - * bytes for entire btrfs_io_bio but relies on bio being last. + * bytes for entire btrfs_bio but relies on bio being last. */ struct bio bio; }; -static inline struct btrfs_io_bio *btrfs_io_bio(struct bio *bio) +static inline struct btrfs_bio *btrfs_bio(struct bio *bio) { - return container_of(bio, struct btrfs_io_bio, bio); + return container_of(bio, struct btrfs_bio, bio); } -static inline void btrfs_io_bio_free_csum(struct btrfs_io_bio *io_bio) +static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio) { - if (io_bio->csum != io_bio->csum_inline) { - kfree(io_bio->csum); - io_bio->csum = NULL; + if (bbio->csum != bbio->csum_inline) { + kfree(bbio->csum); + bbio->csum = NULL; } } -struct btrfs_bio_stripe { +struct btrfs_io_stripe { struct btrfs_device *dev; u64 physical; u64 length; /* only used for discard mappings */ }; -struct btrfs_bio { +/* + * Context for IO subsmission for device stripe. + * + * - Track the unfinished mirrors for mirror based profiles + * Mirror based profiles are SINGLE/DUP/RAID1/RAID10. + * + * - Contain the logical -> physical mapping info + * Used by submit_stripe_bio() for mapping logical bio + * into physical device address. + * + * - Contain device replace info + * Used by handle_ops_on_dev_replace() to copy logical bios + * into the new device. + * + * - Contain RAID56 full stripe logical bytenrs + */ +struct btrfs_io_context { refcount_t refs; atomic_t stripes_pending; struct btrfs_fs_info *fs_info; @@ -361,7 +398,7 @@ struct btrfs_bio { * so raid_map[0] is the start of our full stripe */ u64 *raid_map; - struct btrfs_bio_stripe stripes[]; + struct btrfs_io_stripe stripes[]; }; struct btrfs_device_info { @@ -396,11 +433,11 @@ struct map_lookup { int num_stripes; int sub_stripes; int verified_stripes; /* For mount time dev extent verification */ - struct btrfs_bio_stripe stripes[]; + struct btrfs_io_stripe stripes[]; }; #define map_lookup_size(n) (sizeof(struct map_lookup) + \ - (sizeof(struct btrfs_bio_stripe) * (n))) + (sizeof(struct btrfs_io_stripe) * (n))) struct btrfs_balance_args; struct btrfs_balance_progress; @@ -414,6 +451,22 @@ struct btrfs_balance_control { struct btrfs_balance_progress stat; }; +/* + * Search for a given device by the set parameters + */ +struct btrfs_dev_lookup_args { + u64 devid; + u8 *uuid; + u8 *fsid; + bool missing; +}; + +/* We have to initialize to -1 because BTRFS_DEV_REPLACE_DEVID is 0 */ +#define BTRFS_DEV_LOOKUP_ARGS_INIT { .devid = (u64)-1 } + +#define BTRFS_DEV_LOOKUP_ARGS(name) \ + struct btrfs_dev_lookup_args name = BTRFS_DEV_LOOKUP_ARGS_INIT + enum btrfs_map_op { BTRFS_MAP_READ, BTRFS_MAP_WRITE, @@ -437,20 +490,20 @@ static inline enum btrfs_map_op btrfs_op(struct bio *bio) } } -void btrfs_get_bbio(struct btrfs_bio *bbio); -void btrfs_put_bbio(struct btrfs_bio *bbio); +void btrfs_get_bioc(struct btrfs_io_context *bioc); +void btrfs_put_bioc(struct btrfs_io_context *bioc); int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, - struct btrfs_bio **bbio_ret, int mirror_num); + struct btrfs_io_context **bioc_ret, int mirror_num); int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, - struct btrfs_bio **bbio_ret); + struct btrfs_io_context **bioc_ret); int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *map, enum btrfs_map_op op, u64 logical, struct btrfs_io_geometry *io_geom); int btrfs_read_sys_array(struct btrfs_fs_info *fs_info); int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); -struct btrfs_block_group *btrfs_alloc_chunk(struct btrfs_trans_handle *trans, +struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, u64 type); void btrfs_mapping_tree_free(struct extent_map_tree *tree); blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, @@ -467,19 +520,23 @@ void btrfs_assign_next_active_device(struct btrfs_device *device, struct btrfs_device *btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid, const char *devpath); +int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, + struct btrfs_dev_lookup_args *args, + const char *path); struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, const u64 *devid, const u8 *uuid); +void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args); void btrfs_free_device(struct btrfs_device *device); int btrfs_rm_device(struct btrfs_fs_info *fs_info, - const char *device_path, u64 devid, + struct btrfs_dev_lookup_args *args, struct block_device **bdev, fmode_t *mode); void __exit btrfs_cleanup_fs_uuids(void); int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); int btrfs_grow_device(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 new_size); -struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices, - u64 devid, u8 *uuid, u8 *fsid); +struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices, + const struct btrfs_dev_lookup_args *args); int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *path); int btrfs_balance(struct btrfs_fs_info *fs_info, @@ -493,7 +550,7 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset); int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info); int btrfs_uuid_scan_kthread(void *data); -int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset); +bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset); int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes, u64 *start, u64 *max_avail); void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index); diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 8a4514283a4b..2837b4c8424d 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -138,7 +138,7 @@ int btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, * matches our target xattr, so lets check. */ ret = 0; - btrfs_assert_tree_locked(path->nodes[0]); + btrfs_assert_tree_write_locked(path->nodes[0]); di = btrfs_match_dir_item_name(fs_info, path, name, name_len); if (!di && !(flags & XATTR_REPLACE)) { ret = -ENOSPC; diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 8afa90074891..767a0c6c9694 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -126,7 +126,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -ENOMEM; goto out; } - cpage_out = page_address(out_page); + cpage_out = kmap(out_page); pages[0] = out_page; nr_pages = 1; @@ -148,22 +148,26 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, int i; for (i = 0; i < in_buf_pages; i++) { - if (in_page) + if (in_page) { + kunmap(in_page); put_page(in_page); + } in_page = find_get_page(mapping, start >> PAGE_SHIFT); - data_in = page_address(in_page); + data_in = kmap(in_page); memcpy(workspace->buf + i * PAGE_SIZE, data_in, PAGE_SIZE); start += PAGE_SIZE; } workspace->strm.next_in = workspace->buf; } else { - if (in_page) + if (in_page) { + kunmap(in_page); put_page(in_page); + } in_page = find_get_page(mapping, start >> PAGE_SHIFT); - data_in = page_address(in_page); + data_in = kmap(in_page); start += PAGE_SIZE; workspace->strm.next_in = data_in; } @@ -192,6 +196,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, * the stream end if required */ if (workspace->strm.avail_out == 0) { + kunmap(out_page); if (nr_pages == nr_dest_pages) { out_page = NULL; ret = -E2BIG; @@ -202,7 +207,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -ENOMEM; goto out; } - cpage_out = page_address(out_page); + cpage_out = kmap(out_page); pages[nr_pages] = out_page; nr_pages++; workspace->strm.avail_out = PAGE_SIZE; @@ -229,6 +234,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, goto out; } else if (workspace->strm.avail_out == 0) { /* get another page for the stream end */ + kunmap(out_page); if (nr_pages == nr_dest_pages) { out_page = NULL; ret = -E2BIG; @@ -239,7 +245,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -ENOMEM; goto out; } - cpage_out = page_address(out_page); + cpage_out = kmap(out_page); pages[nr_pages] = out_page; nr_pages++; workspace->strm.avail_out = PAGE_SIZE; @@ -258,8 +264,13 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, *total_in = workspace->strm.total_in; out: *out_pages = nr_pages; - if (in_page) + if (out_page) + kunmap(out_page); + + if (in_page) { + kunmap(in_page); put_page(in_page); + } return ret; } @@ -276,7 +287,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) unsigned long buf_start; struct page **pages_in = cb->compressed_pages; - data_in = page_address(pages_in[page_in_index]); + data_in = kmap(pages_in[page_in_index]); workspace->strm.next_in = data_in; workspace->strm.avail_in = min_t(size_t, srclen, PAGE_SIZE); workspace->strm.total_in = 0; @@ -298,6 +309,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) { pr_warn("BTRFS: inflateInit failed\n"); + kunmap(pages_in[page_in_index]); return -EIO; } while (workspace->strm.total_in < srclen) { @@ -324,13 +336,13 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) if (workspace->strm.avail_in == 0) { unsigned long tmp; - + kunmap(pages_in[page_in_index]); page_in_index++; if (page_in_index >= total_pages_in) { data_in = NULL; break; } - data_in = page_address(pages_in[page_in_index]); + data_in = kmap(pages_in[page_in_index]); workspace->strm.next_in = data_in; tmp = srclen - workspace->strm.total_in; workspace->strm.avail_in = min(tmp, PAGE_SIZE); @@ -342,6 +354,8 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) ret = 0; done: zlib_inflateEnd(&workspace->strm); + if (data_in) + kunmap(pages_in[page_in_index]); if (!ret) zero_fill_bio(cb->orig_bio); return ret; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 47af1ab3bf12..67d932d70798 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -4,6 +4,7 @@ #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/sched/mm.h> +#include <linux/atomic.h> #include "ctree.h" #include "volumes.h" #include "zoned.h" @@ -39,12 +40,30 @@ #define BTRFS_NR_SB_LOG_ZONES 2 /* + * Minimum of active zones we need: + * + * - BTRFS_SUPER_MIRROR_MAX zones for superblock mirrors + * - 3 zones to ensure at least one zone per SYSTEM, META and DATA block group + * - 1 zone for tree-log dedicated block group + * - 1 zone for relocation + */ +#define BTRFS_MIN_ACTIVE_ZONES (BTRFS_SUPER_MIRROR_MAX + 5) + +/* * Maximum supported zone size. Currently, SMR disks have a zone size of * 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. We do not * expect the zone size to become larger than 8GiB in the near future. */ #define BTRFS_MAX_ZONE_SIZE SZ_8G +#define SUPER_INFO_SECTORS ((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT) + +static inline bool sb_zone_is_full(const struct blk_zone *zone) +{ + return (zone->cond == BLK_ZONE_COND_FULL) || + (zone->wp + SUPER_INFO_SECTORS > zone->start + zone->capacity); +} + static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data) { struct blk_zone *zones = data; @@ -60,14 +79,13 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, bool empty[BTRFS_NR_SB_LOG_ZONES]; bool full[BTRFS_NR_SB_LOG_ZONES]; sector_t sector; + int i; - ASSERT(zones[0].type != BLK_ZONE_TYPE_CONVENTIONAL && - zones[1].type != BLK_ZONE_TYPE_CONVENTIONAL); - - empty[0] = (zones[0].cond == BLK_ZONE_COND_EMPTY); - empty[1] = (zones[1].cond == BLK_ZONE_COND_EMPTY); - full[0] = (zones[0].cond == BLK_ZONE_COND_FULL); - full[1] = (zones[1].cond == BLK_ZONE_COND_FULL); + for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) { + ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL); + empty[i] = (zones[i].cond == BLK_ZONE_COND_EMPTY); + full[i] = sb_zone_is_full(&zones[i]); + } /* * Possible states of log buffer zones @@ -296,6 +314,9 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_zoned_device_info *zone_info = NULL; struct block_device *bdev = device->bdev; + struct request_queue *queue = bdev_get_queue(bdev); + unsigned int max_active_zones; + unsigned int nactive; sector_t nr_sectors; sector_t sector = 0; struct blk_zone *zones = NULL; @@ -351,6 +372,17 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) if (!IS_ALIGNED(nr_sectors, zone_sectors)) zone_info->nr_zones++; + max_active_zones = queue_max_active_zones(queue); + if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) { + btrfs_err_in_rcu(fs_info, +"zoned: %s: max active zones %u is too small, need at least %u active zones", + rcu_str_deref(device->name), max_active_zones, + BTRFS_MIN_ACTIVE_ZONES); + ret = -EINVAL; + goto out; + } + zone_info->max_active_zones = max_active_zones; + zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); if (!zone_info->seq_zones) { ret = -ENOMEM; @@ -363,6 +395,12 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) goto out; } + zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); + if (!zone_info->active_zones) { + ret = -ENOMEM; + goto out; + } + zones = kcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL); if (!zones) { ret = -ENOMEM; @@ -370,6 +408,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) } /* Get zones type */ + nactive = 0; while (sector < nr_sectors) { nr_zones = BTRFS_REPORT_NR_ZONES; ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones, @@ -380,8 +419,17 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) for (i = 0; i < nr_zones; i++) { if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ) __set_bit(nreported, zone_info->seq_zones); - if (zones[i].cond == BLK_ZONE_COND_EMPTY) + switch (zones[i].cond) { + case BLK_ZONE_COND_EMPTY: __set_bit(nreported, zone_info->empty_zones); + break; + case BLK_ZONE_COND_IMP_OPEN: + case BLK_ZONE_COND_EXP_OPEN: + case BLK_ZONE_COND_CLOSED: + __set_bit(nreported, zone_info->active_zones); + nactive++; + break; + } nreported++; } sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len; @@ -396,6 +444,19 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) goto out; } + if (max_active_zones) { + if (nactive > max_active_zones) { + btrfs_err_in_rcu(device->fs_info, + "zoned: %u active zones on %s exceeds max_active_zones %u", + nactive, rcu_str_deref(device->name), + max_active_zones); + ret = -EIO; + goto out; + } + atomic_set(&zone_info->active_zones_left, + max_active_zones - nactive); + } + /* Validate superblock log */ nr_zones = BTRFS_NR_SB_LOG_ZONES; for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { @@ -478,6 +539,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) out: kfree(zones); out_free_zone_info: + bitmap_free(zone_info->active_zones); bitmap_free(zone_info->empty_zones); bitmap_free(zone_info->seq_zones); kfree(zone_info); @@ -493,6 +555,7 @@ void btrfs_destroy_dev_zone_info(struct btrfs_device *device) if (!zone_info) return; + bitmap_free(zone_info->active_zones); bitmap_free(zone_info->seq_zones); bitmap_free(zone_info->empty_zones); kfree(zone_info); @@ -585,7 +648,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) /* * stripe_size is always aligned to BTRFS_STRIPE_LEN in - * __btrfs_alloc_chunk(). Since we want stripe_len == zone_size, + * btrfs_create_chunk(). Since we want stripe_len == zone_size, * check the alignment here. */ if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) { @@ -664,7 +727,7 @@ static int sb_log_location(struct block_device *bdev, struct blk_zone *zones, reset = &zones[1]; if (reset && reset->cond != BLK_ZONE_COND_EMPTY) { - ASSERT(reset->cond == BLK_ZONE_COND_FULL); + ASSERT(sb_zone_is_full(reset)); ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, reset->start, reset->len, @@ -676,9 +739,20 @@ static int sb_log_location(struct block_device *bdev, struct blk_zone *zones, reset->wp = reset->start; } } else if (ret != -ENOENT) { - /* For READ, we want the precious one */ + /* + * For READ, we want the previous one. Move write pointer to + * the end of a zone, if it is at the head of a zone. + */ + u64 zone_end = 0; + if (wp == zones[0].start << SECTOR_SHIFT) - wp = (zones[1].start + zones[1].len) << SECTOR_SHIFT; + zone_end = zones[1].start + zones[1].capacity; + else if (wp == zones[1].start << SECTOR_SHIFT) + zone_end = zones[0].start + zones[0].capacity; + if (zone_end) + wp = ALIGN_DOWN(zone_end << SECTOR_SHIFT, + BTRFS_SUPER_INFO_SIZE); + wp -= BTRFS_SUPER_INFO_SIZE; } @@ -771,36 +845,56 @@ static inline bool is_sb_log_zone(struct btrfs_zoned_device_info *zinfo, return true; } -void btrfs_advance_sb_log(struct btrfs_device *device, int mirror) +int btrfs_advance_sb_log(struct btrfs_device *device, int mirror) { struct btrfs_zoned_device_info *zinfo = device->zone_info; struct blk_zone *zone; + int i; if (!is_sb_log_zone(zinfo, mirror)) - return; + return 0; zone = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror]; - if (zone->cond != BLK_ZONE_COND_FULL) { + for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) { + /* Advance the next zone */ + if (zone->cond == BLK_ZONE_COND_FULL) { + zone++; + continue; + } + if (zone->cond == BLK_ZONE_COND_EMPTY) zone->cond = BLK_ZONE_COND_IMP_OPEN; - zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT); + zone->wp += SUPER_INFO_SECTORS; + + if (sb_zone_is_full(zone)) { + /* + * No room left to write new superblock. Since + * superblock is written with REQ_SYNC, it is safe to + * finish the zone now. + * + * If the write pointer is exactly at the capacity, + * explicit ZONE_FINISH is not necessary. + */ + if (zone->wp != zone->start + zone->capacity) { + int ret; + + ret = blkdev_zone_mgmt(device->bdev, + REQ_OP_ZONE_FINISH, zone->start, + zone->len, GFP_NOFS); + if (ret) + return ret; + } - if (zone->wp == zone->start + zone->len) + zone->wp = zone->start + zone->len; zone->cond = BLK_ZONE_COND_FULL; - - return; + } + return 0; } - zone++; - ASSERT(zone->cond != BLK_ZONE_COND_FULL); - if (zone->cond == BLK_ZONE_COND_EMPTY) - zone->cond = BLK_ZONE_COND_IMP_OPEN; - - zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT); - - if (zone->wp == zone->start + zone->len) - zone->cond = BLK_ZONE_COND_FULL; + /* All the zones are FULL. Should not reach here. */ + ASSERT(0); + return -EIO; } int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror) @@ -895,6 +989,41 @@ u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start, return pos; } +static bool btrfs_dev_set_active_zone(struct btrfs_device *device, u64 pos) +{ + struct btrfs_zoned_device_info *zone_info = device->zone_info; + unsigned int zno = (pos >> zone_info->zone_size_shift); + + /* We can use any number of zones */ + if (zone_info->max_active_zones == 0) + return true; + + if (!test_bit(zno, zone_info->active_zones)) { + /* Active zone left? */ + if (atomic_dec_if_positive(&zone_info->active_zones_left) < 0) + return false; + if (test_and_set_bit(zno, zone_info->active_zones)) { + /* Someone already set the bit */ + atomic_inc(&zone_info->active_zones_left); + } + } + + return true; +} + +static void btrfs_dev_clear_active_zone(struct btrfs_device *device, u64 pos) +{ + struct btrfs_zoned_device_info *zone_info = device->zone_info; + unsigned int zno = (pos >> zone_info->zone_size_shift); + + /* We can use any number of zones */ + if (zone_info->max_active_zones == 0) + return; + + if (test_and_clear_bit(zno, zone_info->active_zones)) + atomic_inc(&zone_info->active_zones_left); +} + int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical, u64 length, u64 *bytes) { @@ -910,6 +1039,7 @@ int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical, *bytes = length; while (length) { btrfs_dev_set_zone_empty(device, physical); + btrfs_dev_clear_active_zone(device, physical); physical += device->zone_info->zone_size; length -= device->zone_info->zone_size; } @@ -1039,6 +1169,8 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) int i; unsigned int nofs_flag; u64 *alloc_offsets = NULL; + u64 *caps = NULL; + unsigned long *active = NULL; u64 last_alloc = 0; u32 num_sequential = 0, num_conventional = 0; @@ -1063,10 +1195,28 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) map = em->map_lookup; + cache->physical_map = kmemdup(map, map_lookup_size(map->num_stripes), GFP_NOFS); + if (!cache->physical_map) { + ret = -ENOMEM; + goto out; + } + alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets), GFP_NOFS); if (!alloc_offsets) { - free_extent_map(em); - return -ENOMEM; + ret = -ENOMEM; + goto out; + } + + caps = kcalloc(map->num_stripes, sizeof(*caps), GFP_NOFS); + if (!caps) { + ret = -ENOMEM; + goto out; + } + + active = bitmap_zalloc(map->num_stripes, GFP_NOFS); + if (!active) { + ret = -ENOMEM; + goto out; } for (i = 0; i < map->num_stripes; i++) { @@ -1131,6 +1281,8 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) goto out; } + caps[i] = (zone.capacity << SECTOR_SHIFT); + switch (zone.cond) { case BLK_ZONE_COND_OFFLINE: case BLK_ZONE_COND_READONLY: @@ -1144,14 +1296,22 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) alloc_offsets[i] = 0; break; case BLK_ZONE_COND_FULL: - alloc_offsets[i] = fs_info->zone_size; + alloc_offsets[i] = caps[i]; break; default: /* Partially used zone */ alloc_offsets[i] = ((zone.wp - zone.start) << SECTOR_SHIFT); + __set_bit(i, active); break; } + + /* + * Consider a zone as active if we can allow any number of + * active zones. + */ + if (!device->zone_info->max_active_zones) + __set_bit(i, active); } if (num_sequential > 0) @@ -1169,6 +1329,9 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) * calculate_alloc_pointer() which takes extent buffer * locks to avoid deadlock. */ + + /* Zone capacity is always zone size in emulation */ + cache->zone_capacity = cache->length; if (new) { cache->alloc_offset = 0; goto out; @@ -1195,6 +1358,8 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) goto out; } cache->alloc_offset = alloc_offsets[0]; + cache->zone_capacity = caps[0]; + cache->zone_is_active = test_bit(0, active); break; case BTRFS_BLOCK_GROUP_DUP: case BTRFS_BLOCK_GROUP_RAID1: @@ -1210,6 +1375,13 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) goto out; } + if (cache->zone_is_active) { + btrfs_get_block_group(cache); + spin_lock(&fs_info->zone_active_bgs_lock); + list_add_tail(&cache->active_bg_list, &fs_info->zone_active_bgs); + spin_unlock(&fs_info->zone_active_bgs_lock); + } + out: if (cache->alloc_offset > fs_info->zone_size) { btrfs_err(fs_info, @@ -1218,6 +1390,14 @@ out: ret = -EIO; } + if (cache->alloc_offset > cache->zone_capacity) { + btrfs_err(fs_info, +"zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu", + cache->alloc_offset, cache->zone_capacity, + cache->start); + ret = -EIO; + } + /* An extent is allocated after the write pointer */ if (!ret && num_conventional && last_alloc > cache->alloc_offset) { btrfs_err(fs_info, @@ -1229,6 +1409,12 @@ out: if (!ret) cache->meta_write_pointer = cache->alloc_offset + cache->start; + if (ret) { + kfree(cache->physical_map); + cache->physical_map = NULL; + } + bitmap_free(active); + kfree(caps); kfree(alloc_offsets); free_extent_map(em); @@ -1243,17 +1429,15 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) return; WARN_ON(cache->bytes_super != 0); - unusable = cache->alloc_offset - cache->used; - free = cache->length - cache->alloc_offset; + unusable = (cache->alloc_offset - cache->used) + + (cache->length - cache->zone_capacity); + free = cache->zone_capacity - cache->alloc_offset; /* We only need ->free_space in ALLOC_SEQ block groups */ cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; cache->free_space_ctl->free_space = free; cache->zone_unusable = unusable; - - /* Should not have any excluded extents. Just in case, though */ - btrfs_free_excluded_extents(cache); } void btrfs_redirty_list_add(struct btrfs_transaction *trans, @@ -1304,6 +1488,17 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) if (!is_data_inode(&inode->vfs_inode)) return false; + /* + * Using REQ_OP_ZONE_APPNED for relocation can break assumptions on the + * extent layout the relocation code has. + * Furthermore we have set aside own block-group from which only the + * relocation "process" can allocate and make sure only one process at a + * time can add pages to an extent that gets relocated, so it's safe to + * use regular REQ_OP_WRITE for this special case. + */ + if (btrfs_is_data_reloc_root(inode->root)) + return false; + cache = btrfs_lookup_block_group(fs_info, start); ASSERT(cache); if (!cache) @@ -1440,27 +1635,27 @@ int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 len static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical, struct blk_zone *zone) { - struct btrfs_bio *bbio = NULL; + struct btrfs_io_context *bioc = NULL; u64 mapped_length = PAGE_SIZE; unsigned int nofs_flag; int nmirrors; int i, ret; ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, - &mapped_length, &bbio); - if (ret || !bbio || mapped_length < PAGE_SIZE) { - btrfs_put_bbio(bbio); + &mapped_length, &bioc); + if (ret || !bioc || mapped_length < PAGE_SIZE) { + btrfs_put_bioc(bioc); return -EIO; } - if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) + if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) return -EINVAL; nofs_flag = memalloc_nofs_save(); - nmirrors = (int)bbio->num_stripes; + nmirrors = (int)bioc->num_stripes; for (i = 0; i < nmirrors; i++) { - u64 physical = bbio->stripes[i].physical; - struct btrfs_device *dev = bbio->stripes[i].dev; + u64 physical = bioc->stripes[i].physical; + struct btrfs_device *dev = bioc->stripes[i].dev; /* Missing device */ if (!dev->bdev) @@ -1530,3 +1725,251 @@ struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, return device; } + +/** + * Activate block group and underlying device zones + * + * @block_group: the block group to activate + * + * Return: true on success, false otherwise + */ +bool btrfs_zone_activate(struct btrfs_block_group *block_group) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct map_lookup *map; + struct btrfs_device *device; + u64 physical; + bool ret; + + if (!btrfs_is_zoned(block_group->fs_info)) + return true; + + map = block_group->physical_map; + /* Currently support SINGLE profile only */ + ASSERT(map->num_stripes == 1); + device = map->stripes[0].dev; + physical = map->stripes[0].physical; + + if (device->zone_info->max_active_zones == 0) + return true; + + spin_lock(&block_group->lock); + + if (block_group->zone_is_active) { + ret = true; + goto out_unlock; + } + + /* No space left */ + if (block_group->alloc_offset == block_group->zone_capacity) { + ret = false; + goto out_unlock; + } + + if (!btrfs_dev_set_active_zone(device, physical)) { + /* Cannot activate the zone */ + ret = false; + goto out_unlock; + } + + /* Successfully activated all the zones */ + block_group->zone_is_active = 1; + + spin_unlock(&block_group->lock); + + /* For the active block group list */ + btrfs_get_block_group(block_group); + + spin_lock(&fs_info->zone_active_bgs_lock); + ASSERT(list_empty(&block_group->active_bg_list)); + list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs); + spin_unlock(&fs_info->zone_active_bgs_lock); + + return true; + +out_unlock: + spin_unlock(&block_group->lock); + return ret; +} + +int btrfs_zone_finish(struct btrfs_block_group *block_group) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + struct map_lookup *map; + struct btrfs_device *device; + u64 physical; + int ret = 0; + + if (!btrfs_is_zoned(fs_info)) + return 0; + + map = block_group->physical_map; + /* Currently support SINGLE profile only */ + ASSERT(map->num_stripes == 1); + + device = map->stripes[0].dev; + physical = map->stripes[0].physical; + + if (device->zone_info->max_active_zones == 0) + return 0; + + spin_lock(&block_group->lock); + if (!block_group->zone_is_active) { + spin_unlock(&block_group->lock); + return 0; + } + + /* Check if we have unwritten allocated space */ + if ((block_group->flags & + (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)) && + block_group->alloc_offset > block_group->meta_write_pointer) { + spin_unlock(&block_group->lock); + return -EAGAIN; + } + spin_unlock(&block_group->lock); + + ret = btrfs_inc_block_group_ro(block_group, false); + if (ret) + return ret; + + /* Ensure all writes in this block group finish */ + btrfs_wait_block_group_reservations(block_group); + /* No need to wait for NOCOW writers. Zoned mode does not allow that. */ + btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start, + block_group->length); + + spin_lock(&block_group->lock); + + /* + * Bail out if someone already deactivated the block group, or + * allocated space is left in the block group. + */ + if (!block_group->zone_is_active) { + spin_unlock(&block_group->lock); + btrfs_dec_block_group_ro(block_group); + return 0; + } + + if (block_group->reserved) { + spin_unlock(&block_group->lock); + btrfs_dec_block_group_ro(block_group); + return -EAGAIN; + } + + block_group->zone_is_active = 0; + block_group->alloc_offset = block_group->zone_capacity; + block_group->free_space_ctl->free_space = 0; + btrfs_clear_treelog_bg(block_group); + spin_unlock(&block_group->lock); + + ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, + physical >> SECTOR_SHIFT, + device->zone_info->zone_size >> SECTOR_SHIFT, + GFP_NOFS); + btrfs_dec_block_group_ro(block_group); + + if (!ret) { + btrfs_dev_clear_active_zone(device, physical); + + spin_lock(&fs_info->zone_active_bgs_lock); + ASSERT(!list_empty(&block_group->active_bg_list)); + list_del_init(&block_group->active_bg_list); + spin_unlock(&fs_info->zone_active_bgs_lock); + + /* For active_bg_list */ + btrfs_put_block_group(block_group); + } + + return ret; +} + +bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, int raid_index) +{ + struct btrfs_device *device; + bool ret = false; + + if (!btrfs_is_zoned(fs_devices->fs_info)) + return true; + + /* Non-single profiles are not supported yet */ + if (raid_index != BTRFS_RAID_SINGLE) + return false; + + /* Check if there is a device with active zones left */ + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + struct btrfs_zoned_device_info *zinfo = device->zone_info; + + if (!device->bdev) + continue; + + if (!zinfo->max_active_zones || + atomic_read(&zinfo->active_zones_left)) { + ret = true; + break; + } + } + mutex_unlock(&fs_devices->device_list_mutex); + + return ret; +} + +void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length) +{ + struct btrfs_block_group *block_group; + struct map_lookup *map; + struct btrfs_device *device; + u64 physical; + + if (!btrfs_is_zoned(fs_info)) + return; + + block_group = btrfs_lookup_block_group(fs_info, logical); + ASSERT(block_group); + + if (logical + length < block_group->start + block_group->zone_capacity) + goto out; + + spin_lock(&block_group->lock); + + if (!block_group->zone_is_active) { + spin_unlock(&block_group->lock); + goto out; + } + + block_group->zone_is_active = 0; + /* We should have consumed all the free space */ + ASSERT(block_group->alloc_offset == block_group->zone_capacity); + ASSERT(block_group->free_space_ctl->free_space == 0); + btrfs_clear_treelog_bg(block_group); + spin_unlock(&block_group->lock); + + map = block_group->physical_map; + device = map->stripes[0].dev; + physical = map->stripes[0].physical; + + if (!device->zone_info->max_active_zones) + goto out; + + btrfs_dev_clear_active_zone(device, physical); + + spin_lock(&fs_info->zone_active_bgs_lock); + ASSERT(!list_empty(&block_group->active_bg_list)); + list_del_init(&block_group->active_bg_list); + spin_unlock(&fs_info->zone_active_bgs_lock); + + btrfs_put_block_group(block_group); + +out: + btrfs_put_block_group(block_group); +} + +void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + + spin_lock(&fs_info->relocation_bg_lock); + if (fs_info->data_reloc_bg == bg->start) + fs_info->data_reloc_bg = 0; + spin_unlock(&fs_info->relocation_bg_lock); +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 4b299705bb12..e53ab7b96437 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -23,8 +23,11 @@ struct btrfs_zoned_device_info { u64 zone_size; u8 zone_size_shift; u32 nr_zones; + unsigned int max_active_zones; + atomic_t active_zones_left; unsigned long *seq_zones; unsigned long *empty_zones; + unsigned long *active_zones; struct blk_zone sb_zones[2 * BTRFS_SUPER_MIRROR_MAX]; }; @@ -40,7 +43,7 @@ int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw, u64 *bytenr_ret); int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw, u64 *bytenr_ret); -void btrfs_advance_sb_log(struct btrfs_device *device, int mirror); +int btrfs_advance_sb_log(struct btrfs_device *device, int mirror); int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror); u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start, u64 hole_end, u64 num_bytes); @@ -66,6 +69,13 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, u64 physical_start, u64 physical_pos); struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, u64 logical, u64 length); +bool btrfs_zone_activate(struct btrfs_block_group *block_group); +int btrfs_zone_finish(struct btrfs_block_group *block_group); +bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, + int raid_index); +void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, + u64 length); +void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -113,8 +123,10 @@ static inline int btrfs_sb_log_location(struct btrfs_device *device, int mirror, return 0; } -static inline void btrfs_advance_sb_log(struct btrfs_device *device, int mirror) -{ } +static inline int btrfs_advance_sb_log(struct btrfs_device *device, int mirror) +{ + return 0; +} static inline int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror) { @@ -199,6 +211,27 @@ static inline struct btrfs_device *btrfs_zoned_get_device( return ERR_PTR(-EOPNOTSUPP); } +static inline bool btrfs_zone_activate(struct btrfs_block_group *block_group) +{ + return true; +} + +static inline int btrfs_zone_finish(struct btrfs_block_group *block_group) +{ + return 0; +} + +static inline bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, + int raid_index) +{ + return true; +} + +static inline void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, + u64 logical, u64 length) { } + +static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { } + #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 56dce9f00988..f06b68040352 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -399,7 +399,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, /* map in the first page of input data */ in_page = find_get_page(mapping, start >> PAGE_SHIFT); - workspace->in_buf.src = page_address(in_page); + workspace->in_buf.src = kmap(in_page); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); @@ -411,7 +411,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, goto out; } pages[nr_pages++] = out_page; - workspace->out_buf.dst = page_address(out_page); + workspace->out_buf.dst = kmap(out_page); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); @@ -446,6 +446,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, if (workspace->out_buf.pos == workspace->out_buf.size) { tot_out += PAGE_SIZE; max_out -= PAGE_SIZE; + kunmap(out_page); if (nr_pages == nr_dest_pages) { out_page = NULL; ret = -E2BIG; @@ -457,7 +458,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, goto out; } pages[nr_pages++] = out_page; - workspace->out_buf.dst = page_address(out_page); + workspace->out_buf.dst = kmap(out_page); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); @@ -472,12 +473,13 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, /* Check if we need more input */ if (workspace->in_buf.pos == workspace->in_buf.size) { tot_in += PAGE_SIZE; + kunmap(in_page); put_page(in_page); start += PAGE_SIZE; len -= PAGE_SIZE; in_page = find_get_page(mapping, start >> PAGE_SHIFT); - workspace->in_buf.src = page_address(in_page); + workspace->in_buf.src = kmap(in_page); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); } @@ -504,6 +506,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, tot_out += PAGE_SIZE; max_out -= PAGE_SIZE; + kunmap(out_page); if (nr_pages == nr_dest_pages) { out_page = NULL; ret = -E2BIG; @@ -515,7 +518,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, goto out; } pages[nr_pages++] = out_page; - workspace->out_buf.dst = page_address(out_page); + workspace->out_buf.dst = kmap(out_page); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); } @@ -531,8 +534,12 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, out: *out_pages = nr_pages; /* Cleanup */ - if (in_page) + if (in_page) { + kunmap(in_page); put_page(in_page); + } + if (out_page) + kunmap(out_page); return ret; } @@ -556,7 +563,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) goto done; } - workspace->in_buf.src = page_address(pages_in[page_in_index]); + workspace->in_buf.src = kmap(pages_in[page_in_index]); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE); @@ -592,14 +599,14 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) break; if (workspace->in_buf.pos == workspace->in_buf.size) { - page_in_index++; + kunmap(pages_in[page_in_index++]); if (page_in_index >= total_pages_in) { workspace->in_buf.src = NULL; ret = -EIO; goto done; } srclen -= PAGE_SIZE; - workspace->in_buf.src = page_address(pages_in[page_in_index]); + workspace->in_buf.src = kmap(pages_in[page_in_index]); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE); } @@ -607,6 +614,8 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) ret = 0; zero_fill_bio(cb->orig_bio); done: + if (workspace->in_buf.src) + kunmap(pages_in[page_in_index]); return ret; } diff --git a/fs/buffer.c b/fs/buffer.c index ab7573d72dd7..46bc589b7a03 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -878,7 +878,7 @@ link_dev_buffers(struct page *page, struct buffer_head *head) static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size) { sector_t retval = ~((sector_t)0); - loff_t sz = i_size_read(bdev->bd_inode); + loff_t sz = bdev_nr_bytes(bdev); if (sz) { unsigned int sizebits = blksize_bits(size); @@ -897,7 +897,7 @@ init_page_buffers(struct page *page, struct block_device *bdev, struct buffer_head *head = page_buffers(page); struct buffer_head *bh = head; int uptodate = PageUptodate(page); - sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode), size); + sector_t end_block = blkdev_max_block(bdev, size); do { if (!buffer_mapped(bh)) { @@ -1425,12 +1425,16 @@ void invalidate_bh_lrus(void) } EXPORT_SYMBOL_GPL(invalidate_bh_lrus); -void invalidate_bh_lrus_cpu(int cpu) +/* + * It's called from workqueue context so we need a bh_lru_lock to close + * the race with preemption/irq. + */ +void invalidate_bh_lrus_cpu(void) { struct bh_lru *b; bh_lru_lock(); - b = per_cpu_ptr(&bh_lrus, cpu); + b = this_cpu_ptr(&bh_lrus); __invalidate_bh_lrus(b); bh_lru_unlock(); } diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c index fac2e8e7b533..effe37ef8629 100644 --- a/fs/cachefiles/io.c +++ b/fs/cachefiles/io.c @@ -37,11 +37,11 @@ static inline void cachefiles_put_kiocb(struct cachefiles_kiocb *ki) /* * Handle completion of a read from the cache. */ -static void cachefiles_read_complete(struct kiocb *iocb, long ret, long ret2) +static void cachefiles_read_complete(struct kiocb *iocb, long ret) { struct cachefiles_kiocb *ki = container_of(iocb, struct cachefiles_kiocb, iocb); - _enter("%ld,%ld", ret, ret2); + _enter("%ld", ret); if (ki->term_func) { if (ret >= 0) @@ -139,7 +139,7 @@ static int cachefiles_read(struct netfs_cache_resources *cres, fallthrough; default: ki->was_async = false; - cachefiles_read_complete(&ki->iocb, ret, 0); + cachefiles_read_complete(&ki->iocb, ret); if (ret > 0) ret = 0; break; @@ -159,12 +159,12 @@ presubmission_error: /* * Handle completion of a write to the cache. */ -static void cachefiles_write_complete(struct kiocb *iocb, long ret, long ret2) +static void cachefiles_write_complete(struct kiocb *iocb, long ret) { struct cachefiles_kiocb *ki = container_of(iocb, struct cachefiles_kiocb, iocb); struct inode *inode = file_inode(ki->iocb.ki_filp); - _enter("%ld,%ld", ret, ret2); + _enter("%ld", ret); /* Tell lockdep we inherited freeze protection from submission thread */ __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); @@ -244,7 +244,7 @@ static int cachefiles_write(struct netfs_cache_resources *cres, fallthrough; default: ki->was_async = false; - cachefiles_write_complete(&ki->iocb, ret, 0); + cachefiles_write_complete(&ki->iocb, ret); if (ret > 0) ret = 0; break; diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 8ffc40e84a59..fcf4f3b72923 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -25,20 +25,20 @@ static int cachefiles_read_waiter(wait_queue_entry_t *wait, unsigned mode, struct cachefiles_object *object; struct fscache_retrieval *op = monitor->op; struct wait_page_key *key = _key; - struct page *page = wait->private; + struct folio *folio = wait->private; ASSERT(key); _enter("{%lu},%u,%d,{%p,%u}", monitor->netfs_page->index, mode, sync, - key->page, key->bit_nr); + key->folio, key->bit_nr); - if (key->page != page || key->bit_nr != PG_locked) + if (key->folio != folio || key->bit_nr != PG_locked) return 0; - _debug("--- monitor %p %lx ---", page, page->flags); + _debug("--- monitor %p %lx ---", folio, folio->flags); - if (!PageUptodate(page) && !PageError(page)) { + if (!folio_test_uptodate(folio) && !folio_test_error(folio)) { /* unlocked, not uptodate and not erronous? */ _debug("page probably truncated"); } @@ -107,7 +107,7 @@ static int cachefiles_read_reissue(struct cachefiles_object *object, put_page(backpage2); INIT_LIST_HEAD(&monitor->op_link); - add_page_wait_queue(backpage, &monitor->monitor); + folio_add_wait_queue(page_folio(backpage), &monitor->monitor); if (trylock_page(backpage)) { ret = -EIO; @@ -294,7 +294,7 @@ monitor_backing_page: get_page(backpage); monitor->back_page = backpage; monitor->monitor.private = backpage; - add_page_wait_queue(backpage, &monitor->monitor); + folio_add_wait_queue(page_folio(backpage), &monitor->monitor); monitor = NULL; /* but the page may have been read before the monitor was installed, so @@ -548,7 +548,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, get_page(backpage); monitor->back_page = backpage; monitor->monitor.private = backpage; - add_page_wait_queue(backpage, &monitor->monitor); + folio_add_wait_queue(page_folio(backpage), &monitor->monitor); monitor = NULL; /* but the page may have been read before the monitor was diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 6c0e52fd0743..8f537f1d9d1d 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2263,7 +2263,7 @@ retry: list_for_each_entry(req, &ci->i_unsafe_dirops, r_unsafe_dir_item) { s = req->r_session; - if (unlikely(s->s_mds > max)) { + if (unlikely(s->s_mds >= max)) { spin_unlock(&ci->i_unsafe_lock); goto retry; } @@ -2277,7 +2277,7 @@ retry: list_for_each_entry(req, &ci->i_unsafe_iops, r_unsafe_target_item) { s = req->r_session; - if (unlikely(s->s_mds > max)) { + if (unlikely(s->s_mds >= max)) { spin_unlock(&ci->i_unsafe_lock); goto retry; } @@ -2330,7 +2330,6 @@ retry: int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) { - struct ceph_file_info *fi = file->private_data; struct inode *inode = file->f_mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); u64 flush_tid; @@ -2365,14 +2364,9 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) if (err < 0) ret = err; - if (errseq_check(&ci->i_meta_err, READ_ONCE(fi->meta_err))) { - spin_lock(&file->f_lock); - err = errseq_check_and_advance(&ci->i_meta_err, - &fi->meta_err); - spin_unlock(&file->f_lock); - if (err < 0) - ret = err; - } + err = file_check_and_advance_wb_err(file); + if (err < 0) + ret = err; out: dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret); return ret; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index d16fd2d5fd42..b129ea551378 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -233,7 +233,6 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, spin_lock_init(&fi->rw_contexts_lock); INIT_LIST_HEAD(&fi->rw_contexts); - fi->meta_err = errseq_sample(&ci->i_meta_err); fi->filp_gen = READ_ONCE(ceph_inode_to_client(inode)->filp_gen); return 0; @@ -1023,7 +1022,7 @@ static void ceph_aio_complete(struct inode *inode, ceph_put_cap_refs(ci, (aio_req->write ? CEPH_CAP_FILE_WR : CEPH_CAP_FILE_RD)); - aio_req->iocb->ki_complete(aio_req->iocb, ret, 0); + aio_req->iocb->ki_complete(aio_req->iocb, ret); ceph_free_cap_flush(aio_req->prealloc_cf); kfree(aio_req); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 2df1e1284451..1c7574105478 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -541,8 +541,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ceph_fscache_inode_init(ci); - ci->i_meta_err = 0; - return &ci->vfs_inode; } diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index bdeb271f47d9..d8c31069fbf2 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -302,9 +302,6 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) if (!(fl->fl_flags & FL_FLOCK)) return -ENOLCK; - /* No mandatory locks */ - if (fl->fl_type & LOCK_MAND) - return -EOPNOTSUPP; dout("ceph_flock, fl_file: %p\n", fl->fl_file); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 7cad180d6deb..d64413adc0fd 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1493,7 +1493,6 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc, { struct ceph_mds_request *req; struct rb_node *p; - struct ceph_inode_info *ci; dout("cleanup_session_requests mds%d\n", session->s_mds); mutex_lock(&mdsc->mutex); @@ -1502,16 +1501,10 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc, struct ceph_mds_request, r_unsafe_item); pr_warn_ratelimited(" dropping unsafe request %llu\n", req->r_tid); - if (req->r_target_inode) { - /* dropping unsafe change of inode's attributes */ - ci = ceph_inode(req->r_target_inode); - errseq_set(&ci->i_meta_err, -EIO); - } - if (req->r_unsafe_dir) { - /* dropping unsafe directory operation */ - ci = ceph_inode(req->r_unsafe_dir); - errseq_set(&ci->i_meta_err, -EIO); - } + if (req->r_target_inode) + mapping_set_error(req->r_target_inode->i_mapping, -EIO); + if (req->r_unsafe_dir) + mapping_set_error(req->r_unsafe_dir->i_mapping, -EIO); __unregister_request(mdsc, req); } /* zero r_attempts, so kick_requests() will re-send requests */ @@ -1678,7 +1671,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, spin_unlock(&mdsc->cap_dirty_lock); if (dirty_dropped) { - errseq_set(&ci->i_meta_err, -EIO); + mapping_set_error(inode->i_mapping, -EIO); if (ci->i_wrbuffer_ref_head == 0 && ci->i_wr_ref == 0 && diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 9b1b7f4cfdd4..fd8742bae847 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -1002,16 +1002,16 @@ static int ceph_compare_super(struct super_block *sb, struct fs_context *fc) struct ceph_fs_client *new = fc->s_fs_info; struct ceph_mount_options *fsopt = new->mount_options; struct ceph_options *opt = new->client->options; - struct ceph_fs_client *other = ceph_sb_to_client(sb); + struct ceph_fs_client *fsc = ceph_sb_to_client(sb); dout("ceph_compare_super %p\n", sb); - if (compare_mount_options(fsopt, opt, other)) { + if (compare_mount_options(fsopt, opt, fsc)) { dout("monitor(s)/mount options don't match\n"); return 0; } if ((opt->flags & CEPH_OPT_FSID) && - ceph_fsid_compare(&opt->fsid, &other->client->fsid)) { + ceph_fsid_compare(&opt->fsid, &fsc->client->fsid)) { dout("fsid doesn't match\n"); return 0; } @@ -1019,6 +1019,17 @@ static int ceph_compare_super(struct super_block *sb, struct fs_context *fc) dout("flags differ\n"); return 0; } + + if (fsc->blocklisted && !ceph_test_mount_opt(fsc, CLEANRECOVER)) { + dout("client is blocklisted (and CLEANRECOVER is not set)\n"); + return 0; + } + + if (fsc->mount_state == CEPH_MOUNT_SHUTDOWN) { + dout("client has been forcibly unmounted\n"); + return 0; + } + return 1; } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index a40eb14c282a..14f951cd5b61 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -429,8 +429,6 @@ struct ceph_inode_info { #ifdef CONFIG_CEPH_FSCACHE struct fscache_cookie *fscache; #endif - errseq_t i_meta_err; - struct inode vfs_inode; /* at end */ }; @@ -774,7 +772,6 @@ struct ceph_file_info { spinlock_t rw_contexts_lock; struct list_head rw_contexts; - errseq_t meta_err; u32 filp_gen; atomic_t num_locks; }; diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 159a1ffa4f4b..fcf7dfdecf96 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -1311,7 +1311,7 @@ int ceph_security_init_secctx(struct dentry *dentry, umode_t mode, int err; err = security_dentry_init_security(dentry, mode, &dentry->d_name, - &as_ctx->sec_ctx, + &name, &as_ctx->sec_ctx, &as_ctx->sec_ctxlen); if (err < 0) { WARN_ON_ONCE(err != -EOPNOTSUPP); @@ -1335,7 +1335,6 @@ int ceph_security_init_secctx(struct dentry *dentry, umode_t mode, * It only supports single security module and only selinux has * dentry_init_security hook. */ - name = XATTR_NAME_SELINUX; name_len = strlen(name); err = ceph_pagelist_reserve(pagelist, 4 * 2 + name_len + as_ctx->sec_ctxlen); diff --git a/fs/cifs/cache.c b/fs/cifs/cache.c index 8a3b30ec860c..8be57aaedab6 100644 --- a/fs/cifs/cache.c +++ b/fs/cifs/cache.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: LGPL-2.1 /* - * fs/cifs/cache.c - CIFS filesystem cache index structure definitions + * CIFS filesystem cache index structure definitions * * Copyright (c) 2010 Novell, Inc. * Authors(s): Suresh Jayaraman (sjayaraman@suse.de> diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 51a824fc926a..de2c12bcfa4b 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * fs/cifs_debug.c * * Copyright (C) International Business Machines Corp., 2000,2005 * diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index 4fd788586399..f97407520ea1 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -1,6 +1,5 @@ /* SPDX-License-Identifier: LGPL-2.1 */ /* - * fs/cifs/cifs_fs_sb.h * * Copyright (c) International Business Machines Corp., 2002,2004 * Author(s): Steve French (sfrench@us.ibm.com) diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h index ef723be358af..b87cbbe6d2d4 100644 --- a/fs/cifs/cifs_ioctl.h +++ b/fs/cifs/cifs_ioctl.h @@ -1,6 +1,5 @@ /* SPDX-License-Identifier: LGPL-2.1 */ /* - * fs/cifs/cifs_ioctl.h * * Structure definitions for io control for cifs/smb3 * diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index 8fa26a8530f8..353bd0dd7026 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: LGPL-2.1 /* - * fs/cifs/cifs_spnego.c -- SPNEGO upcall management for CIFS + * SPNEGO upcall management for CIFS * * Copyright (c) 2007 Red Hat, Inc. * Author(s): Jeff Layton (jlayton@redhat.com) diff --git a/fs/cifs/cifs_spnego.h b/fs/cifs/cifs_spnego.h index 31387d0ea32e..e6a0451877d4 100644 --- a/fs/cifs/cifs_spnego.h +++ b/fs/cifs/cifs_spnego.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: LGPL-2.1 */ /* - * fs/cifs/cifs_spnego.h -- SPNEGO upcall management for CIFS + * SPNEGO upcall management for CIFS * * Copyright (c) 2007 Red Hat, Inc. * Author(s): Jeff Layton (jlayton@redhat.com) diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c index 171ad8b42107..e7582dd79179 100644 --- a/fs/cifs/cifs_unicode.c +++ b/fs/cifs/cifs_unicode.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * fs/cifs/cifs_unicode.c * * Copyright (c) International Business Machines Corp., 2000,2009 * Modified by Steve French (sfrench@us.ibm.com) diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 388eb536cff1..ee3aab3dd4ac 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: LGPL-2.1 /* - * fs/cifs/cifsacl.c * * Copyright (C) International Business Machines Corp., 2007,2008 * Author(s): Steve French (sfrench@us.ibm.com) diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h index f8292bcf8594..ccbfc754bd3c 100644 --- a/fs/cifs/cifsacl.h +++ b/fs/cifs/cifsacl.h @@ -1,6 +1,5 @@ /* SPDX-License-Identifier: LGPL-2.1 */ /* - * fs/cifs/cifsacl.h * * Copyright (c) International Business Machines Corp., 2007 * Author(s): Steve French (sfrench@us.ibm.com) diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 2e6f40344037..d118282071b3 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: LGPL-2.1 /* - * fs/cifs/cifsencrypt.c * * Encryption and hashing operations relating to NTLM, NTLMv2. See MS-NLMP * for more detailed information diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 8c20bfa187ac..dca42aa87d30 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: LGPL-2.1 /* - * fs/cifs/cifsfs.c * * Copyright (C) International Business Machines Corp., 2002,2008 * Author(s): Steve French (sfrench@us.ibm.com) @@ -39,7 +38,6 @@ #include <linux/key-type.h> #include "cifs_spnego.h" #include "fscache.h" -#include "smb2pdu.h" #ifdef CONFIG_CIFS_DFS_UPCALL #include "dfs_cache.h" #endif diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index d25a4099b32e..b50da1901ebd 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -1,6 +1,5 @@ /* SPDX-License-Identifier: LGPL-2.1 */ /* - * fs/cifs/cifsfs.h * * Copyright (c) International Business Machines Corp., 2002, 2007 * Author(s): Steve French (sfrench@us.ibm.com) diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index c068f7d8d879..abff31dcd005 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1,6 +1,5 @@ /* SPDX-License-Identifier: LGPL-2.1 */ /* - * fs/cifs/cifsglob.h * * Copyright (C) International Business Machines Corp., 2002,2008 * Author(s): Steve French (sfrench@us.ibm.com) @@ -21,6 +20,7 @@ #include <crypto/internal/hash.h> #include <linux/scatterlist.h> #include <uapi/linux/cifs/cifs_mount.h> +#include "../smbfs_common/smb2pdu.h" #include "smb2pdu.h" #define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */ @@ -777,7 +777,7 @@ revert_current_mid(struct TCP_Server_Info *server, const unsigned int val) static inline void revert_current_mid_from_hdr(struct TCP_Server_Info *server, - const struct smb2_sync_hdr *shdr) + const struct smb2_hdr *shdr) { unsigned int num = le16_to_cpu(shdr->CreditCharge); @@ -1400,6 +1400,7 @@ struct cifsInodeInfo { #define CIFS_INO_INVALID_MAPPING (4) /* pagecache is invalid */ #define CIFS_INO_LOCK (5) /* lock bit for synchronization */ #define CIFS_INO_MODIFIED_ATTR (6) /* Indicate change in mtime/ctime */ +#define CIFS_INO_CLOSE_ON_LOCK (7) /* Not to defer the close when lock is set */ unsigned long flags; spinlock_t writers_lock; unsigned int writers; /* Number of writers on this inode */ diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index 98e8e5aa0613..d2ff438fd31f 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -1,6 +1,5 @@ /* SPDX-License-Identifier: LGPL-2.1 */ /* - * fs/cifs/cifspdu.h * * Copyright (c) International Business Machines Corp., 2002,2009 * Author(s): Steve French (sfrench@us.ibm.com) diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index f9740c21ca3d..d0f85b666662 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -1,6 +1,5 @@ /* SPDX-License-Identifier: LGPL-2.1 */ /* - * fs/cifs/cifsproto.h * * Copyright (c) International Business Machines Corp., 2002,2008 * Author(s): Steve French (sfrench@us.ibm.com) @@ -268,6 +267,9 @@ extern void cifs_close_deferred_file(struct cifsInodeInfo *cifs_inode); extern void cifs_close_all_deferred_files(struct cifs_tcon *cifs_tcon); +extern void cifs_close_deferred_file_under_dentry(struct cifs_tcon *cifs_tcon, + const char *path); + extern struct TCP_Server_Info *cifs_get_tcp_session(struct smb3_fs_context *ctx); extern void cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index a8e41c1e80ca..243d17696f06 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: LGPL-2.1 /* - * fs/cifs/cifssmb.c * * Copyright (C) International Business Machines Corp., 2002,2010 * Author(s): Steve French (sfrench@us.ibm.com) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 0db344807ef1..0abbff4e4135 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: LGPL-2.1 /* - * fs/cifs/connect.c * * Copyright (C) International Business Machines Corp., 2002,2011 * Author(s): Steve French (sfrench@us.ibm.com) @@ -678,7 +677,7 @@ dequeue_mid(struct mid_q_entry *mid, bool malformed) static unsigned int smb2_get_credits_from_hdr(char *buffer, struct TCP_Server_Info *server) { - struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buffer; + struct smb2_hdr *shdr = (struct smb2_hdr *)buffer; /* * SMB1 does not use credits. @@ -795,7 +794,6 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server) */ } - kfree(server->hostname); kfree(server); length = atomic_dec_return(&tcpSesAllocCount); @@ -879,7 +877,7 @@ cifs_handle_standard(struct TCP_Server_Info *server, struct mid_q_entry *mid) static void smb2_add_credits_from_hdr(char *buffer, struct TCP_Server_Info *server) { - struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buffer; + struct smb2_hdr *shdr = (struct smb2_hdr *)buffer; int scredits, in_flight; /* @@ -1090,7 +1088,7 @@ next_pdu: module_put_and_exit(0); } -/** +/* * Returns true if srcaddr isn't specified and rhs isn't specified, or * if srcaddr is specified and matches the IP address of the rhs argument */ @@ -1236,6 +1234,9 @@ static int match_server(struct TCP_Server_Info *server, struct smb3_fs_context * if (!net_eq(cifs_net_ns(server), current->nsproxy->net_ns)) return 0; + if (strcasecmp(server->hostname, ctx->server_hostname)) + return 0; + if (!match_address(server, addr, (struct sockaddr *)&ctx->srcaddr)) return 0; @@ -1337,6 +1338,7 @@ cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect) kfree(server->session_key.response); server->session_key.response = NULL; server->session_key.len = 0; + kfree(server->hostname); task = xchg(&server->tsk, NULL); if (task) @@ -1362,14 +1364,15 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx) goto out_err; } + tcp_ses->hostname = kstrdup(ctx->server_hostname, GFP_KERNEL); + if (!tcp_ses->hostname) { + rc = -ENOMEM; + goto out_err; + } + tcp_ses->ops = ctx->ops; tcp_ses->vals = ctx->vals; cifs_set_net_ns(tcp_ses, get_net(current->nsproxy->net_ns)); - tcp_ses->hostname = extract_hostname(ctx->UNC); - if (IS_ERR(tcp_ses->hostname)) { - rc = PTR_ERR(tcp_ses->hostname); - goto out_err_crypto_release; - } tcp_ses->conn_id = atomic_inc_return(&tcpSesNextId); tcp_ses->noblockcnt = ctx->rootfs; @@ -1498,8 +1501,7 @@ out_err_crypto_release: out_err: if (tcp_ses) { - if (!IS_ERR(tcp_ses->hostname)) - kfree(tcp_ses->hostname); + kfree(tcp_ses->hostname); if (tcp_ses->ssocket) sock_release(tcp_ses->ssocket); kfree(tcp_ses); @@ -1550,6 +1552,9 @@ static int match_session(struct cifs_ses *ses, struct smb3_fs_context *ctx) /** * cifs_setup_ipc - helper to setup the IPC tcon for the session + * @ses: smb session to issue the request on + * @ctx: the superblock configuration context to use for building the + * new tree connection for the IPC (interprocess communication RPC) * * A new IPC connection is made and stored in the session * tcon_ipc. The IPC tcon has the same lifetime as the session. @@ -1605,6 +1610,7 @@ out: /** * cifs_free_ipc - helper to release the session IPC tcon + * @ses: smb session to unmount the IPC from * * Needs to be called everytime a session is destroyed. * @@ -1855,6 +1861,8 @@ cifs_set_cifscreds(struct smb3_fs_context *ctx __attribute__((unused)), /** * cifs_get_smb_ses - get a session matching @ctx data from @server + * @server: server to setup the session to + * @ctx: superblock configuration context to use to setup the session * * This function assumes it is being called from cifs_mount() where we * already got a server reference (server refcount +1). See @@ -2065,6 +2073,8 @@ cifs_put_tcon(struct cifs_tcon *tcon) /** * cifs_get_tcon - get a tcon matching @ctx data from @ses + * @ses: smb session to issue the request on + * @ctx: the superblock configuration context to use for building the * * - tcon refcount is the number of mount points using the tcon. * - ses refcount is the number of tcon using the session. @@ -2382,9 +2392,10 @@ cifs_match_super(struct super_block *sb, void *data) spin_lock(&cifs_tcp_ses_lock); cifs_sb = CIFS_SB(sb); tlink = cifs_get_tlink(cifs_sb_master_tlink(cifs_sb)); - if (IS_ERR(tlink)) { + if (tlink == NULL) { + /* can not match superblock if tlink were ever null */ spin_unlock(&cifs_tcp_ses_lock); - return rc; + return 0; } tcon = tlink_tcon(tlink); ses = tcon->ses; @@ -2638,11 +2649,12 @@ generic_ip_connect(struct TCP_Server_Info *server) rc = 0; if (rc < 0) { cifs_dbg(FYI, "Error %d connecting to server\n", rc); + trace_smb3_connect_err(server->hostname, server->conn_id, &server->dstaddr, rc); sock_release(socket); server->ssocket = NULL; return rc; } - + trace_smb3_connect_done(server->hostname, server->conn_id, &server->dstaddr); if (sport == htons(RFC1001_PORT)) rc = ip_rfc1001_connect(server); @@ -3030,7 +3042,7 @@ build_unc_path_to_root(const struct smb3_fs_context *ctx, return full_path; } -/** +/* * expand_dfs_referral - Perform a dfs referral query and update the cifs_sb * * If a referral is found, cifs_sb->ctx->mount_options will be (re-)allocated diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 5f8a302ffcb2..6e8e7cc26ae2 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: LGPL-2.1 /* - * fs/cifs/dir.c * * vfs operations that deal with dentries * diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c index 8c616aaeb7c4..0458d28d71aa 100644 --- a/fs/cifs/dns_resolve.c +++ b/fs/cifs/dns_resolve.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: LGPL-2.1 /* - * fs/cifs/dns_resolve.c * * Copyright (c) 2007 Igor Mammedov * Author(s): Igor Mammedov (niallain@gmail.com) diff --git a/fs/cifs/dns_resolve.h b/fs/cifs/dns_resolve.h index 9fa2807ef79e..afc0df381246 100644 --- a/fs/cifs/dns_resolve.h +++ b/fs/cifs/dns_resolve.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: LGPL-2.1 */ /* - * fs/cifs/dns_resolve.h -- DNS Resolver upcall management for CIFS DFS - * Handles host name to IP address resolution + * DNS Resolver upcall management for CIFS DFS + * Handles host name to IP address resolution * * Copyright (c) International Business Machines Corp., 2008 * Author(s): Steve French (sfrench@us.ibm.com) diff --git a/fs/cifs/export.c b/fs/cifs/export.c index 747a540db954..37c28415df1e 100644 --- a/fs/cifs/export.c +++ b/fs/cifs/export.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: LGPL-2.1 /* - * fs/cifs/export.c * * Copyright (C) International Business Machines Corp., 2007 * Author(s): Steve French (sfrench@us.ibm.com) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index d0216472f1c6..1b855fcb179e 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: LGPL-2.1 /* - * fs/cifs/file.c * * vfs operations that deal with files * @@ -883,8 +882,9 @@ int cifs_close(struct inode *inode, struct file *file) dclose = kmalloc(sizeof(struct cifs_deferred_close), GFP_KERNEL); if ((cinode->oplock == CIFS_CACHE_RHW_FLG) && cinode->lease_granted && + !test_bit(CIFS_INO_CLOSE_ON_LOCK, &cinode->flags) && dclose) { - if (test_bit(CIFS_INO_MODIFIED_ATTR, &cinode->flags)) { + if (test_and_clear_bit(CIFS_INO_MODIFIED_ATTR, &cinode->flags)) { inode->i_ctime = inode->i_mtime = current_time(inode); cifs_fscache_update_inode_cookie(inode); } @@ -1865,6 +1865,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *flock) cifs_read_flock(flock, &type, &lock, &unlock, &wait_flag, tcon->ses->server); cifs_sb = CIFS_FILE_SB(file); + set_bit(CIFS_INO_CLOSE_ON_LOCK, &CIFS_I(d_inode(cfile->dentry))->flags); if (cap_unix(tcon->ses) && (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && @@ -3112,7 +3113,7 @@ static void collect_uncached_write_data(struct cifs_aio_ctx *ctx) struct cifs_tcon *tcon; struct cifs_sb_info *cifs_sb; struct dentry *dentry = ctx->cfile->dentry; - int rc; + ssize_t rc; tcon = tlink_tcon(ctx->cfile->tlink); cifs_sb = CIFS_SB(dentry->d_sb); @@ -3183,7 +3184,7 @@ restart_loop: mutex_unlock(&ctx->aio_mutex); if (ctx->iocb && ctx->iocb->ki_complete) - ctx->iocb->ki_complete(ctx->iocb, ctx->rc, 0); + ctx->iocb->ki_complete(ctx->iocb, ctx->rc); else complete(&ctx->done); } @@ -3916,7 +3917,7 @@ again: mutex_unlock(&ctx->aio_mutex); if (ctx->iocb && ctx->iocb->ki_complete) - ctx->iocb->ki_complete(ctx->iocb, ctx->rc, 0); + ctx->iocb->ki_complete(ctx->iocb, ctx->rc); else complete(&ctx->done); } diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c index 3109def8e199..38d96a480745 100644 --- a/fs/cifs/fs_context.c +++ b/fs/cifs/fs_context.c @@ -116,6 +116,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = { fsparam_flag("nosharesock", Opt_nosharesock), fsparam_flag_no("persistenthandles", Opt_persistent), fsparam_flag_no("resilienthandles", Opt_resilient), + fsparam_flag_no("tcpnodelay", Opt_tcp_nodelay), fsparam_flag("domainauto", Opt_domainauto), fsparam_flag("rdma", Opt_rdma), fsparam_flag("modesid", Opt_modesid), @@ -318,6 +319,7 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx DUP_CTX_STR(mount_options); DUP_CTX_STR(username); DUP_CTX_STR(password); + DUP_CTX_STR(server_hostname); DUP_CTX_STR(UNC); DUP_CTX_STR(source); DUP_CTX_STR(domainname); @@ -456,6 +458,11 @@ smb3_parse_devname(const char *devname, struct smb3_fs_context *ctx) if (!pos) return -EINVAL; + /* record the server hostname */ + ctx->server_hostname = kstrndup(devname + 2, pos - devname - 2, GFP_KERNEL); + if (!ctx->server_hostname) + return -ENOMEM; + /* skip past delimiter */ ++pos; @@ -1383,6 +1390,13 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, } } break; + case Opt_tcp_nodelay: + /* tcp nodelay should not usually be needed since we CORK/UNCORK the socket */ + if (result.negated) + ctx->sockopt_tcp_nodelay = false; + else + ctx->sockopt_tcp_nodelay = true; + break; case Opt_domainauto: ctx->domainauto = true; break; @@ -1496,6 +1510,8 @@ smb3_cleanup_fs_context_contents(struct smb3_fs_context *ctx) ctx->username = NULL; kfree_sensitive(ctx->password); ctx->password = NULL; + kfree(ctx->server_hostname); + ctx->server_hostname = NULL; kfree(ctx->UNC); ctx->UNC = NULL; kfree(ctx->source); diff --git a/fs/cifs/fs_context.h b/fs/cifs/fs_context.h index a42ba71d7a81..b2d22cf9cb18 100644 --- a/fs/cifs/fs_context.h +++ b/fs/cifs/fs_context.h @@ -98,6 +98,7 @@ enum cifs_param { Opt_nosharesock, Opt_persistent, Opt_resilient, + Opt_tcp_nodelay, Opt_domainauto, Opt_rdma, Opt_modesid, @@ -166,6 +167,7 @@ struct smb3_fs_context { char *password; char *domainname; char *source; + char *server_hostname; char *UNC; char *nodename; char *iocharset; /* local code page for mapping to and from Unicode */ diff --git a/fs/cifs/fscache.c b/fs/cifs/fscache.c index fab47fa7df74..8eedd20c44ab 100644 --- a/fs/cifs/fscache.c +++ b/fs/cifs/fscache.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: LGPL-2.1 /* - * fs/cifs/fscache.c - CIFS filesystem cache interface + * CIFS filesystem cache interface * * Copyright (c) 2010 Novell, Inc. * Author(s): Suresh Jayaraman <sjayaraman@suse.de> diff --git a/fs/cifs/fscache.h b/fs/cifs/fscache.h index 82e856b9cf89..9baa1d0f22bd 100644 --- a/fs/cifs/fscache.h +++ b/fs/cifs/fscache.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: LGPL-2.1 */ /* - * fs/cifs/fscache.h - CIFS filesystem cache interface definitions + * CIFS filesystem cache interface definitions * * Copyright (c) 2010 Novell, Inc. * Authors(s): Suresh Jayaraman (sjayaraman@suse.de> diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 50c01cff4c84..82848412ad85 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: LGPL-2.1 /* - * fs/cifs/inode.c * * Copyright (C) International Business Machines Corp., 2002,2010 * Author(s): Steve French (sfrench@us.ibm.com) @@ -1625,7 +1624,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry) goto unlink_out; } - cifs_close_deferred_file(CIFS_I(inode)); + cifs_close_deferred_file_under_dentry(tcon, full_path); if (cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability))) { rc = CIFSPOSIXDelFile(xid, tcon, full_path, @@ -2114,9 +2113,9 @@ cifs_rename2(struct user_namespace *mnt_userns, struct inode *source_dir, goto cifs_rename_exit; } - cifs_close_deferred_file(CIFS_I(d_inode(source_dentry))); + cifs_close_deferred_file_under_dentry(tcon, from_name); if (d_inode(target_dentry) != NULL) - cifs_close_deferred_file(CIFS_I(d_inode(target_dentry))); + cifs_close_deferred_file_under_dentry(tcon, to_name); rc = cifs_do_rename(xid, source_dentry, from_name, target_dentry, to_name); diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 42c6a0bac6c8..0359b604bdbc 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: LGPL-2.1 /* - * fs/cifs/ioctl.c * * vfs operations that deal with io control * @@ -359,7 +358,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) if (pSMBFile == NULL) break; tcon = tlink_tcon(pSMBFile->tlink); - caps = le64_to_cpu(tcon->fsUnixInfo.Capability); + /* caps = le64_to_cpu(tcon->fsUnixInfo.Capability); */ if (get_user(ExtAttrBits, (int __user *)arg)) { rc = -EFAULT; diff --git a/fs/cifs/link.c b/fs/cifs/link.c index f0a6d63bc08c..852e54ee82c2 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: LGPL-2.1 /* - * fs/cifs/link.c * * Copyright (C) International Business Machines Corp., 2002,2008 * Author(s): Steve French (sfrench@us.ibm.com) diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 9469f1cf0b46..ba2c3e897b29 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: LGPL-2.1 /* - * fs/cifs/misc.c * * Copyright (C) International Business Machines Corp., 2002,2008 * Author(s): Steve French (sfrench@us.ibm.com) @@ -153,7 +152,7 @@ cifs_buf_get(void) * SMB2 header is bigger than CIFS one - no problems to clean some * more bytes for CIFS. */ - size_t buf_size = sizeof(struct smb2_sync_hdr); + size_t buf_size = sizeof(struct smb2_hdr); /* * We could use negotiated size instead of max_msgsize - @@ -265,7 +264,8 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ , /* Uid is not converted */ buffer->Uid = treeCon->ses->Suid; - buffer->Mid = get_next_mid(treeCon->ses->server); + if (treeCon->ses->server) + buffer->Mid = get_next_mid(treeCon->ses->server); } if (treeCon->Flags & SMB_SHARE_IS_IN_DFS) buffer->Flags2 |= SMBFLG2_DFS; @@ -591,6 +591,7 @@ void cifs_put_writer(struct cifsInodeInfo *cinode) /** * cifs_queue_oplock_break - queue the oplock break handler for cfile + * @cfile: The file to break the oplock on * * This function is called from the demultiplex thread when it * receives an oplock break for @cfile. @@ -736,7 +737,7 @@ cifs_close_deferred_file(struct cifsInodeInfo *cifs_inode) if (cancel_delayed_work(&cfile->deferred)) { tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC); if (tmp_list == NULL) - continue; + break; tmp_list->cfile = cfile; list_add_tail(&tmp_list->list, &file_head); } @@ -767,7 +768,7 @@ cifs_close_all_deferred_files(struct cifs_tcon *tcon) if (cancel_delayed_work(&cfile->deferred)) { tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC); if (tmp_list == NULL) - continue; + break; tmp_list->cfile = cfile; list_add_tail(&tmp_list->list, &file_head); } @@ -781,6 +782,43 @@ cifs_close_all_deferred_files(struct cifs_tcon *tcon) kfree(tmp_list); } } +void +cifs_close_deferred_file_under_dentry(struct cifs_tcon *tcon, const char *path) +{ + struct cifsFileInfo *cfile; + struct list_head *tmp; + struct file_list *tmp_list, *tmp_next_list; + struct list_head file_head; + void *page; + const char *full_path; + + INIT_LIST_HEAD(&file_head); + page = alloc_dentry_path(); + spin_lock(&tcon->open_file_lock); + list_for_each(tmp, &tcon->openFileList) { + cfile = list_entry(tmp, struct cifsFileInfo, tlist); + full_path = build_path_from_dentry(cfile->dentry, page); + if (strstr(full_path, path)) { + if (delayed_work_pending(&cfile->deferred)) { + if (cancel_delayed_work(&cfile->deferred)) { + tmp_list = kmalloc(sizeof(struct file_list), GFP_ATOMIC); + if (tmp_list == NULL) + break; + tmp_list->cfile = cfile; + list_add_tail(&tmp_list->list, &file_head); + } + } + } + } + spin_unlock(&tcon->open_file_lock); + + list_for_each_entry_safe(tmp_list, tmp_next_list, &file_head, list) { + _cifsFileInfo_put(tmp_list->cfile, true, false); + list_del(&tmp_list->list); + kfree(tmp_list); + } + free_dentry_path(page); +} /* parses DFS refferal V3 structure * caller is responsible for freeing target_nodes @@ -1029,6 +1067,9 @@ setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw) /** * cifs_alloc_hash - allocate hash and hash context together + * @name: The name of the crypto hash algo + * @shash: Where to put the pointer to the hash algo + * @sdesc: Where to put the pointer to the hash descriptor * * The caller has to make sure @sdesc is initialized to either NULL or * a valid context. Both can be freed via cifs_free_hash(). @@ -1067,6 +1108,8 @@ cifs_alloc_hash(const char *name, /** * cifs_free_hash - free hash and hash context together + * @shash: Where to find the pointer to the hash algo + * @sdesc: Where to find the pointer to the hash descriptor * * Freeing a NULL hash or context is safe. */ @@ -1082,8 +1125,10 @@ cifs_free_hash(struct crypto_shash **shash, struct sdesc **sdesc) /** * rqst_page_get_length - obtain the length and offset for a page in smb_rqst - * Input: rqst - a smb_rqst, page - a page index for rqst - * Output: *len - the length for this page, *offset - the offset for this page + * @rqst: The request descriptor + * @page: The index of the page to query + * @len: Where to store the length for this page: + * @offset: Where to store the offset for this page */ void rqst_page_get_length(struct smb_rqst *rqst, unsigned int page, unsigned int *len, unsigned int *offset) @@ -1116,6 +1161,8 @@ void extract_unc_hostname(const char *unc, const char **h, size_t *len) /** * copy_path_name - copy src path to dst, possibly truncating + * @dst: The destination buffer + * @src: The source name * * returns number of bytes written (including trailing nul) */ diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index 0e728aac67e9..fa9fbd6a819c 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * fs/cifs/netmisc.c * * Copyright (c) International Business Machines Corp., 2002,2008 * Author(s): Steve French (sfrench@us.ibm.com) diff --git a/fs/cifs/ntlmssp.h b/fs/cifs/ntlmssp.h index 378133ce8869..25a2b8ef88b9 100644 --- a/fs/cifs/ntlmssp.h +++ b/fs/cifs/ntlmssp.h @@ -1,6 +1,5 @@ /* SPDX-License-Identifier: LGPL-2.1 */ /* - * fs/cifs/ntlmssp.h * * Copyright (c) International Business Machines Corp., 2002,2007 * Author(s): Steve French (sfrench@us.ibm.com) diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 54d77c99e21c..1929e80c09ee 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: LGPL-2.1 /* - * fs/cifs/readdir.c * * Directory search handling * diff --git a/fs/cifs/rfc1002pdu.h b/fs/cifs/rfc1002pdu.h index 137f7c95afd6..ae1d025da294 100644 --- a/fs/cifs/rfc1002pdu.h +++ b/fs/cifs/rfc1002pdu.h @@ -1,6 +1,5 @@ /* SPDX-License-Identifier: LGPL-2.1 */ /* - * fs/cifs/rfc1002pdu.h * * Protocol Data Unit definitions for RFC 1001/1002 support * diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 118403fbeda2..23e02db7923f 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: LGPL-2.1 /* - * fs/cifs/sess.c * * SMB/CIFS session setup handling routines * diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index c9d8a50062b8..f5dcc4940b6d 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: LGPL-2.1 /* - * fs/cifs/smb2file.c * * Copyright (C) International Business Machines Corp., 2002, 2011 * Author(s): Steve French (sfrench@us.ibm.com), diff --git a/fs/cifs/smb2glob.h b/fs/cifs/smb2glob.h index d0e9f3782bd9..ca692b2283cd 100644 --- a/fs/cifs/smb2glob.h +++ b/fs/cifs/smb2glob.h @@ -1,6 +1,5 @@ /* SPDX-License-Identifier: LGPL-2.1 */ /* - * fs/cifs/smb2glob.h * * Definitions for various global variables and structures * diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 957b2594f02e..8297703492ee 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: LGPL-2.1 /* - * fs/cifs/smb2inode.c * * Copyright (C) International Business Machines Corp., 2002, 2011 * Etersoft, 2012 diff --git a/fs/cifs/smb2maperror.c b/fs/cifs/smb2maperror.c index 181514b8770d..194799ddd382 100644 --- a/fs/cifs/smb2maperror.c +++ b/fs/cifs/smb2maperror.c @@ -2439,14 +2439,16 @@ smb2_print_status(__le32 status) int map_smb2_to_linux_error(char *buf, bool log_err) { - struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; + struct smb2_hdr *shdr = (struct smb2_hdr *)buf; unsigned int i; int rc = -EIO; __le32 smb2err = shdr->Status; if (smb2err == 0) { - trace_smb3_cmd_done(shdr->TreeId, shdr->SessionId, - le16_to_cpu(shdr->Command), le64_to_cpu(shdr->MessageId)); + trace_smb3_cmd_done(le32_to_cpu(shdr->Id.SyncId.TreeId), + le64_to_cpu(shdr->SessionId), + le16_to_cpu(shdr->Command), + le64_to_cpu(shdr->MessageId)); return 0; } @@ -2470,8 +2472,10 @@ map_smb2_to_linux_error(char *buf, bool log_err) cifs_dbg(FYI, "Mapping SMB2 status code 0x%08x to POSIX err %d\n", __le32_to_cpu(smb2err), rc); - trace_smb3_cmd_err(shdr->TreeId, shdr->SessionId, - le16_to_cpu(shdr->Command), - le64_to_cpu(shdr->MessageId), le32_to_cpu(smb2err), rc); + trace_smb3_cmd_err(le32_to_cpu(shdr->Id.SyncId.TreeId), + le64_to_cpu(shdr->SessionId), + le16_to_cpu(shdr->Command), + le64_to_cpu(shdr->MessageId), + le32_to_cpu(smb2err), rc); return rc; } diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 668f77108831..cdcdef32759e 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: LGPL-2.1 /* - * fs/cifs/smb2misc.c * * Copyright (C) International Business Machines Corp., 2002,2011 * Etersoft, 2012 @@ -9,7 +8,6 @@ * */ #include <linux/ctype.h> -#include "smb2pdu.h" #include "cifsglob.h" #include "cifsproto.h" #include "smb2proto.h" @@ -20,7 +18,7 @@ #include "nterr.h" static int -check_smb2_hdr(struct smb2_sync_hdr *shdr, __u64 mid) +check_smb2_hdr(struct smb2_hdr *shdr, __u64 mid) { __u64 wire_mid = le64_to_cpu(shdr->MessageId); @@ -82,9 +80,9 @@ static const __le16 smb2_rsp_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = { /* SMB2_OPLOCK_BREAK */ cpu_to_le16(24) }; -#define SMB311_NEGPROT_BASE_SIZE (sizeof(struct smb2_sync_hdr) + sizeof(struct smb2_negotiate_rsp)) +#define SMB311_NEGPROT_BASE_SIZE (sizeof(struct smb2_hdr) + sizeof(struct smb2_negotiate_rsp)) -static __u32 get_neg_ctxt_len(struct smb2_sync_hdr *hdr, __u32 len, +static __u32 get_neg_ctxt_len(struct smb2_hdr *hdr, __u32 len, __u32 non_ctxlen) { __u16 neg_count; @@ -136,13 +134,13 @@ static __u32 get_neg_ctxt_len(struct smb2_sync_hdr *hdr, __u32 len, int smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr) { - struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; - struct smb2_sync_pdu *pdu = (struct smb2_sync_pdu *)shdr; + struct smb2_hdr *shdr = (struct smb2_hdr *)buf; + struct smb2_pdu *pdu = (struct smb2_pdu *)shdr; __u64 mid; __u32 clc_len; /* calculated length */ int command; - int pdu_size = sizeof(struct smb2_sync_pdu); - int hdr_size = sizeof(struct smb2_sync_hdr); + int pdu_size = sizeof(struct smb2_pdu); + int hdr_size = sizeof(struct smb2_hdr); /* * Add function to do table lookup of StructureSize by command @@ -156,7 +154,7 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr) /* decrypt frame now that it is completely read in */ spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(ses, &srvr->smb_ses_list, smb_ses_list) { - if (ses->Suid == thdr->SessionId) + if (ses->Suid == le64_to_cpu(thdr->SessionId)) break; } spin_unlock(&cifs_tcp_ses_lock); @@ -297,7 +295,7 @@ static const bool has_smb2_data_area[NUMBER_OF_SMB2_COMMANDS] = { * area and the offset to it (from the beginning of the smb are also returned. */ char * -smb2_get_data_area_len(int *off, int *len, struct smb2_sync_hdr *shdr) +smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *shdr) { *off = 0; *len = 0; @@ -402,8 +400,8 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_sync_hdr *shdr) unsigned int smb2_calc_size(void *buf, struct TCP_Server_Info *srvr) { - struct smb2_sync_pdu *pdu = (struct smb2_sync_pdu *)buf; - struct smb2_sync_hdr *shdr = &pdu->sync_hdr; + struct smb2_pdu *pdu = (struct smb2_pdu *)buf; + struct smb2_hdr *shdr = &pdu->hdr; int offset; /* the offset from the beginning of SMB to data area */ int data_length; /* the length of the variable length data area */ /* Structure Size has already been checked to make sure it is 64 */ @@ -670,7 +668,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) cifs_dbg(FYI, "Checking for oplock break\n"); - if (rsp->sync_hdr.Command != SMB2_OPLOCK_BREAK) + if (rsp->hdr.Command != SMB2_OPLOCK_BREAK) return false; if (rsp->StructureSize != @@ -817,25 +815,25 @@ smb2_handle_cancelled_close(struct cifs_tcon *tcon, __u64 persistent_fid, int smb2_handle_cancelled_mid(struct mid_q_entry *mid, struct TCP_Server_Info *server) { - struct smb2_sync_hdr *sync_hdr = mid->resp_buf; + struct smb2_hdr *hdr = mid->resp_buf; struct smb2_create_rsp *rsp = mid->resp_buf; struct cifs_tcon *tcon; int rc; - if ((mid->optype & CIFS_CP_CREATE_CLOSE_OP) || sync_hdr->Command != SMB2_CREATE || - sync_hdr->Status != STATUS_SUCCESS) + if ((mid->optype & CIFS_CP_CREATE_CLOSE_OP) || hdr->Command != SMB2_CREATE || + hdr->Status != STATUS_SUCCESS) return 0; - tcon = smb2_find_smb_tcon(server, sync_hdr->SessionId, - sync_hdr->TreeId); + tcon = smb2_find_smb_tcon(server, le64_to_cpu(hdr->SessionId), + le32_to_cpu(hdr->Id.SyncId.TreeId)); if (!tcon) return -ENOENT; rc = __smb2_handle_cancelled_cmd(tcon, - le16_to_cpu(sync_hdr->Command), - le64_to_cpu(sync_hdr->MessageId), - rsp->PersistentFileId, - rsp->VolatileFileId); + le16_to_cpu(hdr->Command), + le64_to_cpu(hdr->MessageId), + le64_to_cpu(rsp->PersistentFileId), + le64_to_cpu(rsp->VolatileFileId)); if (rc) cifs_put_tcon(tcon); @@ -857,10 +855,10 @@ smb311_update_preauth_hash(struct cifs_ses *ses, struct kvec *iov, int nvec) { int i, rc; struct sdesc *d; - struct smb2_sync_hdr *hdr; + struct smb2_hdr *hdr; struct TCP_Server_Info *server = cifs_ses_server(ses); - hdr = (struct smb2_sync_hdr *)iov[0].iov_base; + hdr = (struct smb2_hdr *)iov[0].iov_base; /* neg prot are always taken */ if (hdr->Command == SMB2_NEGOTIATE) goto ok; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index bda606dc72b1..7acf71defea7 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -325,7 +325,7 @@ static struct mid_q_entry * __smb2_find_mid(struct TCP_Server_Info *server, char *buf, bool dequeue) { struct mid_q_entry *mid; - struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; + struct smb2_hdr *shdr = (struct smb2_hdr *)buf; __u64 wire_mid = le64_to_cpu(shdr->MessageId); if (shdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) { @@ -367,11 +367,11 @@ static void smb2_dump_detail(void *buf, struct TCP_Server_Info *server) { #ifdef CONFIG_CIFS_DEBUG2 - struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; + struct smb2_hdr *shdr = (struct smb2_hdr *)buf; cifs_server_dbg(VFS, "Cmd: %d Err: 0x%x Flags: 0x%x Mid: %llu Pid: %d\n", shdr->Command, shdr->Status, shdr->Flags, shdr->MessageId, - shdr->ProcessId); + shdr->Id.SyncId.ProcessId); cifs_server_dbg(VFS, "smb buf %p len %u\n", buf, server->ops->calc_smb_size(buf, server)); #endif @@ -885,10 +885,10 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, atomic_inc(&tcon->num_remote_opens); o_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base; - oparms.fid->persistent_fid = o_rsp->PersistentFileId; - oparms.fid->volatile_fid = o_rsp->VolatileFileId; + oparms.fid->persistent_fid = le64_to_cpu(o_rsp->PersistentFileId); + oparms.fid->volatile_fid = le64_to_cpu(o_rsp->VolatileFileId); #ifdef CONFIG_CIFS_DEBUG2 - oparms.fid->mid = le64_to_cpu(o_rsp->sync_hdr.MessageId); + oparms.fid->mid = le64_to_cpu(o_rsp->hdr.MessageId); #endif /* CIFS_DEBUG2 */ tcon->crfid.tcon = tcon; @@ -2391,12 +2391,12 @@ again: /* If the open failed there is nothing to do */ op_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base; - if (op_rsp == NULL || op_rsp->sync_hdr.Status != STATUS_SUCCESS) { + if (op_rsp == NULL || op_rsp->hdr.Status != STATUS_SUCCESS) { cifs_dbg(FYI, "query_dir_first: open failed rc=%d\n", rc); goto qdf_free; } - fid->persistent_fid = op_rsp->PersistentFileId; - fid->volatile_fid = op_rsp->VolatileFileId; + fid->persistent_fid = le64_to_cpu(op_rsp->PersistentFileId); + fid->volatile_fid = le64_to_cpu(op_rsp->VolatileFileId); /* Anything else than ENODATA means a genuine error */ if (rc && rc != -ENODATA) { @@ -2410,7 +2410,7 @@ again: atomic_inc(&tcon->num_remote_opens); qd_rsp = (struct smb2_query_directory_rsp *)rsp_iov[1].iov_base; - if (qd_rsp->sync_hdr.Status == STATUS_NO_MORE_FILES) { + if (qd_rsp->hdr.Status == STATUS_NO_MORE_FILES) { trace_smb3_query_dir_done(xid, fid->persistent_fid, tcon->tid, tcon->ses->Suid, 0, 0); srch_inf->endOfSearch = true; @@ -2462,7 +2462,7 @@ smb2_close_dir(const unsigned int xid, struct cifs_tcon *tcon, static bool smb2_is_status_pending(char *buf, struct TCP_Server_Info *server) { - struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; + struct smb2_hdr *shdr = (struct smb2_hdr *)buf; int scredits, in_flight; if (shdr->Status != STATUS_PENDING) @@ -2489,13 +2489,14 @@ smb2_is_status_pending(char *buf, struct TCP_Server_Info *server) static bool smb2_is_session_expired(char *buf) { - struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; + struct smb2_hdr *shdr = (struct smb2_hdr *)buf; if (shdr->Status != STATUS_NETWORK_SESSION_EXPIRED && shdr->Status != STATUS_USER_SESSION_DELETED) return false; - trace_smb3_ses_expired(shdr->TreeId, shdr->SessionId, + trace_smb3_ses_expired(le32_to_cpu(shdr->Id.SyncId.TreeId), + le64_to_cpu(shdr->SessionId), le16_to_cpu(shdr->Command), le64_to_cpu(shdr->MessageId)); cifs_dbg(FYI, "Session expired or deleted\n"); @@ -2506,7 +2507,7 @@ smb2_is_session_expired(char *buf) static bool smb2_is_status_io_timeout(char *buf) { - struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; + struct smb2_hdr *shdr = (struct smb2_hdr *)buf; if (shdr->Status == STATUS_IO_TIMEOUT) return true; @@ -2517,7 +2518,7 @@ smb2_is_status_io_timeout(char *buf) static void smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server) { - struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; + struct smb2_hdr *shdr = (struct smb2_hdr *)buf; struct list_head *tmp, *tmp1; struct cifs_ses *ses; struct cifs_tcon *tcon; @@ -2530,7 +2531,7 @@ smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server) ses = list_entry(tmp, struct cifs_ses, smb_ses_list); list_for_each(tmp1, &ses->tcon_list) { tcon = list_entry(tmp1, struct cifs_tcon, tcon_list); - if (tcon->tid == shdr->TreeId) { + if (tcon->tid == le32_to_cpu(shdr->Id.SyncId.TreeId)) { tcon->need_reconnect = true; spin_unlock(&cifs_tcp_ses_lock); pr_warn_once("Server share %s deleted.\n", @@ -2558,9 +2559,9 @@ smb2_oplock_response(struct cifs_tcon *tcon, struct cifs_fid *fid, void smb2_set_related(struct smb_rqst *rqst) { - struct smb2_sync_hdr *shdr; + struct smb2_hdr *shdr; - shdr = (struct smb2_sync_hdr *)(rqst->rq_iov[0].iov_base); + shdr = (struct smb2_hdr *)(rqst->rq_iov[0].iov_base); if (shdr == NULL) { cifs_dbg(FYI, "shdr NULL in smb2_set_related\n"); return; @@ -2573,13 +2574,13 @@ char smb2_padding[7] = {0, 0, 0, 0, 0, 0, 0}; void smb2_set_next_command(struct cifs_tcon *tcon, struct smb_rqst *rqst) { - struct smb2_sync_hdr *shdr; + struct smb2_hdr *shdr; struct cifs_ses *ses = tcon->ses; struct TCP_Server_Info *server = ses->server; unsigned long len = smb_rqst_len(server, rqst); int i, num_padding; - shdr = (struct smb2_sync_hdr *)(rqst->rq_iov[0].iov_base); + shdr = (struct smb2_hdr *)(rqst->rq_iov[0].iov_base); if (shdr == NULL) { cifs_dbg(FYI, "shdr NULL in smb2_set_next_command\n"); return; @@ -3124,7 +3125,7 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, resp_buftype, rsp_iov); create_rsp = rsp_iov[0].iov_base; - if (create_rsp && create_rsp->sync_hdr.Status) + if (create_rsp && create_rsp->hdr.Status) err_iov = rsp_iov[0]; ioctl_rsp = rsp_iov[1].iov_base; @@ -4369,8 +4370,8 @@ static void fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, unsigned int orig_len, struct smb_rqst *old_rq, __le16 cipher_type) { - struct smb2_sync_hdr *shdr = - (struct smb2_sync_hdr *)old_rq->rq_iov[0].iov_base; + struct smb2_hdr *shdr = + (struct smb2_hdr *)old_rq->rq_iov[0].iov_base; memset(tr_hdr, 0, sizeof(struct smb2_transform_hdr)); tr_hdr->ProtocolId = SMB2_TRANSFORM_PROTO_NUM; @@ -4496,7 +4497,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, struct crypto_aead *tfm; unsigned int crypt_len = le32_to_cpu(tr_hdr->OriginalMessageSize); - rc = smb2_get_enc_key(server, tr_hdr->SessionId, enc, key); + rc = smb2_get_enc_key(server, le64_to_cpu(tr_hdr->SessionId), enc, key); if (rc) { cifs_server_dbg(VFS, "%s: Could not get %scryption key\n", __func__, enc ? "en" : "de"); @@ -4788,7 +4789,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, unsigned int cur_page_idx; unsigned int pad_len; struct cifs_readdata *rdata = mid->callback_data; - struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; + struct smb2_hdr *shdr = (struct smb2_hdr *)buf; struct bio_vec *bvec = NULL; struct iov_iter iter; struct kvec iov; @@ -5117,7 +5118,7 @@ receive_encrypted_standard(struct TCP_Server_Info *server, { int ret, length; char *buf = server->smallbuf; - struct smb2_sync_hdr *shdr; + struct smb2_hdr *shdr; unsigned int pdu_length = server->pdu_size; unsigned int buf_size; struct mid_q_entry *mid_entry; @@ -5147,7 +5148,7 @@ receive_encrypted_standard(struct TCP_Server_Info *server, next_is_large = server->large_buf; one_more: - shdr = (struct smb2_sync_hdr *)buf; + shdr = (struct smb2_hdr *)buf; if (shdr->NextCommand) { if (next_is_large) next_buffer = (char *)cifs_buf_get(); @@ -5213,7 +5214,7 @@ smb3_receive_transform(struct TCP_Server_Info *server, unsigned int orig_len = le32_to_cpu(tr_hdr->OriginalMessageSize); if (pdu_length < sizeof(struct smb2_transform_hdr) + - sizeof(struct smb2_sync_hdr)) { + sizeof(struct smb2_hdr)) { cifs_server_dbg(VFS, "Transform message is too small (%u)\n", pdu_length); cifs_reconnect(server); @@ -5246,7 +5247,7 @@ smb3_handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid) static int smb2_next_header(char *buf) { - struct smb2_sync_hdr *hdr = (struct smb2_sync_hdr *)buf; + struct smb2_hdr *hdr = (struct smb2_hdr *)buf; struct smb2_transform_hdr *t_hdr = (struct smb2_transform_hdr *)buf; if (hdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) @@ -5788,7 +5789,7 @@ struct smb_version_values smb20_values = { .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, - .header_size = sizeof(struct smb2_sync_hdr), + .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, @@ -5809,7 +5810,7 @@ struct smb_version_values smb21_values = { .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, - .header_size = sizeof(struct smb2_sync_hdr), + .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, @@ -5830,7 +5831,7 @@ struct smb_version_values smb3any_values = { .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, - .header_size = sizeof(struct smb2_sync_hdr), + .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, @@ -5851,7 +5852,7 @@ struct smb_version_values smbdefault_values = { .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, - .header_size = sizeof(struct smb2_sync_hdr), + .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, @@ -5872,7 +5873,7 @@ struct smb_version_values smb30_values = { .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, - .header_size = sizeof(struct smb2_sync_hdr), + .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, @@ -5893,7 +5894,7 @@ struct smb_version_values smb302_values = { .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, - .header_size = sizeof(struct smb2_sync_hdr), + .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, @@ -5914,7 +5915,7 @@ struct smb_version_values smb311_values = { .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, - .header_size = sizeof(struct smb2_sync_hdr), + .header_size = sizeof(struct smb2_hdr), .header_preamble_size = 0, .max_header_size = MAX_SMB2_HDR_SIZE, .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index b6d2e3591927..d2ecb2ea37c0 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: LGPL-2.1 /* - * fs/cifs/smb2pdu.c * * Copyright (C) International Business Machines Corp., 2009, 2013 * Etersoft, 2012 @@ -24,7 +23,6 @@ #include <linux/uuid.h> #include <linux/pagemap.h> #include <linux/xattr.h> -#include "smb2pdu.h" #include "cifsglob.h" #include "cifsacl.h" #include "cifsproto.h" @@ -85,7 +83,7 @@ int smb3_encryption_required(const struct cifs_tcon *tcon) } static void -smb2_hdr_assemble(struct smb2_sync_hdr *shdr, __le16 smb2_cmd, +smb2_hdr_assemble(struct smb2_hdr *shdr, __le16 smb2_cmd, const struct cifs_tcon *tcon, struct TCP_Server_Info *server) { @@ -105,7 +103,7 @@ smb2_hdr_assemble(struct smb2_sync_hdr *shdr, __le16 smb2_cmd, } else { shdr->CreditRequest = cpu_to_le16(2); } - shdr->ProcessId = cpu_to_le32((__u16)current->tgid); + shdr->Id.SyncId.ProcessId = cpu_to_le32((__u16)current->tgid); if (!tcon) goto out; @@ -116,10 +114,10 @@ smb2_hdr_assemble(struct smb2_sync_hdr *shdr, __le16 smb2_cmd, shdr->CreditCharge = cpu_to_le16(1); /* else CreditCharge MBZ */ - shdr->TreeId = tcon->tid; + shdr->Id.SyncId.TreeId = cpu_to_le32(tcon->tid); /* Uid is not converted */ if (tcon->ses) - shdr->SessionId = tcon->ses->Suid; + shdr->SessionId = cpu_to_le64(tcon->ses->Suid); /* * If we would set SMB2_FLAGS_DFS_OPERATIONS on open we also would have @@ -332,7 +330,7 @@ fill_small_buf(__le16 smb2_command, struct cifs_tcon *tcon, void *buf, unsigned int *total_len) { - struct smb2_sync_pdu *spdu = (struct smb2_sync_pdu *)buf; + struct smb2_pdu *spdu = (struct smb2_pdu *)buf; /* lookup word count ie StructureSize from table */ __u16 parmsize = smb2_req_struct_sizes[le16_to_cpu(smb2_command)]; @@ -342,10 +340,10 @@ fill_small_buf(__le16 smb2_command, struct cifs_tcon *tcon, */ memset(buf, 0, 256); - smb2_hdr_assemble(&spdu->sync_hdr, smb2_command, tcon, server); + smb2_hdr_assemble(&spdu->hdr, smb2_command, tcon, server); spdu->StructureSize2 = cpu_to_le16(parmsize); - *total_len = parmsize + sizeof(struct smb2_sync_hdr); + *total_len = parmsize + sizeof(struct smb2_hdr); } /* @@ -368,7 +366,7 @@ static int __smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon, } fill_small_buf(smb2_command, tcon, server, - (struct smb2_sync_hdr *)(*request_buf), + (struct smb2_hdr *)(*request_buf), total_len); if (tcon != NULL) { @@ -415,8 +413,8 @@ build_preauth_ctxt(struct smb2_preauth_neg_context *pneg_ctxt) pneg_ctxt->ContextType = SMB2_PREAUTH_INTEGRITY_CAPABILITIES; pneg_ctxt->DataLength = cpu_to_le16(38); pneg_ctxt->HashAlgorithmCount = cpu_to_le16(1); - pneg_ctxt->SaltLength = cpu_to_le16(SMB311_LINUX_CLIENT_SALT_SIZE); - get_random_bytes(pneg_ctxt->Salt, SMB311_LINUX_CLIENT_SALT_SIZE); + pneg_ctxt->SaltLength = cpu_to_le16(SMB311_SALT_SIZE); + get_random_bytes(pneg_ctxt->Salt, SMB311_SALT_SIZE); pneg_ctxt->HashAlgorithms = SMB2_PREAUTH_INTEGRITY_SHA512; } @@ -858,7 +856,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) if (rc) return rc; - req->sync_hdr.SessionId = 0; + req->hdr.SessionId = 0; memset(server->preauth_sha_hash, 0, SMB2_PREAUTH_HASH_SIZE); memset(ses->preauth_sha_hash, 0, SMB2_PREAUTH_HASH_SIZE); @@ -1019,7 +1017,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) server->cipher_type = SMB2_ENCRYPTION_AES128_CCM; security_blob = smb2_get_data_area_len(&blob_offset, &blob_length, - (struct smb2_sync_hdr *)rsp); + (struct smb2_hdr *)rsp); /* * See MS-SMB2 section 2.2.4: if no blob, client picks default which * for us will be @@ -1251,23 +1249,23 @@ SMB2_sess_alloc_buffer(struct SMB2_sess_data *sess_data) return rc; if (sess_data->ses->binding) { - req->sync_hdr.SessionId = sess_data->ses->Suid; - req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED; + req->hdr.SessionId = cpu_to_le64(sess_data->ses->Suid); + req->hdr.Flags |= SMB2_FLAGS_SIGNED; req->PreviousSessionId = 0; req->Flags = SMB2_SESSION_REQ_FLAG_BINDING; } else { /* First session, not a reauthenticate */ - req->sync_hdr.SessionId = 0; + req->hdr.SessionId = 0; /* * if reconnect, we need to send previous sess id * otherwise it is 0 */ - req->PreviousSessionId = sess_data->previous_session; + req->PreviousSessionId = cpu_to_le64(sess_data->previous_session); req->Flags = 0; /* MBZ */ } /* enough to enable echos and oplocks and one max size write */ - req->sync_hdr.CreditRequest = cpu_to_le16(130); + req->hdr.CreditRequest = cpu_to_le16(130); /* only one of SMB2 signing flags may be set in SMB2 request */ if (server->sign) @@ -1426,7 +1424,7 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data) rsp = (struct smb2_sess_setup_rsp *)sess_data->iov[0].iov_base; /* keep session id and flags if binding */ if (!ses->binding) { - ses->Suid = rsp->sync_hdr.SessionId; + ses->Suid = le64_to_cpu(rsp->hdr.SessionId); ses->session_flags = le16_to_cpu(rsp->SessionFlags); } @@ -1502,7 +1500,7 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data) /* If true, rc here is expected and not an error */ if (sess_data->buf0_type != CIFS_NO_BUFFER && - rsp->sync_hdr.Status == STATUS_MORE_PROCESSING_REQUIRED) + rsp->hdr.Status == STATUS_MORE_PROCESSING_REQUIRED) rc = 0; if (rc) @@ -1524,7 +1522,7 @@ SMB2_sess_auth_rawntlmssp_negotiate(struct SMB2_sess_data *sess_data) /* keep existing ses id and flags if binding */ if (!ses->binding) { - ses->Suid = rsp->sync_hdr.SessionId; + ses->Suid = le64_to_cpu(rsp->hdr.SessionId); ses->session_flags = le16_to_cpu(rsp->SessionFlags); } @@ -1559,7 +1557,7 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data) goto out; req = (struct smb2_sess_setup_req *) sess_data->iov[0].iov_base; - req->sync_hdr.SessionId = ses->Suid; + req->hdr.SessionId = cpu_to_le64(ses->Suid); rc = build_ntlmssp_auth_blob(&ntlmssp_blob, &blob_length, ses, sess_data->nls_cp); @@ -1585,7 +1583,7 @@ SMB2_sess_auth_rawntlmssp_authenticate(struct SMB2_sess_data *sess_data) /* keep existing ses id and flags if binding */ if (!ses->binding) { - ses->Suid = rsp->sync_hdr.SessionId; + ses->Suid = le64_to_cpu(rsp->hdr.SessionId); ses->session_flags = le16_to_cpu(rsp->SessionFlags); } @@ -1716,12 +1714,12 @@ SMB2_logoff(const unsigned int xid, struct cifs_ses *ses) return rc; /* since no tcon, smb2_init can not do this, so do here */ - req->sync_hdr.SessionId = ses->Suid; + req->hdr.SessionId = cpu_to_le64(ses->Suid); if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) flags |= CIFS_TRANSFORM_REQ; else if (server->sign) - req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED; + req->hdr.Flags |= SMB2_FLAGS_SIGNED; flags |= CIFS_NO_RSP_BUF; @@ -1829,14 +1827,14 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, !(ses->session_flags & (SMB2_SESSION_FLAG_IS_GUEST|SMB2_SESSION_FLAG_IS_NULL)) && ((ses->user_name != NULL) || (ses->sectype == Kerberos))) - req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED; + req->hdr.Flags |= SMB2_FLAGS_SIGNED; memset(&rqst, 0, sizeof(struct smb_rqst)); rqst.rq_iov = iov; rqst.rq_nvec = 2; /* Need 64 for max size write so ask for more in case not there yet */ - req->sync_hdr.CreditRequest = cpu_to_le16(64); + req->hdr.CreditRequest = cpu_to_le16(64); rc = cifs_send_recv(xid, ses, server, &rqst, &resp_buftype, flags, &rsp_iov); @@ -1872,7 +1870,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree, tcon->maximal_access = le32_to_cpu(rsp->MaximalAccess); tcon->tidStatus = CifsGood; tcon->need_reconnect = false; - tcon->tid = rsp->sync_hdr.TreeId; + tcon->tid = le32_to_cpu(rsp->hdr.Id.SyncId.TreeId); strlcpy(tcon->treeName, tree, sizeof(tcon->treeName)); if ((rsp->Capabilities & SMB2_SHARE_CAP_DFS) && @@ -1893,9 +1891,8 @@ tcon_exit: return rc; tcon_error_exit: - if (rsp && rsp->sync_hdr.Status == STATUS_BAD_NETWORK_NAME) { + if (rsp && rsp->hdr.Status == STATUS_BAD_NETWORK_NAME) cifs_tcon_dbg(VFS, "BAD_NETWORK_NAME: %s\n", tree); - } goto tcon_exit; } @@ -2398,7 +2395,7 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len) buf->sd.OffsetDacl = cpu_to_le32(ptr - (__u8 *)&buf->sd); /* Ship the ACL for now. we will copy it into buf later. */ aclptr = ptr; - ptr += sizeof(struct cifs_acl); + ptr += sizeof(struct smb3_acl); /* create one ACE to hold the mode embedded in reserved special SID */ acelen = setup_special_mode_ACE((struct cifs_ace *)ptr, (__u64)mode); @@ -2423,7 +2420,7 @@ create_sd_buf(umode_t mode, bool set_owner, unsigned int *len) acl.AclRevision = ACL_REVISION; /* See 2.4.4.1 of MS-DTYP */ acl.AclSize = cpu_to_le16(acl_size); acl.AceCount = cpu_to_le16(ace_count); - memcpy(aclptr, &acl, sizeof(struct cifs_acl)); + memcpy(aclptr, &acl, sizeof(struct smb3_acl)); buf->ccontext.DataLength = cpu_to_le32(ptr - (__u8 *)&buf->sd); *len = roundup(ptr - (__u8 *)buf, 8); @@ -2609,7 +2606,7 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode, if (tcon->share_flags & SHI1005_FLAGS_DFS) { int name_len; - req->sync_hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS; + req->hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS; rc = alloc_path_with_tree_prefix(©_path, ©_size, &name_len, tcon->treeName, utf16_path); @@ -2673,11 +2670,13 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode, } rsp = (struct smb2_create_rsp *)rsp_iov.iov_base; - trace_smb3_posix_mkdir_done(xid, rsp->PersistentFileId, tcon->tid, + trace_smb3_posix_mkdir_done(xid, le64_to_cpu(rsp->PersistentFileId), + tcon->tid, ses->Suid, CREATE_NOT_FILE, FILE_WRITE_ATTRIBUTES); - SMB2_close(xid, tcon, rsp->PersistentFileId, rsp->VolatileFileId); + SMB2_close(xid, tcon, le64_to_cpu(rsp->PersistentFileId), + le64_to_cpu(rsp->VolatileFileId)); /* Eventually save off posix specific response info and timestaps */ @@ -2741,7 +2740,7 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, if (tcon->share_flags & SHI1005_FLAGS_DFS) { int name_len; - req->sync_hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS; + req->hdr.Flags |= SMB2_FLAGS_DFS_OPERATIONS; rc = alloc_path_with_tree_prefix(©_path, ©_size, &name_len, tcon->treeName, path); @@ -2944,16 +2943,17 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, } goto creat_exit; } else - trace_smb3_open_done(xid, rsp->PersistentFileId, tcon->tid, + trace_smb3_open_done(xid, le64_to_cpu(rsp->PersistentFileId), + tcon->tid, ses->Suid, oparms->create_options, oparms->desired_access); atomic_inc(&tcon->num_remote_opens); - oparms->fid->persistent_fid = rsp->PersistentFileId; - oparms->fid->volatile_fid = rsp->VolatileFileId; + oparms->fid->persistent_fid = le64_to_cpu(rsp->PersistentFileId); + oparms->fid->volatile_fid = le64_to_cpu(rsp->VolatileFileId); oparms->fid->access = oparms->desired_access; #ifdef CONFIG_CIFS_DEBUG2 - oparms->fid->mid = le64_to_cpu(rsp->sync_hdr.MessageId); + oparms->fid->mid = le64_to_cpu(rsp->hdr.MessageId); #endif /* CIFS_DEBUG2 */ if (buf) { @@ -3053,7 +3053,7 @@ SMB2_ioctl_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, * response size smaller. */ req->MaxOutputResponse = cpu_to_le32(max_response_size); - req->sync_hdr.CreditCharge = + req->hdr.CreditCharge = cpu_to_le16(DIV_ROUND_UP(max(indatalen, max_response_size), SMB2_MAX_BUFFER_SIZE)); if (is_fsctl) @@ -3063,7 +3063,7 @@ SMB2_ioctl_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, /* validate negotiate request must be signed - see MS-SMB2 3.2.5.5 */ if (opcode == FSCTL_VALIDATE_NEGOTIATE_INFO) - req->sync_hdr.Flags |= SMB2_FLAGS_SIGNED; + req->hdr.Flags |= SMB2_FLAGS_SIGNED; return 0; } @@ -3237,8 +3237,8 @@ SMB2_close_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, if (rc) return rc; - req->PersistentFileId = persistent_fid; - req->VolatileFileId = volatile_fid; + req->PersistentFileId = cpu_to_le64(persistent_fid); + req->VolatileFileId = cpu_to_le64(volatile_fid); if (query_attrs) req->Flags = SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB; else @@ -3601,8 +3601,8 @@ SMB2_notify_init(const unsigned int xid, struct smb_rqst *rqst, if (rc) return rc; - req->PersistentFileId = persistent_fid; - req->VolatileFileId = volatile_fid; + req->PersistentFileId = cpu_to_le64(persistent_fid); + req->VolatileFileId = cpu_to_le64(volatile_fid); /* See note 354 of MS-SMB2, 64K max */ req->OutputBufferLength = cpu_to_le32(SMB2_MAX_BUFFER_SIZE - MAX_SMB2_HDR_SIZE); @@ -3688,7 +3688,7 @@ smb2_echo_callback(struct mid_q_entry *mid) if (mid->mid_state == MID_RESPONSE_RECEIVED || mid->mid_state == MID_RESPONSE_MALFORMED) { - credits.value = le16_to_cpu(rsp->sync_hdr.CreditRequest); + credits.value = le16_to_cpu(rsp->hdr.CreditRequest); credits.instance = server->reconnect_instance; } @@ -3788,7 +3788,7 @@ SMB2_echo(struct TCP_Server_Info *server) if (rc) return rc; - req->sync_hdr.CreditRequest = cpu_to_le16(1); + req->hdr.CreditRequest = cpu_to_le16(1); iov[0].iov_len = total_len; iov[0].iov_base = (char *)req; @@ -3824,8 +3824,8 @@ SMB2_flush_init(const unsigned int xid, struct smb_rqst *rqst, if (rc) return rc; - req->PersistentFileId = persistent_fid; - req->VolatileFileId = volatile_fid; + req->PersistentFileId = cpu_to_le64(persistent_fid); + req->VolatileFileId = cpu_to_le64(volatile_fid); iov[0].iov_base = (char *)req; iov[0].iov_len = total_len; @@ -3891,8 +3891,8 @@ smb2_new_read_req(void **buf, unsigned int *total_len, unsigned int remaining_bytes, int request_type) { int rc = -EACCES; - struct smb2_read_plain_req *req = NULL; - struct smb2_sync_hdr *shdr; + struct smb2_read_req *req = NULL; + struct smb2_hdr *shdr; struct TCP_Server_Info *server = io_parms->server; rc = smb2_plain_req_init(SMB2_READ, io_parms->tcon, server, @@ -3903,11 +3903,11 @@ smb2_new_read_req(void **buf, unsigned int *total_len, if (server == NULL) return -ECONNABORTED; - shdr = &req->sync_hdr; - shdr->ProcessId = cpu_to_le32(io_parms->pid); + shdr = &req->hdr; + shdr->Id.SyncId.ProcessId = cpu_to_le32(io_parms->pid); - req->PersistentFileId = io_parms->persistent_fid; - req->VolatileFileId = io_parms->volatile_fid; + req->PersistentFileId = cpu_to_le64(io_parms->persistent_fid); + req->VolatileFileId = cpu_to_le64(io_parms->volatile_fid); req->ReadChannelInfoOffset = 0; /* reserved */ req->ReadChannelInfoLength = 0; /* reserved */ req->Channel = 0; /* reserved */ @@ -3941,7 +3941,7 @@ smb2_new_read_req(void **buf, unsigned int *total_len, if (need_invalidate) req->Channel = SMB2_CHANNEL_RDMA_V1; req->ReadChannelInfoOffset = - cpu_to_le16(offsetof(struct smb2_read_plain_req, Buffer)); + cpu_to_le16(offsetof(struct smb2_read_req, Buffer)); req->ReadChannelInfoLength = cpu_to_le16(sizeof(struct smbd_buffer_descriptor_v1)); v1 = (struct smbd_buffer_descriptor_v1 *) &req->Buffer[0]; @@ -3965,10 +3965,10 @@ smb2_new_read_req(void **buf, unsigned int *total_len, * Related requests use info from previous read request * in chain. */ - shdr->SessionId = 0xFFFFFFFFFFFFFFFF; - shdr->TreeId = 0xFFFFFFFF; - req->PersistentFileId = 0xFFFFFFFFFFFFFFFF; - req->VolatileFileId = 0xFFFFFFFFFFFFFFFF; + shdr->SessionId = cpu_to_le64(0xFFFFFFFFFFFFFFFF); + shdr->Id.SyncId.TreeId = cpu_to_le32(0xFFFFFFFF); + req->PersistentFileId = cpu_to_le64(0xFFFFFFFFFFFFFFFF); + req->VolatileFileId = cpu_to_le64(0xFFFFFFFFFFFFFFFF); } } if (remaining_bytes > io_parms->length) @@ -3986,8 +3986,8 @@ smb2_readv_callback(struct mid_q_entry *mid) struct cifs_readdata *rdata = mid->callback_data; struct cifs_tcon *tcon = tlink_tcon(rdata->cfile->tlink); struct TCP_Server_Info *server = rdata->server; - struct smb2_sync_hdr *shdr = - (struct smb2_sync_hdr *)rdata->iov[0].iov_base; + struct smb2_hdr *shdr = + (struct smb2_hdr *)rdata->iov[0].iov_base; struct cifs_credits credits = { .value = 0, .instance = 0 }; struct smb_rqst rqst = { .rq_iov = &rdata->iov[1], .rq_nvec = 1, @@ -4073,7 +4073,7 @@ smb2_async_readv(struct cifs_readdata *rdata) { int rc, flags = 0; char *buf; - struct smb2_sync_hdr *shdr; + struct smb2_hdr *shdr; struct cifs_io_parms io_parms; struct smb_rqst rqst = { .rq_iov = rdata->iov, .rq_nvec = 1 }; @@ -4106,7 +4106,7 @@ smb2_async_readv(struct cifs_readdata *rdata) rdata->iov[0].iov_base = buf; rdata->iov[0].iov_len = total_len; - shdr = (struct smb2_sync_hdr *)buf; + shdr = (struct smb2_hdr *)buf; if (rdata->credits.value > 0) { shdr->CreditCharge = cpu_to_le16(DIV_ROUND_UP(rdata->bytes, @@ -4145,7 +4145,7 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, { struct smb_rqst rqst; int resp_buftype, rc; - struct smb2_read_plain_req *req = NULL; + struct smb2_read_req *req = NULL; struct smb2_read_rsp *rsp = NULL; struct kvec iov[1]; struct kvec rsp_iov; @@ -4179,19 +4179,22 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, if (rc != -ENODATA) { cifs_stats_fail_inc(io_parms->tcon, SMB2_READ_HE); cifs_dbg(VFS, "Send error in read = %d\n", rc); - trace_smb3_read_err(xid, req->PersistentFileId, + trace_smb3_read_err(xid, + le64_to_cpu(req->PersistentFileId), io_parms->tcon->tid, ses->Suid, io_parms->offset, io_parms->length, rc); } else - trace_smb3_read_done(xid, req->PersistentFileId, - io_parms->tcon->tid, ses->Suid, - io_parms->offset, 0); + trace_smb3_read_done(xid, + le64_to_cpu(req->PersistentFileId), + io_parms->tcon->tid, ses->Suid, + io_parms->offset, 0); free_rsp_buf(resp_buftype, rsp_iov.iov_base); cifs_small_buf_release(req); return rc == -ENODATA ? 0 : rc; } else - trace_smb3_read_done(xid, req->PersistentFileId, + trace_smb3_read_done(xid, + le64_to_cpu(req->PersistentFileId), io_parms->tcon->tid, ses->Suid, io_parms->offset, io_parms->length); @@ -4239,7 +4242,7 @@ smb2_writev_callback(struct mid_q_entry *mid) switch (mid->mid_state) { case MID_RESPONSE_RECEIVED: - credits.value = le16_to_cpu(rsp->sync_hdr.CreditRequest); + credits.value = le16_to_cpu(rsp->hdr.CreditRequest); credits.instance = server->reconnect_instance; wdata->result = smb2_check_receive(mid, server, 0); if (wdata->result != 0) @@ -4265,7 +4268,7 @@ smb2_writev_callback(struct mid_q_entry *mid) wdata->result = -EAGAIN; break; case MID_RESPONSE_MALFORMED: - credits.value = le16_to_cpu(rsp->sync_hdr.CreditRequest); + credits.value = le16_to_cpu(rsp->hdr.CreditRequest); credits.instance = server->reconnect_instance; fallthrough; default: @@ -4312,7 +4315,7 @@ smb2_async_writev(struct cifs_writedata *wdata, { int rc = -EACCES, flags = 0; struct smb2_write_req *req = NULL; - struct smb2_sync_hdr *shdr; + struct smb2_hdr *shdr; struct cifs_tcon *tcon = tlink_tcon(wdata->cfile->tlink); struct TCP_Server_Info *server = wdata->server; struct kvec iov[1]; @@ -4330,11 +4333,11 @@ smb2_async_writev(struct cifs_writedata *wdata, if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - shdr = (struct smb2_sync_hdr *)req; - shdr->ProcessId = cpu_to_le32(wdata->cfile->pid); + shdr = (struct smb2_hdr *)req; + shdr->Id.SyncId.ProcessId = cpu_to_le32(wdata->cfile->pid); - req->PersistentFileId = wdata->cfile->fid.persistent_fid; - req->VolatileFileId = wdata->cfile->fid.volatile_fid; + req->PersistentFileId = cpu_to_le64(wdata->cfile->fid.persistent_fid); + req->VolatileFileId = cpu_to_le64(wdata->cfile->fid.volatile_fid); req->WriteChannelInfoOffset = 0; req->WriteChannelInfoLength = 0; req->Channel = 0; @@ -4431,7 +4434,8 @@ smb2_async_writev(struct cifs_writedata *wdata, wdata, flags, &wdata->credits); if (rc) { - trace_smb3_write_err(0 /* no xid */, req->PersistentFileId, + trace_smb3_write_err(0 /* no xid */, + le64_to_cpu(req->PersistentFileId), tcon->tid, tcon->ses->Suid, wdata->offset, wdata->bytes, rc); kref_put(&wdata->refcount, release); @@ -4482,10 +4486,10 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, if (smb3_encryption_required(io_parms->tcon)) flags |= CIFS_TRANSFORM_REQ; - req->sync_hdr.ProcessId = cpu_to_le32(io_parms->pid); + req->hdr.Id.SyncId.ProcessId = cpu_to_le32(io_parms->pid); - req->PersistentFileId = io_parms->persistent_fid; - req->VolatileFileId = io_parms->volatile_fid; + req->PersistentFileId = cpu_to_le64(io_parms->persistent_fid); + req->VolatileFileId = cpu_to_le64(io_parms->volatile_fid); req->WriteChannelInfoOffset = 0; req->WriteChannelInfoLength = 0; req->Channel = 0; @@ -4513,7 +4517,8 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, rsp = (struct smb2_write_rsp *)rsp_iov.iov_base; if (rc) { - trace_smb3_write_err(xid, req->PersistentFileId, + trace_smb3_write_err(xid, + le64_to_cpu(req->PersistentFileId), io_parms->tcon->tid, io_parms->tcon->ses->Suid, io_parms->offset, io_parms->length, rc); @@ -4521,10 +4526,11 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms, cifs_dbg(VFS, "Send error in write = %d\n", rc); } else { *nbytes = le32_to_cpu(rsp->DataLength); - trace_smb3_write_done(xid, req->PersistentFileId, - io_parms->tcon->tid, - io_parms->tcon->ses->Suid, - io_parms->offset, *nbytes); + trace_smb3_write_done(xid, + le64_to_cpu(req->PersistentFileId), + io_parms->tcon->tid, + io_parms->tcon->ses->Suid, + io_parms->offset, *nbytes); } cifs_small_buf_release(req); @@ -4867,7 +4873,7 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, if (rc) { if (rc == -ENODATA && - rsp->sync_hdr.Status == STATUS_NO_MORE_FILES) { + rsp->hdr.Status == STATUS_NO_MORE_FILES) { trace_smb3_query_dir_done(xid, persistent_fid, tcon->tid, tcon->ses->Suid, index, 0); srch_inf->endOfSearch = true; @@ -4915,7 +4921,7 @@ SMB2_set_info_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, if (rc) return rc; - req->sync_hdr.ProcessId = cpu_to_le32(pid); + req->hdr.Id.SyncId.ProcessId = cpu_to_le32(pid); req->InfoType = info_type; req->FileInfoClass = info_class; req->PersistentFileId = persistent_fid; @@ -5075,7 +5081,7 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon, req->VolatileFid = volatile_fid; req->PersistentFid = persistent_fid; req->OplockLevel = oplock_level; - req->sync_hdr.CreditRequest = cpu_to_le16(1); + req->hdr.CreditRequest = cpu_to_le16(1); flags |= CIFS_NO_RSP_BUF; @@ -5377,7 +5383,7 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon, if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - req->sync_hdr.ProcessId = cpu_to_le32(pid); + req->hdr.Id.SyncId.ProcessId = cpu_to_le32(pid); req->LockCount = cpu_to_le16(num_lock); req->PersistentFileId = persist_fid; @@ -5453,7 +5459,7 @@ SMB2_lease_break(const unsigned int xid, struct cifs_tcon *tcon, if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - req->sync_hdr.CreditRequest = cpu_to_le16(1); + req->hdr.CreditRequest = cpu_to_le16(1); req->StructureSize = cpu_to_le16(36); total_len += 12; diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index e9cac7970b66..33cfd0a1adf1 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -1,6 +1,5 @@ /* SPDX-License-Identifier: LGPL-2.1 */ /* - * fs/cifs/smb2pdu.h * * Copyright (c) International Business Machines Corp., 2009, 2013 * Etersoft, 2012 @@ -15,156 +14,12 @@ #include <net/sock.h> #include "cifsacl.h" -/* - * Note that, due to trying to use names similar to the protocol specifications, - * there are many mixed case field names in the structures below. Although - * this does not match typical Linux kernel style, it is necessary to be - * able to match against the protocol specfication. - * - * SMB2 commands - * Some commands have minimal (wct=0,bcc=0), or uninteresting, responses - * (ie no useful data other than the SMB error code itself) and are marked such. - * Knowing this helps avoid response buffer allocations and copy in some cases. - */ - -/* List of commands in host endian */ -#define SMB2_NEGOTIATE_HE 0x0000 -#define SMB2_SESSION_SETUP_HE 0x0001 -#define SMB2_LOGOFF_HE 0x0002 /* trivial request/resp */ -#define SMB2_TREE_CONNECT_HE 0x0003 -#define SMB2_TREE_DISCONNECT_HE 0x0004 /* trivial req/resp */ -#define SMB2_CREATE_HE 0x0005 -#define SMB2_CLOSE_HE 0x0006 -#define SMB2_FLUSH_HE 0x0007 /* trivial resp */ -#define SMB2_READ_HE 0x0008 -#define SMB2_WRITE_HE 0x0009 -#define SMB2_LOCK_HE 0x000A -#define SMB2_IOCTL_HE 0x000B -#define SMB2_CANCEL_HE 0x000C -#define SMB2_ECHO_HE 0x000D -#define SMB2_QUERY_DIRECTORY_HE 0x000E -#define SMB2_CHANGE_NOTIFY_HE 0x000F -#define SMB2_QUERY_INFO_HE 0x0010 -#define SMB2_SET_INFO_HE 0x0011 -#define SMB2_OPLOCK_BREAK_HE 0x0012 - -/* The same list in little endian */ -#define SMB2_NEGOTIATE cpu_to_le16(SMB2_NEGOTIATE_HE) -#define SMB2_SESSION_SETUP cpu_to_le16(SMB2_SESSION_SETUP_HE) -#define SMB2_LOGOFF cpu_to_le16(SMB2_LOGOFF_HE) -#define SMB2_TREE_CONNECT cpu_to_le16(SMB2_TREE_CONNECT_HE) -#define SMB2_TREE_DISCONNECT cpu_to_le16(SMB2_TREE_DISCONNECT_HE) -#define SMB2_CREATE cpu_to_le16(SMB2_CREATE_HE) -#define SMB2_CLOSE cpu_to_le16(SMB2_CLOSE_HE) -#define SMB2_FLUSH cpu_to_le16(SMB2_FLUSH_HE) -#define SMB2_READ cpu_to_le16(SMB2_READ_HE) -#define SMB2_WRITE cpu_to_le16(SMB2_WRITE_HE) -#define SMB2_LOCK cpu_to_le16(SMB2_LOCK_HE) -#define SMB2_IOCTL cpu_to_le16(SMB2_IOCTL_HE) -#define SMB2_CANCEL cpu_to_le16(SMB2_CANCEL_HE) -#define SMB2_ECHO cpu_to_le16(SMB2_ECHO_HE) -#define SMB2_QUERY_DIRECTORY cpu_to_le16(SMB2_QUERY_DIRECTORY_HE) -#define SMB2_CHANGE_NOTIFY cpu_to_le16(SMB2_CHANGE_NOTIFY_HE) -#define SMB2_QUERY_INFO cpu_to_le16(SMB2_QUERY_INFO_HE) -#define SMB2_SET_INFO cpu_to_le16(SMB2_SET_INFO_HE) -#define SMB2_OPLOCK_BREAK cpu_to_le16(SMB2_OPLOCK_BREAK_HE) - -#define SMB2_INTERNAL_CMD cpu_to_le16(0xFFFF) - -#define NUMBER_OF_SMB2_COMMANDS 0x0013 - /* 52 transform hdr + 64 hdr + 88 create rsp */ #define SMB2_TRANSFORM_HEADER_SIZE 52 #define MAX_SMB2_HDR_SIZE 204 -#define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe) -#define SMB2_TRANSFORM_PROTO_NUM cpu_to_le32(0x424d53fd) -#define SMB2_COMPRESSION_TRANSFORM_ID cpu_to_le32(0x424d53fc) - -/* - * SMB2 Header Definition - * - * "MBZ" : Must be Zero - * "BB" : BugBug, Something to check/review/analyze later - * "PDU" : "Protocol Data Unit" (ie a network "frame") - * - */ - -#define SMB2_HEADER_STRUCTURE_SIZE cpu_to_le16(64) - -struct smb2_sync_hdr { - __le32 ProtocolId; /* 0xFE 'S' 'M' 'B' */ - __le16 StructureSize; /* 64 */ - __le16 CreditCharge; /* MBZ */ - __le32 Status; /* Error from server */ - __le16 Command; - __le16 CreditRequest; /* CreditResponse */ - __le32 Flags; - __le32 NextCommand; - __le64 MessageId; - __le32 ProcessId; - __u32 TreeId; /* opaque - so do not make little endian */ - __u64 SessionId; /* opaque - so do not make little endian */ - __u8 Signature[16]; -} __packed; - /* The total header size for SMB2 read and write */ -#define SMB2_READWRITE_PDU_HEADER_SIZE (48 + sizeof(struct smb2_sync_hdr)) - -struct smb2_sync_pdu { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize2; /* size of wct area (varies, request specific) */ -} __packed; - -#define SMB3_AES_CCM_NONCE 11 -#define SMB3_AES_GCM_NONCE 12 - -/* Transform flags (for 3.0 dialect this flag indicates CCM */ -#define TRANSFORM_FLAG_ENCRYPTED 0x0001 -struct smb2_transform_hdr { - __le32 ProtocolId; /* 0xFD 'S' 'M' 'B' */ - __u8 Signature[16]; - __u8 Nonce[16]; - __le32 OriginalMessageSize; - __u16 Reserved1; - __le16 Flags; /* EncryptionAlgorithm for 3.0, enc enabled for 3.1.1 */ - __u64 SessionId; -} __packed; - -/* See MS-SMB2 2.2.42 */ -struct smb2_compression_transform_hdr_unchained { - __le32 ProtocolId; /* 0xFC 'S' 'M' 'B' */ - __le32 OriginalCompressedSegmentSize; - __le16 CompressionAlgorithm; - __le16 Flags; - __le16 Length; /* if chained it is length, else offset */ -} __packed; - -/* See MS-SMB2 2.2.42.1 */ -#define SMB2_COMPRESSION_FLAG_NONE 0x0000 -#define SMB2_COMPRESSION_FLAG_CHAINED 0x0001 - -struct compression_payload_header { - __le16 CompressionAlgorithm; - __le16 Flags; - __le32 Length; /* length of compressed playload including field below if present */ - /* __le32 OriginalPayloadSize; */ /* optional, present when LZNT1, LZ77, LZ77+Huffman */ -} __packed; - -/* See MS-SMB2 2.2.42.2 */ -struct smb2_compression_transform_hdr_chained { - __le32 ProtocolId; /* 0xFC 'S' 'M' 'B' */ - __le32 OriginalCompressedSegmentSize; - /* struct compression_payload_header[] */ -} __packed; - -/* See MS-SMB2 2.2.42.2.2 */ -struct compression_pattern_payload_v1 { - __le16 Pattern; - __le16 Reserved1; - __le16 Reserved2; - __le32 Repetitions; -} __packed; +#define SMB2_READWRITE_PDU_HEADER_SIZE (48 + sizeof(struct smb2_hdr)) /* See MS-SMB2 2.2.43 */ struct smb2_rdma_transform { @@ -191,17 +46,6 @@ struct smb2_rdma_crypto_transform { } __packed; /* - * SMB2 flag definitions - */ -#define SMB2_FLAGS_SERVER_TO_REDIR cpu_to_le32(0x00000001) -#define SMB2_FLAGS_ASYNC_COMMAND cpu_to_le32(0x00000002) -#define SMB2_FLAGS_RELATED_OPERATIONS cpu_to_le32(0x00000004) -#define SMB2_FLAGS_SIGNED cpu_to_le32(0x00000008) -#define SMB2_FLAGS_PRIORITY_MASK cpu_to_le32(0x00000070) /* SMB3.1.1 */ -#define SMB2_FLAGS_DFS_OPERATIONS cpu_to_le32(0x10000000) -#define SMB2_FLAGS_REPLAY_OPERATION cpu_to_le32(0x20000000) /* SMB3 & up */ - -/* * Definitions for SMB2 Protocol Data Units (network frames) * * See MS-SMB2.PDF specification for protocol details. @@ -215,7 +59,7 @@ struct smb2_rdma_crypto_transform { #define SMB2_ERROR_STRUCTURE_SIZE2 cpu_to_le16(9) struct smb2_err_rsp { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; __le16 Reserved; /* MBZ */ __le32 ByteCount; /* even if zero, at least one byte follows */ @@ -271,530 +115,6 @@ struct share_redirect_error_context_rsp { /* __u8 ResourceName[] */ /* Name of share as counted Unicode string */ } __packed; -#define SMB2_CLIENT_GUID_SIZE 16 - -struct smb2_negotiate_req { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize; /* Must be 36 */ - __le16 DialectCount; - __le16 SecurityMode; - __le16 Reserved; /* MBZ */ - __le32 Capabilities; - __u8 ClientGUID[SMB2_CLIENT_GUID_SIZE]; - /* In SMB3.02 and earlier next three were MBZ le64 ClientStartTime */ - __le32 NegotiateContextOffset; /* SMB3.1.1 only. MBZ earlier */ - __le16 NegotiateContextCount; /* SMB3.1.1 only. MBZ earlier */ - __le16 Reserved2; - __le16 Dialects[4]; /* BB expand this if autonegotiate > 4 dialects */ -} __packed; - -/* Dialects */ -#define SMB10_PROT_ID 0x0000 /* local only, not sent on wire w/CIFS negprot */ -#define SMB20_PROT_ID 0x0202 -#define SMB21_PROT_ID 0x0210 -#define SMB30_PROT_ID 0x0300 -#define SMB302_PROT_ID 0x0302 -#define SMB311_PROT_ID 0x0311 -#define BAD_PROT_ID 0xFFFF - -/* SecurityMode flags */ -#define SMB2_NEGOTIATE_SIGNING_ENABLED 0x0001 -#define SMB2_NEGOTIATE_SIGNING_REQUIRED 0x0002 -#define SMB2_SEC_MODE_FLAGS_ALL 0x0003 - -/* Capabilities flags */ -#define SMB2_GLOBAL_CAP_DFS 0x00000001 -#define SMB2_GLOBAL_CAP_LEASING 0x00000002 /* Resp only New to SMB2.1 */ -#define SMB2_GLOBAL_CAP_LARGE_MTU 0X00000004 /* Resp only New to SMB2.1 */ -#define SMB2_GLOBAL_CAP_MULTI_CHANNEL 0x00000008 /* New to SMB3 */ -#define SMB2_GLOBAL_CAP_PERSISTENT_HANDLES 0x00000010 /* New to SMB3 */ -#define SMB2_GLOBAL_CAP_DIRECTORY_LEASING 0x00000020 /* New to SMB3 */ -#define SMB2_GLOBAL_CAP_ENCRYPTION 0x00000040 /* New to SMB3 */ -/* Internal types */ -#define SMB2_NT_FIND 0x00100000 -#define SMB2_LARGE_FILES 0x00200000 - - -/* Negotiate Contexts - ContextTypes. See MS-SMB2 section 2.2.3.1 for details */ -#define SMB2_PREAUTH_INTEGRITY_CAPABILITIES cpu_to_le16(1) -#define SMB2_ENCRYPTION_CAPABILITIES cpu_to_le16(2) -#define SMB2_COMPRESSION_CAPABILITIES cpu_to_le16(3) -#define SMB2_NETNAME_NEGOTIATE_CONTEXT_ID cpu_to_le16(5) -#define SMB2_TRANSPORT_CAPABILITIES cpu_to_le16(6) -#define SMB2_RDMA_TRANSFORM_CAPABILITIES cpu_to_le16(7) -#define SMB2_SIGNING_CAPABILITIES cpu_to_le16(8) -#define SMB2_POSIX_EXTENSIONS_AVAILABLE cpu_to_le16(0x100) - -struct smb2_neg_context { - __le16 ContextType; - __le16 DataLength; - __le32 Reserved; - /* Followed by array of data. NOTE: some servers require padding to 8 byte boundary */ -} __packed; - -#define SMB311_LINUX_CLIENT_SALT_SIZE 32 -/* Hash Algorithm Types */ -#define SMB2_PREAUTH_INTEGRITY_SHA512 cpu_to_le16(0x0001) -#define SMB2_PREAUTH_HASH_SIZE 64 - -/* - * SaltLength that the server send can be zero, so the only three required - * fields (all __le16) end up six bytes total, so the minimum context data len - * in the response is six bytes which accounts for - * - * HashAlgorithmCount, SaltLength, and 1 HashAlgorithm. - */ -#define MIN_PREAUTH_CTXT_DATA_LEN 6 - -struct smb2_preauth_neg_context { - __le16 ContextType; /* 1 */ - __le16 DataLength; - __le32 Reserved; - __le16 HashAlgorithmCount; /* 1 */ - __le16 SaltLength; - __le16 HashAlgorithms; /* HashAlgorithms[0] since only one defined */ - __u8 Salt[SMB311_LINUX_CLIENT_SALT_SIZE]; -} __packed; - -/* Encryption Algorithms Ciphers */ -#define SMB2_ENCRYPTION_AES128_CCM cpu_to_le16(0x0001) -#define SMB2_ENCRYPTION_AES128_GCM cpu_to_le16(0x0002) -/* we currently do not request AES256_CCM since presumably GCM faster */ -#define SMB2_ENCRYPTION_AES256_CCM cpu_to_le16(0x0003) -#define SMB2_ENCRYPTION_AES256_GCM cpu_to_le16(0x0004) - -/* Min encrypt context data is one cipher so 2 bytes + 2 byte count field */ -#define MIN_ENCRYPT_CTXT_DATA_LEN 4 -struct smb2_encryption_neg_context { - __le16 ContextType; /* 2 */ - __le16 DataLength; - __le32 Reserved; - /* CipherCount usally 2, but can be 3 when AES256-GCM enabled */ - __le16 CipherCount; /* AES128-GCM and AES128-CCM by default */ - __le16 Ciphers[3]; -} __packed; - -/* See MS-SMB2 2.2.3.1.3 */ -#define SMB3_COMPRESS_NONE cpu_to_le16(0x0000) -#define SMB3_COMPRESS_LZNT1 cpu_to_le16(0x0001) -#define SMB3_COMPRESS_LZ77 cpu_to_le16(0x0002) -#define SMB3_COMPRESS_LZ77_HUFF cpu_to_le16(0x0003) -/* Pattern scanning algorithm See MS-SMB2 3.1.4.4.1 */ -#define SMB3_COMPRESS_PATTERN cpu_to_le16(0x0004) /* Pattern_V1 */ - -/* Compression Flags */ -#define SMB2_COMPRESSION_CAPABILITIES_FLAG_NONE cpu_to_le32(0x00000000) -#define SMB2_COMPRESSION_CAPABILITIES_FLAG_CHAINED cpu_to_le32(0x00000001) - -struct smb2_compression_capabilities_context { - __le16 ContextType; /* 3 */ - __le16 DataLength; - __u32 Reserved; - __le16 CompressionAlgorithmCount; - __u16 Padding; - __u32 Flags; - __le16 CompressionAlgorithms[3]; - __u16 Pad; /* Some servers require pad to DataLen multiple of 8 */ - /* Check if pad needed */ -} __packed; - -/* - * For smb2_netname_negotiate_context_id See MS-SMB2 2.2.3.1.4. - * Its struct simply contains NetName, an array of Unicode characters - */ -struct smb2_netname_neg_context { - __le16 ContextType; /* 5 */ - __le16 DataLength; - __le32 Reserved; - __le16 NetName[]; /* hostname of target converted to UCS-2 */ -} __packed; - -/* - * For smb2_transport_capabilities context see MS-SMB2 2.2.3.1.5 - * and 2.2.4.1.5 - */ - -/* Flags */ -#define SMB2_ACCEPT_TRANSFORM_LEVEL_SECURITY 0x00000001 - -struct smb2_transport_capabilities_context { - __le16 ContextType; /* 6 */ - __le16 DataLength; - __u32 Reserved; - __le32 Flags; - __u32 Pad; -} __packed; - -/* - * For rdma transform capabilities context see MS-SMB2 2.2.3.1.6 - * and 2.2.4.1.6 - */ - -/* RDMA Transform IDs */ -#define SMB2_RDMA_TRANSFORM_NONE 0x0000 -#define SMB2_RDMA_TRANSFORM_ENCRYPTION 0x0001 -#define SMB2_RDMA_TRANSFORM_SIGNING 0x0002 - -struct smb2_rdma_transform_capabilities_context { - __le16 ContextType; /* 7 */ - __le16 DataLength; - __u32 Reserved; - __le16 TransformCount; - __u16 Reserved1; - __u32 Reserved2; - __le16 RDMATransformIds[]; -} __packed; - -/* - * For signing capabilities context see MS-SMB2 2.2.3.1.7 - * and 2.2.4.1.7 - */ - -/* Signing algorithms */ -#define SIGNING_ALG_HMAC_SHA256 0 -#define SIGNING_ALG_AES_CMAC 1 -#define SIGNING_ALG_AES_GMAC 2 - -struct smb2_signing_capabilities { - __le16 ContextType; /* 8 */ - __le16 DataLength; - __u32 Reserved; - __le16 SigningAlgorithmCount; - __le16 SigningAlgorithms[]; - /* Followed by padding to 8 byte boundary (required by some servers) */ -} __packed; - -#define POSIX_CTXT_DATA_LEN 16 -struct smb2_posix_neg_context { - __le16 ContextType; /* 0x100 */ - __le16 DataLength; - __le32 Reserved; - __u8 Name[16]; /* POSIX ctxt GUID 93AD25509CB411E7B42383DE968BCD7C */ -} __packed; - -struct smb2_negotiate_rsp { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize; /* Must be 65 */ - __le16 SecurityMode; - __le16 DialectRevision; - __le16 NegotiateContextCount; /* Prior to SMB3.1.1 was Reserved & MBZ */ - __u8 ServerGUID[16]; - __le32 Capabilities; - __le32 MaxTransactSize; - __le32 MaxReadSize; - __le32 MaxWriteSize; - __le64 SystemTime; /* MBZ */ - __le64 ServerStartTime; - __le16 SecurityBufferOffset; - __le16 SecurityBufferLength; - __le32 NegotiateContextOffset; /* Pre:SMB3.1.1 was reserved/ignored */ - __u8 Buffer[1]; /* variable length GSS security buffer */ -} __packed; - -/* Flags */ -#define SMB2_SESSION_REQ_FLAG_BINDING 0x01 -#define SMB2_SESSION_REQ_FLAG_ENCRYPT_DATA 0x04 - -struct smb2_sess_setup_req { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize; /* Must be 25 */ - __u8 Flags; - __u8 SecurityMode; - __le32 Capabilities; - __le32 Channel; - __le16 SecurityBufferOffset; - __le16 SecurityBufferLength; - __u64 PreviousSessionId; - __u8 Buffer[1]; /* variable length GSS security buffer */ -} __packed; - -/* Currently defined SessionFlags */ -#define SMB2_SESSION_FLAG_IS_GUEST 0x0001 -#define SMB2_SESSION_FLAG_IS_NULL 0x0002 -#define SMB2_SESSION_FLAG_ENCRYPT_DATA 0x0004 -struct smb2_sess_setup_rsp { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize; /* Must be 9 */ - __le16 SessionFlags; - __le16 SecurityBufferOffset; - __le16 SecurityBufferLength; - __u8 Buffer[1]; /* variable length GSS security buffer */ -} __packed; - -struct smb2_logoff_req { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize; /* Must be 4 */ - __le16 Reserved; -} __packed; - -struct smb2_logoff_rsp { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize; /* Must be 4 */ - __le16 Reserved; -} __packed; - -/* Flags/Reserved for SMB3.1.1 */ -#define SMB2_TREE_CONNECT_FLAG_CLUSTER_RECONNECT cpu_to_le16(0x0001) -#define SMB2_TREE_CONNECT_FLAG_REDIRECT_TO_OWNER cpu_to_le16(0x0002) -#define SMB2_TREE_CONNECT_FLAG_EXTENSION_PRESENT cpu_to_le16(0x0004) - -struct smb2_tree_connect_req { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize; /* Must be 9 */ - __le16 Flags; /* Reserved MBZ for dialects prior to SMB3.1.1 */ - __le16 PathOffset; - __le16 PathLength; - __u8 Buffer[1]; /* variable length */ -} __packed; - -/* See MS-SMB2 section 2.2.9.2 */ -/* Context Types */ -#define SMB2_RESERVED_TREE_CONNECT_CONTEXT_ID 0x0000 -#define SMB2_REMOTED_IDENTITY_TREE_CONNECT_CONTEXT_ID cpu_to_le16(0x0001) - -struct tree_connect_contexts { - __le16 ContextType; - __le16 DataLength; - __le32 Reserved; - __u8 Data[]; -} __packed; - -/* Remoted identity tree connect context structures - see MS-SMB2 2.2.9.2.1 */ -struct smb3_blob_data { - __le16 BlobSize; - __u8 BlobData[]; -} __packed; - -/* Valid values for Attr */ -#define SE_GROUP_MANDATORY 0x00000001 -#define SE_GROUP_ENABLED_BY_DEFAULT 0x00000002 -#define SE_GROUP_ENABLED 0x00000004 -#define SE_GROUP_OWNER 0x00000008 -#define SE_GROUP_USE_FOR_DENY_ONLY 0x00000010 -#define SE_GROUP_INTEGRITY 0x00000020 -#define SE_GROUP_INTEGRITY_ENABLED 0x00000040 -#define SE_GROUP_RESOURCE 0x20000000 -#define SE_GROUP_LOGON_ID 0xC0000000 - -/* struct sid_attr_data is SidData array in BlobData format then le32 Attr */ - -struct sid_array_data { - __le16 SidAttrCount; - /* SidAttrList - array of sid_attr_data structs */ -} __packed; - -struct luid_attr_data { - -} __packed; - -/* - * struct privilege_data is the same as BLOB_DATA - see MS-SMB2 2.2.9.2.1.5 - * but with size of LUID_ATTR_DATA struct and BlobData set to LUID_ATTR DATA - */ - -struct privilege_array_data { - __le16 PrivilegeCount; - /* array of privilege_data structs */ -} __packed; - -struct remoted_identity_tcon_context { - __le16 TicketType; /* must be 0x0001 */ - __le16 TicketSize; /* total size of this struct */ - __le16 User; /* offset to SID_ATTR_DATA struct with user info */ - __le16 UserName; /* offset to null terminated Unicode username string */ - __le16 Domain; /* offset to null terminated Unicode domain name */ - __le16 Groups; /* offset to SID_ARRAY_DATA struct with group info */ - __le16 RestrictedGroups; /* similar to above */ - __le16 Privileges; /* offset to PRIVILEGE_ARRAY_DATA struct */ - __le16 PrimaryGroup; /* offset to SID_ARRAY_DATA struct */ - __le16 Owner; /* offset to BLOB_DATA struct */ - __le16 DefaultDacl; /* offset to BLOB_DATA struct */ - __le16 DeviceGroups; /* offset to SID_ARRAY_DATA struct */ - __le16 UserClaims; /* offset to BLOB_DATA struct */ - __le16 DeviceClaims; /* offset to BLOB_DATA struct */ - __u8 TicketInfo[]; /* variable length buf - remoted identity data */ -} __packed; - -struct smb2_tree_connect_req_extension { - __le32 TreeConnectContextOffset; - __le16 TreeConnectContextCount; - __u8 Reserved[10]; - __u8 PathName[]; /* variable sized array */ - /* followed by array of TreeConnectContexts */ -} __packed; - -struct smb2_tree_connect_rsp { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize; /* Must be 16 */ - __u8 ShareType; /* see below */ - __u8 Reserved; - __le32 ShareFlags; /* see below */ - __le32 Capabilities; /* see below */ - __le32 MaximalAccess; -} __packed; - -/* Possible ShareType values */ -#define SMB2_SHARE_TYPE_DISK 0x01 -#define SMB2_SHARE_TYPE_PIPE 0x02 -#define SMB2_SHARE_TYPE_PRINT 0x03 - -/* - * Possible ShareFlags - exactly one and only one of the first 4 caching flags - * must be set (any of the remaining, SHI1005, flags may be set individually - * or in combination. - */ -#define SMB2_SHAREFLAG_MANUAL_CACHING 0x00000000 -#define SMB2_SHAREFLAG_AUTO_CACHING 0x00000010 -#define SMB2_SHAREFLAG_VDO_CACHING 0x00000020 -#define SMB2_SHAREFLAG_NO_CACHING 0x00000030 -#define SHI1005_FLAGS_DFS 0x00000001 -#define SHI1005_FLAGS_DFS_ROOT 0x00000002 -#define SHI1005_FLAGS_RESTRICT_EXCLUSIVE_OPENS 0x00000100 -#define SHI1005_FLAGS_FORCE_SHARED_DELETE 0x00000200 -#define SHI1005_FLAGS_ALLOW_NAMESPACE_CACHING 0x00000400 -#define SHI1005_FLAGS_ACCESS_BASED_DIRECTORY_ENUM 0x00000800 -#define SHI1005_FLAGS_FORCE_LEVELII_OPLOCK 0x00001000 -#define SHI1005_FLAGS_ENABLE_HASH_V1 0x00002000 -#define SHI1005_FLAGS_ENABLE_HASH_V2 0x00004000 -#define SHI1005_FLAGS_ENCRYPT_DATA 0x00008000 -#define SMB2_SHAREFLAG_IDENTITY_REMOTING 0x00040000 /* 3.1.1 */ -#define SMB2_SHAREFLAG_COMPRESS_DATA 0x00100000 /* 3.1.1 */ -#define SHI1005_FLAGS_ALL 0x0014FF33 - -/* Possible share capabilities */ -#define SMB2_SHARE_CAP_DFS cpu_to_le32(0x00000008) /* all dialects */ -#define SMB2_SHARE_CAP_CONTINUOUS_AVAILABILITY cpu_to_le32(0x00000010) /* 3.0 */ -#define SMB2_SHARE_CAP_SCALEOUT cpu_to_le32(0x00000020) /* 3.0 */ -#define SMB2_SHARE_CAP_CLUSTER cpu_to_le32(0x00000040) /* 3.0 */ -#define SMB2_SHARE_CAP_ASYMMETRIC cpu_to_le32(0x00000080) /* 3.02 */ -#define SMB2_SHARE_CAP_REDIRECT_TO_OWNER cpu_to_le32(0x00000100) /* 3.1.1 */ - -struct smb2_tree_disconnect_req { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize; /* Must be 4 */ - __le16 Reserved; -} __packed; - -struct smb2_tree_disconnect_rsp { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize; /* Must be 4 */ - __le16 Reserved; -} __packed; - -/* File Attrubutes */ -#define FILE_ATTRIBUTE_READONLY 0x00000001 -#define FILE_ATTRIBUTE_HIDDEN 0x00000002 -#define FILE_ATTRIBUTE_SYSTEM 0x00000004 -#define FILE_ATTRIBUTE_DIRECTORY 0x00000010 -#define FILE_ATTRIBUTE_ARCHIVE 0x00000020 -#define FILE_ATTRIBUTE_NORMAL 0x00000080 -#define FILE_ATTRIBUTE_TEMPORARY 0x00000100 -#define FILE_ATTRIBUTE_SPARSE_FILE 0x00000200 -#define FILE_ATTRIBUTE_REPARSE_POINT 0x00000400 -#define FILE_ATTRIBUTE_COMPRESSED 0x00000800 -#define FILE_ATTRIBUTE_OFFLINE 0x00001000 -#define FILE_ATTRIBUTE_NOT_CONTENT_INDEXED 0x00002000 -#define FILE_ATTRIBUTE_ENCRYPTED 0x00004000 -#define FILE_ATTRIBUTE_INTEGRITY_STREAM 0x00008000 -#define FILE_ATTRIBUTE_NO_SCRUB_DATA 0x00020000 - -/* Oplock levels */ -#define SMB2_OPLOCK_LEVEL_NONE 0x00 -#define SMB2_OPLOCK_LEVEL_II 0x01 -#define SMB2_OPLOCK_LEVEL_EXCLUSIVE 0x08 -#define SMB2_OPLOCK_LEVEL_BATCH 0x09 -#define SMB2_OPLOCK_LEVEL_LEASE 0xFF -/* Non-spec internal type */ -#define SMB2_OPLOCK_LEVEL_NOCHANGE 0x99 - -/* Desired Access Flags */ -#define FILE_READ_DATA_LE cpu_to_le32(0x00000001) -#define FILE_WRITE_DATA_LE cpu_to_le32(0x00000002) -#define FILE_APPEND_DATA_LE cpu_to_le32(0x00000004) -#define FILE_READ_EA_LE cpu_to_le32(0x00000008) -#define FILE_WRITE_EA_LE cpu_to_le32(0x00000010) -#define FILE_EXECUTE_LE cpu_to_le32(0x00000020) -#define FILE_READ_ATTRIBUTES_LE cpu_to_le32(0x00000080) -#define FILE_WRITE_ATTRIBUTES_LE cpu_to_le32(0x00000100) -#define FILE_DELETE_LE cpu_to_le32(0x00010000) -#define FILE_READ_CONTROL_LE cpu_to_le32(0x00020000) -#define FILE_WRITE_DAC_LE cpu_to_le32(0x00040000) -#define FILE_WRITE_OWNER_LE cpu_to_le32(0x00080000) -#define FILE_SYNCHRONIZE_LE cpu_to_le32(0x00100000) -#define FILE_ACCESS_SYSTEM_SECURITY_LE cpu_to_le32(0x01000000) -#define FILE_MAXIMAL_ACCESS_LE cpu_to_le32(0x02000000) -#define FILE_GENERIC_ALL_LE cpu_to_le32(0x10000000) -#define FILE_GENERIC_EXECUTE_LE cpu_to_le32(0x20000000) -#define FILE_GENERIC_WRITE_LE cpu_to_le32(0x40000000) -#define FILE_GENERIC_READ_LE cpu_to_le32(0x80000000) - -/* ShareAccess Flags */ -#define FILE_SHARE_READ_LE cpu_to_le32(0x00000001) -#define FILE_SHARE_WRITE_LE cpu_to_le32(0x00000002) -#define FILE_SHARE_DELETE_LE cpu_to_le32(0x00000004) -#define FILE_SHARE_ALL_LE cpu_to_le32(0x00000007) - -/* CreateDisposition Flags */ -#define FILE_SUPERSEDE_LE cpu_to_le32(0x00000000) -#define FILE_OPEN_LE cpu_to_le32(0x00000001) -#define FILE_CREATE_LE cpu_to_le32(0x00000002) -#define FILE_OPEN_IF_LE cpu_to_le32(0x00000003) -#define FILE_OVERWRITE_LE cpu_to_le32(0x00000004) -#define FILE_OVERWRITE_IF_LE cpu_to_le32(0x00000005) - -/* CreateOptions Flags */ -#define FILE_DIRECTORY_FILE_LE cpu_to_le32(0x00000001) -/* same as #define CREATE_NOT_FILE_LE cpu_to_le32(0x00000001) */ -#define FILE_WRITE_THROUGH_LE cpu_to_le32(0x00000002) -#define FILE_SEQUENTIAL_ONLY_LE cpu_to_le32(0x00000004) -#define FILE_NO_INTERMEDIATE_BUFFERRING_LE cpu_to_le32(0x00000008) -#define FILE_SYNCHRONOUS_IO_ALERT_LE cpu_to_le32(0x00000010) -#define FILE_SYNCHRONOUS_IO_NON_ALERT_LE cpu_to_le32(0x00000020) -#define FILE_NON_DIRECTORY_FILE_LE cpu_to_le32(0x00000040) -#define FILE_COMPLETE_IF_OPLOCKED_LE cpu_to_le32(0x00000100) -#define FILE_NO_EA_KNOWLEDGE_LE cpu_to_le32(0x00000200) -#define FILE_RANDOM_ACCESS_LE cpu_to_le32(0x00000800) -#define FILE_DELETE_ON_CLOSE_LE cpu_to_le32(0x00001000) -#define FILE_OPEN_BY_FILE_ID_LE cpu_to_le32(0x00002000) -#define FILE_OPEN_FOR_BACKUP_INTENT_LE cpu_to_le32(0x00004000) -#define FILE_NO_COMPRESSION_LE cpu_to_le32(0x00008000) -#define FILE_RESERVE_OPFILTER_LE cpu_to_le32(0x00100000) -#define FILE_OPEN_REPARSE_POINT_LE cpu_to_le32(0x00200000) -#define FILE_OPEN_NO_RECALL_LE cpu_to_le32(0x00400000) -#define FILE_OPEN_FOR_FREE_SPACE_QUERY_LE cpu_to_le32(0x00800000) - -#define FILE_READ_RIGHTS_LE (FILE_READ_DATA_LE | FILE_READ_EA_LE \ - | FILE_READ_ATTRIBUTES_LE) -#define FILE_WRITE_RIGHTS_LE (FILE_WRITE_DATA_LE | FILE_APPEND_DATA_LE \ - | FILE_WRITE_EA_LE | FILE_WRITE_ATTRIBUTES_LE) -#define FILE_EXEC_RIGHTS_LE (FILE_EXECUTE_LE) - -/* Impersonation Levels. See MS-WPO section 9.7 and MSDN-IMPERS */ -#define IL_ANONYMOUS cpu_to_le32(0x00000000) -#define IL_IDENTIFICATION cpu_to_le32(0x00000001) -#define IL_IMPERSONATION cpu_to_le32(0x00000002) -#define IL_DELEGATE cpu_to_le32(0x00000003) - -/* Create Context Values */ -#define SMB2_CREATE_EA_BUFFER "ExtA" /* extended attributes */ -#define SMB2_CREATE_SD_BUFFER "SecD" /* security descriptor */ -#define SMB2_CREATE_DURABLE_HANDLE_REQUEST "DHnQ" -#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT "DHnC" -#define SMB2_CREATE_ALLOCATION_SIZE "AISi" -#define SMB2_CREATE_QUERY_MAXIMAL_ACCESS_REQUEST "MxAc" -#define SMB2_CREATE_TIMEWARP_REQUEST "TWrp" -#define SMB2_CREATE_QUERY_ON_DISK_ID "QFid" -#define SMB2_CREATE_REQUEST_LEASE "RqLs" -#define SMB2_CREATE_DURABLE_HANDLE_REQUEST_V2 "DH2Q" -#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT_V2 "DH2C" -#define SMB2_CREATE_APP_INSTANCE_ID 0x45BCA66AEFA7F74A9008FA462E144D74 -#define SMB2_CREATE_APP_INSTANCE_VERSION 0xB982D0B73B56074FA07B524A8116A010 -#define SVHDX_OPEN_DEVICE_CONTEX 0x9CCBCF9E04C1E643980E158DA1F6EC83 -#define SMB2_CREATE_TAG_POSIX 0x93AD25509CB411E7B42383DE968BCD7C - -/* Flag (SMB3 open response) values */ -#define SMB2_CREATE_FLAG_REPARSEPOINT 0x01 - /* * Maximum number of iovs we need for an open/create request. * [0] : struct smb2_create_req @@ -808,26 +128,6 @@ struct smb2_tree_disconnect_rsp { */ #define SMB2_CREATE_IOV_SIZE 8 -struct smb2_create_req { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize; /* Must be 57 */ - __u8 SecurityFlags; - __u8 RequestedOplockLevel; - __le32 ImpersonationLevel; - __le64 SmbCreateFlags; - __le64 Reserved; - __le32 DesiredAccess; - __le32 FileAttributes; - __le32 ShareAccess; - __le32 CreateDisposition; - __le32 CreateOptions; - __le16 NameOffset; - __le16 NameLength; - __le32 CreateContextsOffset; - __le32 CreateContextsLength; - __u8 Buffer[]; -} __packed; - /* * Maximum size of a SMB2_CREATE response is 64 (smb2 header) + * 88 (fixed part of create response) + 520 (path) + 208 (contexts) + @@ -835,37 +135,6 @@ struct smb2_create_req { */ #define MAX_SMB2_CREATE_RESPONSE_SIZE 880 -struct smb2_create_rsp { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize; /* Must be 89 */ - __u8 OplockLevel; - __u8 Flag; /* 0x01 if reparse point */ - __le32 CreateAction; - __le64 CreationTime; - __le64 LastAccessTime; - __le64 LastWriteTime; - __le64 ChangeTime; - __le64 AllocationSize; - __le64 EndofFile; - __le32 FileAttributes; - __le32 Reserved2; - __u64 PersistentFileId; /* opaque endianness */ - __u64 VolatileFileId; /* opaque endianness */ - __le32 CreateContextsOffset; - __le32 CreateContextsLength; - __u8 Buffer[1]; -} __packed; - -struct create_context { - __le32 Next; - __le16 NameOffset; - __le16 NameLength; - __le16 Reserved; - __le16 DataOffset; - __le32 DataLength; - __u8 Buffer[]; -} __packed; - #define SMB2_LEASE_READ_CACHING_HE 0x01 #define SMB2_LEASE_HANDLE_CACHING_HE 0x02 #define SMB2_LEASE_WRITE_CACHING_HE 0x04 @@ -1211,7 +480,7 @@ struct duplicate_extents_to_file { #define SMB2_IOCTL_IOV_SIZE 2 struct smb2_ioctl_req { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 57 */ __u16 Reserved; __le32 CtlCode; @@ -1229,7 +498,7 @@ struct smb2_ioctl_req { } __packed; struct smb2_ioctl_rsp { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 57 */ __u16 Reserved; __le32 CtlCode; @@ -1244,161 +513,6 @@ struct smb2_ioctl_rsp { /* char * buffer[] */ } __packed; -/* Currently defined values for close flags */ -#define SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB cpu_to_le16(0x0001) -struct smb2_close_req { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize; /* Must be 24 */ - __le16 Flags; - __le32 Reserved; - __u64 PersistentFileId; /* opaque endianness */ - __u64 VolatileFileId; /* opaque endianness */ -} __packed; - -/* - * Maximum size of a SMB2_CLOSE response is 64 (smb2 header) + 60 (data) - */ -#define MAX_SMB2_CLOSE_RESPONSE_SIZE 124 - -struct smb2_close_rsp { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize; /* 60 */ - __le16 Flags; - __le32 Reserved; - __le64 CreationTime; - __le64 LastAccessTime; - __le64 LastWriteTime; - __le64 ChangeTime; - __le64 AllocationSize; /* Beginning of FILE_STANDARD_INFO equivalent */ - __le64 EndOfFile; - __le32 Attributes; -} __packed; - -struct smb2_flush_req { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize; /* Must be 24 */ - __le16 Reserved1; - __le32 Reserved2; - __u64 PersistentFileId; /* opaque endianness */ - __u64 VolatileFileId; /* opaque endianness */ -} __packed; - -struct smb2_flush_rsp { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize; - __le16 Reserved; -} __packed; - -/* For read request Flags field below, following flag is defined for SMB3.02 */ -#define SMB2_READFLAG_READ_UNBUFFERED 0x01 -#define SMB2_READFLAG_REQUEST_COMPRESSED 0x02 /* See MS-SMB2 2.2.19 */ - -/* Channel field for read and write: exactly one of following flags can be set*/ -#define SMB2_CHANNEL_NONE cpu_to_le32(0x00000000) -#define SMB2_CHANNEL_RDMA_V1 cpu_to_le32(0x00000001) /* SMB3 or later */ -#define SMB2_CHANNEL_RDMA_V1_INVALIDATE cpu_to_le32(0x00000002) /* >= SMB3.02 */ -#define SMB2_CHANNEL_RDMA_TRANSFORM cpu_to_le32(0x00000003) /* >= SMB3.02, only used on write */ - -/* SMB2 read request without RFC1001 length at the beginning */ -struct smb2_read_plain_req { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize; /* Must be 49 */ - __u8 Padding; /* offset from start of SMB2 header to place read */ - __u8 Flags; /* MBZ unless SMB3.02 or later */ - __le32 Length; - __le64 Offset; - __u64 PersistentFileId; /* opaque endianness */ - __u64 VolatileFileId; /* opaque endianness */ - __le32 MinimumCount; - __le32 Channel; /* MBZ except for SMB3 or later */ - __le32 RemainingBytes; - __le16 ReadChannelInfoOffset; - __le16 ReadChannelInfoLength; - __u8 Buffer[1]; -} __packed; - -/* Read flags */ -#define SMB2_READFLAG_RESPONSE_NONE 0x00000000 -#define SMB2_READFLAG_RESPONSE_RDMA_TRANSFORM 0x00000001 - -struct smb2_read_rsp { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize; /* Must be 17 */ - __u8 DataOffset; - __u8 Reserved; - __le32 DataLength; - __le32 DataRemaining; - __u32 Flags; - __u8 Buffer[1]; -} __packed; - -/* For write request Flags field below the following flags are defined: */ -#define SMB2_WRITEFLAG_WRITE_THROUGH 0x00000001 /* SMB2.1 or later */ -#define SMB2_WRITEFLAG_WRITE_UNBUFFERED 0x00000002 /* SMB3.02 or later */ - -struct smb2_write_req { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize; /* Must be 49 */ - __le16 DataOffset; /* offset from start of SMB2 header to write data */ - __le32 Length; - __le64 Offset; - __u64 PersistentFileId; /* opaque endianness */ - __u64 VolatileFileId; /* opaque endianness */ - __le32 Channel; /* MBZ unless SMB3.02 or later */ - __le32 RemainingBytes; - __le16 WriteChannelInfoOffset; - __le16 WriteChannelInfoLength; - __le32 Flags; - __u8 Buffer[1]; -} __packed; - -struct smb2_write_rsp { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize; /* Must be 17 */ - __u8 DataOffset; - __u8 Reserved; - __le32 DataLength; - __le32 DataRemaining; - __u32 Reserved2; - __u8 Buffer[1]; -} __packed; - -/* notify flags */ -#define SMB2_WATCH_TREE 0x0001 - -/* notify completion filter flags. See MS-FSCC 2.6 and MS-SMB2 2.2.35 */ -#define FILE_NOTIFY_CHANGE_FILE_NAME 0x00000001 -#define FILE_NOTIFY_CHANGE_DIR_NAME 0x00000002 -#define FILE_NOTIFY_CHANGE_ATTRIBUTES 0x00000004 -#define FILE_NOTIFY_CHANGE_SIZE 0x00000008 -#define FILE_NOTIFY_CHANGE_LAST_WRITE 0x00000010 -#define FILE_NOTIFY_CHANGE_LAST_ACCESS 0x00000020 -#define FILE_NOTIFY_CHANGE_CREATION 0x00000040 -#define FILE_NOTIFY_CHANGE_EA 0x00000080 -#define FILE_NOTIFY_CHANGE_SECURITY 0x00000100 -#define FILE_NOTIFY_CHANGE_STREAM_NAME 0x00000200 -#define FILE_NOTIFY_CHANGE_STREAM_SIZE 0x00000400 -#define FILE_NOTIFY_CHANGE_STREAM_WRITE 0x00000800 - -struct smb2_change_notify_req { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize; - __le16 Flags; - __le32 OutputBufferLength; - __u64 PersistentFileId; /* opaque endianness */ - __u64 VolatileFileId; /* opaque endianness */ - __le32 CompletionFilter; - __u32 Reserved; -} __packed; - -struct smb2_change_notify_rsp { - struct smb2_sync_hdr sync_hdr; - __le16 StructureSize; /* Must be 9 */ - __le16 OutputBufferOffset; - __le32 OutputBufferLength; - __u8 Buffer[1]; /* array of file notify structs */ -} __packed; - #define SMB2_LOCKFLAG_SHARED_LOCK 0x0001 #define SMB2_LOCKFLAG_EXCLUSIVE_LOCK 0x0002 #define SMB2_LOCKFLAG_UNLOCK 0x0004 @@ -1412,7 +526,7 @@ struct smb2_lock_element { } __packed; struct smb2_lock_req { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 48 */ __le16 LockCount; /* @@ -1427,19 +541,19 @@ struct smb2_lock_req { } __packed; struct smb2_lock_rsp { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 4 */ __le16 Reserved; } __packed; struct smb2_echo_req { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 4 */ __u16 Reserved; } __packed; struct smb2_echo_rsp { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 4 */ __u16 Reserved; } __packed; @@ -1469,7 +583,7 @@ struct smb2_echo_rsp { */ struct smb2_query_directory_req { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 33 */ __u8 FileInformationClass; __u8 Flags; @@ -1483,7 +597,7 @@ struct smb2_query_directory_req { } __packed; struct smb2_query_directory_rsp { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 9 */ __le16 OutputBufferOffset; __le32 OutputBufferLength; @@ -1516,7 +630,7 @@ struct smb2_query_directory_rsp { #define SL_INDEX_SPECIFIED 0x00000004 struct smb2_query_info_req { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 41 */ __u8 InfoType; __u8 FileInfoClass; @@ -1532,7 +646,7 @@ struct smb2_query_info_req { } __packed; struct smb2_query_info_rsp { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 9 */ __le16 OutputBufferOffset; __le32 OutputBufferLength; @@ -1549,7 +663,7 @@ struct smb2_query_info_rsp { #define SMB2_SET_INFO_IOV_SIZE 3 struct smb2_set_info_req { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 33 */ __u8 InfoType; __u8 FileInfoClass; @@ -1563,12 +677,12 @@ struct smb2_set_info_req { } __packed; struct smb2_set_info_rsp { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 2 */ } __packed; struct smb2_oplock_break { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 24 */ __u8 OplockLevel; __u8 Reserved; @@ -1580,7 +694,7 @@ struct smb2_oplock_break { #define SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED cpu_to_le32(0x01) struct smb2_lease_break { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 44 */ __le16 Epoch; __le32 Flags; @@ -1593,7 +707,7 @@ struct smb2_lease_break { } __packed; struct smb2_lease_ack { - struct smb2_sync_hdr sync_hdr; + struct smb2_hdr hdr; __le16 StructureSize; /* Must be 36 */ __le16 Reserved; __le32 Flags; diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 263767f644f8..096fada16ebd 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -1,6 +1,5 @@ /* SPDX-License-Identifier: LGPL-2.1 */ /* - * fs/cifs/smb2proto.h * * Copyright (c) International Business Machines Corp., 2002, 2011 * Etersoft, 2012 @@ -26,7 +25,7 @@ extern int smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *server); extern unsigned int smb2_calc_size(void *buf, struct TCP_Server_Info *server); extern char *smb2_get_data_area_len(int *off, int *len, - struct smb2_sync_hdr *shdr); + struct smb2_hdr *shdr); extern __le16 *cifs_convert_path_to_utf16(const char *from, struct cifs_sb_info *cifs_sb); diff --git a/fs/cifs/smb2status.h b/fs/cifs/smb2status.h index 0215ef36e240..a9e958166fc5 100644 --- a/fs/cifs/smb2status.h +++ b/fs/cifs/smb2status.h @@ -1,6 +1,5 @@ /* SPDX-License-Identifier: LGPL-2.1 */ /* - * fs/cifs/smb2status.h * * SMB2 Status code (network error) definitions * Definitions are from MS-ERREF diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index 6f7952ea4941..2bf047b390a9 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: LGPL-2.1 /* - * fs/cifs/smb2transport.c * * Copyright (C) International Business Machines Corp., 2002, 2011 * Etersoft, 2012 @@ -20,7 +19,6 @@ #include <linux/mempool.h> #include <linux/highmem.h> #include <crypto/aead.h> -#include "smb2pdu.h" #include "cifsglob.h" #include "cifsproto.h" #include "smb2proto.h" @@ -214,14 +212,14 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, unsigned char smb2_signature[SMB2_HMACSHA256_SIZE]; unsigned char *sigptr = smb2_signature; struct kvec *iov = rqst->rq_iov; - struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)iov[0].iov_base; + struct smb2_hdr *shdr = (struct smb2_hdr *)iov[0].iov_base; struct cifs_ses *ses; struct shash_desc *shash; struct crypto_shash *hash; struct sdesc *sdesc = NULL; struct smb_rqst drqst; - ses = smb2_find_smb_ses(server, shdr->SessionId); + ses = smb2_find_smb_ses(server, le64_to_cpu(shdr->SessionId)); if (!ses) { cifs_server_dbg(VFS, "%s: Could not find session\n", __func__); return 0; @@ -535,14 +533,14 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, unsigned char smb3_signature[SMB2_CMACAES_SIZE]; unsigned char *sigptr = smb3_signature; struct kvec *iov = rqst->rq_iov; - struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)iov[0].iov_base; + struct smb2_hdr *shdr = (struct smb2_hdr *)iov[0].iov_base; struct shash_desc *shash; struct crypto_shash *hash; struct sdesc *sdesc = NULL; struct smb_rqst drqst; u8 key[SMB3_SIGN_KEY_SIZE]; - rc = smb2_get_sign_key(shdr->SessionId, server, key); + rc = smb2_get_sign_key(le64_to_cpu(shdr->SessionId), server, key); if (rc) return 0; @@ -612,12 +610,12 @@ static int smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server) { int rc = 0; - struct smb2_sync_hdr *shdr; + struct smb2_hdr *shdr; struct smb2_sess_setup_req *ssr; bool is_binding; bool is_signed; - shdr = (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base; + shdr = (struct smb2_hdr *)rqst->rq_iov[0].iov_base; ssr = (struct smb2_sess_setup_req *)shdr; is_binding = shdr->Command == SMB2_SESSION_SETUP && @@ -643,8 +641,8 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) { unsigned int rc; char server_response_sig[SMB2_SIGNATURE_SIZE]; - struct smb2_sync_hdr *shdr = - (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base; + struct smb2_hdr *shdr = + (struct smb2_hdr *)rqst->rq_iov[0].iov_base; if ((shdr->Command == SMB2_NEGOTIATE) || (shdr->Command == SMB2_SESSION_SETUP) || @@ -690,7 +688,7 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) */ static inline void smb2_seq_num_into_buf(struct TCP_Server_Info *server, - struct smb2_sync_hdr *shdr) + struct smb2_hdr *shdr) { unsigned int i, num = le16_to_cpu(shdr->CreditCharge); @@ -701,7 +699,7 @@ smb2_seq_num_into_buf(struct TCP_Server_Info *server, } static struct mid_q_entry * -smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr, +smb2_mid_entry_alloc(const struct smb2_hdr *shdr, struct TCP_Server_Info *server) { struct mid_q_entry *temp; @@ -733,14 +731,15 @@ smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr, atomic_inc(&midCount); temp->mid_state = MID_REQUEST_ALLOCATED; - trace_smb3_cmd_enter(shdr->TreeId, shdr->SessionId, - le16_to_cpu(shdr->Command), temp->mid); + trace_smb3_cmd_enter(le32_to_cpu(shdr->Id.SyncId.TreeId), + le64_to_cpu(shdr->SessionId), + le16_to_cpu(shdr->Command), temp->mid); return temp; } static int smb2_get_mid_entry(struct cifs_ses *ses, struct TCP_Server_Info *server, - struct smb2_sync_hdr *shdr, struct mid_q_entry **mid) + struct smb2_hdr *shdr, struct mid_q_entry **mid) { if (server->tcpStatus == CifsExiting) return -ENOENT; @@ -808,8 +807,8 @@ smb2_setup_request(struct cifs_ses *ses, struct TCP_Server_Info *server, struct smb_rqst *rqst) { int rc; - struct smb2_sync_hdr *shdr = - (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base; + struct smb2_hdr *shdr = + (struct smb2_hdr *)rqst->rq_iov[0].iov_base; struct mid_q_entry *mid; smb2_seq_num_into_buf(server, shdr); @@ -834,8 +833,8 @@ struct mid_q_entry * smb2_setup_async_request(struct TCP_Server_Info *server, struct smb_rqst *rqst) { int rc; - struct smb2_sync_hdr *shdr = - (struct smb2_sync_hdr *)rqst->rq_iov[0].iov_base; + struct smb2_hdr *shdr = + (struct smb2_hdr *)rqst->rq_iov[0].iov_base; struct mid_q_entry *mid; if (server->tcpStatus == CifsNeedNegotiate && diff --git a/fs/cifs/smberr.h b/fs/cifs/smberr.h index 60189efb3236..aeffdad829e2 100644 --- a/fs/cifs/smberr.h +++ b/fs/cifs/smberr.h @@ -1,6 +1,5 @@ /* SPDX-License-Identifier: LGPL-2.1 */ /* - * fs/cifs/smberr.h * * Copyright (c) International Business Machines Corp., 2002,2004 * Author(s): Steve French (sfrench@us.ibm.com) diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h index dafcb6ab050d..6cecf302dcfd 100644 --- a/fs/cifs/trace.h +++ b/fs/cifs/trace.h @@ -11,6 +11,8 @@ #define _CIFS_TRACE_H #include <linux/tracepoint.h> +#include <linux/net.h> +#include <linux/inet.h> /* * Please use this 3-part article as a reference for writing new tracepoints: @@ -854,6 +856,75 @@ DEFINE_EVENT(smb3_lease_err_class, smb3_##name, \ DEFINE_SMB3_LEASE_ERR_EVENT(lease_err); +DECLARE_EVENT_CLASS(smb3_connect_class, + TP_PROTO(char *hostname, + __u64 conn_id, + const struct __kernel_sockaddr_storage *dst_addr), + TP_ARGS(hostname, conn_id, dst_addr), + TP_STRUCT__entry( + __string(hostname, hostname) + __field(__u64, conn_id) + __array(__u8, dst_addr, sizeof(struct sockaddr_storage)) + ), + TP_fast_assign( + struct sockaddr_storage *pss = NULL; + + __entry->conn_id = conn_id; + pss = (struct sockaddr_storage *)__entry->dst_addr; + *pss = *dst_addr; + __assign_str(hostname, hostname); + ), + TP_printk("conn_id=0x%llx server=%s addr=%pISpsfc", + __entry->conn_id, + __get_str(hostname), + __entry->dst_addr) +) + +#define DEFINE_SMB3_CONNECT_EVENT(name) \ +DEFINE_EVENT(smb3_connect_class, smb3_##name, \ + TP_PROTO(char *hostname, \ + __u64 conn_id, \ + const struct __kernel_sockaddr_storage *addr), \ + TP_ARGS(hostname, conn_id, addr)) + +DEFINE_SMB3_CONNECT_EVENT(connect_done); + +DECLARE_EVENT_CLASS(smb3_connect_err_class, + TP_PROTO(char *hostname, __u64 conn_id, + const struct __kernel_sockaddr_storage *dst_addr, int rc), + TP_ARGS(hostname, conn_id, dst_addr, rc), + TP_STRUCT__entry( + __string(hostname, hostname) + __field(__u64, conn_id) + __array(__u8, dst_addr, sizeof(struct sockaddr_storage)) + __field(int, rc) + ), + TP_fast_assign( + struct sockaddr_storage *pss = NULL; + + __entry->conn_id = conn_id; + __entry->rc = rc; + pss = (struct sockaddr_storage *)__entry->dst_addr; + *pss = *dst_addr; + __assign_str(hostname, hostname); + ), + TP_printk("rc=%d conn_id=0x%llx server=%s addr=%pISpsfc", + __entry->rc, + __entry->conn_id, + __get_str(hostname), + __entry->dst_addr) +) + +#define DEFINE_SMB3_CONNECT_ERR_EVENT(name) \ +DEFINE_EVENT(smb3_connect_err_class, smb3_##name, \ + TP_PROTO(char *hostname, \ + __u64 conn_id, \ + const struct __kernel_sockaddr_storage *addr, \ + int rc), \ + TP_ARGS(hostname, conn_id, addr, rc)) + +DEFINE_SMB3_CONNECT_ERR_EVENT(connect_err); + DECLARE_EVENT_CLASS(smb3_reconnect_class, TP_PROTO(__u64 currmid, __u64 conn_id, diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 75a95de320cf..b7379329b741 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: LGPL-2.1 /* - * fs/cifs/transport.c * * Copyright (C) International Business Machines Corp., 2002,2008 * Author(s): Steve French (sfrench@us.ibm.com) diff --git a/fs/cifs/winucase.c b/fs/cifs/winucase.c index 59b6c577aa0a..2f075b5b50df 100644 --- a/fs/cifs/winucase.c +++ b/fs/cifs/winucase.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - * fs/cifs/winucase.c * * Copyright (c) Jeffrey Layton <jlayton@redhat.com>, 2013 * diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 9ed481e79ce0..7d8b72d67c80 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: LGPL-2.1 /* - * fs/cifs/xattr.c * * Copyright (c) International Business Machines Corp., 2003, 2007 * Author(s): Steve French (sfrench@us.ibm.com) diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c index 06855f6c7902..62a3d2565c26 100644 --- a/fs/coda/cnode.c +++ b/fs/coda/cnode.c @@ -63,9 +63,10 @@ struct inode * coda_iget(struct super_block * sb, struct CodaFid * fid, struct inode *inode; struct coda_inode_info *cii; unsigned long hash = coda_f2i(fid); + umode_t inode_type = coda_inode_type(attr); +retry: inode = iget5_locked(sb, hash, coda_test_inode, coda_set_inode, fid); - if (!inode) return ERR_PTR(-ENOMEM); @@ -75,11 +76,15 @@ struct inode * coda_iget(struct super_block * sb, struct CodaFid * fid, inode->i_ino = hash; /* inode is locked and unique, no need to grab cii->c_lock */ cii->c_mapcount = 0; + coda_fill_inode(inode, attr); unlock_new_inode(inode); + } else if ((inode->i_mode & S_IFMT) != inode_type) { + /* Inode has changed type, mark bad and grab a new one */ + remove_inode_hash(inode); + coda_flag_inode(inode, C_PURGE); + iput(inode); + goto retry; } - - /* always replace the attributes, type might have changed */ - coda_fill_inode(inode, attr); return inode; } diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c index 2e1a5a192074..903ca8fa4b9b 100644 --- a/fs/coda/coda_linux.c +++ b/fs/coda/coda_linux.c @@ -87,28 +87,27 @@ static struct coda_timespec timespec64_to_coda(struct timespec64 ts64) } /* utility functions below */ +umode_t coda_inode_type(struct coda_vattr *attr) +{ + switch (attr->va_type) { + case C_VREG: + return S_IFREG; + case C_VDIR: + return S_IFDIR; + case C_VLNK: + return S_IFLNK; + case C_VNON: + default: + return 0; + } +} + void coda_vattr_to_iattr(struct inode *inode, struct coda_vattr *attr) { - int inode_type; - /* inode's i_flags, i_ino are set by iget - XXX: is this all we need ?? - */ - switch (attr->va_type) { - case C_VNON: - inode_type = 0; - break; - case C_VREG: - inode_type = S_IFREG; - break; - case C_VDIR: - inode_type = S_IFDIR; - break; - case C_VLNK: - inode_type = S_IFLNK; - break; - default: - inode_type = 0; - } + /* inode's i_flags, i_ino are set by iget + * XXX: is this all we need ?? + */ + umode_t inode_type = coda_inode_type(attr); inode->i_mode |= inode_type; if (attr->va_mode != (u_short) -1) diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h index e7b27754ce78..9be281bbcc06 100644 --- a/fs/coda/coda_linux.h +++ b/fs/coda/coda_linux.h @@ -53,10 +53,11 @@ int coda_getattr(struct user_namespace *, const struct path *, struct kstat *, u32, unsigned int); int coda_setattr(struct user_namespace *, struct dentry *, struct iattr *); -/* this file: heloers */ +/* this file: helpers */ char *coda_f2s(struct CodaFid *f); int coda_iscontrol(const char *name, size_t length); +umode_t coda_inode_type(struct coda_vattr *attr); void coda_vattr_to_iattr(struct inode *, struct coda_vattr *); void coda_iattr_to_vattr(struct iattr *, struct coda_vattr *); unsigned short coda_flags_to_cflags(unsigned short); @@ -83,6 +84,9 @@ static __inline__ void coda_flag_inode(struct inode *inode, int flag) { struct coda_inode_info *cii = ITOC(inode); + if (!inode) + return; + spin_lock(&cii->c_lock); cii->c_flags |= flag; spin_unlock(&cii->c_lock); diff --git a/fs/coda/dir.c b/fs/coda/dir.c index d69989c1bac3..328d7a684b63 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -317,13 +317,10 @@ static int coda_rename(struct user_namespace *mnt_userns, struct inode *old_dir, coda_dir_drop_nlink(old_dir); coda_dir_inc_nlink(new_dir); } - coda_dir_update_mtime(old_dir); - coda_dir_update_mtime(new_dir); coda_flag_inode(d_inode(new_dentry), C_VATTR); - } else { - coda_flag_inode(old_dir, C_VATTR); - coda_flag_inode(new_dir, C_VATTR); } + coda_dir_update_mtime(old_dir); + coda_dir_update_mtime(new_dir); } return error; } @@ -499,15 +496,20 @@ out: */ static int coda_dentry_delete(const struct dentry * dentry) { - int flags; + struct inode *inode; + struct coda_inode_info *cii; if (d_really_is_negative(dentry)) return 0; - flags = (ITOC(d_inode(dentry))->c_flags) & C_PURGE; - if (is_bad_inode(d_inode(dentry)) || flags) { + inode = d_inode(dentry); + if (!inode || is_bad_inode(inode)) return 1; - } + + cii = ITOC(inode); + if (cii->c_flags & C_PURGE) + return 1; + return 0; } diff --git a/fs/coda/file.c b/fs/coda/file.c index ef5ca22bfb3e..29dd87be2fb8 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -8,6 +8,7 @@ * to the Coda project. Contact Peter Braam <coda@cs.cmu.edu>. */ +#include <linux/refcount.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/time.h> @@ -28,7 +29,7 @@ #include "coda_int.h" struct coda_vm_ops { - atomic_t refcnt; + refcount_t refcnt; struct file *coda_file; const struct vm_operations_struct *host_vm_ops; struct vm_operations_struct vm_ops; @@ -98,7 +99,7 @@ coda_vm_open(struct vm_area_struct *vma) struct coda_vm_ops *cvm_ops = container_of(vma->vm_ops, struct coda_vm_ops, vm_ops); - atomic_inc(&cvm_ops->refcnt); + refcount_inc(&cvm_ops->refcnt); if (cvm_ops->host_vm_ops && cvm_ops->host_vm_ops->open) cvm_ops->host_vm_ops->open(vma); @@ -113,7 +114,7 @@ coda_vm_close(struct vm_area_struct *vma) if (cvm_ops->host_vm_ops && cvm_ops->host_vm_ops->close) cvm_ops->host_vm_ops->close(vma); - if (atomic_dec_and_test(&cvm_ops->refcnt)) { + if (refcount_dec_and_test(&cvm_ops->refcnt)) { vma->vm_ops = cvm_ops->host_vm_ops; fput(cvm_ops->coda_file); kfree(cvm_ops); @@ -189,7 +190,7 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma) cvm_ops->vm_ops.open = coda_vm_open; cvm_ops->vm_ops.close = coda_vm_close; cvm_ops->coda_file = coda_file; - atomic_set(&cvm_ops->refcnt, 1); + refcount_set(&cvm_ops->refcnt, 1); vma->vm_ops = &cvm_ops->vm_ops; } @@ -238,11 +239,10 @@ int coda_release(struct inode *coda_inode, struct file *coda_file) struct coda_file_info *cfi; struct coda_inode_info *cii; struct inode *host_inode; - int err; cfi = coda_ftoc(coda_file); - err = venus_close(coda_inode->i_sb, coda_i2f(coda_inode), + venus_close(coda_inode->i_sb, coda_i2f(coda_inode), coda_flags, coda_file->f_cred->fsuid); host_inode = file_inode(cfi->cfi_container); diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index 240669f51eac..b39580ad4ce5 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -122,14 +122,10 @@ static ssize_t coda_psdev_write(struct file *file, const char __user *buf, hdr.opcode, hdr.unique); nbytes = size; } - dcbuf = kvmalloc(nbytes, GFP_KERNEL); - if (!dcbuf) { - retval = -ENOMEM; - goto out; - } - if (copy_from_user(dcbuf, buf, nbytes)) { - kvfree(dcbuf); - retval = -EFAULT; + + dcbuf = vmemdup_user(buf, nbytes); + if (IS_ERR(dcbuf)) { + retval = PTR_ERR(dcbuf); goto out; } @@ -388,7 +384,7 @@ MODULE_AUTHOR("Jan Harkes, Peter J. Braam"); MODULE_DESCRIPTION("Coda Distributed File System VFS interface"); MODULE_ALIAS_CHARDEV_MAJOR(CODA_PSDEV_MAJOR); MODULE_LICENSE("GPL"); -MODULE_VERSION("7.0"); +MODULE_VERSION("7.2"); static int __init init_coda(void) { diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index eb3b1898da46..59f6cfd06f96 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c @@ -744,7 +744,8 @@ static int coda_upcall(struct venus_comm *vcp, list_add_tail(&req->uc_chain, &vcp->vc_pending); wake_up_interruptible(&vcp->vc_waitq); - if (req->uc_flags & CODA_REQ_ASYNC) { + /* We can return early on asynchronous requests */ + if (outSize == NULL) { mutex_unlock(&vcp->vc_mutex); return 0; } diff --git a/fs/coredump.c b/fs/coredump.c index 3224dee44d30..a6b3c196cdef 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -359,7 +359,7 @@ static int zap_process(struct task_struct *start, int exit_code, int flags) for_each_thread(start, t) { task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK); - if (t != current && t->mm) { + if (t != current && !(t->flags & PF_POSTCOREDUMP)) { sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); nr++; @@ -369,99 +369,34 @@ static int zap_process(struct task_struct *start, int exit_code, int flags) return nr; } -static int zap_threads(struct task_struct *tsk, struct mm_struct *mm, +static int zap_threads(struct task_struct *tsk, struct core_state *core_state, int exit_code) { - struct task_struct *g, *p; - unsigned long flags; int nr = -EAGAIN; spin_lock_irq(&tsk->sighand->siglock); if (!signal_group_exit(tsk->signal)) { - mm->core_state = core_state; + tsk->signal->core_state = core_state; tsk->signal->group_exit_task = tsk; nr = zap_process(tsk, exit_code, 0); clear_tsk_thread_flag(tsk, TIF_SIGPENDING); + tsk->flags |= PF_DUMPCORE; + atomic_set(&core_state->nr_threads, nr); } spin_unlock_irq(&tsk->sighand->siglock); - if (unlikely(nr < 0)) - return nr; - - tsk->flags |= PF_DUMPCORE; - if (atomic_read(&mm->mm_users) == nr + 1) - goto done; - /* - * We should find and kill all tasks which use this mm, and we should - * count them correctly into ->nr_threads. We don't take tasklist - * lock, but this is safe wrt: - * - * fork: - * None of sub-threads can fork after zap_process(leader). All - * processes which were created before this point should be - * visible to zap_threads() because copy_process() adds the new - * process to the tail of init_task.tasks list, and lock/unlock - * of ->siglock provides a memory barrier. - * - * do_exit: - * The caller holds mm->mmap_lock. This means that the task which - * uses this mm can't pass exit_mm(), so it can't exit or clear - * its ->mm. - * - * de_thread: - * It does list_replace_rcu(&leader->tasks, ¤t->tasks), - * we must see either old or new leader, this does not matter. - * However, it can change p->sighand, so lock_task_sighand(p) - * must be used. Since p->mm != NULL and we hold ->mmap_lock - * it can't fail. - * - * Note also that "g" can be the old leader with ->mm == NULL - * and already unhashed and thus removed from ->thread_group. - * This is OK, __unhash_process()->list_del_rcu() does not - * clear the ->next pointer, we will find the new leader via - * next_thread(). - */ - rcu_read_lock(); - for_each_process(g) { - if (g == tsk->group_leader) - continue; - if (g->flags & PF_KTHREAD) - continue; - - for_each_thread(g, p) { - if (unlikely(!p->mm)) - continue; - if (unlikely(p->mm == mm)) { - lock_task_sighand(p, &flags); - nr += zap_process(p, exit_code, - SIGNAL_GROUP_EXIT); - unlock_task_sighand(p, &flags); - } - break; - } - } - rcu_read_unlock(); -done: - atomic_set(&core_state->nr_threads, nr); return nr; } static int coredump_wait(int exit_code, struct core_state *core_state) { struct task_struct *tsk = current; - struct mm_struct *mm = tsk->mm; int core_waiters = -EBUSY; init_completion(&core_state->startup); core_state->dumper.task = tsk; core_state->dumper.next = NULL; - if (mmap_write_lock_killable(mm)) - return -EINTR; - - if (!mm->core_state) - core_waiters = zap_threads(tsk, mm, core_state, exit_code); - mmap_write_unlock(mm); - + core_waiters = zap_threads(tsk, core_state, exit_code); if (core_waiters > 0) { struct core_thread *ptr; @@ -483,7 +418,7 @@ static int coredump_wait(int exit_code, struct core_state *core_state) return core_waiters; } -static void coredump_finish(struct mm_struct *mm, bool core_dumped) +static void coredump_finish(bool core_dumped) { struct core_thread *curr, *next; struct task_struct *task; @@ -493,22 +428,21 @@ static void coredump_finish(struct mm_struct *mm, bool core_dumped) current->signal->group_exit_code |= 0x80; current->signal->group_exit_task = NULL; current->signal->flags = SIGNAL_GROUP_EXIT; + next = current->signal->core_state->dumper.next; + current->signal->core_state = NULL; spin_unlock_irq(¤t->sighand->siglock); - next = mm->core_state->dumper.next; while ((curr = next) != NULL) { next = curr->next; task = curr->task; /* - * see exit_mm(), curr->task must not see + * see coredump_task_exit(), curr->task must not see * ->task == NULL before we read ->next. */ smp_mb(); curr->task = NULL; wake_up_process(task); } - - mm->core_state = NULL; } static bool dump_interrupted(void) @@ -839,7 +773,7 @@ fail_dropcount: fail_unlock: kfree(argv); kfree(cn.corename); - coredump_finish(mm, core_dumped); + coredump_finish(core_dumped); revert_creds(old_cred); fail_creds: put_cred(cred); diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 2be65269a987..666aa380011e 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -209,7 +209,7 @@ static void *cramfs_blkdev_read(struct super_block *sb, unsigned int offset, return read_buffers[i] + blk_offset; } - devsize = mapping->host->i_size >> PAGE_SHIFT; + devsize = bdev_nr_bytes(sb->s_bdev) >> PAGE_SHIFT; /* Ok, read in BLKS_PER_BUF pages completely first. */ for (i = 0; i < BLKS_PER_BUF; i++) { diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index 68a2de6b5a9b..bfc2a5b74ed3 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -1,23 +1,10 @@ // SPDX-License-Identifier: GPL-2.0 /* - * This contains encryption functions for per-file encryption. + * Utility functions for file contents encryption/decryption on + * block device-based filesystems. * * Copyright (C) 2015, Google, Inc. * Copyright (C) 2015, Motorola Mobility - * - * Written by Michael Halcrow, 2014. - * - * Filename encryption additions - * Uday Savagaonkar, 2014 - * Encryption policy handling additions - * Ildar Muslukhov, 2014 - * Add fscrypt_pullback_bio_page() - * Jaegeuk Kim, 2015. - * - * This has not yet undergone a rigorous security audit. - * - * The usage of AES-XTS should conform to recommendations in NIST - * Special Publication 800-38E and IEEE P1619/D16. */ #include <linux/pagemap.h> @@ -26,6 +13,21 @@ #include <linux/namei.h> #include "fscrypt_private.h" +/** + * fscrypt_decrypt_bio() - decrypt the contents of a bio + * @bio: the bio to decrypt + * + * Decrypt the contents of a "read" bio following successful completion of the + * underlying disk read. The bio must be reading a whole number of blocks of an + * encrypted file directly into the page cache. If the bio is reading the + * ciphertext into bounce pages instead of the page cache (for example, because + * the file is also compressed, so decompression is required after decryption), + * then this function isn't applicable. This function may sleep, so it must be + * called from a workqueue rather than from the bio's bi_end_io callback. + * + * This function sets PG_error on any pages that contain any blocks that failed + * to be decrypted. The filesystem must not mark such pages uptodate. + */ void fscrypt_decrypt_bio(struct bio *bio) { struct bio_vec *bv; diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index eb538c28df94..a9be4bc74a94 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -429,8 +429,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, if (fscrypt_has_encryption_key(dir)) { if (!fscrypt_fname_encrypted_size(&dir->i_crypt_info->ci_policy, - iname->len, - dir->i_sb->s_cop->max_namelen, + iname->len, NAME_MAX, &fname->crypto_buf.len)) return -ENAMETOOLONG; fname->crypto_buf.name = kmalloc(fname->crypto_buf.len, diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 3fa965eb3336..5b0a9e6478b5 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -20,6 +20,11 @@ #define FSCRYPT_FILE_NONCE_SIZE 16 +/* + * Minimum size of an fscrypt master key. Note: a longer key will be required + * if ciphers with a 256-bit security strength are used. This is just the + * absolute minimum, which applies when only 128-bit encryption is used. + */ #define FSCRYPT_MIN_KEY_SIZE 16 #define FSCRYPT_CONTEXT_V1 1 @@ -413,7 +418,11 @@ struct fscrypt_master_key_secret { */ struct fscrypt_hkdf hkdf; - /* Size of the raw key in bytes. Set even if ->raw isn't set. */ + /* + * Size of the raw key in bytes. This remains set even if ->raw was + * zeroized due to no longer being needed. I.e. we still remember the + * size of the key even if we don't need to remember the key itself. + */ u32 size; /* For v1 policy keys: the raw key. Wiped for v2 policy keys. */ @@ -549,8 +558,9 @@ int __init fscrypt_init_keyring(void); struct fscrypt_mode { const char *friendly_name; const char *cipher_str; - int keysize; - int ivsize; + int keysize; /* key size in bytes */ + int security_strength; /* security strength in bytes */ + int ivsize; /* IV size in bytes */ int logged_impl_name; enum blk_crypto_mode_num blk_crypto_mode; }; diff --git a/fs/crypto/hkdf.c b/fs/crypto/hkdf.c index e0ec21055505..7607d18b35fc 100644 --- a/fs/crypto/hkdf.c +++ b/fs/crypto/hkdf.c @@ -16,9 +16,14 @@ /* * HKDF supports any unkeyed cryptographic hash algorithm, but fscrypt uses - * SHA-512 because it is reasonably secure and efficient; and since it produces - * a 64-byte digest, deriving an AES-256-XTS key preserves all 64 bytes of - * entropy from the master key and requires only one iteration of HKDF-Expand. + * SHA-512 because it is well-established, secure, and reasonably efficient. + * + * HKDF-SHA256 was also considered, as its 256-bit security strength would be + * sufficient here. A 512-bit security strength is "nice to have", though. + * Also, on 64-bit CPUs, SHA-512 is usually just as fast as SHA-256. In the + * common case of deriving an AES-256-XTS key (512 bits), that can result in + * HKDF-SHA512 being much faster than HKDF-SHA256, as the longer digest size of + * SHA-512 causes HKDF-Expand to only need to do one iteration rather than two. */ #define HKDF_HMAC_ALG "hmac(sha512)" #define HKDF_HASHLEN SHA512_DIGEST_SIZE diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index bca9c6658a7c..eede186b04ce 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -19,6 +19,7 @@ struct fscrypt_mode fscrypt_modes[] = { .friendly_name = "AES-256-XTS", .cipher_str = "xts(aes)", .keysize = 64, + .security_strength = 32, .ivsize = 16, .blk_crypto_mode = BLK_ENCRYPTION_MODE_AES_256_XTS, }, @@ -26,12 +27,14 @@ struct fscrypt_mode fscrypt_modes[] = { .friendly_name = "AES-256-CTS-CBC", .cipher_str = "cts(cbc(aes))", .keysize = 32, + .security_strength = 32, .ivsize = 16, }, [FSCRYPT_MODE_AES_128_CBC] = { .friendly_name = "AES-128-CBC-ESSIV", .cipher_str = "essiv(cbc(aes),sha256)", .keysize = 16, + .security_strength = 16, .ivsize = 16, .blk_crypto_mode = BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV, }, @@ -39,12 +42,14 @@ struct fscrypt_mode fscrypt_modes[] = { .friendly_name = "AES-128-CTS-CBC", .cipher_str = "cts(cbc(aes))", .keysize = 16, + .security_strength = 16, .ivsize = 16, }, [FSCRYPT_MODE_ADIANTUM] = { .friendly_name = "Adiantum", .cipher_str = "adiantum(xchacha12,aes)", .keysize = 32, + .security_strength = 32, .ivsize = 32, .blk_crypto_mode = BLK_ENCRYPTION_MODE_ADIANTUM, }, @@ -117,8 +122,9 @@ err_free_tfm: /* * Prepare the crypto transform object or blk-crypto key in @prep_key, given the - * raw key, encryption mode, and flag indicating which encryption implementation - * (fs-layer or blk-crypto) will be used. + * raw key, encryption mode (@ci->ci_mode), flag indicating which encryption + * implementation (fs-layer or blk-crypto) will be used (@ci->ci_inlinecrypt), + * and IV generation method (@ci->ci_policy.flags). */ int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key, const u8 *raw_key, const struct fscrypt_info *ci) @@ -358,6 +364,45 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci, } /* + * Check whether the size of the given master key (@mk) is appropriate for the + * encryption settings which a particular file will use (@ci). + * + * If the file uses a v1 encryption policy, then the master key must be at least + * as long as the derived key, as this is a requirement of the v1 KDF. + * + * Otherwise, the KDF can accept any size key, so we enforce a slightly looser + * requirement: we require that the size of the master key be at least the + * maximum security strength of any algorithm whose key will be derived from it + * (but in practice we only need to consider @ci->ci_mode, since any other + * possible subkeys such as DIRHASH and INODE_HASH will never increase the + * required key size over @ci->ci_mode). This allows AES-256-XTS keys to be + * derived from a 256-bit master key, which is cryptographically sufficient, + * rather than requiring a 512-bit master key which is unnecessarily long. (We + * still allow 512-bit master keys if the user chooses to use them, though.) + */ +static bool fscrypt_valid_master_key_size(const struct fscrypt_master_key *mk, + const struct fscrypt_info *ci) +{ + unsigned int min_keysize; + + if (ci->ci_policy.version == FSCRYPT_POLICY_V1) + min_keysize = ci->ci_mode->keysize; + else + min_keysize = ci->ci_mode->security_strength; + + if (mk->mk_secret.size < min_keysize) { + fscrypt_warn(NULL, + "key with %s %*phN is too short (got %u bytes, need %u+ bytes)", + master_key_spec_type(&mk->mk_spec), + master_key_spec_len(&mk->mk_spec), + (u8 *)&mk->mk_spec.u, + mk->mk_secret.size, min_keysize); + return false; + } + return true; +} + +/* * Find the master key, then set up the inode's actual encryption key. * * If the master key is found in the filesystem-level keyring, then the @@ -422,18 +467,7 @@ static int setup_file_encryption_key(struct fscrypt_info *ci, goto out_release_key; } - /* - * Require that the master key be at least as long as the derived key. - * Otherwise, the derived key cannot possibly contain as much entropy as - * that required by the encryption mode it will be used for. For v1 - * policies it's also required for the KDF to work at all. - */ - if (mk->mk_secret.size < ci->ci_mode->keysize) { - fscrypt_warn(NULL, - "key with %s %*phN is too short (got %u bytes, need %u+ bytes)", - master_key_spec_type(&mk_spec), - master_key_spec_len(&mk_spec), (u8 *)&mk_spec.u, - mk->mk_secret.size, ci->ci_mode->keysize); + if (!fscrypt_valid_master_key_size(mk, ci)) { err = -ENOKEY; goto out_release_key; } diff --git a/fs/d_path.c b/fs/d_path.c index cd60c7535181..e4e0ebad1f15 100644 --- a/fs/d_path.c +++ b/fs/d_path.c @@ -77,9 +77,8 @@ static bool prepend(struct prepend_buffer *p, const char *str, int namelen) /** * prepend_name - prepend a pathname in front of current buffer pointer - * @buffer: buffer pointer - * @buflen: allocated length of the buffer - * @name: name string and length qstr structure + * @p: prepend buffer which contains buffer pointer and allocated length + * @name: name string and length qstr structure * * With RCU path tracing, it may race with d_move(). Use READ_ONCE() to * make sure that either the old or the new name pointer and length are @@ -141,8 +140,7 @@ static int __prepend_path(const struct dentry *dentry, const struct mount *mnt, * prepend_path - Prepend path string to a buffer * @path: the dentry/vfsmount to report * @root: root vfsmnt/dentry - * @buffer: pointer to the end of the buffer - * @buflen: pointer to buffer length + * @p: prepend buffer which contains buffer pointer and allocated length * * The function will first try to write out the pathname without taking any * lock other than the RCU read lock to make sure that dentries won't go away. diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 8129a430d789..2f117c57160d 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -528,7 +528,7 @@ void debugfs_create_file_size(const char *name, umode_t mode, { struct dentry *de = debugfs_create_file(name, mode, parent, data, fops); - if (de) + if (!IS_ERR(de)) d_inode(de)->i_size = file_size; } EXPORT_SYMBOL_GPL(debugfs_create_file_size); diff --git a/fs/direct-io.c b/fs/direct-io.c index b2e86e739d7a..654443558047 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -119,7 +119,6 @@ struct dio { int flags; /* doesn't change */ int op; int op_flags; - blk_qc_t bio_cookie; struct gendisk *bio_disk; struct inode *inode; loff_t i_size; /* i_size when submitted */ @@ -308,7 +307,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags) if (ret > 0 && dio->op == REQ_OP_WRITE) ret = generic_write_sync(dio->iocb, ret); - dio->iocb->ki_complete(dio->iocb, ret, 0); + dio->iocb->ki_complete(dio->iocb, ret); } kmem_cache_free(dio_cache, dio); @@ -438,11 +437,10 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) dio->bio_disk = bio->bi_bdev->bd_disk; - if (sdio->submit_io) { + if (sdio->submit_io) sdio->submit_io(bio, dio->inode, sdio->logical_offset_in_bio); - dio->bio_cookie = BLK_QC_T_NONE; - } else - dio->bio_cookie = submit_bio(bio); + else + submit_bio(bio); sdio->bio = NULL; sdio->boundary = 0; @@ -481,9 +479,7 @@ static struct bio *dio_await_one(struct dio *dio) __set_current_state(TASK_UNINTERRUPTIBLE); dio->waiter = current; spin_unlock_irqrestore(&dio->bio_lock, flags); - if (!(dio->iocb->ki_flags & IOCB_HIPRI) || - !blk_poll(dio->bio_disk->queue, dio->bio_cookie, true)) - blk_io_schedule(); + blk_io_schedule(); /* wake up sets us TASK_RUNNING */ spin_lock_irqsave(&dio->bio_lock, flags); dio->waiter = NULL; @@ -1214,8 +1210,6 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, } else { dio->op = REQ_OP_READ; } - if (iocb->ki_flags & IOCB_HIPRI) - dio->op_flags |= REQ_HIPRI; /* * For AIO O_(D)SYNC writes we need to defer completions to a workqueue diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index 14b747026742..f57255ab88ed 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -6,16 +6,22 @@ config EROFS_FS select FS_IOMAP select LIBCRC32C help - EROFS (Enhanced Read-Only File System) is a lightweight - read-only file system with modern designs (eg. page-sized - blocks, inline xattrs/data, etc.) for scenarios which need - high-performance read-only requirements, e.g. Android OS - for mobile phones and LIVECDs. + EROFS (Enhanced Read-Only File System) is a lightweight read-only + file system with modern designs (e.g. no buffer heads, inline + xattrs/data, chunk-based deduplication, multiple devices, etc.) for + scenarios which need high-performance read-only solutions, e.g. + smartphones with Android OS, LiveCDs and high-density hosts with + numerous containers; - It also provides fixed-sized output compression support, - which improves storage density, keeps relatively higher - compression ratios, which is more useful to achieve high - performance for embedded devices with limited memory. + It also provides fixed-sized output compression support in order to + improve storage density as well as keep relatively higher compression + ratios and implements in-place decompression to reuse the file page + for compressed data temporarily with proper strategies, which is + quite useful to ensure guaranteed end-to-end runtime decompression + performance under extremely memory pressure without extra cost. + + See the documentation at <file:Documentation/filesystems/erofs.rst> + for more details. If unsure, say N. @@ -76,3 +82,19 @@ config EROFS_FS_ZIP Enable fixed-sized output compression for EROFS. If you don't want to enable compression feature, say N. + +config EROFS_FS_ZIP_LZMA + bool "EROFS LZMA compressed data support" + depends on EROFS_FS_ZIP + select XZ_DEC + select XZ_DEC_MICROLZMA + help + Saying Y here includes support for reading EROFS file systems + containing LZMA compressed data, specifically called microLZMA. it + gives better compression ratios than the LZ4 algorithm, at the + expense of more CPU overhead. + + LZMA support is an experimental feature for now and so most file + systems will be readable without selecting this option. + + If unsure, say N. diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile index 1f9aced49070..756fe2d65272 100644 --- a/fs/erofs/Makefile +++ b/fs/erofs/Makefile @@ -4,3 +4,4 @@ obj-$(CONFIG_EROFS_FS) += erofs.o erofs-objs := super.o inode.o data.o namei.o dir.o utils.o pcpubuf.o erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o +erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h index 3701c72bacb2..579406504919 100644 --- a/fs/erofs/compress.h +++ b/fs/erofs/compress.h @@ -8,11 +8,6 @@ #include "internal.h" -enum { - Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX, - Z_EROFS_COMPRESSION_RUNTIME_MAX -}; - struct z_erofs_decompress_req { struct super_block *sb; struct page **in, **out; @@ -25,6 +20,12 @@ struct z_erofs_decompress_req { bool inplace_io, partial_decoding; }; +struct z_erofs_decompressor { + int (*decompress)(struct z_erofs_decompress_req *rq, + struct page **pagepool); + char *name; +}; + /* some special page->private (unsigned long, see below) */ #define Z_EROFS_SHORTLIVED_PAGE (-1UL << 2) #define Z_EROFS_PREALLOCATED_PAGE (-2UL << 2) @@ -63,7 +64,7 @@ static inline bool z_erofs_is_shortlived_page(struct page *page) return true; } -static inline bool z_erofs_put_shortlivedpage(struct list_head *pagepool, +static inline bool z_erofs_put_shortlivedpage(struct page **pagepool, struct page *page) { if (!z_erofs_is_shortlived_page(page)) @@ -74,13 +75,22 @@ static inline bool z_erofs_put_shortlivedpage(struct list_head *pagepool, put_page(page); } else { /* follow the pcluster rule above. */ - set_page_private(page, 0); - list_add(&page->lru, pagepool); + erofs_pagepool_add(pagepool, page); } return true; } +#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping) +static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi, + struct page *page) +{ + return page->mapping == MNGD_MAPPING(sbi); +} + int z_erofs_decompress(struct z_erofs_decompress_req *rq, - struct list_head *pagepool); + struct page **pagepool); +/* prototypes for specific algorithms */ +int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq, + struct page **pagepool); #endif diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 9db829715652..0e35ef3f9f3d 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -89,6 +89,7 @@ static int erofs_map_blocks(struct inode *inode, erofs_off_t pos; int err = 0; + map->m_deviceid = 0; if (map->m_la >= inode->i_size) { /* leave out-of-bound access unmapped */ map->m_flags = 0; @@ -135,14 +136,8 @@ static int erofs_map_blocks(struct inode *inode, map->m_flags = 0; break; default: - /* only one device is supported for now */ - if (idx->device_id) { - erofs_err(sb, "invalid device id %u @ %llu for nid %llu", - le16_to_cpu(idx->device_id), - chunknr, vi->nid); - err = -EFSCORRUPTED; - goto out_unlock; - } + map->m_deviceid = le16_to_cpu(idx->device_id) & + EROFS_SB(sb)->device_id_mask; map->m_pa = blknr_to_addr(le32_to_cpu(idx->blkaddr)); map->m_flags = EROFS_MAP_MAPPED; break; @@ -155,11 +150,55 @@ out: return err; } +int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) +{ + struct erofs_dev_context *devs = EROFS_SB(sb)->devs; + struct erofs_device_info *dif; + int id; + + /* primary device by default */ + map->m_bdev = sb->s_bdev; + map->m_daxdev = EROFS_SB(sb)->dax_dev; + + if (map->m_deviceid) { + down_read(&devs->rwsem); + dif = idr_find(&devs->tree, map->m_deviceid - 1); + if (!dif) { + up_read(&devs->rwsem); + return -ENODEV; + } + map->m_bdev = dif->bdev; + map->m_daxdev = dif->dax_dev; + up_read(&devs->rwsem); + } else if (devs->extra_devices) { + down_read(&devs->rwsem); + idr_for_each_entry(&devs->tree, dif, id) { + erofs_off_t startoff, length; + + if (!dif->mapped_blkaddr) + continue; + startoff = blknr_to_addr(dif->mapped_blkaddr); + length = blknr_to_addr(dif->blocks); + + if (map->m_pa >= startoff && + map->m_pa < startoff + length) { + map->m_pa -= startoff; + map->m_bdev = dif->bdev; + map->m_daxdev = dif->dax_dev; + break; + } + } + up_read(&devs->rwsem); + } + return 0; +} + static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { int ret; struct erofs_map_blocks map; + struct erofs_map_dev mdev; map.m_la = offset; map.m_llen = length; @@ -168,8 +207,16 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, if (ret < 0) return ret; - iomap->bdev = inode->i_sb->s_bdev; - iomap->dax_dev = EROFS_I_SB(inode)->dax_dev; + mdev = (struct erofs_map_dev) { + .m_deviceid = map.m_deviceid, + .m_pa = map.m_pa, + }; + ret = erofs_map_dev(inode->i_sb, &mdev); + if (ret) + return ret; + + iomap->bdev = mdev.m_bdev; + iomap->dax_dev = mdev.m_daxdev; iomap->offset = map.m_la; iomap->length = map.m_llen; iomap->flags = 0; @@ -188,15 +235,15 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, iomap->type = IOMAP_INLINE; ipage = erofs_get_meta_page(inode->i_sb, - erofs_blknr(map.m_pa)); + erofs_blknr(mdev.m_pa)); if (IS_ERR(ipage)) return PTR_ERR(ipage); iomap->inline_data = page_address(ipage) + - erofs_blkoff(map.m_pa); + erofs_blkoff(mdev.m_pa); iomap->private = ipage; } else { iomap->type = IOMAP_MAPPED; - iomap->addr = map.m_pa; + iomap->addr = mdev.m_pa; } return 0; } @@ -287,7 +334,7 @@ static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) if (!err) return iomap_dio_rw(iocb, to, &erofs_iomap_ops, - NULL, 0); + NULL, 0, 0); if (err < 0) return err; } diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index a5bc4b1b7813..bf37fc76b182 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -16,17 +16,6 @@ #define LZ4_DECOMPRESS_INPLACE_MARGIN(srcsize) (((srcsize) >> 8) + 32) #endif -struct z_erofs_decompressor { - /* - * if destpages have sparsed pages, fill them with bounce pages. - * it also check whether destpages indicate continuous physical memory. - */ - int (*prepare_destpages)(struct z_erofs_decompress_req *rq, - struct list_head *pagepool); - int (*decompress)(struct z_erofs_decompress_req *rq, u8 *out); - char *name; -}; - int z_erofs_load_lz4_config(struct super_block *sb, struct erofs_super_block *dsb, struct z_erofs_lz4_cfgs *lz4, int size) @@ -63,8 +52,12 @@ int z_erofs_load_lz4_config(struct super_block *sb, return erofs_pcpubuf_growsize(sbi->lz4.max_pclusterblks); } -static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq, - struct list_head *pagepool) +/* + * Fill all gaps with bounce pages if it's a sparse page list. Also check if + * all physical pages are consecutive, which can be seen for moderate CR. + */ +static int z_erofs_lz4_prepare_dstpages(struct z_erofs_decompress_req *rq, + struct page **pagepool) { const unsigned int nr = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; @@ -119,7 +112,7 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq, return kaddr ? 1 : 0; } -static void *z_erofs_handle_inplace_io(struct z_erofs_decompress_req *rq, +static void *z_erofs_lz4_handle_inplace_io(struct z_erofs_decompress_req *rq, void *inpage, unsigned int *inputmargin, int *maptype, bool support_0padding) { @@ -189,7 +182,8 @@ docopy: return src; } -static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out) +static int z_erofs_lz4_decompress_mem(struct z_erofs_decompress_req *rq, + u8 *out) { unsigned int inputmargin; u8 *headpage, *src; @@ -216,8 +210,8 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out) } rq->inputsize -= inputmargin; - src = z_erofs_handle_inplace_io(rq, headpage, &inputmargin, &maptype, - support_0padding); + src = z_erofs_lz4_handle_inplace_io(rq, headpage, &inputmargin, + &maptype, support_0padding); if (IS_ERR(src)) return PTR_ERR(src); @@ -233,7 +227,6 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out) erofs_err(rq->sb, "failed to decompress %d in[%u, %u] out[%u]", ret, rq->inputsize, inputmargin, rq->outputsize); - WARN_ON(1); print_hex_dump(KERN_DEBUG, "[ in]: ", DUMP_PREFIX_OFFSET, 16, 1, src + inputmargin, rq->inputsize, true); print_hex_dump(KERN_DEBUG, "[out]: ", DUMP_PREFIX_OFFSET, @@ -242,6 +235,8 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out) if (ret >= 0) memset(out + ret, 0, rq->outputsize - ret); ret = -EIO; + } else { + ret = 0; } if (maptype == 0) { @@ -257,86 +252,25 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out) return ret; } -static struct z_erofs_decompressor decompressors[] = { - [Z_EROFS_COMPRESSION_SHIFTED] = { - .name = "shifted" - }, - [Z_EROFS_COMPRESSION_LZ4] = { - .prepare_destpages = z_erofs_lz4_prepare_destpages, - .decompress = z_erofs_lz4_decompress, - .name = "lz4" - }, -}; - -static void copy_from_pcpubuf(struct page **out, const char *dst, - unsigned short pageofs_out, - unsigned int outputsize) -{ - const char *end = dst + outputsize; - const unsigned int righthalf = PAGE_SIZE - pageofs_out; - const char *cur = dst - pageofs_out; - - while (cur < end) { - struct page *const page = *out++; - - if (page) { - char *buf = kmap_atomic(page); - - if (cur >= dst) { - memcpy(buf, cur, min_t(uint, PAGE_SIZE, - end - cur)); - } else { - memcpy(buf + pageofs_out, cur + pageofs_out, - min_t(uint, righthalf, end - cur)); - } - kunmap_atomic(buf); - } - cur += PAGE_SIZE; - } -} - -static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq, - struct list_head *pagepool) +static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, + struct page **pagepool) { const unsigned int nrpages_out = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; - const struct z_erofs_decompressor *alg = decompressors + rq->alg; unsigned int dst_maptype; void *dst; int ret; - /* two optimized fast paths only for non bigpcluster cases yet */ - if (rq->inputsize <= PAGE_SIZE) { - if (nrpages_out == 1 && !rq->inplace_io) { - DBG_BUGON(!*rq->out); - dst = kmap_atomic(*rq->out); - dst_maptype = 0; - goto dstmap_out; - } - - /* - * For the case of small output size (especially much less - * than PAGE_SIZE), memcpy the decompressed data rather than - * compressed data is preferred. - */ - if (rq->outputsize <= PAGE_SIZE * 7 / 8) { - dst = erofs_get_pcpubuf(1); - if (IS_ERR(dst)) - return PTR_ERR(dst); - - rq->inplace_io = false; - ret = alg->decompress(rq, dst); - if (!ret) - copy_from_pcpubuf(rq->out, dst, rq->pageofs_out, - rq->outputsize); - - erofs_put_pcpubuf(dst); - return ret; - } + /* one optimized fast path only for non bigpcluster cases yet */ + if (rq->inputsize <= PAGE_SIZE && nrpages_out == 1 && !rq->inplace_io) { + DBG_BUGON(!*rq->out); + dst = kmap_atomic(*rq->out); + dst_maptype = 0; + goto dstmap_out; } /* general decoding path which can be used for all cases */ - ret = alg->prepare_destpages(rq, pagepool); + ret = z_erofs_lz4_prepare_dstpages(rq, pagepool); if (ret < 0) return ret; if (ret) { @@ -351,7 +285,7 @@ static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq, dst_maptype = 2; dstmap_out: - ret = alg->decompress(rq, dst + rq->pageofs_out); + ret = z_erofs_lz4_decompress_mem(rq, dst + rq->pageofs_out); if (!dst_maptype) kunmap_atomic(dst); @@ -360,8 +294,8 @@ dstmap_out: return ret; } -static int z_erofs_shifted_transform(const struct z_erofs_decompress_req *rq, - struct list_head *pagepool) +static int z_erofs_shifted_transform(struct z_erofs_decompress_req *rq, + struct page **pagepool) { const unsigned int nrpages_out = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; @@ -399,10 +333,25 @@ static int z_erofs_shifted_transform(const struct z_erofs_decompress_req *rq, return 0; } +static struct z_erofs_decompressor decompressors[] = { + [Z_EROFS_COMPRESSION_SHIFTED] = { + .decompress = z_erofs_shifted_transform, + .name = "shifted" + }, + [Z_EROFS_COMPRESSION_LZ4] = { + .decompress = z_erofs_lz4_decompress, + .name = "lz4" + }, +#ifdef CONFIG_EROFS_FS_ZIP_LZMA + [Z_EROFS_COMPRESSION_LZMA] = { + .decompress = z_erofs_lzma_decompress, + .name = "lzma" + }, +#endif +}; + int z_erofs_decompress(struct z_erofs_decompress_req *rq, - struct list_head *pagepool) + struct page **pagepool) { - if (rq->alg == Z_EROFS_COMPRESSION_SHIFTED) - return z_erofs_shifted_transform(rq, pagepool); - return z_erofs_decompress_generic(rq, pagepool); + return decompressors[rq->alg].decompress(rq, pagepool); } diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c new file mode 100644 index 000000000000..50045510a1f4 --- /dev/null +++ b/fs/erofs/decompressor_lzma.c @@ -0,0 +1,290 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/xz.h> +#include <linux/module.h> +#include "compress.h" + +struct z_erofs_lzma { + struct z_erofs_lzma *next; + struct xz_dec_microlzma *state; + struct xz_buf buf; + u8 bounce[PAGE_SIZE]; +}; + +/* considering the LZMA performance, no need to use a lockless list for now */ +static DEFINE_SPINLOCK(z_erofs_lzma_lock); +static unsigned int z_erofs_lzma_max_dictsize; +static unsigned int z_erofs_lzma_nstrms, z_erofs_lzma_avail_strms; +static struct z_erofs_lzma *z_erofs_lzma_head; +static DECLARE_WAIT_QUEUE_HEAD(z_erofs_lzma_wq); + +module_param_named(lzma_streams, z_erofs_lzma_nstrms, uint, 0444); + +void z_erofs_lzma_exit(void) +{ + /* there should be no running fs instance */ + while (z_erofs_lzma_avail_strms) { + struct z_erofs_lzma *strm; + + spin_lock(&z_erofs_lzma_lock); + strm = z_erofs_lzma_head; + if (!strm) { + spin_unlock(&z_erofs_lzma_lock); + DBG_BUGON(1); + return; + } + z_erofs_lzma_head = NULL; + spin_unlock(&z_erofs_lzma_lock); + + while (strm) { + struct z_erofs_lzma *n = strm->next; + + if (strm->state) + xz_dec_microlzma_end(strm->state); + kfree(strm); + --z_erofs_lzma_avail_strms; + strm = n; + } + } +} + +int z_erofs_lzma_init(void) +{ + unsigned int i; + + /* by default, use # of possible CPUs instead */ + if (!z_erofs_lzma_nstrms) + z_erofs_lzma_nstrms = num_possible_cpus(); + + for (i = 0; i < z_erofs_lzma_nstrms; ++i) { + struct z_erofs_lzma *strm = kzalloc(sizeof(*strm), GFP_KERNEL); + + if (!strm) { + z_erofs_lzma_exit(); + return -ENOMEM; + } + spin_lock(&z_erofs_lzma_lock); + strm->next = z_erofs_lzma_head; + z_erofs_lzma_head = strm; + spin_unlock(&z_erofs_lzma_lock); + ++z_erofs_lzma_avail_strms; + } + return 0; +} + +int z_erofs_load_lzma_config(struct super_block *sb, + struct erofs_super_block *dsb, + struct z_erofs_lzma_cfgs *lzma, int size) +{ + static DEFINE_MUTEX(lzma_resize_mutex); + unsigned int dict_size, i; + struct z_erofs_lzma *strm, *head = NULL; + int err; + + if (!lzma || size < sizeof(struct z_erofs_lzma_cfgs)) { + erofs_err(sb, "invalid lzma cfgs, size=%u", size); + return -EINVAL; + } + if (lzma->format) { + erofs_err(sb, "unidentified lzma format %x, please check kernel version", + le16_to_cpu(lzma->format)); + return -EINVAL; + } + dict_size = le32_to_cpu(lzma->dict_size); + if (dict_size > Z_EROFS_LZMA_MAX_DICT_SIZE || dict_size < 4096) { + erofs_err(sb, "unsupported lzma dictionary size %u", + dict_size); + return -EINVAL; + } + + erofs_info(sb, "EXPERIMENTAL MicroLZMA in use. Use at your own risk!"); + + /* in case 2 z_erofs_load_lzma_config() race to avoid deadlock */ + mutex_lock(&lzma_resize_mutex); + + if (z_erofs_lzma_max_dictsize >= dict_size) { + mutex_unlock(&lzma_resize_mutex); + return 0; + } + + /* 1. collect/isolate all streams for the following check */ + for (i = 0; i < z_erofs_lzma_avail_strms; ++i) { + struct z_erofs_lzma *last; + +again: + spin_lock(&z_erofs_lzma_lock); + strm = z_erofs_lzma_head; + if (!strm) { + spin_unlock(&z_erofs_lzma_lock); + wait_event(z_erofs_lzma_wq, + READ_ONCE(z_erofs_lzma_head)); + goto again; + } + z_erofs_lzma_head = NULL; + spin_unlock(&z_erofs_lzma_lock); + + for (last = strm; last->next; last = last->next) + ++i; + last->next = head; + head = strm; + } + + err = 0; + /* 2. walk each isolated stream and grow max dict_size if needed */ + for (strm = head; strm; strm = strm->next) { + if (strm->state) + xz_dec_microlzma_end(strm->state); + strm->state = xz_dec_microlzma_alloc(XZ_PREALLOC, dict_size); + if (!strm->state) + err = -ENOMEM; + } + + /* 3. push back all to the global list and update max dict_size */ + spin_lock(&z_erofs_lzma_lock); + DBG_BUGON(z_erofs_lzma_head); + z_erofs_lzma_head = head; + spin_unlock(&z_erofs_lzma_lock); + + z_erofs_lzma_max_dictsize = dict_size; + mutex_unlock(&lzma_resize_mutex); + return err; +} + +int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq, + struct page **pagepool) +{ + const unsigned int nrpages_out = + PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; + const unsigned int nrpages_in = + PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT; + unsigned int inputmargin, inlen, outlen, pageofs; + struct z_erofs_lzma *strm; + u8 *kin; + bool bounced = false; + int no, ni, j, err = 0; + + /* 1. get the exact LZMA compressed size */ + kin = kmap(*rq->in); + inputmargin = 0; + while (!kin[inputmargin & ~PAGE_MASK]) + if (!(++inputmargin & ~PAGE_MASK)) + break; + + if (inputmargin >= PAGE_SIZE) { + kunmap(*rq->in); + return -EFSCORRUPTED; + } + rq->inputsize -= inputmargin; + + /* 2. get an available lzma context */ +again: + spin_lock(&z_erofs_lzma_lock); + strm = z_erofs_lzma_head; + if (!strm) { + spin_unlock(&z_erofs_lzma_lock); + wait_event(z_erofs_lzma_wq, READ_ONCE(z_erofs_lzma_head)); + goto again; + } + z_erofs_lzma_head = strm->next; + spin_unlock(&z_erofs_lzma_lock); + + /* 3. multi-call decompress */ + inlen = rq->inputsize; + outlen = rq->outputsize; + xz_dec_microlzma_reset(strm->state, inlen, outlen, + !rq->partial_decoding); + pageofs = rq->pageofs_out; + strm->buf.in = kin + inputmargin; + strm->buf.in_pos = 0; + strm->buf.in_size = min_t(u32, inlen, PAGE_SIZE - inputmargin); + inlen -= strm->buf.in_size; + strm->buf.out = NULL; + strm->buf.out_pos = 0; + strm->buf.out_size = 0; + + for (ni = 0, no = -1;;) { + enum xz_ret xz_err; + + if (strm->buf.out_pos == strm->buf.out_size) { + if (strm->buf.out) { + kunmap(rq->out[no]); + strm->buf.out = NULL; + } + + if (++no >= nrpages_out || !outlen) { + erofs_err(rq->sb, "decompressed buf out of bound"); + err = -EFSCORRUPTED; + break; + } + strm->buf.out_pos = 0; + strm->buf.out_size = min_t(u32, outlen, + PAGE_SIZE - pageofs); + outlen -= strm->buf.out_size; + if (rq->out[no]) + strm->buf.out = kmap(rq->out[no]) + pageofs; + pageofs = 0; + } else if (strm->buf.in_pos == strm->buf.in_size) { + kunmap(rq->in[ni]); + + if (++ni >= nrpages_in || !inlen) { + erofs_err(rq->sb, "compressed buf out of bound"); + err = -EFSCORRUPTED; + break; + } + strm->buf.in_pos = 0; + strm->buf.in_size = min_t(u32, inlen, PAGE_SIZE); + inlen -= strm->buf.in_size; + kin = kmap(rq->in[ni]); + strm->buf.in = kin; + bounced = false; + } + + /* + * Handle overlapping: Use bounced buffer if the compressed + * data is under processing; Otherwise, Use short-lived pages + * from the on-stack pagepool where pages share with the same + * request. + */ + if (!bounced && rq->out[no] == rq->in[ni]) { + memcpy(strm->bounce, strm->buf.in, strm->buf.in_size); + strm->buf.in = strm->bounce; + bounced = true; + } + for (j = ni + 1; j < nrpages_in; ++j) { + struct page *tmppage; + + if (rq->out[no] != rq->in[j]) + continue; + + DBG_BUGON(erofs_page_is_managed(EROFS_SB(rq->sb), + rq->in[j])); + tmppage = erofs_allocpage(pagepool, + GFP_KERNEL | __GFP_NOFAIL); + set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE); + copy_highpage(tmppage, rq->in[j]); + rq->in[j] = tmppage; + } + xz_err = xz_dec_microlzma_run(strm->state, &strm->buf); + DBG_BUGON(strm->buf.out_pos > strm->buf.out_size); + DBG_BUGON(strm->buf.in_pos > strm->buf.in_size); + + if (xz_err != XZ_OK) { + if (xz_err == XZ_STREAM_END && !outlen) + break; + erofs_err(rq->sb, "failed to decompress %d in[%u] out[%u]", + xz_err, rq->inputsize, rq->outputsize); + err = -EFSCORRUPTED; + break; + } + } + if (no < nrpages_out && strm->buf.out) + kunmap(rq->in[no]); + if (ni < nrpages_in) + kunmap(rq->in[ni]); + /* 4. push back LZMA stream context to the global list */ + spin_lock(&z_erofs_lzma_lock); + strm->next = z_erofs_lzma_head; + z_erofs_lzma_head = strm; + spin_unlock(&z_erofs_lzma_lock); + wake_up(&z_erofs_lzma_wq); + return err; +} diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index b0b23f41abc3..083997a034e5 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -21,14 +21,29 @@ #define EROFS_FEATURE_INCOMPAT_COMPR_CFGS 0x00000002 #define EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER 0x00000002 #define EROFS_FEATURE_INCOMPAT_CHUNKED_FILE 0x00000004 +#define EROFS_FEATURE_INCOMPAT_DEVICE_TABLE 0x00000008 +#define EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 0x00000008 #define EROFS_ALL_FEATURE_INCOMPAT \ (EROFS_FEATURE_INCOMPAT_LZ4_0PADDING | \ EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \ EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER | \ - EROFS_FEATURE_INCOMPAT_CHUNKED_FILE) + EROFS_FEATURE_INCOMPAT_CHUNKED_FILE | \ + EROFS_FEATURE_INCOMPAT_DEVICE_TABLE | \ + EROFS_FEATURE_INCOMPAT_COMPR_HEAD2) #define EROFS_SB_EXTSLOT_SIZE 16 +struct erofs_deviceslot { + union { + u8 uuid[16]; /* used for device manager later */ + u8 userdata[64]; /* digest(sha256), etc. */ + } u; + __le32 blocks; /* total fs blocks of this device */ + __le32 mapped_blkaddr; /* map starting at mapped_blkaddr */ + u8 reserved[56]; +}; +#define EROFS_DEVT_SLOT_SIZE sizeof(struct erofs_deviceslot) + /* erofs on-disk super block (currently 128 bytes) */ struct erofs_super_block { __le32 magic; /* file system magic number */ @@ -54,7 +69,9 @@ struct erofs_super_block { /* customized sliding window size instead of 64k by default */ __le16 lz4_max_distance; } __packed u1; - __u8 reserved2[42]; + __le16 extra_devices; /* # of devices besides the primary device */ + __le16 devt_slotoff; /* startoff = devt_slotoff * devt_slotsize */ + __u8 reserved2[38]; }; /* @@ -238,7 +255,7 @@ static inline unsigned int erofs_xattr_entry_size(struct erofs_xattr_entry *e) /* 8-byte inode chunk indexes */ struct erofs_inode_chunk_index { __le16 advise; /* always 0, don't care for now */ - __le16 device_id; /* back-end storage id, always 0 for now */ + __le16 device_id; /* back-end storage id (with bits masked) */ __le32 blkaddr; /* start block address of this inode chunk */ }; @@ -247,10 +264,11 @@ struct erofs_inode_chunk_index { /* available compression algorithm types (for h_algorithmtype) */ enum { - Z_EROFS_COMPRESSION_LZ4 = 0, + Z_EROFS_COMPRESSION_LZ4 = 0, + Z_EROFS_COMPRESSION_LZMA = 1, Z_EROFS_COMPRESSION_MAX }; -#define Z_EROFS_ALL_COMPR_ALGS (1 << (Z_EROFS_COMPRESSION_MAX - 1)) +#define Z_EROFS_ALL_COMPR_ALGS ((1 << Z_EROFS_COMPRESSION_MAX) - 1) /* 14 bytes (+ length field = 16 bytes) */ struct z_erofs_lz4_cfgs { @@ -259,6 +277,15 @@ struct z_erofs_lz4_cfgs { u8 reserved[10]; } __packed; +/* 14 bytes (+ length field = 16 bytes) */ +struct z_erofs_lzma_cfgs { + __le32 dict_size; + __le16 format; + u8 reserved[8]; +} __packed; + +#define Z_EROFS_LZMA_MAX_DICT_SIZE (8 * Z_EROFS_PCLUSTER_MAX_SIZE) + /* * bit 0 : COMPACTED_2B indexes (0 - off; 1 - on) * e.g. for 4k logical cluster size, 4B if compacted 2B is off; @@ -288,35 +315,34 @@ struct z_erofs_map_header { #define Z_EROFS_VLE_LEGACY_HEADER_PADDING 8 /* - * Fixed-sized output compression ondisk Logical Extent cluster type: - * 0 - literal (uncompressed) cluster - * 1 - compressed cluster (for the head logical cluster) - * 2 - compressed cluster (for the other logical clusters) + * Fixed-sized output compression on-disk logical cluster type: + * 0 - literal (uncompressed) lcluster + * 1,3 - compressed lcluster (for HEAD lclusters) + * 2 - compressed lcluster (for NONHEAD lclusters) * * In detail, - * 0 - literal (uncompressed) cluster, + * 0 - literal (uncompressed) lcluster, * di_advise = 0 - * di_clusterofs = the literal data offset of the cluster - * di_blkaddr = the blkaddr of the literal cluster + * di_clusterofs = the literal data offset of the lcluster + * di_blkaddr = the blkaddr of the literal pcluster * - * 1 - compressed cluster (for the head logical cluster) - * di_advise = 1 - * di_clusterofs = the decompressed data offset of the cluster - * di_blkaddr = the blkaddr of the compressed cluster + * 1,3 - compressed lcluster (for HEAD lclusters) + * di_advise = 1 or 3 + * di_clusterofs = the decompressed data offset of the lcluster + * di_blkaddr = the blkaddr of the compressed pcluster * - * 2 - compressed cluster (for the other logical clusters) + * 2 - compressed lcluster (for NONHEAD lclusters) * di_advise = 2 * di_clusterofs = - * the decompressed data offset in its own head cluster - * di_u.delta[0] = distance to its corresponding head cluster - * di_u.delta[1] = distance to its corresponding tail cluster - * (di_advise could be 0, 1 or 2) + * the decompressed data offset in its own HEAD lcluster + * di_u.delta[0] = distance to this HEAD lcluster + * di_u.delta[1] = distance to the next HEAD lcluster */ enum { Z_EROFS_VLE_CLUSTER_TYPE_PLAIN = 0, - Z_EROFS_VLE_CLUSTER_TYPE_HEAD = 1, + Z_EROFS_VLE_CLUSTER_TYPE_HEAD1 = 1, Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD = 2, - Z_EROFS_VLE_CLUSTER_TYPE_RESERVED = 3, + Z_EROFS_VLE_CLUSTER_TYPE_HEAD2 = 3, Z_EROFS_VLE_CLUSTER_TYPE_MAX }; @@ -384,6 +410,7 @@ static inline void erofs_check_ondisk_layout_definitions(void) /* keep in sync between 2 index structures for better extendibility */ BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_index) != sizeof(struct z_erofs_vle_decompressed_index)); + BUILD_BUG_ON(sizeof(struct erofs_deviceslot) != 128); BUILD_BUG_ON(BIT(Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) < Z_EROFS_VLE_CLUSTER_TYPE_MAX - 1); diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index 31ac3a73b390..2345f1de438e 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -176,7 +176,7 @@ static struct page *erofs_read_inode(struct inode *inode, } if (vi->datalayout == EROFS_INODE_CHUNK_BASED) { - if (!(vi->chunkformat & EROFS_CHUNK_FORMAT_ALL)) { + if (vi->chunkformat & ~EROFS_CHUNK_FORMAT_ALL) { erofs_err(inode->i_sb, "unsupported chunk format %x of nid %llu", vi->chunkformat, vi->nid); @@ -192,7 +192,7 @@ static struct page *erofs_read_inode(struct inode *inode, inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec; inode->i_flags &= ~S_DAX; - if (test_opt(&sbi->ctx, DAX_ALWAYS) && S_ISREG(inode->i_mode) && + if (test_opt(&sbi->opt, DAX_ALWAYS) && S_ISREG(inode->i_mode) && vi->datalayout == EROFS_INODE_FLAT_PLAIN) inode->i_flags |= S_DAX; if (!nblks) diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 9524e155b38f..3265688af7f9 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -47,7 +47,16 @@ typedef u64 erofs_off_t; /* data type for filesystem-wide blocks number */ typedef u32 erofs_blk_t; -struct erofs_fs_context { +struct erofs_device_info { + char *path; + struct block_device *bdev; + struct dax_device *dax_dev; + + u32 blocks; + u32 mapped_blkaddr; +}; + +struct erofs_mount_opts { #ifdef CONFIG_EROFS_FS_ZIP /* current strategy of how to use managed cache */ unsigned char cache_strategy; @@ -60,6 +69,18 @@ struct erofs_fs_context { unsigned int mount_opt; }; +struct erofs_dev_context { + struct idr tree; + struct rw_semaphore rwsem; + + unsigned int extra_devices; +}; + +struct erofs_fs_context { + struct erofs_mount_opts opt; + struct erofs_dev_context *devs; +}; + /* all filesystem-wide lz4 configurations */ struct erofs_sb_lz4_info { /* # of pages needed for EROFS lz4 rolling decompression */ @@ -69,6 +90,7 @@ struct erofs_sb_lz4_info { }; struct erofs_sb_info { + struct erofs_mount_opts opt; /* options */ #ifdef CONFIG_EROFS_FS_ZIP /* list for all registered superblocks, mainly for shrinker */ struct list_head list; @@ -85,12 +107,16 @@ struct erofs_sb_info { struct erofs_sb_lz4_info lz4; #endif /* CONFIG_EROFS_FS_ZIP */ + struct erofs_dev_context *devs; struct dax_device *dax_dev; - u32 blocks; + u64 total_blocks; + u32 primarydevice_blocks; + u32 meta_blkaddr; #ifdef CONFIG_EROFS_FS_XATTR u32 xattr_blkaddr; #endif + u16 device_id_mask; /* valid bits of device id to be used */ /* inode slot unit size in bit shift */ unsigned char islotbits; @@ -108,8 +134,6 @@ struct erofs_sb_info { u8 volume_name[16]; /* volume name */ u32 feature_compat; u32 feature_incompat; - - struct erofs_fs_context ctx; /* options */ }; #define EROFS_SB(sb) ((struct erofs_sb_info *)(sb)->s_fs_info) @@ -121,9 +145,9 @@ struct erofs_sb_info { #define EROFS_MOUNT_DAX_ALWAYS 0x00000040 #define EROFS_MOUNT_DAX_NEVER 0x00000080 -#define clear_opt(ctx, option) ((ctx)->mount_opt &= ~EROFS_MOUNT_##option) -#define set_opt(ctx, option) ((ctx)->mount_opt |= EROFS_MOUNT_##option) -#define test_opt(ctx, option) ((ctx)->mount_opt & EROFS_MOUNT_##option) +#define clear_opt(opt, option) ((opt)->mount_opt &= ~EROFS_MOUNT_##option) +#define set_opt(opt, option) ((opt)->mount_opt |= EROFS_MOUNT_##option) +#define test_opt(opt, option) ((opt)->mount_opt & EROFS_MOUNT_##option) enum { EROFS_ZIP_CACHE_DISABLED, @@ -237,6 +261,7 @@ static inline bool erofs_sb_has_##name(struct erofs_sb_info *sbi) \ EROFS_FEATURE_FUNCS(lz4_0padding, incompat, INCOMPAT_LZ4_0PADDING) EROFS_FEATURE_FUNCS(compr_cfgs, incompat, INCOMPAT_COMPR_CFGS) EROFS_FEATURE_FUNCS(big_pcluster, incompat, INCOMPAT_BIG_PCLUSTER) +EROFS_FEATURE_FUNCS(device_table, incompat, INCOMPAT_DEVICE_TABLE) EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM) /* atomic flag definitions */ @@ -307,6 +332,19 @@ static inline unsigned int erofs_inode_datalayout(unsigned int value) EROFS_I_DATALAYOUT_BITS); } +/* + * Different from grab_cache_page_nowait(), reclaiming is never triggered + * when allocating new pages. + */ +static inline +struct page *erofs_grab_cache_page_nowait(struct address_space *mapping, + pgoff_t index) +{ + return pagecache_get_page(mapping, index, + FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT, + readahead_gfp_mask(mapping) & ~__GFP_RECLAIM); +} + extern const struct super_operations erofs_sops; extern const struct address_space_operations erofs_raw_access_aops; @@ -338,7 +376,7 @@ extern const struct address_space_operations z_erofs_aops; * of the corresponding uncompressed data in the file. */ enum { - BH_Zipped = BH_PrivateStart, + BH_Encoded = BH_PrivateStart, BH_FullMapped, }; @@ -346,8 +384,8 @@ enum { #define EROFS_MAP_MAPPED (1 << BH_Mapped) /* Located in metadata (could be copied from bd_inode) */ #define EROFS_MAP_META (1 << BH_Meta) -/* The extent has been compressed */ -#define EROFS_MAP_ZIPPED (1 << BH_Zipped) +/* The extent is encoded */ +#define EROFS_MAP_ENCODED (1 << BH_Encoded) /* The length of extent is full */ #define EROFS_MAP_FULL_MAPPED (1 << BH_FullMapped) @@ -355,6 +393,8 @@ struct erofs_map_blocks { erofs_off_t m_pa, m_la; u64 m_plen, m_llen; + unsigned short m_deviceid; + char m_algorithmformat; unsigned int m_flags; struct page *mpage; @@ -367,6 +407,13 @@ struct erofs_map_blocks { * approach instead if possible since it's more metadata lightweight.) */ #define EROFS_GET_BLOCKS_FIEMAP 0x0002 +/* Used to map the whole extent if non-negligible data is requested for LZMA */ +#define EROFS_GET_BLOCKS_READMORE 0x0004 + +enum { + Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX, + Z_EROFS_COMPRESSION_RUNTIME_MAX +}; /* zmap.c */ extern const struct iomap_ops z_erofs_iomap_report_ops; @@ -386,9 +433,18 @@ static inline int z_erofs_map_blocks_iter(struct inode *inode, } #endif /* !CONFIG_EROFS_FS_ZIP */ +struct erofs_map_dev { + struct block_device *m_bdev; + struct dax_device *m_daxdev; + + erofs_off_t m_pa; + unsigned int m_deviceid; +}; + /* data.c */ extern const struct file_operations erofs_file_fops; struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr); +int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev); int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); @@ -443,7 +499,14 @@ void erofs_pcpubuf_init(void); void erofs_pcpubuf_exit(void); /* utils.c / zdata.c */ -struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp); +struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp); +static inline void erofs_pagepool_add(struct page **pagepool, + struct page *page) +{ + set_page_private(page, (unsigned long)*pagepool); + *pagepool = page; +} +void erofs_release_pages(struct page **pagepool); #ifdef CONFIG_EROFS_FS_ZIP int erofs_workgroup_put(struct erofs_workgroup *grp); @@ -483,6 +546,26 @@ static inline int z_erofs_load_lz4_config(struct super_block *sb, } #endif /* !CONFIG_EROFS_FS_ZIP */ +#ifdef CONFIG_EROFS_FS_ZIP_LZMA +int z_erofs_lzma_init(void); +void z_erofs_lzma_exit(void); +int z_erofs_load_lzma_config(struct super_block *sb, + struct erofs_super_block *dsb, + struct z_erofs_lzma_cfgs *lzma, int size); +#else +static inline int z_erofs_lzma_init(void) { return 0; } +static inline int z_erofs_lzma_exit(void) { return 0; } +static inline int z_erofs_load_lzma_config(struct super_block *sb, + struct erofs_super_block *dsb, + struct z_erofs_lzma_cfgs *lzma, int size) { + if (lzma) { + erofs_err(sb, "lzma algorithm isn't enabled"); + return -EINVAL; + } + return 0; +} +#endif /* !CONFIG_EROFS_FS_ZIP */ + #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #endif /* __EROFS_INTERNAL_H */ diff --git a/fs/erofs/pcpubuf.c b/fs/erofs/pcpubuf.c index 6c885575128a..a2efd833d1b6 100644 --- a/fs/erofs/pcpubuf.c +++ b/fs/erofs/pcpubuf.c @@ -49,7 +49,7 @@ int erofs_pcpubuf_growsize(unsigned int nrpages) { static DEFINE_MUTEX(pcb_resize_mutex); static unsigned int pcb_nrpages; - LIST_HEAD(pagepool); + struct page *pagepool = NULL; int delta, cpu, ret, i; mutex_lock(&pcb_resize_mutex); @@ -102,13 +102,13 @@ int erofs_pcpubuf_growsize(unsigned int nrpages) vunmap(old_ptr); free_pagearray: while (i) - list_add(&oldpages[--i]->lru, &pagepool); + erofs_pagepool_add(&pagepool, oldpages[--i]); kfree(oldpages); if (ret) break; } pcb_nrpages = nrpages; - put_pages_list(&pagepool); + erofs_release_pages(&pagepool); out: mutex_unlock(&pcb_resize_mutex); return ret; diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 11b88559f8bf..6a969b1e0ee6 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -225,6 +225,9 @@ static int erofs_load_compr_cfgs(struct super_block *sb, case Z_EROFS_COMPRESSION_LZ4: ret = z_erofs_load_lz4_config(sb, dsb, data, size); break; + case Z_EROFS_COMPRESSION_LZMA: + ret = z_erofs_load_lzma_config(sb, dsb, data, size); + break; default: DBG_BUGON(1); ret = -EFAULT; @@ -252,6 +255,79 @@ static int erofs_load_compr_cfgs(struct super_block *sb, } #endif +static int erofs_init_devices(struct super_block *sb, + struct erofs_super_block *dsb) +{ + struct erofs_sb_info *sbi = EROFS_SB(sb); + unsigned int ondisk_extradevs; + erofs_off_t pos; + struct page *page = NULL; + struct erofs_device_info *dif; + struct erofs_deviceslot *dis; + void *ptr; + int id, err = 0; + + sbi->total_blocks = sbi->primarydevice_blocks; + if (!erofs_sb_has_device_table(sbi)) + ondisk_extradevs = 0; + else + ondisk_extradevs = le16_to_cpu(dsb->extra_devices); + + if (ondisk_extradevs != sbi->devs->extra_devices) { + erofs_err(sb, "extra devices don't match (ondisk %u, given %u)", + ondisk_extradevs, sbi->devs->extra_devices); + return -EINVAL; + } + if (!ondisk_extradevs) + return 0; + + sbi->device_id_mask = roundup_pow_of_two(ondisk_extradevs + 1) - 1; + pos = le16_to_cpu(dsb->devt_slotoff) * EROFS_DEVT_SLOT_SIZE; + down_read(&sbi->devs->rwsem); + idr_for_each_entry(&sbi->devs->tree, dif, id) { + erofs_blk_t blk = erofs_blknr(pos); + struct block_device *bdev; + + if (!page || page->index != blk) { + if (page) { + kunmap(page); + unlock_page(page); + put_page(page); + } + + page = erofs_get_meta_page(sb, blk); + if (IS_ERR(page)) { + up_read(&sbi->devs->rwsem); + return PTR_ERR(page); + } + ptr = kmap(page); + } + dis = ptr + erofs_blkoff(pos); + + bdev = blkdev_get_by_path(dif->path, + FMODE_READ | FMODE_EXCL, + sb->s_type); + if (IS_ERR(bdev)) { + err = PTR_ERR(bdev); + goto err_out; + } + dif->bdev = bdev; + dif->dax_dev = fs_dax_get_by_bdev(bdev); + dif->blocks = le32_to_cpu(dis->blocks); + dif->mapped_blkaddr = le32_to_cpu(dis->mapped_blkaddr); + sbi->total_blocks += dif->blocks; + pos += EROFS_DEVT_SLOT_SIZE; + } +err_out: + up_read(&sbi->devs->rwsem); + if (page) { + kunmap(page); + unlock_page(page); + put_page(page); + } + return err; +} + static int erofs_read_superblock(struct super_block *sb) { struct erofs_sb_info *sbi; @@ -303,7 +379,7 @@ static int erofs_read_superblock(struct super_block *sb) sbi->sb_size); goto out; } - sbi->blocks = le32_to_cpu(dsb->blocks); + sbi->primarydevice_blocks = le32_to_cpu(dsb->blocks); sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr); #ifdef CONFIG_EROFS_FS_XATTR sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr); @@ -330,6 +406,11 @@ static int erofs_read_superblock(struct super_block *sb) ret = erofs_load_compr_cfgs(sb, dsb); else ret = z_erofs_load_lz4_config(sb, dsb, NULL, 0); + if (ret < 0) + goto out; + + /* handle multiple devices */ + ret = erofs_init_devices(sb, dsb); out: kunmap(page); put_page(page); @@ -340,15 +421,15 @@ out: static void erofs_default_options(struct erofs_fs_context *ctx) { #ifdef CONFIG_EROFS_FS_ZIP - ctx->cache_strategy = EROFS_ZIP_CACHE_READAROUND; - ctx->max_sync_decompress_pages = 3; - ctx->readahead_sync_decompress = false; + ctx->opt.cache_strategy = EROFS_ZIP_CACHE_READAROUND; + ctx->opt.max_sync_decompress_pages = 3; + ctx->opt.readahead_sync_decompress = false; #endif #ifdef CONFIG_EROFS_FS_XATTR - set_opt(ctx, XATTR_USER); + set_opt(&ctx->opt, XATTR_USER); #endif #ifdef CONFIG_EROFS_FS_POSIX_ACL - set_opt(ctx, POSIX_ACL); + set_opt(&ctx->opt, POSIX_ACL); #endif } @@ -358,6 +439,7 @@ enum { Opt_cache_strategy, Opt_dax, Opt_dax_enum, + Opt_device, Opt_err }; @@ -381,6 +463,7 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = { erofs_param_cache_strategy), fsparam_flag("dax", Opt_dax), fsparam_enum("dax", Opt_dax_enum, erofs_dax_param_enums), + fsparam_string("device", Opt_device), {} }; @@ -392,12 +475,12 @@ static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode) switch (mode) { case EROFS_MOUNT_DAX_ALWAYS: warnfc(fc, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); - set_opt(ctx, DAX_ALWAYS); - clear_opt(ctx, DAX_NEVER); + set_opt(&ctx->opt, DAX_ALWAYS); + clear_opt(&ctx->opt, DAX_NEVER); return true; case EROFS_MOUNT_DAX_NEVER: - set_opt(ctx, DAX_NEVER); - clear_opt(ctx, DAX_ALWAYS); + set_opt(&ctx->opt, DAX_NEVER); + clear_opt(&ctx->opt, DAX_ALWAYS); return true; default: DBG_BUGON(1); @@ -412,9 +495,10 @@ static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode) static int erofs_fc_parse_param(struct fs_context *fc, struct fs_parameter *param) { - struct erofs_fs_context *ctx __maybe_unused = fc->fs_private; + struct erofs_fs_context *ctx = fc->fs_private; struct fs_parse_result result; - int opt; + struct erofs_device_info *dif; + int opt, ret; opt = fs_parse(fc, erofs_fs_parameters, param, &result); if (opt < 0) @@ -424,9 +508,9 @@ static int erofs_fc_parse_param(struct fs_context *fc, case Opt_user_xattr: #ifdef CONFIG_EROFS_FS_XATTR if (result.boolean) - set_opt(ctx, XATTR_USER); + set_opt(&ctx->opt, XATTR_USER); else - clear_opt(ctx, XATTR_USER); + clear_opt(&ctx->opt, XATTR_USER); #else errorfc(fc, "{,no}user_xattr options not supported"); #endif @@ -434,16 +518,16 @@ static int erofs_fc_parse_param(struct fs_context *fc, case Opt_acl: #ifdef CONFIG_EROFS_FS_POSIX_ACL if (result.boolean) - set_opt(ctx, POSIX_ACL); + set_opt(&ctx->opt, POSIX_ACL); else - clear_opt(ctx, POSIX_ACL); + clear_opt(&ctx->opt, POSIX_ACL); #else errorfc(fc, "{,no}acl options not supported"); #endif break; case Opt_cache_strategy: #ifdef CONFIG_EROFS_FS_ZIP - ctx->cache_strategy = result.uint_32; + ctx->opt.cache_strategy = result.uint_32; #else errorfc(fc, "compression not supported, cache_strategy ignored"); #endif @@ -456,6 +540,25 @@ static int erofs_fc_parse_param(struct fs_context *fc, if (!erofs_fc_set_dax_mode(fc, result.uint_32)) return -EINVAL; break; + case Opt_device: + dif = kzalloc(sizeof(*dif), GFP_KERNEL); + if (!dif) + return -ENOMEM; + dif->path = kstrdup(param->string, GFP_KERNEL); + if (!dif->path) { + kfree(dif); + return -ENOMEM; + } + down_write(&ctx->devs->rwsem); + ret = idr_alloc(&ctx->devs->tree, dif, 0, 0, GFP_KERNEL); + up_write(&ctx->devs->rwsem); + if (ret < 0) { + kfree(dif->path); + kfree(dif); + return ret; + } + ++ctx->devs->extra_devices; + break; default: return -ENOPARAM; } @@ -540,15 +643,19 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) return -ENOMEM; sb->s_fs_info = sbi; + sbi->opt = ctx->opt; sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev); + sbi->devs = ctx->devs; + ctx->devs = NULL; + err = erofs_read_superblock(sb); if (err) return err; - if (test_opt(ctx, DAX_ALWAYS) && + if (test_opt(&sbi->opt, DAX_ALWAYS) && !dax_supported(sbi->dax_dev, sb->s_bdev, EROFS_BLKSIZ, 0, bdev_nr_sectors(sb->s_bdev))) { errorfc(fc, "DAX unsupported by block device. Turning off DAX."); - clear_opt(ctx, DAX_ALWAYS); + clear_opt(&sbi->opt, DAX_ALWAYS); } sb->s_flags |= SB_RDONLY | SB_NOATIME; sb->s_maxbytes = MAX_LFS_FILESIZE; @@ -557,13 +664,11 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_op = &erofs_sops; sb->s_xattr = erofs_xattr_handlers; - if (test_opt(ctx, POSIX_ACL)) + if (test_opt(&sbi->opt, POSIX_ACL)) sb->s_flags |= SB_POSIXACL; else sb->s_flags &= ~SB_POSIXACL; - sbi->ctx = *ctx; - #ifdef CONFIG_EROFS_FS_ZIP xa_init(&sbi->managed_pslots); #endif @@ -607,20 +712,44 @@ static int erofs_fc_reconfigure(struct fs_context *fc) DBG_BUGON(!sb_rdonly(sb)); - if (test_opt(ctx, POSIX_ACL)) + if (test_opt(&ctx->opt, POSIX_ACL)) fc->sb_flags |= SB_POSIXACL; else fc->sb_flags &= ~SB_POSIXACL; - sbi->ctx = *ctx; + sbi->opt = ctx->opt; fc->sb_flags |= SB_RDONLY; return 0; } +static int erofs_release_device_info(int id, void *ptr, void *data) +{ + struct erofs_device_info *dif = ptr; + + fs_put_dax(dif->dax_dev); + if (dif->bdev) + blkdev_put(dif->bdev, FMODE_READ | FMODE_EXCL); + kfree(dif->path); + kfree(dif); + return 0; +} + +static void erofs_free_dev_context(struct erofs_dev_context *devs) +{ + if (!devs) + return; + idr_for_each(&devs->tree, &erofs_release_device_info, NULL); + idr_destroy(&devs->tree); + kfree(devs); +} + static void erofs_fc_free(struct fs_context *fc) { - kfree(fc->fs_private); + struct erofs_fs_context *ctx = fc->fs_private; + + erofs_free_dev_context(ctx->devs); + kfree(ctx); } static const struct fs_context_operations erofs_context_ops = { @@ -632,15 +761,21 @@ static const struct fs_context_operations erofs_context_ops = { static int erofs_init_fs_context(struct fs_context *fc) { - fc->fs_private = kzalloc(sizeof(struct erofs_fs_context), GFP_KERNEL); - if (!fc->fs_private) - return -ENOMEM; + struct erofs_fs_context *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); - /* set default mount options */ - erofs_default_options(fc->fs_private); + if (!ctx) + return -ENOMEM; + ctx->devs = kzalloc(sizeof(struct erofs_dev_context), GFP_KERNEL); + if (!ctx->devs) { + kfree(ctx); + return -ENOMEM; + } + fc->fs_private = ctx; + idr_init(&ctx->devs->tree); + init_rwsem(&ctx->devs->rwsem); + erofs_default_options(ctx); fc->ops = &erofs_context_ops; - return 0; } @@ -659,6 +794,8 @@ static void erofs_kill_sb(struct super_block *sb) sbi = EROFS_SB(sb); if (!sbi) return; + + erofs_free_dev_context(sbi->devs); fs_put_dax(sbi->dax_dev); kfree(sbi); sb->s_fs_info = NULL; @@ -706,6 +843,10 @@ static int __init erofs_module_init(void) if (err) goto shrinker_err; + err = z_erofs_lzma_init(); + if (err) + goto lzma_err; + erofs_pcpubuf_init(); err = z_erofs_init_zip_subsystem(); if (err) @@ -720,6 +861,8 @@ static int __init erofs_module_init(void) fs_err: z_erofs_exit_zip_subsystem(); zip_err: + z_erofs_lzma_exit(); +lzma_err: erofs_exit_shrinker(); shrinker_err: kmem_cache_destroy(erofs_inode_cachep); @@ -730,11 +873,13 @@ icache_err: static void __exit erofs_module_exit(void) { unregister_filesystem(&erofs_fs_type); - z_erofs_exit_zip_subsystem(); - erofs_exit_shrinker(); - /* Ensure all RCU free inodes are safe before cache is destroyed. */ + /* Ensure all RCU free inodes / pclusters are safe to be destroyed. */ rcu_barrier(); + + z_erofs_exit_zip_subsystem(); + z_erofs_lzma_exit(); + erofs_exit_shrinker(); kmem_cache_destroy(erofs_inode_cachep); erofs_pcpubuf_exit(); } @@ -748,7 +893,7 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_type = sb->s_magic; buf->f_bsize = EROFS_BLKSIZ; - buf->f_blocks = sbi->blocks; + buf->f_blocks = sbi->total_blocks; buf->f_bfree = buf->f_bavail = 0; buf->f_files = ULLONG_MAX; @@ -763,31 +908,31 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf) static int erofs_show_options(struct seq_file *seq, struct dentry *root) { struct erofs_sb_info *sbi = EROFS_SB(root->d_sb); - struct erofs_fs_context *ctx = &sbi->ctx; + struct erofs_mount_opts *opt = &sbi->opt; #ifdef CONFIG_EROFS_FS_XATTR - if (test_opt(ctx, XATTR_USER)) + if (test_opt(opt, XATTR_USER)) seq_puts(seq, ",user_xattr"); else seq_puts(seq, ",nouser_xattr"); #endif #ifdef CONFIG_EROFS_FS_POSIX_ACL - if (test_opt(ctx, POSIX_ACL)) + if (test_opt(opt, POSIX_ACL)) seq_puts(seq, ",acl"); else seq_puts(seq, ",noacl"); #endif #ifdef CONFIG_EROFS_FS_ZIP - if (ctx->cache_strategy == EROFS_ZIP_CACHE_DISABLED) + if (opt->cache_strategy == EROFS_ZIP_CACHE_DISABLED) seq_puts(seq, ",cache_strategy=disabled"); - else if (ctx->cache_strategy == EROFS_ZIP_CACHE_READAHEAD) + else if (opt->cache_strategy == EROFS_ZIP_CACHE_READAHEAD) seq_puts(seq, ",cache_strategy=readahead"); - else if (ctx->cache_strategy == EROFS_ZIP_CACHE_READAROUND) + else if (opt->cache_strategy == EROFS_ZIP_CACHE_READAROUND) seq_puts(seq, ",cache_strategy=readaround"); #endif - if (test_opt(ctx, DAX_ALWAYS)) + if (test_opt(opt, DAX_ALWAYS)) seq_puts(seq, ",dax=always"); - if (test_opt(ctx, DAX_NEVER)) + if (test_opt(opt, DAX_NEVER)) seq_puts(seq, ",dax=never"); return 0; } diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c index bd86067a63f7..84da2c280012 100644 --- a/fs/erofs/utils.c +++ b/fs/erofs/utils.c @@ -6,20 +6,29 @@ #include "internal.h" #include <linux/pagevec.h> -struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp) +struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp) { - struct page *page; + struct page *page = *pagepool; - if (!list_empty(pool)) { - page = lru_to_page(pool); + if (page) { DBG_BUGON(page_ref_count(page) != 1); - list_del(&page->lru); + *pagepool = (struct page *)page_private(page); } else { page = alloc_page(gfp); } return page; } +void erofs_release_pages(struct page **pagepool) +{ + while (*pagepool) { + struct page *page = *pagepool; + + *pagepool = (struct page *)page_private(page); + put_page(page); + } +} + #ifdef CONFIG_EROFS_FS_ZIP /* global shrink count (for all mounted EROFS instances) */ static atomic_long_t erofs_global_shrink_cnt; diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index 778f2c52295d..01c581e93c5f 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -429,7 +429,7 @@ static int shared_getxattr(struct inode *inode, struct getxattr_iter *it) static bool erofs_xattr_user_list(struct dentry *dentry) { - return test_opt(&EROFS_SB(dentry->d_sb)->ctx, XATTR_USER); + return test_opt(&EROFS_SB(dentry->d_sb)->opt, XATTR_USER); } static bool erofs_xattr_trusted_list(struct dentry *dentry) @@ -476,7 +476,7 @@ static int erofs_xattr_generic_get(const struct xattr_handler *handler, switch (handler->flags) { case EROFS_XATTR_INDEX_USER: - if (!test_opt(&sbi->ctx, XATTR_USER)) + if (!test_opt(&sbi->opt, XATTR_USER)) return -EOPNOTSUPP; break; case EROFS_XATTR_INDEX_TRUSTED: diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 11c7a1aaebad..bcb1b91b234f 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -236,7 +236,7 @@ static DEFINE_MUTEX(z_pagemap_global_lock); static void preload_compressed_pages(struct z_erofs_collector *clt, struct address_space *mc, enum z_erofs_cache_alloctype type, - struct list_head *pagepool) + struct page **pagepool) { struct z_erofs_pcluster *pcl = clt->pcl; bool standalone = true; @@ -287,12 +287,10 @@ static void preload_compressed_pages(struct z_erofs_collector *clt, if (!cmpxchg_relaxed(pages, NULL, tagptr_cast_ptr(t))) continue; - if (page) { + if (page) put_page(page); - } else if (newpage) { - set_page_private(newpage, 0); - list_add(&newpage->lru, pagepool); - } + else if (newpage) + erofs_pagepool_add(pagepool, newpage); } /* @@ -476,6 +474,11 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt, struct erofs_workgroup *grp; int err; + if (!(map->m_flags & EROFS_MAP_ENCODED)) { + DBG_BUGON(1); + return -EFSCORRUPTED; + } + /* no available pcluster, let's allocate one */ pcl = z_erofs_alloc_pcluster(map->m_plen >> PAGE_SHIFT); if (IS_ERR(pcl)) @@ -483,16 +486,11 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt, atomic_set(&pcl->obj.refcount, 1); pcl->obj.index = map->m_pa >> PAGE_SHIFT; - + pcl->algorithmformat = map->m_algorithmformat; pcl->length = (map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) | (map->m_flags & EROFS_MAP_FULL_MAPPED ? Z_EROFS_PCLUSTER_FULL_LENGTH : 0); - if (map->m_flags & EROFS_MAP_ZIPPED) - pcl->algorithmformat = Z_EROFS_COMPRESSION_LZ4; - else - pcl->algorithmformat = Z_EROFS_COMPRESSION_SHIFTED; - /* new pclusters should be claimed as type 1, primary and followed */ pcl->next = clt->owned_head; clt->mode = COLLECT_PRIMARY_FOLLOWED; @@ -643,7 +641,7 @@ static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe, } static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, - struct page *page, struct list_head *pagepool) + struct page *page, struct page **pagepool) { struct inode *const inode = fe->inode; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); @@ -695,7 +693,7 @@ restart_now: goto err_out; /* preload all compressed pages (maybe downgrade role if necessary) */ - if (should_alloc_managed_pages(fe, sbi->ctx.cache_strategy, map->m_la)) + if (should_alloc_managed_pages(fe, sbi->opt.cache_strategy, map->m_la)) cache_strategy = TRYALLOC; else cache_strategy = DONTALLOC; @@ -796,7 +794,7 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, /* Use workqueue and sync decompression for atomic contexts only */ if (in_atomic() || irqs_disabled()) { queue_work(z_erofs_workqueue, &io->u.work); - sbi->ctx.readahead_sync_decompress = true; + sbi->opt.readahead_sync_decompress = true; return; } z_erofs_decompressqueue_work(&io->u.work); @@ -836,7 +834,7 @@ static void z_erofs_decompressqueue_endio(struct bio *bio) static int z_erofs_decompress_pcluster(struct super_block *sb, struct z_erofs_pcluster *pcl, - struct list_head *pagepool) + struct page **pagepool) { struct erofs_sb_info *const sbi = EROFS_SB(sb); struct z_erofs_pagevec_ctor ctor; @@ -1036,7 +1034,7 @@ out: } static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, - struct list_head *pagepool) + struct page **pagepool) { z_erofs_next_pcluster_t owned = io->head; @@ -1060,18 +1058,18 @@ static void z_erofs_decompressqueue_work(struct work_struct *work) { struct z_erofs_decompressqueue *bgq = container_of(work, struct z_erofs_decompressqueue, u.work); - LIST_HEAD(pagepool); + struct page *pagepool = NULL; DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL_CLOSED); z_erofs_decompress_queue(bgq, &pagepool); - put_pages_list(&pagepool); + erofs_release_pages(&pagepool); kvfree(bgq); } static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, unsigned int nr, - struct list_head *pagepool, + struct page **pagepool, struct address_space *mc, gfp_t gfp) { @@ -1173,7 +1171,7 @@ repeat: out_allocpage: page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL); if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) { - list_add(&page->lru, pagepool); + erofs_pagepool_add(pagepool, page); cond_resched(); goto repeat; } @@ -1257,7 +1255,7 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, static void z_erofs_submit_queue(struct super_block *sb, struct z_erofs_decompress_frontend *f, - struct list_head *pagepool, + struct page **pagepool, struct z_erofs_decompressqueue *fgq, bool *force_fg) { @@ -1266,8 +1264,9 @@ static void z_erofs_submit_queue(struct super_block *sb, struct z_erofs_decompressqueue *q[NR_JOBQUEUES]; void *bi_private; z_erofs_next_pcluster_t owned_head = f->clt.owned_head; - /* since bio will be NULL, no need to initialize last_index */ + /* bio is NULL initially, so no need to initialize last_{index,bdev} */ pgoff_t last_index; + struct block_device *last_bdev; unsigned int nr_bios = 0; struct bio *bio = NULL; @@ -1279,6 +1278,7 @@ static void z_erofs_submit_queue(struct super_block *sb, q[JQ_SUBMIT]->head = owned_head; do { + struct erofs_map_dev mdev; struct z_erofs_pcluster *pcl; pgoff_t cur, end; unsigned int i = 0; @@ -1290,7 +1290,13 @@ static void z_erofs_submit_queue(struct super_block *sb, pcl = container_of(owned_head, struct z_erofs_pcluster, next); - cur = pcl->obj.index; + /* no device id here, thus it will always succeed */ + mdev = (struct erofs_map_dev) { + .m_pa = blknr_to_addr(pcl->obj.index), + }; + (void)erofs_map_dev(sb, &mdev); + + cur = erofs_blknr(mdev.m_pa); end = cur + pcl->pclusterpages; /* close the main owned chain at first */ @@ -1306,7 +1312,8 @@ static void z_erofs_submit_queue(struct super_block *sb, if (!page) continue; - if (bio && cur != last_index + 1) { + if (bio && (cur != last_index + 1 || + last_bdev != mdev.m_bdev)) { submit_bio_retry: submit_bio(bio); bio = NULL; @@ -1314,9 +1321,10 @@ submit_bio_retry: if (!bio) { bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS); - bio->bi_end_io = z_erofs_decompressqueue_endio; - bio_set_dev(bio, sb->s_bdev); + + bio_set_dev(bio, mdev.m_bdev); + last_bdev = mdev.m_bdev; bio->bi_iter.bi_sector = (sector_t)cur << LOG_SECTORS_PER_BLOCK; bio->bi_private = bi_private; @@ -1355,7 +1363,7 @@ submit_bio_retry: static void z_erofs_runqueue(struct super_block *sb, struct z_erofs_decompress_frontend *f, - struct list_head *pagepool, bool force_fg) + struct page **pagepool, bool force_fg) { struct z_erofs_decompressqueue io[NR_JOBQUEUES]; @@ -1377,18 +1385,87 @@ static void z_erofs_runqueue(struct super_block *sb, z_erofs_decompress_queue(&io[JQ_SUBMIT], pagepool); } +/* + * Since partial uptodate is still unimplemented for now, we have to use + * approximate readmore strategies as a start. + */ +static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, + struct readahead_control *rac, + erofs_off_t end, + struct page **pagepool, + bool backmost) +{ + struct inode *inode = f->inode; + struct erofs_map_blocks *map = &f->map; + erofs_off_t cur; + int err; + + if (backmost) { + map->m_la = end; + err = z_erofs_map_blocks_iter(inode, map, + EROFS_GET_BLOCKS_READMORE); + if (err) + return; + + /* expend ra for the trailing edge if readahead */ + if (rac) { + loff_t newstart = readahead_pos(rac); + + cur = round_up(map->m_la + map->m_llen, PAGE_SIZE); + readahead_expand(rac, newstart, cur - newstart); + return; + } + end = round_up(end, PAGE_SIZE); + } else { + end = round_up(map->m_la, PAGE_SIZE); + + if (!map->m_llen) + return; + } + + cur = map->m_la + map->m_llen - 1; + while (cur >= end) { + pgoff_t index = cur >> PAGE_SHIFT; + struct page *page; + + page = erofs_grab_cache_page_nowait(inode->i_mapping, index); + if (!page) + goto skip; + + if (PageUptodate(page)) { + unlock_page(page); + put_page(page); + goto skip; + } + + err = z_erofs_do_read_page(f, page, pagepool); + if (err) + erofs_err(inode->i_sb, + "readmore error at page %lu @ nid %llu", + index, EROFS_I(inode)->nid); + put_page(page); +skip: + if (cur < PAGE_SIZE) + break; + cur = (index << PAGE_SHIFT) - 1; + } +} + static int z_erofs_readpage(struct file *file, struct page *page) { struct inode *const inode = page->mapping->host; struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); + struct page *pagepool = NULL; int err; - LIST_HEAD(pagepool); trace_erofs_readpage(page, false); - f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT; + z_erofs_pcluster_readmore(&f, NULL, f.headoffset + PAGE_SIZE - 1, + &pagepool, true); err = z_erofs_do_read_page(&f, page, &pagepool); + z_erofs_pcluster_readmore(&f, NULL, 0, &pagepool, false); + (void)z_erofs_collector_end(&f.clt); /* if some compressed cluster ready, need submit them anyway */ @@ -1400,8 +1477,7 @@ static int z_erofs_readpage(struct file *file, struct page *page) if (f.map.mpage) put_page(f.map.mpage); - /* clean up the remaining free pages */ - put_pages_list(&pagepool); + erofs_release_pages(&pagepool); return err; } @@ -1409,29 +1485,19 @@ static void z_erofs_readahead(struct readahead_control *rac) { struct inode *const inode = rac->mapping->host; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); - - unsigned int nr_pages = readahead_count(rac); - bool sync = (sbi->ctx.readahead_sync_decompress && - nr_pages <= sbi->ctx.max_sync_decompress_pages); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); - struct page *page, *head = NULL; - LIST_HEAD(pagepool); - - trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false); + struct page *pagepool = NULL, *head = NULL, *page; + unsigned int nr_pages; f.readahead = true; f.headoffset = readahead_pos(rac); - while ((page = readahead_page(rac))) { - prefetchw(&page->flags); - - /* - * A pure asynchronous readahead is indicated if - * a PG_readahead marked page is hitted at first. - * Let's also do asynchronous decompression for this case. - */ - sync &= !(PageReadahead(page) && !head); + z_erofs_pcluster_readmore(&f, rac, f.headoffset + + readahead_length(rac) - 1, &pagepool, true); + nr_pages = readahead_count(rac); + trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false); + while ((page = readahead_page(rac))) { set_page_private(page, (unsigned long)head); head = page; } @@ -1450,16 +1516,15 @@ static void z_erofs_readahead(struct readahead_control *rac) page->index, EROFS_I(inode)->nid); put_page(page); } - + z_erofs_pcluster_readmore(&f, rac, 0, &pagepool, false); (void)z_erofs_collector_end(&f.clt); - z_erofs_runqueue(inode->i_sb, &f, &pagepool, sync); - + z_erofs_runqueue(inode->i_sb, &f, &pagepool, + sbi->opt.readahead_sync_decompress && + nr_pages <= sbi->opt.max_sync_decompress_pages); if (f.map.mpage) put_page(f.map.mpage); - - /* clean up the remaining free pages */ - put_pages_list(&pagepool); + erofs_release_pages(&pagepool); } const struct address_space_operations z_erofs_aops = { diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index 3a008f1b9f78..879df5362777 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -94,13 +94,6 @@ struct z_erofs_decompressqueue { } u; }; -#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping) -static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi, - struct page *page) -{ - return page->mapping == MNGD_MAPPING(sbi); -} - #define Z_EROFS_ONLINEPAGE_COUNT_BITS 2 #define Z_EROFS_ONLINEPAGE_COUNT_MASK ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1) #define Z_EROFS_ONLINEPAGE_INDEX_SHIFT (Z_EROFS_ONLINEPAGE_COUNT_BITS) diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 9fb98d85a3ce..660489a7fb64 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -28,7 +28,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) { struct erofs_inode *const vi = EROFS_I(inode); struct super_block *const sb = inode->i_sb; - int err; + int err, headnr; erofs_off_t pos; struct page *page; void *kaddr; @@ -68,9 +68,11 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) vi->z_algorithmtype[0] = h->h_algorithmtype & 15; vi->z_algorithmtype[1] = h->h_algorithmtype >> 4; - if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX) { - erofs_err(sb, "unknown compression format %u for nid %llu, please upgrade kernel", - vi->z_algorithmtype[0], vi->nid); + headnr = 0; + if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX || + vi->z_algorithmtype[++headnr] >= Z_EROFS_COMPRESSION_MAX) { + erofs_err(sb, "unknown HEAD%u format %u for nid %llu, please upgrade kernel", + headnr + 1, vi->z_algorithmtype[headnr], vi->nid); err = -EOPNOTSUPP; goto unmap_done; } @@ -111,7 +113,7 @@ struct z_erofs_maprecorder { unsigned long lcn; /* compression extent information gathered */ - u8 type; + u8 type, headtype; u16 clusterofs; u16 delta[2]; erofs_blk_t pblk, compressedlcs; @@ -178,7 +180,8 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, m->clusterofs = 1 << vi->z_logical_clusterbits; m->delta[0] = le16_to_cpu(di->di_u.delta[0]); if (m->delta[0] & Z_EROFS_VLE_DI_D0_CBLKCNT) { - if (!(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) { + if (!(vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 | + Z_EROFS_ADVISE_BIG_PCLUSTER_2))) { DBG_BUGON(1); return -EFSCORRUPTED; } @@ -189,7 +192,8 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, m->delta[1] = le16_to_cpu(di->di_u.delta[1]); break; case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: - case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2: m->clusterofs = le16_to_cpu(di->di_clusterofs); m->pblk = le32_to_cpu(di->di_u.blkaddr); break; @@ -369,7 +373,8 @@ static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m, if (compacted_4b_initial == 32 / 4) compacted_4b_initial = 0; - if (vi->z_advise & Z_EROFS_ADVISE_COMPACTED_2B) + if ((vi->z_advise & Z_EROFS_ADVISE_COMPACTED_2B) && + compacted_4b_initial < totalidx) compacted_2b = rounddown(totalidx - compacted_4b_initial, 16); else compacted_2b = 0; @@ -445,9 +450,9 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m, } return z_erofs_extent_lookback(m, m->delta[0]); case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: - map->m_flags &= ~EROFS_MAP_ZIPPED; - fallthrough; - case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2: + m->headtype = m->type; map->m_la = (lcn << lclusterbits) | m->clusterofs; break; default: @@ -470,13 +475,18 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, int err; DBG_BUGON(m->type != Z_EROFS_VLE_CLUSTER_TYPE_PLAIN && - m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD); - if (!(map->m_flags & EROFS_MAP_ZIPPED) || - !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) { + m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD1 && + m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD2); + DBG_BUGON(m->type != m->headtype); + + if (m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN || + ((m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD1) && + !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) || + ((m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) && + !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2))) { map->m_plen = 1 << lclusterbits; return 0; } - lcn = m->lcn + 1; if (m->compressedlcs) goto out; @@ -498,7 +508,8 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, switch (m->type) { case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: - case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2: /* * if the 1st NONHEAD lcluster is actually PLAIN or HEAD type * rather than CBLKCNT, it's a 1 lcluster-sized pcluster. @@ -553,7 +564,8 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m) DBG_BUGON(!m->delta[1] && m->clusterofs != 1 << lclusterbits); } else if (m->type == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN || - m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD) { + m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD1 || + m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) { /* go on until the next HEAD lcluster */ if (lcn != headlcn) break; @@ -608,16 +620,15 @@ int z_erofs_map_blocks_iter(struct inode *inode, if (err) goto unmap_out; - map->m_flags = EROFS_MAP_ZIPPED; /* by default, compressed */ + map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_ENCODED; end = (m.lcn + 1ULL) << lclusterbits; switch (m.type) { case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: - if (endoff >= m.clusterofs) - map->m_flags &= ~EROFS_MAP_ZIPPED; - fallthrough; - case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2: if (endoff >= m.clusterofs) { + m.headtype = m.type; map->m_la = (m.lcn << lclusterbits) | m.clusterofs; break; } @@ -649,13 +660,22 @@ int z_erofs_map_blocks_iter(struct inode *inode, map->m_llen = end - map->m_la; map->m_pa = blknr_to_addr(m.pblk); - map->m_flags |= EROFS_MAP_MAPPED; err = z_erofs_get_extent_compressedlen(&m, initial_lcn); if (err) goto out; - if (flags & EROFS_GET_BLOCKS_FIEMAP) { + if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN) + map->m_algorithmformat = Z_EROFS_COMPRESSION_SHIFTED; + else if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) + map->m_algorithmformat = vi->z_algorithmtype[1]; + else + map->m_algorithmformat = vi->z_algorithmtype[0]; + + if ((flags & EROFS_GET_BLOCKS_FIEMAP) || + ((flags & EROFS_GET_BLOCKS_READMORE) && + map->m_algorithmformat == Z_EROFS_COMPRESSION_LZMA && + map->m_llen >= EROFS_BLKSIZ)) { err = z_erofs_get_extent_decompressedlen(&m); if (!err) map->m_flags |= EROFS_MAP_FULL_MAPPED; diff --git a/fs/exec.c b/fs/exec.c index a098c133d8d7..b6079f1a098e 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -987,16 +987,14 @@ static int exec_mmap(struct mm_struct *mm) if (old_mm) { /* - * Make sure that if there is a core dump in progress - * for the old mm, we get out and die instead of going - * through with the exec. We must hold mmap_lock around - * checking core_state and changing tsk->mm. + * If there is a pending fatal signal perhaps a signal + * whose default action is to create a coredump get + * out and die instead of going through with the exec. */ - mmap_read_lock(old_mm); - if (unlikely(old_mm->core_state)) { - mmap_read_unlock(old_mm); + ret = mmap_read_lock_killable(old_mm); + if (ret) { up_write(&tsk->signal->exec_update_lock); - return -EINTR; + return ret; } } diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index ca37d4344361..1c7aa1ea4724 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -604,7 +604,7 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info) exfat_save_attr(inode, info->attr); inode->i_blocks = ((i_size_read(inode) + (sbi->cluster_size - 1)) & - ~(sbi->cluster_size - 1)) >> inode->i_blkbits; + ~((loff_t)sbi->cluster_size - 1)) >> inode->i_blkbits; inode->i_mtime = info->mtime; inode->i_ctime = info->mtime; ei->i_crtime = info->crtime; diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index 1f3f4326bf3c..c17ccc19b938 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -48,10 +48,9 @@ struct ext2_group_desc * ext2_get_group_desc(struct super_block * sb, struct ext2_sb_info *sbi = EXT2_SB(sb); if (block_group >= sbi->s_groups_count) { - ext2_error (sb, "ext2_get_group_desc", - "block_group >= groups_count - " - "block_group = %d, groups_count = %lu", - block_group, sbi->s_groups_count); + WARN(1, "block_group >= groups_count - " + "block_group = %d, groups_count = %lu", + block_group, sbi->s_groups_count); return NULL; } @@ -59,10 +58,9 @@ struct ext2_group_desc * ext2_get_group_desc(struct super_block * sb, group_desc = block_group >> EXT2_DESC_PER_BLOCK_BITS(sb); offset = block_group & (EXT2_DESC_PER_BLOCK(sb) - 1); if (!sbi->s_group_desc[group_desc]) { - ext2_error (sb, "ext2_get_group_desc", - "Group descriptor not loaded - " - "block_group = %d, group_desc = %lu, desc = %lu", - block_group, group_desc, offset); + WARN(1, "Group descriptor not loaded - " + "block_group = %d, group_desc = %lu, desc = %lu", + block_group, group_desc, offset); return NULL; } diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index ffb295aa891c..74b172a4adda 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -551,7 +551,7 @@ static int ext4_dx_readdir(struct file *file, struct dir_context *ctx) struct dir_private_info *info = file->private_data; struct inode *inode = file_inode(file); struct fname *fname; - int ret; + int ret = 0; if (!info) { info = ext4_htree_create_dir_info(file, ctx->pos); @@ -599,7 +599,7 @@ static int ext4_dx_readdir(struct file *file, struct dir_context *ctx) info->curr_minor_hash, &info->next_hash); if (ret < 0) - return ret; + goto finished; if (ret == 0) { ctx->pos = ext4_get_htree_eof(file); break; @@ -630,7 +630,7 @@ static int ext4_dx_readdir(struct file *file, struct dir_context *ctx) } finished: info->last_pos = ctx->pos; - return 0; + return ret < 0 ? ret : 0; } static int ext4_release_dir(struct inode *inode, struct file *filp) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 90ff5acaf11f..3825195539d7 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3593,9 +3593,6 @@ extern int ext4_da_write_inline_data_begin(struct address_space *mapping, unsigned flags, struct page **pagep, void **fsdata); -extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, - unsigned len, unsigned copied, - struct page *page); extern int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c0de30f25185..0e02571f2f82 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5916,7 +5916,7 @@ void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end) } /* Check if *cur is a hole and if it is, skip it */ -static void skip_hole(struct inode *inode, ext4_lblk_t *cur) +static int skip_hole(struct inode *inode, ext4_lblk_t *cur) { int ret; struct ext4_map_blocks map; @@ -5925,9 +5925,12 @@ static void skip_hole(struct inode *inode, ext4_lblk_t *cur) map.m_len = ((inode->i_size) >> inode->i_sb->s_blocksize_bits) - *cur; ret = ext4_map_blocks(NULL, inode, &map, 0); + if (ret < 0) + return ret; if (ret != 0) - return; + return 0; *cur = *cur + map.m_len; + return 0; } /* Count number of blocks used by this inode and update i_blocks */ @@ -5976,7 +5979,9 @@ int ext4_ext_replay_set_iblocks(struct inode *inode) * iblocks by total number of differences found. */ cur = 0; - skip_hole(inode, &cur); + ret = skip_hole(inode, &cur); + if (ret < 0) + goto out; path = ext4_find_extent(inode, cur, NULL, 0); if (IS_ERR(path)) goto out; @@ -5995,8 +6000,12 @@ int ext4_ext_replay_set_iblocks(struct inode *inode) } cur = max(cur + 1, le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex)); - skip_hole(inode, &cur); - + ret = skip_hole(inode, &cur); + if (ret < 0) { + ext4_ext_drop_refs(path); + kfree(path); + break; + } path2 = ext4_find_extent(inode, cur, NULL, 0); if (IS_ERR(path2)) { ext4_ext_drop_refs(path); diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 8e610a381862..8ea5a81e6554 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -892,6 +892,12 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) sizeof(lrange), (u8 *)&lrange, crc)) return -ENOSPC; } else { + unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ? + EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN; + + /* Limit the number of blocks in one extent */ + map.m_len = min(max, map.m_len); + fc_ext.fc_ino = cpu_to_le32(inode->i_ino); ex = (struct ext4_extent *)&fc_ext.fc_ex; ex->ee_block = cpu_to_le32(map.m_lblk); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index ac0e11bbb445..4c5f41052351 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -74,7 +74,7 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to) return generic_file_read_iter(iocb, to); } - ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0); + ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0, 0); inode_unlock_shared(inode); file_accessed(iocb->ki_filp); @@ -566,7 +566,8 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) if (ilock_shared) iomap_ops = &ext4_iomap_overwrite_ops; ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops, - (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0); + (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0, + 0); if (ret == -ENOTBLK) ret = 0; @@ -915,7 +916,7 @@ const struct file_operations ext4_file_operations = { .llseek = ext4_llseek, .read_iter = ext4_file_read_iter, .write_iter = ext4_file_write_iter, - .iopoll = iomap_dio_iopoll, + .iopoll = iocb_bio_iopoll, .unlocked_ioctl = ext4_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext4_compat_ioctl, diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 82bf4ff6be28..39a1ab129fdc 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -7,6 +7,7 @@ #include <linux/iomap.h> #include <linux/fiemap.h> #include <linux/iversion.h> +#include <linux/backing-dev.h> #include "ext4_jbd2.h" #include "ext4.h" @@ -733,45 +734,83 @@ convert: int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, struct page *page) { - int ret, no_expand; + handle_t *handle = ext4_journal_current_handle(); + int no_expand; void *kaddr; struct ext4_iloc iloc; + int ret = 0, ret2; + + if (unlikely(copied < len) && !PageUptodate(page)) + copied = 0; - if (unlikely(copied < len)) { - if (!PageUptodate(page)) { - copied = 0; + if (likely(copied)) { + ret = ext4_get_inode_loc(inode, &iloc); + if (ret) { + unlock_page(page); + put_page(page); + ext4_std_error(inode->i_sb, ret); goto out; } - } + ext4_write_lock_xattr(inode, &no_expand); + BUG_ON(!ext4_has_inline_data(inode)); - ret = ext4_get_inode_loc(inode, &iloc); - if (ret) { - ext4_std_error(inode->i_sb, ret); - copied = 0; - goto out; - } + /* + * ei->i_inline_off may have changed since + * ext4_write_begin() called + * ext4_try_to_write_inline_data() + */ + (void) ext4_find_inline_data_nolock(inode); - ext4_write_lock_xattr(inode, &no_expand); - BUG_ON(!ext4_has_inline_data(inode)); + kaddr = kmap_atomic(page); + ext4_write_inline_data(inode, &iloc, kaddr, pos, copied); + kunmap_atomic(kaddr); + SetPageUptodate(page); + /* clear page dirty so that writepages wouldn't work for us. */ + ClearPageDirty(page); - /* - * ei->i_inline_off may have changed since ext4_write_begin() - * called ext4_try_to_write_inline_data() - */ - (void) ext4_find_inline_data_nolock(inode); + ext4_write_unlock_xattr(inode, &no_expand); + brelse(iloc.bh); - kaddr = kmap_atomic(page); - ext4_write_inline_data(inode, &iloc, kaddr, pos, len); - kunmap_atomic(kaddr); - SetPageUptodate(page); - /* clear page dirty so that writepages wouldn't work for us. */ - ClearPageDirty(page); + /* + * It's important to update i_size while still holding page + * lock: page writeout could otherwise come in and zero + * beyond i_size. + */ + ext4_update_inode_size(inode, pos + copied); + } + unlock_page(page); + put_page(page); - ext4_write_unlock_xattr(inode, &no_expand); - brelse(iloc.bh); - mark_inode_dirty(inode); + /* + * Don't mark the inode dirty under page lock. First, it unnecessarily + * makes the holding time of page lock longer. Second, it forces lock + * ordering of page lock and transaction start for journaling + * filesystems. + */ + if (likely(copied)) + mark_inode_dirty(inode); out: - return copied; + /* + * If we didn't copy as much data as expected, we need to trim back + * size of xattr containing inline data. + */ + if (pos + len > inode->i_size && ext4_can_truncate(inode)) + ext4_orphan_add(handle, inode); + + ret2 = ext4_journal_stop(handle); + if (!ret) + ret = ret2; + if (pos + len > inode->i_size) { + ext4_truncate_failed_write(inode); + /* + * If truncate failed early the inode might still be + * on the orphan list; we need to make sure the inode + * is removed from the orphan list in that case. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } + return ret ? ret : copied; } struct buffer_head * @@ -953,43 +992,6 @@ out: return ret; } -int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, - unsigned len, unsigned copied, - struct page *page) -{ - int ret; - - ret = ext4_write_inline_data_end(inode, pos, len, copied, page); - if (ret < 0) { - unlock_page(page); - put_page(page); - return ret; - } - copied = ret; - - /* - * No need to use i_size_read() here, the i_size - * cannot change under us because we hold i_mutex. - * - * But it's important to update i_size while still holding page lock: - * page writeout could otherwise come in and zero beyond i_size. - */ - if (pos+copied > inode->i_size) - i_size_write(inode, pos+copied); - unlock_page(page); - put_page(page); - - /* - * Don't mark the inode dirty under page lock. First, it unnecessarily - * makes the holding time of page lock longer. Second, it forces lock - * ordering of page lock and transaction start for journaling - * filesystems. - */ - mark_inode_dirty(inode); - - return copied; -} - #ifdef INLINE_DIR_DEBUG void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh, void *inline_start, int inline_size) @@ -1917,6 +1919,24 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline) EXT4_I(inode)->i_disksize = i_size; if (i_size < inline_size) { + /* + * if there's inline data to truncate and this file was + * converted to extents after that inline data was written, + * the extent status cache must be cleared to avoid leaving + * behind stale delayed allocated extent entries + */ + if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { +retry: + err = ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); + if (err == -ENOMEM) { + cond_resched(); + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; + } + if (err) + goto out_error; + } + /* Clear the content in the xattr space. */ if (inline_size > EXT4_MIN_INLINE_DATA_SIZE) { if ((err = ext4_xattr_ibody_find(inode, &i, &is)) != 0) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d18852d6029c..0f06305167d5 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1284,22 +1284,14 @@ static int ext4_write_end(struct file *file, loff_t old_size = inode->i_size; int ret = 0, ret2; int i_size_changed = 0; - int inline_data = ext4_has_inline_data(inode); bool verity = ext4_verity_in_progress(inode); trace_ext4_write_end(inode, pos, len, copied); - if (inline_data) { - ret = ext4_write_inline_data_end(inode, pos, len, - copied, page); - if (ret < 0) { - unlock_page(page); - put_page(page); - goto errout; - } - copied = ret; - } else - copied = block_write_end(file, mapping, pos, - len, copied, page, fsdata); + + if (ext4_has_inline_data(inode)) + return ext4_write_inline_data_end(inode, pos, len, copied, page); + + copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); /* * it's important to update i_size while still holding page lock: * page writeout could otherwise come in and zero beyond i_size. @@ -1320,7 +1312,7 @@ static int ext4_write_end(struct file *file, * ordering of page lock and transaction start for journaling * filesystems. */ - if (i_size_changed || inline_data) + if (i_size_changed) ret = ext4_mark_inode_dirty(handle, inode); if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) @@ -1329,7 +1321,7 @@ static int ext4_write_end(struct file *file, * inode->i_size. So truncate them */ ext4_orphan_add(handle, inode); -errout: + ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; @@ -1395,7 +1387,6 @@ static int ext4_journalled_write_end(struct file *file, int partial = 0; unsigned from, to; int size_changed = 0; - int inline_data = ext4_has_inline_data(inode); bool verity = ext4_verity_in_progress(inode); trace_ext4_journalled_write_end(inode, pos, len, copied); @@ -1404,16 +1395,10 @@ static int ext4_journalled_write_end(struct file *file, BUG_ON(!ext4_handle_valid(handle)); - if (inline_data) { - ret = ext4_write_inline_data_end(inode, pos, len, - copied, page); - if (ret < 0) { - unlock_page(page); - put_page(page); - goto errout; - } - copied = ret; - } else if (unlikely(copied < len) && !PageUptodate(page)) { + if (ext4_has_inline_data(inode)) + return ext4_write_inline_data_end(inode, pos, len, copied, page); + + if (unlikely(copied < len) && !PageUptodate(page)) { copied = 0; ext4_journalled_zero_new_buffers(handle, inode, page, from, to); } else { @@ -1436,7 +1421,7 @@ static int ext4_journalled_write_end(struct file *file, if (old_size < pos && !verity) pagecache_isize_extended(inode, old_size, pos); - if (size_changed || inline_data) { + if (size_changed) { ret2 = ext4_mark_inode_dirty(handle, inode); if (!ret) ret = ret2; @@ -1449,7 +1434,6 @@ static int ext4_journalled_write_end(struct file *file, */ ext4_orphan_add(handle, inode); -errout: ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; @@ -1644,6 +1628,7 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int ret; bool allocated = false; + bool reserved = false; /* * If the cluster containing lblk is shared with a delayed, @@ -1660,6 +1645,7 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) ret = ext4_da_reserve_space(inode); if (ret != 0) /* ENOSPC */ goto errout; + reserved = true; } else { /* bigalloc */ if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) { if (!ext4_es_scan_clu(inode, @@ -1672,6 +1658,7 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) ret = ext4_da_reserve_space(inode); if (ret != 0) /* ENOSPC */ goto errout; + reserved = true; } else { allocated = true; } @@ -1682,6 +1669,8 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) } ret = ext4_es_insert_delayed_block(inode, lblk, allocated); + if (ret && reserved) + ext4_da_release_space(inode, 1); errout: return ret; @@ -1722,13 +1711,16 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, } /* - * Delayed extent could be allocated by fallocate. - * So we need to check it. + * the buffer head associated with a delayed and not unwritten + * block found in the extent status cache must contain an + * invalid block number and have its BH_New and BH_Delay bits + * set, reflecting the state assigned when the block was + * initially delayed allocated */ - if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) { - map_bh(bh, inode->i_sb, invalid_block); - set_buffer_new(bh); - set_buffer_delay(bh); + if (ext4_es_is_delonly(&es)) { + BUG_ON(bh->b_blocknr != invalid_block); + BUG_ON(!buffer_new(bh)); + BUG_ON(!buffer_delay(bh)); return 0; } @@ -2932,19 +2924,6 @@ static int ext4_nonda_switch(struct super_block *sb) return 0; } -/* We always reserve for an inode update; the superblock could be there too */ -static int ext4_da_write_credits(struct inode *inode, loff_t pos, unsigned len) -{ - if (likely(ext4_has_feature_large_file(inode->i_sb))) - return 1; - - if (pos + len <= 0x7fffffffULL) - return 1; - - /* We might need to update the superblock to set LARGE_FILE */ - return 2; -} - static int ext4_da_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -2953,7 +2932,6 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, struct page *page; pgoff_t index; struct inode *inode = mapping->host; - handle_t *handle; if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; @@ -2979,41 +2957,11 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, return 0; } - /* - * grab_cache_page_write_begin() can take a long time if the - * system is thrashing due to memory pressure, or if the page - * is being written back. So grab it first before we start - * the transaction handle. This also allows us to allocate - * the page (if needed) without using GFP_NOFS. - */ -retry_grab: +retry: page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; - unlock_page(page); - /* - * With delayed allocation, we don't log the i_disksize update - * if there is delayed block allocation. But we still need - * to journalling the i_disksize update if writes to the end - * of file which has an already mapped buffer. - */ -retry_journal: - handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, - ext4_da_write_credits(inode, pos, len)); - if (IS_ERR(handle)) { - put_page(page); - return PTR_ERR(handle); - } - - lock_page(page); - if (page->mapping != mapping) { - /* The page got truncated from under us */ - unlock_page(page); - put_page(page); - ext4_journal_stop(handle); - goto retry_grab; - } /* In case writeback began while the page was unlocked */ wait_for_stable_page(page); @@ -3025,20 +2973,18 @@ retry_journal: #endif if (ret < 0) { unlock_page(page); - ext4_journal_stop(handle); + put_page(page); /* * block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need - * i_size_read because we hold i_mutex. + * i_size_read because we hold inode lock. */ if (pos + len > inode->i_size) ext4_truncate_failed_write(inode); if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry_journal; - - put_page(page); + goto retry; return ret; } @@ -3075,8 +3021,6 @@ static int ext4_da_write_end(struct file *file, struct page *page, void *fsdata) { struct inode *inode = mapping->host; - int ret = 0, ret2; - handle_t *handle = ext4_journal_current_handle(); loff_t new_i_size; unsigned long start, end; int write_mode = (int)(unsigned long)fsdata; @@ -3086,44 +3030,36 @@ static int ext4_da_write_end(struct file *file, len, copied, page, fsdata); trace_ext4_da_write_end(inode, pos, len, copied); + + if (write_mode != CONVERT_INLINE_DATA && + ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) && + ext4_has_inline_data(inode)) + return ext4_write_inline_data_end(inode, pos, len, copied, page); + start = pos & (PAGE_SIZE - 1); end = start + copied - 1; /* - * generic_write_end() will run mark_inode_dirty() if i_size - * changes. So let's piggyback the i_disksize mark_inode_dirty - * into that. + * Since we are holding inode lock, we are sure i_disksize <= + * i_size. We also know that if i_disksize < i_size, there are + * delalloc writes pending in the range upto i_size. If the end of + * the current write is <= i_size, there's no need to touch + * i_disksize since writeback will push i_disksize upto i_size + * eventually. If the end of the current write is > i_size and + * inside an allocated block (ext4_da_should_update_i_disksize() + * check), we need to update i_disksize here as neither + * ext4_writepage() nor certain ext4_writepages() paths not + * allocating blocks update i_disksize. + * + * Note that we defer inode dirtying to generic_write_end() / + * ext4_da_write_inline_data_end(). */ new_i_size = pos + copied; - if (copied && new_i_size > EXT4_I(inode)->i_disksize) { - if (ext4_has_inline_data(inode) || - ext4_da_should_update_i_disksize(page, end)) { - ext4_update_i_disksize(inode, new_i_size); - /* We need to mark inode dirty even if - * new_i_size is less that inode->i_size - * bu greater than i_disksize.(hint delalloc) - */ - ret = ext4_mark_inode_dirty(handle, inode); - } - } + if (copied && new_i_size > inode->i_size && + ext4_da_should_update_i_disksize(page, end)) + ext4_update_i_disksize(inode, new_i_size); - if (write_mode != CONVERT_INLINE_DATA && - ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) && - ext4_has_inline_data(inode)) - ret2 = ext4_da_write_inline_data_end(inode, pos, len, copied, - page); - else - ret2 = generic_write_end(file, mapping, pos, len, copied, - page, fsdata); - - copied = ret2; - if (ret2 < 0) - ret = ret2; - ret2 = ext4_journal_stop(handle); - if (unlikely(ret2 && !ret)) - ret = ret2; - - return ret ? ret : copied; + return generic_write_end(file, mapping, pos, len, copied, page, fsdata); } /* @@ -4340,6 +4276,12 @@ static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino, goto has_buffer; lock_buffer(bh); + if (ext4_buffer_uptodate(bh)) { + /* Someone brought it uptodate while we waited */ + unlock_buffer(bh); + goto has_buffer; + } + /* * If we have all information of the inode in memory and this * is the only valid inode in the block, we need not read the diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 0775950ee84e..a320c54202d9 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -46,6 +46,7 @@ #include <linux/part_stat.h> #include <linux/kthread.h> #include <linux/freezer.h> +#include <linux/fsnotify.h> #include "ext4.h" #include "ext4_extents.h" /* Needed for trace points definition */ @@ -658,7 +659,7 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, * constraints, it may not be safe to do it right here so we * defer superblock flushing to a workqueue. */ - if (continue_fs) + if (continue_fs && journal) schedule_work(&EXT4_SB(sb)->s_error_work); else ext4_commit_super(sb); @@ -759,6 +760,8 @@ void __ext4_error(struct super_block *sb, const char *function, sb->s_id, function, line, current->comm, &vaf); va_end(args); } + fsnotify_sb_error(sb, NULL, error ? error : EFSCORRUPTED); + ext4_handle_error(sb, force_ro, error, 0, block, function, line); } @@ -789,6 +792,8 @@ void __ext4_error_inode(struct inode *inode, const char *function, current->comm, &vaf); va_end(args); } + fsnotify_sb_error(inode->i_sb, inode, error ? error : EFSCORRUPTED); + ext4_handle_error(inode->i_sb, false, error, inode->i_ino, block, function, line); } @@ -827,6 +832,8 @@ void __ext4_error_file(struct file *file, const char *function, current->comm, path, &vaf); va_end(args); } + fsnotify_sb_error(inode->i_sb, inode, EFSCORRUPTED); + ext4_handle_error(inode->i_sb, false, EFSCORRUPTED, inode->i_ino, block, function, line); } @@ -894,6 +901,7 @@ void __ext4_std_error(struct super_block *sb, const char *function, printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n", sb->s_id, function, line, errstr); } + fsnotify_sb_error(sb, NULL, errno ? errno : EFSCORRUPTED); ext4_handle_error(sb, false, -errno, 0, 0, function, line); } @@ -1350,6 +1358,12 @@ static void ext4_destroy_inode(struct inode *inode) true); dump_stack(); } + + if (EXT4_I(inode)->i_reserved_data_blocks) + ext4_msg(inode->i_sb, KERN_ERR, + "Inode %lu (%p): i_reserved_data_blocks (%u) not cleared!", + inode->i_ino, EXT4_I(inode), + EXT4_I(inode)->i_reserved_data_blocks); } static void init_once(void *foo) @@ -1566,7 +1580,6 @@ static const struct fscrypt_operations ext4_cryptops = { .set_context = ext4_set_context, .get_dummy_policy = ext4_get_dummy_policy, .empty_dir = ext4_empty_dir, - .max_namelen = EXT4_NAME_LEN, .has_stable_inodes = ext4_has_stable_inodes, .get_ino_and_lblk_bits = ext4_get_ino_and_lblk_bits, }; @@ -3021,17 +3034,17 @@ static loff_t ext4_max_size(int blkbits, int has_huge_files) */ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files) { - loff_t res = EXT4_NDIR_BLOCKS; + unsigned long long upper_limit, res = EXT4_NDIR_BLOCKS; int meta_blocks; - loff_t upper_limit; - /* This is calculated to be the largest file size for a dense, block + + /* + * This is calculated to be the largest file size for a dense, block * mapped file such that the file's total number of 512-byte sectors, * including data and all indirect blocks, does not exceed (2^48 - 1). * * __u32 i_blocks_lo and _u16 i_blocks_high represent the total * number of 512-byte sectors of the file. */ - if (!has_huge_files) { /* * !has_huge_files or implies that the inode i_block field @@ -3074,7 +3087,7 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files) if (res > MAX_LFS_FILESIZE) res = MAX_LFS_FILESIZE; - return res; + return (loff_t)res; } static ext4_fsblk_t descriptor_loc(struct super_block *sb, @@ -4468,7 +4481,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto cantfind_ext4; /* check blocks count against device size */ - blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits; + blocks_count = sb_bdev_nr_blocks(sb); if (blocks_count && ext4_blocks_count(es) > blocks_count) { ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu " "exceeds size of device (%llu blocks)", @@ -5042,12 +5055,15 @@ failed_mount_wq: sbi->s_ea_block_cache = NULL; if (sbi->s_journal) { + /* flush s_error_work before journal destroy. */ + flush_work(&sbi->s_error_work); jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; } failed_mount3a: ext4_es_unregister_shrinker(sbi); failed_mount3: + /* flush s_error_work before sbi destroy */ flush_work(&sbi->s_error_work); del_timer_sync(&sbi->s_err_report); ext4_stop_mmpd(sbi); diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index c1bf9ad4c220..20a083dc9042 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -7,6 +7,7 @@ #include <linux/fs.h> #include <linux/f2fs_fs.h> +#include <linux/moduleparam.h> #include <linux/writeback.h> #include <linux/backing-dev.h> #include <linux/lzo.h> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 9c8ef33bd8d3..eb971e1e7227 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -4276,7 +4276,7 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) size_t target_size = 0; int err; - if (iov_iter_fault_in_readable(from, iov_iter_count(from))) + if (fault_in_iov_iter_readable(from, iov_iter_count(from))) set_inode_flag(inode, FI_NO_PREALLOC); if ((iocb->ki_flags & IOCB_NOWAIT)) { diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 78ebc306ee2b..cf049a042482 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2976,7 +2976,6 @@ static const struct fscrypt_operations f2fs_cryptops = { .set_context = f2fs_set_context, .get_dummy_policy = f2fs_get_dummy_policy, .empty_dir = f2fs_empty_dir, - .max_namelen = F2FS_NAME_LEN, .has_stable_inodes = f2fs_has_stable_inodes, .get_ino_and_lblk_bits = f2fs_get_ino_and_lblk_bits, .get_num_devices = f2fs_get_num_devices, diff --git a/fs/fat/inode.c b/fs/fat/inode.c index de0c9b013a85..a6f1c6d426d1 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -1536,14 +1536,11 @@ static int fat_read_static_bpb(struct super_block *sb, struct fat_bios_param_block *bpb) { static const char *notdos1x = "This doesn't look like a DOS 1.x volume"; - + sector_t bd_sects = bdev_nr_sectors(sb->s_bdev); struct fat_floppy_defaults *fdefaults = NULL; int error = -EINVAL; - sector_t bd_sects; unsigned i; - bd_sects = i_size_read(sb->s_bdev->bd_inode) / SECTOR_SIZE; - /* 16-bit DOS 1.x reliably wrote bootstrap short-jmp code */ if (b->ignored[0] != 0xeb || b->ignored[2] != 0x90) { if (!silent) @@ -1943,10 +1940,8 @@ int fat_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2) ret = writeback_inode(i1); if (!ret && i2) ret = writeback_inode(i2); - if (!ret) { - struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; - ret = filemap_flush(mapping); - } + if (!ret) + ret = sync_blockdev_nowait(sb->s_bdev); return ret; } EXPORT_SYMBOL_GPL(fat_flush_inodes); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 81ec192ce067..67f0e88eed01 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -566,7 +566,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) if (atomic_read(&isw_nr_in_flight) > WB_FRN_MAX_IN_FLIGHT) return; - isw = kzalloc(sizeof(*isw) + 2 * sizeof(struct inode *), GFP_ATOMIC); + isw = kzalloc(struct_size(isw, inodes, 2), GFP_ATOMIC); if (!isw) return; @@ -624,8 +624,8 @@ bool cleanup_offline_cgwb(struct bdi_writeback *wb) int nr; bool restart = false; - isw = kzalloc(sizeof(*isw) + WB_MAX_INODES_PER_ISW * - sizeof(struct inode *), GFP_KERNEL); + isw = kzalloc(struct_size(isw, inodes, WB_MAX_INODES_PER_ISW), + GFP_KERNEL); if (!isw) return restart; @@ -1893,7 +1893,8 @@ static long writeback_sb_inodes(struct super_block *sb, * unplug, so get our IOs out the door before we * give up the CPU. */ - blk_flush_plug(current); + if (current->plug) + blk_flush_plug(current->plug, false); cond_resched(); } @@ -2291,7 +2292,7 @@ void wakeup_flusher_threads(enum wb_reason reason) * If we are expecting writeback progress we must submit plugged IO. */ if (blk_needs_flush_plug(current)) - blk_schedule_flush_plug(current); + blk_flush_plug(current->plug, true); rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) diff --git a/fs/fscache/object.c b/fs/fscache/object.c index f346a78f4bd6..6a675652129b 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -77,7 +77,6 @@ static WORK_STATE(INIT_OBJECT, "INIT", fscache_initialise_object); static WORK_STATE(PARENT_READY, "PRDY", fscache_parent_ready); static WORK_STATE(ABORT_INIT, "ABRT", fscache_abort_initialisation); static WORK_STATE(LOOK_UP_OBJECT, "LOOK", fscache_look_up_object); -static WORK_STATE(CREATE_OBJECT, "CRTO", fscache_look_up_object); static WORK_STATE(OBJECT_AVAILABLE, "AVBL", fscache_object_available); static WORK_STATE(JUMPSTART_DEPS, "JUMP", fscache_jumpstart_dependents); @@ -907,6 +906,7 @@ static void fscache_dequeue_object(struct fscache_object *object) * @object: The object to ask about * @data: The auxiliary data for the object * @datalen: The size of the auxiliary data + * @object_size: The size of the object according to the server. * * This function consults the netfs about the coherency state of an object. * The caller must be holding a ref on cookie->n_active (held by diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index 433877107700..e002cdfaf3cc 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -22,7 +22,10 @@ static void fscache_operation_dummy_cancel(struct fscache_operation *op) /** * fscache_operation_init - Do basic initialisation of an operation + * @cookie: The cookie to operate on * @op: The operation to initialise + * @processor: The function to perform the operation + * @cancel: A function to handle operation cancellation * @release: The release function to assign * * Do basic initialisation of an operation. The caller must still set flags, diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index 281d79f8b3d3..713818d74de6 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -732,11 +732,8 @@ static ssize_t fuse_dax_direct_write(struct kiocb *iocb, struct iov_iter *from) ssize_t ret; ret = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE); - if (ret < 0) - return ret; - fuse_invalidate_attr(inode); - fuse_write_update_size(inode, iocb->ki_pos); + fuse_write_update_attr(inode, iocb->ki_pos, ret); return ret; } diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index dde341a6388a..79f7eda49e06 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -756,7 +756,7 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size) { unsigned ncpy = min(*size, cs->len); if (val) { - void *pgaddr = kmap_atomic(cs->pg); + void *pgaddr = kmap_local_page(cs->pg); void *buf = pgaddr + cs->offset; if (cs->write) @@ -764,7 +764,7 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size) else memcpy(*val, buf, ncpy); - kunmap_atomic(pgaddr); + kunmap_local(pgaddr); *val += ncpy; } *size -= ncpy; @@ -847,6 +847,12 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) replace_page_cache_page(oldpage, newpage); + /* + * Release while we have extra ref on stolen page. Otherwise + * anon_pipe_buf_release() might think the page can be reused. + */ + pipe_buf_release(cs->pipe, buf); + get_page(newpage); if (!(buf->flags & PIPE_BUF_FLAG_LRU)) @@ -949,10 +955,10 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep, } } if (page) { - void *mapaddr = kmap_atomic(page); + void *mapaddr = kmap_local_page(page); void *buf = mapaddr + offset; offset += fuse_copy_do(cs, &buf, &count); - kunmap_atomic(mapaddr); + kunmap_local(mapaddr); } else offset += fuse_copy_do(cs, NULL, &count); } @@ -1591,7 +1597,7 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, end = outarg.offset + outarg.size; if (end > file_size) { file_size = end; - fuse_write_update_size(inode, file_size); + fuse_write_update_attr(inode, file_size, outarg.size); } num = outarg.size; @@ -2031,8 +2037,12 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, pipe_lock(pipe); out_free: - for (idx = 0; idx < nbuf; idx++) - pipe_buf_release(pipe, &bufs[idx]); + for (idx = 0; idx < nbuf; idx++) { + struct pipe_buffer *buf = &bufs[idx]; + + if (buf->ops) + pipe_buf_release(pipe, buf); + } pipe_unlock(pipe); kvfree(bufs); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index d9b977c0f38d..0654bfedcbb0 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -116,7 +116,7 @@ u64 entry_attr_timeout(struct fuse_entry_out *o) return time_to_jiffies(o->attr_valid, o->attr_valid_nsec); } -static void fuse_invalidate_attr_mask(struct inode *inode, u32 mask) +void fuse_invalidate_attr_mask(struct inode *inode, u32 mask) { set_mask_bits(&get_fuse_inode(inode)->inval_mask, 0, mask); } @@ -738,14 +738,51 @@ static int fuse_symlink(struct user_namespace *mnt_userns, struct inode *dir, return create_new_entry(fm, &args, dir, entry, S_IFLNK); } -void fuse_update_ctime(struct inode *inode) +void fuse_flush_time_update(struct inode *inode) +{ + int err = sync_inode_metadata(inode, 1); + + mapping_set_error(inode->i_mapping, err); +} + +static void fuse_update_ctime_in_cache(struct inode *inode) { if (!IS_NOCMTIME(inode)) { inode->i_ctime = current_time(inode); mark_inode_dirty_sync(inode); + fuse_flush_time_update(inode); } } +void fuse_update_ctime(struct inode *inode) +{ + fuse_invalidate_attr_mask(inode, STATX_CTIME); + fuse_update_ctime_in_cache(inode); +} + +static void fuse_entry_unlinked(struct dentry *entry) +{ + struct inode *inode = d_inode(entry); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fi->lock); + fi->attr_version = atomic64_inc_return(&fc->attr_version); + /* + * If i_nlink == 0 then unlink doesn't make sense, yet this can + * happen if userspace filesystem is careless. It would be + * difficult to enforce correct nlink usage so just ignore this + * condition here + */ + if (S_ISDIR(inode->i_mode)) + clear_nlink(inode); + else if (inode->i_nlink > 0) + drop_nlink(inode); + spin_unlock(&fi->lock); + fuse_invalidate_entry_cache(entry); + fuse_update_ctime(inode); +} + static int fuse_unlink(struct inode *dir, struct dentry *entry) { int err; @@ -762,24 +799,8 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) args.in_args[0].value = entry->d_name.name; err = fuse_simple_request(fm, &args); if (!err) { - struct inode *inode = d_inode(entry); - struct fuse_inode *fi = get_fuse_inode(inode); - - spin_lock(&fi->lock); - fi->attr_version = atomic64_inc_return(&fm->fc->attr_version); - /* - * If i_nlink == 0 then unlink doesn't make sense, yet this can - * happen if userspace filesystem is careless. It would be - * difficult to enforce correct nlink usage so just ignore this - * condition here - */ - if (inode->i_nlink > 0) - drop_nlink(inode); - spin_unlock(&fi->lock); - fuse_invalidate_attr(inode); fuse_dir_changed(dir); - fuse_invalidate_entry_cache(entry); - fuse_update_ctime(inode); + fuse_entry_unlinked(entry); } else if (err == -EINTR) fuse_invalidate_entry(entry); return err; @@ -801,9 +822,8 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) args.in_args[0].value = entry->d_name.name; err = fuse_simple_request(fm, &args); if (!err) { - clear_nlink(d_inode(entry)); fuse_dir_changed(dir); - fuse_invalidate_entry_cache(entry); + fuse_entry_unlinked(entry); } else if (err == -EINTR) fuse_invalidate_entry(entry); return err; @@ -833,24 +853,18 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, err = fuse_simple_request(fm, &args); if (!err) { /* ctime changes */ - fuse_invalidate_attr(d_inode(oldent)); fuse_update_ctime(d_inode(oldent)); - if (flags & RENAME_EXCHANGE) { - fuse_invalidate_attr(d_inode(newent)); + if (flags & RENAME_EXCHANGE) fuse_update_ctime(d_inode(newent)); - } fuse_dir_changed(olddir); if (olddir != newdir) fuse_dir_changed(newdir); /* newent will end up negative */ - if (!(flags & RENAME_EXCHANGE) && d_really_is_positive(newent)) { - fuse_invalidate_attr(d_inode(newent)); - fuse_invalidate_entry_cache(newent); - fuse_update_ctime(d_inode(newent)); - } + if (!(flags & RENAME_EXCHANGE) && d_really_is_positive(newent)) + fuse_entry_unlinked(newent); } else if (err == -EINTR) { /* If request was interrupted, DEITY only knows if the rename actually took place. If the invalidation @@ -916,25 +930,11 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, args.in_args[1].size = newent->d_name.len + 1; args.in_args[1].value = newent->d_name.name; err = create_new_entry(fm, &args, newdir, newent, inode->i_mode); - /* Contrary to "normal" filesystems it can happen that link - makes two "logical" inodes point to the same "physical" - inode. We invalidate the attributes of the old one, so it - will reflect changes in the backing inode (link count, - etc.) - */ - if (!err) { - struct fuse_inode *fi = get_fuse_inode(inode); - - spin_lock(&fi->lock); - fi->attr_version = atomic64_inc_return(&fm->fc->attr_version); - if (likely(inode->i_nlink < UINT_MAX)) - inc_nlink(inode); - spin_unlock(&fi->lock); - fuse_invalidate_attr(inode); - fuse_update_ctime(inode); - } else if (err == -EINTR) { + if (!err) + fuse_update_ctime_in_cache(inode); + else if (err == -EINTR) fuse_invalidate_attr(inode); - } + return err; } @@ -944,15 +944,6 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr, unsigned int blkbits; struct fuse_conn *fc = get_fuse_conn(inode); - /* see the comment in fuse_change_attributes() */ - if (fc->writeback_cache && S_ISREG(inode->i_mode)) { - attr->size = i_size_read(inode); - attr->mtime = inode->i_mtime.tv_sec; - attr->mtimensec = inode->i_mtime.tv_nsec; - attr->ctime = inode->i_ctime.tv_sec; - attr->ctimensec = inode->i_ctime.tv_nsec; - } - stat->dev = inode->i_sb->s_dev; stat->ino = attr->ino; stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); @@ -1030,12 +1021,14 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file, struct fuse_inode *fi = get_fuse_inode(inode); int err = 0; bool sync; + u32 inval_mask = READ_ONCE(fi->inval_mask); + u32 cache_mask = fuse_get_cache_mask(inode); if (flags & AT_STATX_FORCE_SYNC) sync = true; else if (flags & AT_STATX_DONT_SYNC) sync = false; - else if (request_mask & READ_ONCE(fi->inval_mask)) + else if (request_mask & inval_mask & ~cache_mask) sync = true; else sync = time_before64(fi->i_time, get_jiffies_64()); @@ -1052,11 +1045,9 @@ static int fuse_update_get_attr(struct inode *inode, struct file *file, return err; } -int fuse_update_attributes(struct inode *inode, struct file *file) +int fuse_update_attributes(struct inode *inode, struct file *file, u32 mask) { - /* Do *not* need to get atime for internal purposes */ - return fuse_update_get_attr(inode, file, NULL, - STATX_BASIC_STATS & ~STATX_ATIME, 0); + return fuse_update_get_attr(inode, file, NULL, mask, 0); } int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, @@ -1071,7 +1062,7 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, if (!parent) return -ENOENT; - inode_lock(parent); + inode_lock_nested(parent, I_MUTEX_PARENT); if (!S_ISDIR(parent->i_mode)) goto unlock; @@ -1561,10 +1552,10 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, struct fuse_setattr_in inarg; struct fuse_attr_out outarg; bool is_truncate = false; - bool is_wb = fc->writeback_cache; + bool is_wb = fc->writeback_cache && S_ISREG(inode->i_mode); loff_t oldsize; int err; - bool trust_local_cmtime = is_wb && S_ISREG(inode->i_mode); + bool trust_local_cmtime = is_wb; bool fault_blocked = false; if (!fc->default_permissions) @@ -1608,7 +1599,7 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, } /* Flush dirty data/metadata before non-truncate SETATTR */ - if (is_wb && S_ISREG(inode->i_mode) && + if (is_wb && attr->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_MTIME_SET | ATTR_TIMES_SET)) { @@ -1676,10 +1667,11 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr, } fuse_change_attributes_common(inode, &outarg.attr, - attr_timeout(&outarg)); + attr_timeout(&outarg), + fuse_get_cache_mask(inode)); oldsize = inode->i_size; /* see the comment in fuse_change_attributes() */ - if (!is_wb || is_truncate || !S_ISREG(inode->i_mode)) + if (!is_wb || is_truncate) i_size_write(inode, outarg.attr.size); if (is_truncate) { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 11404f8c21c7..9d6c5f6361f7 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -211,9 +211,8 @@ void fuse_finish_open(struct inode *inode, struct file *file) i_size_write(inode, 0); spin_unlock(&fi->lock); truncate_pagecache(inode, 0); - fuse_invalidate_attr(inode); - if (fc->writeback_cache) - file_update_time(file); + file_update_time(file); + fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); } else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) { invalidate_inode_pages2(inode->i_mapping); } @@ -339,12 +338,6 @@ static int fuse_open(struct inode *inode, struct file *file) static int fuse_release(struct inode *inode, struct file *file) { - struct fuse_conn *fc = get_fuse_conn(inode); - - /* see fuse_vma_close() for !writeback_cache case */ - if (fc->writeback_cache) - write_inode_now(inode, 1); - fuse_release_common(file, false); /* return value is ignored by VFS */ @@ -483,6 +476,9 @@ static int fuse_flush(struct file *file, fl_owner_t id) if (fuse_is_bad(inode)) return -EIO; + if (ff->open_flags & FOPEN_NOFLUSH && !fm->fc->writeback_cache) + return 0; + err = write_inode_now(inode, 1); if (err) return err; @@ -521,7 +517,7 @@ inval_attr_out: * enabled, i_blocks from cached attr may not be accurate. */ if (!err && fm->fc->writeback_cache) - fuse_invalidate_attr(inode); + fuse_invalidate_attr_mask(inode, STATX_BLOCKS); return err; } @@ -687,7 +683,7 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos) spin_unlock(&fi->lock); } - io->iocb->ki_complete(io->iocb, res, 0); + io->iocb->ki_complete(io->iocb, res); } kref_put(&io->refcnt, fuse_io_release); @@ -793,7 +789,7 @@ static void fuse_read_update_size(struct inode *inode, loff_t size, struct fuse_inode *fi = get_fuse_inode(inode); spin_lock(&fi->lock); - if (attr_ver == fi->attr_version && size < inode->i_size && + if (attr_ver >= fi->attr_version && size < inode->i_size && !test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) { fi->attr_version = atomic64_inc_return(&fc->attr_version); i_size_write(inode, size); @@ -1003,7 +999,7 @@ static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to) if (fc->auto_inval_data || (iocb->ki_pos + iov_iter_count(to) > i_size_read(inode))) { int err; - err = fuse_update_attributes(inode, iocb->ki_filp); + err = fuse_update_attributes(inode, iocb->ki_filp, STATX_SIZE); if (err) return err; } @@ -1072,7 +1068,7 @@ static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos, return err ?: ia->write.out.size; } -bool fuse_write_update_size(struct inode *inode, loff_t pos) +bool fuse_write_update_attr(struct inode *inode, loff_t pos, ssize_t written) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); @@ -1080,12 +1076,14 @@ bool fuse_write_update_size(struct inode *inode, loff_t pos) spin_lock(&fi->lock); fi->attr_version = atomic64_inc_return(&fc->attr_version); - if (pos > inode->i_size) { + if (written > 0 && pos > inode->i_size) { i_size_write(inode, pos); ret = true; } spin_unlock(&fi->lock); + fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); + return ret; } @@ -1164,7 +1162,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, again: err = -EFAULT; - if (iov_iter_fault_in_readable(ii, bytes)) + if (fault_in_iov_iter_readable(ii, bytes)) break; err = -ENOMEM; @@ -1268,11 +1266,8 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, kfree(ap->pages); } while (!err && iov_iter_count(ii)); - if (res > 0) - fuse_write_update_size(inode, pos); - + fuse_write_update_attr(inode, pos, res); clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); - fuse_invalidate_attr(inode); return res > 0 ? res : err; } @@ -1290,7 +1285,8 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from) if (fc->writeback_cache) { /* Update size (EOF optimization) and mode (SUID clearing) */ - err = fuse_update_attributes(mapping->host, file); + err = fuse_update_attributes(mapping->host, file, + STATX_SIZE | STATX_MODE); if (err) return err; @@ -1451,7 +1447,6 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, if (!ia) return -ENOMEM; - ia->io = io; if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) { if (!write) inode_lock(inode); @@ -1561,11 +1556,9 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) } else { res = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE); + fuse_write_update_attr(inode, iocb->ki_pos, res); } } - fuse_invalidate_attr(inode); - if (res > 0) - fuse_write_update_size(inode, iocb->ki_pos); inode_unlock(inode); return res; @@ -1776,7 +1769,7 @@ static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args, * is enabled, we trust local ctime/mtime. */ if (!fc->writeback_cache) - fuse_invalidate_attr(inode); + fuse_invalidate_attr_mask(inode, FUSE_STATX_MODIFY); spin_lock(&fi->lock); rb_erase(&wpa->writepages_entry, &fi->writepages); while (wpa->next) { @@ -1822,14 +1815,13 @@ static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args, static struct fuse_file *__fuse_write_file_get(struct fuse_inode *fi) { - struct fuse_file *ff = NULL; + struct fuse_file *ff; spin_lock(&fi->lock); - if (!list_empty(&fi->write_files)) { - ff = list_entry(fi->write_files.next, struct fuse_file, - write_entry); + ff = list_first_entry_or_null(&fi->write_files, struct fuse_file, + write_entry); + if (ff) fuse_file_get(ff); - } spin_unlock(&fi->lock); return ff; @@ -1848,6 +1840,17 @@ int fuse_write_inode(struct inode *inode, struct writeback_control *wbc) struct fuse_file *ff; int err; + /* + * Inode is always written before the last reference is dropped and + * hence this should not be reached from reclaim. + * + * Writing back the inode from reclaim can deadlock if the request + * processing itself needs an allocation. Allocations triggering + * reclaim while serving a request can't be prevented, because it can + * involve any number of unrelated userspace processes. + */ + WARN_ON(wbc->for_reclaim); + ff = __fuse_write_file_get(fi); err = fuse_flush_times(inode, ff); if (ff) @@ -2306,15 +2309,18 @@ static int fuse_write_end(struct file *file, struct address_space *mapping, if (!copied) goto unlock; + pos += copied; if (!PageUptodate(page)) { /* Zero any unwritten bytes at the end of the page */ - size_t endoff = (pos + copied) & ~PAGE_MASK; + size_t endoff = pos & ~PAGE_MASK; if (endoff) zero_user_segment(page, endoff, PAGE_SIZE); SetPageUptodate(page); } - fuse_write_update_size(inode, pos + copied); + if (pos > inode->i_size) + i_size_write(inode, pos); + set_page_dirty(page); unlock: @@ -2340,12 +2346,15 @@ static int fuse_launder_page(struct page *page) } /* - * Write back dirty pages now, because there may not be any suitable - * open files later + * Write back dirty data/metadata now (there may not be any suitable + * open files later for data) */ static void fuse_vma_close(struct vm_area_struct *vma) { - filemap_write_and_wait(vma->vm_file->f_mapping); + int err; + + err = write_inode_now(vma->vm_file->f_mapping->host, 1); + mapping_set_error(vma->vm_file->f_mapping, err); } /* @@ -2628,7 +2637,7 @@ static loff_t fuse_lseek(struct file *file, loff_t offset, int whence) return vfs_setpos(file, outarg.offset, inode->i_sb->s_maxbytes); fallback: - err = fuse_update_attributes(inode, file); + err = fuse_update_attributes(inode, file, STATX_SIZE); if (!err) return generic_file_llseek(file, offset, whence); else @@ -2648,7 +2657,7 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence) break; case SEEK_END: inode_lock(inode); - retval = fuse_update_attributes(inode, file); + retval = fuse_update_attributes(inode, file, STATX_SIZE); if (!retval) retval = generic_file_llseek(file, offset, whence); inode_unlock(inode); @@ -2869,7 +2878,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (iov_iter_rw(iter) == WRITE) { ret = fuse_direct_io(io, iter, &pos, FUSE_DIO_WRITE); - fuse_invalidate_attr(inode); + fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); } else { ret = __fuse_direct_read(io, iter, &pos); } @@ -2891,9 +2900,8 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) kref_put(&io->refcnt, fuse_io_release); if (iov_iter_rw(iter) == WRITE) { - if (ret > 0) - fuse_write_update_size(inode, pos); - else if (ret < 0 && offset + count > i_size) + fuse_write_update_attr(inode, pos, ret); + if (ret < 0 && offset + count > i_size) fuse_do_truncate(file); } @@ -2981,16 +2989,14 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, /* we could have extended the file */ if (!(mode & FALLOC_FL_KEEP_SIZE)) { - bool changed = fuse_write_update_size(inode, offset + length); - - if (changed && fm->fc->writeback_cache) + if (fuse_write_update_attr(inode, offset + length, length)) file_update_time(file); } if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) truncate_pagecache_range(inode, offset, offset + length - 1); - fuse_invalidate_attr(inode); + fuse_invalidate_attr_mask(inode, FUSE_STATX_MODSIZE); out: if (!(mode & FALLOC_FL_KEEP_SIZE)) @@ -3002,6 +3008,8 @@ out: if (lock_inode) inode_unlock(inode); + fuse_flush_time_update(inode); + return err; } @@ -3096,12 +3104,8 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, ALIGN_DOWN(pos_out, PAGE_SIZE), ALIGN(pos_out + outarg.size, PAGE_SIZE) - 1); - if (fc->writeback_cache) { - fuse_write_update_size(inode_out, pos_out + outarg.size); - file_update_time(file_out); - } - - fuse_invalidate_attr(inode_out); + file_update_time(file_out); + fuse_write_update_attr(inode_out, pos_out + outarg.size, outarg.size); err = outarg.size; out: @@ -3111,6 +3115,8 @@ out: inode_unlock(inode_out); file_accessed(file_in); + fuse_flush_time_update(inode_out); + return err; } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 319596df5dc6..198637b41e19 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1031,7 +1031,9 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, u64 attr_valid, u64 attr_version); void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, - u64 attr_valid); + u64 attr_valid, u32 cache_mask); + +u32 fuse_get_cache_mask(struct inode *inode); /** * Initialize the client device @@ -1065,7 +1067,15 @@ void fuse_wait_aborted(struct fuse_conn *fc); /** * Invalidate inode attributes */ + +/* Attributes possibly changed on data modification */ +#define FUSE_STATX_MODIFY (STATX_MTIME | STATX_CTIME | STATX_BLOCKS) + +/* Attributes possibly changed on data and/or size modification */ +#define FUSE_STATX_MODSIZE (FUSE_STATX_MODIFY | STATX_SIZE) + void fuse_invalidate_attr(struct inode *inode); +void fuse_invalidate_attr_mask(struct inode *inode, u32 mask); void fuse_invalidate_entry_cache(struct dentry *entry); @@ -1121,6 +1131,9 @@ int fuse_init_fs_context_submount(struct fs_context *fsc); */ void fuse_conn_destroy(struct fuse_mount *fm); +/* Drop the connection and free the fuse mount */ +void fuse_mount_destroy(struct fuse_mount *fm); + /** * Add connection to control filesystem */ @@ -1145,9 +1158,10 @@ int fuse_allow_current_process(struct fuse_conn *fc); u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id); +void fuse_flush_time_update(struct inode *inode); void fuse_update_ctime(struct inode *inode); -int fuse_update_attributes(struct inode *inode, struct file *file); +int fuse_update_attributes(struct inode *inode, struct file *file, u32 mask); void fuse_flush_writepages(struct inode *inode); @@ -1205,7 +1219,7 @@ long fuse_ioctl_common(struct file *file, unsigned int cmd, __poll_t fuse_file_poll(struct file *file, poll_table *wait); int fuse_dev_release(struct inode *inode, struct file *file); -bool fuse_write_update_size(struct inode *inode, loff_t pos); +bool fuse_write_update_attr(struct inode *inode, loff_t pos, ssize_t written); int fuse_flush_times(struct inode *inode, struct fuse_file *ff); int fuse_write_inode(struct inode *inode, struct writeback_control *wbc); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 36cd03114b6d..8b89e3ba7df3 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -118,6 +118,9 @@ static void fuse_evict_inode(struct inode *inode) { struct fuse_inode *fi = get_fuse_inode(inode); + /* Will write inode on close/munmap and in all other dirtiers */ + WARN_ON(inode->i_state & I_DIRTY_INODE); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (inode->i_sb->s_flags & SB_ACTIVE) { @@ -161,7 +164,7 @@ static ino_t fuse_squash_ino(u64 ino64) } void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, - u64 attr_valid) + u64 attr_valid, u32 cache_mask) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); @@ -181,9 +184,11 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, inode->i_atime.tv_sec = attr->atime; inode->i_atime.tv_nsec = attr->atimensec; /* mtime from server may be stale due to local buffered write */ - if (!fc->writeback_cache || !S_ISREG(inode->i_mode)) { + if (!(cache_mask & STATX_MTIME)) { inode->i_mtime.tv_sec = attr->mtime; inode->i_mtime.tv_nsec = attr->mtimensec; + } + if (!(cache_mask & STATX_CTIME)) { inode->i_ctime.tv_sec = attr->ctime; inode->i_ctime.tv_nsec = attr->ctimensec; } @@ -215,16 +220,44 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, inode->i_flags &= ~S_NOSEC; } +u32 fuse_get_cache_mask(struct inode *inode) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + + if (!fc->writeback_cache || !S_ISREG(inode->i_mode)) + return 0; + + return STATX_MTIME | STATX_CTIME | STATX_SIZE; +} + void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, u64 attr_valid, u64 attr_version) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); - bool is_wb = fc->writeback_cache; + u32 cache_mask; loff_t oldsize; struct timespec64 old_mtime; spin_lock(&fi->lock); + /* + * In case of writeback_cache enabled, writes update mtime, ctime and + * may update i_size. In these cases trust the cached value in the + * inode. + */ + cache_mask = fuse_get_cache_mask(inode); + if (cache_mask & STATX_SIZE) + attr->size = i_size_read(inode); + + if (cache_mask & STATX_MTIME) { + attr->mtime = inode->i_mtime.tv_sec; + attr->mtimensec = inode->i_mtime.tv_nsec; + } + if (cache_mask & STATX_CTIME) { + attr->ctime = inode->i_ctime.tv_sec; + attr->ctimensec = inode->i_ctime.tv_nsec; + } + if ((attr_version != 0 && fi->attr_version > attr_version) || test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) { spin_unlock(&fi->lock); @@ -232,7 +265,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, } old_mtime = inode->i_mtime; - fuse_change_attributes_common(inode, attr, attr_valid); + fuse_change_attributes_common(inode, attr, attr_valid, cache_mask); oldsize = inode->i_size; /* @@ -240,11 +273,11 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, * extend local i_size without keeping userspace server in sync. So, * attr->size coming from server can be stale. We cannot trust it. */ - if (!is_wb || !S_ISREG(inode->i_mode)) + if (!(cache_mask & STATX_SIZE)) i_size_write(inode, attr->size); spin_unlock(&fi->lock); - if (!is_wb && S_ISREG(inode->i_mode)) { + if (!cache_mask && S_ISREG(inode->i_mode)) { bool inval = false; if (oldsize != attr->size) { @@ -457,14 +490,6 @@ static void fuse_send_destroy(struct fuse_mount *fm) } } -static void fuse_put_super(struct super_block *sb) -{ - struct fuse_mount *fm = get_fuse_mount_super(sb); - - fuse_conn_put(fm->fc); - kfree(fm); -} - static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr) { stbuf->f_type = FUSE_SUPER_MAGIC; @@ -1003,7 +1028,6 @@ static const struct super_operations fuse_super_operations = { .evict_inode = fuse_evict_inode, .write_inode = fuse_write_inode, .drop_inode = generic_delete_inode, - .put_super = fuse_put_super, .umount_begin = fuse_umount_begin, .statfs = fuse_statfs, .sync_fs = fuse_sync_fs, @@ -1424,20 +1448,17 @@ static int fuse_get_tree_submount(struct fs_context *fsc) if (!fm) return -ENOMEM; + fm->fc = fuse_conn_get(fc); fsc->s_fs_info = fm; sb = sget_fc(fsc, NULL, set_anon_super_fc); - if (IS_ERR(sb)) { - kfree(fm); + if (fsc->s_fs_info) + fuse_mount_destroy(fm); + if (IS_ERR(sb)) return PTR_ERR(sb); - } - fm->fc = fuse_conn_get(fc); /* Initialize superblock, making @mp_fi its root */ err = fuse_fill_super_submount(sb, mp_fi); if (err) { - fuse_conn_put(fc); - kfree(fm); - sb->s_fs_info = NULL; deactivate_locked_super(sb); return err; } @@ -1569,8 +1590,6 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) { struct fuse_fs_context *ctx = fsc->fs_private; int err; - struct fuse_conn *fc; - struct fuse_mount *fm; if (!ctx->file || !ctx->rootmode_present || !ctx->user_id_present || !ctx->group_id_present) @@ -1580,42 +1599,18 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) * Require mount to happen from the same user namespace which * opened /dev/fuse to prevent potential attacks. */ - err = -EINVAL; if ((ctx->file->f_op != &fuse_dev_operations) || (ctx->file->f_cred->user_ns != sb->s_user_ns)) - goto err; + return -EINVAL; ctx->fudptr = &ctx->file->private_data; - fc = kmalloc(sizeof(*fc), GFP_KERNEL); - err = -ENOMEM; - if (!fc) - goto err; - - fm = kzalloc(sizeof(*fm), GFP_KERNEL); - if (!fm) { - kfree(fc); - goto err; - } - - fuse_conn_init(fc, fm, sb->s_user_ns, &fuse_dev_fiq_ops, NULL); - fc->release = fuse_free_conn; - - sb->s_fs_info = fm; - err = fuse_fill_super_common(sb, ctx); if (err) - goto err_put_conn; + return err; /* file->private_data shall be visible on all CPUs after this */ smp_mb(); fuse_send_init(get_fuse_mount_super(sb)); return 0; - - err_put_conn: - fuse_conn_put(fc); - kfree(fm); - sb->s_fs_info = NULL; - err: - return err; } /* @@ -1637,22 +1632,40 @@ static int fuse_get_tree(struct fs_context *fsc) { struct fuse_fs_context *ctx = fsc->fs_private; struct fuse_dev *fud; + struct fuse_conn *fc; + struct fuse_mount *fm; struct super_block *sb; int err; + fc = kmalloc(sizeof(*fc), GFP_KERNEL); + if (!fc) + return -ENOMEM; + + fm = kzalloc(sizeof(*fm), GFP_KERNEL); + if (!fm) { + kfree(fc); + return -ENOMEM; + } + + fuse_conn_init(fc, fm, fsc->user_ns, &fuse_dev_fiq_ops, NULL); + fc->release = fuse_free_conn; + + fsc->s_fs_info = fm; + if (ctx->fd_present) ctx->file = fget(ctx->fd); if (IS_ENABLED(CONFIG_BLOCK) && ctx->is_bdev) { err = get_tree_bdev(fsc, fuse_fill_super); - goto out_fput; + goto out; } /* * While block dev mount can be initialized with a dummy device fd * (found by device name), normal fuse mounts can't */ + err = -EINVAL; if (!ctx->file) - return -EINVAL; + goto out; /* * Allow creating a fuse mount with an already initialized fuse @@ -1668,7 +1681,9 @@ static int fuse_get_tree(struct fs_context *fsc) } else { err = get_tree_nodev(fsc, fuse_fill_super); } -out_fput: +out: + if (fsc->s_fs_info) + fuse_mount_destroy(fm); if (ctx->file) fput(ctx->file); return err; @@ -1747,17 +1762,25 @@ static void fuse_sb_destroy(struct super_block *sb) struct fuse_mount *fm = get_fuse_mount_super(sb); bool last; - if (fm) { + if (sb->s_root) { last = fuse_mount_remove(fm); if (last) fuse_conn_destroy(fm); } } +void fuse_mount_destroy(struct fuse_mount *fm) +{ + fuse_conn_put(fm->fc); + kfree(fm); +} +EXPORT_SYMBOL(fuse_mount_destroy); + static void fuse_kill_sb_anon(struct super_block *sb) { fuse_sb_destroy(sb); kill_anon_super(sb); + fuse_mount_destroy(get_fuse_mount_super(sb)); } static struct file_system_type fuse_fs_type = { @@ -1775,6 +1798,7 @@ static void fuse_kill_sb_blk(struct super_block *sb) { fuse_sb_destroy(sb); kill_block_super(sb); + fuse_mount_destroy(get_fuse_mount_super(sb)); } static struct file_system_type fuseblk_fs_type = { diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c index 546ea3d58fb4..fbc09dab1f85 100644 --- a/fs/fuse/ioctl.c +++ b/fs/fuse/ioctl.c @@ -286,11 +286,11 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV) goto out; - vaddr = kmap_atomic(ap.pages[0]); + vaddr = kmap_local_page(ap.pages[0]); err = fuse_copy_ioctl_iovec(fm->fc, iov_page, vaddr, transferred, in_iovs + out_iovs, (flags & FUSE_IOCTL_COMPAT) != 0); - kunmap_atomic(vaddr); + kunmap_local(vaddr); if (err) goto out; diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index bc267832310c..b4e565711045 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -76,11 +76,11 @@ static void fuse_add_dirent_to_cache(struct file *file, WARN_ON(fi->rdc.pos != pos)) goto unlock; - addr = kmap_atomic(page); + addr = kmap_local_page(page); if (!offset) clear_page(addr); memcpy(addr + offset, dirent, reclen); - kunmap_atomic(addr); + kunmap_local(addr); fi->rdc.size = (index << PAGE_SHIFT) + offset + reclen; fi->rdc.pos = dirent->off; unlock: @@ -454,7 +454,7 @@ static int fuse_readdir_cached(struct file *file, struct dir_context *ctx) * cache; both cases require an up-to-date mtime value. */ if (!ctx->pos && fc->auto_inval_data) { - int err = fuse_update_attributes(inode, file); + int err = fuse_update_attributes(inode, file, STATX_MTIME); if (err) return err; diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 0ad89c6629d7..4cfa4bc1f579 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -649,7 +649,7 @@ static void virtio_fs_vq_done(struct virtqueue *vq) static void virtio_fs_init_vq(struct virtio_fs_vq *fsvq, char *name, int vq_type) { - strncpy(fsvq->name, name, VQ_NAME_LEN); + strscpy(fsvq->name, name, VQ_NAME_LEN); spin_lock_init(&fsvq->lock); INIT_LIST_HEAD(&fsvq->queued_reqs); INIT_LIST_HEAD(&fsvq->end_reqs); @@ -1394,12 +1394,13 @@ static void virtio_kill_sb(struct super_block *sb) bool last; /* If mount failed, we can still be called without any fc */ - if (fm) { + if (sb->s_root) { last = fuse_mount_remove(fm); if (last) virtio_fs_conn_destroy(fm); } kill_anon_super(sb); + fuse_mount_destroy(fm); } static int virtio_fs_test_super(struct super_block *sb, @@ -1455,19 +1456,14 @@ static int virtio_fs_get_tree(struct fs_context *fsc) fsc->s_fs_info = fm; sb = sget_fc(fsc, virtio_fs_test_super, set_anon_super_fc); - if (fsc->s_fs_info) { - fuse_conn_put(fc); - kfree(fm); - } + if (fsc->s_fs_info) + fuse_mount_destroy(fm); if (IS_ERR(sb)) return PTR_ERR(sb); if (!sb->s_root) { err = virtio_fs_fill_super(sb, fsc); if (err) { - fuse_conn_put(fc); - kfree(fm); - sb->s_fs_info = NULL; deactivate_locked_super(sb); return err; } diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c index 61dfaf7b7d20..0d3e7177fce0 100644 --- a/fs/fuse/xattr.c +++ b/fs/fuse/xattr.c @@ -42,10 +42,9 @@ int fuse_setxattr(struct inode *inode, const char *name, const void *value, fm->fc->no_setxattr = 1; err = -EOPNOTSUPP; } - if (!err) { - fuse_invalidate_attr(inode); + if (!err) fuse_update_ctime(inode); - } + return err; } @@ -173,10 +172,9 @@ int fuse_removexattr(struct inode *inode, const char *name) fm->fc->no_removexattr = 1; err = -EOPNOTSUPP; } - if (!err) { - fuse_invalidate_attr(inode); + if (!err) fuse_update_ctime(inode); - } + return err; } diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 5414c2c33580..7235d539e969 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -961,46 +961,6 @@ hole_found: goto out; } -static int gfs2_write_lock(struct inode *inode) -{ - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_sbd *sdp = GFS2_SB(inode); - int error; - - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); - error = gfs2_glock_nq(&ip->i_gh); - if (error) - goto out_uninit; - if (&ip->i_inode == sdp->sd_rindex) { - struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); - - error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, - GL_NOCACHE, &m_ip->i_gh); - if (error) - goto out_unlock; - } - return 0; - -out_unlock: - gfs2_glock_dq(&ip->i_gh); -out_uninit: - gfs2_holder_uninit(&ip->i_gh); - return error; -} - -static void gfs2_write_unlock(struct inode *inode) -{ - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_sbd *sdp = GFS2_SB(inode); - - if (&ip->i_inode == sdp->sd_rindex) { - struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); - - gfs2_glock_dq_uninit(&m_ip->i_gh); - } - gfs2_glock_dq_uninit(&ip->i_gh); -} - static int gfs2_iomap_page_prepare(struct inode *inode, loff_t pos, unsigned len) { @@ -1118,11 +1078,6 @@ out_qunlock: return ret; } -static inline bool gfs2_iomap_need_write_lock(unsigned flags) -{ - return (flags & IOMAP_WRITE) && !(flags & IOMAP_DIRECT); -} - static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length, unsigned flags, struct iomap *iomap, struct iomap *srcmap) @@ -1135,12 +1090,6 @@ static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length, iomap->flags |= IOMAP_F_BUFFER_HEAD; trace_gfs2_iomap_start(ip, pos, length, flags); - if (gfs2_iomap_need_write_lock(flags)) { - ret = gfs2_write_lock(inode); - if (ret) - goto out; - } - ret = __gfs2_iomap_get(inode, pos, length, flags, iomap, &mp); if (ret) goto out_unlock; @@ -1168,10 +1117,7 @@ static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length, ret = gfs2_iomap_begin_write(inode, pos, length, flags, iomap, &mp); out_unlock: - if (ret && gfs2_iomap_need_write_lock(flags)) - gfs2_write_unlock(inode); release_metapath(&mp); -out: trace_gfs2_iomap_end(ip, iomap, ret); return ret; } @@ -1219,15 +1165,11 @@ static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length, } if (unlikely(!written)) - goto out_unlock; + return 0; if (iomap->flags & IOMAP_F_SIZE_CHANGED) mark_inode_dirty(inode); set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); - -out_unlock: - if (gfs2_iomap_need_write_lock(flags)) - gfs2_write_unlock(inode); return 0; } diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index c559827cb6f9..adafaaf7d24d 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -213,11 +213,9 @@ void gfs2_set_inode_flags(struct inode *inode) * @inode: The inode * @reqflags: The flags to set * @mask: Indicates which flags are valid - * @fsflags: The FS_* inode flags passed in * */ -static int do_gfs2_set_flags(struct inode *inode, u32 reqflags, u32 mask, - const u32 fsflags) +static int do_gfs2_set_flags(struct inode *inode, u32 reqflags, u32 mask) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); @@ -236,11 +234,6 @@ static int do_gfs2_set_flags(struct inode *inode, u32 reqflags, u32 mask, if ((new_flags ^ flags) == 0) goto out; - error = -EPERM; - if (IS_IMMUTABLE(inode) && (new_flags & GFS2_DIF_IMMUTABLE)) - goto out; - if (IS_APPEND(inode) && (new_flags & GFS2_DIF_APPENDONLY)) - goto out; if (!IS_IMMUTABLE(inode)) { error = gfs2_permission(&init_user_ns, inode, MAY_WRITE); if (error) @@ -313,7 +306,7 @@ int gfs2_fileattr_set(struct user_namespace *mnt_userns, mask &= ~(GFS2_DIF_TOPDIR | GFS2_DIF_INHERIT_JDATA); } - return do_gfs2_set_flags(inode, gfsflags, mask, fsflags); + return do_gfs2_set_flags(inode, gfsflags, mask); } static int gfs2_getlabel(struct file *filp, char __user *label) @@ -776,27 +769,99 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end, return ret ? ret : ret1; } +static inline bool should_fault_in_pages(ssize_t ret, struct iov_iter *i, + size_t *prev_count, + size_t *window_size) +{ + char __user *p = i->iov[0].iov_base + i->iov_offset; + size_t count = iov_iter_count(i); + int pages = 1; + + if (likely(!count)) + return false; + if (ret <= 0 && ret != -EFAULT) + return false; + if (!iter_is_iovec(i)) + return false; + + if (*prev_count != count || !*window_size) { + int pages, nr_dirtied; + + pages = min_t(int, BIO_MAX_VECS, + DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE)); + nr_dirtied = max(current->nr_dirtied_pause - + current->nr_dirtied, 1); + pages = min(pages, nr_dirtied); + } + + *prev_count = count; + *window_size = (size_t)PAGE_SIZE * pages - offset_in_page(p); + return true; +} + static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to, struct gfs2_holder *gh) { struct file *file = iocb->ki_filp; struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); - size_t count = iov_iter_count(to); + size_t prev_count = 0, window_size = 0; + size_t written = 0; ssize_t ret; - if (!count) + /* + * In this function, we disable page faults when we're holding the + * inode glock while doing I/O. If a page fault occurs, we indicate + * that the inode glock may be dropped, fault in the pages manually, + * and retry. + * + * Unlike generic_file_read_iter, for reads, iomap_dio_rw can trigger + * physical as well as manual page faults, and we need to disable both + * kinds. + * + * For direct I/O, gfs2 takes the inode glock in deferred mode. This + * locking mode is compatible with other deferred holders, so multiple + * processes and nodes can do direct I/O to a file at the same time. + * There's no guarantee that reads or writes will be atomic. Any + * coordination among readers and writers needs to happen externally. + */ + + if (!iov_iter_count(to)) return 0; /* skip atime */ gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, gh); +retry: ret = gfs2_glock_nq(gh); if (ret) goto out_uninit; +retry_under_glock: + pagefault_disable(); + to->nofault = true; + ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, + IOMAP_DIO_PARTIAL, written); + to->nofault = false; + pagefault_enable(); + if (ret > 0) + written = ret; + + if (should_fault_in_pages(ret, to, &prev_count, &window_size)) { + size_t leftover; - ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL, 0); - gfs2_glock_dq(gh); + gfs2_holder_allow_demote(gh); + leftover = fault_in_iov_iter_writeable(to, window_size); + gfs2_holder_disallow_demote(gh); + if (leftover != window_size) { + if (!gfs2_holder_queued(gh)) + goto retry; + goto retry_under_glock; + } + } + if (gfs2_holder_queued(gh)) + gfs2_glock_dq(gh); out_uninit: gfs2_holder_uninit(gh); - return ret; + if (ret < 0) + return ret; + return written; } static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from, @@ -805,11 +870,21 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from, struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; struct gfs2_inode *ip = GFS2_I(inode); - size_t len = iov_iter_count(from); - loff_t offset = iocb->ki_pos; + size_t prev_count = 0, window_size = 0; + size_t read = 0; ssize_t ret; /* + * In this function, we disable page faults when we're holding the + * inode glock while doing I/O. If a page fault occurs, we indicate + * that the inode glock may be dropped, fault in the pages manually, + * and retry. + * + * For writes, iomap_dio_rw only triggers manual page faults, so we + * don't need to disable physical ones. + */ + + /* * Deferred lock, even if its a write, since we do no allocation on * this path. All we need to change is the atime, and this lock mode * ensures that other nodes have flushed their buffered read caches @@ -818,31 +893,62 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from, * VFS does. */ gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, gh); +retry: ret = gfs2_glock_nq(gh); if (ret) goto out_uninit; - +retry_under_glock: /* Silently fall back to buffered I/O when writing beyond EOF */ - if (offset + len > i_size_read(&ip->i_inode)) + if (iocb->ki_pos + iov_iter_count(from) > i_size_read(&ip->i_inode)) goto out; - ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, 0); + from->nofault = true; + ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL, + IOMAP_DIO_PARTIAL, read); + from->nofault = false; + if (ret == -ENOTBLK) ret = 0; + if (ret > 0) + read = ret; + + if (should_fault_in_pages(ret, from, &prev_count, &window_size)) { + size_t leftover; + + gfs2_holder_allow_demote(gh); + leftover = fault_in_iov_iter_readable(from, window_size); + gfs2_holder_disallow_demote(gh); + if (leftover != window_size) { + if (!gfs2_holder_queued(gh)) + goto retry; + goto retry_under_glock; + } + } out: - gfs2_glock_dq(gh); + if (gfs2_holder_queued(gh)) + gfs2_glock_dq(gh); out_uninit: gfs2_holder_uninit(gh); - return ret; + if (ret < 0) + return ret; + return read; } static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct gfs2_inode *ip; struct gfs2_holder gh; + size_t prev_count = 0, window_size = 0; size_t written = 0; ssize_t ret; + /* + * In this function, we disable page faults when we're holding the + * inode glock while doing I/O. If a page fault occurs, we indicate + * that the inode glock may be dropped, fault in the pages manually, + * and retry. + */ + if (iocb->ki_flags & IOCB_DIRECT) { ret = gfs2_file_direct_read(iocb, to, &gh); if (likely(ret != -ENOTBLK)) @@ -864,18 +970,118 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) } ip = GFS2_I(iocb->ki_filp->f_mapping->host); gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); +retry: ret = gfs2_glock_nq(&gh); if (ret) goto out_uninit; +retry_under_glock: + pagefault_disable(); ret = generic_file_read_iter(iocb, to); + pagefault_enable(); if (ret > 0) written += ret; - gfs2_glock_dq(&gh); + + if (should_fault_in_pages(ret, to, &prev_count, &window_size)) { + size_t leftover; + + gfs2_holder_allow_demote(&gh); + leftover = fault_in_iov_iter_writeable(to, window_size); + gfs2_holder_disallow_demote(&gh); + if (leftover != window_size) { + if (!gfs2_holder_queued(&gh)) { + if (written) + goto out_uninit; + goto retry; + } + goto retry_under_glock; + } + } + if (gfs2_holder_queued(&gh)) + gfs2_glock_dq(&gh); out_uninit: gfs2_holder_uninit(&gh); return written ? written : ret; } +static ssize_t gfs2_file_buffered_write(struct kiocb *iocb, + struct iov_iter *from, + struct gfs2_holder *gh) +{ + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_holder *statfs_gh = NULL; + size_t prev_count = 0, window_size = 0; + size_t read = 0; + ssize_t ret; + + /* + * In this function, we disable page faults when we're holding the + * inode glock while doing I/O. If a page fault occurs, we indicate + * that the inode glock may be dropped, fault in the pages manually, + * and retry. + */ + + if (inode == sdp->sd_rindex) { + statfs_gh = kmalloc(sizeof(*statfs_gh), GFP_NOFS); + if (!statfs_gh) + return -ENOMEM; + } + + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, gh); +retry: + ret = gfs2_glock_nq(gh); + if (ret) + goto out_uninit; +retry_under_glock: + if (inode == sdp->sd_rindex) { + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); + + ret = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, + GL_NOCACHE, statfs_gh); + if (ret) + goto out_unlock; + } + + current->backing_dev_info = inode_to_bdi(inode); + pagefault_disable(); + ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops); + pagefault_enable(); + current->backing_dev_info = NULL; + if (ret > 0) { + iocb->ki_pos += ret; + read += ret; + } + + if (inode == sdp->sd_rindex) + gfs2_glock_dq_uninit(statfs_gh); + + if (should_fault_in_pages(ret, from, &prev_count, &window_size)) { + size_t leftover; + + gfs2_holder_allow_demote(gh); + leftover = fault_in_iov_iter_readable(from, window_size); + gfs2_holder_disallow_demote(gh); + if (leftover != window_size) { + if (!gfs2_holder_queued(gh)) { + if (read) + goto out_uninit; + goto retry; + } + goto retry_under_glock; + } + } +out_unlock: + if (gfs2_holder_queued(gh)) + gfs2_glock_dq(gh); +out_uninit: + gfs2_holder_uninit(gh); + if (statfs_gh) + kfree(statfs_gh); + return read ? read : ret; +} + /** * gfs2_file_write_iter - Perform a write to a file * @iocb: The io context @@ -927,9 +1133,7 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) goto out_unlock; iocb->ki_flags |= IOCB_DSYNC; - current->backing_dev_info = inode_to_bdi(inode); - buffered = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops); - current->backing_dev_info = NULL; + buffered = gfs2_file_buffered_write(iocb, from, &gh); if (unlikely(buffered <= 0)) { if (!ret) ret = buffered; @@ -943,7 +1147,6 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) * the direct I/O range as we don't know if the buffered pages * made it to disk. */ - iocb->ki_pos += buffered; ret2 = generic_write_sync(iocb, buffered); invalidate_mapping_pages(mapping, (iocb->ki_pos - buffered) >> PAGE_SHIFT, @@ -951,13 +1154,9 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (!ret || ret2 > 0) ret += ret2; } else { - current->backing_dev_info = inode_to_bdi(inode); - ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops); - current->backing_dev_info = NULL; - if (likely(ret > 0)) { - iocb->ki_pos += ret; + ret = gfs2_file_buffered_write(iocb, from, &gh); + if (likely(ret > 0)) ret = generic_write_sync(iocb, ret); - } } out_unlock: @@ -1338,8 +1537,6 @@ static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl) { if (!(fl->fl_flags & FL_FLOCK)) return -ENOLCK; - if (fl->fl_type & LOCK_MAND) - return -EOPNOTSUPP; if (fl->fl_type == F_UNLCK) { do_unflock(file, fl); @@ -1353,7 +1550,7 @@ const struct file_operations gfs2_file_fops = { .llseek = gfs2_llseek, .read_iter = gfs2_file_read_iter, .write_iter = gfs2_file_write_iter, - .iopoll = iomap_dio_iopoll, + .iopoll = iocb_bio_iopoll, .unlocked_ioctl = gfs2_ioctl, .compat_ioctl = gfs2_compat_ioctl, .mmap = gfs2_mmap, @@ -1386,7 +1583,7 @@ const struct file_operations gfs2_file_fops_nolock = { .llseek = gfs2_llseek, .read_iter = gfs2_file_read_iter, .write_iter = gfs2_file_write_iter, - .iopoll = iomap_dio_iopoll, + .iopoll = iocb_bio_iopoll, .unlocked_ioctl = gfs2_ioctl, .compat_ioctl = gfs2_compat_ioctl, .mmap = gfs2_mmap, diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index e0eaa9cf9fb6..19f38aee1b61 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -58,6 +58,7 @@ struct gfs2_glock_iter { typedef void (*glock_examiner) (struct gfs2_glock * gl); static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target); +static void __gfs2_glock_dq(struct gfs2_holder *gh); static struct dentry *gfs2_root; static struct workqueue_struct *glock_workqueue; @@ -197,6 +198,12 @@ static int demote_ok(const struct gfs2_glock *gl) if (gl->gl_state == LM_ST_UNLOCKED) return 0; + /* + * Note that demote_ok is used for the lru process of disposing of + * glocks. For this purpose, we don't care if the glock's holders + * have the HIF_MAY_DEMOTE flag set or not. If someone is using + * them, don't demote. + */ if (!list_empty(&gl->gl_holders)) return 0; if (glops->go_demote_ok) @@ -294,6 +301,9 @@ void gfs2_glock_queue_put(struct gfs2_glock *gl) void gfs2_glock_put(struct gfs2_glock *gl) { + /* last put could call sleepable dlm api */ + might_sleep(); + if (lockref_put_or_lock(&gl->gl_lockref)) return; @@ -301,46 +311,59 @@ void gfs2_glock_put(struct gfs2_glock *gl) } /** - * may_grant - check if its ok to grant a new lock + * may_grant - check if it's ok to grant a new lock * @gl: The glock + * @current_gh: One of the current holders of @gl * @gh: The lock request which we wish to grant * - * Returns: true if its ok to grant the lock + * With our current compatibility rules, if a glock has one or more active + * holders (HIF_HOLDER flag set), any of those holders can be passed in as + * @current_gh; they are all the same as far as compatibility with the new @gh + * goes. + * + * Returns true if it's ok to grant the lock. */ -static inline int may_grant(const struct gfs2_glock *gl, const struct gfs2_holder *gh) -{ - const struct gfs2_holder *gh_head = list_first_entry(&gl->gl_holders, const struct gfs2_holder, gh_list); +static inline bool may_grant(struct gfs2_glock *gl, + struct gfs2_holder *current_gh, + struct gfs2_holder *gh) +{ + if (current_gh) { + GLOCK_BUG_ON(gl, !test_bit(HIF_HOLDER, ¤t_gh->gh_iflags)); + + switch(current_gh->gh_state) { + case LM_ST_EXCLUSIVE: + /* + * Here we make a special exception to grant holders + * who agree to share the EX lock with other holders + * who also have the bit set. If the original holder + * has the LM_FLAG_NODE_SCOPE bit set, we grant more + * holders with the bit set. + */ + return gh->gh_state == LM_ST_EXCLUSIVE && + (current_gh->gh_flags & LM_FLAG_NODE_SCOPE) && + (gh->gh_flags & LM_FLAG_NODE_SCOPE); - if (gh != gh_head) { - /** - * Here we make a special exception to grant holders who agree - * to share the EX lock with other holders who also have the - * bit set. If the original holder has the LM_FLAG_NODE_SCOPE bit - * is set, we grant more holders with the bit set. - */ - if (gh_head->gh_state == LM_ST_EXCLUSIVE && - (gh_head->gh_flags & LM_FLAG_NODE_SCOPE) && - gh->gh_state == LM_ST_EXCLUSIVE && - (gh->gh_flags & LM_FLAG_NODE_SCOPE)) - return 1; - if ((gh->gh_state == LM_ST_EXCLUSIVE || - gh_head->gh_state == LM_ST_EXCLUSIVE)) - return 0; + case LM_ST_SHARED: + case LM_ST_DEFERRED: + return gh->gh_state == current_gh->gh_state; + + default: + return false; + } } + if (gl->gl_state == gh->gh_state) - return 1; + return true; if (gh->gh_flags & GL_EXACT) - return 0; + return false; if (gl->gl_state == LM_ST_EXCLUSIVE) { - if (gh->gh_state == LM_ST_SHARED && gh_head->gh_state == LM_ST_SHARED) - return 1; - if (gh->gh_state == LM_ST_DEFERRED && gh_head->gh_state == LM_ST_DEFERRED) - return 1; + return gh->gh_state == LM_ST_SHARED || + gh->gh_state == LM_ST_DEFERRED; } - if (gl->gl_state != LM_ST_UNLOCKED && (gh->gh_flags & LM_FLAG_ANY)) - return 1; - return 0; + if (gh->gh_flags & LM_FLAG_ANY) + return gl->gl_state != LM_ST_UNLOCKED; + return false; } static void gfs2_holder_wake(struct gfs2_holder *gh) @@ -366,7 +389,7 @@ static void do_error(struct gfs2_glock *gl, const int ret) struct gfs2_holder *gh, *tmp; list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) { - if (test_bit(HIF_HOLDER, &gh->gh_iflags)) + if (!test_bit(HIF_WAIT, &gh->gh_iflags)) continue; if (ret & LM_OUT_ERROR) gh->gh_error = -EIO; @@ -381,6 +404,123 @@ static void do_error(struct gfs2_glock *gl, const int ret) } /** + * demote_incompat_holders - demote incompatible demoteable holders + * @gl: the glock we want to promote + * @new_gh: the new holder to be promoted + */ +static void demote_incompat_holders(struct gfs2_glock *gl, + struct gfs2_holder *new_gh) +{ + struct gfs2_holder *gh; + + /* + * Demote incompatible holders before we make ourselves eligible. + * (This holder may or may not allow auto-demoting, but we don't want + * to demote the new holder before it's even granted.) + */ + list_for_each_entry(gh, &gl->gl_holders, gh_list) { + /* + * Since holders are at the front of the list, we stop when we + * find the first non-holder. + */ + if (!test_bit(HIF_HOLDER, &gh->gh_iflags)) + return; + if (test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags) && + !may_grant(gl, new_gh, gh)) { + /* + * We should not recurse into do_promote because + * __gfs2_glock_dq only calls handle_callback, + * gfs2_glock_add_to_lru and __gfs2_glock_queue_work. + */ + __gfs2_glock_dq(gh); + } + } +} + +/** + * find_first_holder - find the first "holder" gh + * @gl: the glock + */ + +static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl) +{ + struct gfs2_holder *gh; + + if (!list_empty(&gl->gl_holders)) { + gh = list_first_entry(&gl->gl_holders, struct gfs2_holder, + gh_list); + if (test_bit(HIF_HOLDER, &gh->gh_iflags)) + return gh; + } + return NULL; +} + +/** + * find_first_strong_holder - find the first non-demoteable holder + * @gl: the glock + * + * Find the first holder that doesn't have the HIF_MAY_DEMOTE flag set. + */ +static inline struct gfs2_holder * +find_first_strong_holder(struct gfs2_glock *gl) +{ + struct gfs2_holder *gh; + + list_for_each_entry(gh, &gl->gl_holders, gh_list) { + if (!test_bit(HIF_HOLDER, &gh->gh_iflags)) + return NULL; + if (!test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags)) + return gh; + } + return NULL; +} + +/* + * gfs2_instantiate - Call the glops instantiate function + * @gl: The glock + * + * Returns: 0 if instantiate was successful, 2 if type specific operation is + * underway, or error. + */ +int gfs2_instantiate(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + const struct gfs2_glock_operations *glops = gl->gl_ops; + int ret; + +again: + if (!test_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags)) + return 0; + + /* + * Since we unlock the lockref lock, we set a flag to indicate + * instantiate is in progress. + */ + if (test_bit(GLF_INSTANTIATE_IN_PROG, &gl->gl_flags)) { + wait_on_bit(&gl->gl_flags, GLF_INSTANTIATE_IN_PROG, + TASK_UNINTERRUPTIBLE); + /* + * Here we just waited for a different instantiate to finish. + * But that may not have been successful, as when a process + * locks an inode glock _before_ it has an actual inode to + * instantiate into. So we check again. This process might + * have an inode to instantiate, so might be successful. + */ + goto again; + } + + set_bit(GLF_INSTANTIATE_IN_PROG, &gl->gl_flags); + + ret = glops->go_instantiate(gh); + if (!ret) + clear_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags); + clear_bit(GLF_INSTANTIATE_IN_PROG, &gl->gl_flags); + smp_mb__after_atomic(); + wake_up_bit(&gl->gl_flags, GLF_INSTANTIATE_IN_PROG); + return ret; +} + +/** * do_promote - promote as many requests as possible on the current queue * @gl: The glock * @@ -392,44 +532,59 @@ static int do_promote(struct gfs2_glock *gl) __releases(&gl->gl_lockref.lock) __acquires(&gl->gl_lockref.lock) { - const struct gfs2_glock_operations *glops = gl->gl_ops; - struct gfs2_holder *gh, *tmp; + struct gfs2_holder *gh, *tmp, *first_gh; + bool incompat_holders_demoted = false; + bool lock_released; int ret; restart: + first_gh = find_first_strong_holder(gl); list_for_each_entry_safe(gh, tmp, &gl->gl_holders, gh_list) { + lock_released = false; if (test_bit(HIF_HOLDER, &gh->gh_iflags)) continue; - if (may_grant(gl, gh)) { - if (gh->gh_list.prev == &gl->gl_holders && - glops->go_lock) { - spin_unlock(&gl->gl_lockref.lock); - /* FIXME: eliminate this eventually */ - ret = glops->go_lock(gh); - spin_lock(&gl->gl_lockref.lock); - if (ret) { - if (ret == 1) - return 2; - gh->gh_error = ret; - list_del_init(&gh->gh_list); - trace_gfs2_glock_queue(gh, 0); - gfs2_holder_wake(gh); - goto restart; - } - set_bit(HIF_HOLDER, &gh->gh_iflags); - trace_gfs2_promote(gh, 1); + if (!may_grant(gl, first_gh, gh)) { + /* + * If we get here, it means we may not grant this holder for + * some reason. If this holder is the head of the list, it + * means we have a blocked holder at the head, so return 1. + */ + if (gh->gh_list.prev == &gl->gl_holders) + return 1; + do_error(gl, 0); + break; + } + if (!incompat_holders_demoted) { + demote_incompat_holders(gl, first_gh); + incompat_holders_demoted = true; + first_gh = gh; + } + if (test_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags) && + !(gh->gh_flags & GL_SKIP) && gl->gl_ops->go_instantiate) { + lock_released = true; + spin_unlock(&gl->gl_lockref.lock); + ret = gfs2_instantiate(gh); + spin_lock(&gl->gl_lockref.lock); + if (ret) { + if (ret == 1) + return 2; + gh->gh_error = ret; + list_del_init(&gh->gh_list); + trace_gfs2_glock_queue(gh, 0); gfs2_holder_wake(gh); goto restart; } - set_bit(HIF_HOLDER, &gh->gh_iflags); - trace_gfs2_promote(gh, 0); - gfs2_holder_wake(gh); - continue; } - if (gh->gh_list.prev == &gl->gl_holders) - return 1; - do_error(gl, 0); - break; + set_bit(HIF_HOLDER, &gh->gh_iflags); + trace_gfs2_promote(gh); + gfs2_holder_wake(gh); + /* + * If we released the gl_lockref.lock the holders list may have + * changed. For that reason, we start again at the start of + * the holders queue. + */ + if (lock_released) + goto restart; } return 0; } @@ -723,23 +878,6 @@ out: } /** - * find_first_holder - find the first "holder" gh - * @gl: the glock - */ - -static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl) -{ - struct gfs2_holder *gh; - - if (!list_empty(&gl->gl_holders)) { - gh = list_first_entry(&gl->gl_holders, struct gfs2_holder, gh_list); - if (test_bit(HIF_HOLDER, &gh->gh_iflags)) - return gh; - } - return NULL; -} - -/** * run_queue - do all outstanding tasks related to a glock * @gl: The glock in question * @nonblock: True if we must not block in run_queue @@ -822,7 +960,7 @@ static void gfs2_glock_poke(struct gfs2_glock *gl) struct gfs2_holder gh; int error; - gfs2_holder_init(gl, LM_ST_SHARED, flags, &gh); + __gfs2_holder_init(gl, LM_ST_SHARED, flags, &gh, _RET_IP_); error = gfs2_glock_nq(&gh); if (!error) gfs2_glock_dq(&gh); @@ -1057,7 +1195,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, atomic_inc(&sdp->sd_glock_disposal); gl->gl_node.next = NULL; - gl->gl_flags = 0; + gl->gl_flags = glops->go_instantiate ? BIT(GLF_INSTANTIATE_NEEDED) : 0; gl->gl_name = name; lockdep_set_subclass(&gl->gl_lockref.lock, glops->go_subclass); gl->gl_lockref.count = 1; @@ -1119,12 +1257,12 @@ out: * */ -void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, u16 flags, - struct gfs2_holder *gh) +void __gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, u16 flags, + struct gfs2_holder *gh, unsigned long ip) { INIT_LIST_HEAD(&gh->gh_list); gh->gh_gl = gl; - gh->gh_ip = _RET_IP_; + gh->gh_ip = ip; gh->gh_owner_pid = get_pid(task_pid(current)); gh->gh_state = state; gh->gh_flags = flags; @@ -1354,15 +1492,20 @@ __acquires(&gl->gl_lockref.lock) GLOCK_BUG_ON(gl, true); if (gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB)) { - if (test_bit(GLF_LOCK, &gl->gl_flags)) - try_futile = !may_grant(gl, gh); + if (test_bit(GLF_LOCK, &gl->gl_flags)) { + struct gfs2_holder *first_gh; + + first_gh = find_first_strong_holder(gl); + try_futile = !may_grant(gl, first_gh, gh); + } if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) goto fail; } list_for_each_entry(gh2, &gl->gl_holders, gh_list) { if (unlikely(gh2->gh_owner_pid == gh->gh_owner_pid && - (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK))) + (gh->gh_gl->gl_ops->go_type != LM_TYPE_FLOCK) && + !test_bit(HIF_MAY_DEMOTE, &gh2->gh_iflags))) goto trap_recursive; if (try_futile && !(gh2->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) { @@ -1458,51 +1601,83 @@ int gfs2_glock_poll(struct gfs2_holder *gh) return test_bit(HIF_WAIT, &gh->gh_iflags) ? 0 : 1; } -/** - * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock) - * @gh: the glock holder - * - */ +static inline bool needs_demote(struct gfs2_glock *gl) +{ + return (test_bit(GLF_DEMOTE, &gl->gl_flags) || + test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags)); +} -void gfs2_glock_dq(struct gfs2_holder *gh) +static void __gfs2_glock_dq(struct gfs2_holder *gh) { struct gfs2_glock *gl = gh->gh_gl; struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; unsigned delay = 0; int fast_path = 0; - spin_lock(&gl->gl_lockref.lock); /* - * If we're in the process of file system withdraw, we cannot just - * dequeue any glocks until our journal is recovered, lest we - * introduce file system corruption. We need two exceptions to this - * rule: We need to allow unlocking of nondisk glocks and the glock - * for our own journal that needs recovery. + * This while loop is similar to function demote_incompat_holders: + * If the glock is due to be demoted (which may be from another node + * or even if this holder is GL_NOCACHE), the weak holders are + * demoted as well, allowing the glock to be demoted. */ - if (test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags) && - glock_blocked_by_withdraw(gl) && - gh->gh_gl != sdp->sd_jinode_gl) { - sdp->sd_glock_dqs_held++; - spin_unlock(&gl->gl_lockref.lock); - might_sleep(); - wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY, - TASK_UNINTERRUPTIBLE); - spin_lock(&gl->gl_lockref.lock); - } - if (gh->gh_flags & GL_NOCACHE) - handle_callback(gl, LM_ST_UNLOCKED, 0, false); + while (gh) { + /* + * If we're in the process of file system withdraw, we cannot + * just dequeue any glocks until our journal is recovered, lest + * we introduce file system corruption. We need two exceptions + * to this rule: We need to allow unlocking of nondisk glocks + * and the glock for our own journal that needs recovery. + */ + if (test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags) && + glock_blocked_by_withdraw(gl) && + gh->gh_gl != sdp->sd_jinode_gl) { + sdp->sd_glock_dqs_held++; + spin_unlock(&gl->gl_lockref.lock); + might_sleep(); + wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY, + TASK_UNINTERRUPTIBLE); + spin_lock(&gl->gl_lockref.lock); + } + + /* + * This holder should not be cached, so mark it for demote. + * Note: this should be done before the check for needs_demote + * below. + */ + if (gh->gh_flags & GL_NOCACHE) + handle_callback(gl, LM_ST_UNLOCKED, 0, false); + + list_del_init(&gh->gh_list); + clear_bit(HIF_HOLDER, &gh->gh_iflags); + trace_gfs2_glock_queue(gh, 0); + + /* + * If there hasn't been a demote request we are done. + * (Let the remaining holders, if any, keep holding it.) + */ + if (!needs_demote(gl)) { + if (list_empty(&gl->gl_holders)) + fast_path = 1; + break; + } + /* + * If we have another strong holder (we cannot auto-demote) + * we are done. It keeps holding it until it is done. + */ + if (find_first_strong_holder(gl)) + break; - list_del_init(&gh->gh_list); - clear_bit(HIF_HOLDER, &gh->gh_iflags); - if (list_empty(&gl->gl_holders) && - !test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && - !test_bit(GLF_DEMOTE, &gl->gl_flags)) - fast_path = 1; + /* + * If we have a weak holder at the head of the list, it + * (and all others like it) must be auto-demoted. If there + * are no more weak holders, we exit the while loop. + */ + gh = find_first_holder(gl); + } if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl)) gfs2_glock_add_to_lru(gl); - trace_gfs2_glock_queue(gh, 0); if (unlikely(!fast_path)) { gl->gl_lockref.count++; if (test_bit(GLF_PENDING_DEMOTE, &gl->gl_flags) && @@ -1511,6 +1686,19 @@ void gfs2_glock_dq(struct gfs2_holder *gh) delay = gl->gl_hold_time; __gfs2_glock_queue_work(gl, delay); } +} + +/** + * gfs2_glock_dq - dequeue a struct gfs2_holder from a glock (release a glock) + * @gh: the glock holder + * + */ +void gfs2_glock_dq(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + + spin_lock(&gl->gl_lockref.lock); + __gfs2_glock_dq(gh); spin_unlock(&gl->gl_lockref.lock); } @@ -1673,6 +1861,7 @@ void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs) void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) { + struct gfs2_holder mock_gh = { .gh_gl = gl, .gh_state = state, }; unsigned long delay = 0; unsigned long holdtime; unsigned long now = jiffies; @@ -1687,6 +1876,28 @@ void gfs2_glock_cb(struct gfs2_glock *gl, unsigned int state) if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags)) delay = gl->gl_hold_time; } + /* + * Note 1: We cannot call demote_incompat_holders from handle_callback + * or gfs2_set_demote due to recursion problems like: gfs2_glock_dq -> + * handle_callback -> demote_incompat_holders -> gfs2_glock_dq + * Plus, we only want to demote the holders if the request comes from + * a remote cluster node because local holder conflicts are resolved + * elsewhere. + * + * Note 2: if a remote node wants this glock in EX mode, lock_dlm will + * request that we set our state to UNLOCKED. Here we mock up a holder + * to make it look like someone wants the lock EX locally. Any SH + * and DF requests should be able to share the lock without demoting. + * + * Note 3: We only want to demote the demoteable holders when there + * are no more strong holders. The demoteable holders might as well + * keep the glock until the last strong holder is done with it. + */ + if (!find_first_strong_holder(gl)) { + if (state == LM_ST_UNLOCKED) + mock_gh.gh_state = LM_ST_EXCLUSIVE; + demote_incompat_holders(gl, &mock_gh); + } handle_callback(gl, state, delay, true); __gfs2_glock_queue_work(gl, delay); spin_unlock(&gl->gl_lockref.lock); @@ -1893,10 +2104,10 @@ static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp) do { rhashtable_walk_start(&iter); - while ((gl = rhashtable_walk_next(&iter)) && !IS_ERR(gl)) - if (gl->gl_name.ln_sbd == sdp && - lockref_get_not_dead(&gl->gl_lockref)) + while ((gl = rhashtable_walk_next(&iter)) && !IS_ERR(gl)) { + if (gl->gl_name.ln_sbd == sdp) examiner(gl); + } rhashtable_walk_stop(&iter); } while (cond_resched(), gl == ERR_PTR(-EAGAIN)); @@ -1919,7 +2130,7 @@ bool gfs2_queue_delete_work(struct gfs2_glock *gl, unsigned long delay) void gfs2_cancel_delete_work(struct gfs2_glock *gl) { - if (cancel_delayed_work_sync(&gl->gl_delete)) { + if (cancel_delayed_work(&gl->gl_delete)) { clear_bit(GLF_PENDING_DELETE, &gl->gl_flags); gfs2_glock_put(gl); } @@ -1938,7 +2149,6 @@ static void flush_delete_work(struct gfs2_glock *gl) &gl->gl_delete, 0); } } - gfs2_glock_queue_work(gl, 0); } void gfs2_flush_delete_work(struct gfs2_sbd *sdp) @@ -1955,10 +2165,10 @@ void gfs2_flush_delete_work(struct gfs2_sbd *sdp) static void thaw_glock(struct gfs2_glock *gl) { - if (!test_and_clear_bit(GLF_FROZEN, &gl->gl_flags)) { - gfs2_glock_put(gl); + if (!test_and_clear_bit(GLF_FROZEN, &gl->gl_flags)) + return; + if (!lockref_get_not_dead(&gl->gl_lockref)) return; - } set_bit(GLF_REPLY_PENDING, &gl->gl_flags); gfs2_glock_queue_work(gl, 0); } @@ -1974,9 +2184,12 @@ static void clear_glock(struct gfs2_glock *gl) gfs2_glock_remove_from_lru(gl); spin_lock(&gl->gl_lockref.lock); - if (gl->gl_state != LM_ST_UNLOCKED) - handle_callback(gl, LM_ST_UNLOCKED, 0, false); - __gfs2_glock_queue_work(gl, 0); + if (!__lockref_is_dead(&gl->gl_lockref)) { + gl->gl_lockref.count++; + if (gl->gl_state != LM_ST_UNLOCKED) + handle_callback(gl, LM_ST_UNLOCKED, 0, false); + __gfs2_glock_queue_work(gl, 0); + } spin_unlock(&gl->gl_lockref.lock); } @@ -2076,6 +2289,10 @@ static const char *hflags2str(char *buf, u16 flags, unsigned long iflags) *p++ = 'H'; if (test_bit(HIF_WAIT, &iflags)) *p++ = 'W'; + if (test_bit(HIF_MAY_DEMOTE, &iflags)) + *p++ = 'D'; + if (flags & GL_SKIP) + *p++ = 's'; *p = 0; return buf; } @@ -2144,6 +2361,10 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl) *p++ = 'P'; if (test_bit(GLF_FREEING, gflags)) *p++ = 'x'; + if (test_bit(GLF_INSTANTIATE_NEEDED, gflags)) + *p++ = 'n'; + if (test_bit(GLF_INSTANTIATE_IN_PROG, gflags)) + *p++ = 'N'; *p = 0; return buf; } diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 31a8f2f649b5..4f8642301801 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -150,6 +150,8 @@ static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock * list_for_each_entry(gh, &gl->gl_holders, gh_list) { if (!test_bit(HIF_HOLDER, &gh->gh_iflags)) break; + if (test_bit(HIF_MAY_DEMOTE, &gh->gh_iflags)) + continue; if (gh->gh_owner_pid == pid) goto out; } @@ -188,13 +190,21 @@ extern int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, extern void gfs2_glock_hold(struct gfs2_glock *gl); extern void gfs2_glock_put(struct gfs2_glock *gl); extern void gfs2_glock_queue_put(struct gfs2_glock *gl); -extern void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, - u16 flags, struct gfs2_holder *gh); + +extern void __gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, + u16 flags, struct gfs2_holder *gh, + unsigned long ip); +static inline void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, + u16 flags, struct gfs2_holder *gh) { + __gfs2_holder_init(gl, state, flags, gh, _RET_IP_); +} + extern void gfs2_holder_reinit(unsigned int state, u16 flags, struct gfs2_holder *gh); extern void gfs2_holder_uninit(struct gfs2_holder *gh); extern int gfs2_glock_nq(struct gfs2_holder *gh); extern int gfs2_glock_poll(struct gfs2_holder *gh); +extern int gfs2_instantiate(struct gfs2_holder *gh); extern int gfs2_glock_wait(struct gfs2_holder *gh); extern int gfs2_glock_async_wait(unsigned int num_gh, struct gfs2_holder *ghs); extern void gfs2_glock_dq(struct gfs2_holder *gh); @@ -239,7 +249,7 @@ static inline int gfs2_glock_nq_init(struct gfs2_glock *gl, { int error; - gfs2_holder_init(gl, state, flags, gh); + __gfs2_holder_init(gl, state, flags, gh, _RET_IP_); error = gfs2_glock_nq(gh); if (error) @@ -325,6 +335,24 @@ static inline void glock_clear_object(struct gfs2_glock *gl, void *object) spin_unlock(&gl->gl_lockref.lock); } +static inline void gfs2_holder_allow_demote(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + + spin_lock(&gl->gl_lockref.lock); + set_bit(HIF_MAY_DEMOTE, &gh->gh_iflags); + spin_unlock(&gl->gl_lockref.lock); +} + +static inline void gfs2_holder_disallow_demote(struct gfs2_holder *gh) +{ + struct gfs2_glock *gl = gh->gh_gl; + + spin_lock(&gl->gl_lockref.lock); + clear_bit(HIF_MAY_DEMOTE, &gh->gh_iflags); + spin_unlock(&gl->gl_lockref.lock); +} + extern void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation); extern bool gfs2_inode_already_deleted(struct gfs2_glock *gl, u64 generation); diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 79c621c7863d..650ad77c4d0b 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -228,7 +228,7 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags) gfs2_rgrp_brelse(rgd); WARN_ON_ONCE(!(flags & DIO_METADATA)); truncate_inode_pages_range(mapping, start, end); - rgd->rd_flags &= ~GFS2_RDF_UPTODATE; + set_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags); } static void gfs2_rgrp_go_dump(struct seq_file *seq, struct gfs2_glock *gl, @@ -356,7 +356,7 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags) struct address_space *mapping = gfs2_glock2aspace(gl); truncate_inode_pages(mapping, 0); if (ip) { - set_bit(GIF_INVALID, &ip->i_flags); + set_bit(GLF_INSTANTIATE_NEEDED, &gl->gl_flags); forget_all_cached_acls(&ip->i_inode); security_inode_invalidate_secctx(&ip->i_inode); gfs2_dir_hash_inval(ip); @@ -476,33 +476,29 @@ int gfs2_inode_refresh(struct gfs2_inode *ip) error = gfs2_dinode_in(ip, dibh->b_data); brelse(dibh); - clear_bit(GIF_INVALID, &ip->i_flags); - return error; } /** - * inode_go_lock - operation done after an inode lock is locked by a process + * inode_go_instantiate - read in an inode if necessary * @gh: The glock holder * * Returns: errno */ -static int inode_go_lock(struct gfs2_holder *gh) +static int inode_go_instantiate(struct gfs2_holder *gh) { struct gfs2_glock *gl = gh->gh_gl; struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct gfs2_inode *ip = gl->gl_object; int error = 0; - if (!ip || (gh->gh_flags & GL_SKIP)) - return 0; + if (!ip) /* no inode to populate - read it in later */ + goto out; - if (test_bit(GIF_INVALID, &ip->i_flags)) { - error = gfs2_inode_refresh(ip); - if (error) - return error; - } + error = gfs2_inode_refresh(ip); + if (error) + goto out; if (gh->gh_state != LM_ST_DEFERRED) inode_dio_wait(&ip->i_inode); @@ -515,9 +511,10 @@ static int inode_go_lock(struct gfs2_holder *gh) list_add(&ip->i_trunc_list, &sdp->sd_trunc_list); spin_unlock(&sdp->sd_trunc_lock); wake_up(&sdp->sd_quota_wait); - return 1; + error = 1; } +out: return error; } @@ -740,7 +737,7 @@ const struct gfs2_glock_operations gfs2_inode_glops = { .go_sync = inode_go_sync, .go_inval = inode_go_inval, .go_demote_ok = inode_go_demote_ok, - .go_lock = inode_go_lock, + .go_instantiate = inode_go_instantiate, .go_dump = inode_go_dump, .go_type = LM_TYPE_INODE, .go_flags = GLOF_ASPACE | GLOF_LRU | GLOF_LVB, @@ -750,7 +747,7 @@ const struct gfs2_glock_operations gfs2_inode_glops = { const struct gfs2_glock_operations gfs2_rgrp_glops = { .go_sync = rgrp_go_sync, .go_inval = rgrp_go_inval, - .go_lock = gfs2_rgrp_go_lock, + .go_instantiate = gfs2_rgrp_go_instantiate, .go_dump = gfs2_rgrp_go_dump, .go_type = LM_TYPE_RGRP, .go_flags = GLOF_LVB, diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 0fe49770166e..8c00fb389ae5 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -119,7 +119,6 @@ struct gfs2_rgrpd { u32 rd_flags; u32 rd_extfail_pt; /* extent failure point */ #define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */ -#define GFS2_RDF_UPTODATE 0x20000000 /* rg is up to date */ #define GFS2_RDF_ERROR 0x40000000 /* error in rg */ #define GFS2_RDF_PREFERRED 0x80000000 /* This rgrp is preferred */ #define GFS2_RDF_MASK 0xf0000000 /* mask for internal flags */ @@ -220,7 +219,7 @@ struct gfs2_glock_operations { int (*go_xmote_bh)(struct gfs2_glock *gl); void (*go_inval) (struct gfs2_glock *gl, int flags); int (*go_demote_ok) (const struct gfs2_glock *gl); - int (*go_lock) (struct gfs2_holder *gh); + int (*go_instantiate) (struct gfs2_holder *gh); void (*go_dump)(struct seq_file *seq, struct gfs2_glock *gl, const char *fs_id_buf); void (*go_callback)(struct gfs2_glock *gl, bool remote); @@ -252,6 +251,7 @@ struct gfs2_lkstats { enum { /* States */ + HIF_MAY_DEMOTE = 1, HIF_HOLDER = 6, /* Set for gh that "holds" the glock */ HIF_WAIT = 10, }; @@ -315,6 +315,7 @@ struct gfs2_alloc_parms { enum { GLF_LOCK = 1, + GLF_INSTANTIATE_NEEDED = 2, /* needs instantiate */ GLF_DEMOTE = 3, GLF_PENDING_DEMOTE = 4, GLF_DEMOTE_IN_PROGRESS = 5, @@ -324,6 +325,7 @@ enum { GLF_REPLY_PENDING = 9, GLF_INITIAL = 10, GLF_FROZEN = 11, + GLF_INSTANTIATE_IN_PROG = 12, /* instantiate happening now */ GLF_LRU = 13, GLF_OBJECT = 14, /* Used only for tracing */ GLF_BLOCKING = 15, @@ -370,7 +372,6 @@ struct gfs2_glock { }; enum { - GIF_INVALID = 0, GIF_QD_LOCKED = 1, GIF_ALLOC_FAILED = 2, GIF_SW_PAGED = 3, @@ -386,9 +387,8 @@ struct gfs2_inode { u64 i_generation; u64 i_eattr; unsigned long i_flags; /* GIF_... */ - struct gfs2_glock *i_gl; /* Move into i_gh? */ + struct gfs2_glock *i_gl; struct gfs2_holder i_iopen_gh; - struct gfs2_holder i_gh; /* for prepare/commit_write only */ struct gfs2_qadata *i_qadata; /* quota allocation data */ struct gfs2_holder i_rgd_gh; struct gfs2_blkreserv i_res; /* rgrp multi-block reservation */ diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 3130f85d2b3f..6424b903e885 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -182,7 +182,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, } glock_set_object(ip->i_gl, ip); - set_bit(GIF_INVALID, &ip->i_flags); + set_bit(GLF_INSTANTIATE_NEEDED, &ip->i_gl->gl_flags); error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); if (unlikely(error)) goto fail; @@ -196,7 +196,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, if (type == DT_UNKNOWN) { /* Inode glock must be locked already */ - error = gfs2_inode_refresh(GFS2_I(inode)); + error = gfs2_instantiate(&i_gh); if (error) goto fail; } else { @@ -225,6 +225,10 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, return inode; fail: + if (gfs2_holder_initialized(&ip->i_iopen_gh)) { + glock_clear_object(ip->i_iopen_gh.gh_gl, ip); + gfs2_glock_dq_uninit(&ip->i_iopen_gh); + } if (io_gl) gfs2_glock_put(io_gl); if (gfs2_holder_initialized(&i_gh)) @@ -727,18 +731,17 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (error) goto fail_free_inode; flush_delayed_work(&ip->i_gl->gl_work); - glock_set_object(ip->i_gl, ip); error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_iopen_glops, CREATE, &io_gl); if (error) goto fail_free_inode; gfs2_cancel_delete_work(io_gl); - glock_set_object(io_gl, ip); error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1); if (error) goto fail_gunlock2; + glock_set_object(ip->i_gl, ip); error = gfs2_trans_begin(sdp, blocks, 0); if (error) goto fail_gunlock2; @@ -754,6 +757,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (error) goto fail_gunlock2; + glock_set_object(io_gl, ip); gfs2_set_iop(inode); insert_inode_hash(inode); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index c3b00ba92ed2..0fb3c01bc557 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -932,7 +932,7 @@ static int read_rindex_entry(struct gfs2_inode *ip) goto fail; rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr; - rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED); + rgd->rd_flags &= ~GFS2_RDF_PREFERRED; if (rgd->rd_data > sdp->sd_max_rg_data) sdp->sd_max_rg_data = rgd->rd_data; spin_lock(&sdp->sd_rindex_spin); @@ -1185,8 +1185,8 @@ static void rgrp_set_bitmap_flags(struct gfs2_rgrpd *rgd) } /** - * gfs2_rgrp_bh_get - Read in a RG's header and bitmaps - * @rgd: the struct gfs2_rgrpd describing the RG to read in + * gfs2_rgrp_go_instantiate - Read in a RG's header and bitmaps + * @gh: the glock holder representing the rgrpd to read in * * Read in all of a Resource Group's header and bitmap blocks. * Caller must eventually call gfs2_rgrp_brelse() to free the bitmaps. @@ -1194,10 +1194,11 @@ static void rgrp_set_bitmap_flags(struct gfs2_rgrpd *rgd) * Returns: errno */ -static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) +int gfs2_rgrp_go_instantiate(struct gfs2_holder *gh) { + struct gfs2_glock *gl = gh->gh_gl; + struct gfs2_rgrpd *rgd = gl->gl_object; struct gfs2_sbd *sdp = rgd->rd_sbd; - struct gfs2_glock *gl = rgd->rd_gl; unsigned int length = rgd->rd_length; struct gfs2_bitmap *bi; unsigned int x, y; @@ -1225,21 +1226,18 @@ static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) } } - if (!(rgd->rd_flags & GFS2_RDF_UPTODATE)) { - gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data); - rgrp_set_bitmap_flags(rgd); - rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK); - rgd->rd_free_clone = rgd->rd_free; - BUG_ON(rgd->rd_reserved); - /* max out the rgrp allocation failure point */ - rgd->rd_extfail_pt = rgd->rd_free; - } + gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data); + rgrp_set_bitmap_flags(rgd); + rgd->rd_flags |= GFS2_RDF_CHECK; + rgd->rd_free_clone = rgd->rd_free; + GLOCK_BUG_ON(rgd->rd_gl, rgd->rd_reserved); + /* max out the rgrp allocation failure point */ + rgd->rd_extfail_pt = rgd->rd_free; if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) { rgd->rd_rgl->rl_unlinked = cpu_to_be32(count_unlinked(rgd)); gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data); - } - else if (sdp->sd_args.ar_rgrplvb) { + } else if (sdp->sd_args.ar_rgrplvb) { if (!gfs2_rgrp_lvb_valid(rgd)){ gfs2_consist_rgrpd(rgd); error = -EIO; @@ -1257,19 +1255,18 @@ fail: bi->bi_bh = NULL; gfs2_assert_warn(sdp, !bi->bi_clone); } - return error; } -static int update_rgrp_lvb(struct gfs2_rgrpd *rgd) +static int update_rgrp_lvb(struct gfs2_rgrpd *rgd, struct gfs2_holder *gh) { u32 rl_flags; - if (rgd->rd_flags & GFS2_RDF_UPTODATE) + if (!test_bit(GLF_INSTANTIATE_NEEDED, &gh->gh_gl->gl_flags)) return 0; if (cpu_to_be32(GFS2_MAGIC) != rgd->rd_rgl->rl_magic) - return gfs2_rgrp_bh_get(rgd); + return gfs2_instantiate(gh); rl_flags = be32_to_cpu(rgd->rd_rgl->rl_flags); rl_flags &= ~GFS2_RDF_MASK; @@ -1280,7 +1277,7 @@ static int update_rgrp_lvb(struct gfs2_rgrpd *rgd) rgd->rd_free = be32_to_cpu(rgd->rd_rgl->rl_free); rgrp_set_bitmap_flags(rgd); rgd->rd_free_clone = rgd->rd_free; - BUG_ON(rgd->rd_reserved); + GLOCK_BUG_ON(rgd->rd_gl, rgd->rd_reserved); /* max out the rgrp allocation failure point */ rgd->rd_extfail_pt = rgd->rd_free; rgd->rd_dinodes = be32_to_cpu(rgd->rd_rgl->rl_dinodes); @@ -1288,16 +1285,6 @@ static int update_rgrp_lvb(struct gfs2_rgrpd *rgd) return 0; } -int gfs2_rgrp_go_lock(struct gfs2_holder *gh) -{ - struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object; - struct gfs2_sbd *sdp = rgd->rd_sbd; - - if (gh->gh_flags & GL_SKIP && sdp->sd_args.ar_rgrplvb) - return 0; - return gfs2_rgrp_bh_get(rgd); -} - /** * gfs2_rgrp_brelse - Release RG bitmaps read in with gfs2_rgrp_bh_get() * @rgd: The resource group @@ -1315,6 +1302,7 @@ void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd) bi->bi_bh = NULL; } } + set_bit(GLF_INSTANTIATE_NEEDED, &rgd->rd_gl->gl_flags); } int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, @@ -2113,7 +2101,8 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap) gfs2_rgrp_congested(rs->rs_rgd, loops)) goto skip_rgrp; if (sdp->sd_args.ar_rgrplvb) { - error = update_rgrp_lvb(rs->rs_rgd); + error = update_rgrp_lvb(rs->rs_rgd, + &ip->i_rgd_gh); if (unlikely(error)) { rgrp_unlock_local(rs->rs_rgd); gfs2_glock_dq_uninit(&ip->i_rgd_gh); @@ -2128,8 +2117,11 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap) (loops == 0 && target > rs->rs_rgd->rd_extfail_pt)) goto skip_rgrp; - if (sdp->sd_args.ar_rgrplvb) - gfs2_rgrp_bh_get(rs->rs_rgd); + if (sdp->sd_args.ar_rgrplvb) { + error = gfs2_instantiate(&ip->i_rgd_gh); + if (error) + goto skip_rgrp; + } /* Get a reservation if we don't already have one */ if (!gfs2_rs_active(rs)) @@ -2215,7 +2207,7 @@ void gfs2_inplace_release(struct gfs2_inode *ip) struct gfs2_rgrpd *rgd = rs->rs_rgd; spin_lock(&rgd->rd_rsspin); - BUG_ON(rgd->rd_reserved < rs->rs_reserved); + GLOCK_BUG_ON(rgd->rd_gl, rgd->rd_reserved < rs->rs_reserved); rgd->rd_reserved -= rs->rs_reserved; spin_unlock(&rgd->rd_rsspin); rs->rs_reserved = 0; @@ -2476,9 +2468,9 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, spin_unlock(&rbm.rgd->rd_rsspin); goto rgrp_error; } - BUG_ON(rbm.rgd->rd_reserved < *nblocks); - BUG_ON(rbm.rgd->rd_free_clone < *nblocks); - BUG_ON(rbm.rgd->rd_free < *nblocks); + GLOCK_BUG_ON(rbm.rgd->rd_gl, rbm.rgd->rd_reserved < *nblocks); + GLOCK_BUG_ON(rbm.rgd->rd_gl, rbm.rgd->rd_free_clone < *nblocks); + GLOCK_BUG_ON(rbm.rgd->rd_gl, rbm.rgd->rd_free < *nblocks); rbm.rgd->rd_reserved -= *nblocks; rbm.rgd->rd_free_clone -= *nblocks; rbm.rgd->rd_free -= *nblocks; @@ -2765,8 +2757,6 @@ void gfs2_rlist_free(struct gfs2_rgrp_list *rlist) void rgrp_lock_local(struct gfs2_rgrpd *rgd) { - BUG_ON(!gfs2_glock_is_held_excl(rgd->rd_gl) && - !test_bit(SDF_NORECOVERY, &rgd->rd_sbd->sd_flags)); mutex_lock(&rgd->rd_mutex); } diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index a6855fd796e0..3e2ca1fb4305 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -31,7 +31,7 @@ extern struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd); extern void gfs2_clear_rgrpd(struct gfs2_sbd *sdp); extern int gfs2_rindex_update(struct gfs2_sbd *sdp); extern void gfs2_free_clones(struct gfs2_rgrpd *rgd); -extern int gfs2_rgrp_go_lock(struct gfs2_holder *gh); +extern int gfs2_rgrp_go_instantiate(struct gfs2_holder *gh); extern void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd); extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 6e00d15ef0a8..5b121371508a 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1244,8 +1244,8 @@ static enum dinode_demise evict_should_delete(struct inode *inode, if (ret) return SHOULD_NOT_DELETE_DINODE; - if (test_bit(GIF_INVALID, &ip->i_flags)) { - ret = gfs2_inode_refresh(ip); + if (test_bit(GLF_INSTANTIATE_NEEDED, &ip->i_gl->gl_flags)) { + ret = gfs2_instantiate(gh); if (ret) return SHOULD_NOT_DELETE_DINODE; } diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h index bd6c8e9e49db..a5deb9f86831 100644 --- a/fs/gfs2/trace_gfs2.h +++ b/fs/gfs2/trace_gfs2.h @@ -197,15 +197,14 @@ TRACE_EVENT(gfs2_demote_rq, /* Promotion/grant of a glock */ TRACE_EVENT(gfs2_promote, - TP_PROTO(const struct gfs2_holder *gh, int first), + TP_PROTO(const struct gfs2_holder *gh), - TP_ARGS(gh, first), + TP_ARGS(gh), TP_STRUCT__entry( __field( dev_t, dev ) __field( u64, glnum ) __field( u32, gltype ) - __field( int, first ) __field( u8, state ) ), @@ -213,14 +212,12 @@ TRACE_EVENT(gfs2_promote, __entry->dev = gh->gh_gl->gl_name.ln_sbd->sd_vfs->s_dev; __entry->glnum = gh->gh_gl->gl_name.ln_number; __entry->gltype = gh->gh_gl->gl_name.ln_type; - __entry->first = first; __entry->state = glock_trace_state(gh->gh_state); ), - TP_printk("%u,%u glock %u:%llu promote %s %s", + TP_printk("%u,%u glock %u:%llu promote %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->gltype, (unsigned long long)__entry->glnum, - __entry->first ? "first": "other", glock_trace_name(__entry->state)) ); diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index cf345a86ef67..8241029a2a5d 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -454,6 +454,7 @@ void gfs2_consist_inode_i(struct gfs2_inode *ip, (unsigned long long)ip->i_no_formal_ino, (unsigned long long)ip->i_no_addr, function, file, line); + gfs2_dump_glock(NULL, ip->i_gl, 1); gfs2_withdraw(sdp); } @@ -475,6 +476,7 @@ void gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, " function = %s, file = %s, line = %u\n", (unsigned long long)rgd->rd_addr, function, file, line); + gfs2_dump_glock(NULL, rgd->rd_gl, 1); gfs2_withdraw(sdp); } diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index 4a95a92546a0..2a5143246282 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -462,8 +462,7 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc) goto out; if (S_ISDIR(main_inode->i_mode)) { - if (fd.entrylength < sizeof(struct hfs_cat_dir)) - /* panic? */; + WARN_ON(fd.entrylength < sizeof(struct hfs_cat_dir)); hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, sizeof(struct hfs_cat_dir)); if (rec.type != HFS_CDR_DIR || @@ -483,8 +482,7 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc) hfs_bnode_write(fd.bnode, &rec, fd.entryoffset, sizeof(struct hfs_cat_file)); } else { - if (fd.entrylength < sizeof(struct hfs_cat_file)) - /* panic? */; + WARN_ON(fd.entrylength < sizeof(struct hfs_cat_file)); hfs_bnode_read(fd.bnode, &rec, fd.entryoffset, sizeof(struct hfs_cat_file)); if (rec.type != HFS_CDR_FIL || diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c index cdf0edeeb278..5beb82652435 100644 --- a/fs/hfs/mdb.c +++ b/fs/hfs/mdb.c @@ -36,7 +36,7 @@ static int hfs_get_last_session(struct super_block *sb, /* default values */ *start = 0; - *size = i_size_read(sb->s_bdev->bd_inode) >> 9; + *size = bdev_nr_sectors(sb->s_bdev); if (HFS_SB(sb)->session >= 0) { struct cdrom_tocentry te; diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 6fef67c2a9f0..d08a8d1d40a4 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -509,8 +509,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) if (type == HFSPLUS_FOLDER) { struct hfsplus_cat_folder *folder = &entry.folder; - if (fd->entrylength < sizeof(struct hfsplus_cat_folder)) - /* panic? */; + WARN_ON(fd->entrylength < sizeof(struct hfsplus_cat_folder)); hfs_bnode_read(fd->bnode, &entry, fd->entryoffset, sizeof(struct hfsplus_cat_folder)); hfsplus_get_perms(inode, &folder->permissions, 1); @@ -530,8 +529,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) } else if (type == HFSPLUS_FILE) { struct hfsplus_cat_file *file = &entry.file; - if (fd->entrylength < sizeof(struct hfsplus_cat_file)) - /* panic? */; + WARN_ON(fd->entrylength < sizeof(struct hfsplus_cat_file)); hfs_bnode_read(fd->bnode, &entry, fd->entryoffset, sizeof(struct hfsplus_cat_file)); @@ -588,8 +586,7 @@ int hfsplus_cat_write_inode(struct inode *inode) if (S_ISDIR(main_inode->i_mode)) { struct hfsplus_cat_folder *folder = &entry.folder; - if (fd.entrylength < sizeof(struct hfsplus_cat_folder)) - /* panic? */; + WARN_ON(fd.entrylength < sizeof(struct hfsplus_cat_folder)); hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, sizeof(struct hfsplus_cat_folder)); /* simple node checks? */ @@ -614,8 +611,7 @@ int hfsplus_cat_write_inode(struct inode *inode) } else { struct hfsplus_cat_file *file = &entry.file; - if (fd.entrylength < sizeof(struct hfsplus_cat_file)) - /* panic? */; + WARN_ON(fd.entrylength < sizeof(struct hfsplus_cat_file)); hfs_bnode_read(fd.bnode, &entry, fd.entryoffset, sizeof(struct hfsplus_cat_file)); hfsplus_inode_write_fork(inode, &file->data_fork); diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index 0350dc7821bf..51ae6f1eb4a5 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -131,7 +131,7 @@ static int hfsplus_get_last_session(struct super_block *sb, /* default values */ *start = 0; - *size = i_size_read(sb->s_bdev->bd_inode) >> 9; + *size = bdev_nr_sectors(sb->s_bdev); if (HFSPLUS_SB(sb)->session >= 0) { struct cdrom_tocentry te; diff --git a/fs/hpfs/hpfs.h b/fs/hpfs/hpfs.h index d92c4af3e1b4..281dec8f636b 100644 --- a/fs/hpfs/hpfs.h +++ b/fs/hpfs/hpfs.h @@ -409,10 +409,10 @@ struct bplus_header __le16 first_free; /* offset from start of header to first free node in array */ union { - struct bplus_internal_node internal[0]; /* (internal) 2-word entries giving - subtree pointers */ - struct bplus_leaf_node external[0]; /* (external) 3-word entries giving - sector runs */ + /* (internal) 2-word entries giving subtree pointers */ + DECLARE_FLEX_ARRAY(struct bplus_internal_node, internal); + /* (external) 3-word entries giving sector runs */ + DECLARE_FLEX_ARRAY(struct bplus_leaf_node, external); } u; }; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index cdfb1ae78a3f..49d2e686be74 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1446,8 +1446,8 @@ static int get_hstate_idx(int page_size_log) * otherwise hugetlb_reserve_pages reserves one less hugepages than intended. */ struct file *hugetlb_file_setup(const char *name, size_t size, - vm_flags_t acctflag, struct ucounts **ucounts, - int creat_flags, int page_size_log) + vm_flags_t acctflag, int creat_flags, + int page_size_log) { struct inode *inode; struct vfsmount *mnt; @@ -1458,22 +1458,19 @@ struct file *hugetlb_file_setup(const char *name, size_t size, if (hstate_idx < 0) return ERR_PTR(-ENODEV); - *ucounts = NULL; mnt = hugetlbfs_vfsmount[hstate_idx]; if (!mnt) return ERR_PTR(-ENOENT); if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { - *ucounts = current_ucounts(); - if (user_shm_lock(size, *ucounts)) { - task_lock(current); - pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n", + struct ucounts *ucounts = current_ucounts(); + + if (user_shm_lock(size, ucounts)) { + pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is obsolete\n", current->comm, current->pid); - task_unlock(current); - } else { - *ucounts = NULL; - return ERR_PTR(-EPERM); + user_shm_unlock(size, ucounts); } + return ERR_PTR(-EPERM); } file = ERR_PTR(-ENOSPC); @@ -1498,10 +1495,6 @@ struct file *hugetlb_file_setup(const char *name, size_t size, iput(inode); out: - if (*ucounts) { - user_shm_unlock(size, *ucounts); - *ucounts = NULL; - } return file; } diff --git a/fs/inode.c b/fs/inode.c index 37710ca863b5..3eba0940ffcf 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -190,8 +190,10 @@ int inode_init_always(struct super_block *sb, struct inode *inode) mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); mapping->private_data = NULL; mapping->writeback_index = 0; - __init_rwsem(&mapping->invalidate_lock, "mapping.invalidate_lock", - &sb->s_type->invalidate_lock_key); + init_rwsem(&mapping->invalidate_lock); + lockdep_set_class_and_name(&mapping->invalidate_lock, + &sb->s_type->invalidate_lock_key, + "mapping.invalidate_lock"); inode->i_private = NULL; inode->i_mapping = mapping; INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */ @@ -426,11 +428,20 @@ void ihold(struct inode *inode) } EXPORT_SYMBOL(ihold); -static void inode_lru_list_add(struct inode *inode) +static void __inode_add_lru(struct inode *inode, bool rotate) { + if (inode->i_state & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE)) + return; + if (atomic_read(&inode->i_count)) + return; + if (!(inode->i_sb->s_flags & SB_ACTIVE)) + return; + if (!mapping_shrinkable(&inode->i_data)) + return; + if (list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru)) this_cpu_inc(nr_unused); - else + else if (rotate) inode->i_state |= I_REFERENCED; } @@ -441,16 +452,11 @@ static void inode_lru_list_add(struct inode *inode) */ void inode_add_lru(struct inode *inode) { - if (!(inode->i_state & (I_DIRTY_ALL | I_SYNC | - I_FREEING | I_WILL_FREE)) && - !atomic_read(&inode->i_count) && inode->i_sb->s_flags & SB_ACTIVE) - inode_lru_list_add(inode); + __inode_add_lru(inode, false); } - static void inode_lru_list_del(struct inode *inode) { - if (list_lru_del(&inode->i_sb->s_inode_lru, &inode->i_lru)) this_cpu_dec(nr_unused); } @@ -726,10 +732,6 @@ again: /* * Isolate the inode from the LRU in preparation for freeing it. * - * Any inodes which are pinned purely because of attached pagecache have their - * pagecache removed. If the inode has metadata buffers attached to - * mapping->private_list then try to remove them. - * * If the inode has the I_REFERENCED flag set, then it means that it has been * used recently - the flag is set in iput_final(). When we encounter such an * inode, clear the flag and move it to the back of the LRU so it gets another @@ -745,31 +747,39 @@ static enum lru_status inode_lru_isolate(struct list_head *item, struct inode *inode = container_of(item, struct inode, i_lru); /* - * we are inverting the lru lock/inode->i_lock here, so use a trylock. - * If we fail to get the lock, just skip it. + * We are inverting the lru lock/inode->i_lock here, so use a + * trylock. If we fail to get the lock, just skip it. */ if (!spin_trylock(&inode->i_lock)) return LRU_SKIP; /* - * Referenced or dirty inodes are still in use. Give them another pass - * through the LRU as we canot reclaim them now. + * Inodes can get referenced, redirtied, or repopulated while + * they're already on the LRU, and this can make them + * unreclaimable for a while. Remove them lazily here; iput, + * sync, or the last page cache deletion will requeue them. */ if (atomic_read(&inode->i_count) || - (inode->i_state & ~I_REFERENCED)) { + (inode->i_state & ~I_REFERENCED) || + !mapping_shrinkable(&inode->i_data)) { list_lru_isolate(lru, &inode->i_lru); spin_unlock(&inode->i_lock); this_cpu_dec(nr_unused); return LRU_REMOVED; } - /* recently referenced inodes get one more pass */ + /* Recently referenced inodes get one more pass */ if (inode->i_state & I_REFERENCED) { inode->i_state &= ~I_REFERENCED; spin_unlock(&inode->i_lock); return LRU_ROTATE; } + /* + * On highmem systems, mapping_shrinkable() permits dropping + * page cache in order to free up struct inodes: lowmem might + * be under pressure before the cache inside the highmem zone. + */ if (inode_has_buffers(inode) || !mapping_empty(&inode->i_data)) { __iget(inode); spin_unlock(&inode->i_lock); @@ -1636,7 +1646,7 @@ static void iput_final(struct inode *inode) if (!drop && !(inode->i_state & I_DONTCACHE) && (sb->s_flags & SB_ACTIVE)) { - inode_add_lru(inode); + __inode_add_lru(inode, true); spin_unlock(&inode->i_lock); return; } @@ -1780,12 +1790,13 @@ EXPORT_SYMBOL(generic_update_time); * This does the actual work of updating an inodes time or version. Must have * had called mnt_want_write() before calling this. */ -static int update_time(struct inode *inode, struct timespec64 *time, int flags) +int inode_update_time(struct inode *inode, struct timespec64 *time, int flags) { if (inode->i_op->update_time) return inode->i_op->update_time(inode, time, flags); return generic_update_time(inode, time, flags); } +EXPORT_SYMBOL(inode_update_time); /** * atime_needs_update - update the access time @@ -1855,7 +1866,7 @@ void touch_atime(const struct path *path) * of the fs read only, e.g. subvolumes in Btrfs. */ now = current_time(inode); - update_time(inode, &now, S_ATIME); + inode_update_time(inode, &now, S_ATIME); __mnt_drop_write(mnt); skip_update: sb_end_write(inode->i_sb); @@ -2000,7 +2011,7 @@ int file_update_time(struct file *file) if (__mnt_want_write_file(file)) return 0; - ret = update_time(inode, &now, sync_it); + ret = inode_update_time(inode, &now, sync_it); __mnt_drop_write_file(file); return ret; diff --git a/fs/internal.h b/fs/internal.h index 3cd065c8a66b..7979ff8d168c 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -23,22 +23,11 @@ struct pipe_inode_info; #ifdef CONFIG_BLOCK extern void __init bdev_cache_init(void); -extern int __sync_blockdev(struct block_device *bdev, int wait); -void iterate_bdevs(void (*)(struct block_device *, void *), void *); void emergency_thaw_bdev(struct super_block *sb); #else static inline void bdev_cache_init(void) { } - -static inline int __sync_blockdev(struct block_device *bdev, int wait) -{ - return 0; -} -static inline void iterate_bdevs(void (*f)(struct block_device *, void *), - void *arg) -{ -} static inline int emergency_thaw_bdev(struct super_block *sb) { return 0; @@ -149,7 +138,6 @@ extern int vfs_open(const struct path *, struct file *); * inode.c */ extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc); -extern void inode_add_lru(struct inode *inode); extern int dentry_needs_remove_privs(struct dentry *dentry); /* diff --git a/fs/io-wq.c b/fs/io-wq.c index 6c55362c1f99..c51691262208 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -14,6 +14,8 @@ #include <linux/rculist_nulls.h> #include <linux/cpu.h> #include <linux/tracehook.h> +#include <linux/audit.h> +#include <uapi/linux/io_uring.h> #include "io-wq.h" @@ -139,6 +141,7 @@ static void io_wqe_dec_running(struct io_worker *worker); static bool io_acct_cancel_pending_work(struct io_wqe *wqe, struct io_wqe_acct *acct, struct io_cb_cancel_data *match); +static void create_worker_cb(struct callback_head *cb); static bool io_worker_get(struct io_worker *worker) { @@ -173,20 +176,52 @@ static void io_worker_ref_put(struct io_wq *wq) complete(&wq->worker_done); } +static void io_worker_cancel_cb(struct io_worker *worker) +{ + struct io_wqe_acct *acct = io_wqe_get_acct(worker); + struct io_wqe *wqe = worker->wqe; + struct io_wq *wq = wqe->wq; + + atomic_dec(&acct->nr_running); + raw_spin_lock(&worker->wqe->lock); + acct->nr_workers--; + raw_spin_unlock(&worker->wqe->lock); + io_worker_ref_put(wq); + clear_bit_unlock(0, &worker->create_state); + io_worker_release(worker); +} + +static bool io_task_worker_match(struct callback_head *cb, void *data) +{ + struct io_worker *worker; + + if (cb->func != create_worker_cb) + return false; + worker = container_of(cb, struct io_worker, create_work); + return worker == data; +} + static void io_worker_exit(struct io_worker *worker) { struct io_wqe *wqe = worker->wqe; - struct io_wqe_acct *acct = io_wqe_get_acct(worker); + struct io_wq *wq = wqe->wq; - if (refcount_dec_and_test(&worker->ref)) - complete(&worker->ref_done); + while (1) { + struct callback_head *cb = task_work_cancel_match(wq->task, + io_task_worker_match, worker); + + if (!cb) + break; + io_worker_cancel_cb(worker); + } + + io_worker_release(worker); wait_for_completion(&worker->ref_done); raw_spin_lock(&wqe->lock); if (worker->flags & IO_WORKER_F_FREE) hlist_nulls_del_rcu(&worker->nulls_node); list_del_rcu(&worker->all_list); - acct->nr_workers--; preempt_disable(); io_wqe_dec_running(worker); worker->flags = 0; @@ -246,8 +281,6 @@ static bool io_wqe_activate_free_worker(struct io_wqe *wqe, */ static bool io_wqe_create_worker(struct io_wqe *wqe, struct io_wqe_acct *acct) { - bool do_create = false; - /* * Most likely an attempt to queue unbounded work on an io_wq that * wasn't setup with any unbounded workers. @@ -256,18 +289,15 @@ static bool io_wqe_create_worker(struct io_wqe *wqe, struct io_wqe_acct *acct) pr_warn_once("io-wq is not configured for unbound workers"); raw_spin_lock(&wqe->lock); - if (acct->nr_workers < acct->max_workers) { - acct->nr_workers++; - do_create = true; + if (acct->nr_workers >= acct->max_workers) { + raw_spin_unlock(&wqe->lock); + return true; } + acct->nr_workers++; raw_spin_unlock(&wqe->lock); - if (do_create) { - atomic_inc(&acct->nr_running); - atomic_inc(&wqe->wq->worker_refs); - return create_io_worker(wqe->wq, wqe, acct->index); - } - - return true; + atomic_inc(&acct->nr_running); + atomic_inc(&wqe->wq->worker_refs); + return create_io_worker(wqe->wq, wqe, acct->index); } static void io_wqe_inc_running(struct io_worker *worker) @@ -329,8 +359,10 @@ static bool io_queue_worker_create(struct io_worker *worker, init_task_work(&worker->create_work, func); worker->create_index = acct->index; - if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL)) + if (!task_work_add(wq->task, &worker->create_work, TWA_SIGNAL)) { + clear_bit_unlock(0, &worker->create_state); return true; + } clear_bit_unlock(0, &worker->create_state); fail_release: io_worker_release(worker); @@ -562,6 +594,8 @@ static int io_wqe_worker(void *data) snprintf(buf, sizeof(buf), "iou-wrk-%d", wq->task->pid); set_task_comm(current, buf); + audit_alloc_kernel(current); + while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) { long ret; @@ -574,6 +608,7 @@ loop: } /* timed out, exit unless we're the last worker */ if (last_timeout && acct->nr_workers > 1) { + acct->nr_workers--; raw_spin_unlock(&wqe->lock); __set_current_state(TASK_RUNNING); break; @@ -589,9 +624,7 @@ loop: if (!get_signal(&ksig)) continue; - if (fatal_signal_pending(current)) - break; - continue; + break; } last_timeout = !ret; } @@ -601,6 +634,7 @@ loop: io_worker_handle_work(worker); } + audit_free(current); io_worker_exit(worker); return 0; } @@ -723,11 +757,8 @@ static void io_workqueue_create(struct work_struct *work) struct io_worker *worker = container_of(work, struct io_worker, work); struct io_wqe_acct *acct = io_wqe_get_acct(worker); - if (!io_queue_worker_create(worker, acct, create_worker_cont)) { - clear_bit_unlock(0, &worker->create_state); - io_worker_release(worker); + if (!io_queue_worker_create(worker, acct, create_worker_cont)) kfree(worker); - } } static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) @@ -1157,17 +1188,9 @@ static void io_wq_exit_workers(struct io_wq *wq) while ((cb = task_work_cancel_match(wq->task, io_task_work_match, wq)) != NULL) { struct io_worker *worker; - struct io_wqe_acct *acct; worker = container_of(cb, struct io_worker, create_work); - acct = io_wqe_get_acct(worker); - atomic_dec(&acct->nr_running); - raw_spin_lock(&worker->wqe->lock); - acct->nr_workers--; - raw_spin_unlock(&worker->wqe->lock); - io_worker_ref_put(wq); - clear_bit_unlock(0, &worker->create_state); - io_worker_release(worker); + io_worker_cancel_cb(worker); } rcu_read_lock(); @@ -1287,6 +1310,10 @@ int io_wq_max_workers(struct io_wq *wq, int *new_count) { int i, node, prev = 0; + BUILD_BUG_ON((int) IO_WQ_ACCT_BOUND != (int) IO_WQ_BOUND); + BUILD_BUG_ON((int) IO_WQ_ACCT_UNBOUND != (int) IO_WQ_UNBOUND); + BUILD_BUG_ON((int) IO_WQ_ACCT_NR != 2); + for (i = 0; i < 2; i++) { if (new_count[i] > task_rlimit(current, RLIMIT_NPROC)) new_count[i] = task_rlimit(current, RLIMIT_NPROC); @@ -1294,15 +1321,18 @@ int io_wq_max_workers(struct io_wq *wq, int *new_count) rcu_read_lock(); for_each_node(node) { + struct io_wqe *wqe = wq->wqes[node]; struct io_wqe_acct *acct; + raw_spin_lock(&wqe->lock); for (i = 0; i < IO_WQ_ACCT_NR; i++) { - acct = &wq->wqes[node]->acct[i]; + acct = &wqe->acct[i]; prev = max_t(int, acct->max_workers, prev); if (new_count[i]) acct->max_workers = new_count[i]; new_count[i] = prev; } + raw_spin_unlock(&wqe->lock); } rcu_read_unlock(); return 0; diff --git a/fs/io-wq.h b/fs/io-wq.h index bf5c4c533760..41bf37674a49 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -29,6 +29,17 @@ struct io_wq_work_list { struct io_wq_work_node *last; }; +#define wq_list_for_each(pos, prv, head) \ + for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next) + +#define wq_list_for_each_resume(pos, prv) \ + for (; pos; prv = pos, pos = (pos)->next) + +#define wq_list_empty(list) (READ_ONCE((list)->first) == NULL) +#define INIT_WQ_LIST(list) do { \ + (list)->first = NULL; \ +} while (0) + static inline void wq_list_add_after(struct io_wq_work_node *node, struct io_wq_work_node *pos, struct io_wq_work_list *list) @@ -54,6 +65,15 @@ static inline void wq_list_add_tail(struct io_wq_work_node *node, } } +static inline void wq_list_add_head(struct io_wq_work_node *node, + struct io_wq_work_list *list) +{ + node->next = list->first; + if (!node->next) + list->last = node; + WRITE_ONCE(list->first, node); +} + static inline void wq_list_cut(struct io_wq_work_list *list, struct io_wq_work_node *last, struct io_wq_work_node *prev) @@ -69,6 +89,31 @@ static inline void wq_list_cut(struct io_wq_work_list *list, last->next = NULL; } +static inline void __wq_list_splice(struct io_wq_work_list *list, + struct io_wq_work_node *to) +{ + list->last->next = to->next; + to->next = list->first; + INIT_WQ_LIST(list); +} + +static inline bool wq_list_splice(struct io_wq_work_list *list, + struct io_wq_work_node *to) +{ + if (!wq_list_empty(list)) { + __wq_list_splice(list, to); + return true; + } + return false; +} + +static inline void wq_stack_add_head(struct io_wq_work_node *node, + struct io_wq_work_node *stack) +{ + node->next = stack->next; + stack->next = node; +} + static inline void wq_list_del(struct io_wq_work_list *list, struct io_wq_work_node *node, struct io_wq_work_node *prev) @@ -76,14 +121,14 @@ static inline void wq_list_del(struct io_wq_work_list *list, wq_list_cut(list, node, prev); } -#define wq_list_for_each(pos, prv, head) \ - for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next) +static inline +struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack) +{ + struct io_wq_work_node *node = stack->next; -#define wq_list_empty(list) (READ_ONCE((list)->first) == NULL) -#define INIT_WQ_LIST(list) do { \ - (list)->first = NULL; \ - (list)->last = NULL; \ -} while (0) + stack->next = node->next; + return node; +} struct io_wq_work { struct io_wq_work_node list; diff --git a/fs/io_uring.c b/fs/io_uring.c index 16fb7436043c..3ecd4b51510e 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -79,6 +79,8 @@ #include <linux/pagemap.h> #include <linux/io_uring.h> #include <linux/tracehook.h> +#include <linux/audit.h> +#include <linux/security.h> #define CREATE_TRACE_POINTS #include <trace/events/io_uring.h> @@ -103,11 +105,14 @@ #define IORING_MAX_REG_BUFFERS (1U << 14) -#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \ - IOSQE_IO_HARDLINK | IOSQE_ASYNC | \ - IOSQE_BUFFER_SELECT) +#define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \ + IOSQE_IO_HARDLINK | IOSQE_ASYNC) + +#define SQE_VALID_FLAGS (SQE_COMMON_FLAGS|IOSQE_BUFFER_SELECT|IOSQE_IO_DRAIN) + #define IO_REQ_CLEAN_FLAGS (REQ_F_BUFFER_SELECTED | REQ_F_NEED_CLEANUP | \ - REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS) + REQ_F_POLLED | REQ_F_INFLIGHT | REQ_F_CREDS | \ + REQ_F_ASYNC_DATA) #define IO_TCTX_REFS_CACHE_NR (1U << 10) @@ -195,8 +200,10 @@ struct io_rings { }; enum io_uring_cmd_flags { - IO_URING_F_NONBLOCK = 1, - IO_URING_F_COMPLETE_DEFER = 2, + IO_URING_F_COMPLETE_DEFER = 1, + IO_URING_F_UNLOCKED = 2, + /* int's last bit, sign checks are usually faster than a bit test */ + IO_URING_F_NONBLOCK = INT_MIN, }; struct io_mapped_ubuf { @@ -305,26 +312,16 @@ struct io_submit_link { }; struct io_submit_state { - struct blk_plug plug; + /* inline/task_work completion list, under ->uring_lock */ + struct io_wq_work_node free_list; + /* batch completion logic */ + struct io_wq_work_list compl_reqs; struct io_submit_link link; - /* - * io_kiocb alloc cache - */ - void *reqs[IO_REQ_CACHE_SIZE]; - unsigned int free_reqs; - bool plug_started; - - /* - * Batch completion logic - */ - struct io_kiocb *compl_reqs[IO_COMPL_BATCH]; - unsigned int compl_nr; - /* inline/task_work completion list, under ->uring_lock */ - struct list_head free_list; - - unsigned int ios_left; + bool need_plug; + unsigned short submit_nr; + struct blk_plug plug; }; struct io_ring_ctx { @@ -368,6 +365,7 @@ struct io_ring_ctx { * uring_lock, and updated through io_uring_register(2) */ struct io_rsrc_node *rsrc_node; + int rsrc_cached_refs; struct io_file_table file_table; unsigned nr_user_files; unsigned nr_user_bufs; @@ -384,7 +382,7 @@ struct io_ring_ctx { } ____cacheline_aligned_in_smp; /* IRQ completion list, under ->completion_lock */ - struct list_head locked_free_list; + struct io_wq_work_list locked_free_list; unsigned int locked_free_nr; const struct cred *sq_creds; /* cred used for __io_sq_thread() */ @@ -399,11 +397,9 @@ struct io_ring_ctx { unsigned cached_cq_tail; unsigned cq_entries; struct eventfd_ctx *cq_ev_fd; - struct wait_queue_head poll_wait; struct wait_queue_head cq_wait; unsigned cq_extra; atomic_t cq_timeouts; - struct fasync_struct *cq_fasync; unsigned cq_last_tm_flush; } ____cacheline_aligned_in_smp; @@ -418,7 +414,7 @@ struct io_ring_ctx { * For SQPOLL, only the single threaded io_sq_thread() will * manipulate the list, hence no extra locking is needed there. */ - struct list_head iopoll_list; + struct io_wq_work_list iopoll_list; struct hlist_head *cancel_hash; unsigned cancel_hash_bits; bool poll_multi_queue; @@ -457,6 +453,8 @@ struct io_ring_ctx { struct work_struct exit_work; struct list_head tctx_list; struct completion ref_comp; + u32 iowq_limits[2]; + bool iowq_limits_set; }; }; @@ -502,6 +500,7 @@ struct io_poll_update { struct io_close { struct file *file; int fd; + u32 file_slot; }; struct io_timeout_data { @@ -578,7 +577,6 @@ struct io_sr_msg { int msg_flags; int bgid; size_t len; - struct io_buffer *kbuf; }; struct io_open { @@ -690,11 +688,6 @@ struct io_hardlink { int flags; }; -struct io_completion { - struct file *file; - u32 cflags; -}; - struct io_async_connect { struct sockaddr_storage address; }; @@ -708,10 +701,15 @@ struct io_async_msghdr { struct sockaddr_storage addr; }; -struct io_async_rw { +struct io_rw_state { + struct iov_iter iter; + struct iov_iter_state iter_state; struct iovec fast_iov[UIO_FASTIOV]; +}; + +struct io_async_rw { + struct io_rw_state s; const struct iovec *free_iovec; - struct iov_iter iter; size_t bytes_done; struct wait_page_queue wpq; }; @@ -735,13 +733,12 @@ enum { REQ_F_BUFFER_SELECTED_BIT, REQ_F_COMPLETE_INLINE_BIT, REQ_F_REISSUE_BIT, - REQ_F_DONT_REISSUE_BIT, REQ_F_CREDS_BIT, REQ_F_REFCOUNT_BIT, REQ_F_ARM_LTIMEOUT_BIT, + REQ_F_ASYNC_DATA_BIT, /* keep async read/write and isreg together and in order */ - REQ_F_NOWAIT_READ_BIT, - REQ_F_NOWAIT_WRITE_BIT, + REQ_F_SUPPORT_NOWAIT_BIT, REQ_F_ISREG_BIT, /* not a real bit, just to check we're not overflowing the space */ @@ -782,12 +779,8 @@ enum { REQ_F_COMPLETE_INLINE = BIT(REQ_F_COMPLETE_INLINE_BIT), /* caller should reissue async */ REQ_F_REISSUE = BIT(REQ_F_REISSUE_BIT), - /* don't attempt request reissue, see io_rw_reissue() */ - REQ_F_DONT_REISSUE = BIT(REQ_F_DONT_REISSUE_BIT), - /* supports async reads */ - REQ_F_NOWAIT_READ = BIT(REQ_F_NOWAIT_READ_BIT), - /* supports async writes */ - REQ_F_NOWAIT_WRITE = BIT(REQ_F_NOWAIT_WRITE_BIT), + /* supports async reads/writes */ + REQ_F_SUPPORT_NOWAIT = BIT(REQ_F_SUPPORT_NOWAIT_BIT), /* regular file */ REQ_F_ISREG = BIT(REQ_F_ISREG_BIT), /* has creds assigned */ @@ -796,6 +789,8 @@ enum { REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT), /* there is a linked timeout that has to be armed */ REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT), + /* ->async_data allocated */ + REQ_F_ASYNC_DATA = BIT(REQ_F_ASYNC_DATA_BIT), }; struct async_poll { @@ -852,39 +847,41 @@ struct io_kiocb { struct io_mkdir mkdir; struct io_symlink symlink; struct io_hardlink hardlink; - /* use only after cleaning per-op data, see io_clean_op() */ - struct io_completion compl; }; - /* opcode allocated if it needs to store data for async defer */ - void *async_data; u8 opcode; /* polled IO has completed */ u8 iopoll_completed; - u16 buf_index; + unsigned int flags; + + u64 user_data; u32 result; + u32 cflags; struct io_ring_ctx *ctx; - unsigned int flags; - atomic_t refs; struct task_struct *task; - u64 user_data; - struct io_kiocb *link; struct percpu_ref *fixed_rsrc_refs; + /* store used ubuf, so we can prevent reloading */ + struct io_mapped_ubuf *imu; - /* used with ctx->iopoll_list with reads/writes */ - struct list_head inflight_entry; + /* used by request caches, completion batching and iopoll */ + struct io_wq_work_node comp_list; + atomic_t refs; + struct io_kiocb *link; struct io_task_work io_task_work; /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ struct hlist_node hash_node; + /* internal polling, see IORING_FEAT_FAST_POLL */ struct async_poll *apoll; + /* opcode allocated if it needs to store data for async defer */ + void *async_data; struct io_wq_work work; + /* custom credentials, valid IFF REQ_F_CREDS is set */ const struct cred *creds; - - /* store used ubuf, so we can prevent reloading */ - struct io_mapped_ubuf *imu; + /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */ + struct io_buffer *kbuf; }; struct io_tctx_node { @@ -902,12 +899,12 @@ struct io_defer_entry { struct io_op_def { /* needs req->file assigned */ unsigned needs_file : 1; + /* should block plug */ + unsigned plug : 1; /* hash wq insertion if file is a regular file */ unsigned hash_reg_file : 1; /* unbound wq insertion if file is a non-regular file */ unsigned unbound_nonreg_file : 1; - /* opcode is not supported by this kernel */ - unsigned not_supported : 1; /* set if opcode supports polled "wait" */ unsigned pollin : 1; unsigned pollout : 1; @@ -915,8 +912,10 @@ struct io_op_def { unsigned buffer_select : 1; /* do prep async if is going to be punted */ unsigned needs_async_setup : 1; - /* should block plug */ - unsigned plug : 1; + /* opcode is not supported by this kernel */ + unsigned not_supported : 1; + /* skip auditing */ + unsigned audit_skip : 1; /* size of async data needed, if any */ unsigned short async_size; }; @@ -930,6 +929,7 @@ static const struct io_op_def io_op_defs[] = { .buffer_select = 1, .needs_async_setup = 1, .plug = 1, + .audit_skip = 1, .async_size = sizeof(struct io_async_rw), }, [IORING_OP_WRITEV] = { @@ -939,16 +939,19 @@ static const struct io_op_def io_op_defs[] = { .pollout = 1, .needs_async_setup = 1, .plug = 1, + .audit_skip = 1, .async_size = sizeof(struct io_async_rw), }, [IORING_OP_FSYNC] = { .needs_file = 1, + .audit_skip = 1, }, [IORING_OP_READ_FIXED] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, .plug = 1, + .audit_skip = 1, .async_size = sizeof(struct io_async_rw), }, [IORING_OP_WRITE_FIXED] = { @@ -957,15 +960,20 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .pollout = 1, .plug = 1, + .audit_skip = 1, .async_size = sizeof(struct io_async_rw), }, [IORING_OP_POLL_ADD] = { .needs_file = 1, .unbound_nonreg_file = 1, + .audit_skip = 1, + }, + [IORING_OP_POLL_REMOVE] = { + .audit_skip = 1, }, - [IORING_OP_POLL_REMOVE] = {}, [IORING_OP_SYNC_FILE_RANGE] = { .needs_file = 1, + .audit_skip = 1, }, [IORING_OP_SENDMSG] = { .needs_file = 1, @@ -983,18 +991,23 @@ static const struct io_op_def io_op_defs[] = { .async_size = sizeof(struct io_async_msghdr), }, [IORING_OP_TIMEOUT] = { + .audit_skip = 1, .async_size = sizeof(struct io_timeout_data), }, [IORING_OP_TIMEOUT_REMOVE] = { /* used by timeout updates' prep() */ + .audit_skip = 1, }, [IORING_OP_ACCEPT] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, }, - [IORING_OP_ASYNC_CANCEL] = {}, + [IORING_OP_ASYNC_CANCEL] = { + .audit_skip = 1, + }, [IORING_OP_LINK_TIMEOUT] = { + .audit_skip = 1, .async_size = sizeof(struct io_timeout_data), }, [IORING_OP_CONNECT] = { @@ -1009,14 +1022,19 @@ static const struct io_op_def io_op_defs[] = { }, [IORING_OP_OPENAT] = {}, [IORING_OP_CLOSE] = {}, - [IORING_OP_FILES_UPDATE] = {}, - [IORING_OP_STATX] = {}, + [IORING_OP_FILES_UPDATE] = { + .audit_skip = 1, + }, + [IORING_OP_STATX] = { + .audit_skip = 1, + }, [IORING_OP_READ] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, .buffer_select = 1, .plug = 1, + .audit_skip = 1, .async_size = sizeof(struct io_async_rw), }, [IORING_OP_WRITE] = { @@ -1025,39 +1043,50 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .pollout = 1, .plug = 1, + .audit_skip = 1, .async_size = sizeof(struct io_async_rw), }, [IORING_OP_FADVISE] = { .needs_file = 1, + .audit_skip = 1, }, [IORING_OP_MADVISE] = {}, [IORING_OP_SEND] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, + .audit_skip = 1, }, [IORING_OP_RECV] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, .buffer_select = 1, + .audit_skip = 1, }, [IORING_OP_OPENAT2] = { }, [IORING_OP_EPOLL_CTL] = { .unbound_nonreg_file = 1, + .audit_skip = 1, }, [IORING_OP_SPLICE] = { .needs_file = 1, .hash_reg_file = 1, .unbound_nonreg_file = 1, + .audit_skip = 1, + }, + [IORING_OP_PROVIDE_BUFFERS] = { + .audit_skip = 1, + }, + [IORING_OP_REMOVE_BUFFERS] = { + .audit_skip = 1, }, - [IORING_OP_PROVIDE_BUFFERS] = {}, - [IORING_OP_REMOVE_BUFFERS] = {}, [IORING_OP_TEE] = { .needs_file = 1, .hash_reg_file = 1, .unbound_nonreg_file = 1, + .audit_skip = 1, }, [IORING_OP_SHUTDOWN] = { .needs_file = 1, @@ -1080,7 +1109,7 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd); static bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, - long res, unsigned int cflags); + s32 res, u32 cflags); static void io_put_req(struct io_kiocb *req); static void io_put_req_deferred(struct io_kiocb *req); static void io_dismantle_req(struct io_kiocb *req); @@ -1095,11 +1124,13 @@ static void __io_queue_sqe(struct io_kiocb *req); static void io_rsrc_put_work(struct work_struct *work); static void io_req_task_queue(struct io_kiocb *req); -static void io_submit_flush_completions(struct io_ring_ctx *ctx); +static void __io_submit_flush_completions(struct io_ring_ctx *ctx); static int io_req_prep_async(struct io_kiocb *req); static int io_install_fixed_file(struct io_kiocb *req, struct file *file, unsigned int issue_flags, u32 slot_index); +static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags); + static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer); static struct kmem_cache *req_cachep; @@ -1165,6 +1196,12 @@ static inline void req_ref_get(struct io_kiocb *req) atomic_inc(&req->refs); } +static inline void io_submit_flush_completions(struct io_ring_ctx *ctx) +{ + if (!wq_list_empty(&ctx->submit_state.compl_reqs)) + __io_submit_flush_completions(ctx); +} + static inline void __io_req_set_refcount(struct io_kiocb *req, int nr) { if (!(req->flags & REQ_F_REFCOUNT)) { @@ -1178,13 +1215,52 @@ static inline void io_req_set_refcount(struct io_kiocb *req) __io_req_set_refcount(req, 1); } -static inline void io_req_set_rsrc_node(struct io_kiocb *req) +#define IO_RSRC_REF_BATCH 100 + +static inline void io_req_put_rsrc_locked(struct io_kiocb *req, + struct io_ring_ctx *ctx) + __must_hold(&ctx->uring_lock) { - struct io_ring_ctx *ctx = req->ctx; + struct percpu_ref *ref = req->fixed_rsrc_refs; + + if (ref) { + if (ref == &ctx->rsrc_node->refs) + ctx->rsrc_cached_refs++; + else + percpu_ref_put(ref); + } +} +static inline void io_req_put_rsrc(struct io_kiocb *req, struct io_ring_ctx *ctx) +{ + if (req->fixed_rsrc_refs) + percpu_ref_put(req->fixed_rsrc_refs); +} + +static __cold void io_rsrc_refs_drop(struct io_ring_ctx *ctx) + __must_hold(&ctx->uring_lock) +{ + if (ctx->rsrc_cached_refs) { + percpu_ref_put_many(&ctx->rsrc_node->refs, ctx->rsrc_cached_refs); + ctx->rsrc_cached_refs = 0; + } +} + +static void io_rsrc_refs_refill(struct io_ring_ctx *ctx) + __must_hold(&ctx->uring_lock) +{ + ctx->rsrc_cached_refs += IO_RSRC_REF_BATCH; + percpu_ref_get_many(&ctx->rsrc_node->refs, IO_RSRC_REF_BATCH); +} + +static inline void io_req_set_rsrc_node(struct io_kiocb *req, + struct io_ring_ctx *ctx) +{ if (!req->fixed_rsrc_refs) { req->fixed_rsrc_refs = &ctx->rsrc_node->refs; - percpu_ref_get(req->fixed_rsrc_refs); + ctx->rsrc_cached_refs--; + if (unlikely(ctx->rsrc_cached_refs < 0)) + io_rsrc_refs_refill(ctx); } } @@ -1217,6 +1293,11 @@ static bool io_match_task(struct io_kiocb *head, struct task_struct *task, return false; } +static inline bool req_has_async_data(struct io_kiocb *req) +{ + return req->flags & REQ_F_ASYNC_DATA; +} + static inline void req_set_fail(struct io_kiocb *req) { req->flags |= REQ_F_FAIL; @@ -1228,7 +1309,7 @@ static inline void req_fail_link_node(struct io_kiocb *req, int res) req->result = res; } -static void io_ring_ctx_ref_free(struct percpu_ref *ref) +static __cold void io_ring_ctx_ref_free(struct percpu_ref *ref) { struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs); @@ -1240,7 +1321,7 @@ static inline bool io_is_timeout_noseq(struct io_kiocb *req) return !req->timeout.off; } -static void io_fallback_req_func(struct work_struct *work) +static __cold void io_fallback_req_func(struct work_struct *work) { struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, fallback_work.work); @@ -1253,15 +1334,13 @@ static void io_fallback_req_func(struct work_struct *work) req->io_task_work.func(req, &locked); if (locked) { - if (ctx->submit_state.compl_nr) - io_submit_flush_completions(ctx); + io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); } percpu_ref_put(&ctx->refs); - } -static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) +static __cold struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) { struct io_ring_ctx *ctx; int hash_bits; @@ -1298,7 +1377,6 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) ctx->flags = p->flags; init_waitqueue_head(&ctx->sqo_sq_wait); INIT_LIST_HEAD(&ctx->sqd_list); - init_waitqueue_head(&ctx->poll_wait); INIT_LIST_HEAD(&ctx->cq_overflow_list); init_completion(&ctx->ref_comp); xa_init_flags(&ctx->io_buffers, XA_FLAGS_ALLOC1); @@ -1307,7 +1385,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) init_waitqueue_head(&ctx->cq_wait); spin_lock_init(&ctx->completion_lock); spin_lock_init(&ctx->timeout_lock); - INIT_LIST_HEAD(&ctx->iopoll_list); + INIT_WQ_LIST(&ctx->iopoll_list); INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->timeout_list); INIT_LIST_HEAD(&ctx->ltimeout_list); @@ -1316,9 +1394,10 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) INIT_DELAYED_WORK(&ctx->rsrc_put_work, io_rsrc_put_work); init_llist_head(&ctx->rsrc_put_llist); INIT_LIST_HEAD(&ctx->tctx_list); - INIT_LIST_HEAD(&ctx->submit_state.free_list); - INIT_LIST_HEAD(&ctx->locked_free_list); + ctx->submit_state.free_list.next = NULL; + INIT_WQ_LIST(&ctx->locked_free_list); INIT_DELAYED_WORK(&ctx->fallback_work, io_fallback_req_func); + INIT_WQ_LIST(&ctx->submit_state.compl_reqs); return ctx; err: kfree(ctx->dummy_ubuf); @@ -1346,21 +1425,16 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq) return false; } -#define FFS_ASYNC_READ 0x1UL -#define FFS_ASYNC_WRITE 0x2UL -#ifdef CONFIG_64BIT -#define FFS_ISREG 0x4UL -#else -#define FFS_ISREG 0x0UL -#endif -#define FFS_MASK ~(FFS_ASYNC_READ|FFS_ASYNC_WRITE|FFS_ISREG) +#define FFS_NOWAIT 0x1UL +#define FFS_ISREG 0x2UL +#define FFS_MASK ~(FFS_NOWAIT|FFS_ISREG) static inline bool io_req_ffs_set(struct io_kiocb *req) { - return IS_ENABLED(CONFIG_64BIT) && (req->flags & REQ_F_FIXED_FILE); + return req->flags & REQ_F_FIXED_FILE; } -static void io_req_track_inflight(struct io_kiocb *req) +static inline void io_req_track_inflight(struct io_kiocb *req) { if (!(req->flags & REQ_F_INFLIGHT)) { req->flags |= REQ_F_INFLIGHT; @@ -1368,11 +1442,6 @@ static void io_req_track_inflight(struct io_kiocb *req) } } -static inline void io_unprep_linked_timeout(struct io_kiocb *req) -{ - req->flags &= ~REQ_F_LINK_TIMEOUT; -} - static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req) { if (WARN_ON_ONCE(!req->link)) @@ -1443,15 +1512,19 @@ static void io_prep_async_link(struct io_kiocb *req) } } -static void io_queue_async_work(struct io_kiocb *req, bool *locked) +static inline void io_req_add_compl_list(struct io_kiocb *req) +{ + struct io_submit_state *state = &req->ctx->submit_state; + + wq_list_add_tail(&req->comp_list, &state->compl_reqs); +} + +static void io_queue_async_work(struct io_kiocb *req, bool *dont_use) { struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *link = io_prep_linked_timeout(req); struct io_uring_task *tctx = req->task->io_uring; - /* must not take the lock, NULL it as a precaution */ - locked = NULL; - BUG_ON(!tctx); BUG_ON(!tctx->io_wq); @@ -1492,7 +1565,7 @@ static void io_kill_timeout(struct io_kiocb *req, int status) } } -static void io_queue_deferred(struct io_ring_ctx *ctx) +static __cold void io_queue_deferred(struct io_ring_ctx *ctx) { while (!list_empty(&ctx->defer_list)) { struct io_defer_entry *de = list_first_entry(&ctx->defer_list, @@ -1506,7 +1579,7 @@ static void io_queue_deferred(struct io_ring_ctx *ctx) } } -static void io_flush_timeouts(struct io_ring_ctx *ctx) +static __cold void io_flush_timeouts(struct io_ring_ctx *ctx) __must_hold(&ctx->completion_lock) { u32 seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); @@ -1539,7 +1612,7 @@ static void io_flush_timeouts(struct io_ring_ctx *ctx) spin_unlock_irq(&ctx->timeout_lock); } -static void __io_commit_cqring_flush(struct io_ring_ctx *ctx) +static __cold void __io_commit_cqring_flush(struct io_ring_ctx *ctx) { if (ctx->off_timeout_used) io_flush_timeouts(ctx); @@ -1609,14 +1682,8 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx) */ if (wq_has_sleeper(&ctx->cq_wait)) wake_up_all(&ctx->cq_wait); - if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait)) - wake_up(&ctx->sq_data->wait); if (io_should_trigger_evfd(ctx)) eventfd_signal(ctx->cq_ev_fd, 1); - if (waitqueue_active(&ctx->poll_wait)) { - wake_up_interruptible(&ctx->poll_wait); - kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN); - } } static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) @@ -1630,10 +1697,6 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) } if (io_should_trigger_evfd(ctx)) eventfd_signal(ctx->cq_ev_fd, 1); - if (waitqueue_active(&ctx->poll_wait)) { - wake_up_interruptible(&ctx->poll_wait); - kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN); - } } /* Returns true if there are no backlogged entries after the flush */ @@ -1729,7 +1792,7 @@ static inline void io_get_task_refs(int nr) } static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, - long res, unsigned int cflags) + s32 res, u32 cflags) { struct io_overflow_cqe *ocqe; @@ -1757,7 +1820,7 @@ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data, } static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, - long res, unsigned int cflags) + s32 res, u32 cflags) { struct io_uring_cqe *cqe; @@ -1780,13 +1843,13 @@ static inline bool __io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data /* not as hot to bloat with inlining */ static noinline bool io_cqring_fill_event(struct io_ring_ctx *ctx, u64 user_data, - long res, unsigned int cflags) + s32 res, u32 cflags) { return __io_cqring_fill_event(ctx, user_data, res, cflags); } -static void io_req_complete_post(struct io_kiocb *req, long res, - unsigned int cflags) +static void io_req_complete_post(struct io_kiocb *req, s32 res, + u32 cflags) { struct io_ring_ctx *ctx = req->ctx; @@ -1805,40 +1868,27 @@ static void io_req_complete_post(struct io_kiocb *req, long res, req->link = NULL; } } + io_req_put_rsrc(req, ctx); io_dismantle_req(req); io_put_task(req->task, 1); - list_add(&req->inflight_entry, &ctx->locked_free_list); + wq_list_add_head(&req->comp_list, &ctx->locked_free_list); ctx->locked_free_nr++; - } else { - if (!percpu_ref_tryget(&ctx->refs)) - req = NULL; } io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); - - if (req) { - io_cqring_ev_posted(ctx); - percpu_ref_put(&ctx->refs); - } -} - -static inline bool io_req_needs_clean(struct io_kiocb *req) -{ - return req->flags & IO_REQ_CLEAN_FLAGS; + io_cqring_ev_posted(ctx); } -static void io_req_complete_state(struct io_kiocb *req, long res, - unsigned int cflags) +static inline void io_req_complete_state(struct io_kiocb *req, s32 res, + u32 cflags) { - if (io_req_needs_clean(req)) - io_clean_op(req); req->result = res; - req->compl.cflags = cflags; + req->cflags = cflags; req->flags |= REQ_F_COMPLETE_INLINE; } static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags, - long res, unsigned cflags) + s32 res, u32 cflags) { if (issue_flags & IO_URING_F_COMPLETE_DEFER) io_req_complete_state(req, res, cflags); @@ -1846,12 +1896,12 @@ static inline void __io_req_complete(struct io_kiocb *req, unsigned issue_flags, io_req_complete_post(req, res, cflags); } -static inline void io_req_complete(struct io_kiocb *req, long res) +static inline void io_req_complete(struct io_kiocb *req, s32 res) { __io_req_complete(req, 0, res, 0); } -static void io_req_complete_failed(struct io_kiocb *req, long res) +static void io_req_complete_failed(struct io_kiocb *req, s32 res) { req_set_fail(req); io_req_complete_post(req, res, 0); @@ -1885,7 +1935,7 @@ static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, struct io_submit_state *state) { spin_lock(&ctx->completion_lock); - list_splice_init(&ctx->locked_free_list, &state->free_list); + wq_list_splice(&ctx->locked_free_list, &state->free_list); ctx->locked_free_nr = 0; spin_unlock(&ctx->completion_lock); } @@ -1894,7 +1944,6 @@ static void io_flush_cached_locked_reqs(struct io_ring_ctx *ctx, static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) { struct io_submit_state *state = &ctx->submit_state; - int nr; /* * If we have more than a batch's worth of requests in our IRQ side @@ -1903,20 +1952,7 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) */ if (READ_ONCE(ctx->locked_free_nr) > IO_COMPL_BATCH) io_flush_cached_locked_reqs(ctx, state); - - nr = state->free_reqs; - while (!list_empty(&state->free_list)) { - struct io_kiocb *req = list_first_entry(&state->free_list, - struct io_kiocb, inflight_entry); - - list_del(&req->inflight_entry); - state->reqs[nr++] = req; - if (nr == ARRAY_SIZE(state->reqs)) - break; - } - - state->free_reqs = nr; - return nr != 0; + return !!state->free_list.next; } /* @@ -1925,38 +1961,54 @@ static bool io_flush_cached_reqs(struct io_ring_ctx *ctx) * Because of that, io_alloc_req() should be called only under ->uring_lock * and with extra caution to not get a request that is still worked on. */ -static struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx) +static __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { struct io_submit_state *state = &ctx->submit_state; gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; + void *reqs[IO_REQ_ALLOC_BATCH]; + struct io_kiocb *req; int ret, i; - BUILD_BUG_ON(ARRAY_SIZE(state->reqs) < IO_REQ_ALLOC_BATCH); - - if (likely(state->free_reqs || io_flush_cached_reqs(ctx))) - goto got_req; + if (likely(state->free_list.next || io_flush_cached_reqs(ctx))) + return true; - ret = kmem_cache_alloc_bulk(req_cachep, gfp, IO_REQ_ALLOC_BATCH, - state->reqs); + ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs); /* * Bulk alloc is all-or-nothing. If we fail to get a batch, * retry single alloc to be on the safe side. */ if (unlikely(ret <= 0)) { - state->reqs[0] = kmem_cache_alloc(req_cachep, gfp); - if (!state->reqs[0]) - return NULL; + reqs[0] = kmem_cache_alloc(req_cachep, gfp); + if (!reqs[0]) + return false; ret = 1; } - for (i = 0; i < ret; i++) - io_preinit_req(state->reqs[i], ctx); - state->free_reqs = ret; -got_req: - state->free_reqs--; - return state->reqs[state->free_reqs]; + percpu_ref_get_many(&ctx->refs, ret); + for (i = 0; i < ret; i++) { + req = reqs[i]; + + io_preinit_req(req, ctx); + wq_stack_add_head(&req->comp_list, &state->free_list); + } + return true; +} + +static inline bool io_alloc_req_refill(struct io_ring_ctx *ctx) +{ + if (unlikely(!ctx->submit_state.free_list.next)) + return __io_alloc_req_refill(ctx); + return true; +} + +static inline struct io_kiocb *io_alloc_req(struct io_ring_ctx *ctx) +{ + struct io_wq_work_node *node; + + node = wq_stack_extract(&ctx->submit_state.free_list); + return container_of(node, struct io_kiocb, comp_list); } static inline void io_put_file(struct file *file) @@ -1965,35 +2017,28 @@ static inline void io_put_file(struct file *file) fput(file); } -static void io_dismantle_req(struct io_kiocb *req) +static inline void io_dismantle_req(struct io_kiocb *req) { unsigned int flags = req->flags; - if (io_req_needs_clean(req)) + if (unlikely(flags & IO_REQ_CLEAN_FLAGS)) io_clean_op(req); if (!(flags & REQ_F_FIXED_FILE)) io_put_file(req->file); - if (req->fixed_rsrc_refs) - percpu_ref_put(req->fixed_rsrc_refs); - if (req->async_data) { - kfree(req->async_data); - req->async_data = NULL; - } } -static void __io_free_req(struct io_kiocb *req) +static __cold void __io_free_req(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; + io_req_put_rsrc(req, ctx); io_dismantle_req(req); io_put_task(req->task, 1); spin_lock(&ctx->completion_lock); - list_add(&req->inflight_entry, &ctx->locked_free_list); + wq_list_add_head(&req->comp_list, &ctx->locked_free_list); ctx->locked_free_nr++; spin_unlock(&ctx->completion_lock); - - percpu_ref_put(&ctx->refs); } static inline void io_remove_next_linked(struct io_kiocb *req) @@ -2079,47 +2124,45 @@ static bool io_disarm_next(struct io_kiocb *req) return posted; } -static struct io_kiocb *__io_req_find_next(struct io_kiocb *req) +static void __io_req_find_next_prep(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + bool posted; + + spin_lock(&ctx->completion_lock); + posted = io_disarm_next(req); + if (posted) + io_commit_cqring(req->ctx); + spin_unlock(&ctx->completion_lock); + if (posted) + io_cqring_ev_posted(ctx); +} + +static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) { struct io_kiocb *nxt; + if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK)))) + return NULL; /* * If LINK is set, we have dependent requests in this chain. If we * didn't fail this request, queue the first one up, moving any other * dependencies to the next request. In case of failure, fail the rest * of the chain. */ - if (req->flags & IO_DISARM_MASK) { - struct io_ring_ctx *ctx = req->ctx; - bool posted; - - spin_lock(&ctx->completion_lock); - posted = io_disarm_next(req); - if (posted) - io_commit_cqring(req->ctx); - spin_unlock(&ctx->completion_lock); - if (posted) - io_cqring_ev_posted(ctx); - } + if (unlikely(req->flags & IO_DISARM_MASK)) + __io_req_find_next_prep(req); nxt = req->link; req->link = NULL; return nxt; } -static inline struct io_kiocb *io_req_find_next(struct io_kiocb *req) -{ - if (likely(!(req->flags & (REQ_F_LINK|REQ_F_HARDLINK)))) - return NULL; - return __io_req_find_next(req); -} - static void ctx_flush_and_put(struct io_ring_ctx *ctx, bool *locked) { if (!ctx) return; if (*locked) { - if (ctx->submit_state.compl_nr) - io_submit_flush_completions(ctx); + io_submit_flush_completions(ctx); mutex_unlock(&ctx->uring_lock); *locked = false; } @@ -2136,7 +2179,7 @@ static void tctx_task_work(struct callback_head *cb) while (1) { struct io_wq_work_node *node; - if (!tctx->task_list.first && locked && ctx->submit_state.compl_nr) + if (!tctx->task_list.first && locked) io_submit_flush_completions(ctx); spin_lock_irq(&tctx->task_lock); @@ -2199,8 +2242,9 @@ static void io_req_task_work_add(struct io_kiocb *req) * will do the job. */ notify = (req->ctx->flags & IORING_SETUP_SQPOLL) ? TWA_NONE : TWA_SIGNAL; - if (!task_work_add(tsk, &tctx->task_work, notify)) { - wake_up_process(tsk); + if (likely(!task_work_add(tsk, &tctx->task_work, notify))) { + if (notify == TWA_NONE) + wake_up_process(tsk); return; } @@ -2278,77 +2322,62 @@ static void io_free_req_work(struct io_kiocb *req, bool *locked) io_free_req(req); } -struct req_batch { - struct task_struct *task; - int task_refs; - int ctx_refs; -}; - -static inline void io_init_req_batch(struct req_batch *rb) +static void io_free_batch_list(struct io_ring_ctx *ctx, + struct io_wq_work_node *node) + __must_hold(&ctx->uring_lock) { - rb->task_refs = 0; - rb->ctx_refs = 0; - rb->task = NULL; -} + struct task_struct *task = NULL; + int task_refs = 0; -static void io_req_free_batch_finish(struct io_ring_ctx *ctx, - struct req_batch *rb) -{ - if (rb->ctx_refs) - percpu_ref_put_many(&ctx->refs, rb->ctx_refs); - if (rb->task) - io_put_task(rb->task, rb->task_refs); -} + do { + struct io_kiocb *req = container_of(node, struct io_kiocb, + comp_list); -static void io_req_free_batch(struct req_batch *rb, struct io_kiocb *req, - struct io_submit_state *state) -{ - io_queue_next(req); - io_dismantle_req(req); + if (unlikely(req->flags & REQ_F_REFCOUNT)) { + node = req->comp_list.next; + if (!req_ref_put_and_test(req)) + continue; + } - if (req->task != rb->task) { - if (rb->task) - io_put_task(rb->task, rb->task_refs); - rb->task = req->task; - rb->task_refs = 0; - } - rb->task_refs++; - rb->ctx_refs++; + io_req_put_rsrc_locked(req, ctx); + io_queue_next(req); + io_dismantle_req(req); - if (state->free_reqs != ARRAY_SIZE(state->reqs)) - state->reqs[state->free_reqs++] = req; - else - list_add(&req->inflight_entry, &state->free_list); + if (req->task != task) { + if (task) + io_put_task(task, task_refs); + task = req->task; + task_refs = 0; + } + task_refs++; + node = req->comp_list.next; + wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); + } while (node); + + if (task) + io_put_task(task, task_refs); } -static void io_submit_flush_completions(struct io_ring_ctx *ctx) +static void __io_submit_flush_completions(struct io_ring_ctx *ctx) __must_hold(&ctx->uring_lock) { + struct io_wq_work_node *node, *prev; struct io_submit_state *state = &ctx->submit_state; - int i, nr = state->compl_nr; - struct req_batch rb; spin_lock(&ctx->completion_lock); - for (i = 0; i < nr; i++) { - struct io_kiocb *req = state->compl_reqs[i]; + wq_list_for_each(node, prev, &state->compl_reqs) { + struct io_kiocb *req = container_of(node, struct io_kiocb, + comp_list); __io_cqring_fill_event(ctx, req->user_data, req->result, - req->compl.cflags); + req->cflags); } io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); io_cqring_ev_posted(ctx); - io_init_req_batch(&rb); - for (i = 0; i < nr; i++) { - struct io_kiocb *req = state->compl_reqs[i]; - - if (req_ref_put_and_test(req)) - io_req_free_batch(&rb, req, &ctx->submit_state); - } - - io_req_free_batch_finish(ctx, &rb); - state->compl_nr = 0; + io_free_batch_list(ctx, state->compl_reqs.first); + INIT_WQ_LIST(&state->compl_reqs); } /* @@ -2408,12 +2437,9 @@ static unsigned int io_put_kbuf(struct io_kiocb *req, struct io_buffer *kbuf) static inline unsigned int io_put_rw_kbuf(struct io_kiocb *req) { - struct io_buffer *kbuf; - if (likely(!(req->flags & REQ_F_BUFFER_SELECTED))) return 0; - kbuf = (struct io_buffer *) (unsigned long) req->rw.addr; - return io_put_kbuf(req, kbuf); + return io_put_kbuf(req, req->kbuf); } static inline bool io_run_task_work(void) @@ -2427,57 +2453,22 @@ static inline bool io_run_task_work(void) return false; } -/* - * Find and free completed poll iocbs - */ -static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, - struct list_head *done) +static int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin) { - struct req_batch rb; - struct io_kiocb *req; - - /* order with ->result store in io_complete_rw_iopoll() */ - smp_rmb(); - - io_init_req_batch(&rb); - while (!list_empty(done)) { - req = list_first_entry(done, struct io_kiocb, inflight_entry); - list_del(&req->inflight_entry); - - if (READ_ONCE(req->result) == -EAGAIN && - !(req->flags & REQ_F_DONT_REISSUE)) { - req->iopoll_completed = 0; - io_req_task_queue_reissue(req); - continue; - } - - __io_cqring_fill_event(ctx, req->user_data, req->result, - io_put_rw_kbuf(req)); - (*nr_events)++; - - if (req_ref_put_and_test(req)) - io_req_free_batch(&rb, req, &ctx->submit_state); - } - - io_commit_cqring(ctx); - io_cqring_ev_posted_iopoll(ctx); - io_req_free_batch_finish(ctx, &rb); -} - -static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, - long min) -{ - struct io_kiocb *req, *tmp; - LIST_HEAD(done); - bool spin; + struct io_wq_work_node *pos, *start, *prev; + unsigned int poll_flags = BLK_POLL_NOSLEEP; + DEFINE_IO_COMP_BATCH(iob); + int nr_events = 0; /* * Only spin for completions if we don't have multiple devices hanging - * off our complete list, and we're under the requested amount. + * off our complete list. */ - spin = !ctx->poll_multi_queue && *nr_events < min; + if (ctx->poll_multi_queue || force_nonspin) + poll_flags |= BLK_POLL_ONESHOT; - list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) { + wq_list_for_each(pos, start, &ctx->iopoll_list) { + struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list); struct kiocb *kiocb = &req->rw.kiocb; int ret; @@ -2486,47 +2477,62 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, * If we find a request that requires polling, break out * and complete those lists first, if we have entries there. */ - if (READ_ONCE(req->iopoll_completed)) { - list_move_tail(&req->inflight_entry, &done); - continue; - } - if (!list_empty(&done)) + if (READ_ONCE(req->iopoll_completed)) break; - ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin); + ret = kiocb->ki_filp->f_op->iopoll(kiocb, &iob, poll_flags); if (unlikely(ret < 0)) return ret; else if (ret) - spin = false; + poll_flags |= BLK_POLL_ONESHOT; /* iopoll may have completed current req */ - if (READ_ONCE(req->iopoll_completed)) - list_move_tail(&req->inflight_entry, &done); + if (!rq_list_empty(iob.req_list) || + READ_ONCE(req->iopoll_completed)) + break; } - if (!list_empty(&done)) - io_iopoll_complete(ctx, nr_events, &done); + if (!rq_list_empty(iob.req_list)) + iob.complete(&iob); + else if (!pos) + return 0; - return 0; + prev = start; + wq_list_for_each_resume(pos, prev) { + struct io_kiocb *req = container_of(pos, struct io_kiocb, comp_list); + + /* order with io_complete_rw_iopoll(), e.g. ->result updates */ + if (!smp_load_acquire(&req->iopoll_completed)) + break; + __io_cqring_fill_event(ctx, req->user_data, req->result, + io_put_rw_kbuf(req)); + nr_events++; + } + + if (unlikely(!nr_events)) + return 0; + + io_commit_cqring(ctx); + io_cqring_ev_posted_iopoll(ctx); + pos = start ? start->next : ctx->iopoll_list.first; + wq_list_cut(&ctx->iopoll_list, prev, start); + io_free_batch_list(ctx, pos); + return nr_events; } /* * We can't just wait for polled events to come to us, we have to actively * find and complete them. */ -static void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) +static __cold void io_iopoll_try_reap_events(struct io_ring_ctx *ctx) { if (!(ctx->flags & IORING_SETUP_IOPOLL)) return; mutex_lock(&ctx->uring_lock); - while (!list_empty(&ctx->iopoll_list)) { - unsigned int nr_events = 0; - - io_do_iopoll(ctx, &nr_events, 0); - + while (!wq_list_empty(&ctx->iopoll_list)) { /* let it sleep and repeat later if can't complete a request */ - if (nr_events == 0) + if (io_do_iopoll(ctx, true) == 0) break; /* * Ensure we allow local-to-the-cpu processing to take place, @@ -2573,7 +2579,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) * forever, while the workqueue is stuck trying to acquire the * very same mutex. */ - if (list_empty(&ctx->iopoll_list)) { + if (wq_list_empty(&ctx->iopoll_list)) { u32 tail = ctx->cached_cq_tail; mutex_unlock(&ctx->uring_lock); @@ -2582,11 +2588,15 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) /* some requests don't go through iopoll_list */ if (tail != ctx->cached_cq_tail || - list_empty(&ctx->iopoll_list)) + wq_list_empty(&ctx->iopoll_list)) break; } - ret = io_do_iopoll(ctx, &nr_events, min); - } while (!ret && nr_events < min && !need_resched()); + ret = io_do_iopoll(ctx, !min); + if (ret < 0) + break; + nr_events += ret; + ret = 0; + } while (nr_events < min && !need_resched()); out: mutex_unlock(&ctx->uring_lock); return ret; @@ -2611,10 +2621,9 @@ static bool io_resubmit_prep(struct io_kiocb *req) { struct io_async_rw *rw = req->async_data; - if (!rw) + if (!req_has_async_data(req)) return !io_req_prep_async(req); - /* may have left rw->iter inconsistent on -EIOCBQUEUED */ - iov_iter_revert(&rw->iter, req->result - iov_iter_count(&rw->iter)); + iov_iter_restore(&rw->s.iter, &rw->s.iter_state); return true; } @@ -2658,7 +2667,7 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res) { if (req->rw.kiocb.ki_flags & IOCB_WRITE) kiocb_end_write(req); - if (res != req->result) { + if (unlikely(res != req->result)) { if ((res == -EAGAIN || res == -EOPNOTSUPP) && io_rw_should_reissue(req)) { req->flags |= REQ_F_REISSUE; @@ -2673,16 +2682,11 @@ static bool __io_complete_rw_common(struct io_kiocb *req, long res) static void io_req_task_complete(struct io_kiocb *req, bool *locked) { unsigned int cflags = io_put_rw_kbuf(req); - long res = req->result; + int res = req->result; if (*locked) { - struct io_ring_ctx *ctx = req->ctx; - struct io_submit_state *state = &ctx->submit_state; - io_req_complete_state(req, res, cflags); - state->compl_reqs[state->compl_nr++] = req; - if (state->compl_nr == ARRAY_SIZE(state->compl_reqs)) - io_submit_flush_completions(ctx); + io_req_add_compl_list(req); } else { io_req_complete_post(req, res, cflags); } @@ -2696,7 +2700,7 @@ static void __io_complete_rw(struct io_kiocb *req, long res, long res2, __io_req_complete(req, issue_flags, req->result, io_put_rw_kbuf(req)); } -static void io_complete_rw(struct kiocb *kiocb, long res, long res2) +static void io_complete_rw(struct kiocb *kiocb, long res) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); @@ -2707,24 +2711,22 @@ static void io_complete_rw(struct kiocb *kiocb, long res, long res2) io_req_task_work_add(req); } -static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) +static void io_complete_rw_iopoll(struct kiocb *kiocb, long res) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); if (kiocb->ki_flags & IOCB_WRITE) kiocb_end_write(req); if (unlikely(res != req->result)) { - if (!(res == -EAGAIN && io_rw_should_reissue(req) && - io_resubmit_prep(req))) { - req_set_fail(req); - req->flags |= REQ_F_DONT_REISSUE; + if (res == -EAGAIN && io_rw_should_reissue(req)) { + req->flags |= REQ_F_REISSUE; + return; } + req->result = res; } - WRITE_ONCE(req->result, res); - /* order with io_iopoll_complete() checking ->result */ - smp_wmb(); - WRITE_ONCE(req->iopoll_completed, 1); + /* order with io_iopoll_complete() checking ->iopoll_completed */ + smp_store_release(&req->iopoll_completed, 1); } /* @@ -2733,13 +2735,13 @@ static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) * find it from a io_do_iopoll() thread before the issuer is done * accessing the kiocb cookie. */ -static void io_iopoll_req_issued(struct io_kiocb *req) +static void io_iopoll_req_issued(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; - const bool in_async = io_wq_current_is_worker(); + const bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; /* workqueue context doesn't hold uring_lock, grab it now */ - if (unlikely(in_async)) + if (unlikely(needs_lock)) mutex_lock(&ctx->uring_lock); /* @@ -2747,23 +2749,15 @@ static void io_iopoll_req_issued(struct io_kiocb *req) * how we do polling eventually, not spinning if we're on potentially * different devices. */ - if (list_empty(&ctx->iopoll_list)) { + if (wq_list_empty(&ctx->iopoll_list)) { ctx->poll_multi_queue = false; } else if (!ctx->poll_multi_queue) { struct io_kiocb *list_req; - unsigned int queue_num0, queue_num1; - list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb, - inflight_entry); - - if (list_req->file != req->file) { + list_req = container_of(ctx->iopoll_list.first, struct io_kiocb, + comp_list); + if (list_req->file != req->file) ctx->poll_multi_queue = true; - } else { - queue_num0 = blk_qc_t_to_queue_num(list_req->rw.kiocb.ki_cookie); - queue_num1 = blk_qc_t_to_queue_num(req->rw.kiocb.ki_cookie); - if (queue_num0 != queue_num1) - ctx->poll_multi_queue = true; - } } /* @@ -2771,11 +2765,11 @@ static void io_iopoll_req_issued(struct io_kiocb *req) * it to the front so we find it first. */ if (READ_ONCE(req->iopoll_completed)) - list_add(&req->inflight_entry, &ctx->iopoll_list); + wq_list_add_head(&req->comp_list, &ctx->iopoll_list); else - list_add_tail(&req->inflight_entry, &ctx->iopoll_list); + wq_list_add_tail(&req->comp_list, &ctx->iopoll_list); - if (unlikely(in_async)) { + if (unlikely(needs_lock)) { /* * If IORING_SETUP_SQPOLL is enabled, sqes are either handle * in sq thread task context or in io worker task context. If @@ -2800,10 +2794,8 @@ static bool io_bdev_nowait(struct block_device *bdev) * any file. For now, just ensure that anything potentially problematic is done * inline. */ -static bool __io_file_supports_nowait(struct file *file, int rw) +static bool __io_file_supports_nowait(struct file *file, umode_t mode) { - umode_t mode = file_inode(file)->i_mode; - if (S_ISBLK(mode)) { if (IS_ENABLED(CONFIG_BLOCK) && io_bdev_nowait(I_BDEV(file->f_mapping->host))) @@ -2823,24 +2815,29 @@ static bool __io_file_supports_nowait(struct file *file, int rw) /* any ->read/write should understand O_NONBLOCK */ if (file->f_flags & O_NONBLOCK) return true; + return file->f_mode & FMODE_NOWAIT; +} - if (!(file->f_mode & FMODE_NOWAIT)) - return false; - - if (rw == READ) - return file->f_op->read_iter != NULL; +/* + * If we tracked the file through the SCM inflight mechanism, we could support + * any file. For now, just ensure that anything potentially problematic is done + * inline. + */ +static unsigned int io_file_get_flags(struct file *file) +{ + umode_t mode = file_inode(file)->i_mode; + unsigned int res = 0; - return file->f_op->write_iter != NULL; + if (S_ISREG(mode)) + res |= FFS_ISREG; + if (__io_file_supports_nowait(file, mode)) + res |= FFS_NOWAIT; + return res; } -static bool io_file_supports_nowait(struct io_kiocb *req, int rw) +static inline bool io_file_supports_nowait(struct io_kiocb *req) { - if (rw == READ && (req->flags & REQ_F_NOWAIT_READ)) - return true; - else if (rw == WRITE && (req->flags & REQ_F_NOWAIT_WRITE)) - return true; - - return __io_file_supports_nowait(req->file, rw); + return req->flags & REQ_F_SUPPORT_NOWAIT; } static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) @@ -2851,37 +2848,30 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) unsigned ioprio; int ret; - if (!io_req_ffs_set(req) && S_ISREG(file_inode(file)->i_mode)) - req->flags |= REQ_F_ISREG; + if (!io_req_ffs_set(req)) + req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT; kiocb->ki_pos = READ_ONCE(sqe->off); if (kiocb->ki_pos == -1 && !(file->f_mode & FMODE_STREAM)) { req->flags |= REQ_F_CUR_POS; kiocb->ki_pos = file->f_pos; } - kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp)); - kiocb->ki_flags = iocb_flags(kiocb->ki_filp); + kiocb->ki_flags = iocb_flags(file); ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags)); if (unlikely(ret)) return ret; - /* don't allow async punt for O_NONBLOCK or RWF_NOWAIT */ - if ((kiocb->ki_flags & IOCB_NOWAIT) || (file->f_flags & O_NONBLOCK)) + /* + * If the file is marked O_NONBLOCK, still allow retry for it if it + * supports async. Otherwise it's impossible to use O_NONBLOCK files + * reliably. If not, or it IOCB_NOWAIT is set, don't retry. + */ + if ((kiocb->ki_flags & IOCB_NOWAIT) || + ((file->f_flags & O_NONBLOCK) && !io_file_supports_nowait(req))) req->flags |= REQ_F_NOWAIT; - ioprio = READ_ONCE(sqe->ioprio); - if (ioprio) { - ret = ioprio_check_cap(ioprio); - if (ret) - return ret; - - kiocb->ki_ioprio = ioprio; - } else - kiocb->ki_ioprio = get_current_ioprio(); - if (ctx->flags & IORING_SETUP_IOPOLL) { - if (!(kiocb->ki_flags & IOCB_DIRECT) || - !kiocb->ki_filp->f_op->iopoll) + if (!(kiocb->ki_flags & IOCB_DIRECT) || !file->f_op->iopoll) return -EOPNOTSUPP; kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE; @@ -2893,12 +2883,18 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe) kiocb->ki_complete = io_complete_rw; } - if (req->opcode == IORING_OP_READ_FIXED || - req->opcode == IORING_OP_WRITE_FIXED) { - req->imu = NULL; - io_req_set_rsrc_node(req); + ioprio = READ_ONCE(sqe->ioprio); + if (ioprio) { + ret = ioprio_check_cap(ioprio); + if (ret) + return ret; + + kiocb->ki_ioprio = ioprio; + } else { + kiocb->ki_ioprio = get_current_ioprio(); } + req->imu = NULL; req->rw.addr = READ_ONCE(sqe->addr); req->rw.len = READ_ONCE(sqe->len); req->buf_index = READ_ONCE(sqe->buf_index); @@ -2922,7 +2918,7 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) ret = -EINTR; fallthrough; default: - kiocb->ki_complete(kiocb, ret, 0); + kiocb->ki_complete(kiocb, ret); } } @@ -2931,10 +2927,9 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret, { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb); struct io_async_rw *io = req->async_data; - bool check_reissue = kiocb->ki_complete == io_complete_rw; /* add previously done IO, if any */ - if (io && io->bytes_done > 0) { + if (req_has_async_data(req) && io->bytes_done > 0) { if (ret < 0) ret = io->bytes_done; else @@ -2943,19 +2938,27 @@ static void kiocb_done(struct kiocb *kiocb, ssize_t ret, if (req->flags & REQ_F_CUR_POS) req->file->f_pos = kiocb->ki_pos; - if (ret >= 0 && check_reissue) + if (ret >= 0 && (kiocb->ki_complete == io_complete_rw)) __io_complete_rw(req, ret, 0, issue_flags); else io_rw_done(kiocb, ret); - if (check_reissue && (req->flags & REQ_F_REISSUE)) { + if (req->flags & REQ_F_REISSUE) { req->flags &= ~REQ_F_REISSUE; if (io_resubmit_prep(req)) { io_req_task_queue_reissue(req); } else { + unsigned int cflags = io_put_rw_kbuf(req); + struct io_ring_ctx *ctx = req->ctx; + req_set_fail(req); - __io_req_complete(req, issue_flags, ret, - io_put_rw_kbuf(req)); + if (issue_flags & IO_URING_F_UNLOCKED) { + mutex_lock(&ctx->uring_lock); + __io_req_complete(req, issue_flags, ret, cflags); + mutex_unlock(&ctx->uring_lock); + } else { + __io_req_complete(req, issue_flags, ret, cflags); + } } } } @@ -3020,13 +3023,15 @@ static int __io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter static int io_import_fixed(struct io_kiocb *req, int rw, struct iov_iter *iter) { - struct io_ring_ctx *ctx = req->ctx; struct io_mapped_ubuf *imu = req->imu; u16 index, buf_index = req->buf_index; if (likely(!imu)) { + struct io_ring_ctx *ctx = req->ctx; + if (unlikely(buf_index >= ctx->nr_user_bufs)) return -EFAULT; + io_req_set_rsrc_node(req, ctx); index = array_index_nospec(buf_index, ctx->nr_user_bufs); imu = READ_ONCE(ctx->user_bufs[index]); req->imu = imu; @@ -3053,10 +3058,11 @@ static void io_ring_submit_lock(struct io_ring_ctx *ctx, bool needs_lock) } static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len, - int bgid, struct io_buffer *kbuf, - bool needs_lock) + int bgid, unsigned int issue_flags) { + struct io_buffer *kbuf = req->kbuf; struct io_buffer *head; + bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; if (req->flags & REQ_F_BUFFER_SELECTED) return kbuf; @@ -3077,34 +3083,32 @@ static struct io_buffer *io_buffer_select(struct io_kiocb *req, size_t *len, } if (*len > kbuf->len) *len = kbuf->len; + req->flags |= REQ_F_BUFFER_SELECTED; + req->kbuf = kbuf; } else { kbuf = ERR_PTR(-ENOBUFS); } io_ring_submit_unlock(req->ctx, needs_lock); - return kbuf; } static void __user *io_rw_buffer_select(struct io_kiocb *req, size_t *len, - bool needs_lock) + unsigned int issue_flags) { struct io_buffer *kbuf; u16 bgid; - kbuf = (struct io_buffer *) (unsigned long) req->rw.addr; bgid = req->buf_index; - kbuf = io_buffer_select(req, len, bgid, kbuf, needs_lock); + kbuf = io_buffer_select(req, len, bgid, issue_flags); if (IS_ERR(kbuf)) return kbuf; - req->rw.addr = (u64) (unsigned long) kbuf; - req->flags |= REQ_F_BUFFER_SELECTED; return u64_to_user_ptr(kbuf->addr); } #ifdef CONFIG_COMPAT static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, - bool needs_lock) + unsigned int issue_flags) { struct compat_iovec __user *uiov; compat_ssize_t clen; @@ -3120,7 +3124,7 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, return -EINVAL; len = clen; - buf = io_rw_buffer_select(req, &len, needs_lock); + buf = io_rw_buffer_select(req, &len, issue_flags); if (IS_ERR(buf)) return PTR_ERR(buf); iov[0].iov_base = buf; @@ -3130,7 +3134,7 @@ static ssize_t io_compat_import(struct io_kiocb *req, struct iovec *iov, #endif static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, - bool needs_lock) + unsigned int issue_flags) { struct iovec __user *uiov = u64_to_user_ptr(req->rw.addr); void __user *buf; @@ -3142,7 +3146,7 @@ static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, len = iov[0].iov_len; if (len < 0) return -EINVAL; - buf = io_rw_buffer_select(req, &len, needs_lock); + buf = io_rw_buffer_select(req, &len, issue_flags); if (IS_ERR(buf)) return PTR_ERR(buf); iov[0].iov_base = buf; @@ -3151,12 +3155,11 @@ static ssize_t __io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, } static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, - bool needs_lock) + unsigned int issue_flags) { if (req->flags & REQ_F_BUFFER_SELECTED) { - struct io_buffer *kbuf; + struct io_buffer *kbuf = req->kbuf; - kbuf = (struct io_buffer *) (unsigned long) req->rw.addr; iov[0].iov_base = u64_to_user_ptr(kbuf->addr); iov[0].iov_len = kbuf->len; return 0; @@ -3166,52 +3169,72 @@ static ssize_t io_iov_buffer_select(struct io_kiocb *req, struct iovec *iov, #ifdef CONFIG_COMPAT if (req->ctx->compat) - return io_compat_import(req, iov, needs_lock); + return io_compat_import(req, iov, issue_flags); #endif - return __io_iov_buffer_select(req, iov, needs_lock); + return __io_iov_buffer_select(req, iov, issue_flags); } -static int io_import_iovec(int rw, struct io_kiocb *req, struct iovec **iovec, - struct iov_iter *iter, bool needs_lock) +static struct iovec *__io_import_iovec(int rw, struct io_kiocb *req, + struct io_rw_state *s, + unsigned int issue_flags) { - void __user *buf = u64_to_user_ptr(req->rw.addr); - size_t sqe_len = req->rw.len; + struct iov_iter *iter = &s->iter; u8 opcode = req->opcode; + struct iovec *iovec; + void __user *buf; + size_t sqe_len; ssize_t ret; - if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) { - *iovec = NULL; - return io_import_fixed(req, rw, iter); - } + BUILD_BUG_ON(ERR_PTR(0) != NULL); + + if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) + return ERR_PTR(io_import_fixed(req, rw, iter)); /* buffer index only valid with fixed read/write, or buffer select */ - if (req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT)) - return -EINVAL; + if (unlikely(req->buf_index && !(req->flags & REQ_F_BUFFER_SELECT))) + return ERR_PTR(-EINVAL); + + buf = u64_to_user_ptr(req->rw.addr); + sqe_len = req->rw.len; if (opcode == IORING_OP_READ || opcode == IORING_OP_WRITE) { if (req->flags & REQ_F_BUFFER_SELECT) { - buf = io_rw_buffer_select(req, &sqe_len, needs_lock); + buf = io_rw_buffer_select(req, &sqe_len, issue_flags); if (IS_ERR(buf)) - return PTR_ERR(buf); + return ERR_CAST(buf); req->rw.len = sqe_len; } - ret = import_single_range(rw, buf, sqe_len, *iovec, iter); - *iovec = NULL; - return ret; + ret = import_single_range(rw, buf, sqe_len, s->fast_iov, iter); + return ERR_PTR(ret); } + iovec = s->fast_iov; if (req->flags & REQ_F_BUFFER_SELECT) { - ret = io_iov_buffer_select(req, *iovec, needs_lock); + ret = io_iov_buffer_select(req, iovec, issue_flags); if (!ret) - iov_iter_init(iter, rw, *iovec, 1, (*iovec)->iov_len); - *iovec = NULL; - return ret; + iov_iter_init(iter, rw, iovec, 1, iovec->iov_len); + return ERR_PTR(ret); } - return __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter, + ret = __import_iovec(rw, buf, sqe_len, UIO_FASTIOV, &iovec, iter, req->ctx->compat); + if (unlikely(ret < 0)) + return ERR_PTR(ret); + return iovec; +} + +static inline int io_import_iovec(int rw, struct io_kiocb *req, + struct iovec **iovec, struct io_rw_state *s, + unsigned int issue_flags) +{ + *iovec = __io_import_iovec(rw, req, s, issue_flags); + if (unlikely(IS_ERR(*iovec))) + return PTR_ERR(*iovec); + + iov_iter_save_state(&s->iter, &s->iter_state); + return 0; } static inline loff_t *io_kiocb_ppos(struct kiocb *kiocb) @@ -3236,7 +3259,8 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter) */ if (kiocb->ki_flags & IOCB_HIPRI) return -EOPNOTSUPP; - if (kiocb->ki_flags & IOCB_NOWAIT) + if ((kiocb->ki_flags & IOCB_NOWAIT) && + !(kiocb->ki_filp->f_flags & O_NONBLOCK)) return -EAGAIN; while (iov_iter_count(iter)) { @@ -3263,12 +3287,15 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter) ret = nr; break; } + if (!iov_iter_is_bvec(iter)) { + iov_iter_advance(iter, nr); + } else { + req->rw.len -= nr; + req->rw.addr += nr; + } ret += nr; if (nr != iovec.iov_len) break; - req->rw.len -= nr; - req->rw.addr += nr; - iov_iter_advance(iter, nr); } return ret; @@ -3279,7 +3306,7 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, { struct io_async_rw *rw = req->async_data; - memcpy(&rw->iter, iter, sizeof(*iter)); + memcpy(&rw->s.iter, iter, sizeof(*iter)); rw->free_iovec = iovec; rw->bytes_done = 0; /* can only be fixed buffers, no need to do anything */ @@ -3288,39 +3315,47 @@ static void io_req_map_rw(struct io_kiocb *req, const struct iovec *iovec, if (!iovec) { unsigned iov_off = 0; - rw->iter.iov = rw->fast_iov; + rw->s.iter.iov = rw->s.fast_iov; if (iter->iov != fast_iov) { iov_off = iter->iov - fast_iov; - rw->iter.iov += iov_off; + rw->s.iter.iov += iov_off; } - if (rw->fast_iov != fast_iov) - memcpy(rw->fast_iov + iov_off, fast_iov + iov_off, + if (rw->s.fast_iov != fast_iov) + memcpy(rw->s.fast_iov + iov_off, fast_iov + iov_off, sizeof(struct iovec) * iter->nr_segs); } else { req->flags |= REQ_F_NEED_CLEANUP; } } -static inline int io_alloc_async_data(struct io_kiocb *req) +static inline bool io_alloc_async_data(struct io_kiocb *req) { WARN_ON_ONCE(!io_op_defs[req->opcode].async_size); req->async_data = kmalloc(io_op_defs[req->opcode].async_size, GFP_KERNEL); - return req->async_data == NULL; + if (req->async_data) { + req->flags |= REQ_F_ASYNC_DATA; + return false; + } + return true; } static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, - const struct iovec *fast_iov, - struct iov_iter *iter, bool force) + struct io_rw_state *s, bool force) { if (!force && !io_op_defs[req->opcode].needs_async_setup) return 0; - if (!req->async_data) { + if (!req_has_async_data(req)) { + struct io_async_rw *iorw; + if (io_alloc_async_data(req)) { kfree(iovec); return -ENOMEM; } - io_req_map_rw(req, iovec, fast_iov, iter); + io_req_map_rw(req, iovec, s->fast_iov, &s->iter); + iorw = req->async_data; + /* we've copied and mapped the iter, ensure state is saved */ + iov_iter_save_state(&iorw->s.iter, &iorw->s.iter_state); } return 0; } @@ -3328,10 +3363,11 @@ static int io_setup_async_rw(struct io_kiocb *req, const struct iovec *iovec, static inline int io_rw_prep_async(struct io_kiocb *req, int rw) { struct io_async_rw *iorw = req->async_data; - struct iovec *iov = iorw->fast_iov; + struct iovec *iov; int ret; - ret = io_import_iovec(rw, req, &iov, &iorw->iter, false); + /* submission path, ->uring_lock should already be taken */ + ret = io_import_iovec(rw, req, &iov, &iorw->s, 0); if (unlikely(ret < 0)) return ret; @@ -3350,7 +3386,7 @@ static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) } /* - * This is our waitqueue callback handler, registered through lock_page_async() + * This is our waitqueue callback handler, registered through __folio_lock_async() * when we initially tried to do the IO with the iocb armed our waitqueue. * This gets called when the page is unlocked, and we generally expect that to * happen when the page IO is completed and the page is now uptodate. This will @@ -3422,7 +3458,7 @@ static bool io_rw_should_retry(struct io_kiocb *req) static inline int io_iter_do_read(struct io_kiocb *req, struct iov_iter *iter) { - if (req->file->f_op->read_iter) + if (likely(req->file->f_op->read_iter)) return call_read_iter(req->file, &req->rw.kiocb, iter); else if (req->file->f_op->read) return loop_rw_iter(READ, req, iter); @@ -3438,43 +3474,49 @@ static bool need_read_all(struct io_kiocb *req) static int io_read(struct io_kiocb *req, unsigned int issue_flags) { - struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; + struct io_rw_state __s, *s = &__s; + struct iovec *iovec; struct kiocb *kiocb = &req->rw.kiocb; - struct iov_iter __iter, *iter = &__iter; - struct io_async_rw *rw = req->async_data; - ssize_t io_size, ret, ret2; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + struct io_async_rw *rw; + ssize_t ret, ret2; - if (rw) { - iter = &rw->iter; - iovec = NULL; - } else { - ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock); - if (ret < 0) + if (!req_has_async_data(req)) { + ret = io_import_iovec(READ, req, &iovec, s, issue_flags); + if (unlikely(ret < 0)) return ret; + } else { + rw = req->async_data; + s = &rw->s; + /* + * We come here from an earlier attempt, restore our state to + * match in case it doesn't. It's cheap enough that we don't + * need to make this conditional. + */ + iov_iter_restore(&s->iter, &s->iter_state); + iovec = NULL; } - io_size = iov_iter_count(iter); - req->result = io_size; + req->result = iov_iter_count(&s->iter); - /* Ensure we clear previously set non-block flag */ - if (!force_nonblock) - kiocb->ki_flags &= ~IOCB_NOWAIT; - else + if (force_nonblock) { + /* If the file doesn't support async, just async punt */ + if (unlikely(!io_file_supports_nowait(req))) { + ret = io_setup_async_rw(req, iovec, s, true); + return ret ?: -EAGAIN; + } kiocb->ki_flags |= IOCB_NOWAIT; - - /* If the file doesn't support async, just async punt */ - if (force_nonblock && !io_file_supports_nowait(req, READ)) { - ret = io_setup_async_rw(req, iovec, inline_vecs, iter, true); - return ret ?: -EAGAIN; + } else { + /* Ensure we clear previously set non-block flag */ + kiocb->ki_flags &= ~IOCB_NOWAIT; } - ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), io_size); + ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), req->result); if (unlikely(ret)) { kfree(iovec); return ret; } - ret = io_iter_do_read(req, iter); + ret = io_iter_do_read(req, &s->iter); if (ret == -EAGAIN || (req->flags & REQ_F_REISSUE)) { req->flags &= ~REQ_F_REISSUE; @@ -3484,30 +3526,46 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) /* no retry on NONBLOCK nor RWF_NOWAIT */ if (req->flags & REQ_F_NOWAIT) goto done; - /* some cases will consume bytes even on error returns */ - iov_iter_reexpand(iter, iter->count + iter->truncated); - iov_iter_revert(iter, io_size - iov_iter_count(iter)); ret = 0; } else if (ret == -EIOCBQUEUED) { goto out_free; - } else if (ret <= 0 || ret == io_size || !force_nonblock || + } else if (ret == req->result || ret <= 0 || !force_nonblock || (req->flags & REQ_F_NOWAIT) || !need_read_all(req)) { /* read all, failed, already did sync or don't want to retry */ goto done; } - ret2 = io_setup_async_rw(req, iovec, inline_vecs, iter, true); + /* + * Don't depend on the iter state matching what was consumed, or being + * untouched in case of error. Restore it and we'll advance it + * manually if we need to. + */ + iov_iter_restore(&s->iter, &s->iter_state); + + ret2 = io_setup_async_rw(req, iovec, s, true); if (ret2) return ret2; iovec = NULL; rw = req->async_data; - /* now use our persistent iterator, if we aren't already */ - iter = &rw->iter; + s = &rw->s; + /* + * Now use our persistent iterator and state, if we aren't already. + * We've restored and mapped the iter to match. + */ do { - io_size -= ret; + /* + * We end up here because of a partial read, either from + * above or inside this loop. Advance the iter by the bytes + * that were consumed. + */ + iov_iter_advance(&s->iter, ret); + if (!iov_iter_count(&s->iter)) + break; rw->bytes_done += ret; + iov_iter_save_state(&s->iter, &s->iter_state); + /* if we can retry, do so with the callbacks armed */ if (!io_rw_should_retry(req)) { kiocb->ki_flags &= ~IOCB_WAITQ; @@ -3520,12 +3578,13 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) * desired page gets unlocked. We can also get a partial read * here, and if we do, then just retry at the new offset. */ - ret = io_iter_do_read(req, iter); + ret = io_iter_do_read(req, &s->iter); if (ret == -EIOCBQUEUED) return 0; /* we got some bytes, but not all. retry. */ kiocb->ki_flags &= ~IOCB_WAITQ; - } while (ret > 0 && ret < io_size); + iov_iter_restore(&s->iter, &s->iter_state); + } while (ret > 0); done: kiocb_done(kiocb, ret, issue_flags); out_free: @@ -3539,45 +3598,48 @@ static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { if (unlikely(!(req->file->f_mode & FMODE_WRITE))) return -EBADF; + req->rw.kiocb.ki_hint = ki_hint_validate(file_write_hint(req->file)); return io_prep_rw(req, sqe); } static int io_write(struct io_kiocb *req, unsigned int issue_flags) { - struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; + struct io_rw_state __s, *s = &__s; + struct iovec *iovec; struct kiocb *kiocb = &req->rw.kiocb; - struct iov_iter __iter, *iter = &__iter; - struct io_async_rw *rw = req->async_data; - ssize_t ret, ret2, io_size; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + ssize_t ret, ret2; - if (rw) { - iter = &rw->iter; - iovec = NULL; - } else { - ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock); - if (ret < 0) + if (!req_has_async_data(req)) { + ret = io_import_iovec(WRITE, req, &iovec, s, issue_flags); + if (unlikely(ret < 0)) return ret; + } else { + struct io_async_rw *rw = req->async_data; + + s = &rw->s; + iov_iter_restore(&s->iter, &s->iter_state); + iovec = NULL; } - io_size = iov_iter_count(iter); - req->result = io_size; + req->result = iov_iter_count(&s->iter); - /* Ensure we clear previously set non-block flag */ - if (!force_nonblock) - kiocb->ki_flags &= ~IOCB_NOWAIT; - else - kiocb->ki_flags |= IOCB_NOWAIT; + if (force_nonblock) { + /* If the file doesn't support async, just async punt */ + if (unlikely(!io_file_supports_nowait(req))) + goto copy_iov; - /* If the file doesn't support async, just async punt */ - if (force_nonblock && !io_file_supports_nowait(req, WRITE)) - goto copy_iov; + /* file path doesn't support NOWAIT for non-direct_IO */ + if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) && + (req->flags & REQ_F_ISREG)) + goto copy_iov; - /* file path doesn't support NOWAIT for non-direct_IO */ - if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) && - (req->flags & REQ_F_ISREG)) - goto copy_iov; + kiocb->ki_flags |= IOCB_NOWAIT; + } else { + /* Ensure we clear previously set non-block flag */ + kiocb->ki_flags &= ~IOCB_NOWAIT; + } - ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), io_size); + ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), req->result); if (unlikely(ret)) goto out_free; @@ -3595,10 +3657,10 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) } kiocb->ki_flags |= IOCB_WRITE; - if (req->file->f_op->write_iter) - ret2 = call_write_iter(req->file, kiocb, iter); + if (likely(req->file->f_op->write_iter)) + ret2 = call_write_iter(req->file, kiocb, &s->iter); else if (req->file->f_op->write) - ret2 = loop_rw_iter(WRITE, req, iter); + ret2 = loop_rw_iter(WRITE, req, &s->iter); else ret2 = -EINVAL; @@ -3618,16 +3680,14 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) goto done; if (!force_nonblock || ret2 != -EAGAIN) { /* IOPOLL retry should happen for io-wq threads */ - if ((req->ctx->flags & IORING_SETUP_IOPOLL) && ret2 == -EAGAIN) + if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL)) goto copy_iov; done: kiocb_done(kiocb, ret2, issue_flags); } else { copy_iov: - /* some cases will consume bytes even on error returns */ - iov_iter_reexpand(iter, iter->count + iter->truncated); - iov_iter_revert(iter, io_size - iov_iter_count(iter)); - ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false); + iov_iter_restore(&s->iter, &s->iter_state); + ret = io_setup_async_rw(req, iovec, s, false); return ret ?: -EAGAIN; } out_free: @@ -3763,7 +3823,7 @@ static int io_mkdirat_prep(struct io_kiocb *req, return 0; } -static int io_mkdirat(struct io_kiocb *req, int issue_flags) +static int io_mkdirat(struct io_kiocb *req, unsigned int issue_flags) { struct io_mkdir *mkd = &req->mkdir; int ret; @@ -3812,7 +3872,7 @@ static int io_symlinkat_prep(struct io_kiocb *req, return 0; } -static int io_symlinkat(struct io_kiocb *req, int issue_flags) +static int io_symlinkat(struct io_kiocb *req, unsigned int issue_flags) { struct io_symlink *sl = &req->symlink; int ret; @@ -3862,7 +3922,7 @@ static int io_linkat_prep(struct io_kiocb *req, return 0; } -static int io_linkat(struct io_kiocb *req, int issue_flags) +static int io_linkat(struct io_kiocb *req, unsigned int issue_flags) { struct io_hardlink *lnk = &req->hardlink; int ret; @@ -4281,9 +4341,9 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) struct io_ring_ctx *ctx = req->ctx; struct io_buffer *head; int ret = 0; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; - io_ring_submit_lock(ctx, !force_nonblock); + io_ring_submit_lock(ctx, needs_lock); lockdep_assert_held(&ctx->uring_lock); @@ -4296,7 +4356,7 @@ static int io_remove_buffers(struct io_kiocb *req, unsigned int issue_flags) /* complete before unlock, IOPOLL may need the lock */ __io_req_complete(req, issue_flags, ret, 0); - io_ring_submit_unlock(ctx, !force_nonblock); + io_ring_submit_unlock(ctx, needs_lock); return 0; } @@ -4342,7 +4402,7 @@ static int io_add_buffers(struct io_provide_buf *pbuf, struct io_buffer **head) int i, bid = pbuf->bid; for (i = 0; i < pbuf->nbufs; i++) { - buf = kmalloc(sizeof(*buf), GFP_KERNEL); + buf = kmalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT); if (!buf) break; @@ -4368,9 +4428,9 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) struct io_ring_ctx *ctx = req->ctx; struct io_buffer *head, *list; int ret = 0; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; - io_ring_submit_lock(ctx, !force_nonblock); + io_ring_submit_lock(ctx, needs_lock); lockdep_assert_held(&ctx->uring_lock); @@ -4386,7 +4446,7 @@ static int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) req_set_fail(req); /* complete before unlock, IOPOLL may need the lock */ __io_req_complete(req, issue_flags, ret, 0); - io_ring_submit_unlock(ctx, !force_nonblock); + io_ring_submit_unlock(ctx, needs_lock); return 0; } @@ -4549,12 +4609,16 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (sqe->ioprio || sqe->off || sqe->addr || sqe->len || - sqe->rw_flags || sqe->buf_index || sqe->splice_fd_in) + sqe->rw_flags || sqe->buf_index) return -EINVAL; if (req->flags & REQ_F_FIXED_FILE) return -EBADF; req->close.fd = READ_ONCE(sqe->fd); + req->close.file_slot = READ_ONCE(sqe->file_index); + if (req->close.file_slot && req->close.fd) + return -EINVAL; + return 0; } @@ -4566,6 +4630,11 @@ static int io_close(struct io_kiocb *req, unsigned int issue_flags) struct file *file = NULL; int ret = -EBADF; + if (req->close.file_slot) { + ret = io_close_fixed(req, issue_flags); + goto err; + } + spin_lock(&files->file_lock); fdt = files_fdtable(files); if (close->fd >= fdt->max_fds) { @@ -4710,8 +4779,9 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(!sock)) return -ENOTSOCK; - kmsg = req->async_data; - if (!kmsg) { + if (req_has_async_data(req)) { + kmsg = req->async_data; + } else { ret = io_sendmsg_copy_hdr(req, &iomsg); if (ret) return ret; @@ -4870,23 +4940,16 @@ static int io_recvmsg_copy_hdr(struct io_kiocb *req, } static struct io_buffer *io_recv_buffer_select(struct io_kiocb *req, - bool needs_lock) + unsigned int issue_flags) { struct io_sr_msg *sr = &req->sr_msg; - struct io_buffer *kbuf; - kbuf = io_buffer_select(req, &sr->len, sr->bgid, sr->kbuf, needs_lock); - if (IS_ERR(kbuf)) - return kbuf; - - sr->kbuf = kbuf; - req->flags |= REQ_F_BUFFER_SELECTED; - return kbuf; + return io_buffer_select(req, &sr->len, sr->bgid, issue_flags); } static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req) { - return io_put_kbuf(req, req->sr_msg.kbuf); + return io_put_kbuf(req, req->kbuf); } static int io_recvmsg_prep_async(struct io_kiocb *req) @@ -4934,8 +4997,9 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) if (unlikely(!sock)) return -ENOTSOCK; - kmsg = req->async_data; - if (!kmsg) { + if (req_has_async_data(req)) { + kmsg = req->async_data; + } else { ret = io_recvmsg_copy_hdr(req, &iomsg); if (ret) return ret; @@ -4943,7 +5007,7 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) } if (req->flags & REQ_F_BUFFER_SELECT) { - kbuf = io_recv_buffer_select(req, !force_nonblock); + kbuf = io_recv_buffer_select(req, issue_flags); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); kmsg->fast_iov[0].iov_base = u64_to_user_ptr(kbuf->addr); @@ -4995,7 +5059,7 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags) return -ENOTSOCK; if (req->flags & REQ_F_BUFFER_SELECT) { - kbuf = io_recv_buffer_select(req, !force_nonblock); + kbuf = io_recv_buffer_select(req, issue_flags); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); buf = u64_to_user_ptr(kbuf->addr); @@ -5126,7 +5190,7 @@ static int io_connect(struct io_kiocb *req, unsigned int issue_flags) int ret; bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; - if (req->async_data) { + if (req_has_async_data(req)) { io = req->async_data; } else { ret = move_addr_to_kernel(req->connect.addr, @@ -5142,7 +5206,7 @@ static int io_connect(struct io_kiocb *req, unsigned int issue_flags) ret = __sys_connect_file(req->file, &io->address, req->connect.addr_len, file_flags); if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) { - if (req->async_data) + if (req_has_async_data(req)) return -EAGAIN; if (io_alloc_async_data(req)) { ret = -ENOMEM; @@ -5293,7 +5357,7 @@ static bool __io_poll_complete(struct io_kiocb *req, __poll_t mask) if (req->poll.events & EPOLLONESHOT) flags = 0; if (!io_cqring_fill_event(ctx, req->user_data, error, flags)) { - req->poll.done = true; + req->poll.events |= EPOLLONESHOT; flags = 0; } if (flags & IORING_CQE_F_MORE) @@ -5302,16 +5366,6 @@ static bool __io_poll_complete(struct io_kiocb *req, __poll_t mask) return !(flags & IORING_CQE_F_MORE); } -static inline bool io_poll_complete(struct io_kiocb *req, __poll_t mask) - __must_hold(&req->ctx->completion_lock) -{ - bool done; - - done = __io_poll_complete(req, mask); - io_commit_cqring(req->ctx); - return done; -} - static void io_poll_task_func(struct io_kiocb *req, bool *locked) { struct io_ring_ctx *ctx = req->ctx; @@ -5322,10 +5376,15 @@ static void io_poll_task_func(struct io_kiocb *req, bool *locked) } else { bool done; + if (req->poll.done) { + spin_unlock(&ctx->completion_lock); + return; + } done = __io_poll_complete(req, req->result); if (done) { io_poll_remove_double(req); hash_del(&req->hash_node); + req->poll.done = true; } else { req->result = 0; add_wait_queue(req->poll.head, &req->poll.wait); @@ -5428,7 +5487,10 @@ static void __io_queue_proc(struct io_poll_iocb *poll, struct io_poll_table *pt, io_init_poll_iocb(poll, poll_one->events, io_poll_double_wake); req_ref_get(req); poll->wait.private = req; + *poll_ptr = poll; + if (req->opcode == IORING_OP_POLL_ADD) + req->flags |= REQ_F_ASYNC_DATA; } pt->nr_entries++; @@ -5463,6 +5525,7 @@ static void io_async_task_func(struct io_kiocb *req, bool *locked) hash_del(&req->hash_node); io_poll_remove_double(req); + apoll->poll.done = true; spin_unlock(&ctx->completion_lock); if (!READ_ONCE(apoll->poll.canceled)) @@ -5551,17 +5614,13 @@ static int io_arm_poll_handler(struct io_kiocb *req) struct async_poll *apoll; struct io_poll_table ipt; __poll_t ret, mask = EPOLLONESHOT | POLLERR | POLLPRI; - int rw; - if (!req->file || !file_can_poll(req->file)) - return IO_APOLL_ABORTED; - if (req->flags & REQ_F_POLLED) - return IO_APOLL_ABORTED; if (!def->pollin && !def->pollout) return IO_APOLL_ABORTED; + if (!file_can_poll(req->file) || (req->flags & REQ_F_POLLED)) + return IO_APOLL_ABORTED; if (def->pollin) { - rw = READ; mask |= POLLIN | POLLRDNORM; /* If reading from MSG_ERRQUEUE using recvmsg, ignore POLLIN */ @@ -5569,14 +5628,9 @@ static int io_arm_poll_handler(struct io_kiocb *req) (req->sr_msg.msg_flags & MSG_ERRQUEUE)) mask &= ~POLLIN; } else { - rw = WRITE; mask |= POLLOUT | POLLWRNORM; } - /* if we can't nonblock try, then no point in arming a poll handler */ - if (!io_file_supports_nowait(req, rw)) - return IO_APOLL_ABORTED; - apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); if (unlikely(!apoll)) return IO_APOLL_ABORTED; @@ -5637,8 +5691,8 @@ static bool io_poll_remove_one(struct io_kiocb *req) /* * Returns true if we found and killed one or more poll requests */ -static bool io_poll_remove_all(struct io_ring_ctx *ctx, struct task_struct *tsk, - bool cancel_all) +static __cold bool io_poll_remove_all(struct io_ring_ctx *ctx, + struct task_struct *tsk, bool cancel_all) { struct hlist_node *tmp; struct io_kiocb *req; @@ -5783,6 +5837,7 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) struct io_ring_ctx *ctx = req->ctx; struct io_poll_table ipt; __poll_t mask; + bool done; ipt.pt._qproc = io_poll_queue_proc; @@ -5791,13 +5846,14 @@ static int io_poll_add(struct io_kiocb *req, unsigned int issue_flags) if (mask) { /* no async, we'd stolen it */ ipt.error = 0; - io_poll_complete(req, mask); + done = __io_poll_complete(req, mask); + io_commit_cqring(req->ctx); } spin_unlock(&ctx->completion_lock); if (mask) { io_cqring_ev_posted(ctx); - if (poll->events & EPOLLONESHOT) + if (done) io_put_req(req); } return ipt.error; @@ -5867,7 +5923,10 @@ err: static void io_req_task_timeout(struct io_kiocb *req, bool *locked) { - req_set_fail(req); + struct io_timeout_data *data = req->async_data; + + if (!(data->flags & IORING_TIMEOUT_ETIME_SUCCESS)) + req_set_fail(req); io_req_complete_post(req, -ETIME, 0); } @@ -6073,7 +6132,8 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (off && is_timeout_link) return -EINVAL; flags = READ_ONCE(sqe->timeout_flags); - if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK)) + if (flags & ~(IORING_TIMEOUT_ABS | IORING_TIMEOUT_CLOCK_MASK | + IORING_TIMEOUT_ETIME_SUCCESS)) return -EINVAL; /* more than one clock specified is invalid, obviously */ if (hweight32(flags & IORING_TIMEOUT_CLOCK_MASK) > 1) @@ -6084,7 +6144,9 @@ static int io_timeout_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe, if (unlikely(off && !req->ctx->off_timeout_used)) req->ctx->off_timeout_used = true; - if (!req->async_data && io_alloc_async_data(req)) + if (WARN_ON_ONCE(req_has_async_data(req))) + return -EFAULT; + if (io_alloc_async_data(req)) return -ENOMEM; data = req->async_data; @@ -6241,6 +6303,7 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; u64 sqe_addr = req->cancel.addr; + bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; struct io_tctx_node *node; int ret; @@ -6249,7 +6312,7 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) goto done; /* slow path, try all io-wq's */ - io_ring_submit_lock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); + io_ring_submit_lock(ctx, needs_lock); ret = -ENOENT; list_for_each_entry(node, &ctx->tctx_list, ctx_node) { struct io_uring_task *tctx = node->task->io_uring; @@ -6258,7 +6321,7 @@ static int io_async_cancel(struct io_kiocb *req, unsigned int issue_flags) if (ret != -ENOENT) break; } - io_ring_submit_unlock(ctx, !(issue_flags & IO_URING_F_NONBLOCK)); + io_ring_submit_unlock(ctx, needs_lock); done: if (ret < 0) req_set_fail(req); @@ -6285,22 +6348,20 @@ static int io_rsrc_update_prep(struct io_kiocb *req, static int io_files_update(struct io_kiocb *req, unsigned int issue_flags) { struct io_ring_ctx *ctx = req->ctx; + bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; struct io_uring_rsrc_update2 up; int ret; - if (issue_flags & IO_URING_F_NONBLOCK) - return -EAGAIN; - up.offset = req->rsrc_update.offset; up.data = req->rsrc_update.arg; up.nr = 0; up.tags = 0; up.resv = 0; - mutex_lock(&ctx->uring_lock); + io_ring_submit_lock(ctx, needs_lock); ret = __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, req->rsrc_update.nr_args); - mutex_unlock(&ctx->uring_lock); + io_ring_submit_unlock(ctx, needs_lock); if (ret < 0) req_set_fail(req); @@ -6396,7 +6457,7 @@ static int io_req_prep_async(struct io_kiocb *req) { if (!io_op_defs[req->opcode].needs_async_setup) return 0; - if (WARN_ON_ONCE(req->async_data)) + if (WARN_ON_ONCE(req_has_async_data(req))) return -EFAULT; if (io_alloc_async_data(req)) return -EAGAIN; @@ -6428,68 +6489,39 @@ static u32 io_get_sequence(struct io_kiocb *req) return seq; } -static bool io_drain_req(struct io_kiocb *req) +static __cold void io_drain_req(struct io_kiocb *req) { - struct io_kiocb *pos; struct io_ring_ctx *ctx = req->ctx; struct io_defer_entry *de; int ret; - u32 seq; - - if (req->flags & REQ_F_FAIL) { - io_req_complete_fail_submit(req); - return true; - } - - /* - * If we need to drain a request in the middle of a link, drain the - * head request and the next request/link after the current link. - * Considering sequential execution of links, IOSQE_IO_DRAIN will be - * maintained for every request of our link. - */ - if (ctx->drain_next) { - req->flags |= REQ_F_IO_DRAIN; - ctx->drain_next = false; - } - /* not interested in head, start from the first linked */ - io_for_each_link(pos, req->link) { - if (pos->flags & REQ_F_IO_DRAIN) { - ctx->drain_next = true; - req->flags |= REQ_F_IO_DRAIN; - break; - } - } + u32 seq = io_get_sequence(req); /* Still need defer if there is pending req in defer list. */ - if (likely(list_empty_careful(&ctx->defer_list) && - !(req->flags & REQ_F_IO_DRAIN))) { + if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) { +queue: ctx->drain_active = false; - return false; + io_req_task_queue(req); + return; } - seq = io_get_sequence(req); - /* Still a chance to pass the sequence check */ - if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) - return false; - ret = io_req_prep_async(req); - if (ret) - goto fail; + if (ret) { +fail: + io_req_complete_failed(req, ret); + return; + } io_prep_async_link(req); de = kmalloc(sizeof(*de), GFP_KERNEL); if (!de) { ret = -ENOMEM; -fail: - io_req_complete_failed(req, ret); - return true; + goto fail; } spin_lock(&ctx->completion_lock); if (!req_need_defer(req, seq) && list_empty(&ctx->defer_list)) { spin_unlock(&ctx->completion_lock); kfree(de); - io_queue_async_work(req, NULL); - return true; + goto queue; } trace_io_uring_defer(ctx, req, req->user_data); @@ -6497,23 +6529,13 @@ fail: de->seq = seq; list_add_tail(&de->list, &ctx->defer_list); spin_unlock(&ctx->completion_lock); - return true; } static void io_clean_op(struct io_kiocb *req) { if (req->flags & REQ_F_BUFFER_SELECTED) { - switch (req->opcode) { - case IORING_OP_READV: - case IORING_OP_READ_FIXED: - case IORING_OP_READ: - kfree((void *)(unsigned long)req->rw.addr); - break; - case IORING_OP_RECVMSG: - case IORING_OP_RECV: - kfree(req->sr_msg.kbuf); - break; - } + kfree(req->kbuf); + req->kbuf = NULL; } if (req->flags & REQ_F_NEED_CLEANUP) { @@ -6578,19 +6600,24 @@ static void io_clean_op(struct io_kiocb *req) } if (req->flags & REQ_F_CREDS) put_cred(req->creds); - + if (req->flags & REQ_F_ASYNC_DATA) { + kfree(req->async_data); + req->async_data = NULL; + } req->flags &= ~IO_REQ_CLEAN_FLAGS; } static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) { - struct io_ring_ctx *ctx = req->ctx; const struct cred *creds = NULL; int ret; - if ((req->flags & REQ_F_CREDS) && req->creds != current_cred()) + if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred())) creds = override_creds(req->creds); + if (!io_op_defs[req->opcode].audit_skip) + audit_uring_entry(req->opcode); + switch (req->opcode) { case IORING_OP_NOP: ret = io_nop(req, issue_flags); @@ -6706,13 +6733,16 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) break; } + if (!io_op_defs[req->opcode].audit_skip) + audit_uring_exit(!ret, ret); + if (creds) revert_creds(creds); if (ret) return ret; /* If the op doesn't have a file, we're not polling for it */ - if ((ctx->flags & IORING_SETUP_IOPOLL) && req->file) - io_iopoll_req_issued(req); + if ((req->ctx->flags & IORING_SETUP_IOPOLL) && req->file) + io_iopoll_req_issued(req, issue_flags); return 0; } @@ -6728,6 +6758,8 @@ static struct io_wq_work *io_wq_free_work(struct io_wq_work *work) static void io_wq_submit_work(struct io_wq_work *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); + unsigned int issue_flags = IO_URING_F_UNLOCKED; + bool needs_poll = false; struct io_kiocb *timeout; int ret = 0; @@ -6742,23 +6774,42 @@ static void io_wq_submit_work(struct io_wq_work *work) io_queue_linked_timeout(timeout); /* either cancelled or io-wq is dying, so don't touch tctx->iowq */ - if (work->flags & IO_WQ_WORK_CANCEL) - ret = -ECANCELED; + if (work->flags & IO_WQ_WORK_CANCEL) { + io_req_task_queue_fail(req, -ECANCELED); + return; + } - if (!ret) { - do { - ret = io_issue_sqe(req, 0); - /* - * We can get EAGAIN for polled IO even though we're - * forcing a sync submission from here, since we can't - * wait for request slots on the block side. - */ - if (ret != -EAGAIN) - break; - cond_resched(); - } while (1); + if (req->flags & REQ_F_FORCE_ASYNC) { + const struct io_op_def *def = &io_op_defs[req->opcode]; + bool opcode_poll = def->pollin || def->pollout; + + if (opcode_poll && file_can_poll(req->file)) { + needs_poll = true; + issue_flags |= IO_URING_F_NONBLOCK; + } } + do { + ret = io_issue_sqe(req, issue_flags); + if (ret != -EAGAIN) + break; + /* + * We can get EAGAIN for iopolled IO even though we're + * forcing a sync submission from here, since we can't + * wait for request slots on the block side. + */ + if (!needs_poll) { + cond_resched(); + continue; + } + + if (io_arm_poll_handler(req) == IO_APOLL_OK) + return; + /* aborted or ready, in either case retry blocking */ + needs_poll = false; + issue_flags &= ~IO_URING_F_NONBLOCK; + } while (1); + /* avoid locking problems by failing it from a clean context */ if (ret) io_req_task_queue_fail(req, ret); @@ -6782,12 +6833,7 @@ static void io_fixed_file_set(struct io_fixed_file *file_slot, struct file *file { unsigned long file_ptr = (unsigned long) file; - if (__io_file_supports_nowait(file, READ)) - file_ptr |= FFS_ASYNC_READ; - if (__io_file_supports_nowait(file, WRITE)) - file_ptr |= FFS_ASYNC_WRITE; - if (S_ISREG(file_inode(file)->i_mode)) - file_ptr |= FFS_ISREG; + file_ptr |= io_file_get_flags(file); file_slot->file_ptr = file_ptr; } @@ -6804,8 +6850,8 @@ static inline struct file *io_file_get_fixed(struct io_ring_ctx *ctx, file = (struct file *) (file_ptr & FFS_MASK); file_ptr &= ~FFS_MASK; /* mask in overlapping REQ_F and FFS bits */ - req->flags |= (file_ptr << REQ_F_NOWAIT_READ_BIT); - io_req_set_rsrc_node(req); + req->flags |= (file_ptr << REQ_F_SUPPORT_NOWAIT_BIT); + io_req_set_rsrc_node(req, ctx); return file; } @@ -6897,67 +6943,66 @@ static void io_queue_linked_timeout(struct io_kiocb *req) io_put_req(req); } -static void __io_queue_sqe(struct io_kiocb *req) +static void io_queue_sqe_arm_apoll(struct io_kiocb *req) + __must_hold(&req->ctx->uring_lock) +{ + struct io_kiocb *linked_timeout = io_prep_linked_timeout(req); + + switch (io_arm_poll_handler(req)) { + case IO_APOLL_READY: + if (linked_timeout) { + io_queue_linked_timeout(linked_timeout); + linked_timeout = NULL; + } + io_req_task_queue(req); + break; + case IO_APOLL_ABORTED: + /* + * Queued up for async execution, worker will release + * submit reference when the iocb is actually submitted. + */ + io_queue_async_work(req, NULL); + break; + } + + if (linked_timeout) + io_queue_linked_timeout(linked_timeout); +} + +static inline void __io_queue_sqe(struct io_kiocb *req) __must_hold(&req->ctx->uring_lock) { struct io_kiocb *linked_timeout; int ret; -issue_sqe: ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); + if (req->flags & REQ_F_COMPLETE_INLINE) { + io_req_add_compl_list(req); + return; + } /* * We async punt it if the file wasn't marked NOWAIT, or if the file * doesn't support non-blocking read/write attempts */ if (likely(!ret)) { - if (req->flags & REQ_F_COMPLETE_INLINE) { - struct io_ring_ctx *ctx = req->ctx; - struct io_submit_state *state = &ctx->submit_state; - - state->compl_reqs[state->compl_nr++] = req; - if (state->compl_nr == ARRAY_SIZE(state->compl_reqs)) - io_submit_flush_completions(ctx); - return; - } - linked_timeout = io_prep_linked_timeout(req); if (linked_timeout) io_queue_linked_timeout(linked_timeout); } else if (ret == -EAGAIN && !(req->flags & REQ_F_NOWAIT)) { - linked_timeout = io_prep_linked_timeout(req); - - switch (io_arm_poll_handler(req)) { - case IO_APOLL_READY: - if (linked_timeout) - io_unprep_linked_timeout(req); - goto issue_sqe; - case IO_APOLL_ABORTED: - /* - * Queued up for async execution, worker will release - * submit reference when the iocb is actually submitted. - */ - io_queue_async_work(req, NULL); - break; - } - - if (linked_timeout) - io_queue_linked_timeout(linked_timeout); + io_queue_sqe_arm_apoll(req); } else { io_req_complete_failed(req, ret); } } -static inline void io_queue_sqe(struct io_kiocb *req) +static void io_queue_sqe_fallback(struct io_kiocb *req) __must_hold(&req->ctx->uring_lock) { - if (unlikely(req->ctx->drain_active) && io_drain_req(req)) - return; - - if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)))) { - __io_queue_sqe(req); - } else if (req->flags & REQ_F_FAIL) { + if (req->flags & REQ_F_FAIL) { io_req_complete_fail_submit(req); + } else if (unlikely(req->ctx->drain_active)) { + io_drain_req(req); } else { int ret = io_req_prep_async(req); @@ -6968,6 +7013,15 @@ static inline void io_queue_sqe(struct io_kiocb *req) } } +static inline void io_queue_sqe(struct io_kiocb *req) + __must_hold(&req->ctx->uring_lock) +{ + if (likely(!(req->flags & (REQ_F_FORCE_ASYNC | REQ_F_FAIL)))) + __io_queue_sqe(req); + else + io_queue_sqe_fallback(req); +} + /* * Check SQE restrictions (opcode and flags). * @@ -6977,9 +7031,6 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx, struct io_kiocb *req, unsigned int sqe_flags) { - if (likely(!ctx->restricted)) - return true; - if (!test_bit(req->opcode, ctx->restrictions.sqe_op)) return false; @@ -6994,16 +7045,35 @@ static inline bool io_check_restriction(struct io_ring_ctx *ctx, return true; } +static void io_init_req_drain(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_kiocb *head = ctx->submit_state.link.head; + + ctx->drain_active = true; + if (head) { + /* + * If we need to drain a request in the middle of a link, drain + * the head request and the next request/link after the current + * link. Considering sequential execution of links, + * IOSQE_IO_DRAIN will be maintained for every request of our + * link. + */ + head->flags |= IOSQE_IO_DRAIN | REQ_F_FORCE_ASYNC; + ctx->drain_next = true; + } +} + static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct io_uring_sqe *sqe) __must_hold(&ctx->uring_lock) { - struct io_submit_state *state; unsigned int sqe_flags; - int personality, ret = 0; + int personality; + u8 opcode; /* req is partially pre-initialised, see io_preinit_req() */ - req->opcode = READ_ONCE(sqe->opcode); + req->opcode = opcode = READ_ONCE(sqe->opcode); /* same numerical values with corresponding REQ_F_*, safe to copy */ req->flags = sqe_flags = READ_ONCE(sqe->flags); req->user_data = READ_ONCE(sqe->user_data); @@ -7011,49 +7081,70 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, req->fixed_rsrc_refs = NULL; req->task = current; - /* enforce forwards compatibility on users */ - if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) - return -EINVAL; - if (unlikely(req->opcode >= IORING_OP_LAST)) + if (unlikely(opcode >= IORING_OP_LAST)) { + req->opcode = 0; return -EINVAL; - if (!io_check_restriction(ctx, req, sqe_flags)) - return -EACCES; + } + if (unlikely(sqe_flags & ~SQE_COMMON_FLAGS)) { + /* enforce forwards compatibility on users */ + if (sqe_flags & ~SQE_VALID_FLAGS) + return -EINVAL; + if ((sqe_flags & IOSQE_BUFFER_SELECT) && + !io_op_defs[opcode].buffer_select) + return -EOPNOTSUPP; + if (sqe_flags & IOSQE_IO_DRAIN) + io_init_req_drain(req); + } + if (unlikely(ctx->restricted || ctx->drain_active || ctx->drain_next)) { + if (ctx->restricted && !io_check_restriction(ctx, req, sqe_flags)) + return -EACCES; + /* knock it to the slow queue path, will be drained there */ + if (ctx->drain_active) + req->flags |= REQ_F_FORCE_ASYNC; + /* if there is no link, we're at "next" request and need to drain */ + if (unlikely(ctx->drain_next) && !ctx->submit_state.link.head) { + ctx->drain_next = false; + ctx->drain_active = true; + req->flags |= IOSQE_IO_DRAIN | REQ_F_FORCE_ASYNC; + } + } - if ((sqe_flags & IOSQE_BUFFER_SELECT) && - !io_op_defs[req->opcode].buffer_select) - return -EOPNOTSUPP; - if (unlikely(sqe_flags & IOSQE_IO_DRAIN)) - ctx->drain_active = true; + if (io_op_defs[opcode].needs_file) { + struct io_submit_state *state = &ctx->submit_state; + + /* + * Plug now if we have more than 2 IO left after this, and the + * target is potentially a read/write to block based storage. + */ + if (state->need_plug && io_op_defs[opcode].plug) { + state->plug_started = true; + state->need_plug = false; + blk_start_plug_nr_ios(&state->plug, state->submit_nr); + } + + req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd), + (sqe_flags & IOSQE_FIXED_FILE)); + if (unlikely(!req->file)) + return -EBADF; + } personality = READ_ONCE(sqe->personality); if (personality) { + int ret; + req->creds = xa_load(&ctx->personalities, personality); if (!req->creds) return -EINVAL; get_cred(req->creds); + ret = security_uring_override_creds(req->creds); + if (ret) { + put_cred(req->creds); + return ret; + } req->flags |= REQ_F_CREDS; } - state = &ctx->submit_state; - /* - * Plug now if we have more than 1 IO left after this, and the target - * is potentially a read/write to block based storage. - */ - if (!state->plug_started && state->ios_left > 1 && - io_op_defs[req->opcode].plug) { - blk_start_plug(&state->plug); - state->plug_started = true; - } - - if (io_op_defs[req->opcode].needs_file) { - req->file = io_file_get(ctx, req, READ_ONCE(sqe->fd), - (sqe_flags & IOSQE_FIXED_FILE)); - if (unlikely(!req->file)) - ret = -EBADF; - } - - state->ios_left--; - return ret; + return io_req_prep(req, sqe); } static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, @@ -7065,7 +7156,8 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, ret = io_init_req(ctx, req, sqe); if (unlikely(ret)) { -fail_req: + trace_io_uring_req_failed(sqe, ret); + /* fail even hard links since we don't submit */ if (link->head) { /* @@ -7088,10 +7180,6 @@ fail_req: return ret; } req_fail_link_node(req, ret); - } else { - ret = io_req_prep(req, sqe); - if (unlikely(ret)) - goto fail_req; } /* don't need @sqe from now on */ @@ -7121,33 +7209,32 @@ fail_req: link->last->link = req; link->last = req; + if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) + return 0; /* last request of a link, enqueue the link */ - if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) { - link->head = NULL; - io_queue_sqe(head); - } - } else { - if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { - link->head = req; - link->last = req; - } else { - io_queue_sqe(req); - } + link->head = NULL; + req = head; + } else if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { + link->head = req; + link->last = req; + return 0; } + io_queue_sqe(req); return 0; } /* * Batched submission is done, ensure local IO is flushed out. */ -static void io_submit_state_end(struct io_submit_state *state, - struct io_ring_ctx *ctx) +static void io_submit_state_end(struct io_ring_ctx *ctx) { + struct io_submit_state *state = &ctx->submit_state; + if (state->link.head) io_queue_sqe(state->link.head); - if (state->compl_nr) - io_submit_flush_completions(ctx); + /* flush only after queuing links as they can generate completions */ + io_submit_flush_completions(ctx); if (state->plug_started) blk_finish_plug(&state->plug); } @@ -7159,7 +7246,8 @@ static void io_submit_state_start(struct io_submit_state *state, unsigned int max_ios) { state->plug_started = false; - state->ios_left = max_ios; + state->need_plug = max_ios > 2; + state->submit_nr = max_ios; /* set only head, no need to init link_last in advance */ state->link.head = NULL; } @@ -7211,45 +7299,45 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) __must_hold(&ctx->uring_lock) { + unsigned int entries = io_sqring_entries(ctx); int submitted = 0; + if (unlikely(!entries)) + return 0; /* make sure SQ entry isn't read before tail */ - nr = min3(nr, ctx->sq_entries, io_sqring_entries(ctx)); - if (!percpu_ref_tryget_many(&ctx->refs, nr)) - return -EAGAIN; + nr = min3(nr, ctx->sq_entries, entries); io_get_task_refs(nr); io_submit_state_start(&ctx->submit_state, nr); - while (submitted < nr) { + do { const struct io_uring_sqe *sqe; struct io_kiocb *req; - req = io_alloc_req(ctx); - if (unlikely(!req)) { + if (unlikely(!io_alloc_req_refill(ctx))) { if (!submitted) submitted = -EAGAIN; break; } + req = io_alloc_req(ctx); sqe = io_get_sqe(ctx); if (unlikely(!sqe)) { - list_add(&req->inflight_entry, &ctx->submit_state.free_list); + wq_stack_add_head(&req->comp_list, &ctx->submit_state.free_list); break; } /* will complete beyond this point, count as submitted */ submitted++; if (io_submit_sqe(ctx, req, sqe)) break; - } + } while (submitted < nr); if (unlikely(submitted != nr)) { int ref_used = (submitted == -EAGAIN) ? 0 : submitted; int unused = nr - ref_used; current->io_uring->cached_refs += unused; - percpu_ref_put_many(&ctx->refs, unused); } - io_submit_state_end(&ctx->submit_state, ctx); + io_submit_state_end(ctx); /* Commit SQ ring head once we've consumed and submitted all SQEs */ io_commit_sqring(ctx); @@ -7288,16 +7376,15 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) if (cap_entries && to_submit > IORING_SQPOLL_CAP_ENTRIES_VALUE) to_submit = IORING_SQPOLL_CAP_ENTRIES_VALUE; - if (!list_empty(&ctx->iopoll_list) || to_submit) { - unsigned nr_events = 0; + if (!wq_list_empty(&ctx->iopoll_list) || to_submit) { const struct cred *creds = NULL; if (ctx->sq_creds != current_cred()) creds = override_creds(ctx->sq_creds); mutex_lock(&ctx->uring_lock); - if (!list_empty(&ctx->iopoll_list)) - io_do_iopoll(ctx, &nr_events, 0); + if (!wq_list_empty(&ctx->iopoll_list)) + io_do_iopoll(ctx, true); /* * Don't submit if refs are dying, good for io_uring_register(), @@ -7317,7 +7404,7 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries) return ret; } -static void io_sqd_update_thread_idle(struct io_sq_data *sqd) +static __cold void io_sqd_update_thread_idle(struct io_sq_data *sqd) { struct io_ring_ctx *ctx; unsigned sq_thread_idle = 0; @@ -7360,6 +7447,8 @@ static int io_sq_thread(void *data) set_cpus_allowed_ptr(current, cpu_online_mask); current->flags |= PF_NO_SETAFFINITY; + audit_alloc_kernel(current); + mutex_lock(&sqd->lock); while (1) { bool cap_entries, sqt_spin = false; @@ -7374,7 +7463,7 @@ static int io_sq_thread(void *data) list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { int ret = __io_sq_thread(ctx, cap_entries); - if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list))) + if (!sqt_spin && (ret > 0 || !wq_list_empty(&ctx->iopoll_list))) sqt_spin = true; } if (io_run_task_work()) @@ -7395,7 +7484,7 @@ static int io_sq_thread(void *data) io_ring_set_wakeup_flag(ctx); if ((ctx->flags & IORING_SETUP_IOPOLL) && - !list_empty_careful(&ctx->iopoll_list)) { + !wq_list_empty(&ctx->iopoll_list)) { needs_sched = false; break; } @@ -7425,6 +7514,8 @@ static int io_sq_thread(void *data) io_run_task_work(); mutex_unlock(&sqd->lock); + audit_free(current); + complete(&sqd->exited); do_exit(0); } @@ -7515,6 +7606,14 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, break; } while (1); + if (uts) { + struct timespec64 ts; + + if (get_timespec64(&ts, uts)) + return -EFAULT; + timeout = timespec64_to_jiffies(&ts); + } + if (sig) { #ifdef CONFIG_COMPAT if (in_compat_syscall()) @@ -7528,14 +7627,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return ret; } - if (uts) { - struct timespec64 ts; - - if (get_timespec64(&ts, uts)) - return -EFAULT; - timeout = timespec64_to_jiffies(&ts); - } - init_waitqueue_func_entry(&iowq.wq, io_wake_function); iowq.wq.private = current; INIT_LIST_HEAD(&iowq.wq.entry); @@ -7571,7 +7662,7 @@ static void io_free_page_table(void **table, size_t size) kfree(table); } -static void **io_alloc_page_table(size_t size) +static __cold void **io_alloc_page_table(size_t size) { unsigned i, nr_tables = DIV_ROUND_UP(size, PAGE_SIZE); size_t init_size = size; @@ -7600,7 +7691,7 @@ static void io_rsrc_node_destroy(struct io_rsrc_node *ref_node) kfree(ref_node); } -static void io_rsrc_node_ref_zero(struct percpu_ref *ref) +static __cold void io_rsrc_node_ref_zero(struct percpu_ref *ref) { struct io_rsrc_node *node = container_of(ref, struct io_rsrc_node, refs); struct io_ring_ctx *ctx = node->rsrc_data->ctx; @@ -7646,10 +7737,13 @@ static struct io_rsrc_node *io_rsrc_node_alloc(struct io_ring_ctx *ctx) static void io_rsrc_node_switch(struct io_ring_ctx *ctx, struct io_rsrc_data *data_to_kill) + __must_hold(&ctx->uring_lock) { WARN_ON_ONCE(!ctx->rsrc_backup_node); WARN_ON_ONCE(data_to_kill && !ctx->rsrc_node); + io_rsrc_refs_drop(ctx); + if (data_to_kill) { struct io_rsrc_node *rsrc_node = ctx->rsrc_node; @@ -7677,7 +7771,8 @@ static int io_rsrc_node_switch_start(struct io_ring_ctx *ctx) return ctx->rsrc_backup_node ? 0 : -ENOMEM; } -static int io_rsrc_ref_quiesce(struct io_rsrc_data *data, struct io_ring_ctx *ctx) +static __cold int io_rsrc_ref_quiesce(struct io_rsrc_data *data, + struct io_ring_ctx *ctx) { int ret; @@ -7733,9 +7828,9 @@ static void io_rsrc_data_free(struct io_rsrc_data *data) kfree(data); } -static int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put, - u64 __user *utags, unsigned nr, - struct io_rsrc_data **pdata) +static __cold int io_rsrc_data_alloc(struct io_ring_ctx *ctx, rsrc_put_fn *do_put, + u64 __user *utags, unsigned nr, + struct io_rsrc_data **pdata) { struct io_rsrc_data *data; int ret = -ENOMEM; @@ -8284,15 +8379,31 @@ static int io_sqe_file_register(struct io_ring_ctx *ctx, struct file *file, #endif } +static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, + struct io_rsrc_node *node, void *rsrc) +{ + struct io_rsrc_put *prsrc; + + prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL); + if (!prsrc) + return -ENOMEM; + + prsrc->tag = *io_get_tag_slot(data, idx); + prsrc->rsrc = rsrc; + list_add(&prsrc->list, &node->rsrc_list); + return 0; +} + static int io_install_fixed_file(struct io_kiocb *req, struct file *file, unsigned int issue_flags, u32 slot_index) { struct io_ring_ctx *ctx = req->ctx; - bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; + bool needs_switch = false; struct io_fixed_file *file_slot; int ret = -EBADF; - io_ring_submit_lock(ctx, !force_nonblock); + io_ring_submit_lock(ctx, needs_lock); if (file->f_op == &io_uring_fops) goto err; ret = -ENXIO; @@ -8304,9 +8415,22 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file, slot_index = array_index_nospec(slot_index, ctx->nr_user_files); file_slot = io_fixed_file_slot(&ctx->file_table, slot_index); - ret = -EBADF; - if (file_slot->file_ptr) - goto err; + + if (file_slot->file_ptr) { + struct file *old_file; + + ret = io_rsrc_node_switch_start(ctx); + if (ret) + goto err; + + old_file = (struct file *)(file_slot->file_ptr & FFS_MASK); + ret = io_queue_rsrc_removal(ctx->file_data, slot_index, + ctx->rsrc_node, old_file); + if (ret) + goto err; + file_slot->file_ptr = 0; + needs_switch = true; + } *io_get_tag_slot(ctx->file_data, slot_index) = 0; io_fixed_file_set(file_slot, file); @@ -8318,25 +8442,51 @@ static int io_install_fixed_file(struct io_kiocb *req, struct file *file, ret = 0; err: - io_ring_submit_unlock(ctx, !force_nonblock); + if (needs_switch) + io_rsrc_node_switch(ctx, ctx->file_data); + io_ring_submit_unlock(ctx, needs_lock); if (ret) fput(file); return ret; } -static int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx, - struct io_rsrc_node *node, void *rsrc) +static int io_close_fixed(struct io_kiocb *req, unsigned int issue_flags) { - struct io_rsrc_put *prsrc; + unsigned int offset = req->close.file_slot - 1; + struct io_ring_ctx *ctx = req->ctx; + bool needs_lock = issue_flags & IO_URING_F_UNLOCKED; + struct io_fixed_file *file_slot; + struct file *file; + int ret, i; - prsrc = kzalloc(sizeof(*prsrc), GFP_KERNEL); - if (!prsrc) - return -ENOMEM; + io_ring_submit_lock(ctx, needs_lock); + ret = -ENXIO; + if (unlikely(!ctx->file_data)) + goto out; + ret = -EINVAL; + if (offset >= ctx->nr_user_files) + goto out; + ret = io_rsrc_node_switch_start(ctx); + if (ret) + goto out; - prsrc->tag = *io_get_tag_slot(data, idx); - prsrc->rsrc = rsrc; - list_add(&prsrc->list, &node->rsrc_list); - return 0; + i = array_index_nospec(offset, ctx->nr_user_files); + file_slot = io_fixed_file_slot(&ctx->file_table, i); + ret = -EBADF; + if (!file_slot->file_ptr) + goto out; + + file = (struct file *)(file_slot->file_ptr & FFS_MASK); + ret = io_queue_rsrc_removal(ctx->file_data, offset, ctx->rsrc_node, file); + if (ret) + goto out; + + file_slot->file_ptr = 0; + io_rsrc_node_switch(ctx, ctx->file_data); + ret = 0; +out: + io_ring_submit_unlock(ctx, needs_lock); + return ret; } static int __io_sqe_files_update(struct io_ring_ctx *ctx, @@ -8451,8 +8601,8 @@ static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx, return io_wq_create(concurrency, &data); } -static int io_uring_alloc_task_context(struct task_struct *task, - struct io_ring_ctx *ctx) +static __cold int io_uring_alloc_task_context(struct task_struct *task, + struct io_ring_ctx *ctx) { struct io_uring_task *tctx; int ret; @@ -8499,8 +8649,8 @@ void __io_uring_free(struct task_struct *tsk) tsk->io_uring = NULL; } -static int io_sq_offload_create(struct io_ring_ctx *ctx, - struct io_uring_params *p) +static __cold int io_sq_offload_create(struct io_ring_ctx *ctx, + struct io_uring_params *p) { int ret; @@ -8523,6 +8673,10 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx, struct io_sq_data *sqd; bool attached; + ret = security_uring_sqpoll(); + if (ret) + return ret; + sqd = io_get_sq_data(p, &attached); if (IS_ERR(sqd)) { ret = PTR_ERR(sqd); @@ -9105,33 +9259,31 @@ static void io_destroy_buffers(struct io_ring_ctx *ctx) struct io_buffer *buf; unsigned long index; - xa_for_each(&ctx->io_buffers, index, buf) + xa_for_each(&ctx->io_buffers, index, buf) { __io_remove_buffers(ctx, buf, index, -1U); -} - -static void io_req_cache_free(struct list_head *list) -{ - struct io_kiocb *req, *nxt; - - list_for_each_entry_safe(req, nxt, list, inflight_entry) { - list_del(&req->inflight_entry); - kmem_cache_free(req_cachep, req); + cond_resched(); } } static void io_req_caches_free(struct io_ring_ctx *ctx) { struct io_submit_state *state = &ctx->submit_state; + int nr = 0; mutex_lock(&ctx->uring_lock); + io_flush_cached_locked_reqs(ctx, state); - if (state->free_reqs) { - kmem_cache_free_bulk(req_cachep, state->free_reqs, state->reqs); - state->free_reqs = 0; - } + while (state->free_list.next) { + struct io_wq_work_node *node; + struct io_kiocb *req; - io_flush_cached_locked_reqs(ctx, state); - io_req_cache_free(&state->free_list); + node = wq_stack_extract(&state->free_list); + req = container_of(node, struct io_kiocb, comp_list); + kmem_cache_free(req_cachep, req); + nr++; + } + if (nr) + percpu_ref_put_many(&ctx->refs, nr); mutex_unlock(&ctx->uring_lock); } @@ -9141,7 +9293,7 @@ static void io_wait_rsrc_data(struct io_rsrc_data *data) wait_for_completion(&data->done); } -static void io_ring_ctx_free(struct io_ring_ctx *ctx) +static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx) { io_sq_thread_finish(ctx); @@ -9150,6 +9302,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) ctx->mm_account = NULL; } + io_rsrc_refs_drop(ctx); /* __io_rsrc_put_work() may need uring_lock to progress, wait w/o it */ io_wait_rsrc_data(ctx->buf_data); io_wait_rsrc_data(ctx->file_data); @@ -9173,6 +9326,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) if (ctx->rsrc_backup_node) io_rsrc_node_destroy(ctx->rsrc_backup_node); flush_delayed_work(&ctx->rsrc_put_work); + flush_delayed_work(&ctx->fallback_work); WARN_ON_ONCE(!list_empty(&ctx->rsrc_ref_list)); WARN_ON_ONCE(!llist_empty(&ctx->rsrc_put_llist)); @@ -9203,7 +9357,7 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) struct io_ring_ctx *ctx = file->private_data; __poll_t mask = 0; - poll_wait(file, &ctx->poll_wait, wait); + poll_wait(file, &ctx->cq_wait, wait); /* * synchronizes with barrier from wq_has_sleeper call in * io_commit_cqring @@ -9231,13 +9385,6 @@ static __poll_t io_uring_poll(struct file *file, poll_table *wait) return mask; } -static int io_uring_fasync(int fd, struct file *file, int on) -{ - struct io_ring_ctx *ctx = file->private_data; - - return fasync_helper(fd, file, on, &ctx->cq_fasync); -} - static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id) { const struct cred *creds; @@ -9257,7 +9404,7 @@ struct io_tctx_exit { struct io_ring_ctx *ctx; }; -static void io_tctx_exit_cb(struct callback_head *cb) +static __cold void io_tctx_exit_cb(struct callback_head *cb) { struct io_uring_task *tctx = current->io_uring; struct io_tctx_exit *work; @@ -9272,14 +9419,14 @@ static void io_tctx_exit_cb(struct callback_head *cb) complete(&work->completion); } -static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data) +static __cold bool io_cancel_ctx_cb(struct io_wq_work *work, void *data) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); return req->ctx == data; } -static void io_ring_exit_work(struct work_struct *work) +static __cold void io_ring_exit_work(struct work_struct *work) { struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, exit_work); unsigned long timeout = jiffies + HZ * 60 * 5; @@ -9308,6 +9455,8 @@ static void io_ring_exit_work(struct work_struct *work) io_sq_thread_unpark(sqd); } + io_req_caches_free(ctx); + if (WARN_ON_ONCE(time_after(jiffies, timeout))) { /* there is little hope left, don't run it too often */ interval = HZ * 60; @@ -9334,7 +9483,6 @@ static void io_ring_exit_work(struct work_struct *work) ret = task_work_add(node->task, &exit.task_work, TWA_SIGNAL); if (WARN_ON_ONCE(ret)) continue; - wake_up_process(node->task); mutex_unlock(&ctx->uring_lock); wait_for_completion(&exit.completion); @@ -9348,8 +9496,8 @@ static void io_ring_exit_work(struct work_struct *work) } /* Returns true if we found and killed one or more timeouts */ -static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, - bool cancel_all) +static __cold bool io_kill_timeouts(struct io_ring_ctx *ctx, + struct task_struct *tsk, bool cancel_all) { struct io_kiocb *req, *tmp; int canceled = 0; @@ -9371,7 +9519,7 @@ static bool io_kill_timeouts(struct io_ring_ctx *ctx, struct task_struct *tsk, return canceled != 0; } -static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) +static __cold void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) { unsigned long index; struct creds *creds; @@ -9433,8 +9581,9 @@ static bool io_cancel_task_cb(struct io_wq_work *work, void *data) return ret; } -static bool io_cancel_defer_files(struct io_ring_ctx *ctx, - struct task_struct *task, bool cancel_all) +static __cold bool io_cancel_defer_files(struct io_ring_ctx *ctx, + struct task_struct *task, + bool cancel_all) { struct io_defer_entry *de; LIST_HEAD(list); @@ -9459,7 +9608,7 @@ static bool io_cancel_defer_files(struct io_ring_ctx *ctx, return true; } -static bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) +static __cold bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) { struct io_tctx_node *node; enum io_wq_cancel cret; @@ -9483,9 +9632,9 @@ static bool io_uring_try_cancel_iowq(struct io_ring_ctx *ctx) return ret; } -static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, - struct task_struct *task, - bool cancel_all) +static __cold void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, + struct task_struct *task, + bool cancel_all) { struct io_task_cancel cancel = { .task = task, .all = cancel_all, }; struct io_uring_task *tctx = task ? task->io_uring : NULL; @@ -9509,7 +9658,7 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, /* SQPOLL thread does its own polling */ if ((!(ctx->flags & IORING_SETUP_SQPOLL) && cancel_all) || (ctx->sq_data && ctx->sq_data->thread == current)) { - while (!list_empty_careful(&ctx->iopoll_list)) { + while (!wq_list_empty(&ctx->iopoll_list)) { io_iopoll_try_reap_events(ctx); ret = true; } @@ -9536,7 +9685,16 @@ static int __io_uring_add_tctx_node(struct io_ring_ctx *ctx) ret = io_uring_alloc_task_context(current, ctx); if (unlikely(ret)) return ret; + tctx = current->io_uring; + if (ctx->iowq_limits_set) { + unsigned int limits[2] = { ctx->iowq_limits[0], + ctx->iowq_limits[1], }; + + ret = io_wq_max_workers(tctx->io_wq, limits); + if (ret) + return ret; + } } if (!xa_load(&tctx->xa, (unsigned long)ctx)) { node = kmalloc(sizeof(*node), GFP_KERNEL); @@ -9575,7 +9733,7 @@ static inline int io_uring_add_tctx_node(struct io_ring_ctx *ctx) /* * Remove this io_uring_file -> task mapping. */ -static void io_uring_del_tctx_node(unsigned long index) +static __cold void io_uring_del_tctx_node(unsigned long index) { struct io_uring_task *tctx = current->io_uring; struct io_tctx_node *node; @@ -9598,14 +9756,16 @@ static void io_uring_del_tctx_node(unsigned long index) kfree(node); } -static void io_uring_clean_tctx(struct io_uring_task *tctx) +static __cold void io_uring_clean_tctx(struct io_uring_task *tctx) { struct io_wq *wq = tctx->io_wq; struct io_tctx_node *node; unsigned long index; - xa_for_each(&tctx->xa, index, node) + xa_for_each(&tctx->xa, index, node) { io_uring_del_tctx_node(index); + cond_resched(); + } if (wq) { /* * Must be after io_uring_del_task_file() (removes nodes under @@ -9623,7 +9783,7 @@ static s64 tctx_inflight(struct io_uring_task *tctx, bool tracked) return percpu_counter_sum(&tctx->inflight); } -static void io_uring_drop_tctx_refs(struct task_struct *task) +static __cold void io_uring_drop_tctx_refs(struct task_struct *task) { struct io_uring_task *tctx = task->io_uring; unsigned int refs = tctx->cached_refs; @@ -9639,7 +9799,8 @@ static void io_uring_drop_tctx_refs(struct task_struct *task) * Find any io_uring ctx that this task has registered or done IO on, and cancel * requests. @sqd should be not-null IIF it's an SQPOLL thread cancellation. */ -static void io_uring_cancel_generic(bool cancel_all, struct io_sq_data *sqd) +static __cold void io_uring_cancel_generic(bool cancel_all, + struct io_sq_data *sqd) { struct io_uring_task *tctx = current->io_uring; struct io_ring_ctx *ctx; @@ -9732,7 +9893,7 @@ static void *io_uring_validate_mmap_request(struct file *file, #ifdef CONFIG_MMU -static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) +static __cold int io_uring_mmap(struct file *file, struct vm_area_struct *vma) { size_t sz = vma->vm_end - vma->vm_start; unsigned long pfn; @@ -9917,7 +10078,7 @@ out_fput: } #ifdef CONFIG_PROC_FS -static int io_uring_show_cred(struct seq_file *m, unsigned int id, +static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id, const struct cred *cred) { struct user_namespace *uns = seq_user_ns(m); @@ -9949,11 +10110,59 @@ static int io_uring_show_cred(struct seq_file *m, unsigned int id, return 0; } -static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) +static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, + struct seq_file *m) { struct io_sq_data *sq = NULL; + struct io_overflow_cqe *ocqe; + struct io_rings *r = ctx->rings; + unsigned int sq_mask = ctx->sq_entries - 1, cq_mask = ctx->cq_entries - 1; + unsigned int sq_head = READ_ONCE(r->sq.head); + unsigned int sq_tail = READ_ONCE(r->sq.tail); + unsigned int cq_head = READ_ONCE(r->cq.head); + unsigned int cq_tail = READ_ONCE(r->cq.tail); + unsigned int sq_entries, cq_entries; bool has_lock; - int i; + unsigned int i; + + /* + * we may get imprecise sqe and cqe info if uring is actively running + * since we get cached_sq_head and cached_cq_tail without uring_lock + * and sq_tail and cq_head are changed by userspace. But it's ok since + * we usually use these info when it is stuck. + */ + seq_printf(m, "SqMask:\t\t0x%x\n", sq_mask); + seq_printf(m, "SqHead:\t%u\n", sq_head); + seq_printf(m, "SqTail:\t%u\n", sq_tail); + seq_printf(m, "CachedSqHead:\t%u\n", ctx->cached_sq_head); + seq_printf(m, "CqMask:\t0x%x\n", cq_mask); + seq_printf(m, "CqHead:\t%u\n", cq_head); + seq_printf(m, "CqTail:\t%u\n", cq_tail); + seq_printf(m, "CachedCqTail:\t%u\n", ctx->cached_cq_tail); + seq_printf(m, "SQEs:\t%u\n", sq_tail - ctx->cached_sq_head); + sq_entries = min(sq_tail - sq_head, ctx->sq_entries); + for (i = 0; i < sq_entries; i++) { + unsigned int entry = i + sq_head; + unsigned int sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]); + struct io_uring_sqe *sqe = &ctx->sq_sqes[sq_idx]; + + if (sq_idx > sq_mask) + continue; + sqe = &ctx->sq_sqes[sq_idx]; + seq_printf(m, "%5u: opcode:%d, fd:%d, flags:%x, user_data:%llu\n", + sq_idx, sqe->opcode, sqe->fd, sqe->flags, + sqe->user_data); + } + seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head); + cq_entries = min(cq_tail - cq_head, ctx->cq_entries); + for (i = 0; i < cq_entries; i++) { + unsigned int entry = i + cq_head; + struct io_uring_cqe *cqe = &r->cqes[entry & cq_mask]; + + seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x\n", + entry & cq_mask, cqe->user_data, cqe->res, + cqe->flags); + } /* * Avoid ABBA deadlock between the seq lock and the io_uring mutex, @@ -9995,7 +10204,10 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) xa_for_each(&ctx->personalities, index, cred) io_uring_show_cred(m, index, cred); } - seq_printf(m, "PollList:\n"); + if (has_lock) + mutex_unlock(&ctx->uring_lock); + + seq_puts(m, "PollList:\n"); spin_lock(&ctx->completion_lock); for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) { struct hlist_head *list = &ctx->cancel_hash[i]; @@ -10005,12 +10217,20 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) seq_printf(m, " op=%d, task_works=%d\n", req->opcode, req->task->task_works != NULL); } + + seq_puts(m, "CqOverflowList:\n"); + list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) { + struct io_uring_cqe *cqe = &ocqe->cqe; + + seq_printf(m, " user_data=%llu, res=%d, flags=%x\n", + cqe->user_data, cqe->res, cqe->flags); + + } + spin_unlock(&ctx->completion_lock); - if (has_lock) - mutex_unlock(&ctx->uring_lock); } -static void io_uring_show_fdinfo(struct seq_file *m, struct file *f) +static __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f) { struct io_ring_ctx *ctx = f->private_data; @@ -10029,14 +10249,13 @@ static const struct file_operations io_uring_fops = { .mmap_capabilities = io_uring_nommu_mmap_capabilities, #endif .poll = io_uring_poll, - .fasync = io_uring_fasync, #ifdef CONFIG_PROC_FS .show_fdinfo = io_uring_show_fdinfo, #endif }; -static int io_allocate_scq_urings(struct io_ring_ctx *ctx, - struct io_uring_params *p) +static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, + struct io_uring_params *p) { struct io_rings *rings; size_t size, sq_array_offset; @@ -10112,8 +10331,8 @@ static struct file *io_uring_get_file(struct io_ring_ctx *ctx) return ERR_PTR(ret); #endif - file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx, - O_RDWR | O_CLOEXEC); + file = anon_inode_getfile_secure("[io_uring]", &io_uring_fops, ctx, + O_RDWR | O_CLOEXEC, NULL); #if defined(CONFIG_UNIX) if (IS_ERR(file)) { sock_release(ctx->ring_sock); @@ -10125,8 +10344,8 @@ static struct file *io_uring_get_file(struct io_ring_ctx *ctx) return file; } -static int io_uring_create(unsigned entries, struct io_uring_params *p, - struct io_uring_params __user *params) +static __cold int io_uring_create(unsigned entries, struct io_uring_params *p, + struct io_uring_params __user *params) { struct io_ring_ctx *ctx; struct file *file; @@ -10284,7 +10503,8 @@ SYSCALL_DEFINE2(io_uring_setup, u32, entries, return io_uring_setup(entries, params); } -static int io_probe(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) +static __cold int io_probe(struct io_ring_ctx *ctx, void __user *arg, + unsigned nr_args) { struct io_uring_probe *p; size_t size; @@ -10340,8 +10560,8 @@ static int io_register_personality(struct io_ring_ctx *ctx) return id; } -static int io_register_restrictions(struct io_ring_ctx *ctx, void __user *arg, - unsigned int nr_args) +static __cold int io_register_restrictions(struct io_ring_ctx *ctx, + void __user *arg, unsigned int nr_args) { struct io_uring_restriction *res; size_t size; @@ -10475,7 +10695,7 @@ static int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, return __io_register_rsrc_update(ctx, type, &up, up.nr); } -static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, +static __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, unsigned int size, unsigned int type) { struct io_uring_rsrc_register rr; @@ -10501,8 +10721,8 @@ static int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, return -EINVAL; } -static int io_register_iowq_aff(struct io_ring_ctx *ctx, void __user *arg, - unsigned len) +static __cold int io_register_iowq_aff(struct io_ring_ctx *ctx, + void __user *arg, unsigned len) { struct io_uring_task *tctx = current->io_uring; cpumask_var_t new_mask; @@ -10528,7 +10748,7 @@ static int io_register_iowq_aff(struct io_ring_ctx *ctx, void __user *arg, return ret; } -static int io_unregister_iowq_aff(struct io_ring_ctx *ctx) +static __cold int io_unregister_iowq_aff(struct io_ring_ctx *ctx) { struct io_uring_task *tctx = current->io_uring; @@ -10538,9 +10758,11 @@ static int io_unregister_iowq_aff(struct io_ring_ctx *ctx) return io_wq_cpu_affinity(tctx->io_wq, NULL); } -static int io_register_iowq_max_workers(struct io_ring_ctx *ctx, - void __user *arg) +static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx, + void __user *arg) + __must_hold(&ctx->uring_lock) { + struct io_tctx_node *node; struct io_uring_task *tctx = NULL; struct io_sq_data *sqd = NULL; __u32 new_count[2]; @@ -10560,33 +10782,61 @@ static int io_register_iowq_max_workers(struct io_ring_ctx *ctx, * ordering. Fine to drop uring_lock here, we hold * a ref to the ctx. */ + refcount_inc(&sqd->refs); mutex_unlock(&ctx->uring_lock); mutex_lock(&sqd->lock); mutex_lock(&ctx->uring_lock); - tctx = sqd->thread->io_uring; + if (sqd->thread) + tctx = sqd->thread->io_uring; } } else { tctx = current->io_uring; } - ret = -EINVAL; - if (!tctx || !tctx->io_wq) - goto err; + BUILD_BUG_ON(sizeof(new_count) != sizeof(ctx->iowq_limits)); - ret = io_wq_max_workers(tctx->io_wq, new_count); - if (ret) - goto err; + memcpy(ctx->iowq_limits, new_count, sizeof(new_count)); + ctx->iowq_limits_set = true; - if (sqd) + ret = -EINVAL; + if (tctx && tctx->io_wq) { + ret = io_wq_max_workers(tctx->io_wq, new_count); + if (ret) + goto err; + } else { + memset(new_count, 0, sizeof(new_count)); + } + + if (sqd) { mutex_unlock(&sqd->lock); + io_put_sq_data(sqd); + } if (copy_to_user(arg, new_count, sizeof(new_count))) return -EFAULT; + /* that's it for SQPOLL, only the SQPOLL task creates requests */ + if (sqd) + return 0; + + /* now propagate the restriction to all registered users */ + list_for_each_entry(node, &ctx->tctx_list, ctx_node) { + struct io_uring_task *tctx = node->task->io_uring; + + if (WARN_ON_ONCE(!tctx->io_wq)) + continue; + + for (i = 0; i < ARRAY_SIZE(new_count); i++) + new_count[i] = ctx->iowq_limits[i]; + /* ignore errors, it always returns zero anyway */ + (void)io_wq_max_workers(tctx->io_wq, new_count); + } return 0; err: - if (sqd) + if (sqd) { mutex_unlock(&sqd->lock); + io_put_sq_data(sqd); + } return ret; } @@ -10614,7 +10864,7 @@ static bool io_register_op_must_quiesce(int op) } } -static int io_ctx_quiesce(struct io_ring_ctx *ctx) +static __cold int io_ctx_quiesce(struct io_ring_ctx *ctx) { long ret; @@ -10629,10 +10879,14 @@ static int io_ctx_quiesce(struct io_ring_ctx *ctx) */ mutex_unlock(&ctx->uring_lock); do { - ret = wait_for_completion_interruptible(&ctx->ref_comp); - if (!ret) + ret = wait_for_completion_interruptible_timeout(&ctx->ref_comp, HZ); + if (ret) { + ret = min(0L, ret); break; + } + ret = io_run_task_work_sig(); + io_req_caches_free(ctx); } while (ret >= 0); mutex_lock(&ctx->uring_lock); @@ -10863,6 +11117,8 @@ static int __init io_uring_init(void) /* should fit into one byte */ BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8)); + BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8)); + BUILD_BUG_ON((SQE_VALID_FLAGS | SQE_COMMON_FLAGS) != SQE_VALID_FLAGS); BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST); BUILD_BUG_ON(__REQ_F_LAST_BIT > 8 * sizeof(int)); diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 9cc5798423d1..1753c26c8e76 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -750,7 +750,7 @@ again: * same page as we're writing to, without it being marked * up-to-date. */ - if (unlikely(iov_iter_fault_in_readable(i, bytes))) { + if (unlikely(fault_in_iov_iter_readable(i, bytes))) { status = -EFAULT; break; } diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 4ecd255e0511..b4dc51063d36 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -31,6 +31,7 @@ struct iomap_dio { atomic_t ref; unsigned flags; int error; + size_t done_before; bool wait_for_completion; union { @@ -38,8 +39,7 @@ struct iomap_dio { struct { struct iov_iter *iter; struct task_struct *waiter; - struct request_queue *last_queue; - blk_qc_t cookie; + struct bio *poll_bio; } submit; /* used for aio completion: */ @@ -49,29 +49,20 @@ struct iomap_dio { }; }; -int iomap_dio_iopoll(struct kiocb *kiocb, bool spin) -{ - struct request_queue *q = READ_ONCE(kiocb->private); - - if (!q) - return 0; - return blk_poll(q, READ_ONCE(kiocb->ki_cookie), spin); -} -EXPORT_SYMBOL_GPL(iomap_dio_iopoll); - static void iomap_dio_submit_bio(const struct iomap_iter *iter, struct iomap_dio *dio, struct bio *bio, loff_t pos) { atomic_inc(&dio->ref); - if (dio->iocb->ki_flags & IOCB_HIPRI) + if (dio->iocb->ki_flags & IOCB_HIPRI) { bio_set_polled(bio, dio->iocb); + dio->submit.poll_bio = bio; + } - dio->submit.last_queue = bdev_get_queue(iter->iomap.bdev); if (dio->dops && dio->dops->submit_io) - dio->submit.cookie = dio->dops->submit_io(iter, bio, pos); + dio->dops->submit_io(iter, bio, pos); else - dio->submit.cookie = submit_bio(bio); + submit_bio(bio); } ssize_t iomap_dio_complete(struct iomap_dio *dio) @@ -124,6 +115,9 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio) if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC)) ret = generic_write_sync(iocb, ret); + if (ret > 0) + ret += dio->done_before; + kfree(dio); return ret; @@ -135,7 +129,7 @@ static void iomap_dio_complete_work(struct work_struct *work) struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work); struct kiocb *iocb = dio->iocb; - iocb->ki_complete(iocb, iomap_dio_complete(dio), 0); + iocb->ki_complete(iocb, iomap_dio_complete(dio)); } /* @@ -164,9 +158,11 @@ static void iomap_dio_bio_end_io(struct bio *bio) } else if (dio->flags & IOMAP_DIO_WRITE) { struct inode *inode = file_inode(dio->iocb->ki_filp); + WRITE_ONCE(dio->iocb->private, NULL); INIT_WORK(&dio->aio.work, iomap_dio_complete_work); queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work); } else { + WRITE_ONCE(dio->iocb->private, NULL); iomap_dio_complete_work(&dio->aio.work); } } @@ -282,6 +278,13 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, if (!iov_iter_count(dio->submit.iter)) goto out; + /* + * We can only poll for single bio I/Os. + */ + if (need_zeroout || + ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) + dio->iocb->ki_flags &= ~IOCB_HIPRI; + if (need_zeroout) { /* zero out from the start of the block to the write offset */ pad = pos & (fs_block_size - 1); @@ -339,6 +342,11 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS); + /* + * We can only poll for single bio I/Os. + */ + if (nr_pages) + dio->iocb->ki_flags &= ~IOCB_HIPRI; iomap_dio_submit_bio(iter, dio, bio, pos); pos += n; } while (nr_pages); @@ -371,6 +379,8 @@ static loff_t iomap_dio_hole_iter(const struct iomap_iter *iter, loff_t length = iov_iter_zero(iomap_length(iter), dio->submit.iter); dio->size += length; + if (!length) + return -EFAULT; return length; } @@ -402,6 +412,8 @@ static loff_t iomap_dio_inline_iter(const struct iomap_iter *iomi, copied = copy_to_iter(inline_data, length, iter); } dio->size += copied; + if (!copied) + return -EFAULT; return copied; } @@ -446,13 +458,21 @@ static loff_t iomap_dio_iter(const struct iomap_iter *iter, * may be pure data writes. In that case, we still need to do a full data sync * completion. * + * When page faults are disabled and @dio_flags includes IOMAP_DIO_PARTIAL, + * __iomap_dio_rw can return a partial result if it encounters a non-resident + * page in @iter after preparing a transfer. In that case, the non-resident + * pages can be faulted in and the request resumed with @done_before set to the + * number of bytes previously transferred. The request will then complete with + * the correct total number of bytes transferred; this is essential for + * completing partial requests asynchronously. + * * Returns -ENOTBLK In case of a page invalidation invalidation failure for * writes. The callers needs to fall back to buffered I/O in this case. */ struct iomap_dio * __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, - unsigned int dio_flags) + unsigned int dio_flags, size_t done_before) { struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = file_inode(iocb->ki_filp); @@ -482,11 +502,11 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, dio->dops = dops; dio->error = 0; dio->flags = 0; + dio->done_before = done_before; dio->submit.iter = iter; dio->submit.waiter = current; - dio->submit.cookie = BLK_QC_T_NONE; - dio->submit.last_queue = NULL; + dio->submit.poll_bio = NULL; if (iov_iter_rw(iter) == READ) { if (iomi.pos >= dio->i_size) @@ -565,8 +585,15 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, inode_dio_begin(inode); blk_start_plug(&plug); - while ((ret = iomap_iter(&iomi, ops)) > 0) + while ((ret = iomap_iter(&iomi, ops)) > 0) { iomi.processed = iomap_dio_iter(&iomi, dio); + + /* + * We can only poll for single bio I/Os. + */ + iocb->ki_flags &= ~IOCB_HIPRI; + } + blk_finish_plug(&plug); /* @@ -577,6 +604,12 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (iov_iter_rw(iter) == READ && iomi.pos >= dio->i_size) iov_iter_revert(iter, iomi.pos - dio->i_size); + if (ret == -EFAULT && dio->size && (dio_flags & IOMAP_DIO_PARTIAL)) { + if (!(iocb->ki_flags & IOCB_NOWAIT)) + wait_for_completion = true; + ret = 0; + } + /* magic error code to fall back to buffered I/O */ if (ret == -ENOTBLK) { wait_for_completion = true; @@ -592,8 +625,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (dio->flags & IOMAP_DIO_WRITE_FUA) dio->flags &= ~IOMAP_DIO_NEED_SYNC; - WRITE_ONCE(iocb->ki_cookie, dio->submit.cookie); - WRITE_ONCE(iocb->private, dio->submit.last_queue); + WRITE_ONCE(iocb->private, dio->submit.poll_bio); /* * We are about to drop our additional submission reference, which @@ -620,10 +652,8 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (!READ_ONCE(dio->submit.waiter)) break; - if (!(iocb->ki_flags & IOCB_HIPRI) || - !dio->submit.last_queue || - !blk_poll(dio->submit.last_queue, - dio->submit.cookie, true)) + if (!dio->submit.poll_bio || + !bio_poll(dio->submit.poll_bio, NULL, 0)) blk_io_schedule(); } __set_current_state(TASK_RUNNING); @@ -642,11 +672,11 @@ EXPORT_SYMBOL_GPL(__iomap_dio_rw); ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, const struct iomap_dio_ops *dops, - unsigned int dio_flags) + unsigned int dio_flags, size_t done_before) { struct iomap_dio *dio; - dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags); + dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags, done_before); if (IS_ERR_OR_NULL(dio)) return PTR_ERR_OR_ZERO(dio); return iomap_dio_complete(dio); diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 678e2c51b855..0c6eacfcbeef 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -1322,6 +1322,8 @@ static int isofs_read_inode(struct inode *inode, int relocated) de = (struct iso_directory_record *) (bh->b_data + offset); de_len = *(unsigned char *) de; + if (de_len < sizeof(struct iso_directory_record)) + goto fail; if (offset + de_len > bufsize) { int frag1 = bufsize - offset; diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 176580f54af9..104ae698443e 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -13,6 +13,7 @@ #include <linux/buffer_head.h> #include <linux/mempool.h> #include <linux/seq_file.h> +#include <linux/writeback.h> #include "jfs_incore.h" #include "jfs_superblock.h" #include "jfs_filsys.h" diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c index 5d7d7170c03c..aa4ff7bcaff2 100644 --- a/fs/jfs/jfs_mount.c +++ b/fs/jfs/jfs_mount.c @@ -81,14 +81,14 @@ int jfs_mount(struct super_block *sb) * (initialize mount inode from the superblock) */ if ((rc = chkSuper(sb))) { - goto errout20; + goto out; } ipaimap = diReadSpecial(sb, AGGREGATE_I, 0); if (ipaimap == NULL) { jfs_err("jfs_mount: Failed to read AGGREGATE_I"); rc = -EIO; - goto errout20; + goto out; } sbi->ipaimap = ipaimap; @@ -99,7 +99,7 @@ int jfs_mount(struct super_block *sb) */ if ((rc = diMount(ipaimap))) { jfs_err("jfs_mount: diMount(ipaimap) failed w/rc = %d", rc); - goto errout21; + goto err_ipaimap; } /* @@ -108,7 +108,7 @@ int jfs_mount(struct super_block *sb) ipbmap = diReadSpecial(sb, BMAP_I, 0); if (ipbmap == NULL) { rc = -EIO; - goto errout22; + goto err_umount_ipaimap; } jfs_info("jfs_mount: ipbmap:0x%p", ipbmap); @@ -120,7 +120,7 @@ int jfs_mount(struct super_block *sb) */ if ((rc = dbMount(ipbmap))) { jfs_err("jfs_mount: dbMount failed w/rc = %d", rc); - goto errout22; + goto err_ipbmap; } /* @@ -139,7 +139,7 @@ int jfs_mount(struct super_block *sb) if (!ipaimap2) { jfs_err("jfs_mount: Failed to read AGGREGATE_I"); rc = -EIO; - goto errout35; + goto err_umount_ipbmap; } sbi->ipaimap2 = ipaimap2; @@ -151,7 +151,7 @@ int jfs_mount(struct super_block *sb) if ((rc = diMount(ipaimap2))) { jfs_err("jfs_mount: diMount(ipaimap2) failed, rc = %d", rc); - goto errout35; + goto err_ipaimap2; } } else /* Secondary aggregate inode table is not valid */ @@ -168,7 +168,7 @@ int jfs_mount(struct super_block *sb) jfs_err("jfs_mount: Failed to read FILESYSTEM_I"); /* open fileset secondary inode allocation map */ rc = -EIO; - goto errout40; + goto err_umount_ipaimap2; } jfs_info("jfs_mount: ipimap:0x%p", ipimap); @@ -178,41 +178,34 @@ int jfs_mount(struct super_block *sb) /* initialize fileset inode allocation map */ if ((rc = diMount(ipimap))) { jfs_err("jfs_mount: diMount failed w/rc = %d", rc); - goto errout41; + goto err_ipimap; } - goto out; + return rc; /* * unwind on error */ - errout41: /* close fileset inode allocation map inode */ +err_ipimap: + /* close fileset inode allocation map inode */ diFreeSpecial(ipimap); - - errout40: /* fileset closed */ - +err_umount_ipaimap2: /* close secondary aggregate inode allocation map */ - if (ipaimap2) { + if (ipaimap2) diUnmount(ipaimap2, 1); +err_ipaimap2: + /* close aggregate inodes */ + if (ipaimap2) diFreeSpecial(ipaimap2); - } - - errout35: - - /* close aggregate block allocation map */ +err_umount_ipbmap: /* close aggregate block allocation map */ dbUnmount(ipbmap, 1); +err_ipbmap: /* close aggregate inodes */ diFreeSpecial(ipbmap); - - errout22: /* close aggregate inode allocation map */ - +err_umount_ipaimap: /* close aggregate inode allocation map */ diUnmount(ipaimap, 1); - - errout21: /* close aggregate inodes */ +err_ipaimap: /* close aggregate inodes */ diFreeSpecial(ipaimap); - errout20: /* aggregate closed */ - - out: - +out: if (rc) jfs_err("Mount JFS Failure: %d", rc); diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c index bde787c354fc..8b9a72ae5efa 100644 --- a/fs/jfs/resize.c +++ b/fs/jfs/resize.c @@ -86,8 +86,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) goto out; } - VolumeSize = i_size_read(sb->s_bdev->bd_inode) >> sb->s_blocksize_bits; - + VolumeSize = sb_bdev_nr_blocks(sb); if (VolumeSize) { if (newLVSize > VolumeSize) { printk(KERN_WARNING "jfs_extendfs: invalid size\n"); @@ -199,7 +198,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) txQuiesce(sb); /* Reset size of direct inode */ - sbi->direct_inode->i_size = i_size_read(sb->s_bdev->bd_inode); + sbi->direct_inode->i_size = bdev_nr_bytes(sb->s_bdev); if (sbi->mntflag & JFS_INLINELOG) { /* diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 9030aeaf0f88..24cbc9946e01 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -284,8 +284,7 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize, } case Opt_resize_nosize: { - *newLVSize = i_size_read(sb->s_bdev->bd_inode) >> - sb->s_blocksize_bits; + *newLVSize = sb_bdev_nr_blocks(sb); if (*newLVSize == 0) pr_err("JFS: Cannot determine volume size\n"); break; @@ -551,7 +550,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) ret = -ENOMEM; goto out_unload; } - inode->i_size = i_size_read(sb->s_bdev->bd_inode); + inode->i_size = bdev_nr_bytes(sb->s_bdev); inode->i_mapping->a_ops = &jfs_metapage_aops; inode_fake_hash(inode); mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); diff --git a/fs/kernel_read_file.c b/fs/kernel_read_file.c index 87aac4c72c37..1b07550485b9 100644 --- a/fs/kernel_read_file.c +++ b/fs/kernel_read_file.c @@ -178,7 +178,7 @@ int kernel_read_file_from_fd(int fd, loff_t offset, void **buf, struct fd f = fdget(fd); int ret = -EBADF; - if (!f.file) + if (!f.file || !(f.file->f_mode & FMODE_READ)) goto out; ret = kernel_read_file(f.file, offset, buf, buf_size, file_size, id); diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index ba581429bf7b..8e0a1378a4b1 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -1111,13 +1111,25 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir, kn = kernfs_find_ns(parent, dentry->d_name.name, ns); /* attach dentry and inode */ - if (kn && kernfs_active(kn)) { + if (kn) { + /* Inactive nodes are invisible to the VFS so don't + * create a negative. + */ + if (!kernfs_active(kn)) { + up_read(&kernfs_rwsem); + return NULL; + } inode = kernfs_get_inode(dir->i_sb, kn); if (!inode) inode = ERR_PTR(-ENOMEM); } - /* Needed only for negative dentry validation */ - if (!inode) + /* + * Needed for negative dentry validation. + * The negative dentry can be created in kernfs_iop_lookup() + * or transforms from positive dentry in dentry_unlink_inode() + * called from vfs_rmdir(). + */ + if (!IS_ERR(inode)) kernfs_set_rev(parent, dentry); up_read(&kernfs_rwsem); diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c index c8f8e41b8411..19a6c71c6ff5 100644 --- a/fs/kernfs/symlink.c +++ b/fs/kernfs/symlink.c @@ -36,8 +36,7 @@ struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, gid = target->iattr->ia_gid; } - kn = kernfs_new_node(parent, name, S_IFLNK|S_IRWXUGO, uid, gid, - KERNFS_LINK); + kn = kernfs_new_node(parent, name, S_IFLNK|0777, uid, gid, KERNFS_LINK); if (!kn) return ERR_PTR(-ENOMEM); diff --git a/fs/ksmbd/auth.c b/fs/ksmbd/auth.c index de36f12070bf..30a92ddc1817 100644 --- a/fs/ksmbd/auth.c +++ b/fs/ksmbd/auth.c @@ -68,125 +68,6 @@ void ksmbd_copy_gss_neg_header(void *buf) memcpy(buf, NEGOTIATE_GSS_HEADER, AUTH_GSS_LENGTH); } -static void -str_to_key(unsigned char *str, unsigned char *key) -{ - int i; - - key[0] = str[0] >> 1; - key[1] = ((str[0] & 0x01) << 6) | (str[1] >> 2); - key[2] = ((str[1] & 0x03) << 5) | (str[2] >> 3); - key[3] = ((str[2] & 0x07) << 4) | (str[3] >> 4); - key[4] = ((str[3] & 0x0F) << 3) | (str[4] >> 5); - key[5] = ((str[4] & 0x1F) << 2) | (str[5] >> 6); - key[6] = ((str[5] & 0x3F) << 1) | (str[6] >> 7); - key[7] = str[6] & 0x7F; - for (i = 0; i < 8; i++) - key[i] = (key[i] << 1); -} - -static int -smbhash(unsigned char *out, const unsigned char *in, unsigned char *key) -{ - unsigned char key2[8]; - struct des_ctx ctx; - - if (fips_enabled) { - ksmbd_debug(AUTH, "FIPS compliance enabled: DES not permitted\n"); - return -ENOENT; - } - - str_to_key(key, key2); - des_expand_key(&ctx, key2, DES_KEY_SIZE); - des_encrypt(&ctx, out, in); - memzero_explicit(&ctx, sizeof(ctx)); - return 0; -} - -static int ksmbd_enc_p24(unsigned char *p21, const unsigned char *c8, unsigned char *p24) -{ - int rc; - - rc = smbhash(p24, c8, p21); - if (rc) - return rc; - rc = smbhash(p24 + 8, c8, p21 + 7); - if (rc) - return rc; - return smbhash(p24 + 16, c8, p21 + 14); -} - -/* produce a md4 message digest from data of length n bytes */ -static int ksmbd_enc_md4(unsigned char *md4_hash, unsigned char *link_str, - int link_len) -{ - int rc; - struct ksmbd_crypto_ctx *ctx; - - ctx = ksmbd_crypto_ctx_find_md4(); - if (!ctx) { - ksmbd_debug(AUTH, "Crypto md4 allocation error\n"); - return -ENOMEM; - } - - rc = crypto_shash_init(CRYPTO_MD4(ctx)); - if (rc) { - ksmbd_debug(AUTH, "Could not init md4 shash\n"); - goto out; - } - - rc = crypto_shash_update(CRYPTO_MD4(ctx), link_str, link_len); - if (rc) { - ksmbd_debug(AUTH, "Could not update with link_str\n"); - goto out; - } - - rc = crypto_shash_final(CRYPTO_MD4(ctx), md4_hash); - if (rc) - ksmbd_debug(AUTH, "Could not generate md4 hash\n"); -out: - ksmbd_release_crypto_ctx(ctx); - return rc; -} - -static int ksmbd_enc_update_sess_key(unsigned char *md5_hash, char *nonce, - char *server_challenge, int len) -{ - int rc; - struct ksmbd_crypto_ctx *ctx; - - ctx = ksmbd_crypto_ctx_find_md5(); - if (!ctx) { - ksmbd_debug(AUTH, "Crypto md5 allocation error\n"); - return -ENOMEM; - } - - rc = crypto_shash_init(CRYPTO_MD5(ctx)); - if (rc) { - ksmbd_debug(AUTH, "Could not init md5 shash\n"); - goto out; - } - - rc = crypto_shash_update(CRYPTO_MD5(ctx), server_challenge, len); - if (rc) { - ksmbd_debug(AUTH, "Could not update with challenge\n"); - goto out; - } - - rc = crypto_shash_update(CRYPTO_MD5(ctx), nonce, len); - if (rc) { - ksmbd_debug(AUTH, "Could not update with nonce\n"); - goto out; - } - - rc = crypto_shash_final(CRYPTO_MD5(ctx), md5_hash); - if (rc) - ksmbd_debug(AUTH, "Could not generate md5 hash\n"); -out: - ksmbd_release_crypto_ctx(ctx); - return rc; -} - /** * ksmbd_gen_sess_key() - function to generate session key * @sess: session of connection @@ -325,43 +206,6 @@ out: } /** - * ksmbd_auth_ntlm() - NTLM authentication handler - * @sess: session of connection - * @pw_buf: NTLM challenge response - * @passkey: user password - * - * Return: 0 on success, error number on error - */ -int ksmbd_auth_ntlm(struct ksmbd_session *sess, char *pw_buf) -{ - int rc; - unsigned char p21[21]; - char key[CIFS_AUTH_RESP_SIZE]; - - memset(p21, '\0', 21); - memcpy(p21, user_passkey(sess->user), CIFS_NTHASH_SIZE); - rc = ksmbd_enc_p24(p21, sess->ntlmssp.cryptkey, key); - if (rc) { - pr_err("password processing failed\n"); - return rc; - } - - ksmbd_enc_md4(sess->sess_key, user_passkey(sess->user), - CIFS_SMB1_SESSKEY_SIZE); - memcpy(sess->sess_key + CIFS_SMB1_SESSKEY_SIZE, key, - CIFS_AUTH_RESP_SIZE); - sess->sequence_number = 1; - - if (strncmp(pw_buf, key, CIFS_AUTH_RESP_SIZE) != 0) { - ksmbd_debug(AUTH, "ntlmv1 authentication failed\n"); - return -EINVAL; - } - - ksmbd_debug(AUTH, "ntlmv1 authentication pass\n"); - return 0; -} - -/** * ksmbd_auth_ntlmv2() - NTLMv2 authentication handler * @sess: session of connection * @ntlmv2: NTLMv2 challenge response @@ -442,44 +286,6 @@ out: } /** - * __ksmbd_auth_ntlmv2() - NTLM2(extended security) authentication handler - * @sess: session of connection - * @client_nonce: client nonce from LM response. - * @ntlm_resp: ntlm response data from client. - * - * Return: 0 on success, error number on error - */ -static int __ksmbd_auth_ntlmv2(struct ksmbd_session *sess, char *client_nonce, - char *ntlm_resp) -{ - char sess_key[CIFS_SMB1_SESSKEY_SIZE] = {0}; - int rc; - unsigned char p21[21]; - char key[CIFS_AUTH_RESP_SIZE]; - - rc = ksmbd_enc_update_sess_key(sess_key, - client_nonce, - (char *)sess->ntlmssp.cryptkey, 8); - if (rc) { - pr_err("password processing failed\n"); - goto out; - } - - memset(p21, '\0', 21); - memcpy(p21, user_passkey(sess->user), CIFS_NTHASH_SIZE); - rc = ksmbd_enc_p24(p21, sess_key, key); - if (rc) { - pr_err("password processing failed\n"); - goto out; - } - - if (memcmp(ntlm_resp, key, CIFS_AUTH_RESP_SIZE) != 0) - rc = -EINVAL; -out: - return rc; -} - -/** * ksmbd_decode_ntlmssp_auth_blob() - helper function to construct * authenticate blob * @authblob: authenticate blob source pointer @@ -492,8 +298,8 @@ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob, int blob_len, struct ksmbd_session *sess) { char *domain_name; - unsigned int lm_off, nt_off; - unsigned short nt_len; + unsigned int nt_off, dn_off; + unsigned short nt_len, dn_len; int ret; if (blob_len < sizeof(struct authenticate_message)) { @@ -508,26 +314,17 @@ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob, return -EINVAL; } - lm_off = le32_to_cpu(authblob->LmChallengeResponse.BufferOffset); nt_off = le32_to_cpu(authblob->NtChallengeResponse.BufferOffset); nt_len = le16_to_cpu(authblob->NtChallengeResponse.Length); + dn_off = le32_to_cpu(authblob->DomainName.BufferOffset); + dn_len = le16_to_cpu(authblob->DomainName.Length); - /* process NTLM authentication */ - if (nt_len == CIFS_AUTH_RESP_SIZE) { - if (le32_to_cpu(authblob->NegotiateFlags) & - NTLMSSP_NEGOTIATE_EXTENDED_SEC) - return __ksmbd_auth_ntlmv2(sess, (char *)authblob + - lm_off, (char *)authblob + nt_off); - else - return ksmbd_auth_ntlm(sess, (char *)authblob + - nt_off); - } + if (blob_len < (u64)dn_off + dn_len || blob_len < (u64)nt_off + nt_len) + return -EINVAL; /* TODO : use domain name that imported from configuration file */ - domain_name = smb_strndup_from_utf16((const char *)authblob + - le32_to_cpu(authblob->DomainName.BufferOffset), - le16_to_cpu(authblob->DomainName.Length), true, - sess->conn->local_nls); + domain_name = smb_strndup_from_utf16((const char *)authblob + dn_off, + dn_len, true, sess->conn->local_nls); if (IS_ERR(domain_name)) return PTR_ERR(domain_name); diff --git a/fs/ksmbd/connection.c b/fs/ksmbd/connection.c index af086d35398a..b57a0d8a392f 100644 --- a/fs/ksmbd/connection.c +++ b/fs/ksmbd/connection.c @@ -61,6 +61,8 @@ struct ksmbd_conn *ksmbd_conn_alloc(void) conn->local_nls = load_nls_default(); atomic_set(&conn->req_running, 0); atomic_set(&conn->r_count, 0); + conn->total_credits = 1; + init_waitqueue_head(&conn->req_running_q); INIT_LIST_HEAD(&conn->conns_list); INIT_LIST_HEAD(&conn->sessions); @@ -296,10 +298,12 @@ int ksmbd_conn_handler_loop(void *p) pdu_size = get_rfc1002_len(hdr_buf); ksmbd_debug(CONN, "RFC1002 header %u bytes\n", pdu_size); - /* make sure we have enough to get to SMB header end */ - if (!ksmbd_pdu_size_has_room(pdu_size)) { - ksmbd_debug(CONN, "SMB request too short (%u bytes)\n", - pdu_size); + /* + * Check if pdu size is valid (min : smb header size, + * max : 0x00FFFFFF). + */ + if (pdu_size < __SMB2_HEADER_STRUCTURE_SIZE || + pdu_size > MAX_STREAM_PROT_LEN) { continue; } diff --git a/fs/ksmbd/crypto_ctx.c b/fs/ksmbd/crypto_ctx.c index 5f4b1008d17e..81488d04199d 100644 --- a/fs/ksmbd/crypto_ctx.c +++ b/fs/ksmbd/crypto_ctx.c @@ -81,12 +81,6 @@ static struct shash_desc *alloc_shash_desc(int id) case CRYPTO_SHASH_SHA512: tfm = crypto_alloc_shash("sha512", 0, 0); break; - case CRYPTO_SHASH_MD4: - tfm = crypto_alloc_shash("md4", 0, 0); - break; - case CRYPTO_SHASH_MD5: - tfm = crypto_alloc_shash("md5", 0, 0); - break; default: return NULL; } @@ -214,16 +208,6 @@ struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_sha512(void) return ____crypto_shash_ctx_find(CRYPTO_SHASH_SHA512); } -struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_md4(void) -{ - return ____crypto_shash_ctx_find(CRYPTO_SHASH_MD4); -} - -struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_md5(void) -{ - return ____crypto_shash_ctx_find(CRYPTO_SHASH_MD5); -} - static struct ksmbd_crypto_ctx *____crypto_aead_ctx_find(int id) { struct ksmbd_crypto_ctx *ctx; diff --git a/fs/ksmbd/crypto_ctx.h b/fs/ksmbd/crypto_ctx.h index ef11154b43df..4a367c62f653 100644 --- a/fs/ksmbd/crypto_ctx.h +++ b/fs/ksmbd/crypto_ctx.h @@ -15,8 +15,6 @@ enum { CRYPTO_SHASH_CMACAES, CRYPTO_SHASH_SHA256, CRYPTO_SHASH_SHA512, - CRYPTO_SHASH_MD4, - CRYPTO_SHASH_MD5, CRYPTO_SHASH_MAX, }; @@ -43,8 +41,6 @@ struct ksmbd_crypto_ctx { #define CRYPTO_CMACAES(c) ((c)->desc[CRYPTO_SHASH_CMACAES]) #define CRYPTO_SHA256(c) ((c)->desc[CRYPTO_SHASH_SHA256]) #define CRYPTO_SHA512(c) ((c)->desc[CRYPTO_SHASH_SHA512]) -#define CRYPTO_MD4(c) ((c)->desc[CRYPTO_SHASH_MD4]) -#define CRYPTO_MD5(c) ((c)->desc[CRYPTO_SHASH_MD5]) #define CRYPTO_HMACMD5_TFM(c) ((c)->desc[CRYPTO_SHASH_HMACMD5]->tfm) #define CRYPTO_HMACSHA256_TFM(c)\ @@ -52,8 +48,6 @@ struct ksmbd_crypto_ctx { #define CRYPTO_CMACAES_TFM(c) ((c)->desc[CRYPTO_SHASH_CMACAES]->tfm) #define CRYPTO_SHA256_TFM(c) ((c)->desc[CRYPTO_SHASH_SHA256]->tfm) #define CRYPTO_SHA512_TFM(c) ((c)->desc[CRYPTO_SHASH_SHA512]->tfm) -#define CRYPTO_MD4_TFM(c) ((c)->desc[CRYPTO_SHASH_MD4]->tfm) -#define CRYPTO_MD5_TFM(c) ((c)->desc[CRYPTO_SHASH_MD5]->tfm) #define CRYPTO_GCM(c) ((c)->ccmaes[CRYPTO_AEAD_AES_GCM]) #define CRYPTO_CCM(c) ((c)->ccmaes[CRYPTO_AEAD_AES_CCM]) @@ -64,8 +58,6 @@ struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_hmacsha256(void); struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_cmacaes(void); struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_sha512(void); struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_sha256(void); -struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_md4(void); -struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_md5(void); struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_gcm(void); struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_ccm(void); void ksmbd_crypto_destroy(void); diff --git a/fs/ksmbd/glob.h b/fs/ksmbd/glob.h index 49a5a3afa118..5b8f3e0ebdb3 100644 --- a/fs/ksmbd/glob.h +++ b/fs/ksmbd/glob.h @@ -12,7 +12,7 @@ #include "unicode.h" #include "vfs_cache.h" -#define KSMBD_VERSION "3.1.9" +#define KSMBD_VERSION "3.4.2" extern int ksmbd_debug_types; diff --git a/fs/ksmbd/ksmbd_netlink.h b/fs/ksmbd/ksmbd_netlink.h index 2fbe2bc1e093..c6718a05d347 100644 --- a/fs/ksmbd/ksmbd_netlink.h +++ b/fs/ksmbd/ksmbd_netlink.h @@ -211,6 +211,7 @@ struct ksmbd_tree_disconnect_request { */ struct ksmbd_logout_request { __s8 account[KSMBD_REQ_MAX_ACCOUNT_NAME_SZ]; /* user account name */ + __u32 account_flags; }; /* @@ -317,6 +318,7 @@ enum KSMBD_TREE_CONN_STATUS { #define KSMBD_USER_FLAG_BAD_UID BIT(2) #define KSMBD_USER_FLAG_BAD_USER BIT(3) #define KSMBD_USER_FLAG_GUEST_ACCOUNT BIT(4) +#define KSMBD_USER_FLAG_DELAY_SESSION BIT(5) /* * Share config flags. diff --git a/fs/ksmbd/mgmt/user_config.c b/fs/ksmbd/mgmt/user_config.c index d21629ae5c89..1019d3677d55 100644 --- a/fs/ksmbd/mgmt/user_config.c +++ b/fs/ksmbd/mgmt/user_config.c @@ -55,7 +55,7 @@ struct ksmbd_user *ksmbd_alloc_user(struct ksmbd_login_response *resp) void ksmbd_free_user(struct ksmbd_user *user) { - ksmbd_ipc_logout_request(user->name); + ksmbd_ipc_logout_request(user->name, user->flags); kfree(user->name); kfree(user->passkey); kfree(user); diff --git a/fs/ksmbd/mgmt/user_config.h b/fs/ksmbd/mgmt/user_config.h index b2bb074a0150..aff80b029579 100644 --- a/fs/ksmbd/mgmt/user_config.h +++ b/fs/ksmbd/mgmt/user_config.h @@ -18,6 +18,7 @@ struct ksmbd_user { size_t passkey_sz; char *passkey; + unsigned int failed_login_count; }; static inline bool user_guest(struct ksmbd_user *user) diff --git a/fs/ksmbd/misc.c b/fs/ksmbd/misc.c index 0b307ca28a19..60e7ac62c917 100644 --- a/fs/ksmbd/misc.c +++ b/fs/ksmbd/misc.c @@ -158,25 +158,18 @@ out: * Return : windows path string or error */ -char *convert_to_nt_pathname(char *filename, char *sharepath) +char *convert_to_nt_pathname(char *filename) { char *ab_pathname; - int len, name_len; - name_len = strlen(filename); - ab_pathname = kmalloc(name_len, GFP_KERNEL); + if (strlen(filename) == 0) + filename = "\\"; + + ab_pathname = kstrdup(filename, GFP_KERNEL); if (!ab_pathname) return NULL; - ab_pathname[0] = '\\'; - ab_pathname[1] = '\0'; - - len = strlen(sharepath); - if (!strncmp(filename, sharepath, len) && name_len != len) { - strscpy(ab_pathname, &filename[len], name_len); - ksmbd_conv_path_to_windows(ab_pathname); - } - + ksmbd_conv_path_to_windows(ab_pathname); return ab_pathname; } @@ -240,7 +233,7 @@ char *ksmbd_extract_sharename(char *treename) * * Return: converted name on success, otherwise NULL */ -char *convert_to_unix_name(struct ksmbd_share_config *share, char *name) +char *convert_to_unix_name(struct ksmbd_share_config *share, const char *name) { int no_slash = 0, name_len, path_len; char *new_name; diff --git a/fs/ksmbd/misc.h b/fs/ksmbd/misc.h index af8717d4d85b..253366bd0951 100644 --- a/fs/ksmbd/misc.h +++ b/fs/ksmbd/misc.h @@ -14,13 +14,13 @@ struct ksmbd_file; int match_pattern(const char *str, size_t len, const char *pattern); int ksmbd_validate_filename(char *filename); int parse_stream_name(char *filename, char **stream_name, int *s_type); -char *convert_to_nt_pathname(char *filename, char *sharepath); +char *convert_to_nt_pathname(char *filename); int get_nlink(struct kstat *st); void ksmbd_conv_path_to_unix(char *path); void ksmbd_strip_last_slash(char *path); void ksmbd_conv_path_to_windows(char *path); char *ksmbd_extract_sharename(char *treename); -char *convert_to_unix_name(struct ksmbd_share_config *share, char *name); +char *convert_to_unix_name(struct ksmbd_share_config *share, const char *name); #define KSMBD_DIR_INFO_ALIGNMENT 8 struct ksmbd_dir_info; diff --git a/fs/ksmbd/oplock.c b/fs/ksmbd/oplock.c index 16b6236d1bd2..f9dae6ef2115 100644 --- a/fs/ksmbd/oplock.c +++ b/fs/ksmbd/oplock.c @@ -1451,26 +1451,47 @@ struct lease_ctx_info *parse_lease_state(void *open_req) */ struct create_context *smb2_find_context_vals(void *open_req, const char *tag) { - char *data_offset; struct create_context *cc; unsigned int next = 0; char *name; struct smb2_create_req *req = (struct smb2_create_req *)open_req; + unsigned int remain_len, name_off, name_len, value_off, value_len, + cc_len; - data_offset = (char *)req + 4 + le32_to_cpu(req->CreateContextsOffset); - cc = (struct create_context *)data_offset; + /* + * CreateContextsOffset and CreateContextsLength are guaranteed to + * be valid because of ksmbd_smb2_check_message(). + */ + cc = (struct create_context *)((char *)req + 4 + + le32_to_cpu(req->CreateContextsOffset)); + remain_len = le32_to_cpu(req->CreateContextsLength); do { - int val; - cc = (struct create_context *)((char *)cc + next); - name = le16_to_cpu(cc->NameOffset) + (char *)cc; - val = le16_to_cpu(cc->NameLength); - if (val < 4) + if (remain_len < offsetof(struct create_context, Buffer)) return ERR_PTR(-EINVAL); - if (memcmp(name, tag, val) == 0) - return cc; next = le32_to_cpu(cc->Next); + name_off = le16_to_cpu(cc->NameOffset); + name_len = le16_to_cpu(cc->NameLength); + value_off = le16_to_cpu(cc->DataOffset); + value_len = le32_to_cpu(cc->DataLength); + cc_len = next ? next : remain_len; + + if ((next & 0x7) != 0 || + next > remain_len || + name_off != offsetof(struct create_context, Buffer) || + name_len < 4 || + name_off + name_len > cc_len || + (value_off & 0x7) != 0 || + (value_off && (value_off < name_off + name_len)) || + ((u64)value_off + value_len > cc_len)) + return ERR_PTR(-EINVAL); + + name = (char *)cc + name_off; + if (memcmp(name, tag, name_len) == 0) + return cc; + + remain_len -= next; } while (next != 0); return NULL; diff --git a/fs/ksmbd/server.c b/fs/ksmbd/server.c index e6a9f6aa47eb..2a2b2135bfde 100644 --- a/fs/ksmbd/server.c +++ b/fs/ksmbd/server.c @@ -584,6 +584,9 @@ static int __init ksmbd_server_init(void) ret = ksmbd_workqueue_init(); if (ret) goto err_crypto_destroy; + + pr_warn_once("The ksmbd server is experimental, use at your own risk.\n"); + return 0; err_crypto_destroy: diff --git a/fs/ksmbd/smb2misc.c b/fs/ksmbd/smb2misc.c index 9aa46bb3e10d..030ca57c3784 100644 --- a/fs/ksmbd/smb2misc.c +++ b/fs/ksmbd/smb2misc.c @@ -80,18 +80,21 @@ static const bool has_smb2_data_area[NUMBER_OF_SMB2_COMMANDS] = { }; /* - * Returns the pointer to the beginning of the data area. Length of the data - * area and the offset to it (from the beginning of the smb are also returned. + * Set length of the data area and the offset to arguments. + * if they are invalid, return error. */ -static char *smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr) +static int smb2_get_data_area_len(unsigned int *off, unsigned int *len, + struct smb2_hdr *hdr) { + int ret = 0; + *off = 0; *len = 0; /* error reqeusts do not have data area */ if (hdr->Status && hdr->Status != STATUS_MORE_PROCESSING_REQUIRED && (((struct smb2_err_rsp *)hdr)->StructureSize) == SMB2_ERROR_STRUCTURE_SIZE2_LE) - return NULL; + return ret; /* * Following commands have data areas so we have to get the location @@ -165,69 +168,60 @@ static char *smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr) case SMB2_IOCTL: *off = le32_to_cpu(((struct smb2_ioctl_req *)hdr)->InputOffset); *len = le32_to_cpu(((struct smb2_ioctl_req *)hdr)->InputCount); - break; default: ksmbd_debug(SMB, "no length check for command\n"); break; } - /* - * Invalid length or offset probably means data area is invalid, but - * we have little choice but to ignore the data area in this case. - */ if (*off > 4096) { - ksmbd_debug(SMB, "offset %d too large, data area ignored\n", - *off); - *len = 0; - *off = 0; - } else if (*off < 0) { - ksmbd_debug(SMB, - "negative offset %d to data invalid ignore data area\n", - *off); - *off = 0; - *len = 0; - } else if (*len < 0) { - ksmbd_debug(SMB, - "negative data length %d invalid, data area ignored\n", - *len); - *len = 0; - } else if (*len > 128 * 1024) { - ksmbd_debug(SMB, "data area larger than 128K: %d\n", *len); - *len = 0; + ksmbd_debug(SMB, "offset %d too large\n", *off); + ret = -EINVAL; + } else if ((u64)*off + *len > MAX_STREAM_PROT_LEN) { + ksmbd_debug(SMB, "Request is larger than maximum stream protocol length(%u): %llu\n", + MAX_STREAM_PROT_LEN, (u64)*off + *len); + ret = -EINVAL; } - /* return pointer to beginning of data area, ie offset from SMB start */ - if ((*off != 0) && (*len != 0)) - return (char *)hdr + *off; - else - return NULL; + return ret; } /* * Calculate the size of the SMB message based on the fixed header * portion, the number of word parameters and the data portion of the message. */ -static unsigned int smb2_calc_size(void *buf) +static int smb2_calc_size(void *buf, unsigned int *len) { struct smb2_pdu *pdu = (struct smb2_pdu *)buf; struct smb2_hdr *hdr = &pdu->hdr; - int offset; /* the offset from the beginning of SMB to data area */ - int data_length; /* the length of the variable length data area */ + unsigned int offset; /* the offset from the beginning of SMB to data area */ + unsigned int data_length; /* the length of the variable length data area */ + int ret; + /* Structure Size has already been checked to make sure it is 64 */ - int len = le16_to_cpu(hdr->StructureSize); + *len = le16_to_cpu(hdr->StructureSize); /* * StructureSize2, ie length of fixed parameter area has already * been checked to make sure it is the correct length. */ - len += le16_to_cpu(pdu->StructureSize2); + *len += le16_to_cpu(pdu->StructureSize2); + /* + * StructureSize2 of smb2_lock pdu is set to 48, indicating + * the size of smb2 lock request with single smb2_lock_element + * regardless of number of locks. Subtract single + * smb2_lock_element for correct buffer size check. + */ + if (hdr->Command == SMB2_LOCK) + *len -= sizeof(struct smb2_lock_element); if (has_smb2_data_area[le16_to_cpu(hdr->Command)] == false) goto calc_size_exit; - smb2_get_data_area_len(&offset, &data_length, hdr); - ksmbd_debug(SMB, "SMB2 data length %d offset %d\n", data_length, + ret = smb2_get_data_area_len(&offset, &data_length, hdr); + if (ret) + return ret; + ksmbd_debug(SMB, "SMB2 data length %u offset %u\n", data_length, offset); if (data_length > 0) { @@ -237,16 +231,19 @@ static unsigned int smb2_calc_size(void *buf) * for some commands, typically those with odd StructureSize, * so we must add one to the calculation. */ - if (offset + 1 < len) + if (offset + 1 < *len) { ksmbd_debug(SMB, - "data area offset %d overlaps SMB2 header %d\n", - offset + 1, len); - else - len = offset + data_length; + "data area offset %d overlaps SMB2 header %u\n", + offset + 1, *len); + return -EINVAL; + } + + *len = offset + data_length; } + calc_size_exit: - ksmbd_debug(SMB, "SMB2 len %d\n", len); - return len; + ksmbd_debug(SMB, "SMB2 len %u\n", *len); + return 0; } static inline int smb2_query_info_req_len(struct smb2_query_info_req *h) @@ -287,11 +284,13 @@ static inline int smb2_ioctl_resp_len(struct smb2_ioctl_req *h) le32_to_cpu(h->MaxOutputResponse); } -static int smb2_validate_credit_charge(struct smb2_hdr *hdr) +static int smb2_validate_credit_charge(struct ksmbd_conn *conn, + struct smb2_hdr *hdr) { - int req_len = 0, expect_resp_len = 0, calc_credit_num, max_len; - int credit_charge = le16_to_cpu(hdr->CreditCharge); + unsigned int req_len = 0, expect_resp_len = 0, calc_credit_num, max_len; + unsigned short credit_charge = le16_to_cpu(hdr->CreditCharge); void *__hdr = hdr; + int ret; switch (hdr->Command) { case SMB2_QUERY_INFO: @@ -313,21 +312,37 @@ static int smb2_validate_credit_charge(struct smb2_hdr *hdr) req_len = smb2_ioctl_req_len(__hdr); expect_resp_len = smb2_ioctl_resp_len(__hdr); break; - default: + case SMB2_CANCEL: return 0; + default: + req_len = 1; + break; } - credit_charge = max(1, credit_charge); - max_len = max(req_len, expect_resp_len); + credit_charge = max_t(unsigned short, credit_charge, 1); + max_len = max_t(unsigned int, req_len, expect_resp_len); calc_credit_num = DIV_ROUND_UP(max_len, SMB2_MAX_BUFFER_SIZE); if (credit_charge < calc_credit_num) { - pr_err("Insufficient credit charge, given: %d, needed: %d\n", - credit_charge, calc_credit_num); + ksmbd_debug(SMB, "Insufficient credit charge, given: %d, needed: %d\n", + credit_charge, calc_credit_num); + return 1; + } else if (credit_charge > conn->max_credits) { + ksmbd_debug(SMB, "Too large credit charge: %d\n", credit_charge); return 1; } - return 0; + spin_lock(&conn->credits_lock); + if (credit_charge <= conn->total_credits) { + conn->total_credits -= credit_charge; + ret = 0; + } else { + ksmbd_debug(SMB, "Insufficient credits granted, given: %u, granted: %u\n", + credit_charge, conn->total_credits); + ret = 1; + } + spin_unlock(&conn->credits_lock); + return ret; } int ksmbd_smb2_check_message(struct ksmbd_work *work) @@ -385,24 +400,20 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work) } } - if ((work->conn->vals->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU) && - smb2_validate_credit_charge(hdr)) { - work->conn->ops->set_rsp_status(work, STATUS_INVALID_PARAMETER); + if (smb2_calc_size(hdr, &clc_len)) return 1; - } - clc_len = smb2_calc_size(hdr); if (len != clc_len) { - /* server can return one byte more due to implied bcc[0] */ + /* client can return one byte more due to implied bcc[0] */ if (clc_len == len + 1) - return 0; + goto validate_credit; /* * Some windows servers (win2016) will pad also the final * PDU in a compound to 8 bytes. */ if (ALIGN(clc_len, 8) == len) - return 0; + goto validate_credit; /* * windows client also pad up to 8 bytes when compounding. @@ -415,12 +426,9 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work) "cli req padded more than expected. Length %d not %d for cmd:%d mid:%llu\n", len, clc_len, command, le64_to_cpu(hdr->MessageId)); - return 0; + goto validate_credit; } - if (command == SMB2_LOCK_HE && len == 88) - return 0; - ksmbd_debug(SMB, "cli req too short, len %d not %d. cmd:%d mid:%llu\n", len, clc_len, command, @@ -429,6 +437,13 @@ int ksmbd_smb2_check_message(struct ksmbd_work *work) return 1; } +validate_credit: + if ((work->conn->vals->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU) && + smb2_validate_credit_charge(work->conn, hdr)) { + work->conn->ops->set_rsp_status(work, STATUS_INVALID_PARAMETER); + return 1; + } + return 0; } diff --git a/fs/ksmbd/smb2ops.c b/fs/ksmbd/smb2ops.c index 197473871aa4..fb6a65d23139 100644 --- a/fs/ksmbd/smb2ops.c +++ b/fs/ksmbd/smb2ops.c @@ -187,11 +187,6 @@ static struct smb_version_cmds smb2_0_server_cmds[NUMBER_OF_SMB2_COMMANDS] = { [SMB2_CHANGE_NOTIFY_HE] = { .proc = smb2_notify}, }; -int init_smb2_0_server(struct ksmbd_conn *conn) -{ - return -EOPNOTSUPP; -} - /** * init_smb2_1_server() - initialize a smb server connection with smb2.1 * command dispatcher @@ -289,6 +284,7 @@ int init_smb3_11_server(struct ksmbd_conn *conn) void init_smb2_max_read_size(unsigned int sz) { + sz = clamp_val(sz, SMB3_MIN_IOSIZE, SMB3_MAX_IOSIZE); smb21_server_values.max_read_size = sz; smb30_server_values.max_read_size = sz; smb302_server_values.max_read_size = sz; @@ -297,6 +293,7 @@ void init_smb2_max_read_size(unsigned int sz) void init_smb2_max_write_size(unsigned int sz) { + sz = clamp_val(sz, SMB3_MIN_IOSIZE, SMB3_MAX_IOSIZE); smb21_server_values.max_write_size = sz; smb30_server_values.max_write_size = sz; smb302_server_values.max_write_size = sz; @@ -305,6 +302,7 @@ void init_smb2_max_write_size(unsigned int sz) void init_smb2_max_trans_size(unsigned int sz) { + sz = clamp_val(sz, SMB3_MIN_IOSIZE, SMB3_MAX_IOSIZE); smb21_server_values.max_trans_size = sz; smb30_server_values.max_trans_size = sz; smb302_server_values.max_trans_size = sz; diff --git a/fs/ksmbd/smb2pdu.c b/fs/ksmbd/smb2pdu.c index c86164dc70bb..7e448df3f847 100644 --- a/fs/ksmbd/smb2pdu.c +++ b/fs/ksmbd/smb2pdu.c @@ -236,9 +236,6 @@ int init_smb2_neg_rsp(struct ksmbd_work *work) if (conn->need_neg == false) return -EINVAL; - if (!(conn->dialect >= SMB20_PROT_ID && - conn->dialect <= SMB311_PROT_ID)) - return -EINVAL; rsp_hdr = work->response_buf; @@ -295,22 +292,6 @@ int init_smb2_neg_rsp(struct ksmbd_work *work) return 0; } -static int smb2_consume_credit_charge(struct ksmbd_work *work, - unsigned short credit_charge) -{ - struct ksmbd_conn *conn = work->conn; - unsigned int rsp_credits = 1; - - if (!conn->total_credits) - return 0; - - if (credit_charge > 0) - rsp_credits = credit_charge; - - conn->total_credits -= rsp_credits; - return rsp_credits; -} - /** * smb2_set_rsp_credits() - set number of credits in response buffer * @work: smb work containing smb response buffer @@ -320,49 +301,43 @@ int smb2_set_rsp_credits(struct ksmbd_work *work) struct smb2_hdr *req_hdr = ksmbd_req_buf_next(work); struct smb2_hdr *hdr = ksmbd_resp_buf_next(work); struct ksmbd_conn *conn = work->conn; - unsigned short credits_requested = le16_to_cpu(req_hdr->CreditRequest); - unsigned short credit_charge = 1, credits_granted = 0; - unsigned short aux_max, aux_credits, min_credits; - int rsp_credit_charge; + unsigned short credits_requested; + unsigned short credit_charge, credits_granted = 0; + unsigned short aux_max, aux_credits; - if (hdr->Command == SMB2_CANCEL) - goto out; + if (work->send_no_response) + return 0; - /* get default minimum credits by shifting maximum credits by 4 */ - min_credits = conn->max_credits >> 4; + hdr->CreditCharge = req_hdr->CreditCharge; - if (conn->total_credits >= conn->max_credits) { + if (conn->total_credits > conn->max_credits) { + hdr->CreditRequest = 0; pr_err("Total credits overflow: %d\n", conn->total_credits); - conn->total_credits = min_credits; - } - - rsp_credit_charge = - smb2_consume_credit_charge(work, le16_to_cpu(req_hdr->CreditCharge)); - if (rsp_credit_charge < 0) return -EINVAL; + } - hdr->CreditCharge = cpu_to_le16(rsp_credit_charge); + credit_charge = max_t(unsigned short, + le16_to_cpu(req_hdr->CreditCharge), 1); + credits_requested = max_t(unsigned short, + le16_to_cpu(req_hdr->CreditRequest), 1); - if (credits_requested > 0) { - aux_credits = credits_requested - 1; - aux_max = 32; - if (hdr->Command == SMB2_NEGOTIATE) - aux_max = 0; - aux_credits = (aux_credits < aux_max) ? aux_credits : aux_max; - credits_granted = aux_credits + credit_charge; + /* according to smb2.credits smbtorture, Windows server + * 2016 or later grant up to 8192 credits at once. + * + * TODO: Need to adjuct CreditRequest value according to + * current cpu load + */ + aux_credits = credits_requested - 1; + if (hdr->Command == SMB2_NEGOTIATE) + aux_max = 0; + else + aux_max = conn->max_credits - credit_charge; + aux_credits = min_t(unsigned short, aux_credits, aux_max); + credits_granted = credit_charge + aux_credits; - /* if credits granted per client is getting bigger than default - * minimum credits then we should wrap it up within the limits. - */ - if ((conn->total_credits + credits_granted) > min_credits) - credits_granted = min_credits - conn->total_credits; - /* - * TODO: Need to adjuct CreditRequest value according to - * current cpu load - */ - } else if (conn->total_credits == 0) { - credits_granted = 1; - } + if (conn->max_credits - conn->total_credits < credits_granted) + credits_granted = conn->max_credits - + conn->total_credits; conn->total_credits += credits_granted; work->credits_granted += credits_granted; @@ -371,7 +346,6 @@ int smb2_set_rsp_credits(struct ksmbd_work *work) /* Update CreditRequest in last request */ hdr->CreditRequest = cpu_to_le16(work->credits_granted); } -out: ksmbd_debug(SMB, "credits: requested[%d] granted[%d] total_granted[%d]\n", credits_requested, credits_granted, @@ -433,7 +407,7 @@ static void init_chained_smb2_rsp(struct ksmbd_work *work) work->compound_pfid = KSMBD_NO_FID; } memset((char *)rsp_hdr + 4, 0, sizeof(struct smb2_hdr) + 2); - rsp_hdr->ProtocolId = rcv_hdr->ProtocolId; + rsp_hdr->ProtocolId = SMB2_PROTO_NUMBER; rsp_hdr->StructureSize = SMB2_HEADER_STRUCTURE_SIZE; rsp_hdr->Command = rcv_hdr->Command; @@ -459,13 +433,28 @@ static void init_chained_smb2_rsp(struct ksmbd_work *work) bool is_chained_smb2_message(struct ksmbd_work *work) { struct smb2_hdr *hdr = work->request_buf; - unsigned int len; + unsigned int len, next_cmd; if (hdr->ProtocolId != SMB2_PROTO_NUMBER) return false; hdr = ksmbd_req_buf_next(work); - if (le32_to_cpu(hdr->NextCommand) > 0) { + next_cmd = le32_to_cpu(hdr->NextCommand); + if (next_cmd > 0) { + if ((u64)work->next_smb2_rcv_hdr_off + next_cmd + + __SMB2_HEADER_STRUCTURE_SIZE > + get_rfc1002_len(work->request_buf)) { + pr_err("next command(%u) offset exceeds smb msg size\n", + next_cmd); + return false; + } + + if ((u64)get_rfc1002_len(work->response_buf) + MAX_CIFS_SMALL_BUFFER_SIZE > + work->response_sz) { + pr_err("next response offset exceeds response buffer size\n"); + return false; + } + ksmbd_debug(SMB, "got SMB2 chained command\n"); init_chained_smb2_rsp(work); return true; @@ -535,7 +524,7 @@ int smb2_allocate_rsp_buf(struct ksmbd_work *work) { struct smb2_hdr *hdr = work->request_buf; size_t small_sz = MAX_CIFS_SMALL_BUFFER_SIZE; - size_t large_sz = work->conn->vals->max_trans_size + MAX_SMB2_HDR_SIZE; + size_t large_sz = small_sz + work->conn->vals->max_trans_size; size_t sz = small_sz; int cmd = le16_to_cpu(hdr->Command); @@ -634,7 +623,7 @@ static char * smb2_get_name(struct ksmbd_share_config *share, const char *src, const int maxlen, struct nls_table *local_nls) { - char *name, *unixname; + char *name; name = smb_strndup_from_utf16(src, maxlen, 1, local_nls); if (IS_ERR(name)) { @@ -642,19 +631,9 @@ smb2_get_name(struct ksmbd_share_config *share, const char *src, return name; } - /* change it to absolute unix name */ ksmbd_conv_path_to_unix(name); ksmbd_strip_last_slash(name); - - unixname = convert_to_unix_name(share, name); - kfree(name); - if (!unixname) { - pr_err("can not convert absolute name\n"); - return ERR_PTR(-ENOMEM); - } - - ksmbd_debug(SMB, "absolute name = %s\n", unixname); - return unixname; + return name; } int setup_async_work(struct ksmbd_work *work, void (*fn)(void **), void **arg) @@ -1068,6 +1047,7 @@ int smb2_handle_negotiate(struct ksmbd_work *work) struct smb2_negotiate_req *req = work->request_buf; struct smb2_negotiate_rsp *rsp = work->response_buf; int rc = 0; + unsigned int smb2_buf_len, smb2_neg_size; __le32 status; ksmbd_debug(SMB, "Received negotiate request\n"); @@ -1085,6 +1065,44 @@ int smb2_handle_negotiate(struct ksmbd_work *work) goto err_out; } + smb2_buf_len = get_rfc1002_len(work->request_buf); + smb2_neg_size = offsetof(struct smb2_negotiate_req, Dialects) - 4; + if (smb2_neg_size > smb2_buf_len) { + rsp->hdr.Status = STATUS_INVALID_PARAMETER; + rc = -EINVAL; + goto err_out; + } + + if (conn->dialect == SMB311_PROT_ID) { + unsigned int nego_ctxt_off = le32_to_cpu(req->NegotiateContextOffset); + + if (smb2_buf_len < nego_ctxt_off) { + rsp->hdr.Status = STATUS_INVALID_PARAMETER; + rc = -EINVAL; + goto err_out; + } + + if (smb2_neg_size > nego_ctxt_off) { + rsp->hdr.Status = STATUS_INVALID_PARAMETER; + rc = -EINVAL; + goto err_out; + } + + if (smb2_neg_size + le16_to_cpu(req->DialectCount) * sizeof(__le16) > + nego_ctxt_off) { + rsp->hdr.Status = STATUS_INVALID_PARAMETER; + rc = -EINVAL; + goto err_out; + } + } else { + if (smb2_neg_size + le16_to_cpu(req->DialectCount) * sizeof(__le16) > + smb2_buf_len) { + rsp->hdr.Status = STATUS_INVALID_PARAMETER; + rc = -EINVAL; + goto err_out; + } + } + conn->cli_cap = le32_to_cpu(req->Capabilities); switch (conn->dialect) { case SMB311_PROT_ID: @@ -1128,13 +1146,6 @@ int smb2_handle_negotiate(struct ksmbd_work *work) case SMB21_PROT_ID: init_smb2_1_server(conn); break; - case SMB20_PROT_ID: - rc = init_smb2_0_server(conn); - if (rc) { - rsp->hdr.Status = STATUS_NOT_SUPPORTED; - goto err_out; - } - break; case SMB2X_PROT_ID: case BAD_PROT_ID: default: @@ -1153,11 +1164,9 @@ int smb2_handle_negotiate(struct ksmbd_work *work) rsp->MaxReadSize = cpu_to_le32(conn->vals->max_read_size); rsp->MaxWriteSize = cpu_to_le32(conn->vals->max_write_size); - if (conn->dialect > SMB20_PROT_ID) { - memcpy(conn->ClientGUID, req->ClientGUID, - SMB2_CLIENT_GUID_SIZE); - conn->cli_sec_mode = le16_to_cpu(req->SecurityMode); - } + memcpy(conn->ClientGUID, req->ClientGUID, + SMB2_CLIENT_GUID_SIZE); + conn->cli_sec_mode = le16_to_cpu(req->SecurityMode); rsp->StructureSize = cpu_to_le16(65); rsp->DialectRevision = cpu_to_le16(conn->dialect); @@ -1248,19 +1257,13 @@ static int generate_preauth_hash(struct ksmbd_work *work) return 0; } -static int decode_negotiation_token(struct ksmbd_work *work, - struct negotiate_message *negblob) +static int decode_negotiation_token(struct ksmbd_conn *conn, + struct negotiate_message *negblob, + size_t sz) { - struct ksmbd_conn *conn = work->conn; - struct smb2_sess_setup_req *req; - int sz; - if (!conn->use_spnego) return -EINVAL; - req = work->request_buf; - sz = le16_to_cpu(req->SecurityBufferLength); - if (ksmbd_decode_negTokenInit((char *)negblob, sz, conn)) { if (ksmbd_decode_negTokenTarg((char *)negblob, sz, conn)) { conn->auth_mechs |= KSMBD_AUTH_NTLMSSP; @@ -1272,9 +1275,9 @@ static int decode_negotiation_token(struct ksmbd_work *work, } static int ntlm_negotiate(struct ksmbd_work *work, - struct negotiate_message *negblob) + struct negotiate_message *negblob, + size_t negblob_len) { - struct smb2_sess_setup_req *req = work->request_buf; struct smb2_sess_setup_rsp *rsp = work->response_buf; struct challenge_message *chgblob; unsigned char *spnego_blob = NULL; @@ -1283,8 +1286,7 @@ static int ntlm_negotiate(struct ksmbd_work *work, int sz, rc; ksmbd_debug(SMB, "negotiate phase\n"); - sz = le16_to_cpu(req->SecurityBufferLength); - rc = ksmbd_decode_ntlmssp_neg_blob(negblob, sz, work->sess); + rc = ksmbd_decode_ntlmssp_neg_blob(negblob, negblob_len, work->sess); if (rc) return rc; @@ -1352,12 +1354,23 @@ static struct ksmbd_user *session_user(struct ksmbd_conn *conn, struct authenticate_message *authblob; struct ksmbd_user *user; char *name; - int sz; + unsigned int auth_msg_len, name_off, name_len, secbuf_len; + secbuf_len = le16_to_cpu(req->SecurityBufferLength); + if (secbuf_len < sizeof(struct authenticate_message)) { + ksmbd_debug(SMB, "blob len %d too small\n", secbuf_len); + return NULL; + } authblob = user_authblob(conn, req); - sz = le32_to_cpu(authblob->UserName.BufferOffset); - name = smb_strndup_from_utf16((const char *)authblob + sz, - le16_to_cpu(authblob->UserName.Length), + name_off = le32_to_cpu(authblob->UserName.BufferOffset); + name_len = le16_to_cpu(authblob->UserName.Length); + auth_msg_len = le16_to_cpu(req->SecurityBufferOffset) + secbuf_len; + + if (auth_msg_len < (u64)name_off + name_len) + return NULL; + + name = smb_strndup_from_utf16((const char *)authblob + name_off, + name_len, true, conn->local_nls); if (IS_ERR(name)) { @@ -1499,11 +1512,9 @@ binding_session: } } - if (conn->dialect > SMB20_PROT_ID) { - if (!ksmbd_conn_lookup_dialect(conn)) { - pr_err("fail to verify the dialect\n"); - return -ENOENT; - } + if (!ksmbd_conn_lookup_dialect(conn)) { + pr_err("fail to verify the dialect\n"); + return -ENOENT; } return 0; } @@ -1585,11 +1596,9 @@ static int krb5_authenticate(struct ksmbd_work *work) } } - if (conn->dialect > SMB20_PROT_ID) { - if (!ksmbd_conn_lookup_dialect(conn)) { - pr_err("fail to verify the dialect\n"); - return -ENOENT; - } + if (!ksmbd_conn_lookup_dialect(conn)) { + pr_err("fail to verify the dialect\n"); + return -ENOENT; } return 0; } @@ -1607,6 +1616,7 @@ int smb2_sess_setup(struct ksmbd_work *work) struct smb2_sess_setup_rsp *rsp = work->response_buf; struct ksmbd_session *sess; struct negotiate_message *negblob; + unsigned int negblob_len, negblob_off; int rc = 0; ksmbd_debug(SMB, "Received request for session setup\n"); @@ -1687,10 +1697,16 @@ int smb2_sess_setup(struct ksmbd_work *work) if (sess->state == SMB2_SESSION_EXPIRED) sess->state = SMB2_SESSION_IN_PROGRESS; + negblob_off = le16_to_cpu(req->SecurityBufferOffset); + negblob_len = le16_to_cpu(req->SecurityBufferLength); + if (negblob_off < (offsetof(struct smb2_sess_setup_req, Buffer) - 4) || + negblob_len < offsetof(struct negotiate_message, NegotiateFlags)) + return -EINVAL; + negblob = (struct negotiate_message *)((char *)&req->hdr.ProtocolId + - le16_to_cpu(req->SecurityBufferOffset)); + negblob_off); - if (decode_negotiation_token(work, negblob) == 0) { + if (decode_negotiation_token(conn, negblob, negblob_len) == 0) { if (conn->mechToken) negblob = (struct negotiate_message *)conn->mechToken; } @@ -1714,7 +1730,7 @@ int smb2_sess_setup(struct ksmbd_work *work) sess->Preauth_HashValue = NULL; } else if (conn->preferred_auth_mech == KSMBD_AUTH_NTLMSSP) { if (negblob->MessageType == NtLmNegotiate) { - rc = ntlm_negotiate(work, negblob); + rc = ntlm_negotiate(work, negblob, negblob_len); if (rc) goto out_err; rsp->hdr.Status = @@ -1774,9 +1790,30 @@ out_err: conn->mechToken = NULL; } - if (rc < 0 && sess) { - ksmbd_session_destroy(sess); - work->sess = NULL; + if (rc < 0) { + /* + * SecurityBufferOffset should be set to zero + * in session setup error response. + */ + rsp->SecurityBufferOffset = 0; + + if (sess) { + bool try_delay = false; + + /* + * To avoid dictionary attacks (repeated session setups rapidly sent) to + * connect to server, ksmbd make a delay of a 5 seconds on session setup + * failure to make it harder to send enough random connection requests + * to break into a server. + */ + if (sess->user && sess->user->flags & KSMBD_USER_FLAG_DELAY_SESSION) + try_delay = true; + + ksmbd_session_destroy(sess); + work->sess = NULL; + if (try_delay) + ssleep(5); + } } return rc; @@ -2103,16 +2140,22 @@ out: * smb2_set_ea() - handler for setting extended attributes using set * info command * @eabuf: set info command buffer + * @buf_len: set info command buffer length * @path: dentry path for get ea * * Return: 0 on success, otherwise error */ -static int smb2_set_ea(struct smb2_ea_info *eabuf, struct path *path) +static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len, + struct path *path) { struct user_namespace *user_ns = mnt_user_ns(path->mnt); char *attr_name = NULL, *value; int rc = 0; - int next = 0; + unsigned int next = 0; + + if (buf_len < sizeof(struct smb2_ea_info) + eabuf->EaNameLength + + le16_to_cpu(eabuf->EaValueLength)) + return -EINVAL; attr_name = kmalloc(XATTR_NAME_MAX + 1, GFP_KERNEL); if (!attr_name) @@ -2177,7 +2220,13 @@ static int smb2_set_ea(struct smb2_ea_info *eabuf, struct path *path) next: next = le32_to_cpu(eabuf->NextEntryOffset); + if (next == 0 || buf_len < next) + break; + buf_len -= next; eabuf = (struct smb2_ea_info *)((char *)eabuf + next); + if (next < (u32)eabuf->EaNameLength + le16_to_cpu(eabuf->EaValueLength)) + break; + } while (next != 0); kfree(attr_name); @@ -2348,7 +2397,7 @@ static int smb2_creat(struct ksmbd_work *work, struct path *path, char *name, return rc; } - rc = ksmbd_vfs_kern_path(name, 0, path, 0); + rc = ksmbd_vfs_kern_path(work, name, 0, path, 0); if (rc) { pr_err("cannot get linux path (%s), err = %d\n", name, rc); @@ -2377,6 +2426,10 @@ static int smb2_create_sd_buffer(struct ksmbd_work *work, ksmbd_debug(SMB, "Set ACLs using SMB2_CREATE_SD_BUFFER context\n"); sd_buf = (struct create_sd_buf_req *)context; + if (le16_to_cpu(context->DataOffset) + + le32_to_cpu(context->DataLength) < + sizeof(struct create_sd_buf_req)) + return -EINVAL; return set_info_sec(work->conn, work->tcon, path, &sd_buf->ntsd, le32_to_cpu(sd_buf->ccontext.DataLength), true); } @@ -2423,7 +2476,7 @@ int smb2_open(struct ksmbd_work *work) struct oplock_info *opinfo; __le32 *next_ptr = NULL; int req_op_level = 0, open_flags = 0, may_flags = 0, file_info = 0; - int rc = 0, len = 0; + int rc = 0; int contxt_cnt = 0, query_disk_id = 0; int maximal_access_ctxt = 0, posix_ctxt = 0; int s_type = 0; @@ -2495,17 +2548,11 @@ int smb2_open(struct ksmbd_work *work) goto err_out1; } } else { - len = strlen(share->path); - ksmbd_debug(SMB, "share path len %d\n", len); - name = kmalloc(len + 1, GFP_KERNEL); + name = kstrdup("", GFP_KERNEL); if (!name) { - rsp->hdr.Status = STATUS_NO_MEMORY; rc = -ENOMEM; goto err_out1; } - - memcpy(name, share->path, len); - *(name + len) = '\0'; } req_op_level = req->RequestedOplockLevel; @@ -2577,6 +2624,12 @@ int smb2_open(struct ksmbd_work *work) goto err_out1; } else if (context) { ea_buf = (struct create_ea_buf_req *)context; + if (le16_to_cpu(context->DataOffset) + + le32_to_cpu(context->DataLength) < + sizeof(struct create_ea_buf_req)) { + rc = -EINVAL; + goto err_out1; + } if (req->CreateOptions & FILE_NO_EA_KNOWLEDGE_LE) { rsp->hdr.Status = STATUS_ACCESS_DENIED; rc = -EACCES; @@ -2615,6 +2668,12 @@ int smb2_open(struct ksmbd_work *work) } else if (context) { struct create_posix *posix = (struct create_posix *)context; + if (le16_to_cpu(context->DataOffset) + + le32_to_cpu(context->DataLength) < + sizeof(struct create_posix)) { + rc = -EINVAL; + goto err_out1; + } ksmbd_debug(SMB, "get posix context\n"); posix_mode = le32_to_cpu(posix->Mode); @@ -2628,13 +2687,9 @@ int smb2_open(struct ksmbd_work *work) goto err_out1; } - if (req->CreateOptions & FILE_DELETE_ON_CLOSE_LE) { - /* - * On delete request, instead of following up, need to - * look the current entity - */ - rc = ksmbd_vfs_kern_path(name, 0, &path, 1); - if (!rc) { + rc = ksmbd_vfs_kern_path(work, name, LOOKUP_NO_SYMLINKS, &path, 1); + if (!rc) { + if (req->CreateOptions & FILE_DELETE_ON_CLOSE_LE) { /* * If file exists with under flags, return access * denied error. @@ -2653,34 +2708,16 @@ int smb2_open(struct ksmbd_work *work) path_put(&path); goto err_out; } - } - } else { - if (test_share_config_flag(work->tcon->share_conf, - KSMBD_SHARE_FLAG_FOLLOW_SYMLINKS)) { - /* - * Use LOOKUP_FOLLOW to follow the path of - * symlink in path buildup - */ - rc = ksmbd_vfs_kern_path(name, LOOKUP_FOLLOW, &path, 1); - if (rc) { /* Case for broken link ?*/ - rc = ksmbd_vfs_kern_path(name, 0, &path, 1); - } - } else { - rc = ksmbd_vfs_kern_path(name, 0, &path, 1); - if (!rc && d_is_symlink(path.dentry)) { - rc = -EACCES; - path_put(&path); - goto err_out; - } + } else if (d_is_symlink(path.dentry)) { + rc = -EACCES; + path_put(&path); + goto err_out; } } if (rc) { - if (rc == -EACCES) { - ksmbd_debug(SMB, - "User does not have right permission\n"); + if (rc != -ENOENT) goto err_out; - } ksmbd_debug(SMB, "can not get linux path for %s, rc = %d\n", name, rc); rc = 0; @@ -2786,7 +2823,15 @@ int smb2_open(struct ksmbd_work *work) created = true; user_ns = mnt_user_ns(path.mnt); if (ea_buf) { - rc = smb2_set_ea(&ea_buf->ea, &path); + if (le32_to_cpu(ea_buf->ccontext.DataLength) < + sizeof(struct smb2_ea_info)) { + rc = -EINVAL; + goto err_out; + } + + rc = smb2_set_ea(&ea_buf->ea, + le32_to_cpu(ea_buf->ccontext.DataLength), + &path); if (rc == -EOPNOTSUPP) rc = 0; else if (rc) @@ -3019,9 +3064,16 @@ int smb2_open(struct ksmbd_work *work) rc = PTR_ERR(az_req); goto err_out; } else if (az_req) { - loff_t alloc_size = le64_to_cpu(az_req->AllocationSize); + loff_t alloc_size; int err; + if (le16_to_cpu(az_req->ccontext.DataOffset) + + le32_to_cpu(az_req->ccontext.DataLength) < + sizeof(struct create_alloc_size_req)) { + rc = -EINVAL; + goto err_out; + } + alloc_size = le64_to_cpu(az_req->AllocationSize); ksmbd_debug(SMB, "request smb2 create allocate size : %llu\n", alloc_size); @@ -3176,7 +3228,7 @@ err_out1: rsp->hdr.Status = STATUS_INVALID_PARAMETER; else if (rc == -EOPNOTSUPP) rsp->hdr.Status = STATUS_NOT_SUPPORTED; - else if (rc == -EACCES || rc == -ESTALE) + else if (rc == -EACCES || rc == -ESTALE || rc == -EXDEV) rsp->hdr.Status = STATUS_ACCESS_DENIED; else if (rc == -ENOENT) rsp->hdr.Status = STATUS_OBJECT_NAME_INVALID; @@ -3742,6 +3794,24 @@ static int verify_info_level(int info_level) return 0; } +static int smb2_calc_max_out_buf_len(struct ksmbd_work *work, + unsigned short hdr2_len, + unsigned int out_buf_len) +{ + int free_len; + + if (out_buf_len > work->conn->vals->max_trans_size) + return -EINVAL; + + free_len = (int)(work->response_sz - + (get_rfc1002_len(work->response_buf) + 4)) - + hdr2_len; + if (free_len < 0) + return -EINVAL; + + return min_t(int, out_buf_len, free_len); +} + int smb2_query_dir(struct ksmbd_work *work) { struct ksmbd_conn *conn = work->conn; @@ -3818,9 +3888,13 @@ int smb2_query_dir(struct ksmbd_work *work) memset(&d_info, 0, sizeof(struct ksmbd_dir_info)); d_info.wptr = (char *)rsp->Buffer; d_info.rptr = (char *)rsp->Buffer; - d_info.out_buf_len = (work->response_sz - (get_rfc1002_len(rsp_org) + 4)); - d_info.out_buf_len = min_t(int, d_info.out_buf_len, le32_to_cpu(req->OutputBufferLength)) - - sizeof(struct smb2_query_directory_rsp); + d_info.out_buf_len = + smb2_calc_max_out_buf_len(work, 8, + le32_to_cpu(req->OutputBufferLength)); + if (d_info.out_buf_len < 0) { + rc = -EINVAL; + goto err_out; + } d_info.flags = srch_flag; /* @@ -4041,6 +4115,10 @@ static int smb2_get_ea(struct ksmbd_work *work, struct ksmbd_file *fp, path = &fp->filp->f_path; /* single EA entry is requested with given user.* name */ if (req->InputBufferLength) { + if (le32_to_cpu(req->InputBufferLength) < + sizeof(struct smb2_ea_info_req)) + return -EINVAL; + ea_req = (struct smb2_ea_info_req *)req->Buffer; } else { /* need to send all EAs, if no specific EA is requested*/ @@ -4050,12 +4128,11 @@ static int smb2_get_ea(struct ksmbd_work *work, struct ksmbd_file *fp, le32_to_cpu(req->Flags)); } - buf_free_len = work->response_sz - - (get_rfc1002_len(rsp_org) + 4) - - sizeof(struct smb2_query_info_rsp); - - if (le32_to_cpu(req->OutputBufferLength) < buf_free_len) - buf_free_len = le32_to_cpu(req->OutputBufferLength); + buf_free_len = + smb2_calc_max_out_buf_len(work, 8, + le32_to_cpu(req->OutputBufferLength)); + if (buf_free_len < 0) + return -EINVAL; rc = ksmbd_vfs_listxattr(path->dentry, &xattr_list); if (rc < 0) { @@ -4186,7 +4263,7 @@ static void get_file_access_info(struct smb2_query_info_rsp *rsp, static int get_file_basic_info(struct smb2_query_info_rsp *rsp, struct ksmbd_file *fp, void *rsp_org) { - struct smb2_file_all_info *basic_info; + struct smb2_file_basic_info *basic_info; struct kstat stat; u64 time; @@ -4196,7 +4273,7 @@ static int get_file_basic_info(struct smb2_query_info_rsp *rsp, return -EACCES; } - basic_info = (struct smb2_file_all_info *)rsp->Buffer; + basic_info = (struct smb2_file_basic_info *)rsp->Buffer; generic_fillattr(file_mnt_user_ns(fp->filp), file_inode(fp->filp), &stat); basic_info->CreationTime = cpu_to_le64(fp->create_time); @@ -4209,9 +4286,8 @@ static int get_file_basic_info(struct smb2_query_info_rsp *rsp, basic_info->Attributes = fp->f_ci->m_fattr; basic_info->Pad1 = 0; rsp->OutputBufferLength = - cpu_to_le32(offsetof(struct smb2_file_all_info, AllocationSize)); - inc_rfc1001_len(rsp_org, offsetof(struct smb2_file_all_info, - AllocationSize)); + cpu_to_le32(sizeof(struct smb2_file_basic_info)); + inc_rfc1001_len(rsp_org, sizeof(struct smb2_file_basic_info)); return 0; } @@ -4288,8 +4364,7 @@ static int get_file_all_info(struct ksmbd_work *work, return -EACCES; } - filename = convert_to_nt_pathname(fp->filename, - work->tcon->share_conf->path); + filename = convert_to_nt_pathname(fp->filename); if (!filename) return -ENOMEM; @@ -4368,6 +4443,8 @@ static void get_file_stream_info(struct ksmbd_work *work, struct path *path = &fp->filp->f_path; ssize_t xattr_list_len; int nbytes = 0, streamlen, stream_name_len, next, idx = 0; + int buf_free_len; + struct smb2_query_info_req *req = ksmbd_req_buf_next(work); generic_fillattr(file_mnt_user_ns(fp->filp), file_inode(fp->filp), &stat); @@ -4381,6 +4458,12 @@ static void get_file_stream_info(struct ksmbd_work *work, goto out; } + buf_free_len = + smb2_calc_max_out_buf_len(work, 8, + le32_to_cpu(req->OutputBufferLength)); + if (buf_free_len < 0) + goto out; + while (idx < xattr_list_len) { stream_name = xattr_list + idx; streamlen = strlen(stream_name); @@ -4405,6 +4488,10 @@ static void get_file_stream_info(struct ksmbd_work *work, streamlen = snprintf(stream_buf, streamlen + 1, ":%s", &stream_name[XATTR_NAME_STREAM_LEN]); + next = sizeof(struct smb2_file_stream_info) + streamlen * 2; + if (next > buf_free_len) + break; + file_info = (struct smb2_file_stream_info *)&rsp->Buffer[nbytes]; streamlen = smbConvertToUTF16((__le16 *)file_info->StreamName, stream_buf, streamlen, @@ -4415,22 +4502,21 @@ static void get_file_stream_info(struct ksmbd_work *work, file_info->StreamSize = cpu_to_le64(stream_name_len); file_info->StreamAllocationSize = cpu_to_le64(stream_name_len); - next = sizeof(struct smb2_file_stream_info) + streamlen; nbytes += next; + buf_free_len -= next; file_info->NextEntryOffset = cpu_to_le32(next); } - if (nbytes) { + if (!S_ISDIR(stat.mode) && + buf_free_len >= sizeof(struct smb2_file_stream_info) + 7 * 2) { file_info = (struct smb2_file_stream_info *) &rsp->Buffer[nbytes]; streamlen = smbConvertToUTF16((__le16 *)file_info->StreamName, "::$DATA", 7, conn->local_nls, 0); streamlen *= 2; file_info->StreamNameLength = cpu_to_le32(streamlen); - file_info->StreamSize = S_ISDIR(stat.mode) ? 0 : - cpu_to_le64(stat.size); - file_info->StreamAllocationSize = S_ISDIR(stat.mode) ? 0 : - cpu_to_le64(stat.size); + file_info->StreamSize = 0; + file_info->StreamAllocationSize = 0; nbytes += sizeof(struct smb2_file_stream_info) + streamlen; } @@ -4745,12 +4831,8 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, struct path path; int rc = 0, len; int fs_infoclass_size = 0; - int lookup_flags = 0; - - if (test_share_config_flag(share, KSMBD_SHARE_FLAG_FOLLOW_SYMLINKS)) - lookup_flags = LOOKUP_FOLLOW; - rc = ksmbd_vfs_kern_path(share->path, lookup_flags, &path, 0); + rc = kern_path(share->path, LOOKUP_NO_SYMLINKS, &path); if (rc) { pr_err("cannot create vfs path\n"); return -EIO; @@ -5299,7 +5381,7 @@ static int smb2_rename(struct ksmbd_work *work, goto out; len = strlen(new_name); - if (new_name[len - 1] != '/') { + if (len > 0 && new_name[len - 1] != '/') { pr_err("not allow base filename in rename\n"); rc = -ESHARE; goto out; @@ -5327,11 +5409,14 @@ static int smb2_rename(struct ksmbd_work *work, } ksmbd_debug(SMB, "new name %s\n", new_name); - rc = ksmbd_vfs_kern_path(new_name, 0, &path, 1); - if (rc) + rc = ksmbd_vfs_kern_path(work, new_name, LOOKUP_NO_SYMLINKS, &path, 1); + if (rc) { + if (rc != -ENOENT) + goto out; file_present = false; - else + } else { path_put(&path); + } if (ksmbd_share_veto_filename(share, new_name)) { rc = -ENOENT; @@ -5371,7 +5456,7 @@ out: static int smb2_create_link(struct ksmbd_work *work, struct ksmbd_share_config *share, struct smb2_file_link_info *file_info, - struct file *filp, + unsigned int buf_len, struct file *filp, struct nls_table *local_nls) { char *link_name = NULL, *target_name = NULL, *pathname = NULL; @@ -5379,6 +5464,10 @@ static int smb2_create_link(struct ksmbd_work *work, bool file_present = true; int rc; + if (buf_len < (u64)sizeof(struct smb2_file_link_info) + + le32_to_cpu(file_info->FileNameLength)) + return -EINVAL; + ksmbd_debug(SMB, "setting FILE_LINK_INFORMATION\n"); pathname = kmalloc(PATH_MAX, GFP_KERNEL); if (!pathname) @@ -5401,11 +5490,14 @@ static int smb2_create_link(struct ksmbd_work *work, } ksmbd_debug(SMB, "target name is %s\n", target_name); - rc = ksmbd_vfs_kern_path(link_name, 0, &path, 0); - if (rc) + rc = ksmbd_vfs_kern_path(work, link_name, LOOKUP_NO_SYMLINKS, &path, 0); + if (rc) { + if (rc != -ENOENT) + goto out; file_present = false; - else + } else { path_put(&path); + } if (file_info->ReplaceIfExists) { if (file_present) { @@ -5435,12 +5527,11 @@ out: return rc; } -static int set_file_basic_info(struct ksmbd_file *fp, char *buf, +static int set_file_basic_info(struct ksmbd_file *fp, + struct smb2_file_basic_info *file_info, struct ksmbd_share_config *share) { - struct smb2_file_all_info *file_info; struct iattr attrs; - struct timespec64 ctime; struct file *filp; struct inode *inode; struct user_namespace *user_ns; @@ -5449,7 +5540,6 @@ static int set_file_basic_info(struct ksmbd_file *fp, char *buf, if (!(fp->daccess & FILE_WRITE_ATTRIBUTES_LE)) return -EACCES; - file_info = (struct smb2_file_all_info *)buf; attrs.ia_valid = 0; filp = fp->filp; inode = file_inode(filp); @@ -5463,13 +5553,11 @@ static int set_file_basic_info(struct ksmbd_file *fp, char *buf, attrs.ia_valid |= (ATTR_ATIME | ATTR_ATIME_SET); } - if (file_info->ChangeTime) { + attrs.ia_valid |= ATTR_CTIME; + if (file_info->ChangeTime) attrs.ia_ctime = ksmbd_NTtimeToUnix(file_info->ChangeTime); - ctime = attrs.ia_ctime; - attrs.ia_valid |= ATTR_CTIME; - } else { - ctime = inode->i_ctime; - } + else + attrs.ia_ctime = inode->i_ctime; if (file_info->LastWriteTime) { attrs.ia_mtime = ksmbd_NTtimeToUnix(file_info->LastWriteTime); @@ -5515,18 +5603,17 @@ static int set_file_basic_info(struct ksmbd_file *fp, char *buf, return -EACCES; inode_lock(inode); + inode->i_ctime = attrs.ia_ctime; + attrs.ia_valid &= ~ATTR_CTIME; rc = notify_change(user_ns, dentry, &attrs, NULL); - if (!rc) { - inode->i_ctime = ctime; - mark_inode_dirty(inode); - } inode_unlock(inode); } return rc; } static int set_file_allocation_info(struct ksmbd_work *work, - struct ksmbd_file *fp, char *buf) + struct ksmbd_file *fp, + struct smb2_file_alloc_info *file_alloc_info) { /* * TODO : It's working fine only when store dos attributes @@ -5534,7 +5621,6 @@ static int set_file_allocation_info(struct ksmbd_work *work, * properly with any smb.conf option */ - struct smb2_file_alloc_info *file_alloc_info; loff_t alloc_blks; struct inode *inode; int rc; @@ -5542,7 +5628,6 @@ static int set_file_allocation_info(struct ksmbd_work *work, if (!(fp->daccess & FILE_WRITE_DATA_LE)) return -EACCES; - file_alloc_info = (struct smb2_file_alloc_info *)buf; alloc_blks = (le64_to_cpu(file_alloc_info->AllocationSize) + 511) >> 9; inode = file_inode(fp->filp); @@ -5565,7 +5650,7 @@ static int set_file_allocation_info(struct ksmbd_work *work, * inode size is retained by backup inode size. */ size = i_size_read(inode); - rc = ksmbd_vfs_truncate(work, NULL, fp, alloc_blks * 512); + rc = ksmbd_vfs_truncate(work, fp, alloc_blks * 512); if (rc) { pr_err("truncate failed! filename : %s, err %d\n", fp->filename, rc); @@ -5578,9 +5663,8 @@ static int set_file_allocation_info(struct ksmbd_work *work, } static int set_end_of_file_info(struct ksmbd_work *work, struct ksmbd_file *fp, - char *buf) + struct smb2_file_eof_info *file_eof_info) { - struct smb2_file_eof_info *file_eof_info; loff_t newsize; struct inode *inode; int rc; @@ -5588,7 +5672,6 @@ static int set_end_of_file_info(struct ksmbd_work *work, struct ksmbd_file *fp, if (!(fp->daccess & FILE_WRITE_DATA_LE)) return -EACCES; - file_eof_info = (struct smb2_file_eof_info *)buf; newsize = le64_to_cpu(file_eof_info->EndOfFile); inode = file_inode(fp->filp); @@ -5602,7 +5685,7 @@ static int set_end_of_file_info(struct ksmbd_work *work, struct ksmbd_file *fp, if (inode->i_sb->s_magic != MSDOS_SUPER_MAGIC) { ksmbd_debug(SMB, "filename : %s truncated to newsize %lld\n", fp->filename, newsize); - rc = ksmbd_vfs_truncate(work, NULL, fp, newsize); + rc = ksmbd_vfs_truncate(work, fp, newsize); if (rc) { ksmbd_debug(SMB, "truncate failed! filename : %s err %d\n", fp->filename, rc); @@ -5615,7 +5698,8 @@ static int set_end_of_file_info(struct ksmbd_work *work, struct ksmbd_file *fp, } static int set_rename_info(struct ksmbd_work *work, struct ksmbd_file *fp, - char *buf) + struct smb2_file_rename_info *rename_info, + unsigned int buf_len) { struct user_namespace *user_ns; struct ksmbd_file *parent_fp; @@ -5628,6 +5712,10 @@ static int set_rename_info(struct ksmbd_work *work, struct ksmbd_file *fp, return -EACCES; } + if (buf_len < (u64)sizeof(struct smb2_file_rename_info) + + le32_to_cpu(rename_info->FileNameLength)) + return -EINVAL; + user_ns = file_mnt_user_ns(fp->filp); if (ksmbd_stream_fd(fp)) goto next; @@ -5650,14 +5738,13 @@ static int set_rename_info(struct ksmbd_work *work, struct ksmbd_file *fp, } } next: - return smb2_rename(work, fp, user_ns, - (struct smb2_file_rename_info *)buf, + return smb2_rename(work, fp, user_ns, rename_info, work->sess->conn->local_nls); } -static int set_file_disposition_info(struct ksmbd_file *fp, char *buf) +static int set_file_disposition_info(struct ksmbd_file *fp, + struct smb2_file_disposition_info *file_info) { - struct smb2_file_disposition_info *file_info; struct inode *inode; if (!(fp->daccess & FILE_DELETE_LE)) { @@ -5666,7 +5753,6 @@ static int set_file_disposition_info(struct ksmbd_file *fp, char *buf) } inode = file_inode(fp->filp); - file_info = (struct smb2_file_disposition_info *)buf; if (file_info->DeletePending) { if (S_ISDIR(inode->i_mode) && ksmbd_vfs_empty_dir(fp) == -ENOTEMPTY) @@ -5678,15 +5764,14 @@ static int set_file_disposition_info(struct ksmbd_file *fp, char *buf) return 0; } -static int set_file_position_info(struct ksmbd_file *fp, char *buf) +static int set_file_position_info(struct ksmbd_file *fp, + struct smb2_file_pos_info *file_info) { - struct smb2_file_pos_info *file_info; loff_t current_byte_offset; unsigned long sector_size; struct inode *inode; inode = file_inode(fp->filp); - file_info = (struct smb2_file_pos_info *)buf; current_byte_offset = le64_to_cpu(file_info->CurrentByteOffset); sector_size = inode->i_sb->s_blocksize; @@ -5702,12 +5787,11 @@ static int set_file_position_info(struct ksmbd_file *fp, char *buf) return 0; } -static int set_file_mode_info(struct ksmbd_file *fp, char *buf) +static int set_file_mode_info(struct ksmbd_file *fp, + struct smb2_file_mode_info *file_info) { - struct smb2_file_mode_info *file_info; __le32 mode; - file_info = (struct smb2_file_mode_info *)buf; mode = file_info->Mode; if ((mode & ~FILE_MODE_INFO_MASK) || @@ -5737,40 +5821,74 @@ static int set_file_mode_info(struct ksmbd_file *fp, char *buf) * TODO: need to implement an error handling for STATUS_INFO_LENGTH_MISMATCH */ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp, - int info_class, char *buf, + struct smb2_set_info_req *req, struct ksmbd_share_config *share) { - switch (info_class) { + unsigned int buf_len = le32_to_cpu(req->BufferLength); + + switch (req->FileInfoClass) { case FILE_BASIC_INFORMATION: - return set_file_basic_info(fp, buf, share); + { + if (buf_len < sizeof(struct smb2_file_basic_info)) + return -EINVAL; + return set_file_basic_info(fp, (struct smb2_file_basic_info *)req->Buffer, share); + } case FILE_ALLOCATION_INFORMATION: - return set_file_allocation_info(work, fp, buf); + { + if (buf_len < sizeof(struct smb2_file_alloc_info)) + return -EINVAL; + return set_file_allocation_info(work, fp, + (struct smb2_file_alloc_info *)req->Buffer); + } case FILE_END_OF_FILE_INFORMATION: - return set_end_of_file_info(work, fp, buf); + { + if (buf_len < sizeof(struct smb2_file_eof_info)) + return -EINVAL; + return set_end_of_file_info(work, fp, + (struct smb2_file_eof_info *)req->Buffer); + } case FILE_RENAME_INFORMATION: + { if (!test_tree_conn_flag(work->tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) { ksmbd_debug(SMB, "User does not have write permission\n"); return -EACCES; } - return set_rename_info(work, fp, buf); + if (buf_len < sizeof(struct smb2_file_rename_info)) + return -EINVAL; + + return set_rename_info(work, fp, + (struct smb2_file_rename_info *)req->Buffer, + buf_len); + } case FILE_LINK_INFORMATION: + { + if (buf_len < sizeof(struct smb2_file_link_info)) + return -EINVAL; + return smb2_create_link(work, work->tcon->share_conf, - (struct smb2_file_link_info *)buf, fp->filp, + (struct smb2_file_link_info *)req->Buffer, + buf_len, fp->filp, work->sess->conn->local_nls); - + } case FILE_DISPOSITION_INFORMATION: + { if (!test_tree_conn_flag(work->tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) { ksmbd_debug(SMB, "User does not have write permission\n"); return -EACCES; } - return set_file_disposition_info(fp, buf); + if (buf_len < sizeof(struct smb2_file_disposition_info)) + return -EINVAL; + + return set_file_disposition_info(fp, + (struct smb2_file_disposition_info *)req->Buffer); + } case FILE_FULL_EA_INFORMATION: { if (!(fp->daccess & FILE_WRITE_EA_LE)) { @@ -5779,18 +5897,29 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp, return -EACCES; } - return smb2_set_ea((struct smb2_ea_info *)buf, - &fp->filp->f_path); - } + if (buf_len < sizeof(struct smb2_ea_info)) + return -EINVAL; + return smb2_set_ea((struct smb2_ea_info *)req->Buffer, + buf_len, &fp->filp->f_path); + } case FILE_POSITION_INFORMATION: - return set_file_position_info(fp, buf); + { + if (buf_len < sizeof(struct smb2_file_pos_info)) + return -EINVAL; + return set_file_position_info(fp, (struct smb2_file_pos_info *)req->Buffer); + } case FILE_MODE_INFORMATION: - return set_file_mode_info(fp, buf); + { + if (buf_len < sizeof(struct smb2_file_mode_info)) + return -EINVAL; + + return set_file_mode_info(fp, (struct smb2_file_mode_info *)req->Buffer); + } } - pr_err("Unimplemented Fileinfoclass :%d\n", info_class); + pr_err("Unimplemented Fileinfoclass :%d\n", req->FileInfoClass); return -EOPNOTSUPP; } @@ -5851,8 +5980,7 @@ int smb2_set_info(struct ksmbd_work *work) switch (req->InfoType) { case SMB2_O_INFO_FILE: ksmbd_debug(SMB, "GOT SMB2_O_INFO_FILE\n"); - rc = smb2_set_info_file(work, fp, req->FileInfoClass, - req->Buffer, work->tcon->share_conf); + rc = smb2_set_info_file(work, fp, req, work->tcon->share_conf); break; case SMB2_O_INFO_SECURITY: ksmbd_debug(SMB, "GOT SMB2_O_INFO_SECURITY\n"); @@ -5879,7 +6007,7 @@ int smb2_set_info(struct ksmbd_work *work) return 0; err_out: - if (rc == -EACCES || rc == -EPERM) + if (rc == -EACCES || rc == -EPERM || rc == -EXDEV) rsp->hdr.Status = STATUS_ACCESS_DENIED; else if (rc == -EINVAL) rsp->hdr.Status = STATUS_INVALID_PARAMETER; @@ -6141,8 +6269,7 @@ static noinline int smb2_write_pipe(struct ksmbd_work *work) (offsetof(struct smb2_write_req, Buffer) - 4)) { data_buf = (char *)&req->Buffer[0]; } else { - if ((le16_to_cpu(req->DataOffset) > get_rfc1002_len(req)) || - (le16_to_cpu(req->DataOffset) + length > get_rfc1002_len(req))) { + if ((u64)le16_to_cpu(req->DataOffset) + length > get_rfc1002_len(req)) { pr_err("invalid write data offset %u, smb_len %u\n", le16_to_cpu(req->DataOffset), get_rfc1002_len(req)); @@ -6300,8 +6427,7 @@ int smb2_write(struct ksmbd_work *work) (offsetof(struct smb2_write_req, Buffer) - 4)) { data_buf = (char *)&req->Buffer[0]; } else { - if ((le16_to_cpu(req->DataOffset) > get_rfc1002_len(req)) || - (le16_to_cpu(req->DataOffset) + length > get_rfc1002_len(req))) { + if ((u64)le16_to_cpu(req->DataOffset) + length > get_rfc1002_len(req)) { pr_err("invalid write data offset %u, smb_len %u\n", le16_to_cpu(req->DataOffset), get_rfc1002_len(req)); @@ -6944,24 +7070,26 @@ out2: return err; } -static int fsctl_copychunk(struct ksmbd_work *work, struct smb2_ioctl_req *req, +static int fsctl_copychunk(struct ksmbd_work *work, + struct copychunk_ioctl_req *ci_req, + unsigned int cnt_code, + unsigned int input_count, + unsigned long long volatile_id, + unsigned long long persistent_id, struct smb2_ioctl_rsp *rsp) { - struct copychunk_ioctl_req *ci_req; struct copychunk_ioctl_rsp *ci_rsp; struct ksmbd_file *src_fp = NULL, *dst_fp = NULL; struct srv_copychunk *chunks; unsigned int i, chunk_count, chunk_count_written = 0; unsigned int chunk_size_written = 0; loff_t total_size_written = 0; - int ret, cnt_code; + int ret = 0; - cnt_code = le32_to_cpu(req->CntCode); - ci_req = (struct copychunk_ioctl_req *)&req->Buffer[0]; ci_rsp = (struct copychunk_ioctl_rsp *)&rsp->Buffer[0]; - rsp->VolatileFileId = req->VolatileFileId; - rsp->PersistentFileId = req->PersistentFileId; + rsp->VolatileFileId = cpu_to_le64(volatile_id); + rsp->PersistentFileId = cpu_to_le64(persistent_id); ci_rsp->ChunksWritten = cpu_to_le32(ksmbd_server_side_copy_max_chunk_count()); ci_rsp->ChunkBytesWritten = @@ -6971,12 +7099,13 @@ static int fsctl_copychunk(struct ksmbd_work *work, struct smb2_ioctl_req *req, chunks = (struct srv_copychunk *)&ci_req->Chunks[0]; chunk_count = le32_to_cpu(ci_req->ChunkCount); + if (chunk_count == 0) + goto out; total_size_written = 0; /* verify the SRV_COPYCHUNK_COPY packet */ if (chunk_count > ksmbd_server_side_copy_max_chunk_count() || - le32_to_cpu(req->InputCount) < - offsetof(struct copychunk_ioctl_req, Chunks) + + input_count < offsetof(struct copychunk_ioctl_req, Chunks) + chunk_count * sizeof(struct srv_copychunk)) { rsp->hdr.Status = STATUS_INVALID_PARAMETER; return -EINVAL; @@ -6997,9 +7126,7 @@ static int fsctl_copychunk(struct ksmbd_work *work, struct smb2_ioctl_req *req, src_fp = ksmbd_lookup_foreign_fd(work, le64_to_cpu(ci_req->ResumeKey[0])); - dst_fp = ksmbd_lookup_fd_slow(work, - le64_to_cpu(req->VolatileFileId), - le64_to_cpu(req->PersistentFileId)); + dst_fp = ksmbd_lookup_fd_slow(work, volatile_id, persistent_id); ret = -EINVAL; if (!src_fp || src_fp->persistent_id != le64_to_cpu(ci_req->ResumeKey[1])) { @@ -7074,8 +7201,8 @@ static __be32 idev_ipv4_address(struct in_device *idev) } static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn, - struct smb2_ioctl_req *req, - struct smb2_ioctl_rsp *rsp) + struct smb2_ioctl_rsp *rsp, + unsigned int out_buf_len) { struct network_interface_info_ioctl_rsp *nii_rsp = NULL; int nbytes = 0; @@ -7087,6 +7214,12 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn, rtnl_lock(); for_each_netdev(&init_net, netdev) { + if (out_buf_len < + nbytes + sizeof(struct network_interface_info_ioctl_rsp)) { + rtnl_unlock(); + return -ENOSPC; + } + if (netdev->type == ARPHRD_LOOPBACK) continue; @@ -7166,11 +7299,6 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn, if (nii_rsp) nii_rsp->Next = 0; - if (!nbytes) { - rsp->hdr.Status = STATUS_BUFFER_TOO_SMALL; - return -EINVAL; - } - rsp->PersistentFileId = cpu_to_le64(SMB2_NO_FID); rsp->VolatileFileId = cpu_to_le64(SMB2_NO_FID); return nbytes; @@ -7178,11 +7306,16 @@ static int fsctl_query_iface_info_ioctl(struct ksmbd_conn *conn, static int fsctl_validate_negotiate_info(struct ksmbd_conn *conn, struct validate_negotiate_info_req *neg_req, - struct validate_negotiate_info_rsp *neg_rsp) + struct validate_negotiate_info_rsp *neg_rsp, + unsigned int in_buf_len) { int ret = 0; int dialect; + if (in_buf_len < sizeof(struct validate_negotiate_info_req) + + le16_to_cpu(neg_req->DialectCount) * sizeof(__le16)) + return -EINVAL; + dialect = ksmbd_lookup_dialect_by_id(neg_req->Dialects, neg_req->DialectCount); if (dialect == BAD_PROT_ID || dialect != conn->dialect) { @@ -7216,7 +7349,7 @@ err_out: static int fsctl_query_allocated_ranges(struct ksmbd_work *work, u64 id, struct file_allocated_range_buffer *qar_req, struct file_allocated_range_buffer *qar_rsp, - int in_count, int *out_count) + unsigned int in_count, unsigned int *out_count) { struct ksmbd_file *fp; loff_t start, length; @@ -7243,7 +7376,8 @@ static int fsctl_query_allocated_ranges(struct ksmbd_work *work, u64 id, } static int fsctl_pipe_transceive(struct ksmbd_work *work, u64 id, - int out_buf_len, struct smb2_ioctl_req *req, + unsigned int out_buf_len, + struct smb2_ioctl_req *req, struct smb2_ioctl_rsp *rsp) { struct ksmbd_rpc_command *rpc_resp; @@ -7357,8 +7491,7 @@ int smb2_ioctl(struct ksmbd_work *work) { struct smb2_ioctl_req *req; struct smb2_ioctl_rsp *rsp, *rsp_org; - int cnt_code, nbytes = 0; - int out_buf_len; + unsigned int cnt_code, nbytes = 0, out_buf_len, in_buf_len; u64 id = KSMBD_NO_FID; struct ksmbd_conn *conn = work->conn; int ret = 0; @@ -7386,8 +7519,14 @@ int smb2_ioctl(struct ksmbd_work *work) } cnt_code = le32_to_cpu(req->CntCode); - out_buf_len = le32_to_cpu(req->MaxOutputResponse); - out_buf_len = min(KSMBD_IPC_MAX_PAYLOAD, out_buf_len); + ret = smb2_calc_max_out_buf_len(work, 48, + le32_to_cpu(req->MaxOutputResponse)); + if (ret < 0) { + rsp->hdr.Status = STATUS_INVALID_PARAMETER; + goto out; + } + out_buf_len = (unsigned int)ret; + in_buf_len = le32_to_cpu(req->InputCount); switch (cnt_code) { case FSCTL_DFS_GET_REFERRALS: @@ -7415,6 +7554,7 @@ int smb2_ioctl(struct ksmbd_work *work) break; } case FSCTL_PIPE_TRANSCEIVE: + out_buf_len = min_t(u32, KSMBD_IPC_MAX_PAYLOAD, out_buf_len); nbytes = fsctl_pipe_transceive(work, id, out_buf_len, req, rsp); break; case FSCTL_VALIDATE_NEGOTIATE_INFO: @@ -7423,9 +7563,16 @@ int smb2_ioctl(struct ksmbd_work *work) goto out; } + if (in_buf_len < sizeof(struct validate_negotiate_info_req)) + return -EINVAL; + + if (out_buf_len < sizeof(struct validate_negotiate_info_rsp)) + return -EINVAL; + ret = fsctl_validate_negotiate_info(conn, (struct validate_negotiate_info_req *)&req->Buffer[0], - (struct validate_negotiate_info_rsp *)&rsp->Buffer[0]); + (struct validate_negotiate_info_rsp *)&rsp->Buffer[0], + in_buf_len); if (ret < 0) goto out; @@ -7434,9 +7581,10 @@ int smb2_ioctl(struct ksmbd_work *work) rsp->VolatileFileId = cpu_to_le64(SMB2_NO_FID); break; case FSCTL_QUERY_NETWORK_INTERFACE_INFO: - nbytes = fsctl_query_iface_info_ioctl(conn, req, rsp); - if (nbytes < 0) + ret = fsctl_query_iface_info_ioctl(conn, rsp, out_buf_len); + if (ret < 0) goto out; + nbytes = ret; break; case FSCTL_REQUEST_RESUME_KEY: if (out_buf_len < sizeof(struct resume_key_ioctl_rsp)) { @@ -7461,15 +7609,33 @@ int smb2_ioctl(struct ksmbd_work *work) goto out; } + if (in_buf_len < sizeof(struct copychunk_ioctl_req)) { + ret = -EINVAL; + goto out; + } + if (out_buf_len < sizeof(struct copychunk_ioctl_rsp)) { ret = -EINVAL; goto out; } nbytes = sizeof(struct copychunk_ioctl_rsp); - fsctl_copychunk(work, req, rsp); + rsp->VolatileFileId = req->VolatileFileId; + rsp->PersistentFileId = req->PersistentFileId; + fsctl_copychunk(work, + (struct copychunk_ioctl_req *)&req->Buffer[0], + le32_to_cpu(req->CntCode), + le32_to_cpu(req->InputCount), + le64_to_cpu(req->VolatileFileId), + le64_to_cpu(req->PersistentFileId), + rsp); break; case FSCTL_SET_SPARSE: + if (in_buf_len < sizeof(struct file_sparse)) { + ret = -EINVAL; + goto out; + } + ret = fsctl_set_sparse(work, id, (struct file_sparse *)&req->Buffer[0]); if (ret < 0) @@ -7488,6 +7654,11 @@ int smb2_ioctl(struct ksmbd_work *work) goto out; } + if (in_buf_len < sizeof(struct file_zero_data_information)) { + ret = -EINVAL; + goto out; + } + zero_data = (struct file_zero_data_information *)&req->Buffer[0]; @@ -7507,6 +7678,11 @@ int smb2_ioctl(struct ksmbd_work *work) break; } case FSCTL_QUERY_ALLOCATED_RANGES: + if (in_buf_len < sizeof(struct file_allocated_range_buffer)) { + ret = -EINVAL; + goto out; + } + ret = fsctl_query_allocated_ranges(work, id, (struct file_allocated_range_buffer *)&req->Buffer[0], (struct file_allocated_range_buffer *)&rsp->Buffer[0], @@ -7547,6 +7723,11 @@ int smb2_ioctl(struct ksmbd_work *work) struct duplicate_extents_to_file *dup_ext; loff_t src_off, dst_off, length, cloned; + if (in_buf_len < sizeof(struct duplicate_extents_to_file)) { + ret = -EINVAL; + goto out; + } + dup_ext = (struct duplicate_extents_to_file *)&req->Buffer[0]; fp_in = ksmbd_lookup_fd_slow(work, dup_ext->VolatileFileHandle, @@ -7617,6 +7798,8 @@ out: rsp->hdr.Status = STATUS_OBJECT_NAME_NOT_FOUND; else if (ret == -EOPNOTSUPP) rsp->hdr.Status = STATUS_NOT_SUPPORTED; + else if (ret == -ENOSPC) + rsp->hdr.Status = STATUS_BUFFER_TOO_SMALL; else if (ret < 0 || rsp->hdr.Status == 0) rsp->hdr.Status = STATUS_INVALID_PARAMETER; smb2_set_err_rsp(work); @@ -8206,7 +8389,8 @@ void smb3_preauth_hash_rsp(struct ksmbd_work *work) WORK_BUFFERS(work, req, rsp); - if (le16_to_cpu(req->Command) == SMB2_NEGOTIATE_HE) + if (le16_to_cpu(req->Command) == SMB2_NEGOTIATE_HE && + conn->preauth_info) ksmbd_gen_preauth_integrity_hash(conn, (char *)rsp, conn->preauth_info->Preauth_HashValue); @@ -8310,31 +8494,29 @@ int smb3_decrypt_req(struct ksmbd_work *work) struct smb2_hdr *hdr; unsigned int pdu_length = get_rfc1002_len(buf); struct kvec iov[2]; - unsigned int buf_data_size = pdu_length + 4 - + int buf_data_size = pdu_length + 4 - sizeof(struct smb2_transform_hdr); struct smb2_transform_hdr *tr_hdr = (struct smb2_transform_hdr *)buf; - unsigned int orig_len = le32_to_cpu(tr_hdr->OriginalMessageSize); int rc = 0; - sess = ksmbd_session_lookup_all(conn, le64_to_cpu(tr_hdr->SessionId)); - if (!sess) { - pr_err("invalid session id(%llx) in transform header\n", - le64_to_cpu(tr_hdr->SessionId)); - return -ECONNABORTED; - } - - if (pdu_length + 4 < - sizeof(struct smb2_transform_hdr) + sizeof(struct smb2_hdr)) { + if (buf_data_size < sizeof(struct smb2_hdr)) { pr_err("Transform message is too small (%u)\n", pdu_length); return -ECONNABORTED; } - if (pdu_length + 4 < orig_len + sizeof(struct smb2_transform_hdr)) { + if (buf_data_size < le32_to_cpu(tr_hdr->OriginalMessageSize)) { pr_err("Transform message is broken\n"); return -ECONNABORTED; } + sess = ksmbd_session_lookup_all(conn, le64_to_cpu(tr_hdr->SessionId)); + if (!sess) { + pr_err("invalid session id(%llx) in transform header\n", + le64_to_cpu(tr_hdr->SessionId)); + return -ECONNABORTED; + } + iov[0].iov_base = buf; iov[0].iov_len = sizeof(struct smb2_transform_hdr); iov[1].iov_base = buf + sizeof(struct smb2_transform_hdr); diff --git a/fs/ksmbd/smb2pdu.h b/fs/ksmbd/smb2pdu.h index bcec845b03f3..ff5a2f01d34a 100644 --- a/fs/ksmbd/smb2pdu.h +++ b/fs/ksmbd/smb2pdu.h @@ -113,6 +113,8 @@ #define SMB21_DEFAULT_IOSIZE (1024 * 1024) #define SMB3_DEFAULT_IOSIZE (4 * 1024 * 1024) #define SMB3_DEFAULT_TRANS_SIZE (1024 * 1024) +#define SMB3_MIN_IOSIZE (64 * 1024) +#define SMB3_MAX_IOSIZE (8 * 1024 * 1024) /* * SMB2 Header Definition @@ -1464,6 +1466,15 @@ struct smb2_file_all_info { /* data block encoding of response to level 18 */ char FileName[1]; } __packed; /* level 18 Query */ +struct smb2_file_basic_info { /* data block encoding of response to level 18 */ + __le64 CreationTime; /* Beginning of FILE_BASIC_INFO equivalent */ + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le32 Attributes; + __u32 Pad1; /* End of FILE_BASIC_INFO_INFO equivalent */ +} __packed; + struct smb2_file_alt_name_info { __le32 FileNameLength; char FileName[0]; @@ -1628,7 +1639,6 @@ struct smb2_posix_info { } __packed; /* functions */ -int init_smb2_0_server(struct ksmbd_conn *conn); void init_smb2_1_server(struct ksmbd_conn *conn); void init_smb3_0_server(struct ksmbd_conn *conn); void init_smb3_02_server(struct ksmbd_conn *conn); diff --git a/fs/ksmbd/smb_common.c b/fs/ksmbd/smb_common.c index 43d3123d8b62..707490ab1f4c 100644 --- a/fs/ksmbd/smb_common.c +++ b/fs/ksmbd/smb_common.c @@ -21,7 +21,6 @@ static const char basechars[43] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_-!@#$%"; #define MAGIC_CHAR '~' #define PERIOD '.' #define mangle(V) ((char)(basechars[(V) % MANGLE_BASE])) -#define KSMBD_MIN_SUPPORTED_HEADER_SIZE (sizeof(struct smb2_hdr)) struct smb_protocol { int index; @@ -89,7 +88,7 @@ unsigned int ksmbd_server_side_copy_max_total_size(void) inline int ksmbd_min_protocol(void) { - return SMB2_PROT; + return SMB21_PROT; } inline int ksmbd_max_protocol(void) @@ -129,16 +128,22 @@ int ksmbd_lookup_protocol_idx(char *str) * * check for valid smb signature and packet direction(request/response) * - * Return: 0 on success, otherwise 1 + * Return: 0 on success, otherwise -EINVAL */ int ksmbd_verify_smb_message(struct ksmbd_work *work) { - struct smb2_hdr *smb2_hdr = work->request_buf; + struct smb2_hdr *smb2_hdr = work->request_buf + work->next_smb2_rcv_hdr_off; + struct smb_hdr *hdr; if (smb2_hdr->ProtocolId == SMB2_PROTO_NUMBER) return ksmbd_smb2_check_message(work); - return 0; + hdr = work->request_buf; + if (*(__le32 *)hdr->Protocol == SMB1_PROTO_NUMBER && + hdr->Command == SMB_COM_NEGOTIATE) + return 0; + + return -EINVAL; } /** @@ -149,20 +154,7 @@ int ksmbd_verify_smb_message(struct ksmbd_work *work) */ bool ksmbd_smb_request(struct ksmbd_conn *conn) { - int type = *(char *)conn->request_buf; - - switch (type) { - case RFC1002_SESSION_MESSAGE: - /* Regular SMB request */ - return true; - case RFC1002_SESSION_KEEP_ALIVE: - ksmbd_debug(SMB, "RFC 1002 session keep alive\n"); - break; - default: - ksmbd_debug(SMB, "RFC 1002 unknown request type 0x%x\n", type); - } - - return false; + return conn->request_buf[0] == 0; } static bool supported_protocol(int idx) @@ -176,10 +168,12 @@ static bool supported_protocol(int idx) idx <= server_conf.max_protocol); } -static char *next_dialect(char *dialect, int *next_off) +static char *next_dialect(char *dialect, int *next_off, int bcount) { dialect = dialect + *next_off; - *next_off = strlen(dialect); + *next_off = strnlen(dialect, bcount); + if (dialect[*next_off] != '\0') + return NULL; return dialect; } @@ -194,7 +188,9 @@ static int ksmbd_lookup_dialect_by_name(char *cli_dialects, __le16 byte_count) dialect = cli_dialects; bcount = le16_to_cpu(byte_count); do { - dialect = next_dialect(dialect, &next); + dialect = next_dialect(dialect, &next, bcount); + if (!dialect) + break; ksmbd_debug(SMB, "client requested dialect %s\n", dialect); if (!strcmp(dialect, smb1_protos[i].name)) { @@ -242,13 +238,22 @@ int ksmbd_lookup_dialect_by_id(__le16 *cli_dialects, __le16 dialects_count) static int ksmbd_negotiate_smb_dialect(void *buf) { - __le32 proto; + int smb_buf_length = get_rfc1002_len(buf); + __le32 proto = ((struct smb2_hdr *)buf)->ProtocolId; - proto = ((struct smb2_hdr *)buf)->ProtocolId; if (proto == SMB2_PROTO_NUMBER) { struct smb2_negotiate_req *req; + int smb2_neg_size = + offsetof(struct smb2_negotiate_req, Dialects) - 4; req = (struct smb2_negotiate_req *)buf; + if (smb2_neg_size > smb_buf_length) + goto err_out; + + if (smb2_neg_size + le16_to_cpu(req->DialectCount) * sizeof(__le16) > + smb_buf_length) + goto err_out; + return ksmbd_lookup_dialect_by_id(req->Dialects, req->DialectCount); } @@ -258,14 +263,22 @@ static int ksmbd_negotiate_smb_dialect(void *buf) struct smb_negotiate_req *req; req = (struct smb_negotiate_req *)buf; + if (le16_to_cpu(req->ByteCount) < 2) + goto err_out; + + if (offsetof(struct smb_negotiate_req, DialectsArray) - 4 + + le16_to_cpu(req->ByteCount) > smb_buf_length) { + goto err_out; + } + return ksmbd_lookup_dialect_by_name(req->DialectsArray, req->ByteCount); } +err_out: return BAD_PROT_ID; } -#define SMB_COM_NEGOTIATE 0x72 int ksmbd_init_smb_server(struct ksmbd_work *work) { struct ksmbd_conn *conn = work->conn; @@ -280,11 +293,6 @@ int ksmbd_init_smb_server(struct ksmbd_work *work) return 0; } -bool ksmbd_pdu_size_has_room(unsigned int pdu) -{ - return (pdu >= KSMBD_MIN_SUPPORTED_HEADER_SIZE - 4); -} - int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, int info_level, struct ksmbd_file *dir, struct ksmbd_dir_info *d_info, @@ -419,7 +427,7 @@ int ksmbd_extract_shortname(struct ksmbd_conn *conn, const char *longname, static int __smb2_negotiate(struct ksmbd_conn *conn) { - return (conn->dialect >= SMB20_PROT_ID && + return (conn->dialect >= SMB21_PROT_ID && conn->dialect <= SMB311_PROT_ID); } @@ -449,7 +457,7 @@ int ksmbd_smb_negotiate_common(struct ksmbd_work *work, unsigned int command) } } - if (command == SMB2_NEGOTIATE_HE) { + if (command == SMB2_NEGOTIATE_HE && __smb2_negotiate(conn)) { ret = smb2_handle_negotiate(work); init_smb2_neg_rsp(work); return ret; diff --git a/fs/ksmbd/smb_common.h b/fs/ksmbd/smb_common.h index 57c667c1be06..6e79e7577f6b 100644 --- a/fs/ksmbd/smb_common.h +++ b/fs/ksmbd/smb_common.h @@ -48,13 +48,7 @@ #define CIFS_DEFAULT_IOSIZE (64 * 1024) #define MAX_CIFS_SMALL_BUFFER_SIZE 448 /* big enough for most */ -/* RFC 1002 session packet types */ -#define RFC1002_SESSION_MESSAGE 0x00 -#define RFC1002_SESSION_REQUEST 0x81 -#define RFC1002_POSITIVE_SESSION_RESPONSE 0x82 -#define RFC1002_NEGATIVE_SESSION_RESPONSE 0x83 -#define RFC1002_RETARGET_SESSION_RESPONSE 0x84 -#define RFC1002_SESSION_KEEP_ALIVE 0x85 +#define MAX_STREAM_PROT_LEN 0x00FFFFFF /* Responses when opening a file. */ #define F_SUPERSEDED 0 @@ -210,6 +204,7 @@ FILE_READ_ATTRIBUTES | FILE_WRITE_ATTRIBUTES) #define SMB1_PROTO_NUMBER cpu_to_le32(0x424d53ff) +#define SMB_COM_NEGOTIATE 0x72 #define SMB1_CLIENT_GUID_SIZE (16) struct smb_hdr { @@ -500,8 +495,6 @@ int ksmbd_lookup_dialect_by_id(__le16 *cli_dialects, __le16 dialects_count); int ksmbd_init_smb_server(struct ksmbd_work *work); -bool ksmbd_pdu_size_has_room(unsigned int pdu); - struct ksmbd_kstat; int ksmbd_populate_dot_dotdot_entries(struct ksmbd_work *work, int info_level, diff --git a/fs/ksmbd/smbacl.c b/fs/ksmbd/smbacl.c index 0a95cdec8c80..bd792db32623 100644 --- a/fs/ksmbd/smbacl.c +++ b/fs/ksmbd/smbacl.c @@ -380,7 +380,7 @@ static void parse_dacl(struct user_namespace *user_ns, { int i, ret; int num_aces = 0; - int acl_size; + unsigned int acl_size; char *acl_base; struct smb_ace **ppace; struct posix_acl_entry *cf_pace, *cf_pdace; @@ -392,7 +392,7 @@ static void parse_dacl(struct user_namespace *user_ns, return; /* validate that we do not go past end of acl */ - if (end_of_acl <= (char *)pdacl || + if (end_of_acl < (char *)pdacl + sizeof(struct smb_acl) || end_of_acl < (char *)pdacl + le16_to_cpu(pdacl->size)) { pr_err("ACL too small to parse DACL\n"); return; @@ -431,8 +431,22 @@ static void parse_dacl(struct user_namespace *user_ns, * user/group/other have no permissions */ for (i = 0; i < num_aces; ++i) { + if (end_of_acl - acl_base < acl_size) + break; + ppace[i] = (struct smb_ace *)(acl_base + acl_size); acl_base = (char *)ppace[i]; + acl_size = offsetof(struct smb_ace, sid) + + offsetof(struct smb_sid, sub_auth); + + if (end_of_acl - acl_base < acl_size || + ppace[i]->sid.num_subauth > SID_MAX_SUB_AUTHORITIES || + (end_of_acl - acl_base < + acl_size + sizeof(__le32) * ppace[i]->sid.num_subauth) || + (le16_to_cpu(ppace[i]->size) < + acl_size + sizeof(__le32) * ppace[i]->sid.num_subauth)) + break; + acl_size = le16_to_cpu(ppace[i]->size); ppace[i]->access_req = smb_map_generic_desired_access(ppace[i]->access_req); @@ -807,6 +821,9 @@ int parse_sec_desc(struct user_namespace *user_ns, struct smb_ntsd *pntsd, if (!pntsd) return -EIO; + if (acl_len < sizeof(struct smb_ntsd)) + return -EINVAL; + owner_sid_ptr = (struct smb_sid *)((char *)pntsd + le32_to_cpu(pntsd->osidoffset)); group_sid_ptr = (struct smb_sid *)((char *)pntsd + diff --git a/fs/ksmbd/transport_ipc.c b/fs/ksmbd/transport_ipc.c index 44aea33a67fa..1acf1892a466 100644 --- a/fs/ksmbd/transport_ipc.c +++ b/fs/ksmbd/transport_ipc.c @@ -601,7 +601,7 @@ int ksmbd_ipc_tree_disconnect_request(unsigned long long session_id, return ret; } -int ksmbd_ipc_logout_request(const char *account) +int ksmbd_ipc_logout_request(const char *account, int flags) { struct ksmbd_ipc_msg *msg; struct ksmbd_logout_request *req; @@ -616,6 +616,7 @@ int ksmbd_ipc_logout_request(const char *account) msg->type = KSMBD_EVENT_LOGOUT_REQUEST; req = (struct ksmbd_logout_request *)msg->payload; + req->account_flags = flags; strscpy(req->account, account, KSMBD_REQ_MAX_ACCOUNT_NAME_SZ); ret = ipc_msg_send(msg); diff --git a/fs/ksmbd/transport_ipc.h b/fs/ksmbd/transport_ipc.h index 9eacc895ffdb..5e5b90a0c187 100644 --- a/fs/ksmbd/transport_ipc.h +++ b/fs/ksmbd/transport_ipc.h @@ -25,7 +25,7 @@ ksmbd_ipc_tree_connect_request(struct ksmbd_session *sess, struct sockaddr *peer_addr); int ksmbd_ipc_tree_disconnect_request(unsigned long long session_id, unsigned long long connect_id); -int ksmbd_ipc_logout_request(const char *account); +int ksmbd_ipc_logout_request(const char *account, int flags); struct ksmbd_share_config_response * ksmbd_ipc_share_config_request(const char *name); struct ksmbd_spnego_authen_response * diff --git a/fs/ksmbd/transport_rdma.c b/fs/ksmbd/transport_rdma.c index 52b2556e76b1..a2fd5a4d4cd5 100644 --- a/fs/ksmbd/transport_rdma.c +++ b/fs/ksmbd/transport_rdma.c @@ -20,7 +20,6 @@ #define SUBMOD_NAME "smb_direct" #include <linux/kthread.h> -#include <linux/rwlock.h> #include <linux/list.h> #include <linux/mempool.h> #include <linux/highmem.h> @@ -550,6 +549,10 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) switch (recvmsg->type) { case SMB_DIRECT_MSG_NEGOTIATE_REQ: + if (wc->byte_len < sizeof(struct smb_direct_negotiate_req)) { + put_empty_recvmsg(t, recvmsg); + return; + } t->negotiation_requested = true; t->full_packet_received = true; wake_up_interruptible(&t->wait_status); @@ -557,10 +560,23 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) case SMB_DIRECT_MSG_DATA_TRANSFER: { struct smb_direct_data_transfer *data_transfer = (struct smb_direct_data_transfer *)recvmsg->packet; - int data_length = le32_to_cpu(data_transfer->data_length); + unsigned int data_length; int avail_recvmsg_count, receive_credits; + if (wc->byte_len < + offsetof(struct smb_direct_data_transfer, padding)) { + put_empty_recvmsg(t, recvmsg); + return; + } + + data_length = le32_to_cpu(data_transfer->data_length); if (data_length) { + if (wc->byte_len < sizeof(struct smb_direct_data_transfer) + + (u64)data_length) { + put_empty_recvmsg(t, recvmsg); + return; + } + if (t->full_packet_received) recvmsg->first_segment = true; @@ -569,7 +585,7 @@ static void recv_done(struct ib_cq *cq, struct ib_wc *wc) else t->full_packet_received = true; - enqueue_reassembly(t, recvmsg, data_length); + enqueue_reassembly(t, recvmsg, (int)data_length); wake_up_interruptible(&t->wait_reassembly_queue); spin_lock(&t->receive_credit_lock); diff --git a/fs/ksmbd/transport_tcp.c b/fs/ksmbd/transport_tcp.c index dc15a5ecd2e0..c14320e03b69 100644 --- a/fs/ksmbd/transport_tcp.c +++ b/fs/ksmbd/transport_tcp.c @@ -215,7 +215,7 @@ out_error: * ksmbd_kthread_fn() - listen to new SMB connections and callback server * @p: arguments to forker thread * - * Return: Returns a task_struct or ERR_PTR + * Return: 0 on success, error number otherwise */ static int ksmbd_kthread_fn(void *p) { @@ -387,7 +387,7 @@ static void tcp_destroy_socket(struct socket *ksmbd_socket) /** * create_socket - create socket for ksmbd/0 * - * Return: Returns a task_struct or ERR_PTR + * Return: 0 on success, error number otherwise */ static int create_socket(struct interface *iface) { diff --git a/fs/ksmbd/vfs.c b/fs/ksmbd/vfs.c index b047f2980d96..835b384b0895 100644 --- a/fs/ksmbd/vfs.c +++ b/fs/ksmbd/vfs.c @@ -19,6 +19,8 @@ #include <linux/sched/xacct.h> #include <linux/crc32c.h> +#include "../internal.h" /* for vfs_path_lookup */ + #include "glob.h" #include "oplock.h" #include "connection.h" @@ -44,7 +46,6 @@ static char *extract_last_component(char *path) p++; } else { p = NULL; - pr_err("Invalid path %s\n", path); } return p; } @@ -155,7 +156,7 @@ int ksmbd_vfs_query_maximal_access(struct user_namespace *user_ns, /** * ksmbd_vfs_create() - vfs helper for smb create file * @work: work - * @name: file name + * @name: file name that is relative to share * @mode: file create mode * * Return: 0 on success, otherwise error @@ -166,7 +167,8 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode) struct dentry *dentry; int err; - dentry = kern_path_create(AT_FDCWD, name, &path, 0); + dentry = ksmbd_vfs_kern_path_create(work, name, + LOOKUP_NO_SYMLINKS, &path); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); if (err != -ENOENT) @@ -191,7 +193,7 @@ int ksmbd_vfs_create(struct ksmbd_work *work, const char *name, umode_t mode) /** * ksmbd_vfs_mkdir() - vfs helper for smb create directory * @work: work - * @name: directory name + * @name: directory name that is relative to share * @mode: directory create mode * * Return: 0 on success, otherwise error @@ -203,7 +205,9 @@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode) struct dentry *dentry; int err; - dentry = kern_path_create(AT_FDCWD, name, &path, LOOKUP_DIRECTORY); + dentry = ksmbd_vfs_kern_path_create(work, name, + LOOKUP_NO_SYMLINKS | LOOKUP_DIRECTORY, + &path); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); if (err != -EEXIST) @@ -578,7 +582,7 @@ int ksmbd_vfs_fsync(struct ksmbd_work *work, u64 fid, u64 p_id) /** * ksmbd_vfs_remove_file() - vfs helper for smb rmdir or unlink - * @name: absolute directory or file name + * @name: directory or file name that is relative to share * * Return: 0 on success, otherwise error */ @@ -588,16 +592,11 @@ int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name) struct path path; struct dentry *parent; int err; - int flags = 0; if (ksmbd_override_fsids(work)) return -ENOMEM; - if (test_share_config_flag(work->tcon->share_conf, - KSMBD_SHARE_FLAG_FOLLOW_SYMLINKS)) - flags = LOOKUP_FOLLOW; - - err = kern_path(name, flags, &path); + err = ksmbd_vfs_kern_path(work, name, LOOKUP_NO_SYMLINKS, &path, false); if (err) { ksmbd_debug(VFS, "can't get %s, err %d\n", name, err); ksmbd_revert_fsids(work); @@ -642,7 +641,7 @@ out_err: /** * ksmbd_vfs_link() - vfs helper for creating smb hardlink * @oldname: source file name - * @newname: hardlink name + * @newname: hardlink name that is relative to share * * Return: 0 on success, otherwise error */ @@ -652,24 +651,20 @@ int ksmbd_vfs_link(struct ksmbd_work *work, const char *oldname, struct path oldpath, newpath; struct dentry *dentry; int err; - int flags = 0; if (ksmbd_override_fsids(work)) return -ENOMEM; - if (test_share_config_flag(work->tcon->share_conf, - KSMBD_SHARE_FLAG_FOLLOW_SYMLINKS)) - flags = LOOKUP_FOLLOW; - - err = kern_path(oldname, flags, &oldpath); + err = kern_path(oldname, LOOKUP_NO_SYMLINKS, &oldpath); if (err) { pr_err("cannot get linux path for %s, err = %d\n", oldname, err); goto out1; } - dentry = kern_path_create(AT_FDCWD, newname, &newpath, - flags | LOOKUP_REVAL); + dentry = ksmbd_vfs_kern_path_create(work, newname, + LOOKUP_NO_SYMLINKS | LOOKUP_REVAL, + &newpath); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); pr_err("path create err for %s, err %d\n", newname, err); @@ -788,21 +783,19 @@ int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp, struct dentry *src_dent, *trap_dent, *src_child; char *dst_name; int err; - int flags; dst_name = extract_last_component(newname); - if (!dst_name) - return -EINVAL; + if (!dst_name) { + dst_name = newname; + newname = ""; + } src_dent_parent = dget_parent(fp->filp->f_path.dentry); src_dent = fp->filp->f_path.dentry; - flags = LOOKUP_DIRECTORY; - if (test_share_config_flag(work->tcon->share_conf, - KSMBD_SHARE_FLAG_FOLLOW_SYMLINKS)) - flags |= LOOKUP_FOLLOW; - - err = kern_path(newname, flags, &dst_path); + err = ksmbd_vfs_kern_path(work, newname, + LOOKUP_NO_SYMLINKS | LOOKUP_DIRECTORY, + &dst_path, false); if (err) { ksmbd_debug(VFS, "Cannot get path for %s [%d]\n", newname, err); goto out; @@ -848,61 +841,43 @@ out: /** * ksmbd_vfs_truncate() - vfs helper for smb file truncate * @work: work - * @name: old filename * @fid: file id of old file * @size: truncate to given size * * Return: 0 on success, otherwise error */ -int ksmbd_vfs_truncate(struct ksmbd_work *work, const char *name, +int ksmbd_vfs_truncate(struct ksmbd_work *work, struct ksmbd_file *fp, loff_t size) { - struct path path; int err = 0; + struct file *filp; - if (name) { - err = kern_path(name, 0, &path); - if (err) { - pr_err("cannot get linux path for %s, err %d\n", - name, err); - return err; - } - err = vfs_truncate(&path, size); - if (err) - pr_err("truncate failed for %s err %d\n", - name, err); - path_put(&path); - } else { - struct file *filp; - - filp = fp->filp; - - /* Do we need to break any of a levelII oplock? */ - smb_break_all_levII_oplock(work, fp, 1); + filp = fp->filp; - if (!work->tcon->posix_extensions) { - struct inode *inode = file_inode(filp); + /* Do we need to break any of a levelII oplock? */ + smb_break_all_levII_oplock(work, fp, 1); - if (size < inode->i_size) { - err = check_lock_range(filp, size, - inode->i_size - 1, WRITE); - } else { - err = check_lock_range(filp, inode->i_size, - size - 1, WRITE); - } + if (!work->tcon->posix_extensions) { + struct inode *inode = file_inode(filp); - if (err) { - pr_err("failed due to lock\n"); - return -EAGAIN; - } + if (size < inode->i_size) { + err = check_lock_range(filp, size, + inode->i_size - 1, WRITE); + } else { + err = check_lock_range(filp, inode->i_size, + size - 1, WRITE); } - err = vfs_truncate(&filp->f_path, size); - if (err) - pr_err("truncate failed for filename : %s err %d\n", - fp->filename, err); + if (err) { + pr_err("failed due to lock\n"); + return -EAGAIN; + } } + err = vfs_truncate(&filp->f_path, size); + if (err) + pr_err("truncate failed for filename : %s err %d\n", + fp->filename, err); return err; } @@ -1048,7 +1023,7 @@ int ksmbd_vfs_zero_data(struct ksmbd_work *work, struct ksmbd_file *fp, int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length, struct file_allocated_range_buffer *ranges, - int in_count, int *out_count) + unsigned int in_count, unsigned int *out_count) { struct file *f = fp->filp; struct inode *inode = file_inode(fp->filp); @@ -1220,22 +1195,25 @@ static int ksmbd_vfs_lookup_in_dir(struct path *dir, char *name, size_t namelen) /** * ksmbd_vfs_kern_path() - lookup a file and get path info - * @name: name of file for lookup + * @name: file path that is relative to share * @flags: lookup flags * @path: if lookup succeed, return path info * @caseless: caseless filename lookup * * Return: 0 on success, otherwise error */ -int ksmbd_vfs_kern_path(char *name, unsigned int flags, struct path *path, - bool caseless) +int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *name, + unsigned int flags, struct path *path, bool caseless) { + struct ksmbd_share_config *share_conf = work->tcon->share_conf; int err; - if (name[0] != '/') - return -EINVAL; - - err = kern_path(name, flags, path); + flags |= LOOKUP_BENEATH; + err = vfs_path_lookup(share_conf->vfs_path.dentry, + share_conf->vfs_path.mnt, + name, + flags, + path); if (!err) return 0; @@ -1249,11 +1227,10 @@ int ksmbd_vfs_kern_path(char *name, unsigned int flags, struct path *path, return -ENOMEM; path_len = strlen(filepath); - remain_len = path_len - 1; + remain_len = path_len; - err = kern_path("/", flags, &parent); - if (err) - goto out; + parent = share_conf->vfs_path; + path_get(&parent); while (d_can_lookup(parent.dentry)) { char *filename = filepath + path_len - remain_len; @@ -1266,21 +1243,21 @@ int ksmbd_vfs_kern_path(char *name, unsigned int flags, struct path *path, err = ksmbd_vfs_lookup_in_dir(&parent, filename, filename_len); - if (err) { - path_put(&parent); + path_put(&parent); + if (err) goto out; - } - path_put(&parent); next[0] = '\0'; - err = kern_path(filepath, flags, &parent); + err = vfs_path_lookup(share_conf->vfs_path.dentry, + share_conf->vfs_path.mnt, + filepath, + flags, + &parent); if (err) goto out; - - if (is_last) { - path->mnt = parent.mnt; - path->dentry = parent.dentry; + else if (is_last) { + *path = parent; goto out; } @@ -1296,6 +1273,23 @@ out: return err; } +struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work, + const char *name, + unsigned int flags, + struct path *path) +{ + char *abs_name; + struct dentry *dent; + + abs_name = convert_to_unix_name(work->tcon->share_conf, name); + if (!abs_name) + return ERR_PTR(-ENOMEM); + + dent = kern_path_create(AT_FDCWD, abs_name, path, flags); + kfree(abs_name); + return dent; +} + int ksmbd_vfs_remove_acl_xattrs(struct user_namespace *user_ns, struct dentry *dentry) { diff --git a/fs/ksmbd/vfs.h b/fs/ksmbd/vfs.h index 85db50abdb24..b0d5b8feb4a3 100644 --- a/fs/ksmbd/vfs.h +++ b/fs/ksmbd/vfs.h @@ -126,7 +126,7 @@ int ksmbd_vfs_link(struct ksmbd_work *work, int ksmbd_vfs_getattr(struct path *path, struct kstat *stat); int ksmbd_vfs_fp_rename(struct ksmbd_work *work, struct ksmbd_file *fp, char *newname); -int ksmbd_vfs_truncate(struct ksmbd_work *work, const char *name, +int ksmbd_vfs_truncate(struct ksmbd_work *work, struct ksmbd_file *fp, loff_t size); struct srv_copychunk; int ksmbd_vfs_copy_file_ranges(struct ksmbd_work *work, @@ -152,8 +152,13 @@ int ksmbd_vfs_xattr_stream_name(char *stream_name, char **xattr_stream_name, size_t *xattr_stream_name_size, int s_type); int ksmbd_vfs_remove_xattr(struct user_namespace *user_ns, struct dentry *dentry, char *attr_name); -int ksmbd_vfs_kern_path(char *name, unsigned int flags, struct path *path, +int ksmbd_vfs_kern_path(struct ksmbd_work *work, + char *name, unsigned int flags, struct path *path, bool caseless); +struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work, + const char *name, + unsigned int flags, + struct path *path); int ksmbd_vfs_empty_dir(struct ksmbd_file *fp); void ksmbd_vfs_set_fadvise(struct file *filp, __le32 option); int ksmbd_vfs_zero_data(struct ksmbd_work *work, struct ksmbd_file *fp, @@ -161,7 +166,7 @@ int ksmbd_vfs_zero_data(struct ksmbd_work *work, struct ksmbd_file *fp, struct file_allocated_range_buffer; int ksmbd_vfs_fqar_lseek(struct ksmbd_file *fp, loff_t start, loff_t length, struct file_allocated_range_buffer *ranges, - int in_count, int *out_count); + unsigned int in_count, unsigned int *out_count); int ksmbd_vfs_unlink(struct user_namespace *user_ns, struct dentry *dir, struct dentry *dentry); void *ksmbd_vfs_init_kstat(char **p, struct ksmbd_kstat *ksmbd_kstat); diff --git a/fs/lockd/svcxdr.h b/fs/lockd/svcxdr.h index c69a0bb76c94..4f1a451da5ba 100644 --- a/fs/lockd/svcxdr.h +++ b/fs/lockd/svcxdr.h @@ -134,18 +134,9 @@ svcxdr_decode_owner(struct xdr_stream *xdr, struct xdr_netobj *obj) static inline bool svcxdr_encode_owner(struct xdr_stream *xdr, const struct xdr_netobj *obj) { - unsigned int quadlen = XDR_QUADLEN(obj->len); - __be32 *p; - - if (xdr_stream_encode_u32(xdr, obj->len) < 0) - return false; - p = xdr_reserve_space(xdr, obj->len); - if (!p) + if (obj->len > XDR_MAX_NETOBJ) return false; - p[quadlen - 1] = 0; /* XDR pad */ - memcpy(p, obj->data, obj->len); - - return true; + return xdr_stream_encode_opaque(xdr, obj->data, obj->len) > 0; } #endif /* _LOCKD_SVCXDR_H_ */ diff --git a/fs/locks.c b/fs/locks.c index 3d6fb4ae847b..0fca9d680978 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2,117 +2,11 @@ /* * linux/fs/locks.c * - * Provide support for fcntl()'s F_GETLK, F_SETLK, and F_SETLKW calls. - * Doug Evans (dje@spiff.uucp), August 07, 1992 + * We implement four types of file locks: BSD locks, posix locks, open + * file description locks, and leases. For details about BSD locks, + * see the flock(2) man page; for details about the other three, see + * fcntl(2). * - * Deadlock detection added. - * FIXME: one thing isn't handled yet: - * - mandatory locks (requires lots of changes elsewhere) - * Kelly Carmichael (kelly@[142.24.8.65]), September 17, 1994. - * - * Miscellaneous edits, and a total rewrite of posix_lock_file() code. - * Kai Petzke (wpp@marie.physik.tu-berlin.de), 1994 - * - * Converted file_lock_table to a linked list from an array, which eliminates - * the limits on how many active file locks are open. - * Chad Page (pageone@netcom.com), November 27, 1994 - * - * Removed dependency on file descriptors. dup()'ed file descriptors now - * get the same locks as the original file descriptors, and a close() on - * any file descriptor removes ALL the locks on the file for the current - * process. Since locks still depend on the process id, locks are inherited - * after an exec() but not after a fork(). This agrees with POSIX, and both - * BSD and SVR4 practice. - * Andy Walker (andy@lysaker.kvaerner.no), February 14, 1995 - * - * Scrapped free list which is redundant now that we allocate locks - * dynamically with kmalloc()/kfree(). - * Andy Walker (andy@lysaker.kvaerner.no), February 21, 1995 - * - * Implemented two lock personalities - FL_FLOCK and FL_POSIX. - * - * FL_POSIX locks are created with calls to fcntl() and lockf() through the - * fcntl() system call. They have the semantics described above. - * - * FL_FLOCK locks are created with calls to flock(), through the flock() - * system call, which is new. Old C libraries implement flock() via fcntl() - * and will continue to use the old, broken implementation. - * - * FL_FLOCK locks follow the 4.4 BSD flock() semantics. They are associated - * with a file pointer (filp). As a result they can be shared by a parent - * process and its children after a fork(). They are removed when the last - * file descriptor referring to the file pointer is closed (unless explicitly - * unlocked). - * - * FL_FLOCK locks never deadlock, an existing lock is always removed before - * upgrading from shared to exclusive (or vice versa). When this happens - * any processes blocked by the current lock are woken up and allowed to - * run before the new lock is applied. - * Andy Walker (andy@lysaker.kvaerner.no), June 09, 1995 - * - * Removed some race conditions in flock_lock_file(), marked other possible - * races. Just grep for FIXME to see them. - * Dmitry Gorodchanin (pgmdsg@ibi.com), February 09, 1996. - * - * Addressed Dmitry's concerns. Deadlock checking no longer recursive. - * Lock allocation changed to GFP_ATOMIC as we can't afford to sleep - * once we've checked for blocking and deadlocking. - * Andy Walker (andy@lysaker.kvaerner.no), April 03, 1996. - * - * Initial implementation of mandatory locks. SunOS turned out to be - * a rotten model, so I implemented the "obvious" semantics. - * See 'Documentation/filesystems/mandatory-locking.rst' for details. - * Andy Walker (andy@lysaker.kvaerner.no), April 06, 1996. - * - * Don't allow mandatory locks on mmap()'ed files. Added simple functions to - * check if a file has mandatory locks, used by mmap(), open() and creat() to - * see if system call should be rejected. Ref. HP-UX/SunOS/Solaris Reference - * Manual, Section 2. - * Andy Walker (andy@lysaker.kvaerner.no), April 09, 1996. - * - * Tidied up block list handling. Added '/proc/locks' interface. - * Andy Walker (andy@lysaker.kvaerner.no), April 24, 1996. - * - * Fixed deadlock condition for pathological code that mixes calls to - * flock() and fcntl(). - * Andy Walker (andy@lysaker.kvaerner.no), April 29, 1996. - * - * Allow only one type of locking scheme (FL_POSIX or FL_FLOCK) to be in use - * for a given file at a time. Changed the CONFIG_LOCK_MANDATORY scheme to - * guarantee sensible behaviour in the case where file system modules might - * be compiled with different options than the kernel itself. - * Andy Walker (andy@lysaker.kvaerner.no), May 15, 1996. - * - * Added a couple of missing wake_up() calls. Thanks to Thomas Meckel - * (Thomas.Meckel@mni.fh-giessen.de) for spotting this. - * Andy Walker (andy@lysaker.kvaerner.no), May 15, 1996. - * - * Changed FL_POSIX locks to use the block list in the same way as FL_FLOCK - * locks. Changed process synchronisation to avoid dereferencing locks that - * have already been freed. - * Andy Walker (andy@lysaker.kvaerner.no), Sep 21, 1996. - * - * Made the block list a circular list to minimise searching in the list. - * Andy Walker (andy@lysaker.kvaerner.no), Sep 25, 1996. - * - * Made mandatory locking a mount option. Default is not to allow mandatory - * locking. - * Andy Walker (andy@lysaker.kvaerner.no), Oct 04, 1996. - * - * Some adaptations for NFS support. - * Olaf Kirch (okir@monad.swb.de), Dec 1996, - * - * Fixed /proc/locks interface so that we can't overrun the buffer we are handed. - * Andy Walker (andy@lysaker.kvaerner.no), May 12, 1997. - * - * Use slab allocator instead of kmalloc/kfree. - * Use generic list implementation from <linux/list.h>. - * Sped up posix_locks_deadlock by only considering blocked locks. - * Matthew Wilcox <willy@debian.org>, March, 2000. - * - * Leases and LOCK_MAND - * Matthew Wilcox <willy@debian.org>, June, 2000. - * Stephen Rothwell <sfr@canb.auug.org.au>, June, 2000. * * Locking conflicts and dependencies: * If multiple threads attempt to lock the same byte (or flock the same file) @@ -461,8 +355,6 @@ static void locks_move_blocks(struct file_lock *new, struct file_lock *fl) } static inline int flock_translate_cmd(int cmd) { - if (cmd & LOCK_MAND) - return cmd & (LOCK_MAND | LOCK_RW); switch (cmd) { case LOCK_SH: return F_RDLCK; @@ -942,8 +834,6 @@ static bool flock_locks_conflict(struct file_lock *caller_fl, */ if (caller_fl->fl_file == sys_fl->fl_file) return false; - if ((caller_fl->fl_type & LOCK_MAND) || (sys_fl->fl_type & LOCK_MAND)) - return false; return locks_conflict(caller_fl, sys_fl); } @@ -2116,11 +2006,9 @@ EXPORT_SYMBOL(locks_lock_inode_wait); * - %LOCK_SH -- a shared lock. * - %LOCK_EX -- an exclusive lock. * - %LOCK_UN -- remove an existing lock. - * - %LOCK_MAND -- a 'mandatory' flock. - * This exists to emulate Windows Share Modes. + * - %LOCK_MAND -- a 'mandatory' flock. (DEPRECATED) * - * %LOCK_MAND can be combined with %LOCK_READ or %LOCK_WRITE to allow other - * processes read and write access respectively. + * %LOCK_MAND support has been removed from the kernel. */ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) { @@ -2137,9 +2025,22 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) cmd &= ~LOCK_NB; unlock = (cmd == LOCK_UN); - if (!unlock && !(cmd & LOCK_MAND) && - !(f.file->f_mode & (FMODE_READ|FMODE_WRITE))) + if (!unlock && !(f.file->f_mode & (FMODE_READ|FMODE_WRITE))) + goto out_putf; + + /* + * LOCK_MAND locks were broken for a long time in that they never + * conflicted with one another and didn't prevent any sort of open, + * read or write activity. + * + * Just ignore these requests now, to preserve legacy behavior, but + * throw a warning to let people know that they don't actually work. + */ + if (cmd & LOCK_MAND) { + pr_warn_once("Attempt to set a LOCK_MAND lock via flock(2). This support has been removed and the request ignored.\n"); + error = 0; goto out_putf; + } lock = flock_make_lock(f.file, cmd, NULL); if (IS_ERR(lock)) { @@ -2718,6 +2619,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, struct inode *inode = NULL; unsigned int fl_pid; struct pid_namespace *proc_pidns = proc_pid_ns(file_inode(f->file)->i_sb); + int type; fl_pid = locks_translate_pid(fl, proc_pidns); /* @@ -2745,11 +2647,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, seq_printf(f, " %s ", (inode == NULL) ? "*NOINODE*" : "ADVISORY "); } else if (IS_FLOCK(fl)) { - if (fl->fl_type & LOCK_MAND) { - seq_puts(f, "FLOCK MSNFS "); - } else { - seq_puts(f, "FLOCK ADVISORY "); - } + seq_puts(f, "FLOCK ADVISORY "); } else if (IS_LEASE(fl)) { if (fl->fl_flags & FL_DELEG) seq_puts(f, "DELEG "); @@ -2765,17 +2663,10 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, } else { seq_puts(f, "UNKNOWN UNKNOWN "); } - if (fl->fl_type & LOCK_MAND) { - seq_printf(f, "%s ", - (fl->fl_type & LOCK_READ) - ? (fl->fl_type & LOCK_WRITE) ? "RW " : "READ " - : (fl->fl_type & LOCK_WRITE) ? "WRITE" : "NONE "); - } else { - int type = IS_LEASE(fl) ? target_leasetype(fl) : fl->fl_type; + type = IS_LEASE(fl) ? target_leasetype(fl) : fl->fl_type; - seq_printf(f, "%s ", (type == F_WRLCK) ? "WRITE" : - (type == F_RDLCK) ? "READ" : "UNLCK"); - } + seq_printf(f, "%s ", (type == F_WRLCK) ? "WRITE" : + (type == F_RDLCK) ? "READ" : "UNLCK"); if (inode) { /* userspace relies on this representation of dev_t */ seq_printf(f, "%d %02x:%02x:%lu ", fl_pid, diff --git a/fs/namei.c b/fs/namei.c index 1946d9667790..1f9d2187c765 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3076,9 +3076,7 @@ static int handle_truncate(struct user_namespace *mnt_userns, struct file *filp) int error = get_write_access(inode); if (error) return error; - /* - * Refuse to truncate files with mandatory locks held on them. - */ + error = security_path_truncate(path); if (!error) { error = do_truncate(mnt_userns, path->dentry, 0, diff --git a/fs/netfs/read_helper.c b/fs/netfs/read_helper.c index 0b6cd3b8734c..994ec22d4040 100644 --- a/fs/netfs/read_helper.c +++ b/fs/netfs/read_helper.c @@ -150,7 +150,7 @@ static void netfs_clear_unread(struct netfs_read_subrequest *subreq) { struct iov_iter iter; - iov_iter_xarray(&iter, WRITE, &subreq->rreq->mapping->i_pages, + iov_iter_xarray(&iter, READ, &subreq->rreq->mapping->i_pages, subreq->start + subreq->transferred, subreq->len - subreq->transferred); iov_iter_zero(iov_iter_count(&iter), &iter); diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index acb1d22907da..5e56da748b2a 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -252,7 +252,7 @@ bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d, d->bdev = bdev; - d->len = i_size_read(d->bdev->bd_inode); + d->len = bdev_nr_bytes(d->bdev); d->map = bl_map_simple; printk(KERN_INFO "pNFS: using block device %s\n", @@ -367,7 +367,7 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d, return PTR_ERR(bdev); d->bdev = bdev; - d->len = i_size_read(d->bdev->bd_inode); + d->len = bdev_nr_bytes(d->bdev); d->map = bl_map_simple; d->pr_key = v->scsi.pr_key; diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 2e894fec036b..7a5f287c5391 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -275,7 +275,7 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq) res = (long) dreq->count; WARN_ON_ONCE(dreq->count < 0); } - dreq->iocb->ki_complete(dreq->iocb, res, 0); + dreq->iocb->ki_complete(dreq->iocb, res); } complete(&dreq->completion); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index aa353fd58240..24e7dccce355 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -843,15 +843,6 @@ int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) if (!(fl->fl_flags & FL_FLOCK)) return -ENOLCK; - /* - * The NFSv4 protocol doesn't support LOCK_MAND, which is not part of - * any standard. In principle we might be able to support LOCK_MAND - * on NFSv2/3 since NLMv3/4 support DOS share modes, but for now the - * NFS code is not set up for it. - */ - if (fl->fl_type & LOCK_MAND) - return -EINVAL; - if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK) is_local = 1; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index e1214bb6b7ee..459860aa8fd7 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -127,7 +127,8 @@ nfs4_label_init_security(struct inode *dir, struct dentry *dentry, return NULL; err = security_dentry_init_security(dentry, sattr->ia_mode, - &dentry->d_name, (void **)&label->label, &label->len); + &dentry->d_name, NULL, + (void **)&label->label, &label->len); if (err == 0) return label; diff --git a/fs/nfs_common/grace.c b/fs/nfs_common/grace.c index edec45831585..0a9b72685f98 100644 --- a/fs/nfs_common/grace.c +++ b/fs/nfs_common/grace.c @@ -42,7 +42,6 @@ EXPORT_SYMBOL_GPL(locks_start_grace); /** * locks_end_grace - * @net: net namespace that this lock manager belongs to * @lm: who this grace period is for * * Call this function to state that the given lock manager is ready to diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 6e9ea4ee0f73..3d1d17256a91 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -109,7 +109,6 @@ config NFSD_SCSILAYOUT depends on NFSD_V4 && BLOCK select NFSD_PNFS select EXPORTFS_BLOCK_OPS - select SCSI_COMMON help This option enables support for the exporting pNFS SCSI layouts in the kernel's NFS server. The pNFS SCSI layout enables NFS diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index c99dee99a3c1..e5c0982a381d 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -9,9 +9,6 @@ #include <linux/pr.h> #include <linux/nfsd/debug.h> -#include <scsi/scsi_proto.h> -#include <scsi/scsi_common.h> -#include <scsi/scsi_request.h> #include "blocklayoutxdr.h" #include "pnfs.h" @@ -211,109 +208,6 @@ const struct nfsd4_layout_ops bl_layout_ops = { #endif /* CONFIG_NFSD_BLOCKLAYOUT */ #ifdef CONFIG_NFSD_SCSILAYOUT -static int nfsd4_scsi_identify_device(struct block_device *bdev, - struct pnfs_block_volume *b) -{ - struct request_queue *q = bdev->bd_disk->queue; - struct request *rq; - struct scsi_request *req; - /* - * The allocation length (passed in bytes 3 and 4 of the INQUIRY - * command descriptor block) specifies the number of bytes that have - * been allocated for the data-in buffer. - * 252 is the highest one-byte value that is a multiple of 4. - * 65532 is the highest two-byte value that is a multiple of 4. - */ - size_t bufflen = 252, maxlen = 65532, len, id_len; - u8 *buf, *d, type, assoc; - int retries = 1, error; - - if (WARN_ON_ONCE(!blk_queue_scsi_passthrough(q))) - return -EINVAL; - -again: - buf = kzalloc(bufflen, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - rq = blk_get_request(q, REQ_OP_DRV_IN, 0); - if (IS_ERR(rq)) { - error = -ENOMEM; - goto out_free_buf; - } - req = scsi_req(rq); - - error = blk_rq_map_kern(q, rq, buf, bufflen, GFP_KERNEL); - if (error) - goto out_put_request; - - req->cmd[0] = INQUIRY; - req->cmd[1] = 1; - req->cmd[2] = 0x83; - req->cmd[3] = bufflen >> 8; - req->cmd[4] = bufflen & 0xff; - req->cmd_len = COMMAND_SIZE(INQUIRY); - - blk_execute_rq(NULL, rq, 1); - if (req->result) { - pr_err("pNFS: INQUIRY 0x83 failed with: %x\n", - req->result); - error = -EIO; - goto out_put_request; - } - - len = (buf[2] << 8) + buf[3] + 4; - if (len > bufflen) { - if (len <= maxlen && retries--) { - blk_put_request(rq); - kfree(buf); - bufflen = len; - goto again; - } - pr_err("pNFS: INQUIRY 0x83 response invalid (len = %zd)\n", - len); - goto out_put_request; - } - - d = buf + 4; - for (d = buf + 4; d < buf + len; d += id_len + 4) { - id_len = d[3]; - type = d[1] & 0xf; - assoc = (d[1] >> 4) & 0x3; - - /* - * We only care about a EUI-64 and NAA designator types - * with LU association. - */ - if (assoc != 0x00) - continue; - if (type != 0x02 && type != 0x03) - continue; - if (id_len != 8 && id_len != 12 && id_len != 16) - continue; - - b->scsi.code_set = PS_CODE_SET_BINARY; - b->scsi.designator_type = type == 0x02 ? - PS_DESIGNATOR_EUI64 : PS_DESIGNATOR_NAA; - b->scsi.designator_len = id_len; - memcpy(b->scsi.designator, d + 4, id_len); - - /* - * If we found a 8 or 12 byte descriptor continue on to - * see if a 16 byte one is available. If we find a - * 16 byte descriptor we're done. - */ - if (id_len == 16) - break; - } - -out_put_request: - blk_put_request(rq); -out_free_buf: - kfree(buf); - return error; -} - #define NFSD_MDS_PR_KEY 0x0100000000000000ULL /* @@ -325,6 +219,31 @@ static u64 nfsd4_scsi_pr_key(struct nfs4_client *clp) return ((u64)clp->cl_clientid.cl_boot << 32) | clp->cl_clientid.cl_id; } +static const u8 designator_types[] = { + PS_DESIGNATOR_EUI64, + PS_DESIGNATOR_NAA, +}; + +static int +nfsd4_block_get_unique_id(struct gendisk *disk, struct pnfs_block_volume *b) +{ + int ret, i; + + for (i = 0; i < ARRAY_SIZE(designator_types); i++) { + u8 type = designator_types[i]; + + ret = disk->fops->get_unique_id(disk, b->scsi.designator, type); + if (ret > 0) { + b->scsi.code_set = PS_CODE_SET_BINARY; + b->scsi.designator_type = type; + b->scsi.designator_len = ret; + return 0; + } + } + + return -EINVAL; +} + static int nfsd4_block_get_device_info_scsi(struct super_block *sb, struct nfs4_client *clp, @@ -333,7 +252,7 @@ nfsd4_block_get_device_info_scsi(struct super_block *sb, struct pnfs_block_deviceaddr *dev; struct pnfs_block_volume *b; const struct pr_ops *ops; - int error; + int ret; dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) + sizeof(struct pnfs_block_volume), GFP_KERNEL); @@ -347,33 +266,38 @@ nfsd4_block_get_device_info_scsi(struct super_block *sb, b->type = PNFS_BLOCK_VOLUME_SCSI; b->scsi.pr_key = nfsd4_scsi_pr_key(clp); - error = nfsd4_scsi_identify_device(sb->s_bdev, b); - if (error) - return error; + ret = nfsd4_block_get_unique_id(sb->s_bdev->bd_disk, b); + if (ret < 0) + goto out_free_dev; + ret = -EINVAL; ops = sb->s_bdev->bd_disk->fops->pr_ops; if (!ops) { pr_err("pNFS: device %s does not support PRs.\n", sb->s_id); - return -EINVAL; + goto out_free_dev; } - error = ops->pr_register(sb->s_bdev, 0, NFSD_MDS_PR_KEY, true); - if (error) { + ret = ops->pr_register(sb->s_bdev, 0, NFSD_MDS_PR_KEY, true); + if (ret) { pr_err("pNFS: failed to register key for device %s.\n", sb->s_id); - return -EINVAL; + goto out_free_dev; } - error = ops->pr_reserve(sb->s_bdev, NFSD_MDS_PR_KEY, + ret = ops->pr_reserve(sb->s_bdev, NFSD_MDS_PR_KEY, PR_EXCLUSIVE_ACCESS_REG_ONLY, 0); - if (error) { + if (ret) { pr_err("pNFS: failed to reserve device %s.\n", sb->s_id); - return -EINVAL; + goto out_free_dev; } return 0; + +out_free_dev: + kfree(dev); + return ret; } static __be32 diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 7629248fdd53..fdf89fcf1a0c 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -542,7 +542,7 @@ nfsd_file_close_inode_sync(struct inode *inode) } /** - * nfsd_file_close_inode_sync - attempt to forcibly close a nfsd_file + * nfsd_file_close_inode - attempt a delayed close of a nfsd_file * @inode: inode of the file to attempt to remove * * Walk the whole hash bucket, looking for any files that correspond to "inode". @@ -602,6 +602,9 @@ nfsd_file_fsnotify_handle_event(struct fsnotify_mark *mark, u32 mask, struct inode *inode, struct inode *dir, const struct qstr *name, u32 cookie) { + if (WARN_ON_ONCE(!inode)) + return 0; + trace_nfsd_file_fsnotify_handle_event(inode, mask); /* Should be no marks on non-regular files */ diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index a97873f2d22b..6d1b5bb051c5 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -145,8 +145,9 @@ void nfsd4_setup_layout_type(struct svc_export *exp) #ifdef CONFIG_NFSD_SCSILAYOUT if (sb->s_export_op->map_blocks && sb->s_export_op->commit_blocks && - sb->s_bdev && sb->s_bdev->bd_disk->fops->pr_ops && - blk_queue_scsi_passthrough(sb->s_bdev->bd_disk->queue)) + sb->s_bdev && + sb->s_bdev->bd_disk->fops->pr_ops && + sb->s_bdev->bd_disk->fops->get_unique_id) exp->ex_layout_types |= 1 << LAYOUT_SCSI; #endif } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 42356416f0a0..3f4027a5de88 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3570,7 +3570,7 @@ static struct nfsd4_conn *__nfsd4_find_conn(struct svc_xprt *xpt, struct nfsd4_s } static __be32 nfsd4_match_existing_connection(struct svc_rqst *rqst, - struct nfsd4_session *session, u32 req) + struct nfsd4_session *session, u32 req, struct nfsd4_conn **conn) { struct nfs4_client *clp = session->se_client; struct svc_xprt *xpt = rqst->rq_xprt; @@ -3593,6 +3593,8 @@ static __be32 nfsd4_match_existing_connection(struct svc_rqst *rqst, else status = nfserr_inval; spin_unlock(&clp->cl_lock); + if (status == nfs_ok && conn) + *conn = c; return status; } @@ -3617,8 +3619,16 @@ __be32 nfsd4_bind_conn_to_session(struct svc_rqst *rqstp, status = nfserr_wrong_cred; if (!nfsd4_mach_creds_match(session->se_client, rqstp)) goto out; - status = nfsd4_match_existing_connection(rqstp, session, bcts->dir); - if (status == nfs_ok || status == nfserr_inval) + status = nfsd4_match_existing_connection(rqstp, session, + bcts->dir, &conn); + if (status == nfs_ok) { + if (bcts->dir == NFS4_CDFC4_FORE_OR_BOTH || + bcts->dir == NFS4_CDFC4_BACK) + conn->cn_flags |= NFS4_CDFC4_BACK; + nfsd4_probe_callback(session->se_client); + goto out; + } + if (status == nfserr_inval) goto out; status = nfsd4_map_bcts_dir(&bcts->dir); if (status) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 7abeccb975b2..cf030ebe2827 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3544,15 +3544,18 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, goto fail; cd->rd_maxcount -= entry_bytes; /* - * RFC 3530 14.2.24 describes rd_dircount as only a "hint", so - * let's always let through the first entry, at least: + * RFC 3530 14.2.24 describes rd_dircount as only a "hint", and + * notes that it could be zero. If it is zero, then the server + * should enforce only the rd_maxcount value. */ - if (!cd->rd_dircount) - goto fail; - name_and_cookie = 4 + 4 * XDR_QUADLEN(namlen) + 8; - if (name_and_cookie > cd->rd_dircount && cd->cookie_offset) - goto fail; - cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie); + if (cd->rd_dircount) { + name_and_cookie = 4 + 4 * XDR_QUADLEN(namlen) + 8; + if (name_and_cookie > cd->rd_dircount && cd->cookie_offset) + goto fail; + cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie); + if (!cd->rd_dircount) + cd->rd_maxcount = 0; + } cd->cookie_offset = cookie_offset; skip_entry: diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index c2c3d9077dc5..070e5dd03e26 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -793,7 +793,10 @@ out_close: svc_xprt_put(xprt); } out_err: - nfsd_destroy(net); + if (!list_empty(&nn->nfsd_serv->sv_permsocks)) + nn->nfsd_serv->sv_nrthreads--; + else + nfsd_destroy(net); return err; } @@ -1545,7 +1548,7 @@ static int __init init_nfsd(void) goto out_free_all; return 0; out_free_all: - unregister_pernet_subsys(&nfsd_net_ops); + unregister_filesystem(&nfsd_fs_type); out_free_exports: remove_proc_entry("fs/nfs/exports", NULL); remove_proc_entry("fs/nfs", NULL); diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c index adf3bb0a8048..6ce8617b562d 100644 --- a/fs/nilfs2/alloc.c +++ b/fs/nilfs2/alloc.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0+ /* - * alloc.c - NILFS dat/inode allocator + * NILFS dat/inode allocator * * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. * diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h index 0303c3968cee..b667e869ac07 100644 --- a/fs/nilfs2/alloc.h +++ b/fs/nilfs2/alloc.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0+ */ /* - * alloc.h - persistent object (dat entry/disk inode) allocator/deallocator + * Persistent object (dat entry/disk inode) allocator/deallocator * * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. * diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c index 5900879d5693..798a2c1b38c6 100644 --- a/fs/nilfs2/bmap.c +++ b/fs/nilfs2/bmap.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0+ /* - * bmap.c - NILFS block mapping. + * NILFS block mapping. * * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. * diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h index 2c63858e81c9..608168a5cb88 100644 --- a/fs/nilfs2/bmap.h +++ b/fs/nilfs2/bmap.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0+ */ /* - * bmap.h - NILFS block mapping. + * NILFS block mapping. * * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. * diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 4391fd3abd8f..66bdaa2cf496 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0+ /* - * btnode.c - NILFS B-tree node cache + * NILFS B-tree node cache * * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. * diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h index 0f88dbc9bcb3..11663650add7 100644 --- a/fs/nilfs2/btnode.h +++ b/fs/nilfs2/btnode.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0+ */ /* - * btnode.h - NILFS B-tree node cache + * NILFS B-tree node cache * * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. * diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index ab9ec073330f..3594eabe1419 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0+ /* - * btree.c - NILFS B-tree. + * NILFS B-tree. * * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. * diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h index d1421b646ce4..92868e1a48ca 100644 --- a/fs/nilfs2/btree.h +++ b/fs/nilfs2/btree.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0+ */ /* - * btree.h - NILFS B-tree. + * NILFS B-tree. * * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. * diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index ce144776b4ef..9ebefb3acb0e 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0+ /* - * cpfile.c - NILFS checkpoint file. + * NILFS checkpoint file. * * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. * diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h index 6336222df24a..edabb2dc5756 100644 --- a/fs/nilfs2/cpfile.h +++ b/fs/nilfs2/cpfile.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0+ */ /* - * cpfile.h - NILFS checkpoint file. + * NILFS checkpoint file. * * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. * diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index 8bccdf1158fc..dc51d3b7a7bf 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0+ /* - * dat.c - NILFS disk address translation. + * NILFS disk address translation. * * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. * diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h index b17ee34580ae..468c82d26183 100644 --- a/fs/nilfs2/dat.h +++ b/fs/nilfs2/dat.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0+ */ /* - * dat.h - NILFS disk address translation. + * NILFS disk address translation. * * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. * diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 81394e22d0a0..f8f4c2ff52f4 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0+ /* - * dir.c - NILFS directory entry operations + * NILFS directory entry operations * * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. * diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c index f353101955e3..a35f2795b242 100644 --- a/fs/nilfs2/direct.c +++ b/fs/nilfs2/direct.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0+ /* - * direct.c - NILFS direct block pointer. + * NILFS direct block pointer. * * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. * diff --git a/fs/nilfs2/direct.h b/fs/nilfs2/direct.h index ec9a23c77994..b7ca896269af 100644 --- a/fs/nilfs2/direct.h +++ b/fs/nilfs2/direct.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0+ */ /* - * direct.h - NILFS direct block pointer. + * NILFS direct block pointer. * * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. * diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index 7cf765258fda..a265d391ffe9 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0+ /* - * file.c - NILFS regular file handling primitives including fsync(). + * NILFS regular file handling primitives including fsync(). * * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. * diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index 448320496856..a8f5315f01e3 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0+ /* - * gcinode.c - dummy inodes to buffer blocks for garbage collection + * Dummy inodes to buffer blocks for garbage collection * * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. * diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c index 02727ed3a7c6..a8a4bc8490b4 100644 --- a/fs/nilfs2/ifile.c +++ b/fs/nilfs2/ifile.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0+ /* - * ifile.c - NILFS inode file + * NILFS inode file * * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. * diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h index a1e1e5711a05..35c5273f4821 100644 --- a/fs/nilfs2/ifile.h +++ b/fs/nilfs2/ifile.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0+ */ /* - * ifile.h - NILFS inode file + * NILFS inode file * * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. * diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 2e8eb263cf0f..e3d807d5b83a 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0+ /* - * inode.c - NILFS inode operations. + * NILFS inode operations. * * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. * diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 640ac8fe891e..fec194a666f4 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0+ /* - * ioctl.c - NILFS ioctl operations. + * NILFS ioctl operations. * * Copyright (C) 2007, 2008 Nippon Telegraph and Telephone Corporation. * @@ -1107,7 +1107,7 @@ static int nilfs_ioctl_set_alloc_range(struct inode *inode, void __user *argp) goto out; ret = -ERANGE; - if (range[1] > i_size_read(inode->i_sb->s_bdev->bd_inode)) + if (range[1] > bdev_nr_bytes(inode->i_sb->s_bdev)) goto out; segbytes = nilfs->ns_blocks_per_segment * nilfs->ns_blocksize; diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 97769fe4d588..4b3d33cf0041 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0+ /* - * mdt.c - meta data file for NILFS + * Meta data file for NILFS * * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. * diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h index e77aea4bb921..8f86080a436d 100644 --- a/fs/nilfs2/mdt.h +++ b/fs/nilfs2/mdt.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0+ */ /* - * mdt.h - NILFS meta data file prototype and definitions + * NILFS meta data file prototype and definitions * * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. * diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 91eebeb0c48b..23899e0ae850 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0+ /* - * namei.c - NILFS pathname lookup operations. + * NILFS pathname lookup operations. * * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. * diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 60b21b6eeac0..a7b81755c350 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0+ */ /* - * nilfs.h - NILFS local header file. + * NILFS local header file. * * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. * diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 171fb5cd427f..bc3e2cd4117f 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0+ /* - * page.c - buffer/page management specific to NILFS + * Buffer/page management specific to NILFS * * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. * diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h index 62b9bb469e92..569263b23c0c 100644 --- a/fs/nilfs2/page.h +++ b/fs/nilfs2/page.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0+ */ /* - * page.h - buffer/page management specific to NILFS + * Buffer/page management specific to NILFS * * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. * diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index 2217f904a7cf..9e2ed76c0f25 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0+ /* - * recovery.c - NILFS recovery logic + * NILFS recovery logic * * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. * diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 56872e93823d..43287b0d3e9b 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0+ /* - * segbuf.c - NILFS segment buffer + * NILFS segment buffer * * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. * diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h index 9bea1bd59041..e20091ededba 100644 --- a/fs/nilfs2/segbuf.h +++ b/fs/nilfs2/segbuf.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0+ */ /* - * segbuf.h - NILFS Segment buffer prototypes and definitions + * NILFS Segment buffer prototypes and definitions * * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. * diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 686c8ee7b29c..85a853334771 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0+ /* - * segment.c - NILFS segment constructor. + * NILFS segment constructor. * * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. * diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h index f5cf5308f3fc..1060f72ebf5a 100644 --- a/fs/nilfs2/segment.h +++ b/fs/nilfs2/segment.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0+ */ /* - * segment.h - NILFS Segment constructor prototypes and definitions + * NILFS Segment constructor prototypes and definitions * * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. * diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 63722475e17e..e385cca2004a 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0+ /* - * sufile.c - NILFS segment usage file. + * NILFS segment usage file. * * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. * diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h index c4e2c7a7add1..8e8a1a5a0402 100644 --- a/fs/nilfs2/sufile.h +++ b/fs/nilfs2/sufile.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0+ */ /* - * sufile.h - NILFS segment usage file. + * NILFS segment usage file. * * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. * diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index f6b2d280aab5..63e5fa74016c 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0+ /* - * super.c - NILFS module and super block management. + * NILFS module and super block management. * * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. * @@ -403,7 +403,7 @@ int nilfs_resize_fs(struct super_block *sb, __u64 newsize) int ret; ret = -ERANGE; - devsize = i_size_read(sb->s_bdev->bd_inode); + devsize = bdev_nr_bytes(sb->s_bdev); if (newsize > devsize) goto out; diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c index 62f8a7ac19c8..81f35c5b5a40 100644 --- a/fs/nilfs2/sysfs.c +++ b/fs/nilfs2/sysfs.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0+ /* - * sysfs.c - sysfs support implementation. + * Sysfs support implementation. * * Copyright (C) 2005-2014 Nippon Telegraph and Telephone Corporation. * Copyright (C) 2014 HGST, Inc., a Western Digital Company. @@ -95,7 +95,7 @@ static ssize_t nilfs_snapshot_inodes_count_show(struct nilfs_snapshot_attr *attr, struct nilfs_root *root, char *buf) { - return snprintf(buf, PAGE_SIZE, "%llu\n", + return sysfs_emit(buf, "%llu\n", (unsigned long long)atomic64_read(&root->inodes_count)); } @@ -103,7 +103,7 @@ static ssize_t nilfs_snapshot_blocks_count_show(struct nilfs_snapshot_attr *attr, struct nilfs_root *root, char *buf) { - return snprintf(buf, PAGE_SIZE, "%llu\n", + return sysfs_emit(buf, "%llu\n", (unsigned long long)atomic64_read(&root->blocks_count)); } @@ -116,7 +116,7 @@ static ssize_t nilfs_snapshot_README_show(struct nilfs_snapshot_attr *attr, struct nilfs_root *root, char *buf) { - return snprintf(buf, PAGE_SIZE, snapshot_readme_str); + return sysfs_emit(buf, snapshot_readme_str); } NILFS_SNAPSHOT_RO_ATTR(inodes_count); @@ -217,7 +217,7 @@ static ssize_t nilfs_mounted_snapshots_README_show(struct nilfs_mounted_snapshots_attr *attr, struct the_nilfs *nilfs, char *buf) { - return snprintf(buf, PAGE_SIZE, mounted_snapshots_readme_str); + return sysfs_emit(buf, mounted_snapshots_readme_str); } NILFS_MOUNTED_SNAPSHOTS_RO_ATTR(README); @@ -255,7 +255,7 @@ nilfs_checkpoints_checkpoints_number_show(struct nilfs_checkpoints_attr *attr, ncheckpoints = cpstat.cs_ncps; - return snprintf(buf, PAGE_SIZE, "%llu\n", ncheckpoints); + return sysfs_emit(buf, "%llu\n", ncheckpoints); } static ssize_t @@ -278,7 +278,7 @@ nilfs_checkpoints_snapshots_number_show(struct nilfs_checkpoints_attr *attr, nsnapshots = cpstat.cs_nsss; - return snprintf(buf, PAGE_SIZE, "%llu\n", nsnapshots); + return sysfs_emit(buf, "%llu\n", nsnapshots); } static ssize_t @@ -292,7 +292,7 @@ nilfs_checkpoints_last_seg_checkpoint_show(struct nilfs_checkpoints_attr *attr, last_cno = nilfs->ns_last_cno; spin_unlock(&nilfs->ns_last_segment_lock); - return snprintf(buf, PAGE_SIZE, "%llu\n", last_cno); + return sysfs_emit(buf, "%llu\n", last_cno); } static ssize_t @@ -306,7 +306,7 @@ nilfs_checkpoints_next_checkpoint_show(struct nilfs_checkpoints_attr *attr, cno = nilfs->ns_cno; up_read(&nilfs->ns_segctor_sem); - return snprintf(buf, PAGE_SIZE, "%llu\n", cno); + return sysfs_emit(buf, "%llu\n", cno); } static const char checkpoints_readme_str[] = @@ -322,7 +322,7 @@ static ssize_t nilfs_checkpoints_README_show(struct nilfs_checkpoints_attr *attr, struct the_nilfs *nilfs, char *buf) { - return snprintf(buf, PAGE_SIZE, checkpoints_readme_str); + return sysfs_emit(buf, checkpoints_readme_str); } NILFS_CHECKPOINTS_RO_ATTR(checkpoints_number); @@ -353,7 +353,7 @@ nilfs_segments_segments_number_show(struct nilfs_segments_attr *attr, struct the_nilfs *nilfs, char *buf) { - return snprintf(buf, PAGE_SIZE, "%lu\n", nilfs->ns_nsegments); + return sysfs_emit(buf, "%lu\n", nilfs->ns_nsegments); } static ssize_t @@ -361,7 +361,7 @@ nilfs_segments_blocks_per_segment_show(struct nilfs_segments_attr *attr, struct the_nilfs *nilfs, char *buf) { - return snprintf(buf, PAGE_SIZE, "%lu\n", nilfs->ns_blocks_per_segment); + return sysfs_emit(buf, "%lu\n", nilfs->ns_blocks_per_segment); } static ssize_t @@ -375,7 +375,7 @@ nilfs_segments_clean_segments_show(struct nilfs_segments_attr *attr, ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile); up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); - return snprintf(buf, PAGE_SIZE, "%lu\n", ncleansegs); + return sysfs_emit(buf, "%lu\n", ncleansegs); } static ssize_t @@ -395,7 +395,7 @@ nilfs_segments_dirty_segments_show(struct nilfs_segments_attr *attr, return err; } - return snprintf(buf, PAGE_SIZE, "%llu\n", sustat.ss_ndirtysegs); + return sysfs_emit(buf, "%llu\n", sustat.ss_ndirtysegs); } static const char segments_readme_str[] = @@ -411,7 +411,7 @@ nilfs_segments_README_show(struct nilfs_segments_attr *attr, struct the_nilfs *nilfs, char *buf) { - return snprintf(buf, PAGE_SIZE, segments_readme_str); + return sysfs_emit(buf, segments_readme_str); } NILFS_SEGMENTS_RO_ATTR(segments_number); @@ -448,7 +448,7 @@ nilfs_segctor_last_pseg_block_show(struct nilfs_segctor_attr *attr, last_pseg = nilfs->ns_last_pseg; spin_unlock(&nilfs->ns_last_segment_lock); - return snprintf(buf, PAGE_SIZE, "%llu\n", + return sysfs_emit(buf, "%llu\n", (unsigned long long)last_pseg); } @@ -463,7 +463,7 @@ nilfs_segctor_last_seg_sequence_show(struct nilfs_segctor_attr *attr, last_seq = nilfs->ns_last_seq; spin_unlock(&nilfs->ns_last_segment_lock); - return snprintf(buf, PAGE_SIZE, "%llu\n", last_seq); + return sysfs_emit(buf, "%llu\n", last_seq); } static ssize_t @@ -477,7 +477,7 @@ nilfs_segctor_last_seg_checkpoint_show(struct nilfs_segctor_attr *attr, last_cno = nilfs->ns_last_cno; spin_unlock(&nilfs->ns_last_segment_lock); - return snprintf(buf, PAGE_SIZE, "%llu\n", last_cno); + return sysfs_emit(buf, "%llu\n", last_cno); } static ssize_t @@ -491,7 +491,7 @@ nilfs_segctor_current_seg_sequence_show(struct nilfs_segctor_attr *attr, seg_seq = nilfs->ns_seg_seq; up_read(&nilfs->ns_segctor_sem); - return snprintf(buf, PAGE_SIZE, "%llu\n", seg_seq); + return sysfs_emit(buf, "%llu\n", seg_seq); } static ssize_t @@ -505,7 +505,7 @@ nilfs_segctor_current_last_full_seg_show(struct nilfs_segctor_attr *attr, segnum = nilfs->ns_segnum; up_read(&nilfs->ns_segctor_sem); - return snprintf(buf, PAGE_SIZE, "%llu\n", segnum); + return sysfs_emit(buf, "%llu\n", segnum); } static ssize_t @@ -519,7 +519,7 @@ nilfs_segctor_next_full_seg_show(struct nilfs_segctor_attr *attr, nextnum = nilfs->ns_nextnum; up_read(&nilfs->ns_segctor_sem); - return snprintf(buf, PAGE_SIZE, "%llu\n", nextnum); + return sysfs_emit(buf, "%llu\n", nextnum); } static ssize_t @@ -533,7 +533,7 @@ nilfs_segctor_next_pseg_offset_show(struct nilfs_segctor_attr *attr, pseg_offset = nilfs->ns_pseg_offset; up_read(&nilfs->ns_segctor_sem); - return snprintf(buf, PAGE_SIZE, "%lu\n", pseg_offset); + return sysfs_emit(buf, "%lu\n", pseg_offset); } static ssize_t @@ -547,7 +547,7 @@ nilfs_segctor_next_checkpoint_show(struct nilfs_segctor_attr *attr, cno = nilfs->ns_cno; up_read(&nilfs->ns_segctor_sem); - return snprintf(buf, PAGE_SIZE, "%llu\n", cno); + return sysfs_emit(buf, "%llu\n", cno); } static ssize_t @@ -575,7 +575,7 @@ nilfs_segctor_last_seg_write_time_secs_show(struct nilfs_segctor_attr *attr, ctime = nilfs->ns_ctime; up_read(&nilfs->ns_segctor_sem); - return snprintf(buf, PAGE_SIZE, "%llu\n", ctime); + return sysfs_emit(buf, "%llu\n", ctime); } static ssize_t @@ -603,7 +603,7 @@ nilfs_segctor_last_nongc_write_time_secs_show(struct nilfs_segctor_attr *attr, nongc_ctime = nilfs->ns_nongc_ctime; up_read(&nilfs->ns_segctor_sem); - return snprintf(buf, PAGE_SIZE, "%llu\n", nongc_ctime); + return sysfs_emit(buf, "%llu\n", nongc_ctime); } static ssize_t @@ -617,7 +617,7 @@ nilfs_segctor_dirty_data_blocks_count_show(struct nilfs_segctor_attr *attr, ndirtyblks = atomic_read(&nilfs->ns_ndirtyblks); up_read(&nilfs->ns_segctor_sem); - return snprintf(buf, PAGE_SIZE, "%u\n", ndirtyblks); + return sysfs_emit(buf, "%u\n", ndirtyblks); } static const char segctor_readme_str[] = @@ -654,7 +654,7 @@ static ssize_t nilfs_segctor_README_show(struct nilfs_segctor_attr *attr, struct the_nilfs *nilfs, char *buf) { - return snprintf(buf, PAGE_SIZE, segctor_readme_str); + return sysfs_emit(buf, segctor_readme_str); } NILFS_SEGCTOR_RO_ATTR(last_pseg_block); @@ -723,7 +723,7 @@ nilfs_superblock_sb_write_time_secs_show(struct nilfs_superblock_attr *attr, sbwtime = nilfs->ns_sbwtime; up_read(&nilfs->ns_sem); - return snprintf(buf, PAGE_SIZE, "%llu\n", sbwtime); + return sysfs_emit(buf, "%llu\n", sbwtime); } static ssize_t @@ -737,7 +737,7 @@ nilfs_superblock_sb_write_count_show(struct nilfs_superblock_attr *attr, sbwcount = nilfs->ns_sbwcount; up_read(&nilfs->ns_sem); - return snprintf(buf, PAGE_SIZE, "%u\n", sbwcount); + return sysfs_emit(buf, "%u\n", sbwcount); } static ssize_t @@ -751,7 +751,7 @@ nilfs_superblock_sb_update_frequency_show(struct nilfs_superblock_attr *attr, sb_update_freq = nilfs->ns_sb_update_freq; up_read(&nilfs->ns_sem); - return snprintf(buf, PAGE_SIZE, "%u\n", sb_update_freq); + return sysfs_emit(buf, "%u\n", sb_update_freq); } static ssize_t @@ -799,7 +799,7 @@ static ssize_t nilfs_superblock_README_show(struct nilfs_superblock_attr *attr, struct the_nilfs *nilfs, char *buf) { - return snprintf(buf, PAGE_SIZE, sb_readme_str); + return sysfs_emit(buf, sb_readme_str); } NILFS_SUPERBLOCK_RO_ATTR(sb_write_time); @@ -834,7 +834,7 @@ ssize_t nilfs_dev_revision_show(struct nilfs_dev_attr *attr, u32 major = le32_to_cpu(sbp[0]->s_rev_level); u16 minor = le16_to_cpu(sbp[0]->s_minor_rev_level); - return snprintf(buf, PAGE_SIZE, "%d.%d\n", major, minor); + return sysfs_emit(buf, "%d.%d\n", major, minor); } static @@ -842,7 +842,7 @@ ssize_t nilfs_dev_blocksize_show(struct nilfs_dev_attr *attr, struct the_nilfs *nilfs, char *buf) { - return snprintf(buf, PAGE_SIZE, "%u\n", nilfs->ns_blocksize); + return sysfs_emit(buf, "%u\n", nilfs->ns_blocksize); } static @@ -853,7 +853,7 @@ ssize_t nilfs_dev_device_size_show(struct nilfs_dev_attr *attr, struct nilfs_super_block **sbp = nilfs->ns_sbp; u64 dev_size = le64_to_cpu(sbp[0]->s_dev_size); - return snprintf(buf, PAGE_SIZE, "%llu\n", dev_size); + return sysfs_emit(buf, "%llu\n", dev_size); } static @@ -864,7 +864,7 @@ ssize_t nilfs_dev_free_blocks_show(struct nilfs_dev_attr *attr, sector_t free_blocks = 0; nilfs_count_free_blocks(nilfs, &free_blocks); - return snprintf(buf, PAGE_SIZE, "%llu\n", + return sysfs_emit(buf, "%llu\n", (unsigned long long)free_blocks); } @@ -875,7 +875,7 @@ ssize_t nilfs_dev_uuid_show(struct nilfs_dev_attr *attr, { struct nilfs_super_block **sbp = nilfs->ns_sbp; - return snprintf(buf, PAGE_SIZE, "%pUb\n", sbp[0]->s_uuid); + return sysfs_emit(buf, "%pUb\n", sbp[0]->s_uuid); } static @@ -903,7 +903,7 @@ static ssize_t nilfs_dev_README_show(struct nilfs_dev_attr *attr, struct the_nilfs *nilfs, char *buf) { - return snprintf(buf, PAGE_SIZE, dev_readme_str); + return sysfs_emit(buf, dev_readme_str); } NILFS_DEV_RO_ATTR(revision); @@ -1047,7 +1047,7 @@ void nilfs_sysfs_delete_device_group(struct the_nilfs *nilfs) static ssize_t nilfs_feature_revision_show(struct kobject *kobj, struct attribute *attr, char *buf) { - return snprintf(buf, PAGE_SIZE, "%d.%d\n", + return sysfs_emit(buf, "%d.%d\n", NILFS_CURRENT_REV, NILFS_MINOR_REV); } @@ -1060,7 +1060,7 @@ static ssize_t nilfs_feature_README_show(struct kobject *kobj, struct attribute *attr, char *buf) { - return snprintf(buf, PAGE_SIZE, features_readme_str); + return sysfs_emit(buf, features_readme_str); } NILFS_FEATURE_RO_ATTR(revision); diff --git a/fs/nilfs2/sysfs.h b/fs/nilfs2/sysfs.h index d001eb862dae..78a87a016928 100644 --- a/fs/nilfs2/sysfs.h +++ b/fs/nilfs2/sysfs.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0+ */ /* - * sysfs.h - sysfs support declarations. + * Sysfs support declarations. * * Copyright (C) 2005-2014 Nippon Telegraph and Telephone Corporation. * Copyright (C) 2014 HGST, Inc., a Western Digital Company. diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index c8bfc01da5d7..dd48a8f74d57 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0+ /* - * the_nilfs.c - the_nilfs shared structure. + * the_nilfs shared structure. * * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. * @@ -489,7 +489,7 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs, { struct nilfs_super_block **sbp = nilfs->ns_sbp; struct buffer_head **sbh = nilfs->ns_sbh; - u64 sb2off = NILFS_SB2_OFFSET_BYTES(nilfs->ns_bdev->bd_inode->i_size); + u64 sb2off = NILFS_SB2_OFFSET_BYTES(bdev_nr_bytes(nilfs->ns_bdev)); int valid[2], swp = 0; sbp[0] = nilfs_read_super_block(sb, NILFS_SB_OFFSET_BYTES, blocksize, diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index 987c8ab02aee..47c7dfbb7ea5 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0+ */ /* - * the_nilfs.h - the_nilfs shared structure. + * the_nilfs shared structure. * * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. * diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 057abd2cf887..b6091775aa6e 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -111,6 +111,16 @@ static bool fanotify_name_event_equal(struct fanotify_name_event *fne1, return fanotify_info_equal(info1, info2); } +static bool fanotify_error_event_equal(struct fanotify_error_event *fee1, + struct fanotify_error_event *fee2) +{ + /* Error events against the same file system are always merged. */ + if (!fanotify_fsid_equal(&fee1->fsid, &fee2->fsid)) + return false; + + return true; +} + static bool fanotify_should_merge(struct fanotify_event *old, struct fanotify_event *new) { @@ -141,6 +151,9 @@ static bool fanotify_should_merge(struct fanotify_event *old, case FANOTIFY_EVENT_TYPE_FID_NAME: return fanotify_name_event_equal(FANOTIFY_NE(old), FANOTIFY_NE(new)); + case FANOTIFY_EVENT_TYPE_FS_ERROR: + return fanotify_error_event_equal(FANOTIFY_EE(old), + FANOTIFY_EE(new)); default: WARN_ON_ONCE(1); } @@ -176,6 +189,10 @@ static int fanotify_merge(struct fsnotify_group *group, break; if (fanotify_should_merge(old, new)) { old->mask |= new->mask; + + if (fanotify_is_error_event(old->mask)) + FANOTIFY_EE(old)->err_count++; + return 1; } } @@ -343,13 +360,23 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group, static int fanotify_encode_fh_len(struct inode *inode) { int dwords = 0; + int fh_len; if (!inode) return 0; exportfs_encode_inode_fh(inode, NULL, &dwords, NULL); + fh_len = dwords << 2; + + /* + * struct fanotify_error_event might be preallocated and is + * limited to MAX_HANDLE_SZ. This should never happen, but + * safeguard by forcing an invalid file handle. + */ + if (WARN_ON_ONCE(fh_len > MAX_HANDLE_SZ)) + return 0; - return dwords << 2; + return fh_len; } /* @@ -370,8 +397,14 @@ static int fanotify_encode_fh(struct fanotify_fh *fh, struct inode *inode, fh->type = FILEID_ROOT; fh->len = 0; fh->flags = 0; + + /* + * Invalid FHs are used by FAN_FS_ERROR for errors not + * linked to any inode. The f_handle won't be reported + * back to userspace. + */ if (!inode) - return 0; + goto out; /* * !gpf means preallocated variable size fh, but fh_len could @@ -403,8 +436,13 @@ static int fanotify_encode_fh(struct fanotify_fh *fh, struct inode *inode, fh->type = type; fh->len = fh_len; - /* Mix fh into event merge key */ - *hash ^= fanotify_hash_fh(fh); +out: + /* + * Mix fh into event merge key. Hash might be NULL in case of + * unhashed FID events (i.e. FAN_FS_ERROR). + */ + if (hash) + *hash ^= fanotify_hash_fh(fh); return FANOTIFY_FH_HDR_LEN + fh_len; @@ -452,7 +490,7 @@ static struct inode *fanotify_dfid_inode(u32 event_mask, const void *data, if (event_mask & ALL_FSNOTIFY_DIRENT_EVENTS) return dir; - if (S_ISDIR(inode->i_mode)) + if (inode && S_ISDIR(inode->i_mode)) return inode; return dir; @@ -563,6 +601,44 @@ static struct fanotify_event *fanotify_alloc_name_event(struct inode *id, return &fne->fae; } +static struct fanotify_event *fanotify_alloc_error_event( + struct fsnotify_group *group, + __kernel_fsid_t *fsid, + const void *data, int data_type, + unsigned int *hash) +{ + struct fs_error_report *report = + fsnotify_data_error_report(data, data_type); + struct inode *inode; + struct fanotify_error_event *fee; + int fh_len; + + if (WARN_ON_ONCE(!report)) + return NULL; + + fee = mempool_alloc(&group->fanotify_data.error_events_pool, GFP_NOFS); + if (!fee) + return NULL; + + fee->fae.type = FANOTIFY_EVENT_TYPE_FS_ERROR; + fee->error = report->error; + fee->err_count = 1; + fee->fsid = *fsid; + + inode = report->inode; + fh_len = fanotify_encode_fh_len(inode); + + /* Bad fh_len. Fallback to using an invalid fh. Should never happen. */ + if (!fh_len && inode) + inode = NULL; + + fanotify_encode_fh(&fee->object_fh, inode, fh_len, NULL, 0); + + *hash ^= fanotify_hash_fsid(fsid); + + return &fee->fae; +} + static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, u32 mask, const void *data, int data_type, struct inode *dir, @@ -630,6 +706,9 @@ static struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, if (fanotify_is_perm_event(mask)) { event = fanotify_alloc_perm_event(path, gfp); + } else if (fanotify_is_error_event(mask)) { + event = fanotify_alloc_error_event(group, fsid, data, + data_type, &hash); } else if (name_event && (file_name || child)) { event = fanotify_alloc_name_event(id, fsid, file_name, child, &hash, gfp); @@ -702,6 +781,9 @@ static void fanotify_insert_event(struct fsnotify_group *group, assert_spin_locked(&group->notification_lock); + if (!fanotify_is_hashed_event(event->mask)) + return; + pr_debug("%s: group=%p event=%p bucket=%u\n", __func__, group, event, bucket); @@ -738,8 +820,9 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask, BUILD_BUG_ON(FAN_ONDIR != FS_ISDIR); BUILD_BUG_ON(FAN_OPEN_EXEC != FS_OPEN_EXEC); BUILD_BUG_ON(FAN_OPEN_EXEC_PERM != FS_OPEN_EXEC_PERM); + BUILD_BUG_ON(FAN_FS_ERROR != FS_ERROR); - BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 19); + BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 20); mask = fanotify_group_event_mask(group, iter_info, mask, data, data_type, dir); @@ -778,9 +861,8 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask, } fsn_event = &event->fse; - ret = fsnotify_add_event(group, fsn_event, fanotify_merge, - fanotify_is_hashed_event(mask) ? - fanotify_insert_event : NULL); + ret = fsnotify_insert_event(group, fsn_event, fanotify_merge, + fanotify_insert_event); if (ret) { /* Permission events shouldn't be merged */ BUG_ON(ret == 1 && mask & FANOTIFY_PERM_EVENTS); @@ -805,6 +887,9 @@ static void fanotify_free_group_priv(struct fsnotify_group *group) if (group->fanotify_data.ucounts) dec_ucount(group->fanotify_data.ucounts, UCOUNT_FANOTIFY_GROUPS); + + if (mempool_initialized(&group->fanotify_data.error_events_pool)) + mempool_exit(&group->fanotify_data.error_events_pool); } static void fanotify_free_path_event(struct fanotify_event *event) @@ -833,7 +918,16 @@ static void fanotify_free_name_event(struct fanotify_event *event) kfree(FANOTIFY_NE(event)); } -static void fanotify_free_event(struct fsnotify_event *fsn_event) +static void fanotify_free_error_event(struct fsnotify_group *group, + struct fanotify_event *event) +{ + struct fanotify_error_event *fee = FANOTIFY_EE(event); + + mempool_free(fee, &group->fanotify_data.error_events_pool); +} + +static void fanotify_free_event(struct fsnotify_group *group, + struct fsnotify_event *fsn_event) { struct fanotify_event *event; @@ -855,6 +949,9 @@ static void fanotify_free_event(struct fsnotify_event *fsn_event) case FANOTIFY_EVENT_TYPE_OVERFLOW: kfree(event); break; + case FANOTIFY_EVENT_TYPE_FS_ERROR: + fanotify_free_error_event(group, event); + break; default: WARN_ON_ONCE(1); } diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 4a5e555dc3d2..d25f500bf7e7 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -141,6 +141,7 @@ enum fanotify_event_type { FANOTIFY_EVENT_TYPE_PATH, FANOTIFY_EVENT_TYPE_PATH_PERM, FANOTIFY_EVENT_TYPE_OVERFLOW, /* struct fanotify_event */ + FANOTIFY_EVENT_TYPE_FS_ERROR, /* struct fanotify_error_event */ __FANOTIFY_EVENT_TYPE_NUM }; @@ -170,12 +171,18 @@ static inline void fanotify_init_event(struct fanotify_event *event, event->pid = NULL; } +#define FANOTIFY_INLINE_FH(name, size) \ +struct { \ + struct fanotify_fh (name); \ + /* Space for object_fh.buf[] - access with fanotify_fh_buf() */ \ + unsigned char _inline_fh_buf[(size)]; \ +} + struct fanotify_fid_event { struct fanotify_event fae; __kernel_fsid_t fsid; - struct fanotify_fh object_fh; - /* Reserve space in object_fh.buf[] - access with fanotify_fh_buf() */ - unsigned char _inline_fh_buf[FANOTIFY_INLINE_FH_LEN]; + + FANOTIFY_INLINE_FH(object_fh, FANOTIFY_INLINE_FH_LEN); }; static inline struct fanotify_fid_event * @@ -196,12 +203,30 @@ FANOTIFY_NE(struct fanotify_event *event) return container_of(event, struct fanotify_name_event, fae); } +struct fanotify_error_event { + struct fanotify_event fae; + s32 error; /* Error reported by the Filesystem. */ + u32 err_count; /* Suppressed errors count */ + + __kernel_fsid_t fsid; /* FSID this error refers to. */ + + FANOTIFY_INLINE_FH(object_fh, MAX_HANDLE_SZ); +}; + +static inline struct fanotify_error_event * +FANOTIFY_EE(struct fanotify_event *event) +{ + return container_of(event, struct fanotify_error_event, fae); +} + static inline __kernel_fsid_t *fanotify_event_fsid(struct fanotify_event *event) { if (event->type == FANOTIFY_EVENT_TYPE_FID) return &FANOTIFY_FE(event)->fsid; else if (event->type == FANOTIFY_EVENT_TYPE_FID_NAME) return &FANOTIFY_NE(event)->fsid; + else if (event->type == FANOTIFY_EVENT_TYPE_FS_ERROR) + return &FANOTIFY_EE(event)->fsid; else return NULL; } @@ -213,6 +238,8 @@ static inline struct fanotify_fh *fanotify_event_object_fh( return &FANOTIFY_FE(event)->object_fh; else if (event->type == FANOTIFY_EVENT_TYPE_FID_NAME) return fanotify_info_file_fh(&FANOTIFY_NE(event)->info); + else if (event->type == FANOTIFY_EVENT_TYPE_FS_ERROR) + return &FANOTIFY_EE(event)->object_fh; else return NULL; } @@ -244,6 +271,19 @@ static inline int fanotify_event_dir_fh_len(struct fanotify_event *event) return info ? fanotify_info_dir_fh_len(info) : 0; } +static inline bool fanotify_event_has_object_fh(struct fanotify_event *event) +{ + /* For error events, even zeroed fh are reported. */ + if (event->type == FANOTIFY_EVENT_TYPE_FS_ERROR) + return true; + return fanotify_event_object_fh_len(event) > 0; +} + +static inline bool fanotify_event_has_dir_fh(struct fanotify_event *event) +{ + return fanotify_event_dir_fh_len(event) > 0; +} + struct fanotify_path_event { struct fanotify_event fae; struct path path; @@ -287,6 +327,11 @@ static inline struct fanotify_event *FANOTIFY_E(struct fsnotify_event *fse) return container_of(fse, struct fanotify_event, fse); } +static inline bool fanotify_is_error_event(u32 mask) +{ + return mask & FAN_FS_ERROR; +} + static inline bool fanotify_event_has_path(struct fanotify_event *event) { return event->type == FANOTIFY_EVENT_TYPE_PATH || @@ -315,7 +360,8 @@ static inline struct path *fanotify_event_path(struct fanotify_event *event) */ static inline bool fanotify_is_hashed_event(u32 mask) { - return !fanotify_is_perm_event(mask) && !(mask & FS_Q_OVERFLOW); + return !(fanotify_is_perm_event(mask) || + fsnotify_is_overflow_event(mask)); } static inline unsigned int fanotify_event_hash_bucket( diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 6facdf476255..559bc1e9926d 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -30,6 +30,7 @@ #define FANOTIFY_DEFAULT_MAX_EVENTS 16384 #define FANOTIFY_OLD_DEFAULT_MAX_MARKS 8192 #define FANOTIFY_DEFAULT_MAX_GROUPS 128 +#define FANOTIFY_DEFAULT_FEE_POOL_SIZE 32 /* * Legacy fanotify marks limits (8192) is per group and we introduced a tunable @@ -114,6 +115,8 @@ struct kmem_cache *fanotify_perm_event_cachep __read_mostly; (sizeof(struct fanotify_event_info_fid) + sizeof(struct file_handle)) #define FANOTIFY_PIDFD_INFO_HDR_LEN \ sizeof(struct fanotify_event_info_pidfd) +#define FANOTIFY_ERROR_INFO_LEN \ + (sizeof(struct fanotify_event_info_error)) static int fanotify_fid_info_len(int fh_len, int name_len) { @@ -126,17 +129,26 @@ static int fanotify_fid_info_len(int fh_len, int name_len) FANOTIFY_EVENT_ALIGN); } -static int fanotify_event_info_len(unsigned int info_mode, - struct fanotify_event *event) +static size_t fanotify_event_len(unsigned int info_mode, + struct fanotify_event *event) { - struct fanotify_info *info = fanotify_event_info(event); - int dir_fh_len = fanotify_event_dir_fh_len(event); - int fh_len = fanotify_event_object_fh_len(event); - int info_len = 0; + size_t event_len = FAN_EVENT_METADATA_LEN; + struct fanotify_info *info; + int dir_fh_len; + int fh_len; int dot_len = 0; - if (dir_fh_len) { - info_len += fanotify_fid_info_len(dir_fh_len, info->name_len); + if (!info_mode) + return event_len; + + if (fanotify_is_error_event(event->mask)) + event_len += FANOTIFY_ERROR_INFO_LEN; + + info = fanotify_event_info(event); + + if (fanotify_event_has_dir_fh(event)) { + dir_fh_len = fanotify_event_dir_fh_len(event); + event_len += fanotify_fid_info_len(dir_fh_len, info->name_len); } else if ((info_mode & FAN_REPORT_NAME) && (event->mask & FAN_ONDIR)) { /* @@ -147,12 +159,14 @@ static int fanotify_event_info_len(unsigned int info_mode, } if (info_mode & FAN_REPORT_PIDFD) - info_len += FANOTIFY_PIDFD_INFO_HDR_LEN; + event_len += FANOTIFY_PIDFD_INFO_HDR_LEN; - if (fh_len) - info_len += fanotify_fid_info_len(fh_len, dot_len); + if (fanotify_event_has_object_fh(event)) { + fh_len = fanotify_event_object_fh_len(event); + event_len += fanotify_fid_info_len(fh_len, dot_len); + } - return info_len; + return event_len; } /* @@ -181,7 +195,7 @@ static void fanotify_unhash_event(struct fsnotify_group *group, static struct fanotify_event *get_one_event(struct fsnotify_group *group, size_t count) { - size_t event_size = FAN_EVENT_METADATA_LEN; + size_t event_size; struct fanotify_event *event = NULL; struct fsnotify_event *fsn_event; unsigned int info_mode = FAN_GROUP_FLAG(group, FANOTIFY_INFO_MODES); @@ -194,8 +208,7 @@ static struct fanotify_event *get_one_event(struct fsnotify_group *group, goto out; event = FANOTIFY_E(fsn_event); - if (info_mode) - event_size += fanotify_event_info_len(info_mode, event); + event_size = fanotify_event_len(info_mode, event); if (event_size > count) { event = ERR_PTR(-EINVAL); @@ -316,6 +329,28 @@ static int process_access_response(struct fsnotify_group *group, return -ENOENT; } +static size_t copy_error_info_to_user(struct fanotify_event *event, + char __user *buf, int count) +{ + struct fanotify_event_info_error info; + struct fanotify_error_event *fee = FANOTIFY_EE(event); + + info.hdr.info_type = FAN_EVENT_INFO_TYPE_ERROR; + info.hdr.pad = 0; + info.hdr.len = FANOTIFY_ERROR_INFO_LEN; + + if (WARN_ON(count < info.hdr.len)) + return -EFAULT; + + info.error = fee->error; + info.error_count = fee->err_count; + + if (copy_to_user(buf, &info, sizeof(info))) + return -EFAULT; + + return info.hdr.len; +} + static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh, int info_type, const char *name, size_t name_len, @@ -331,9 +366,6 @@ static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh, pr_debug("%s: fh_len=%zu name_len=%zu, info_len=%zu, count=%zu\n", __func__, fh_len, name_len, info_len, count); - if (!fh_len) - return 0; - if (WARN_ON_ONCE(len < sizeof(info) || len > count)) return -EFAULT; @@ -368,6 +400,11 @@ static int copy_fid_info_to_user(__kernel_fsid_t *fsid, struct fanotify_fh *fh, handle.handle_type = fh->type; handle.handle_bytes = fh_len; + + /* Mangle handle_type for bad file_handle */ + if (!fh_len) + handle.handle_type = FILEID_INVALID; + if (copy_to_user(buf, &handle, sizeof(handle))) return -EFAULT; @@ -444,7 +481,7 @@ static int copy_info_records_to_user(struct fanotify_event *event, /* * Event info records order is as follows: dir fid + name, child fid. */ - if (fanotify_event_dir_fh_len(event)) { + if (fanotify_event_has_dir_fh(event)) { info_type = info->name_len ? FAN_EVENT_INFO_TYPE_DFID_NAME : FAN_EVENT_INFO_TYPE_DFID; ret = copy_fid_info_to_user(fanotify_event_fsid(event), @@ -460,7 +497,7 @@ static int copy_info_records_to_user(struct fanotify_event *event, total_bytes += ret; } - if (fanotify_event_object_fh_len(event)) { + if (fanotify_event_has_object_fh(event)) { const char *dot = NULL; int dot_len = 0; @@ -520,6 +557,15 @@ static int copy_info_records_to_user(struct fanotify_event *event, total_bytes += ret; } + if (fanotify_is_error_event(event->mask)) { + ret = copy_error_info_to_user(event, buf, count); + if (ret < 0) + return ret; + buf += ret; + count -= ret; + total_bytes += ret; + } + return total_bytes; } @@ -537,8 +583,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, pr_debug("%s: group=%p event=%p\n", __func__, group, event); - metadata.event_len = FAN_EVENT_METADATA_LEN + - fanotify_event_info_len(info_mode, event); + metadata.event_len = fanotify_event_len(info_mode, event); metadata.metadata_len = FAN_EVENT_METADATA_LEN; metadata.vers = FANOTIFY_METADATA_VERSION; metadata.reserved = 0; @@ -1049,6 +1094,15 @@ out_dec_ucounts: return ERR_PTR(ret); } +static int fanotify_group_init_error_pool(struct fsnotify_group *group) +{ + if (mempool_initialized(&group->fanotify_data.error_events_pool)) + return 0; + + return mempool_init_kmalloc_pool(&group->fanotify_data.error_events_pool, + FANOTIFY_DEFAULT_FEE_POOL_SIZE, + sizeof(struct fanotify_error_event)); +} static int fanotify_add_mark(struct fsnotify_group *group, fsnotify_connp_t *connp, unsigned int type, @@ -1057,6 +1111,7 @@ static int fanotify_add_mark(struct fsnotify_group *group, { struct fsnotify_mark *fsn_mark; __u32 added; + int ret = 0; mutex_lock(&group->mark_mutex); fsn_mark = fsnotify_find_mark(connp, group); @@ -1067,13 +1122,26 @@ static int fanotify_add_mark(struct fsnotify_group *group, return PTR_ERR(fsn_mark); } } + + /* + * Error events are pre-allocated per group, only if strictly + * needed (i.e. FAN_FS_ERROR was requested). + */ + if (!(flags & FAN_MARK_IGNORED_MASK) && (mask & FAN_FS_ERROR)) { + ret = fanotify_group_init_error_pool(group); + if (ret) + goto out; + } + added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); if (added & ~fsnotify_conn_mask(fsn_mark->connector)) fsnotify_recalc_mask(fsn_mark->connector); + +out: mutex_unlock(&group->mark_mutex); fsnotify_put_mark(fsn_mark); - return 0; + return ret; } static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, @@ -1295,16 +1363,15 @@ out_destroy_group: return fd; } -/* Check if filesystem can encode a unique fid */ -static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid) +static int fanotify_test_fsid(struct dentry *dentry, __kernel_fsid_t *fsid) { __kernel_fsid_t root_fsid; int err; /* - * Make sure path is not in filesystem with zero fsid (e.g. tmpfs). + * Make sure dentry is not of a filesystem with zero fsid (e.g. fuse). */ - err = vfs_get_fsid(path->dentry, fsid); + err = vfs_get_fsid(dentry, fsid); if (err) return err; @@ -1312,10 +1379,10 @@ static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid) return -ENODEV; /* - * Make sure path is not inside a filesystem subvolume (e.g. btrfs) + * Make sure dentry is not of a filesystem subvolume (e.g. btrfs) * which uses a different fsid than sb root. */ - err = vfs_get_fsid(path->dentry->d_sb->s_root, &root_fsid); + err = vfs_get_fsid(dentry->d_sb->s_root, &root_fsid); if (err) return err; @@ -1323,6 +1390,12 @@ static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid) root_fsid.val[1] != fsid->val[1]) return -EXDEV; + return 0; +} + +/* Check if filesystem can encode a unique fid */ +static int fanotify_test_fid(struct dentry *dentry) +{ /* * We need to make sure that the file system supports at least * encoding a file handle so user can use name_to_handle_at() to @@ -1330,8 +1403,8 @@ static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid) * objects. However, name_to_handle_at() requires that the * filesystem also supports decoding file handles. */ - if (!path->dentry->d_sb->s_export_op || - !path->dentry->d_sb->s_export_op->fh_to_dentry) + if (!dentry->d_sb->s_export_op || + !dentry->d_sb->s_export_op->fh_to_dentry) return -EOPNOTSUPP; return 0; @@ -1447,15 +1520,19 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, group->priority == FS_PRIO_0) goto fput_and_out; + if (mask & FAN_FS_ERROR && + mark_type != FAN_MARK_FILESYSTEM) + goto fput_and_out; + /* - * Events with data type inode do not carry enough information to report - * event->fd, so we do not allow setting a mask for inode events unless - * group supports reporting fid. - * inode events are not supported on a mount mark, because they do not - * carry enough information (i.e. path) to be filtered by mount point. + * Events that do not carry enough information to report + * event->fd require a group that supports reporting fid. Those + * events are not supported on a mount mark, because they do not + * carry enough information (i.e. path) to be filtered by mount + * point. */ fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS); - if (mask & FANOTIFY_INODE_EVENTS && + if (mask & ~(FANOTIFY_FD_EVENTS|FANOTIFY_EVENT_FLAGS) && (!fid_mode || mark_type == FAN_MARK_MOUNT)) goto fput_and_out; @@ -1482,7 +1559,11 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, } if (fid_mode) { - ret = fanotify_test_fid(&path, &__fsid); + ret = fanotify_test_fsid(path.dentry, &__fsid); + if (ret) + goto path_put_and_out; + + ret = fanotify_test_fid(path.dentry); if (ret) goto path_put_and_out; diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 963e6ce75b96..4034ca566f95 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -252,6 +252,9 @@ static int fsnotify_handle_inode_event(struct fsnotify_group *group, if (WARN_ON_ONCE(!ops->handle_inode_event)) return 0; + if (WARN_ON_ONCE(!inode && !dir)) + return 0; + if ((inode_mark->mask & FS_EXCL_UNLINK) && path && d_unlinked(path->dentry)) return 0; @@ -455,16 +458,16 @@ static void fsnotify_iter_next(struct fsnotify_iter_info *iter_info) * @file_name is relative to * @file_name: optional file name associated with event * @inode: optional inode associated with event - - * either @dir or @inode must be non-NULL. - * if both are non-NULL event may be reported to both. + * If @dir and @inode are both non-NULL, event may be + * reported to both. * @cookie: inotify rename cookie */ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *file_name, struct inode *inode, u32 cookie) { const struct path *path = fsnotify_data_path(data, data_type); + struct super_block *sb = fsnotify_data_sb(data, data_type); struct fsnotify_iter_info iter_info = {}; - struct super_block *sb; struct mount *mnt = NULL; struct inode *parent = NULL; int ret = 0; @@ -483,7 +486,6 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, */ parent = dir; } - sb = inode->i_sb; /* * Optimization: srcu_read_lock() has a memory barrier which can diff --git a/fs/notify/group.c b/fs/notify/group.c index fb89c351295d..6a297efc4788 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -88,7 +88,7 @@ void fsnotify_destroy_group(struct fsnotify_group *group) * that deliberately ignores overflow events. */ if (group->overflow_event) - group->ops->free_event(group->overflow_event); + group->ops->free_event(group, group->overflow_event); fsnotify_put_group(group); } diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index d1a64daa0171..d92d7b0adc9a 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -116,7 +116,7 @@ int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, u32 mask, if (len) strcpy(event->name, name->name); - ret = fsnotify_add_event(group, fsn_event, inotify_merge, NULL); + ret = fsnotify_add_event(group, fsn_event, inotify_merge); if (ret) { /* Our event wasn't used in the end. Free it. */ fsnotify_destroy_event(group, fsn_event); @@ -177,7 +177,8 @@ static void inotify_free_group_priv(struct fsnotify_group *group) dec_inotify_instances(group->inotify_data.ucounts); } -static void inotify_free_event(struct fsnotify_event *fsn_event) +static void inotify_free_event(struct fsnotify_group *group, + struct fsnotify_event *fsn_event) { kfree(INOTIFY_E(fsn_event)); } diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 62051247f6d2..29fca3284bb5 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -94,10 +94,10 @@ static inline __u32 inotify_arg_to_mask(struct inode *inode, u32 arg) __u32 mask; /* - * Everything should accept their own ignored and should receive events - * when the inode is unmounted. All directories care about children. + * Everything should receive events when the inode is unmounted. + * All directories care about children. */ - mask = (FS_IN_IGNORED | FS_UNMOUNT); + mask = (FS_UNMOUNT); if (S_ISDIR(inode->i_mode)) mask |= FS_EVENT_ON_CHILD; diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 32f45543b9c6..9022ae650cf8 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -64,7 +64,7 @@ void fsnotify_destroy_event(struct fsnotify_group *group, WARN_ON(!list_empty(&event->list)); spin_unlock(&group->notification_lock); } - group->ops->free_event(event); + group->ops->free_event(group, event); } /* @@ -78,12 +78,12 @@ void fsnotify_destroy_event(struct fsnotify_group *group, * 2 if the event was not queued - either the queue of events has overflown * or the group is shutting down. */ -int fsnotify_add_event(struct fsnotify_group *group, - struct fsnotify_event *event, - int (*merge)(struct fsnotify_group *, - struct fsnotify_event *), - void (*insert)(struct fsnotify_group *, - struct fsnotify_event *)) +int fsnotify_insert_event(struct fsnotify_group *group, + struct fsnotify_event *event, + int (*merge)(struct fsnotify_group *, + struct fsnotify_event *), + void (*insert)(struct fsnotify_group *, + struct fsnotify_event *)) { int ret = 0; struct list_head *list = &group->notification_list; diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index ab4f3362466d..2ae25e48a41a 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -5,6 +5,7 @@ * Copyright (c) 2001-2015 Anton Altaparmakov and Tuxera Inc. */ +#include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/buffer_head.h> #include <linux/gfp.h> @@ -1829,7 +1830,7 @@ again: * pages being swapped out between us bringing them into memory * and doing the actual copying. */ - if (unlikely(iov_iter_fault_in_readable(i, bytes))) { + if (unlikely(fault_in_iov_iter_readable(i, bytes))) { status = -EFAULT; break; } diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 0d7e948cb29c..5ae8de09b271 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -2772,13 +2772,12 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) ntfs_debug("Set device block size to %i bytes (block size bits %i).", blocksize, sb->s_blocksize_bits); /* Determine the size of the device in units of block_size bytes. */ - if (!i_size_read(sb->s_bdev->bd_inode)) { + vol->nr_blocks = sb_bdev_nr_blocks(sb); + if (!vol->nr_blocks) { if (!silent) ntfs_error(sb, "Unable to determine device size."); goto err_out_now; } - vol->nr_blocks = i_size_read(sb->s_bdev->bd_inode) >> - sb->s_blocksize_bits; /* Read the boot sector and return unlocked buffer head to it. */ if (!(bh = read_ntfs_boot_sector(sb, silent))) { if (!silent) @@ -2816,8 +2815,7 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) goto err_out_now; } BUG_ON(blocksize != sb->s_blocksize); - vol->nr_blocks = i_size_read(sb->s_bdev->bd_inode) >> - sb->s_blocksize_bits; + vol->nr_blocks = sb_bdev_nr_blocks(sb); ntfs_debug("Changed device block size to %i bytes (block size " "bits %i) to match volume sector size.", blocksize, sb->s_blocksize_bits); diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c index 34c4cbf7e29b..e8c00dda42ad 100644 --- a/fs/ntfs3/attrib.c +++ b/fs/ntfs3/attrib.c @@ -6,13 +6,9 @@ * TODO: Merge attr_set_size/attr_data_get_block/attr_allocate_frame? */ -#include <linux/blkdev.h> -#include <linux/buffer_head.h> #include <linux/fs.h> -#include <linux/hash.h> -#include <linux/nls.h> -#include <linux/ratelimit.h> #include <linux/slab.h> +#include <linux/kernel.h> #include "debug.h" #include "ntfs.h" @@ -291,7 +287,7 @@ int attr_make_nonresident(struct ntfs_inode *ni, struct ATTRIB *attr, if (!rsize) { /* Empty resident -> Non empty nonresident. */ } else if (!is_data) { - err = ntfs_sb_write_run(sbi, run, 0, data, rsize); + err = ntfs_sb_write_run(sbi, run, 0, data, rsize, 0); if (err) goto out2; } else if (!page) { @@ -451,11 +447,8 @@ again: again_1: align = sbi->cluster_size; - if (is_ext) { + if (is_ext) align <<= attr_b->nres.c_unit; - if (is_attr_sparsed(attr_b)) - keep_prealloc = false; - } old_valid = le64_to_cpu(attr_b->nres.valid_size); old_size = le64_to_cpu(attr_b->nres.data_size); @@ -465,9 +458,6 @@ again_1: new_alloc = (new_size + align - 1) & ~(u64)(align - 1); new_alen = new_alloc >> cluster_bits; - if (keep_prealloc && is_ext) - keep_prealloc = false; - if (keep_prealloc && new_size < old_size) { attr_b->nres.data_size = cpu_to_le64(new_size); mi_b->dirty = true; @@ -529,7 +519,7 @@ add_alloc_in_same_attr_seg: } else if (pre_alloc == -1) { pre_alloc = 0; if (type == ATTR_DATA && !name_len && - sbi->options.prealloc) { + sbi->options->prealloc) { CLST new_alen2 = bytes_to_cluster( sbi, get_pre_allocated(new_size)); pre_alloc = new_alen2 - new_alen; @@ -1966,7 +1956,7 @@ int attr_punch_hole(struct ntfs_inode *ni, u64 vbo, u64 bytes, u32 *frame_size) return 0; from = vbo; - to = (vbo + bytes) < data_size ? (vbo + bytes) : data_size; + to = min_t(u64, vbo + bytes, data_size); memset(Add2Ptr(resident_data(attr_b), from), 0, to - from); return 0; } diff --git a/fs/ntfs3/attrlist.c b/fs/ntfs3/attrlist.c index fa32399eb517..bad6d8a849a2 100644 --- a/fs/ntfs3/attrlist.c +++ b/fs/ntfs3/attrlist.c @@ -5,10 +5,7 @@ * */ -#include <linux/blkdev.h> -#include <linux/buffer_head.h> #include <linux/fs.h> -#include <linux/nls.h> #include "debug.h" #include "ntfs.h" @@ -336,7 +333,7 @@ int al_add_le(struct ntfs_inode *ni, enum ATTR_TYPE type, const __le16 *name, if (attr && attr->non_res) { err = ntfs_sb_write_run(ni->mi.sbi, &al->run, 0, al->le, - al->size); + al->size, 0); if (err) return err; al->dirty = false; @@ -423,7 +420,7 @@ next: return true; } -int al_update(struct ntfs_inode *ni) +int al_update(struct ntfs_inode *ni, int sync) { int err; struct ATTRIB *attr; @@ -445,7 +442,7 @@ int al_update(struct ntfs_inode *ni) memcpy(resident_data(attr), al->le, al->size); } else { err = ntfs_sb_write_run(ni->mi.sbi, &al->run, 0, al->le, - al->size); + al->size, sync); if (err) goto out; diff --git a/fs/ntfs3/bitfunc.c b/fs/ntfs3/bitfunc.c index ce304d40b5e1..50d838093790 100644 --- a/fs/ntfs3/bitfunc.c +++ b/fs/ntfs3/bitfunc.c @@ -5,13 +5,8 @@ * */ -#include <linux/blkdev.h> -#include <linux/buffer_head.h> -#include <linux/fs.h> -#include <linux/nls.h> +#include <linux/types.h> -#include "debug.h" -#include "ntfs.h" #include "ntfs_fs.h" #define BITS_IN_SIZE_T (sizeof(size_t) * 8) @@ -124,8 +119,7 @@ bool are_bits_set(const ulong *lmap, size_t bit, size_t nbits) pos = nbits & 7; if (pos) { - u8 mask = fill_mask[pos]; - + mask = fill_mask[pos]; if ((*map & mask) != mask) return false; } diff --git a/fs/ntfs3/bitmap.c b/fs/ntfs3/bitmap.c index 831501555009..aa184407520f 100644 --- a/fs/ntfs3/bitmap.c +++ b/fs/ntfs3/bitmap.c @@ -10,12 +10,10 @@ * */ -#include <linux/blkdev.h> #include <linux/buffer_head.h> #include <linux/fs.h> -#include <linux/nls.h> +#include <linux/kernel.h> -#include "debug.h" #include "ntfs.h" #include "ntfs_fs.h" @@ -435,7 +433,7 @@ static void wnd_remove_free_ext(struct wnd_bitmap *wnd, size_t bit, size_t len) ; } else { n3 = rb_next(&e->count.node); - max_new_len = len > new_len ? len : new_len; + max_new_len = max(len, new_len); if (!n3) { wnd->extent_max = max_new_len; } else { @@ -731,7 +729,7 @@ int wnd_set_free(struct wnd_bitmap *wnd, size_t bit, size_t bits) wbits = wnd->bits_last; tail = wbits - wbit; - op = tail < bits ? tail : bits; + op = min_t(u32, tail, bits); bh = wnd_map(wnd, iw); if (IS_ERR(bh)) { @@ -784,7 +782,7 @@ int wnd_set_used(struct wnd_bitmap *wnd, size_t bit, size_t bits) wbits = wnd->bits_last; tail = wbits - wbit; - op = tail < bits ? tail : bits; + op = min_t(u32, tail, bits); bh = wnd_map(wnd, iw); if (IS_ERR(bh)) { @@ -834,7 +832,7 @@ static bool wnd_is_free_hlp(struct wnd_bitmap *wnd, size_t bit, size_t bits) wbits = wnd->bits_last; tail = wbits - wbit; - op = tail < bits ? tail : bits; + op = min_t(u32, tail, bits); if (wbits != wnd->free_bits[iw]) { bool ret; @@ -926,7 +924,7 @@ use_wnd: wbits = wnd->bits_last; tail = wbits - wbit; - op = tail < bits ? tail : bits; + op = min_t(u32, tail, bits); if (wnd->free_bits[iw]) { bool ret; diff --git a/fs/ntfs3/debug.h b/fs/ntfs3/debug.h index 31120569a87b..53ef7489c75f 100644 --- a/fs/ntfs3/debug.h +++ b/fs/ntfs3/debug.h @@ -11,6 +11,9 @@ #ifndef _LINUX_NTFS3_DEBUG_H #define _LINUX_NTFS3_DEBUG_H +struct super_block; +struct inode; + #ifndef Add2Ptr #define Add2Ptr(P, I) ((void *)((u8 *)(P) + (I))) #define PtrOffset(B, O) ((size_t)((size_t)(O) - (size_t)(B))) diff --git a/fs/ntfs3/dir.c b/fs/ntfs3/dir.c index 93f6d485564e..fb438d604040 100644 --- a/fs/ntfs3/dir.c +++ b/fs/ntfs3/dir.c @@ -7,10 +7,7 @@ * */ -#include <linux/blkdev.h> -#include <linux/buffer_head.h> #include <linux/fs.h> -#include <linux/iversion.h> #include <linux/nls.h> #include "debug.h" @@ -18,30 +15,27 @@ #include "ntfs_fs.h" /* Convert little endian UTF-16 to NLS string. */ -int ntfs_utf16_to_nls(struct ntfs_sb_info *sbi, const struct le_str *uni, +int ntfs_utf16_to_nls(struct ntfs_sb_info *sbi, const __le16 *name, u32 len, u8 *buf, int buf_len) { - int ret, uni_len, warn; - const __le16 *ip; + int ret, warn; u8 *op; - struct nls_table *nls = sbi->options.nls; + struct nls_table *nls = sbi->options->nls; static_assert(sizeof(wchar_t) == sizeof(__le16)); if (!nls) { /* UTF-16 -> UTF-8 */ - ret = utf16s_to_utf8s((wchar_t *)uni->name, uni->len, - UTF16_LITTLE_ENDIAN, buf, buf_len); + ret = utf16s_to_utf8s(name, len, UTF16_LITTLE_ENDIAN, buf, + buf_len); buf[ret] = '\0'; return ret; } - ip = uni->name; op = buf; - uni_len = uni->len; warn = 0; - while (uni_len--) { + while (len--) { u16 ec; int charlen; char dump[5]; @@ -52,7 +46,7 @@ int ntfs_utf16_to_nls(struct ntfs_sb_info *sbi, const struct le_str *uni, break; } - ec = le16_to_cpu(*ip++); + ec = le16_to_cpu(*name++); charlen = nls->uni2char(ec, op, buf_len); if (charlen > 0) { @@ -186,7 +180,7 @@ int ntfs_nls_to_utf16(struct ntfs_sb_info *sbi, const u8 *name, u32 name_len, { int ret, slen; const u8 *end; - struct nls_table *nls = sbi->options.nls; + struct nls_table *nls = sbi->options->nls; u16 *uname = uni->name; static_assert(sizeof(wchar_t) == sizeof(u16)); @@ -301,14 +295,14 @@ static inline int ntfs_filldir(struct ntfs_sb_info *sbi, struct ntfs_inode *ni, return 0; /* Skip meta files. Unless option to show metafiles is set. */ - if (!sbi->options.showmeta && ntfs_is_meta_file(sbi, ino)) + if (!sbi->options->showmeta && ntfs_is_meta_file(sbi, ino)) return 0; - if (sbi->options.nohidden && (fname->dup.fa & FILE_ATTRIBUTE_HIDDEN)) + if (sbi->options->nohidden && (fname->dup.fa & FILE_ATTRIBUTE_HIDDEN)) return 0; - name_len = ntfs_utf16_to_nls(sbi, (struct le_str *)&fname->name_len, - name, PATH_MAX); + name_len = ntfs_utf16_to_nls(sbi, fname->name, fname->name_len, name, + PATH_MAX); if (name_len <= 0) { ntfs_warn(sbi->sb, "failed to convert name for inode %lx.", ino); diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 424450e77ad5..787b53b984ee 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -8,11 +8,11 @@ */ #include <linux/backing-dev.h> +#include <linux/blkdev.h> #include <linux/buffer_head.h> #include <linux/compat.h> #include <linux/falloc.h> #include <linux/fiemap.h> -#include <linux/nls.h> #include "debug.h" #include "ntfs.h" @@ -588,8 +588,11 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len) truncate_pagecache(inode, vbo_down); if (!is_sparsed(ni) && !is_compressed(ni)) { - /* Normal file. */ - err = ntfs_zero_range(inode, vbo, end); + /* + * Normal file, can't make hole. + * TODO: Try to find way to save info about hole. + */ + err = -EOPNOTSUPP; goto out; } @@ -737,7 +740,7 @@ int ntfs3_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, umode_t mode = inode->i_mode; int err; - if (sbi->options.no_acs_rules) { + if (sbi->options->noacsrules) { /* "No access rules" - Force any changes of time etc. */ attr->ia_valid |= ATTR_FORCE; /* and disable for editing some attributes. */ @@ -987,7 +990,7 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) frame_vbo = pos & ~(frame_size - 1); index = frame_vbo >> PAGE_SHIFT; - if (unlikely(iov_iter_fault_in_readable(from, bytes))) { + if (unlikely(fault_in_iov_iter_readable(from, bytes))) { err = -EFAULT; goto out; } @@ -1185,7 +1188,7 @@ static int ntfs_file_release(struct inode *inode, struct file *file) int err = 0; /* If we are last writer on the inode, drop the block reservation. */ - if (sbi->options.prealloc && ((file->f_mode & FMODE_WRITE) && + if (sbi->options->prealloc && ((file->f_mode & FMODE_WRITE) && atomic_read(&inode->i_writecount) == 1)) { ni_lock(ni); down_write(&ni->file.run_lock); diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c index 938b12d56ca6..6f47a9c17f89 100644 --- a/fs/ntfs3/frecord.c +++ b/fs/ntfs3/frecord.c @@ -5,11 +5,8 @@ * */ -#include <linux/blkdev.h> -#include <linux/buffer_head.h> #include <linux/fiemap.h> #include <linux/fs.h> -#include <linux/nls.h> #include <linux/vmalloc.h> #include "debug.h" @@ -708,18 +705,35 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni) continue; mi = ni_find_mi(ni, ino_get(&le->ref)); + if (!mi) { + /* Should never happened, 'cause already checked. */ + goto bad; + } attr = mi_find_attr(mi, NULL, le->type, le_name(le), le->name_len, &le->id); + if (!attr) { + /* Should never happened, 'cause already checked. */ + goto bad; + } asize = le32_to_cpu(attr->size); /* Insert into primary record. */ attr_ins = mi_insert_attr(&ni->mi, le->type, le_name(le), le->name_len, asize, le16_to_cpu(attr->name_off)); - id = attr_ins->id; + if (!attr_ins) { + /* + * Internal error. + * Either no space in primary record (already checked). + * Either tried to insert another + * non indexed attribute (logic error). + */ + goto bad; + } /* Copy all except id. */ + id = attr_ins->id; memcpy(attr_ins, attr, asize); attr_ins->id = id; @@ -735,6 +749,10 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni) ni->attr_list.dirty = false; return 0; +bad: + ntfs_inode_err(&ni->vfs_inode, "Internal error"); + make_bad_inode(&ni->vfs_inode); + return -EINVAL; } /* @@ -956,6 +974,13 @@ static int ni_ins_attr_ext(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le, continue; } + /* + * Do not try to insert this attribute + * if there is no room in record. + */ + if (le32_to_cpu(mi->mrec->used) + asize > sbi->record_size) + continue; + /* Try to insert attribute into this subrecord. */ attr = ni_ins_new_attr(ni, mi, le, type, name, name_len, asize, name_off, svcn, ins_le); @@ -1451,7 +1476,7 @@ int ni_insert_resident(struct ntfs_inode *ni, u32 data_size, attr->res.flags = RESIDENT_FLAG_INDEXED; /* is_attr_indexed(attr)) == true */ - le16_add_cpu(&ni->mi.mrec->hard_links, +1); + le16_add_cpu(&ni->mi.mrec->hard_links, 1); ni->mi.dirty = true; } attr->res.res = 0; @@ -1606,7 +1631,7 @@ struct ATTR_FILE_NAME *ni_fname_type(struct ntfs_inode *ni, u8 name_type, *le = NULL; - if (FILE_NAME_POSIX == name_type) + if (name_type == FILE_NAME_POSIX) return NULL; /* Enumerate all names. */ @@ -1706,18 +1731,16 @@ out: /* * ni_parse_reparse * - * Buffer is at least 24 bytes. + * buffer - memory for reparse buffer header */ enum REPARSE_SIGN ni_parse_reparse(struct ntfs_inode *ni, struct ATTRIB *attr, - void *buffer) + struct REPARSE_DATA_BUFFER *buffer) { const struct REPARSE_DATA_BUFFER *rp = NULL; u8 bits; u16 len; typeof(rp->CompressReparseBuffer) *cmpr; - static_assert(sizeof(struct REPARSE_DATA_BUFFER) <= 24); - /* Try to estimate reparse point. */ if (!attr->non_res) { rp = resident_data_ex(attr, sizeof(struct REPARSE_DATA_BUFFER)); @@ -1803,6 +1826,9 @@ enum REPARSE_SIGN ni_parse_reparse(struct ntfs_inode *ni, struct ATTRIB *attr, return REPARSE_NONE; } + if (buffer != rp) + memcpy(buffer, rp, sizeof(struct REPARSE_DATA_BUFFER)); + /* Looks like normal symlink. */ return REPARSE_LINK; } @@ -2906,9 +2932,8 @@ bool ni_remove_name_undo(struct ntfs_inode *dir_ni, struct ntfs_inode *ni, memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), de + 1, de_key_size); mi_get_ref(&ni->mi, &de->ref); - if (indx_insert_entry(&dir_ni->dir, dir_ni, de, sbi, NULL, 1)) { + if (indx_insert_entry(&dir_ni->dir, dir_ni, de, sbi, NULL, 1)) return false; - } } return true; @@ -3077,7 +3102,9 @@ static bool ni_update_parent(struct ntfs_inode *ni, struct NTFS_DUP_INFO *dup, const struct EA_INFO *info; info = resident_data_ex(attr, sizeof(struct EA_INFO)); - dup->ea_size = info->size_pack; + /* If ATTR_EA_INFO exists 'info' can't be NULL. */ + if (info) + dup->ea_size = info->size_pack; } } @@ -3205,7 +3232,7 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint) goto out; } - err = al_update(ni); + err = al_update(ni, sync); if (err) goto out; } diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c index b5853aed0e25..06492f088d60 100644 --- a/fs/ntfs3/fslog.c +++ b/fs/ntfs3/fslog.c @@ -6,12 +6,8 @@ */ #include <linux/blkdev.h> -#include <linux/buffer_head.h> #include <linux/fs.h> -#include <linux/hash.h> -#include <linux/nls.h> #include <linux/random.h> -#include <linux/ratelimit.h> #include <linux/slab.h> #include "debug.h" @@ -2219,7 +2215,7 @@ file_is_valid: err = ntfs_sb_write_run(log->ni->mi.sbi, &log->ni->file.run, off, page, - log->page_size); + log->page_size, 0); if (err) goto out; @@ -3710,7 +3706,7 @@ move_data: if (a_dirty) { attr = oa->attr; - err = ntfs_sb_write_run(sbi, oa->run1, vbo, buffer_le, bytes); + err = ntfs_sb_write_run(sbi, oa->run1, vbo, buffer_le, bytes, 0); if (err) goto out; } @@ -5152,10 +5148,10 @@ end_reply: ntfs_fix_pre_write(&rh->rhdr, log->page_size); - err = ntfs_sb_write_run(sbi, &ni->file.run, 0, rh, log->page_size); + err = ntfs_sb_write_run(sbi, &ni->file.run, 0, rh, log->page_size, 0); if (!err) err = ntfs_sb_write_run(sbi, &log->ni->file.run, log->page_size, - rh, log->page_size); + rh, log->page_size, 0); kfree(rh); if (err) diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c index 91e3743e1442..4de9acb16968 100644 --- a/fs/ntfs3/fsntfs.c +++ b/fs/ntfs3/fsntfs.c @@ -8,7 +8,7 @@ #include <linux/blkdev.h> #include <linux/buffer_head.h> #include <linux/fs.h> -#include <linux/nls.h> +#include <linux/kernel.h> #include "debug.h" #include "ntfs.h" @@ -358,7 +358,7 @@ int ntfs_look_for_free_space(struct ntfs_sb_info *sbi, CLST lcn, CLST len, enum ALLOCATE_OPT opt) { int err; - CLST alen = 0; + CLST alen; struct super_block *sb = sbi->sb; size_t alcn, zlen, zeroes, zlcn, zlen2, ztrim, new_zlen; struct wnd_bitmap *wnd = &sbi->used.bitmap; @@ -370,27 +370,28 @@ int ntfs_look_for_free_space(struct ntfs_sb_info *sbi, CLST lcn, CLST len, if (!zlen) { err = ntfs_refresh_zone(sbi); if (err) - goto out; + goto up_write; + zlen = wnd_zone_len(wnd); } if (!zlen) { ntfs_err(sbi->sb, "no free space to extend mft"); - goto out; + err = -ENOSPC; + goto up_write; } lcn = wnd_zone_bit(wnd); - alen = zlen > len ? len : zlen; + alen = min_t(CLST, len, zlen); wnd_zone_set(wnd, lcn + alen, zlen - alen); err = wnd_set_used(wnd, lcn, alen); - if (err) { - up_write(&wnd->rw_lock); - return err; - } + if (err) + goto up_write; + alcn = lcn; - goto out; + goto space_found; } /* * 'Cause cluster 0 is always used this value means that we should use @@ -404,49 +405,45 @@ int ntfs_look_for_free_space(struct ntfs_sb_info *sbi, CLST lcn, CLST len, alen = wnd_find(wnd, len, lcn, BITMAP_FIND_MARK_AS_USED, &alcn); if (alen) - goto out; + goto space_found; /* Try to use clusters from MftZone. */ zlen = wnd_zone_len(wnd); zeroes = wnd_zeroes(wnd); /* Check too big request */ - if (len > zeroes + zlen || zlen <= NTFS_MIN_MFT_ZONE) - goto out; + if (len > zeroes + zlen || zlen <= NTFS_MIN_MFT_ZONE) { + err = -ENOSPC; + goto up_write; + } /* How many clusters to cat from zone. */ zlcn = wnd_zone_bit(wnd); zlen2 = zlen >> 1; - ztrim = len > zlen ? zlen : (len > zlen2 ? len : zlen2); - new_zlen = zlen - ztrim; - - if (new_zlen < NTFS_MIN_MFT_ZONE) { - new_zlen = NTFS_MIN_MFT_ZONE; - if (new_zlen > zlen) - new_zlen = zlen; - } + ztrim = clamp_val(len, zlen2, zlen); + new_zlen = max_t(size_t, zlen - ztrim, NTFS_MIN_MFT_ZONE); wnd_zone_set(wnd, zlcn, new_zlen); /* Allocate continues clusters. */ alen = wnd_find(wnd, len, 0, BITMAP_FIND_MARK_AS_USED | BITMAP_FIND_FULL, &alcn); - -out: - if (alen) { - err = 0; - *new_len = alen; - *new_lcn = alcn; - - ntfs_unmap_meta(sb, alcn, alen); - - /* Set hint for next requests. */ - if (!(opt & ALLOCATE_MFT)) - sbi->used.next_free_lcn = alcn + alen; - } else { + if (!alen) { err = -ENOSPC; + goto up_write; } +space_found: + err = 0; + *new_len = alen; + *new_lcn = alcn; + + ntfs_unmap_meta(sb, alcn, alen); + + /* Set hint for next requests. */ + if (!(opt & ALLOCATE_MFT)) + sbi->used.next_free_lcn = alcn + alen; +up_write: up_write(&wnd->rw_lock); return err; } @@ -1080,7 +1077,7 @@ int ntfs_sb_write(struct super_block *sb, u64 lbo, size_t bytes, } int ntfs_sb_write_run(struct ntfs_sb_info *sbi, const struct runs_tree *run, - u64 vbo, const void *buf, size_t bytes) + u64 vbo, const void *buf, size_t bytes, int sync) { struct super_block *sb = sbi->sb; u8 cluster_bits = sbi->cluster_bits; @@ -1099,8 +1096,8 @@ int ntfs_sb_write_run(struct ntfs_sb_info *sbi, const struct runs_tree *run, len = ((u64)clen << cluster_bits) - off; for (;;) { - u32 op = len < bytes ? len : bytes; - int err = ntfs_sb_write(sb, lbo, op, buf, 0); + u32 op = min_t(u64, len, bytes); + int err = ntfs_sb_write(sb, lbo, op, buf, sync); if (err) return err; @@ -1300,7 +1297,7 @@ int ntfs_get_bh(struct ntfs_sb_info *sbi, const struct runs_tree *run, u64 vbo, nb->off = off = lbo & (blocksize - 1); for (;;) { - u32 len32 = len < bytes ? len : bytes; + u32 len32 = min_t(u64, len, bytes); sector_t block = lbo >> sb->s_blocksize_bits; do { @@ -2175,7 +2172,7 @@ int ntfs_insert_security(struct ntfs_sb_info *sbi, /* Write main SDS bucket. */ err = ntfs_sb_write_run(sbi, &ni->file.run, sbi->security.next_off, - d_security, aligned_sec_size); + d_security, aligned_sec_size, 0); if (err) goto out; @@ -2193,7 +2190,7 @@ int ntfs_insert_security(struct ntfs_sb_info *sbi, /* Write copy SDS bucket. */ err = ntfs_sb_write_run(sbi, &ni->file.run, mirr_off, d_security, - aligned_sec_size); + aligned_sec_size, 0); if (err) goto out; diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c index 0daca9adc54c..6f81e3a49abf 100644 --- a/fs/ntfs3/index.c +++ b/fs/ntfs3/index.c @@ -8,7 +8,7 @@ #include <linux/blkdev.h> #include <linux/buffer_head.h> #include <linux/fs.h> -#include <linux/nls.h> +#include <linux/kernel.h> #include "debug.h" #include "ntfs.h" @@ -671,138 +671,74 @@ static struct NTFS_DE *hdr_find_e(const struct ntfs_index *indx, const struct INDEX_HDR *hdr, const void *key, size_t key_len, const void *ctx, int *diff) { - struct NTFS_DE *e; + struct NTFS_DE *e, *found = NULL; NTFS_CMP_FUNC cmp = indx->cmp; + int min_idx = 0, mid_idx, max_idx = 0; + int diff2; + int table_size = 8; u32 e_size, e_key_len; u32 end = le32_to_cpu(hdr->used); u32 off = le32_to_cpu(hdr->de_off); + u16 offs[128]; -#ifdef NTFS3_INDEX_BINARY_SEARCH - int max_idx = 0, fnd, min_idx; - int nslots = 64; - u16 *offs; - - if (end > 0x10000) - goto next; - - offs = kmalloc(sizeof(u16) * nslots, GFP_NOFS); - if (!offs) - goto next; +fill_table: + if (off + sizeof(struct NTFS_DE) > end) + return NULL; - /* Use binary search algorithm. */ -next1: - if (off + sizeof(struct NTFS_DE) > end) { - e = NULL; - goto out1; - } e = Add2Ptr(hdr, off); e_size = le16_to_cpu(e->size); - if (e_size < sizeof(struct NTFS_DE) || off + e_size > end) { - e = NULL; - goto out1; - } - - if (max_idx >= nslots) { - u16 *ptr; - int new_slots = ALIGN(2 * nslots, 8); - - ptr = kmalloc(sizeof(u16) * new_slots, GFP_NOFS); - if (ptr) - memcpy(ptr, offs, sizeof(u16) * max_idx); - kfree(offs); - offs = ptr; - nslots = new_slots; - if (!ptr) - goto next; - } - - /* Store entry table. */ - offs[max_idx] = off; + if (e_size < sizeof(struct NTFS_DE) || off + e_size > end) + return NULL; if (!de_is_last(e)) { + offs[max_idx] = off; off += e_size; - max_idx += 1; - goto next1; - } - /* - * Table of pointers is created. - * Use binary search to find entry that is <= to the search value. - */ - fnd = -1; - min_idx = 0; + max_idx++; + if (max_idx < table_size) + goto fill_table; - while (min_idx <= max_idx) { - int mid_idx = min_idx + ((max_idx - min_idx) >> 1); - int diff2; - - e = Add2Ptr(hdr, offs[mid_idx]); + max_idx--; + } - e_key_len = le16_to_cpu(e->key_size); +binary_search: + e_key_len = le16_to_cpu(e->key_size); - diff2 = (*cmp)(key, key_len, e + 1, e_key_len, ctx); + diff2 = (*cmp)(key, key_len, e + 1, e_key_len, ctx); + if (diff2 > 0) { + if (found) { + min_idx = mid_idx + 1; + } else { + if (de_is_last(e)) + return NULL; - if (!diff2) { - *diff = 0; - goto out1; + max_idx = 0; + table_size = min(table_size * 2, + (int)ARRAY_SIZE(offs)); + goto fill_table; } - - if (diff2 < 0) { + } else if (diff2 < 0) { + if (found) max_idx = mid_idx - 1; - fnd = mid_idx; - if (!fnd) - break; - } else { - min_idx = mid_idx + 1; - } - } + else + max_idx--; - if (fnd == -1) { - e = NULL; - goto out1; + found = e; + } else { + *diff = 0; + return e; } - *diff = -1; - e = Add2Ptr(hdr, offs[fnd]); - -out1: - kfree(offs); - - return e; -#endif - -next: - /* - * Entries index are sorted. - * Enumerate all entries until we find entry - * that is <= to the search value. - */ - if (off + sizeof(struct NTFS_DE) > end) - return NULL; - - e = Add2Ptr(hdr, off); - e_size = le16_to_cpu(e->size); - - if (e_size < sizeof(struct NTFS_DE) || off + e_size > end) - return NULL; - - off += e_size; - - e_key_len = le16_to_cpu(e->key_size); - - *diff = (*cmp)(key, key_len, e + 1, e_key_len, ctx); - if (!*diff) - return e; + if (min_idx > max_idx) { + *diff = -1; + return found; + } - if (*diff <= 0) - return e; + mid_idx = (min_idx + max_idx) >> 1; + e = Add2Ptr(hdr, offs[mid_idx]); - if (de_is_last(e)) { - *diff = 1; - return e; - } - goto next; + goto binary_search; } /* @@ -1136,9 +1072,7 @@ int indx_find(struct ntfs_index *indx, struct ntfs_inode *ni, if (!e) return -EINVAL; - if (fnd) - fnd->root_de = e; - + fnd->root_de = e; err = 0; for (;;) { @@ -1401,7 +1335,7 @@ ok: static int indx_create_allocate(struct ntfs_index *indx, struct ntfs_inode *ni, CLST *vbn) { - int err = -ENOMEM; + int err; struct ntfs_sb_info *sbi = ni->mi.sbi; struct ATTRIB *bitmap; struct ATTRIB *alloc; diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index db2a5a4c38e4..a87ab3ad3cd3 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -5,10 +5,8 @@ * */ -#include <linux/blkdev.h> #include <linux/buffer_head.h> #include <linux/fs.h> -#include <linux/iversion.h> #include <linux/mpage.h> #include <linux/namei.h> #include <linux/nls.h> @@ -49,8 +47,8 @@ static struct inode *ntfs_read_mft(struct inode *inode, inode->i_op = NULL; /* Setup 'uid' and 'gid' */ - inode->i_uid = sbi->options.fs_uid; - inode->i_gid = sbi->options.fs_gid; + inode->i_uid = sbi->options->fs_uid; + inode->i_gid = sbi->options->fs_gid; err = mi_init(&ni->mi, sbi, ino); if (err) @@ -224,12 +222,9 @@ next_attr: if (!attr->non_res) { ni->i_valid = inode->i_size = rsize; inode_set_bytes(inode, rsize); - t32 = asize; - } else { - t32 = le16_to_cpu(attr->nres.run_off); } - mode = S_IFREG | (0777 & sbi->options.fs_fmask_inv); + mode = S_IFREG | (0777 & sbi->options->fs_fmask_inv); if (!attr->non_res) { ni->ni_flags |= NI_FLAG_RESIDENT; @@ -272,7 +267,7 @@ next_attr: goto out; mode = sb->s_root - ? (S_IFDIR | (0777 & sbi->options.fs_dmask_inv)) + ? (S_IFDIR | (0777 & sbi->options->fs_dmask_inv)) : (S_IFDIR | 0777); goto next_attr; @@ -315,17 +310,14 @@ next_attr: rp_fa = ni_parse_reparse(ni, attr, &rp); switch (rp_fa) { case REPARSE_LINK: - if (!attr->non_res) { - inode->i_size = rsize; - inode_set_bytes(inode, rsize); - t32 = asize; - } else { - inode->i_size = - le64_to_cpu(attr->nres.data_size); - t32 = le16_to_cpu(attr->nres.run_off); - } + /* + * Normal symlink. + * Assume one unicode symbol == one utf8. + */ + inode->i_size = le16_to_cpu(rp.SymbolicLinkReparseBuffer + .PrintNameLength) / + sizeof(u16); - /* Looks like normal symlink. */ ni->i_valid = inode->i_size; /* Clear directory bit. */ @@ -422,7 +414,7 @@ end_enum: ni->std_fa &= ~FILE_ATTRIBUTE_DIRECTORY; inode->i_op = &ntfs_link_inode_operations; inode->i_fop = NULL; - inode_nohighmem(inode); // ?? + inode_nohighmem(inode); } else if (S_ISREG(mode)) { ni->std_fa &= ~FILE_ATTRIBUTE_DIRECTORY; inode->i_op = &ntfs_file_inode_operations; @@ -443,7 +435,7 @@ end_enum: goto out; } - if ((sbi->options.sys_immutable && + if ((sbi->options->sys_immutable && (std5->fa & FILE_ATTRIBUTE_SYSTEM)) && !S_ISFIFO(mode) && !S_ISSOCK(mode) && !S_ISLNK(mode)) { inode->i_flags |= S_IMMUTABLE; @@ -1054,7 +1046,7 @@ int ntfs_flush_inodes(struct super_block *sb, struct inode *i1, if (!ret && i2) ret = writeback_inode(i2); if (!ret) - ret = filemap_flush(sb->s_bdev->bd_inode->i_mapping); + ret = sync_blockdev_nowait(sb->s_bdev); return ret; } @@ -1200,9 +1192,13 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, struct REPARSE_DATA_BUFFER *rp = NULL; bool rp_inserted = false; + ni_lock_dir(dir_ni); + dir_root = indx_get_root(&dir_ni->dir, dir_ni, NULL, NULL); - if (!dir_root) - return ERR_PTR(-EINVAL); + if (!dir_root) { + err = -EINVAL; + goto out1; + } if (S_ISDIR(mode)) { /* Use parent's directory attributes. */ @@ -1244,7 +1240,7 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, * } */ } else if (S_ISREG(mode)) { - if (sbi->options.sparse) { + if (sbi->options->sparse) { /* Sparsed regular file, cause option 'sparse'. */ fa = FILE_ATTRIBUTE_SPARSE_FILE | FILE_ATTRIBUTE_ARCHIVE; @@ -1486,7 +1482,10 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, asize = ALIGN(SIZEOF_RESIDENT + nsize, 8); t16 = PtrOffset(rec, attr); - /* 0x78 - the size of EA + EAINFO to store WSL */ + /* + * Below function 'ntfs_save_wsl_perm' requires 0x78 bytes. + * It is good idea to keep extened attributes resident. + */ if (asize + t16 + 0x78 + 8 > sbi->record_size) { CLST alen; CLST clst = bytes_to_cluster(sbi, nsize); @@ -1521,14 +1520,14 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, } asize = SIZEOF_NONRESIDENT + ALIGN(err, 8); - inode->i_size = nsize; } else { attr->res.data_off = SIZEOF_RESIDENT_LE; attr->res.data_size = cpu_to_le32(nsize); memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), rp, nsize); - inode->i_size = nsize; nsize = 0; } + /* Size of symlink equals the length of input string. */ + inode->i_size = size; attr->size = cpu_to_le32(asize); @@ -1551,6 +1550,9 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, if (err) goto out6; + /* Unlock parent directory before ntfs_init_acl. */ + ni_unlock(dir_ni); + inode->i_generation = le16_to_cpu(rec->seq); dir->i_mtime = dir->i_ctime = inode->i_atime; @@ -1562,6 +1564,8 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, inode->i_op = &ntfs_link_inode_operations; inode->i_fop = NULL; inode->i_mapping->a_ops = &ntfs_aops; + inode->i_size = size; + inode_nohighmem(inode); } else if (S_ISREG(mode)) { inode->i_op = &ntfs_file_inode_operations; inode->i_fop = &ntfs_file_operations; @@ -1577,7 +1581,7 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, if (!S_ISLNK(mode) && (sb->s_flags & SB_POSIXACL)) { err = ntfs_init_acl(mnt_userns, inode, dir); if (err) - goto out6; + goto out7; } else #endif { @@ -1586,7 +1590,7 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, /* Write non resident data. */ if (nsize) { - err = ntfs_sb_write_run(sbi, &ni->file.run, 0, rp, nsize); + err = ntfs_sb_write_run(sbi, &ni->file.run, 0, rp, nsize, 0); if (err) goto out7; } @@ -1607,8 +1611,10 @@ struct inode *ntfs_create_inode(struct user_namespace *mnt_userns, out7: /* Undo 'indx_insert_entry'. */ + ni_lock_dir(dir_ni); indx_delete_entry(&dir_ni->dir, dir_ni, new_de + 1, le16_to_cpu(new_de->key_size), sbi); + /* ni_unlock(dir_ni); will be called later. */ out6: if (rp_inserted) ntfs_remove_reparse(sbi, IO_REPARSE_TAG_SYMLINK, &new_de->ref); @@ -1632,8 +1638,10 @@ out2: kfree(rp); out1: - if (err) + if (err) { + ni_unlock(dir_ni); return ERR_PTR(err); + } unlock_new_inode(inode); @@ -1754,15 +1762,15 @@ void ntfs_evict_inode(struct inode *inode) static noinline int ntfs_readlink_hlp(struct inode *inode, char *buffer, int buflen) { - int i, err = 0; + int i, err = -EINVAL; struct ntfs_inode *ni = ntfs_i(inode); struct super_block *sb = inode->i_sb; struct ntfs_sb_info *sbi = sb->s_fs_info; - u64 i_size = inode->i_size; - u16 nlen = 0; + u64 size; + u16 ulen = 0; void *to_free = NULL; struct REPARSE_DATA_BUFFER *rp; - struct le_str *uni; + const __le16 *uname; struct ATTRIB *attr; /* Reparse data present. Try to parse it. */ @@ -1771,68 +1779,64 @@ static noinline int ntfs_readlink_hlp(struct inode *inode, char *buffer, *buffer = 0; - /* Read into temporal buffer. */ - if (i_size > sbi->reparse.max_size || i_size <= sizeof(u32)) { - err = -EINVAL; - goto out; - } - attr = ni_find_attr(ni, NULL, NULL, ATTR_REPARSE, NULL, 0, NULL, NULL); - if (!attr) { - err = -EINVAL; + if (!attr) goto out; - } if (!attr->non_res) { - rp = resident_data_ex(attr, i_size); - if (!rp) { - err = -EINVAL; + rp = resident_data_ex(attr, sizeof(struct REPARSE_DATA_BUFFER)); + if (!rp) goto out; - } + size = le32_to_cpu(attr->res.data_size); } else { - rp = kmalloc(i_size, GFP_NOFS); + size = le64_to_cpu(attr->nres.data_size); + rp = NULL; + } + + if (size > sbi->reparse.max_size || size <= sizeof(u32)) + goto out; + + if (!rp) { + rp = kmalloc(size, GFP_NOFS); if (!rp) { err = -ENOMEM; goto out; } to_free = rp; - err = ntfs_read_run_nb(sbi, &ni->file.run, 0, rp, i_size, NULL); + /* Read into temporal buffer. */ + err = ntfs_read_run_nb(sbi, &ni->file.run, 0, rp, size, NULL); if (err) goto out; } - err = -EINVAL; - /* Microsoft Tag. */ switch (rp->ReparseTag) { case IO_REPARSE_TAG_MOUNT_POINT: /* Mount points and junctions. */ /* Can we use 'Rp->MountPointReparseBuffer.PrintNameLength'? */ - if (i_size <= offsetof(struct REPARSE_DATA_BUFFER, - MountPointReparseBuffer.PathBuffer)) + if (size <= offsetof(struct REPARSE_DATA_BUFFER, + MountPointReparseBuffer.PathBuffer)) goto out; - uni = Add2Ptr(rp, - offsetof(struct REPARSE_DATA_BUFFER, - MountPointReparseBuffer.PathBuffer) + - le16_to_cpu(rp->MountPointReparseBuffer - .PrintNameOffset) - - 2); - nlen = le16_to_cpu(rp->MountPointReparseBuffer.PrintNameLength); + uname = Add2Ptr(rp, + offsetof(struct REPARSE_DATA_BUFFER, + MountPointReparseBuffer.PathBuffer) + + le16_to_cpu(rp->MountPointReparseBuffer + .PrintNameOffset)); + ulen = le16_to_cpu(rp->MountPointReparseBuffer.PrintNameLength); break; case IO_REPARSE_TAG_SYMLINK: /* FolderSymbolicLink */ /* Can we use 'Rp->SymbolicLinkReparseBuffer.PrintNameLength'? */ - if (i_size <= offsetof(struct REPARSE_DATA_BUFFER, - SymbolicLinkReparseBuffer.PathBuffer)) + if (size <= offsetof(struct REPARSE_DATA_BUFFER, + SymbolicLinkReparseBuffer.PathBuffer)) goto out; - uni = Add2Ptr(rp, - offsetof(struct REPARSE_DATA_BUFFER, - SymbolicLinkReparseBuffer.PathBuffer) + - le16_to_cpu(rp->SymbolicLinkReparseBuffer - .PrintNameOffset) - - 2); - nlen = le16_to_cpu( + uname = Add2Ptr( + rp, offsetof(struct REPARSE_DATA_BUFFER, + SymbolicLinkReparseBuffer.PathBuffer) + + le16_to_cpu(rp->SymbolicLinkReparseBuffer + .PrintNameOffset)); + ulen = le16_to_cpu( rp->SymbolicLinkReparseBuffer.PrintNameLength); break; @@ -1864,29 +1868,28 @@ static noinline int ntfs_readlink_hlp(struct inode *inode, char *buffer, goto out; } if (!IsReparseTagNameSurrogate(rp->ReparseTag) || - i_size <= sizeof(struct REPARSE_POINT)) { + size <= sizeof(struct REPARSE_POINT)) { goto out; } /* Users tag. */ - uni = Add2Ptr(rp, sizeof(struct REPARSE_POINT) - 2); - nlen = le16_to_cpu(rp->ReparseDataLength) - + uname = Add2Ptr(rp, sizeof(struct REPARSE_POINT)); + ulen = le16_to_cpu(rp->ReparseDataLength) - sizeof(struct REPARSE_POINT); } /* Convert nlen from bytes to UNICODE chars. */ - nlen >>= 1; + ulen >>= 1; /* Check that name is available. */ - if (!nlen || &uni->name[nlen] > (__le16 *)Add2Ptr(rp, i_size)) + if (!ulen || uname + ulen > (__le16 *)Add2Ptr(rp, size)) goto out; /* If name is already zero terminated then truncate it now. */ - if (!uni->name[nlen - 1]) - nlen -= 1; - uni->len = nlen; + if (!uname[ulen - 1]) + ulen -= 1; - err = ntfs_utf16_to_nls(sbi, uni, buffer, buflen); + err = ntfs_utf16_to_nls(sbi, uname, ulen, buffer, buflen); if (err < 0) goto out; diff --git a/fs/ntfs3/lib/decompress_common.h b/fs/ntfs3/lib/decompress_common.h index 2d70ae42f1b5..dd7ced000d0e 100644 --- a/fs/ntfs3/lib/decompress_common.h +++ b/fs/ntfs3/lib/decompress_common.h @@ -5,6 +5,9 @@ * Copyright (C) 2015 Eric Biggers */ +#ifndef _LINUX_NTFS3_LIB_DECOMPRESS_COMMON_H +#define _LINUX_NTFS3_LIB_DECOMPRESS_COMMON_H + #include <linux/string.h> #include <linux/compiler.h> #include <linux/types.h> @@ -336,3 +339,5 @@ static forceinline u8 *lz_copy(u8 *dst, u32 length, u32 offset, const u8 *bufend return dst; } + +#endif /* _LINUX_NTFS3_LIB_DECOMPRESS_COMMON_H */ diff --git a/fs/ntfs3/lib/lib.h b/fs/ntfs3/lib/lib.h index f508fbad2e71..90309a5ae59c 100644 --- a/fs/ntfs3/lib/lib.h +++ b/fs/ntfs3/lib/lib.h @@ -7,6 +7,10 @@ * - linux kernel code style */ +#ifndef _LINUX_NTFS3_LIB_LIB_H +#define _LINUX_NTFS3_LIB_LIB_H + +#include <linux/types.h> /* globals from xpress_decompress.c */ struct xpress_decompressor *xpress_allocate_decompressor(void); @@ -24,3 +28,5 @@ int lzx_decompress(struct lzx_decompressor *__restrict d, const void *__restrict compressed_data, size_t compressed_size, void *__restrict uncompressed_data, size_t uncompressed_size); + +#endif /* _LINUX_NTFS3_LIB_LIB_H */ diff --git a/fs/ntfs3/lznt.c b/fs/ntfs3/lznt.c index f1f691a67cc4..28f654561f27 100644 --- a/fs/ntfs3/lznt.c +++ b/fs/ntfs3/lznt.c @@ -5,13 +5,13 @@ * */ -#include <linux/blkdev.h> -#include <linux/buffer_head.h> -#include <linux/fs.h> -#include <linux/nls.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/stddef.h> +#include <linux/string.h> +#include <linux/types.h> #include "debug.h" -#include "ntfs.h" #include "ntfs_fs.h" // clang-format off @@ -292,7 +292,7 @@ next: /* * get_lznt_ctx * @level: 0 - Standard compression. - * !0 - Best compression, requires a lot of cpu. + * !0 - Best compression, requires a lot of cpu. */ struct lznt *get_lznt_ctx(int level) { diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c index e58415d07132..bc741213ad84 100644 --- a/fs/ntfs3/namei.c +++ b/fs/ntfs3/namei.c @@ -5,11 +5,7 @@ * */ -#include <linux/blkdev.h> -#include <linux/buffer_head.h> #include <linux/fs.h> -#include <linux/iversion.h> -#include <linux/namei.h> #include <linux/nls.h> #include "debug.h" @@ -99,16 +95,11 @@ static struct dentry *ntfs_lookup(struct inode *dir, struct dentry *dentry, static int ntfs_create(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { - struct ntfs_inode *ni = ntfs_i(dir); struct inode *inode; - ni_lock_dir(ni); - inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, S_IFREG | mode, 0, NULL, 0, NULL); - ni_unlock(ni); - return IS_ERR(inode) ? PTR_ERR(inode) : 0; } @@ -120,16 +111,11 @@ static int ntfs_create(struct user_namespace *mnt_userns, struct inode *dir, static int ntfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { - struct ntfs_inode *ni = ntfs_i(dir); struct inode *inode; - ni_lock_dir(ni); - inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, mode, rdev, NULL, 0, NULL); - ni_unlock(ni); - return IS_ERR(inode) ? PTR_ERR(inode) : 0; } @@ -200,15 +186,10 @@ static int ntfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, { u32 size = strlen(symname); struct inode *inode; - struct ntfs_inode *ni = ntfs_i(dir); - - ni_lock_dir(ni); inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, S_IFLNK | 0777, 0, symname, size, NULL); - ni_unlock(ni); - return IS_ERR(inode) ? PTR_ERR(inode) : 0; } @@ -219,15 +200,10 @@ static int ntfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; - struct ntfs_inode *ni = ntfs_i(dir); - - ni_lock_dir(ni); inode = ntfs_create_inode(mnt_userns, dir, dentry, NULL, S_IFDIR | mode, 0, NULL, 0, NULL); - ni_unlock(ni); - return IS_ERR(inode) ? PTR_ERR(inode) : 0; } diff --git a/fs/ntfs3/ntfs.h b/fs/ntfs3/ntfs.h index 6bb3e595263b..9cc396b117bf 100644 --- a/fs/ntfs3/ntfs.h +++ b/fs/ntfs3/ntfs.h @@ -10,19 +10,27 @@ #ifndef _LINUX_NTFS3_NTFS_H #define _LINUX_NTFS3_NTFS_H -/* TODO: Check 4K MFT record and 512 bytes cluster. */ +#include <linux/blkdev.h> +#include <linux/build_bug.h> +#include <linux/kernel.h> +#include <linux/stddef.h> +#include <linux/string.h> +#include <linux/types.h> + +#include "debug.h" -/* Activate this define to use binary search in indexes. */ -#define NTFS3_INDEX_BINARY_SEARCH +/* TODO: Check 4K MFT record and 512 bytes cluster. */ /* Check each run for marked clusters. */ #define NTFS3_CHECK_FREE_CLST #define NTFS_NAME_LEN 255 -/* ntfs.sys used 500 maximum links on-disk struct allows up to 0xffff. */ -#define NTFS_LINK_MAX 0x400 -//#define NTFS_LINK_MAX 0xffff +/* + * ntfs.sys used 500 maximum links on-disk struct allows up to 0xffff. + * xfstest generic/041 creates 3003 hardlinks. + */ +#define NTFS_LINK_MAX 4000 /* * Activate to use 64 bit clusters instead of 32 bits in ntfs.sys. diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index dc71c59fd445..8aaec7e0804e 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -9,6 +9,37 @@ #ifndef _LINUX_NTFS3_NTFS_FS_H #define _LINUX_NTFS3_NTFS_FS_H +#include <linux/blkdev.h> +#include <linux/buffer_head.h> +#include <linux/cleancache.h> +#include <linux/fs.h> +#include <linux/highmem.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/mutex.h> +#include <linux/page-flags.h> +#include <linux/pagemap.h> +#include <linux/rbtree.h> +#include <linux/rwsem.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/time64.h> +#include <linux/types.h> +#include <linux/uidgid.h> +#include <asm/div64.h> +#include <asm/page.h> + +#include "debug.h" +#include "ntfs.h" + +struct dentry; +struct fiemap_extent_info; +struct user_namespace; +struct page; +struct writeback_control; +enum utf16_endian; + + #define MINUS_ONE_T ((size_t)(-1)) /* Biggest MFT / smallest cluster */ #define MAXIMUM_BYTES_PER_MFT 4096 @@ -52,6 +83,7 @@ // clang-format on struct ntfs_mount_options { + char *nls_name; struct nls_table *nls; kuid_t fs_uid; @@ -59,19 +91,16 @@ struct ntfs_mount_options { u16 fs_fmask_inv; u16 fs_dmask_inv; - unsigned uid : 1, /* uid was set. */ - gid : 1, /* gid was set. */ - fmask : 1, /* fmask was set. */ - dmask : 1, /* dmask was set. */ - sys_immutable : 1, /* Immutable system files. */ - discard : 1, /* Issue discard requests on deletions. */ - sparse : 1, /* Create sparse files. */ - showmeta : 1, /* Show meta files. */ - nohidden : 1, /* Do not show hidden files. */ - force : 1, /* Rw mount dirty volume. */ - no_acs_rules : 1, /*Exclude acs rules. */ - prealloc : 1 /* Preallocate space when file is growing. */ - ; + unsigned fmask : 1; /* fmask was set. */ + unsigned dmask : 1; /*dmask was set. */ + unsigned sys_immutable : 1; /* Immutable system files. */ + unsigned discard : 1; /* Issue discard requests on deletions. */ + unsigned sparse : 1; /* Create sparse files. */ + unsigned showmeta : 1; /* Show meta files. */ + unsigned nohidden : 1; /* Do not show hidden files. */ + unsigned force : 1; /* RW mount dirty volume. */ + unsigned noacsrules : 1; /* Exclude acs rules. */ + unsigned prealloc : 1; /* Preallocate space when file is growing. */ }; /* Special value to unpack and deallocate. */ @@ -182,10 +211,8 @@ struct ntfs_sb_info { u32 blocks_per_cluster; // cluster_size / sb->s_blocksize u32 record_size; - u32 sector_size; u32 index_size; - u8 sector_bits; u8 cluster_bits; u8 record_bits; @@ -279,7 +306,7 @@ struct ntfs_sb_info { #endif } compress; - struct ntfs_mount_options options; + struct ntfs_mount_options *options; struct ratelimit_state msg_ratelimit; }; @@ -436,7 +463,7 @@ bool al_remove_le(struct ntfs_inode *ni, struct ATTR_LIST_ENTRY *le); bool al_delete_le(struct ntfs_inode *ni, enum ATTR_TYPE type, CLST vcn, const __le16 *name, size_t name_len, const struct MFT_REF *ref); -int al_update(struct ntfs_inode *ni); +int al_update(struct ntfs_inode *ni, int sync); static inline size_t al_aligned(size_t size) { return (size + 1023) & ~(size_t)1023; @@ -448,7 +475,7 @@ bool are_bits_set(const ulong *map, size_t bit, size_t nbits); size_t get_set_bits_ex(const ulong *map, size_t bit, size_t nbits); /* Globals from dir.c */ -int ntfs_utf16_to_nls(struct ntfs_sb_info *sbi, const struct le_str *uni, +int ntfs_utf16_to_nls(struct ntfs_sb_info *sbi, const __le16 *name, u32 len, u8 *buf, int buf_len); int ntfs_nls_to_utf16(struct ntfs_sb_info *sbi, const u8 *name, u32 name_len, struct cpu_str *uni, u32 max_ulen, @@ -520,7 +547,7 @@ struct ATTR_FILE_NAME *ni_fname_type(struct ntfs_inode *ni, u8 name_type, struct ATTR_LIST_ENTRY **entry); int ni_new_attr_flags(struct ntfs_inode *ni, enum FILE_ATTRIBUTE new_fa); enum REPARSE_SIGN ni_parse_reparse(struct ntfs_inode *ni, struct ATTRIB *attr, - void *buffer); + struct REPARSE_DATA_BUFFER *buffer); int ni_write_inode(struct inode *inode, int sync, const char *hint); #define _ni_write_inode(i, w) ni_write_inode(i, w, __func__) int ni_fiemap(struct ntfs_inode *ni, struct fiemap_extent_info *fieinfo, @@ -577,7 +604,7 @@ int ntfs_sb_read(struct super_block *sb, u64 lbo, size_t bytes, void *buffer); int ntfs_sb_write(struct super_block *sb, u64 lbo, size_t bytes, const void *buffer, int wait); int ntfs_sb_write_run(struct ntfs_sb_info *sbi, const struct runs_tree *run, - u64 vbo, const void *buf, size_t bytes); + u64 vbo, const void *buf, size_t bytes, int sync); struct buffer_head *ntfs_bread_run(struct ntfs_sb_info *sbi, const struct runs_tree *run, u64 vbo); int ntfs_read_run_nb(struct ntfs_sb_info *sbi, const struct runs_tree *run, diff --git a/fs/ntfs3/record.c b/fs/ntfs3/record.c index 103705c86772..861e35791506 100644 --- a/fs/ntfs3/record.c +++ b/fs/ntfs3/record.c @@ -5,10 +5,7 @@ * */ -#include <linux/blkdev.h> -#include <linux/buffer_head.h> #include <linux/fs.h> -#include <linux/nls.h> #include "debug.h" #include "ntfs.h" diff --git a/fs/ntfs3/run.c b/fs/ntfs3/run.c index 26ed2b64345e..a8fec651f973 100644 --- a/fs/ntfs3/run.c +++ b/fs/ntfs3/run.c @@ -7,10 +7,8 @@ */ #include <linux/blkdev.h> -#include <linux/buffer_head.h> #include <linux/fs.h> #include <linux/log2.h> -#include <linux/nls.h> #include "debug.h" #include "ntfs.h" diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c index 55bbc9200a10..29813200c7af 100644 --- a/fs/ntfs3/super.c +++ b/fs/ntfs3/super.c @@ -23,16 +23,15 @@ * */ -#include <linux/backing-dev.h> #include <linux/blkdev.h> #include <linux/buffer_head.h> #include <linux/exportfs.h> #include <linux/fs.h> -#include <linux/iversion.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include <linux/log2.h> #include <linux/module.h> #include <linux/nls.h> -#include <linux/parser.h> #include <linux/seq_file.h> #include <linux/statfs.h> @@ -205,9 +204,11 @@ void *ntfs_put_shared(void *ptr) return ret; } -static inline void clear_mount_options(struct ntfs_mount_options *options) +static inline void put_mount_options(struct ntfs_mount_options *options) { + kfree(options->nls_name); unload_nls(options->nls); + kfree(options); } enum Opt { @@ -223,218 +224,175 @@ enum Opt { Opt_nohidden, Opt_showmeta, Opt_acl, - Opt_noatime, - Opt_nls, + Opt_iocharset, Opt_prealloc, - Opt_no_acs_rules, + Opt_noacsrules, Opt_err, }; -static const match_table_t ntfs_tokens = { - { Opt_uid, "uid=%u" }, - { Opt_gid, "gid=%u" }, - { Opt_umask, "umask=%o" }, - { Opt_dmask, "dmask=%o" }, - { Opt_fmask, "fmask=%o" }, - { Opt_immutable, "sys_immutable" }, - { Opt_discard, "discard" }, - { Opt_force, "force" }, - { Opt_sparse, "sparse" }, - { Opt_nohidden, "nohidden" }, - { Opt_acl, "acl" }, - { Opt_noatime, "noatime" }, - { Opt_showmeta, "showmeta" }, - { Opt_nls, "nls=%s" }, - { Opt_prealloc, "prealloc" }, - { Opt_no_acs_rules, "no_acs_rules" }, - { Opt_err, NULL }, +static const struct fs_parameter_spec ntfs_fs_parameters[] = { + fsparam_u32("uid", Opt_uid), + fsparam_u32("gid", Opt_gid), + fsparam_u32oct("umask", Opt_umask), + fsparam_u32oct("dmask", Opt_dmask), + fsparam_u32oct("fmask", Opt_fmask), + fsparam_flag_no("sys_immutable", Opt_immutable), + fsparam_flag_no("discard", Opt_discard), + fsparam_flag_no("force", Opt_force), + fsparam_flag_no("sparse", Opt_sparse), + fsparam_flag_no("hidden", Opt_nohidden), + fsparam_flag_no("acl", Opt_acl), + fsparam_flag_no("showmeta", Opt_showmeta), + fsparam_flag_no("prealloc", Opt_prealloc), + fsparam_flag_no("acsrules", Opt_noacsrules), + fsparam_string("iocharset", Opt_iocharset), + {} }; -static noinline int ntfs_parse_options(struct super_block *sb, char *options, - int silent, - struct ntfs_mount_options *opts) +/* + * Load nls table or if @nls is utf8 then return NULL. + */ +static struct nls_table *ntfs_load_nls(char *nls) { - char *p; - substring_t args[MAX_OPT_ARGS]; - int option; - char nls_name[30]; - struct nls_table *nls; + struct nls_table *ret; - opts->fs_uid = current_uid(); - opts->fs_gid = current_gid(); - opts->fs_fmask_inv = opts->fs_dmask_inv = ~current_umask(); - nls_name[0] = 0; + if (!nls) + nls = CONFIG_NLS_DEFAULT; - if (!options) - goto out; + if (strcmp(nls, "utf8") == 0) + return NULL; - while ((p = strsep(&options, ","))) { - int token; + if (strcmp(nls, CONFIG_NLS_DEFAULT) == 0) + return load_nls_default(); - if (!*p) - continue; + ret = load_nls(nls); + if (ret) + return ret; - token = match_token(p, ntfs_tokens, args); - switch (token) { - case Opt_immutable: - opts->sys_immutable = 1; - break; - case Opt_uid: - if (match_int(&args[0], &option)) - return -EINVAL; - opts->fs_uid = make_kuid(current_user_ns(), option); - if (!uid_valid(opts->fs_uid)) - return -EINVAL; - opts->uid = 1; - break; - case Opt_gid: - if (match_int(&args[0], &option)) - return -EINVAL; - opts->fs_gid = make_kgid(current_user_ns(), option); - if (!gid_valid(opts->fs_gid)) - return -EINVAL; - opts->gid = 1; - break; - case Opt_umask: - if (match_octal(&args[0], &option)) - return -EINVAL; - opts->fs_fmask_inv = opts->fs_dmask_inv = ~option; - opts->fmask = opts->dmask = 1; - break; - case Opt_dmask: - if (match_octal(&args[0], &option)) - return -EINVAL; - opts->fs_dmask_inv = ~option; - opts->dmask = 1; - break; - case Opt_fmask: - if (match_octal(&args[0], &option)) - return -EINVAL; - opts->fs_fmask_inv = ~option; - opts->fmask = 1; - break; - case Opt_discard: - opts->discard = 1; - break; - case Opt_force: - opts->force = 1; - break; - case Opt_sparse: - opts->sparse = 1; - break; - case Opt_nohidden: - opts->nohidden = 1; - break; - case Opt_acl: + return ERR_PTR(-EINVAL); +} + +static int ntfs_fs_parse_param(struct fs_context *fc, + struct fs_parameter *param) +{ + struct ntfs_mount_options *opts = fc->fs_private; + struct fs_parse_result result; + int opt; + + opt = fs_parse(fc, ntfs_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_uid: + opts->fs_uid = make_kuid(current_user_ns(), result.uint_32); + if (!uid_valid(opts->fs_uid)) + return invalf(fc, "ntfs3: Invalid value for uid."); + break; + case Opt_gid: + opts->fs_gid = make_kgid(current_user_ns(), result.uint_32); + if (!gid_valid(opts->fs_gid)) + return invalf(fc, "ntfs3: Invalid value for gid."); + break; + case Opt_umask: + if (result.uint_32 & ~07777) + return invalf(fc, "ntfs3: Invalid value for umask."); + opts->fs_fmask_inv = ~result.uint_32; + opts->fs_dmask_inv = ~result.uint_32; + opts->fmask = 1; + opts->dmask = 1; + break; + case Opt_dmask: + if (result.uint_32 & ~07777) + return invalf(fc, "ntfs3: Invalid value for dmask."); + opts->fs_dmask_inv = ~result.uint_32; + opts->dmask = 1; + break; + case Opt_fmask: + if (result.uint_32 & ~07777) + return invalf(fc, "ntfs3: Invalid value for fmask."); + opts->fs_fmask_inv = ~result.uint_32; + opts->fmask = 1; + break; + case Opt_immutable: + opts->sys_immutable = result.negated ? 0 : 1; + break; + case Opt_discard: + opts->discard = result.negated ? 0 : 1; + break; + case Opt_force: + opts->force = result.negated ? 0 : 1; + break; + case Opt_sparse: + opts->sparse = result.negated ? 0 : 1; + break; + case Opt_nohidden: + opts->nohidden = result.negated ? 1 : 0; + break; + case Opt_acl: + if (!result.negated) #ifdef CONFIG_NTFS3_FS_POSIX_ACL - sb->s_flags |= SB_POSIXACL; - break; + fc->sb_flags |= SB_POSIXACL; #else - ntfs_err(sb, "support for ACL not compiled in!"); - return -EINVAL; + return invalf(fc, "ntfs3: Support for ACL not compiled in!"); #endif - case Opt_noatime: - sb->s_flags |= SB_NOATIME; - break; - case Opt_showmeta: - opts->showmeta = 1; - break; - case Opt_nls: - match_strlcpy(nls_name, &args[0], sizeof(nls_name)); - break; - case Opt_prealloc: - opts->prealloc = 1; - break; - case Opt_no_acs_rules: - opts->no_acs_rules = 1; - break; - default: - if (!silent) - ntfs_err( - sb, - "Unrecognized mount option \"%s\" or missing value", - p); - //return -EINVAL; - } - } - -out: - if (!strcmp(nls_name[0] ? nls_name : CONFIG_NLS_DEFAULT, "utf8")) { - /* - * For UTF-8 use utf16s_to_utf8s()/utf8s_to_utf16s() - * instead of NLS. - */ - nls = NULL; - } else if (nls_name[0]) { - nls = load_nls(nls_name); - if (!nls) { - ntfs_err(sb, "failed to load \"%s\"", nls_name); - return -EINVAL; - } - } else { - nls = load_nls_default(); - if (!nls) { - ntfs_err(sb, "failed to load default nls"); - return -EINVAL; - } + else + fc->sb_flags &= ~SB_POSIXACL; + break; + case Opt_showmeta: + opts->showmeta = result.negated ? 0 : 1; + break; + case Opt_iocharset: + kfree(opts->nls_name); + opts->nls_name = param->string; + param->string = NULL; + break; + case Opt_prealloc: + opts->prealloc = result.negated ? 0 : 1; + break; + case Opt_noacsrules: + opts->noacsrules = result.negated ? 1 : 0; + break; + default: + /* Should not be here unless we forget add case. */ + return -EINVAL; } - opts->nls = nls; - return 0; } -static int ntfs_remount(struct super_block *sb, int *flags, char *data) +static int ntfs_fs_reconfigure(struct fs_context *fc) { - int err, ro_rw; + struct super_block *sb = fc->root->d_sb; struct ntfs_sb_info *sbi = sb->s_fs_info; - struct ntfs_mount_options old_opts; - char *orig_data = kstrdup(data, GFP_KERNEL); - - if (data && !orig_data) - return -ENOMEM; + struct ntfs_mount_options *new_opts = fc->fs_private; + int ro_rw; - /* Store original options. */ - memcpy(&old_opts, &sbi->options, sizeof(old_opts)); - clear_mount_options(&sbi->options); - memset(&sbi->options, 0, sizeof(sbi->options)); - - err = ntfs_parse_options(sb, data, 0, &sbi->options); - if (err) - goto restore_opts; - - ro_rw = sb_rdonly(sb) && !(*flags & SB_RDONLY); + ro_rw = sb_rdonly(sb) && !(fc->sb_flags & SB_RDONLY); if (ro_rw && (sbi->flags & NTFS_FLAGS_NEED_REPLAY)) { - ntfs_warn( - sb, - "Couldn't remount rw because journal is not replayed. Please umount/remount instead\n"); - err = -EINVAL; - goto restore_opts; + errorf(fc, "ntfs3: Couldn't remount rw because journal is not replayed. Please umount/remount instead\n"); + return -EINVAL; } + new_opts->nls = ntfs_load_nls(new_opts->nls_name); + if (IS_ERR(new_opts->nls)) { + new_opts->nls = NULL; + errorf(fc, "ntfs3: Cannot load iocharset %s", new_opts->nls_name); + return -EINVAL; + } + if (new_opts->nls != sbi->options->nls) + return invalf(fc, "ntfs3: Cannot use different iocharset when remounting!"); + sync_filesystem(sb); if (ro_rw && (sbi->volume.flags & VOLUME_FLAG_DIRTY) && - !sbi->options.force) { - ntfs_warn(sb, "volume is dirty and \"force\" flag is not set!"); - err = -EINVAL; - goto restore_opts; + !new_opts->force) { + errorf(fc, "ntfs3: Volume is dirty and \"force\" flag is not set!"); + return -EINVAL; } - clear_mount_options(&old_opts); - - *flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME) | - SB_NODIRATIME | SB_NOATIME; - ntfs_info(sb, "re-mounted. Opts: %s", orig_data); - err = 0; - goto out; + memcpy(sbi->options, new_opts, sizeof(*new_opts)); -restore_opts: - clear_mount_options(&sbi->options); - memcpy(&sbi->options, &old_opts, sizeof(old_opts)); - -out: - kfree(orig_data); - return err; + return 0; } static struct kmem_cache *ntfs_inode_cachep; @@ -513,8 +471,6 @@ static noinline void put_ntfs(struct ntfs_sb_info *sbi) xpress_free_decompressor(sbi->compress.xpress); lzx_free_decompressor(sbi->compress.lzx); #endif - clear_mount_options(&sbi->options); - kfree(sbi); } @@ -525,7 +481,9 @@ static void ntfs_put_super(struct super_block *sb) /* Mark rw ntfs as clear, if possible. */ ntfs_set_state(sbi, NTFS_DIRTY_CLEAR); + put_mount_options(sbi->options); put_ntfs(sbi); + sb->s_fs_info = NULL; sync_blockdev(sb->s_bdev); } @@ -552,23 +510,21 @@ static int ntfs_show_options(struct seq_file *m, struct dentry *root) { struct super_block *sb = root->d_sb; struct ntfs_sb_info *sbi = sb->s_fs_info; - struct ntfs_mount_options *opts = &sbi->options; + struct ntfs_mount_options *opts = sbi->options; struct user_namespace *user_ns = seq_user_ns(m); - if (opts->uid) - seq_printf(m, ",uid=%u", - from_kuid_munged(user_ns, opts->fs_uid)); - if (opts->gid) - seq_printf(m, ",gid=%u", - from_kgid_munged(user_ns, opts->fs_gid)); + seq_printf(m, ",uid=%u", + from_kuid_munged(user_ns, opts->fs_uid)); + seq_printf(m, ",gid=%u", + from_kgid_munged(user_ns, opts->fs_gid)); if (opts->fmask) seq_printf(m, ",fmask=%04o", ~opts->fs_fmask_inv); if (opts->dmask) seq_printf(m, ",dmask=%04o", ~opts->fs_dmask_inv); if (opts->nls) - seq_printf(m, ",nls=%s", opts->nls->charset); + seq_printf(m, ",iocharset=%s", opts->nls->charset); else - seq_puts(m, ",nls=utf8"); + seq_puts(m, ",iocharset=utf8"); if (opts->sys_immutable) seq_puts(m, ",sys_immutable"); if (opts->discard) @@ -581,14 +537,12 @@ static int ntfs_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",nohidden"); if (opts->force) seq_puts(m, ",force"); - if (opts->no_acs_rules) - seq_puts(m, ",no_acs_rules"); + if (opts->noacsrules) + seq_puts(m, ",noacsrules"); if (opts->prealloc) seq_puts(m, ",prealloc"); if (sb->s_flags & SB_POSIXACL) seq_puts(m, ",acl"); - if (sb->s_flags & SB_NOATIME) - seq_puts(m, ",noatime"); return 0; } @@ -643,7 +597,6 @@ static const struct super_operations ntfs_sops = { .statfs = ntfs_statfs, .show_options = ntfs_show_options, .sync_fs = ntfs_sync_fs, - .remount_fs = ntfs_remount, .write_inode = ntfs3_write_inode, }; @@ -729,7 +682,7 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, struct ntfs_sb_info *sbi = sb->s_fs_info; int err; u32 mb, gb, boot_sector_size, sct_per_clst, record_size; - u64 sectors, clusters, fs_size, mlcn, mlcn2; + u64 sectors, clusters, mlcn, mlcn2; struct NTFS_BOOT *boot; struct buffer_head *bh; struct MFT_REC *rec; @@ -787,20 +740,20 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, goto out; } - sbi->sector_size = boot_sector_size; - sbi->sector_bits = blksize_bits(boot_sector_size); - fs_size = (sectors + 1) << sbi->sector_bits; + sbi->volume.size = sectors * boot_sector_size; - gb = format_size_gb(fs_size, &mb); + gb = format_size_gb(sbi->volume.size + boot_sector_size, &mb); /* * - Volume formatted and mounted with the same sector size. * - Volume formatted 4K and mounted as 512. * - Volume formatted 512 and mounted as 4K. */ - if (sbi->sector_size != sector_size) { - ntfs_warn(sb, - "Different NTFS' sector size and media sector size"); + if (boot_sector_size != sector_size) { + ntfs_warn( + sb, + "Different NTFS' sector size (%u) and media sector size (%u)", + boot_sector_size, sector_size); dev_size += sector_size - 1; } @@ -810,8 +763,19 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, sbi->mft.lbo = mlcn << sbi->cluster_bits; sbi->mft.lbo2 = mlcn2 << sbi->cluster_bits; - if (sbi->cluster_size < sbi->sector_size) + /* Compare boot's cluster and sector. */ + if (sbi->cluster_size < boot_sector_size) + goto out; + + /* Compare boot's cluster and media sector. */ + if (sbi->cluster_size < sector_size) { + /* No way to use ntfs_get_block in this case. */ + ntfs_err( + sb, + "Failed to mount 'cause NTFS's cluster size (%u) is less than media sector size (%u)", + sbi->cluster_size, sector_size); goto out; + } sbi->cluster_mask = sbi->cluster_size - 1; sbi->cluster_mask_inv = ~(u64)sbi->cluster_mask; @@ -836,10 +800,9 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, : (u32)boot->index_size << sbi->cluster_bits; sbi->volume.ser_num = le64_to_cpu(boot->serial_num); - sbi->volume.size = sectors << sbi->sector_bits; /* Warning if RAW volume. */ - if (dev_size < fs_size) { + if (dev_size < sbi->volume.size + boot_sector_size) { u32 mb0, gb0; gb0 = format_size_gb(dev_size, &mb0); @@ -883,8 +846,7 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, rec->total = cpu_to_le32(sbi->record_size); ((struct ATTRIB *)Add2Ptr(rec, ao))->type = ATTR_END; - if (sbi->cluster_size < PAGE_SIZE) - sb_set_blocksize(sb, sbi->cluster_size); + sb_set_blocksize(sb, min_t(u32, sbi->cluster_size, PAGE_SIZE)); sbi->block_mask = sb->s_blocksize - 1; sbi->blocks_per_cluster = sbi->cluster_size >> sb->s_blocksize_bits; @@ -897,9 +859,11 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, if (clusters >= (1ull << (64 - sbi->cluster_bits))) sbi->maxbytes = -1; sbi->maxbytes_sparse = -1; + sb->s_maxbytes = MAX_LFS_FILESIZE; #else /* Maximum size for sparse file. */ sbi->maxbytes_sparse = (1ull << (sbi->cluster_bits + 32)) - 1; + sb->s_maxbytes = 0xFFFFFFFFull << sbi->cluster_bits; #endif err = 0; @@ -913,14 +877,13 @@ out: /* * ntfs_fill_super - Try to mount. */ -static int ntfs_fill_super(struct super_block *sb, void *data, int silent) +static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) { int err; - struct ntfs_sb_info *sbi; + struct ntfs_sb_info *sbi = sb->s_fs_info; struct block_device *bdev = sb->s_bdev; - struct inode *bd_inode = bdev->bd_inode; - struct request_queue *rq = bdev_get_queue(bdev); - struct inode *inode = NULL; + struct request_queue *rq; + struct inode *inode; struct ntfs_inode *ni; size_t i, tt; CLST vcn, lcn, len; @@ -928,18 +891,11 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) const struct VOLUME_INFO *info; u32 idx, done, bytes; struct ATTR_DEF_ENTRY *t; - u16 *upcase = NULL; u16 *shared; - bool is_ro; struct MFT_REF ref; ref.high = 0; - sbi = kzalloc(sizeof(struct ntfs_sb_info), GFP_NOFS); - if (!sbi) - return -ENOMEM; - - sb->s_fs_info = sbi; sbi->sb = sb; sb->s_flags |= SB_NODIRATIME; sb->s_magic = 0x7366746e; // "ntfs" @@ -948,41 +904,27 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_time_gran = NTFS_TIME_GRAN; // 100 nsec sb->s_xattr = ntfs_xattr_handlers; - ratelimit_state_init(&sbi->msg_ratelimit, DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); - - err = ntfs_parse_options(sb, data, silent, &sbi->options); - if (err) + sbi->options->nls = ntfs_load_nls(sbi->options->nls_name); + if (IS_ERR(sbi->options->nls)) { + sbi->options->nls = NULL; + errorf(fc, "Cannot load nls %s", sbi->options->nls_name); + err = -EINVAL; goto out; + } - if (!rq || !blk_queue_discard(rq) || !rq->limits.discard_granularity) { - ; - } else { + rq = bdev_get_queue(bdev); + if (blk_queue_discard(rq) && rq->limits.discard_granularity) { sbi->discard_granularity = rq->limits.discard_granularity; sbi->discard_granularity_mask_inv = ~(u64)(sbi->discard_granularity - 1); } - sb_set_blocksize(sb, PAGE_SIZE); - /* Parse boot. */ err = ntfs_init_from_boot(sb, rq ? queue_logical_block_size(rq) : 512, - bd_inode->i_size); + bdev_nr_bytes(bdev)); if (err) goto out; -#ifdef CONFIG_NTFS3_64BIT_CLUSTER - sb->s_maxbytes = MAX_LFS_FILESIZE; -#else - sb->s_maxbytes = 0xFFFFFFFFull << sbi->cluster_bits; -#endif - - mutex_init(&sbi->compress.mtx_lznt); -#ifdef CONFIG_NTFS3_LZX_XPRESS - mutex_init(&sbi->compress.mtx_xpress); - mutex_init(&sbi->compress.mtx_lzx); -#endif - /* * Load $Volume. This should be done before $LogFile * 'cause 'sbi->volume.ni' is used 'ntfs_set_state'. @@ -991,9 +933,8 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) ref.seq = cpu_to_le16(MFT_REC_VOL); inode = ntfs_iget5(sb, &ref, &NAME_VOLUME); if (IS_ERR(inode)) { - err = PTR_ERR(inode); ntfs_err(sb, "Failed to load $Volume."); - inode = NULL; + err = PTR_ERR(inode); goto out; } @@ -1015,36 +956,33 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) } else { /* Should we break mounting here? */ //err = -EINVAL; - //goto out; + //goto put_inode_out; } attr = ni_find_attr(ni, attr, NULL, ATTR_VOL_INFO, NULL, 0, NULL, NULL); if (!attr || is_attr_ext(attr)) { err = -EINVAL; - goto out; + goto put_inode_out; } info = resident_data_ex(attr, SIZEOF_ATTRIBUTE_VOLUME_INFO); if (!info) { err = -EINVAL; - goto out; + goto put_inode_out; } sbi->volume.major_ver = info->major_ver; sbi->volume.minor_ver = info->minor_ver; sbi->volume.flags = info->flags; - sbi->volume.ni = ni; - inode = NULL; /* Load $MFTMirr to estimate recs_mirr. */ ref.low = cpu_to_le32(MFT_REC_MIRR); ref.seq = cpu_to_le16(MFT_REC_MIRR); inode = ntfs_iget5(sb, &ref, &NAME_MIRROR); if (IS_ERR(inode)) { - err = PTR_ERR(inode); ntfs_err(sb, "Failed to load $MFTMirr."); - inode = NULL; + err = PTR_ERR(inode); goto out; } @@ -1058,9 +996,8 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) ref.seq = cpu_to_le16(MFT_REC_LOG); inode = ntfs_iget5(sb, &ref, &NAME_LOGFILE); if (IS_ERR(inode)) { - err = PTR_ERR(inode); ntfs_err(sb, "Failed to load \x24LogFile."); - inode = NULL; + err = PTR_ERR(inode); goto out; } @@ -1068,22 +1005,19 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) err = ntfs_loadlog_and_replay(ni, sbi); if (err) - goto out; + goto put_inode_out; iput(inode); - inode = NULL; - - is_ro = sb_rdonly(sbi->sb); if (sbi->flags & NTFS_FLAGS_NEED_REPLAY) { - if (!is_ro) { + if (!sb_rdonly(sb)) { ntfs_warn(sb, "failed to replay log file. Can't mount rw!"); err = -EINVAL; goto out; } } else if (sbi->volume.flags & VOLUME_FLAG_DIRTY) { - if (!is_ro && !sbi->options.force) { + if (!sb_rdonly(sb) && !sbi->options->force) { ntfs_warn( sb, "volume is dirty and \"force\" flag is not set!"); @@ -1098,9 +1032,8 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) inode = ntfs_iget5(sb, &ref, &NAME_MFT); if (IS_ERR(inode)) { - err = PTR_ERR(inode); ntfs_err(sb, "Failed to load $MFT."); - inode = NULL; + err = PTR_ERR(inode); goto out; } @@ -1112,11 +1045,11 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) err = wnd_init(&sbi->mft.bitmap, sb, tt); if (err) - goto out; + goto put_inode_out; err = ni_load_all_mi(ni); if (err) - goto out; + goto put_inode_out; sbi->mft.ni = ni; @@ -1125,9 +1058,8 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) ref.seq = cpu_to_le16(MFT_REC_BADCLUST); inode = ntfs_iget5(sb, &ref, &NAME_BADCLUS); if (IS_ERR(inode)) { - err = PTR_ERR(inode); ntfs_err(sb, "Failed to load $BadClus."); - inode = NULL; + err = PTR_ERR(inode); goto out; } @@ -1150,18 +1082,15 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) ref.seq = cpu_to_le16(MFT_REC_BITMAP); inode = ntfs_iget5(sb, &ref, &NAME_BITMAP); if (IS_ERR(inode)) { - err = PTR_ERR(inode); ntfs_err(sb, "Failed to load $Bitmap."); - inode = NULL; + err = PTR_ERR(inode); goto out; } - ni = ntfs_i(inode); - #ifndef CONFIG_NTFS3_64BIT_CLUSTER if (inode->i_size >> 32) { err = -EINVAL; - goto out; + goto put_inode_out; } #endif @@ -1169,14 +1098,14 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) tt = sbi->used.bitmap.nbits; if (inode->i_size < bitmap_size(tt)) { err = -EINVAL; - goto out; + goto put_inode_out; } /* Not necessary. */ sbi->used.bitmap.set_tail = true; - err = wnd_init(&sbi->used.bitmap, sbi->sb, tt); + err = wnd_init(&sbi->used.bitmap, sb, tt); if (err) - goto out; + goto put_inode_out; iput(inode); @@ -1188,23 +1117,22 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) /* Load $AttrDef. */ ref.low = cpu_to_le32(MFT_REC_ATTR); ref.seq = cpu_to_le16(MFT_REC_ATTR); - inode = ntfs_iget5(sbi->sb, &ref, &NAME_ATTRDEF); + inode = ntfs_iget5(sb, &ref, &NAME_ATTRDEF); if (IS_ERR(inode)) { - err = PTR_ERR(inode); ntfs_err(sb, "Failed to load $AttrDef -> %d", err); - inode = NULL; + err = PTR_ERR(inode); goto out; } if (inode->i_size < sizeof(struct ATTR_DEF_ENTRY)) { err = -EINVAL; - goto out; + goto put_inode_out; } bytes = inode->i_size; sbi->def_table = t = kmalloc(bytes, GFP_NOFS); if (!t) { err = -ENOMEM; - goto out; + goto put_inode_out; } for (done = idx = 0; done < bytes; done += PAGE_SIZE, idx++) { @@ -1213,7 +1141,7 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) if (IS_ERR(page)) { err = PTR_ERR(page); - goto out; + goto put_inode_out; } memcpy(Add2Ptr(t, done), page_address(page), min(PAGE_SIZE, tail)); @@ -1221,7 +1149,7 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) if (!idx && ATTR_STD != t->type) { err = -EINVAL; - goto out; + goto put_inode_out; } } @@ -1254,33 +1182,24 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) ref.seq = cpu_to_le16(MFT_REC_UPCASE); inode = ntfs_iget5(sb, &ref, &NAME_UPCASE); if (IS_ERR(inode)) { + ntfs_err(sb, "Failed to load $UpCase."); err = PTR_ERR(inode); - ntfs_err(sb, "Failed to load \x24LogFile."); - inode = NULL; goto out; } - ni = ntfs_i(inode); - if (inode->i_size != 0x10000 * sizeof(short)) { err = -EINVAL; - goto out; - } - - sbi->upcase = upcase = kvmalloc(0x10000 * sizeof(short), GFP_KERNEL); - if (!upcase) { - err = -ENOMEM; - goto out; + goto put_inode_out; } for (idx = 0; idx < (0x10000 * sizeof(short) >> PAGE_SHIFT); idx++) { const __le16 *src; - u16 *dst = Add2Ptr(upcase, idx << PAGE_SHIFT); + u16 *dst = Add2Ptr(sbi->upcase, idx << PAGE_SHIFT); struct page *page = ntfs_map_page(inode->i_mapping, idx); if (IS_ERR(page)) { err = PTR_ERR(page); - goto out; + goto put_inode_out; } src = page_address(page); @@ -1294,14 +1213,13 @@ static int ntfs_fill_super(struct super_block *sb, void *data, int silent) ntfs_unmap_page(page); } - shared = ntfs_set_shared(upcase, 0x10000 * sizeof(short)); - if (shared && upcase != shared) { + shared = ntfs_set_shared(sbi->upcase, 0x10000 * sizeof(short)); + if (shared && sbi->upcase != shared) { + kvfree(sbi->upcase); sbi->upcase = shared; - kvfree(upcase); } iput(inode); - inode = NULL; if (is_ntfs3(sbi)) { /* Load $Secure. */ @@ -1331,34 +1249,31 @@ load_root: ref.seq = cpu_to_le16(MFT_REC_ROOT); inode = ntfs_iget5(sb, &ref, &NAME_ROOT); if (IS_ERR(inode)) { - err = PTR_ERR(inode); ntfs_err(sb, "Failed to load root."); - inode = NULL; + err = PTR_ERR(inode); goto out; } - ni = ntfs_i(inode); - sb->s_root = d_make_root(inode); - if (!sb->s_root) { - err = -EINVAL; - goto out; + err = -ENOMEM; + goto put_inode_out; } + fc->fs_private = NULL; + return 0; -out: +put_inode_out: iput(inode); - - if (sb->s_root) { - d_drop(sb->s_root); - sb->s_root = NULL; - } - +out: + /* + * Free resources here. + * ntfs_fs_free will be called with fc->s_fs_info = NULL + */ put_ntfs(sbi); - sb->s_fs_info = NULL; + return err; } @@ -1403,7 +1318,7 @@ int ntfs_discard(struct ntfs_sb_info *sbi, CLST lcn, CLST len) if (sbi->flags & NTFS_FLAGS_NODISCARD) return -EOPNOTSUPP; - if (!sbi->options.discard) + if (!sbi->options->discard) return -EOPNOTSUPP; lbo = (u64)lcn << sbi->cluster_bits; @@ -1428,19 +1343,99 @@ int ntfs_discard(struct ntfs_sb_info *sbi, CLST lcn, CLST len) return err; } -static struct dentry *ntfs_mount(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data) +static int ntfs_fs_get_tree(struct fs_context *fc) +{ + return get_tree_bdev(fc, ntfs_fill_super); +} + +/* + * ntfs_fs_free - Free fs_context. + * + * Note that this will be called after fill_super and reconfigure + * even when they pass. So they have to take pointers if they pass. + */ +static void ntfs_fs_free(struct fs_context *fc) { - return mount_bdev(fs_type, flags, dev_name, data, ntfs_fill_super); + struct ntfs_mount_options *opts = fc->fs_private; + struct ntfs_sb_info *sbi = fc->s_fs_info; + + if (sbi) + put_ntfs(sbi); + + if (opts) + put_mount_options(opts); +} + +static const struct fs_context_operations ntfs_context_ops = { + .parse_param = ntfs_fs_parse_param, + .get_tree = ntfs_fs_get_tree, + .reconfigure = ntfs_fs_reconfigure, + .free = ntfs_fs_free, +}; + +/* + * ntfs_init_fs_context - Initialize spi and opts + * + * This will called when mount/remount. We will first initiliaze + * options so that if remount we can use just that. + */ +static int ntfs_init_fs_context(struct fs_context *fc) +{ + struct ntfs_mount_options *opts; + struct ntfs_sb_info *sbi; + + opts = kzalloc(sizeof(struct ntfs_mount_options), GFP_NOFS); + if (!opts) + return -ENOMEM; + + /* Default options. */ + opts->fs_uid = current_uid(); + opts->fs_gid = current_gid(); + opts->fs_fmask_inv = ~current_umask(); + opts->fs_dmask_inv = ~current_umask(); + + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) + goto ok; + + sbi = kzalloc(sizeof(struct ntfs_sb_info), GFP_NOFS); + if (!sbi) + goto free_opts; + + sbi->upcase = kvmalloc(0x10000 * sizeof(short), GFP_KERNEL); + if (!sbi->upcase) + goto free_sbi; + + ratelimit_state_init(&sbi->msg_ratelimit, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + mutex_init(&sbi->compress.mtx_lznt); +#ifdef CONFIG_NTFS3_LZX_XPRESS + mutex_init(&sbi->compress.mtx_xpress); + mutex_init(&sbi->compress.mtx_lzx); +#endif + + sbi->options = opts; + fc->s_fs_info = sbi; +ok: + fc->fs_private = opts; + fc->ops = &ntfs_context_ops; + + return 0; +free_sbi: + kfree(sbi); +free_opts: + kfree(opts); + return -ENOMEM; } // clang-format off static struct file_system_type ntfs_fs_type = { - .owner = THIS_MODULE, - .name = "ntfs3", - .mount = ntfs_mount, - .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, + .owner = THIS_MODULE, + .name = "ntfs3", + .init_fs_context = ntfs_init_fs_context, + .parameters = ntfs_fs_parameters, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; // clang-format on diff --git a/fs/ntfs3/upcase.c b/fs/ntfs3/upcase.c index bbeba778237e..b5e8256fd710 100644 --- a/fs/ntfs3/upcase.c +++ b/fs/ntfs3/upcase.c @@ -5,13 +5,9 @@ * */ -#include <linux/blkdev.h> -#include <linux/buffer_head.h> -#include <linux/module.h> -#include <linux/nls.h> +#include <linux/kernel.h> +#include <linux/types.h> -#include "debug.h" -#include "ntfs.h" #include "ntfs_fs.h" static inline u16 upcase_unicode_char(const u16 *upcase, u16 chr) diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c index 7282d85c4ece..afd0ddad826f 100644 --- a/fs/ntfs3/xattr.c +++ b/fs/ntfs3/xattr.c @@ -5,10 +5,7 @@ * */ -#include <linux/blkdev.h> -#include <linux/buffer_head.h> #include <linux/fs.h> -#include <linux/nls.h> #include <linux/posix_acl.h> #include <linux/posix_acl_xattr.h> #include <linux/xattr.h> @@ -78,6 +75,7 @@ static int ntfs_read_ea(struct ntfs_inode *ni, struct EA_FULL **ea, size_t add_bytes, const struct EA_INFO **info) { int err; + struct ntfs_sb_info *sbi = ni->mi.sbi; struct ATTR_LIST_ENTRY *le = NULL; struct ATTRIB *attr_info, *attr_ea; void *ea_p; @@ -102,10 +100,10 @@ static int ntfs_read_ea(struct ntfs_inode *ni, struct EA_FULL **ea, /* Check Ea limit. */ size = le32_to_cpu((*info)->size); - if (size > ni->mi.sbi->ea_max_size) + if (size > sbi->ea_max_size) return -EFBIG; - if (attr_size(attr_ea) > ni->mi.sbi->ea_max_size) + if (attr_size(attr_ea) > sbi->ea_max_size) return -EFBIG; /* Allocate memory for packed Ea. */ @@ -113,15 +111,16 @@ static int ntfs_read_ea(struct ntfs_inode *ni, struct EA_FULL **ea, if (!ea_p) return -ENOMEM; - if (attr_ea->non_res) { + if (!size) { + ; + } else if (attr_ea->non_res) { struct runs_tree run; run_init(&run); err = attr_load_runs(attr_ea, ni, &run, NULL); if (!err) - err = ntfs_read_run_nb(ni->mi.sbi, &run, 0, ea_p, size, - NULL); + err = ntfs_read_run_nb(sbi, &run, 0, ea_p, size, NULL); run_close(&run); if (err) @@ -260,7 +259,7 @@ out: static noinline int ntfs_set_ea(struct inode *inode, const char *name, size_t name_len, const void *value, - size_t val_size, int flags, int locked) + size_t val_size, int flags) { struct ntfs_inode *ni = ntfs_i(inode); struct ntfs_sb_info *sbi = ni->mi.sbi; @@ -279,8 +278,7 @@ static noinline int ntfs_set_ea(struct inode *inode, const char *name, u64 new_sz; void *p; - if (!locked) - ni_lock(ni); + ni_lock(ni); run_init(&ea_run); @@ -370,21 +368,22 @@ static noinline int ntfs_set_ea(struct inode *inode, const char *name, new_ea->name[name_len] = 0; memcpy(new_ea->name + name_len + 1, value, val_size); new_pack = le16_to_cpu(ea_info.size_pack) + packed_ea_size(new_ea); - - /* Should fit into 16 bits. */ - if (new_pack > 0xffff) { - err = -EFBIG; // -EINVAL? - goto out; - } ea_info.size_pack = cpu_to_le16(new_pack); - /* New size of ATTR_EA. */ size += add; - if (size > sbi->ea_max_size) { + ea_info.size = cpu_to_le32(size); + + /* + * 1. Check ea_info.size_pack for overflow. + * 2. New attibute size must fit value from $AttrDef + */ + if (new_pack > 0xffff || size > sbi->ea_max_size) { + ntfs_inode_warn( + inode, + "The size of extended attributes must not exceed 64KiB"); err = -EFBIG; // -EINVAL? goto out; } - ea_info.size = cpu_to_le32(size); update_ea: @@ -444,7 +443,7 @@ update_ea: /* Delete xattr, ATTR_EA */ ni_remove_attr_le(ni, attr, mi, le); } else if (attr->non_res) { - err = ntfs_sb_write_run(sbi, &ea_run, 0, ea_all, size); + err = ntfs_sb_write_run(sbi, &ea_run, 0, ea_all, size, 0); if (err) goto out; } else { @@ -468,8 +467,7 @@ update_ea: mark_inode_dirty(&ni->vfs_inode); out: - if (!locked) - ni_unlock(ni); + ni_unlock(ni); run_close(&ea_run); kfree(ea_all); @@ -478,12 +476,6 @@ out: } #ifdef CONFIG_NTFS3_FS_POSIX_ACL -static inline void ntfs_posix_acl_release(struct posix_acl *acl) -{ - if (acl && refcount_dec_and_test(&acl->a_refcount)) - kfree(acl); -} - static struct posix_acl *ntfs_get_acl_ex(struct user_namespace *mnt_userns, struct inode *inode, int type, int locked) @@ -521,12 +513,15 @@ static struct posix_acl *ntfs_get_acl_ex(struct user_namespace *mnt_userns, /* Translate extended attribute to acl. */ if (err >= 0) { acl = posix_acl_from_xattr(mnt_userns, buf, err); - if (!IS_ERR(acl)) - set_cached_acl(inode, type, acl); + } else if (err == -ENODATA) { + acl = NULL; } else { - acl = err == -ENODATA ? NULL : ERR_PTR(err); + acl = ERR_PTR(err); } + if (!IS_ERR(acl)) + set_cached_acl(inode, type, acl); + __putname(buf); return acl; @@ -546,12 +541,13 @@ struct posix_acl *ntfs_get_acl(struct inode *inode, int type, bool rcu) static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *acl, - int type, int locked) + int type) { const char *name; size_t size, name_len; void *value = NULL; int err = 0; + int flags; if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; @@ -561,22 +557,15 @@ static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns, if (acl) { umode_t mode = inode->i_mode; - err = posix_acl_equiv_mode(acl, &mode); - if (err < 0) - return err; + err = posix_acl_update_mode(mnt_userns, inode, &mode, + &acl); + if (err) + goto out; if (inode->i_mode != mode) { inode->i_mode = mode; mark_inode_dirty(inode); } - - if (!err) { - /* - * ACL can be exactly represented in the - * traditional file mode permission bits. - */ - acl = NULL; - } } name = XATTR_NAME_POSIX_ACL_ACCESS; name_len = sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1; @@ -594,20 +583,24 @@ static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns, } if (!acl) { + /* Remove xattr if it can be presented via mode. */ size = 0; value = NULL; + flags = XATTR_REPLACE; } else { size = posix_acl_xattr_size(acl->a_count); value = kmalloc(size, GFP_NOFS); if (!value) return -ENOMEM; - err = posix_acl_to_xattr(mnt_userns, acl, value, size); if (err < 0) goto out; + flags = 0; } - err = ntfs_set_ea(inode, name, name_len, value, size, 0, locked); + err = ntfs_set_ea(inode, name, name_len, value, size, flags); + if (err == -ENODATA && !size) + err = 0; /* Removing non existed xattr. */ if (!err) set_cached_acl(inode, type, acl); @@ -623,68 +616,7 @@ out: int ntfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *acl, int type) { - return ntfs_set_acl_ex(mnt_userns, inode, acl, type, 0); -} - -static int ntfs_xattr_get_acl(struct user_namespace *mnt_userns, - struct inode *inode, int type, void *buffer, - size_t size) -{ - struct posix_acl *acl; - int err; - - if (!(inode->i_sb->s_flags & SB_POSIXACL)) { - ntfs_inode_warn(inode, "add mount option \"acl\" to use acl"); - return -EOPNOTSUPP; - } - - acl = ntfs_get_acl(inode, type, false); - if (IS_ERR(acl)) - return PTR_ERR(acl); - - if (!acl) - return -ENODATA; - - err = posix_acl_to_xattr(mnt_userns, acl, buffer, size); - ntfs_posix_acl_release(acl); - - return err; -} - -static int ntfs_xattr_set_acl(struct user_namespace *mnt_userns, - struct inode *inode, int type, const void *value, - size_t size) -{ - struct posix_acl *acl; - int err; - - if (!(inode->i_sb->s_flags & SB_POSIXACL)) { - ntfs_inode_warn(inode, "add mount option \"acl\" to use acl"); - return -EOPNOTSUPP; - } - - if (!inode_owner_or_capable(mnt_userns, inode)) - return -EPERM; - - if (!value) { - acl = NULL; - } else { - acl = posix_acl_from_xattr(mnt_userns, value, size); - if (IS_ERR(acl)) - return PTR_ERR(acl); - - if (acl) { - err = posix_acl_valid(mnt_userns, acl); - if (err) - goto release_and_out; - } - } - - err = ntfs_set_acl(mnt_userns, inode, acl, type); - -release_and_out: - ntfs_posix_acl_release(acl); - return err; + return ntfs_set_acl_ex(mnt_userns, inode, acl, type); } /* @@ -698,54 +630,27 @@ int ntfs_init_acl(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *default_acl, *acl; int err; - /* - * TODO: Refactoring lock. - * ni_lock(dir) ... -> posix_acl_create(dir,...) -> ntfs_get_acl -> ni_lock(dir) - */ - inode->i_default_acl = NULL; - - default_acl = ntfs_get_acl_ex(mnt_userns, dir, ACL_TYPE_DEFAULT, 1); - - if (!default_acl || default_acl == ERR_PTR(-EOPNOTSUPP)) { - inode->i_mode &= ~current_umask(); - err = 0; - goto out; - } - - if (IS_ERR(default_acl)) { - err = PTR_ERR(default_acl); - goto out; - } - - acl = default_acl; - err = __posix_acl_create(&acl, GFP_NOFS, &inode->i_mode); - if (err < 0) - goto out1; - if (!err) { - posix_acl_release(acl); - acl = NULL; - } + err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); + if (err) + return err; - if (!S_ISDIR(inode->i_mode)) { + if (default_acl) { + err = ntfs_set_acl_ex(mnt_userns, inode, default_acl, + ACL_TYPE_DEFAULT); posix_acl_release(default_acl); - default_acl = NULL; + } else { + inode->i_default_acl = NULL; } - if (default_acl) - err = ntfs_set_acl_ex(mnt_userns, inode, default_acl, - ACL_TYPE_DEFAULT, 1); - if (!acl) inode->i_acl = NULL; - else if (!err) - err = ntfs_set_acl_ex(mnt_userns, inode, acl, ACL_TYPE_ACCESS, - 1); - - posix_acl_release(acl); -out1: - posix_acl_release(default_acl); + else { + if (!err) + err = ntfs_set_acl_ex(mnt_userns, inode, acl, + ACL_TYPE_ACCESS); + posix_acl_release(acl); + } -out: return err; } #endif @@ -772,7 +677,7 @@ int ntfs_acl_chmod(struct user_namespace *mnt_userns, struct inode *inode) int ntfs_permission(struct user_namespace *mnt_userns, struct inode *inode, int mask) { - if (ntfs_sb(inode->i_sb)->options.no_acs_rules) { + if (ntfs_sb(inode->i_sb)->options->noacsrules) { /* "No access rules" mode - Allow all changes. */ return 0; } @@ -880,23 +785,6 @@ static int ntfs_getxattr(const struct xattr_handler *handler, struct dentry *de, goto out; } -#ifdef CONFIG_NTFS3_FS_POSIX_ACL - if ((name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1 && - !memcmp(name, XATTR_NAME_POSIX_ACL_ACCESS, - sizeof(XATTR_NAME_POSIX_ACL_ACCESS))) || - (name_len == sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1 && - !memcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, - sizeof(XATTR_NAME_POSIX_ACL_DEFAULT)))) { - /* TODO: init_user_ns? */ - err = ntfs_xattr_get_acl( - &init_user_ns, inode, - name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1 - ? ACL_TYPE_ACCESS - : ACL_TYPE_DEFAULT, - buffer, size); - goto out; - } -#endif /* Deal with NTFS extended attribute. */ err = ntfs_get_ea(inode, name, name_len, buffer, size, NULL); @@ -1009,24 +897,8 @@ set_new_fa: goto out; } -#ifdef CONFIG_NTFS3_FS_POSIX_ACL - if ((name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1 && - !memcmp(name, XATTR_NAME_POSIX_ACL_ACCESS, - sizeof(XATTR_NAME_POSIX_ACL_ACCESS))) || - (name_len == sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - 1 && - !memcmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, - sizeof(XATTR_NAME_POSIX_ACL_DEFAULT)))) { - err = ntfs_xattr_set_acl( - mnt_userns, inode, - name_len == sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1 - ? ACL_TYPE_ACCESS - : ACL_TYPE_DEFAULT, - value, size); - goto out; - } -#endif /* Deal with NTFS extended attribute. */ - err = ntfs_set_ea(inode, name, name_len, value, size, flags, 0); + err = ntfs_set_ea(inode, name, name_len, value, size, flags); out: return err; @@ -1042,28 +914,29 @@ int ntfs_save_wsl_perm(struct inode *inode) int err; __le32 value; + /* TODO: refactor this, so we don't lock 4 times in ntfs_set_ea */ value = cpu_to_le32(i_uid_read(inode)); err = ntfs_set_ea(inode, "$LXUID", sizeof("$LXUID") - 1, &value, - sizeof(value), 0, 0); + sizeof(value), 0); if (err) goto out; value = cpu_to_le32(i_gid_read(inode)); err = ntfs_set_ea(inode, "$LXGID", sizeof("$LXGID") - 1, &value, - sizeof(value), 0, 0); + sizeof(value), 0); if (err) goto out; value = cpu_to_le32(inode->i_mode); err = ntfs_set_ea(inode, "$LXMOD", sizeof("$LXMOD") - 1, &value, - sizeof(value), 0, 0); + sizeof(value), 0); if (err) goto out; if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { value = cpu_to_le32(inode->i_rdev); err = ntfs_set_ea(inode, "$LXDEV", sizeof("$LXDEV") - 1, &value, - sizeof(value), 0, 0); + sizeof(value), 0); if (err) goto out; } diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index f1cc8258d34a..bb247bc349e4 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5940,6 +5940,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { + ocfs2_commit_trans(osb, handle); mlog_errno(status); goto bail; } @@ -5964,6 +5965,7 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, data_alloc_bh, start_blk, num_clusters); if (status < 0) { + ocfs2_commit_trans(osb, handle); mlog_errno(status); goto bail; } @@ -6921,13 +6923,12 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end, } /* - * Zero the area past i_size but still within an allocated - * cluster. This avoids exposing nonzero data on subsequent file - * extends. + * Zero partial cluster for a hole punch or truncate. This avoids exposing + * nonzero data on subsequent file extends. * * We need to call this before i_size is updated on the inode because * otherwise block_write_full_page() will skip writeout of pages past - * i_size. The new_i_size parameter is passed for this reason. + * i_size. */ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, u64 range_start, u64 range_end) @@ -6945,6 +6946,15 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, if (!ocfs2_sparse_alloc(OCFS2_SB(sb))) return 0; + /* + * Avoid zeroing pages fully beyond current i_size. It is pointless as + * underlying blocks of those pages should be already zeroed out and + * page writeback will skip them anyway. + */ + range_end = min_t(u64, range_end, i_size_read(inode)); + if (range_start >= range_end) + return 0; + pages = kcalloc(ocfs2_pages_per_cluster(sb), sizeof(struct page *), GFP_NOFS); if (pages == NULL) { @@ -6953,9 +6963,6 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, goto out; } - if (range_start == range_end) - goto out; - ret = ocfs2_extent_map_get_blocks(inode, range_start >> sb->s_blocksize_bits, &phys, NULL, &ext_flags); @@ -7045,7 +7052,7 @@ void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di) int ocfs2_convert_inline_data_to_extents(struct inode *inode, struct buffer_head *di_bh) { - int ret, i, has_data, num_pages = 0; + int ret, has_data, num_pages = 0; int need_free = 0; u32 bit_off, num; handle_t *handle; @@ -7054,26 +7061,17 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; struct ocfs2_alloc_context *data_ac = NULL; - struct page **pages = NULL; - loff_t end = osb->s_clustersize; + struct page *page = NULL; struct ocfs2_extent_tree et; int did_quota = 0; has_data = i_size_read(inode) ? 1 : 0; if (has_data) { - pages = kcalloc(ocfs2_pages_per_cluster(osb->sb), - sizeof(struct page *), GFP_NOFS); - if (pages == NULL) { - ret = -ENOMEM; - mlog_errno(ret); - return ret; - } - ret = ocfs2_reserve_clusters(osb, 1, &data_ac); if (ret) { mlog_errno(ret); - goto free_pages; + goto out; } } @@ -7093,7 +7091,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, } if (has_data) { - unsigned int page_end; + unsigned int page_end = min_t(unsigned, PAGE_SIZE, + osb->s_clustersize); u64 phys; ret = dquot_alloc_space_nodirty(inode, @@ -7117,15 +7116,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, */ block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off); - /* - * Non sparse file systems zero on extend, so no need - * to do that now. - */ - if (!ocfs2_sparse_alloc(osb) && - PAGE_SIZE < osb->s_clustersize) - end = PAGE_SIZE; - - ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages); + ret = ocfs2_grab_eof_pages(inode, 0, page_end, &page, + &num_pages); if (ret) { mlog_errno(ret); need_free = 1; @@ -7136,20 +7128,15 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, * This should populate the 1st page for us and mark * it up to date. */ - ret = ocfs2_read_inline_data(inode, pages[0], di_bh); + ret = ocfs2_read_inline_data(inode, page, di_bh); if (ret) { mlog_errno(ret); need_free = 1; goto out_unlock; } - page_end = PAGE_SIZE; - if (PAGE_SIZE > osb->s_clustersize) - page_end = osb->s_clustersize; - - for (i = 0; i < num_pages; i++) - ocfs2_map_and_dirty_page(inode, handle, 0, page_end, - pages[i], i > 0, &phys); + ocfs2_map_and_dirty_page(inode, handle, 0, page_end, page, 0, + &phys); } spin_lock(&oi->ip_lock); @@ -7180,8 +7167,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, } out_unlock: - if (pages) - ocfs2_unlock_and_free_pages(pages, num_pages); + if (page) + ocfs2_unlock_and_free_pages(&page, num_pages); out_commit: if (ret < 0 && did_quota) @@ -7205,8 +7192,6 @@ out_commit: out: if (data_ac) ocfs2_free_alloc_context(data_ac); -free_pages: - kfree(pages); return ret; } diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 0e7aad1b11cc..5cd5f7511dac 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -2698,7 +2698,6 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node) continue; } retry: - ret = -EINVAL; mlog(0, "attempting to send begin reco msg to %d\n", nodenum); ret = o2net_send_message(DLM_BEGIN_RECO_MSG, dlm->key, diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 359524b7341f..801e60bab955 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -3951,7 +3951,7 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres, oi = OCFS2_I(inode); oi->ip_dir_lock_gen++; mlog(0, "generation: %u\n", oi->ip_dir_lock_gen); - goto out; + goto out_forget; } if (!S_ISREG(inode->i_mode)) @@ -3982,6 +3982,7 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres, filemap_fdatawait(mapping); } +out_forget: forget_all_cached_acls(inode); out: diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 54d7843c0211..fc5f780fa235 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -476,10 +476,11 @@ int ocfs2_truncate_file(struct inode *inode, * greater than page size, so we have to truncate them * anyway. */ - unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1); - truncate_inode_pages(inode->i_mapping, new_i_size); if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + unmap_mapping_range(inode->i_mapping, + new_i_size + PAGE_SIZE - 1, 0, 1); + truncate_inode_pages(inode->i_mapping, new_i_size); status = ocfs2_truncate_inline(inode, di_bh, new_i_size, i_size_read(inode), 1); if (status) @@ -498,6 +499,9 @@ int ocfs2_truncate_file(struct inode *inode, goto bail_unlock_sem; } + unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1); + truncate_inode_pages(inode->i_mapping, new_i_size); + status = ocfs2_commit_truncate(osb, inode, di_bh); if (status < 0) { mlog_errno(status); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index bc8f32fab964..6c2411c2afcf 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -125,7 +125,6 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, struct inode *inode = NULL; struct super_block *sb = osb->sb; struct ocfs2_find_inode_args args; - journal_t *journal = OCFS2_SB(sb)->journal->j_journal; trace_ocfs2_iget_begin((unsigned long long)blkno, flags, sysfile_type); @@ -172,10 +171,11 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags, * part of the transaction - the inode could have been reclaimed and * now it is reread from disk. */ - if (journal) { + if (osb->journal) { transaction_t *transaction; tid_t tid; struct ocfs2_inode_info *oi = OCFS2_I(inode); + journal_t *journal = osb->journal->j_journal; read_lock(&journal->j_state_lock); if (journal->j_running_transaction) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 4f15750aac5d..b9c339335a53 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -810,19 +810,34 @@ void ocfs2_set_journal_params(struct ocfs2_super *osb) write_unlock(&journal->j_state_lock); } -int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty) +int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty) { int status = -1; struct inode *inode = NULL; /* the journal inode */ journal_t *j_journal = NULL; + struct ocfs2_journal *journal = NULL; struct ocfs2_dinode *di = NULL; struct buffer_head *bh = NULL; - struct ocfs2_super *osb; int inode_lock = 0; - BUG_ON(!journal); + /* initialize our journal structure */ + journal = kzalloc(sizeof(struct ocfs2_journal), GFP_KERNEL); + if (!journal) { + mlog(ML_ERROR, "unable to alloc journal\n"); + status = -ENOMEM; + goto done; + } + osb->journal = journal; + journal->j_osb = osb; - osb = journal->j_osb; + atomic_set(&journal->j_num_trans, 0); + init_rwsem(&journal->j_trans_barrier); + init_waitqueue_head(&journal->j_checkpointed); + spin_lock_init(&journal->j_lock); + journal->j_trans_id = 1UL; + INIT_LIST_HEAD(&journal->j_la_cleanups); + INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery); + journal->j_state = OCFS2_JOURNAL_FREE; /* already have the inode for our journal */ inode = ocfs2_get_system_file_inode(osb, JOURNAL_SYSTEM_INODE, @@ -1028,9 +1043,10 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb) journal->j_state = OCFS2_JOURNAL_FREE; -// up_write(&journal->j_trans_barrier); done: iput(inode); + kfree(journal); + osb->journal = NULL; } static void ocfs2_clear_journal_error(struct super_block *sb, diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index d158acb8b38a..8dcb2f2cadbc 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -167,8 +167,7 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb); * ocfs2_start_checkpoint - Kick the commit thread to do a checkpoint. */ void ocfs2_set_journal_params(struct ocfs2_super *osb); -int ocfs2_journal_init(struct ocfs2_journal *journal, - int *dirty); +int ocfs2_journal_init(struct ocfs2_super *osb, int *dirty); void ocfs2_journal_shutdown(struct ocfs2_super *osb); int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full); diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 8521942f5af2..481017e1dac5 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -1251,7 +1251,7 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, { struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; struct journal_head *jh; - int ret; + int ret = 1; if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap)) return 0; @@ -1259,14 +1259,18 @@ static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh, if (!buffer_jbd(bg_bh)) return 1; - jh = bh2jh(bg_bh); - spin_lock(&jh->b_state_lock); - bg = (struct ocfs2_group_desc *) jh->b_committed_data; - if (bg) - ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap); - else - ret = 1; - spin_unlock(&jh->b_state_lock); + jbd_lock_bh_journal_head(bg_bh); + if (buffer_jbd(bg_bh)) { + jh = bh2jh(bg_bh); + spin_lock(&jh->b_state_lock); + bg = (struct ocfs2_group_desc *) jh->b_committed_data; + if (bg) + ret = !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap); + else + ret = 1; + spin_unlock(&jh->b_state_lock); + } + jbd_unlock_bh_journal_head(bg_bh); return ret; } diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index c86bd4e60e20..1286b88b6fa1 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1894,8 +1894,6 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) /* This will disable recovery and flush any recovery work. */ ocfs2_recovery_exit(osb); - ocfs2_journal_shutdown(osb); - ocfs2_sync_blockdev(sb); ocfs2_purge_refcount_trees(osb); @@ -1918,6 +1916,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) ocfs2_release_system_inodes(osb); + ocfs2_journal_shutdown(osb); + /* * If we're dismounting due to mount error, mount.ocfs2 will clean * up heartbeat. If we're a local mount, there is no heartbeat. @@ -2016,7 +2016,6 @@ static int ocfs2_initialize_super(struct super_block *sb, int i, cbits, bbits; struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; struct inode *inode = NULL; - struct ocfs2_journal *journal; struct ocfs2_super *osb; u64 total_blocks; @@ -2167,11 +2166,17 @@ static int ocfs2_initialize_super(struct super_block *sb, } if (ocfs2_clusterinfo_valid(osb)) { + /* + * ci_stack and ci_cluster in ocfs2_cluster_info may not be null + * terminated, so make sure no overflow happens here by using + * memcpy. Destination strings will always be null terminated + * because osb is allocated using kzalloc. + */ osb->osb_stackflags = OCFS2_RAW_SB(di)->s_cluster_info.ci_stackflags; - strlcpy(osb->osb_cluster_stack, + memcpy(osb->osb_cluster_stack, OCFS2_RAW_SB(di)->s_cluster_info.ci_stack, - OCFS2_STACK_LABEL_LEN + 1); + OCFS2_STACK_LABEL_LEN); if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) { mlog(ML_ERROR, "couldn't mount because of an invalid " @@ -2180,9 +2185,9 @@ static int ocfs2_initialize_super(struct super_block *sb, status = -EINVAL; goto bail; } - strlcpy(osb->osb_cluster_name, + memcpy(osb->osb_cluster_name, OCFS2_RAW_SB(di)->s_cluster_info.ci_cluster, - OCFS2_CLUSTER_NAME_LEN + 1); + OCFS2_CLUSTER_NAME_LEN); } else { /* The empty string is identical with classic tools that * don't know about s_cluster_info. */ @@ -2191,33 +2196,6 @@ static int ocfs2_initialize_super(struct super_block *sb, get_random_bytes(&osb->s_next_generation, sizeof(u32)); - /* FIXME - * This should be done in ocfs2_journal_init(), but unknown - * ordering issues will cause the filesystem to crash. - * If anyone wants to figure out what part of the code - * refers to osb->journal before ocfs2_journal_init() is run, - * be my guest. - */ - /* initialize our journal structure */ - - journal = kzalloc(sizeof(struct ocfs2_journal), GFP_KERNEL); - if (!journal) { - mlog(ML_ERROR, "unable to alloc journal\n"); - status = -ENOMEM; - goto bail; - } - osb->journal = journal; - journal->j_osb = osb; - - atomic_set(&journal->j_num_trans, 0); - init_rwsem(&journal->j_trans_barrier); - init_waitqueue_head(&journal->j_checkpointed); - spin_lock_init(&journal->j_lock); - journal->j_trans_id = (unsigned long) 1; - INIT_LIST_HEAD(&journal->j_la_cleanups); - INIT_WORK(&journal->j_recovery_work, ocfs2_complete_recovery); - journal->j_state = OCFS2_JOURNAL_FREE; - INIT_WORK(&osb->dquot_drop_work, ocfs2_drop_dquot_refs); init_llist_head(&osb->dquot_drop_list); @@ -2398,7 +2376,7 @@ static int ocfs2_check_volume(struct ocfs2_super *osb) * ourselves. */ /* Init our journal object. */ - status = ocfs2_journal_init(osb->journal, &dirty); + status = ocfs2_journal_init(osb, &dirty); if (status < 0) { mlog(ML_ERROR, "Could not initialize journal!\n"); goto finally; @@ -2507,12 +2485,6 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb) kfree(osb->osb_orphan_wipes); kfree(osb->slot_recovery_generations); - /* FIXME - * This belongs in journal shutdown, but because we have to - * allocate osb->journal at the start of ocfs2_initialize_osb(), - * we free it here. - */ - kfree(osb->journal); kfree(osb->local_alloc_copy); kfree(osb->uuid_str); kfree(osb->vol_label); diff --git a/fs/open.c b/fs/open.c index daa324606a41..f732fb94600c 100644 --- a/fs/open.c +++ b/fs/open.c @@ -856,8 +856,20 @@ static int do_dentry_open(struct file *f, * of THPs into the page cache will fail. */ smp_mb(); - if (filemap_nr_thps(inode->i_mapping)) - truncate_pagecache(inode, 0); + if (filemap_nr_thps(inode->i_mapping)) { + struct address_space *mapping = inode->i_mapping; + + filemap_invalidate_lock(inode->i_mapping); + /* + * unmap_mapping_range just need to be called once + * here, because the private pages is not need to be + * unmapped mapping (e.g. data segment of dynamic + * shared libraries here). + */ + unmap_mapping_range(mapping, 0, 0, 0); + truncate_inode_pages(mapping, 0); + filemap_invalidate_unlock(inode->i_mapping); + } } return 0; @@ -1248,6 +1260,8 @@ SYSCALL_DEFINE4(openat2, int, dfd, const char __user *, filename, if (err) return err; + audit_openat2_how(&tmp); + /* O_LARGEFILE is only allowed for non-O_PATH. */ if (!(tmp.flags & O_PATH) && force_o_largefile()) tmp.flags |= O_LARGEFILE; diff --git a/fs/orangefs/dcache.c b/fs/orangefs/dcache.c index fe484cf93e5c..8bbe9486e3a6 100644 --- a/fs/orangefs/dcache.c +++ b/fs/orangefs/dcache.c @@ -26,8 +26,10 @@ static int orangefs_revalidate_lookup(struct dentry *dentry) gossip_debug(GOSSIP_DCACHE_DEBUG, "%s: attempting lookup.\n", __func__); new_op = op_alloc(ORANGEFS_VFS_OP_LOOKUP); - if (!new_op) + if (!new_op) { + ret = -ENOMEM; goto out_put_parent; + } new_op->upcall.req.lookup.sym_follow = ORANGEFS_LOOKUP_LINK_NO_FOLLOW; new_op->upcall.req.lookup.parent_refn = parent->refn; diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index c1bb4c4b5d67..e5e3e500ed46 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -10,7 +10,7 @@ * Linux VFS inode operations. */ -#include <linux/bvec.h> +#include <linux/blkdev.h> #include <linux/fileattr.h> #include "protocol.h" #include "orangefs-kernel.h" diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c index 2f2e430461b2..d90d8addbfc2 100644 --- a/fs/orangefs/super.c +++ b/fs/orangefs/super.c @@ -11,6 +11,7 @@ #include <linux/parser.h> #include <linux/hashtable.h> +#include <linux/seq_file.h> /* a cache for orangefs-inode objects (i.e. orangefs inode private data) */ static struct kmem_cache *orangefs_inode_cache; @@ -475,7 +476,7 @@ struct dentry *orangefs_mount(struct file_system_type *fst, const char *devname, void *data) { - int ret = -EINVAL; + int ret; struct super_block *sb = ERR_PTR(-EINVAL); struct orangefs_kernel_op_s *new_op; struct dentry *d = ERR_PTR(-EINVAL); @@ -526,7 +527,7 @@ struct dentry *orangefs_mount(struct file_system_type *fst, sb->s_fs_info = kzalloc(sizeof(struct orangefs_sb_info_s), GFP_KERNEL); if (!ORANGEFS_SB(sb)) { d = ERR_PTR(-ENOMEM); - goto free_op; + goto free_sb_and_op; } ret = orangefs_fill_sb(sb, diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index 44fea16751f1..fa125feed0ff 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -280,14 +280,14 @@ static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req) ovl_aio_put(aio_req); } -static void ovl_aio_rw_complete(struct kiocb *iocb, long res, long res2) +static void ovl_aio_rw_complete(struct kiocb *iocb, long res) { struct ovl_aio_req *aio_req = container_of(iocb, struct ovl_aio_req, iocb); struct kiocb *orig_iocb = aio_req->orig_iocb; ovl_aio_cleanup_handler(aio_req); - orig_iocb->ki_complete(orig_iocb, res, res2); + orig_iocb->ki_complete(orig_iocb, res); } static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter) diff --git a/fs/posix_acl.c b/fs/posix_acl.c index f5c25f580dd9..9323a854a60a 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -134,8 +134,7 @@ struct posix_acl *get_acl(struct inode *inode, int type) * to just call ->get_acl to fetch the ACL ourself. (This is going to * be an unlikely race.) */ - if (cmpxchg(p, ACL_NOT_CACHED, sentinel) != ACL_NOT_CACHED) - /* fall through */ ; + cmpxchg(p, ACL_NOT_CACHED, sentinel); /* * Normally, the ACL returned by ->get_acl will be cached. diff --git a/fs/proc/array.c b/fs/proc/array.c index 49be8c8ef555..ff869a66b34e 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -408,9 +408,9 @@ static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) cpumask_pr_args(&task->cpus_mask)); } -static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm) +static inline void task_core_dumping(struct seq_file *m, struct task_struct *task) { - seq_put_decimal_ull(m, "CoreDumping:\t", !!mm->core_state); + seq_put_decimal_ull(m, "CoreDumping:\t", !!task->signal->core_state); seq_putc(m, '\n'); } @@ -436,7 +436,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, if (mm) { task_mem(m, mm); - task_core_dumping(m, mm); + task_core_dumping(m, task); task_thp_status(m, mm); mmput(mm); } @@ -541,7 +541,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, } if (permitted && (!whole || num_threads < 2)) - wchan = get_wchan(task); + wchan = !task_is_running(task); if (!whole) { min_flt = task->min_flt; maj_flt = task->maj_flt; @@ -606,10 +606,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, * * This works with older implementations of procps as well. */ - if (wchan) - seq_puts(m, " 1"); - else - seq_puts(m, " 0"); + seq_put_decimal_ull(m, " ", wchan); seq_put_decimal_ull(m, " ", 0); seq_put_decimal_ull(m, " ", 0); diff --git a/fs/proc/base.c b/fs/proc/base.c index 533d5836eb9a..13eda8de2998 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -67,6 +67,7 @@ #include <linux/mm.h> #include <linux/swap.h> #include <linux/rcupdate.h> +#include <linux/kallsyms.h> #include <linux/stacktrace.h> #include <linux/resource.h> #include <linux/module.h> @@ -386,17 +387,19 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { unsigned long wchan; + char symname[KSYM_NAME_LEN]; - if (ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) - wchan = get_wchan(task); - else - wchan = 0; + if (!ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS)) + goto print0; - if (wchan) - seq_printf(m, "%ps", (void *) wchan); - else - seq_putc(m, '0'); + wchan = get_wchan(task); + if (wchan && !lookup_symbol_name(wchan, symname)) { + seq_puts(m, symname); + return 0; + } +print0: + seq_putc(m, '0'); return 0; } #endif /* CONFIG_KALLSYMS */ @@ -1979,19 +1982,21 @@ static int pid_revalidate(struct dentry *dentry, unsigned int flags) { struct inode *inode; struct task_struct *task; + int ret = 0; - if (flags & LOOKUP_RCU) - return -ECHILD; - - inode = d_inode(dentry); - task = get_proc_task(inode); + rcu_read_lock(); + inode = d_inode_rcu(dentry); + if (!inode) + goto out; + task = pid_task(proc_pid(inode), PIDTYPE_PID); if (task) { pid_update_inode(task, inode); - put_task_struct(task); - return 1; + ret = 1; } - return 0; +out: + rcu_read_unlock(); + return ret; } static inline bool proc_inode_is_dead(struct inode *inode) @@ -3799,7 +3804,10 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx) task = next_tid(task), ctx->pos++) { char name[10 + 1]; unsigned int len; + tid = task_pid_nr_ns(task, ns); + if (!tid) + continue; /* The task has just exited. */ len = snprintf(name, sizeof(name), "%u", tid); if (!proc_fill_cache(file, ctx, name, len, proc_task_instantiate, task, NULL)) { diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 6561a06ef905..4fb8729a68d4 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -24,7 +24,7 @@ #ifdef arch_idle_time -static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu) +u64 get_idle_time(struct kernel_cpustat *kcs, int cpu) { u64 idle; @@ -46,7 +46,7 @@ static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu) #else -static u64 get_idle_time(struct kernel_cpustat *kcs, int cpu) +u64 get_idle_time(struct kernel_cpustat *kcs, int cpu) { u64 idle, idle_usecs = -1ULL; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index cf25be3e0321..ad667dbc96f5 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -397,7 +397,6 @@ struct mem_size_stats { u64 pss_shmem; u64 pss_locked; u64 swap_pss; - bool check_shmem_swap; }; static void smaps_page_accumulate(struct mem_size_stats *mss, @@ -478,9 +477,11 @@ static int smaps_pte_hole(unsigned long addr, unsigned long end, __always_unused int depth, struct mm_walk *walk) { struct mem_size_stats *mss = walk->private; + struct vm_area_struct *vma = walk->vma; - mss->swap += shmem_partial_swap_usage( - walk->vma->vm_file->f_mapping, addr, end); + mss->swap += shmem_partial_swap_usage(walk->vma->vm_file->f_mapping, + linear_page_index(vma, addr), + linear_page_index(vma, end)); return 0; } @@ -488,6 +489,16 @@ static int smaps_pte_hole(unsigned long addr, unsigned long end, #define smaps_pte_hole NULL #endif /* CONFIG_SHMEM */ +static void smaps_pte_hole_lookup(unsigned long addr, struct mm_walk *walk) +{ +#ifdef CONFIG_SHMEM + if (walk->ops->pte_hole) { + /* depth is not used */ + smaps_pte_hole(addr, addr + PAGE_SIZE, 0, walk); + } +#endif +} + static void smaps_pte_entry(pte_t *pte, unsigned long addr, struct mm_walk *walk) { @@ -516,12 +527,8 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, } } else if (is_pfn_swap_entry(swpent)) page = pfn_swap_entry_to_page(swpent); - } else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap - && pte_none(*pte))) { - page = xa_load(&vma->vm_file->f_mapping->i_pages, - linear_page_index(vma, addr)); - if (xa_is_value(page)) - mss->swap += PAGE_SIZE; + } else { + smaps_pte_hole_lookup(addr, walk); return; } @@ -735,8 +742,6 @@ static void smap_gather_stats(struct vm_area_struct *vma, return; #ifdef CONFIG_SHMEM - /* In case of smaps_rollup, reset the value from previous vma */ - mss->check_shmem_swap = false; if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) { /* * For shared or readonly shmem mappings we know that all @@ -754,7 +759,6 @@ static void smap_gather_stats(struct vm_area_struct *vma, !(vma->vm_flags & VM_WRITE))) { mss->swap += shmem_swapped; } else { - mss->check_shmem_swap = true; ops = &smaps_shmem_walk_ops; } } diff --git a/fs/proc/uptime.c b/fs/proc/uptime.c index 5a1b228964fb..deb99bc9b7e6 100644 --- a/fs/proc/uptime.c +++ b/fs/proc/uptime.c @@ -12,18 +12,22 @@ static int uptime_proc_show(struct seq_file *m, void *v) { struct timespec64 uptime; struct timespec64 idle; - u64 nsec; + u64 idle_nsec; u32 rem; int i; - nsec = 0; - for_each_possible_cpu(i) - nsec += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE]; + idle_nsec = 0; + for_each_possible_cpu(i) { + struct kernel_cpustat kcs; + + kcpustat_cpu_fetch(&kcs, i); + idle_nsec += get_idle_time(&kcs, i); + } ktime_get_boottime_ts64(&uptime); timens_add_boottime(&uptime); - idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem); + idle.tv_sec = div_u64_rem(idle_nsec, NSEC_PER_SEC, &rem); idle.tv_nsec = rem; seq_printf(m, "%lu.%02lu %lu.%02lu\n", (unsigned long) uptime.tv_sec, diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 9a15334da208..30a3b66f475a 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -26,7 +26,7 @@ #include <linux/vmalloc.h> #include <linux/pagemap.h> #include <linux/uaccess.h> -#include <linux/mem_encrypt.h> +#include <linux/cc_platform.h> #include <asm/io.h> #include "internal.h" @@ -62,46 +62,75 @@ core_param(novmcoredd, vmcoredd_disabled, bool, 0); /* Device Dump Size */ static size_t vmcoredd_orig_sz; -/* - * Returns > 0 for RAM pages, 0 for non-RAM pages, < 0 on error - * The called function has to take care of module refcounting. - */ -static int (*oldmem_pfn_is_ram)(unsigned long pfn); +static DECLARE_RWSEM(vmcore_cb_rwsem); +/* List of registered vmcore callbacks. */ +static LIST_HEAD(vmcore_cb_list); +/* Whether we had a surprise unregistration of a callback. */ +static bool vmcore_cb_unstable; +/* Whether the vmcore has been opened once. */ +static bool vmcore_opened; -int register_oldmem_pfn_is_ram(int (*fn)(unsigned long pfn)) +void register_vmcore_cb(struct vmcore_cb *cb) { - if (oldmem_pfn_is_ram) - return -EBUSY; - oldmem_pfn_is_ram = fn; - return 0; + down_write(&vmcore_cb_rwsem); + INIT_LIST_HEAD(&cb->next); + list_add_tail(&cb->next, &vmcore_cb_list); + /* + * Registering a vmcore callback after the vmcore was opened is + * very unusual (e.g., manual driver loading). + */ + if (vmcore_opened) + pr_warn_once("Unexpected vmcore callback registration\n"); + up_write(&vmcore_cb_rwsem); } -EXPORT_SYMBOL_GPL(register_oldmem_pfn_is_ram); +EXPORT_SYMBOL_GPL(register_vmcore_cb); -void unregister_oldmem_pfn_is_ram(void) +void unregister_vmcore_cb(struct vmcore_cb *cb) { - oldmem_pfn_is_ram = NULL; - wmb(); + down_write(&vmcore_cb_rwsem); + list_del(&cb->next); + /* + * Unregistering a vmcore callback after the vmcore was opened is + * very unusual (e.g., forced driver removal), but we cannot stop + * unregistering. + */ + if (vmcore_opened) { + pr_warn_once("Unexpected vmcore callback unregistration\n"); + vmcore_cb_unstable = true; + } + up_write(&vmcore_cb_rwsem); } -EXPORT_SYMBOL_GPL(unregister_oldmem_pfn_is_ram); +EXPORT_SYMBOL_GPL(unregister_vmcore_cb); -static int pfn_is_ram(unsigned long pfn) +static bool pfn_is_ram(unsigned long pfn) { - int (*fn)(unsigned long pfn); - /* pfn is ram unless fn() checks pagetype */ - int ret = 1; + struct vmcore_cb *cb; + bool ret = true; - /* - * Ask hypervisor if the pfn is really ram. - * A ballooned page contains no data and reading from such a page - * will cause high load in the hypervisor. - */ - fn = oldmem_pfn_is_ram; - if (fn) - ret = fn(pfn); + lockdep_assert_held_read(&vmcore_cb_rwsem); + if (unlikely(vmcore_cb_unstable)) + return false; + + list_for_each_entry(cb, &vmcore_cb_list, next) { + if (unlikely(!cb->pfn_is_ram)) + continue; + ret = cb->pfn_is_ram(cb, pfn); + if (!ret) + break; + } return ret; } +static int open_vmcore(struct inode *inode, struct file *file) +{ + down_read(&vmcore_cb_rwsem); + vmcore_opened = true; + up_read(&vmcore_cb_rwsem); + + return 0; +} + /* Reads a page from the oldmem device from given offset. */ ssize_t read_from_oldmem(char *buf, size_t count, u64 *ppos, int userbuf, @@ -117,6 +146,7 @@ ssize_t read_from_oldmem(char *buf, size_t count, offset = (unsigned long)(*ppos % PAGE_SIZE); pfn = (unsigned long)(*ppos / PAGE_SIZE); + down_read(&vmcore_cb_rwsem); do { if (count > (PAGE_SIZE - offset)) nr_bytes = PAGE_SIZE - offset; @@ -124,7 +154,7 @@ ssize_t read_from_oldmem(char *buf, size_t count, nr_bytes = count; /* If pfn is not ram, return zeros for sparse dump files */ - if (pfn_is_ram(pfn) == 0) + if (!pfn_is_ram(pfn)) memset(buf, 0, nr_bytes); else { if (encrypted) @@ -136,8 +166,10 @@ ssize_t read_from_oldmem(char *buf, size_t count, tmp = copy_oldmem_page(pfn, buf, nr_bytes, offset, userbuf); - if (tmp < 0) + if (tmp < 0) { + up_read(&vmcore_cb_rwsem); return tmp; + } } *ppos += nr_bytes; count -= nr_bytes; @@ -147,6 +179,7 @@ ssize_t read_from_oldmem(char *buf, size_t count, offset = 0; } while (count); + up_read(&vmcore_cb_rwsem); return read; } @@ -177,7 +210,7 @@ ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos) */ ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos) { - return read_from_oldmem(buf, count, ppos, 0, mem_encrypt_active()); + return read_from_oldmem(buf, count, ppos, 0, cc_platform_has(CC_ATTR_MEM_ENCRYPT)); } /* @@ -378,7 +411,7 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos, buflen); start = m->paddr + *fpos - m->offset; tmp = read_from_oldmem(buffer, tsz, &start, - userbuf, mem_encrypt_active()); + userbuf, cc_platform_has(CC_ATTR_MEM_ENCRYPT)); if (tmp < 0) return tmp; buflen -= tsz; @@ -537,14 +570,19 @@ static int vmcore_remap_oldmem_pfn(struct vm_area_struct *vma, unsigned long from, unsigned long pfn, unsigned long size, pgprot_t prot) { + int ret; + /* * Check if oldmem_pfn_is_ram was registered to avoid * looping over all pages without a reason. */ - if (oldmem_pfn_is_ram) - return remap_oldmem_pfn_checked(vma, from, pfn, size, prot); + down_read(&vmcore_cb_rwsem); + if (!list_empty(&vmcore_cb_list) || vmcore_cb_unstable) + ret = remap_oldmem_pfn_checked(vma, from, pfn, size, prot); else - return remap_oldmem_pfn_range(vma, from, pfn, size, prot); + ret = remap_oldmem_pfn_range(vma, from, pfn, size, prot); + up_read(&vmcore_cb_rwsem); + return ret; } static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) @@ -668,6 +706,7 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma) #endif static const struct proc_ops vmcore_proc_ops = { + .proc_open = open_vmcore, .proc_read = read_vmcore, .proc_lseek = default_llseek, .proc_mmap = mmap_vmcore, diff --git a/fs/pstore/blk.c b/fs/pstore/blk.c index 04ce58c939a0..5d1fbaffd66a 100644 --- a/fs/pstore/blk.c +++ b/fs/pstore/blk.c @@ -205,7 +205,6 @@ static ssize_t psblk_generic_blk_write(const char *buf, size_t bytes, static int __register_pstore_blk(struct pstore_device_info *dev, const char *devpath) { - struct inode *inode; int ret = -ENODEV; lockdep_assert_held(&pstore_blk_lock); @@ -217,14 +216,13 @@ static int __register_pstore_blk(struct pstore_device_info *dev, goto err; } - inode = file_inode(psblk_file); - if (!S_ISBLK(inode->i_mode)) { + if (!S_ISBLK(file_inode(psblk_file)->i_mode)) { pr_err("'%s' is not block device!\n", devpath); goto err_fput; } - inode = I_BDEV(psblk_file->f_mapping->host)->bd_inode; - dev->zone.total_size = i_size_read(inode); + dev->zone.total_size = + bdev_nr_bytes(I_BDEV(psblk_file->f_mapping->host)); ret = __register_pstore_device(dev); if (ret) diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c index a6ee23aadd28..66645a5a35f3 100644 --- a/fs/qnx4/dir.c +++ b/fs/qnx4/dir.c @@ -15,13 +15,48 @@ #include <linux/buffer_head.h> #include "qnx4.h" +/* + * A qnx4 directory entry is an inode entry or link info + * depending on the status field in the last byte. The + * first byte is where the name start either way, and a + * zero means it's empty. + * + * Also, due to a bug in gcc, we don't want to use the + * real (differently sized) name arrays in the inode and + * link entries, but always the 'de_name[]' one in the + * fake struct entry. + * + * See + * + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99578#c6 + * + * for details, but basically gcc will take the size of the + * 'name' array from one of the used union entries randomly. + * + * This use of 'de_name[]' (48 bytes) avoids the false positive + * warnings that would happen if gcc decides to use 'inode.di_name' + * (16 bytes) even when the pointer and size were to come from + * 'link.dl_name' (48 bytes). + * + * In all cases the actual name pointer itself is the same, it's + * only the gcc internal 'what is the size of this field' logic + * that can get confused. + */ +union qnx4_directory_entry { + struct { + const char de_name[48]; + u8 de_pad[15]; + u8 de_status; + }; + struct qnx4_inode_entry inode; + struct qnx4_link_info link; +}; + static int qnx4_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); unsigned int offset; struct buffer_head *bh; - struct qnx4_inode_entry *de; - struct qnx4_link_info *le; unsigned long blknum; int ix, ino; int size; @@ -38,27 +73,27 @@ static int qnx4_readdir(struct file *file, struct dir_context *ctx) } ix = (ctx->pos >> QNX4_DIR_ENTRY_SIZE_BITS) % QNX4_INODES_PER_BLOCK; for (; ix < QNX4_INODES_PER_BLOCK; ix++, ctx->pos += QNX4_DIR_ENTRY_SIZE) { + union qnx4_directory_entry *de; + offset = ix * QNX4_DIR_ENTRY_SIZE; - de = (struct qnx4_inode_entry *) (bh->b_data + offset); - if (!de->di_fname[0]) + de = (union qnx4_directory_entry *) (bh->b_data + offset); + + if (!de->de_name[0]) continue; - if (!(de->di_status & (QNX4_FILE_USED|QNX4_FILE_LINK))) + if (!(de->de_status & (QNX4_FILE_USED|QNX4_FILE_LINK))) continue; - if (!(de->di_status & QNX4_FILE_LINK)) - size = QNX4_SHORT_NAME_MAX; - else - size = QNX4_NAME_MAX; - size = strnlen(de->di_fname, size); - QNX4DEBUG((KERN_INFO "qnx4_readdir:%.*s\n", size, de->di_fname)); - if (!(de->di_status & QNX4_FILE_LINK)) + if (!(de->de_status & QNX4_FILE_LINK)) { + size = sizeof(de->inode.di_fname); ino = blknum * QNX4_INODES_PER_BLOCK + ix - 1; - else { - le = (struct qnx4_link_info*)de; - ino = ( le32_to_cpu(le->dl_inode_blk) - 1 ) * + } else { + size = sizeof(de->link.dl_fname); + ino = ( le32_to_cpu(de->link.dl_inode_blk) - 1 ) * QNX4_INODES_PER_BLOCK + - le->dl_inode_ndx; + de->link.dl_inode_ndx; } - if (!dir_emit(ctx, de->di_fname, size, ino, DT_UNKNOWN)) { + size = strnlen(de->de_name, size); + QNX4DEBUG((KERN_INFO "qnx4_readdir:%.*s\n", size, name)); + if (!dir_emit(ctx, de->de_name, size, ino, DT_UNKNOWN)) { brelse(bh); return 0; } diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 2bcc9a6f1bfc..052f143e2e0e 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -10,6 +10,7 @@ #include <linux/namei.h> #include <linux/slab.h> #include <asm/current.h> +#include <linux/blkdev.h> #include <linux/uaccess.h> #include <linux/kernel.h> #include <linux/security.h> diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c index d3e995e1046f..5f2405994280 100644 --- a/fs/quota/quota_tree.c +++ b/fs/quota/quota_tree.c @@ -414,6 +414,7 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot, quota_error(dquot->dq_sb, "Quota structure has offset to " "other block (%u) than it should (%u)", blk, (uint)(dquot->dq_off >> info->dqi_blocksize_bits)); + ret = -EIO; goto out_buf; } ret = read_blk(info, blk, buf); @@ -479,6 +480,13 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, goto out_buf; } newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); + if (newblk < QT_TREEOFF || newblk >= info->dqi_blocks) { + quota_error(dquot->dq_sb, "Getting block too big (%u >= %u)", + newblk, info->dqi_blocks); + ret = -EUCLEAN; + goto out_buf; + } + if (depth == info->dqi_qtree_depth - 1) { ret = free_dqentry(info, dquot, newblk); newblk = 0; @@ -578,6 +586,13 @@ static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info, blk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); if (!blk) /* No reference? */ goto out_buf; + if (blk < QT_TREEOFF || blk >= info->dqi_blocks) { + quota_error(dquot->dq_sb, "Getting block too big (%u >= %u)", + blk, info->dqi_blocks); + ret = -EUCLEAN; + goto out_buf; + } + if (depth < info->dqi_qtree_depth - 1) ret = find_tree_dqentry(info, dquot, blk, depth+1); else diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 65e7e56005b8..bc66d0173e33 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -38,6 +38,7 @@ #include <linux/uaccess.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> +#include <linux/seq_file.h> #include "internal.h" struct ramfs_mount_opts { @@ -203,17 +204,20 @@ static int ramfs_parse_param(struct fs_context *fc, struct fs_parameter *param) int opt; opt = fs_parse(fc, ramfs_fs_parameters, param, &result); - if (opt < 0) { + if (opt == -ENOPARAM) { + opt = vfs_parse_fs_param_source(fc, param); + if (opt != -ENOPARAM) + return opt; /* * We might like to report bad mount options here; * but traditionally ramfs has ignored all mount options, * and as it is used as a !CONFIG_SHMEM simple substitute * for tmpfs, better continue to ignore other mount options. */ - if (opt == -ENOPARAM) - opt = 0; - return opt; + return 0; } + if (opt < 0) + return opt; switch (opt) { case Opt_mode: diff --git a/fs/read_write.c b/fs/read_write.c index af057c57bdc6..0074afa7ecb3 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -368,10 +368,6 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t if (unlikely((ssize_t) count < 0)) return -EINVAL; - /* - * ranged mandatory locking does not apply to streams - it makes sense - * only for files where position has a meaning. - */ if (ppos) { loff_t pos = *ppos; diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 58481f8d63d5..82e09901462e 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -1199,9 +1199,7 @@ static int reiserfs_parse_options(struct super_block *s, if (!strcmp(arg, "auto")) { /* From JFS code, to auto-get the size. */ - *blocks = - i_size_read(s->s_bdev->bd_inode) >> s-> - s_blocksize_bits; + *blocks = sb_bdev_nr_blocks(s); } else { *blocks = simple_strtoul(arg, &p, 0); if (*p != '\0') { @@ -1437,7 +1435,6 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) unsigned long safe_mask = 0; unsigned int commit_max_age = (unsigned int)-1; struct reiserfs_journal *journal = SB_JOURNAL(s); - char *new_opts; int err; char *qf_names[REISERFS_MAXQUOTAS]; unsigned int qfmt = 0; @@ -1445,10 +1442,6 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) int i; #endif - new_opts = kstrdup(arg, GFP_KERNEL); - if (arg && !new_opts) - return -ENOMEM; - sync_filesystem(s); reiserfs_write_lock(s); @@ -1599,7 +1592,6 @@ out_ok_unlocked: out_err_unlock: reiserfs_write_unlock(s); out_err: - kfree(new_opts); return err; } @@ -1986,9 +1978,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) * smaller than the filesystem. If the check fails then abort and * scream, because bad stuff will happen otherwise. */ - if (s->s_bdev && s->s_bdev->bd_inode - && i_size_read(s->s_bdev->bd_inode) < - sb_block_count(rs) * sb_blocksize(rs)) { + if (bdev_nr_bytes(s->s_bdev) < sb_block_count(rs) * sb_blocksize(rs)) { SWARN(silent, s, "", "Filesystem cannot be " "mounted because it is bigger than the device"); SWARN(silent, s, "", "You may need to run fsck " diff --git a/fs/seq_file.c b/fs/seq_file.c index 4a2cda04d3e2..f8e1f4ee87ff 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -383,22 +383,6 @@ void seq_escape_mem(struct seq_file *m, const char *src, size_t len, } EXPORT_SYMBOL(seq_escape_mem); -/** - * seq_escape - print string into buffer, escaping some characters - * @m: target buffer - * @s: string - * @esc: set of characters that need escaping - * - * Puts string into buffer, replacing each occurrence of character from - * @esc with usual octal escape. - * Use seq_has_overflowed() to check for errors. - */ -void seq_escape(struct seq_file *m, const char *s, const char *esc) -{ - seq_escape_str(m, s, ESCAPE_OCTAL, esc); -} -EXPORT_SYMBOL(seq_escape); - void seq_vprintf(struct seq_file *m, const char *f, va_list args) { int len; diff --git a/fs/smbfs_common/smb2pdu.h b/fs/smbfs_common/smb2pdu.h new file mode 100644 index 000000000000..7ccadcbe684b --- /dev/null +++ b/fs/smbfs_common/smb2pdu.h @@ -0,0 +1,989 @@ +/* SPDX-License-Identifier: LGPL-2.1 */ +#ifndef _COMMON_SMB2PDU_H +#define _COMMON_SMB2PDU_H + +/* + * Note that, due to trying to use names similar to the protocol specifications, + * there are many mixed case field names in the structures below. Although + * this does not match typical Linux kernel style, it is necessary to be + * able to match against the protocol specfication. + * + * SMB2 commands + * Some commands have minimal (wct=0,bcc=0), or uninteresting, responses + * (ie no useful data other than the SMB error code itself) and are marked such. + * Knowing this helps avoid response buffer allocations and copy in some cases. + */ + +/* List of commands in host endian */ +#define SMB2_NEGOTIATE_HE 0x0000 +#define SMB2_SESSION_SETUP_HE 0x0001 +#define SMB2_LOGOFF_HE 0x0002 /* trivial request/resp */ +#define SMB2_TREE_CONNECT_HE 0x0003 +#define SMB2_TREE_DISCONNECT_HE 0x0004 /* trivial req/resp */ +#define SMB2_CREATE_HE 0x0005 +#define SMB2_CLOSE_HE 0x0006 +#define SMB2_FLUSH_HE 0x0007 /* trivial resp */ +#define SMB2_READ_HE 0x0008 +#define SMB2_WRITE_HE 0x0009 +#define SMB2_LOCK_HE 0x000A +#define SMB2_IOCTL_HE 0x000B +#define SMB2_CANCEL_HE 0x000C +#define SMB2_ECHO_HE 0x000D +#define SMB2_QUERY_DIRECTORY_HE 0x000E +#define SMB2_CHANGE_NOTIFY_HE 0x000F +#define SMB2_QUERY_INFO_HE 0x0010 +#define SMB2_SET_INFO_HE 0x0011 +#define SMB2_OPLOCK_BREAK_HE 0x0012 + +/* The same list in little endian */ +#define SMB2_NEGOTIATE cpu_to_le16(SMB2_NEGOTIATE_HE) +#define SMB2_SESSION_SETUP cpu_to_le16(SMB2_SESSION_SETUP_HE) +#define SMB2_LOGOFF cpu_to_le16(SMB2_LOGOFF_HE) +#define SMB2_TREE_CONNECT cpu_to_le16(SMB2_TREE_CONNECT_HE) +#define SMB2_TREE_DISCONNECT cpu_to_le16(SMB2_TREE_DISCONNECT_HE) +#define SMB2_CREATE cpu_to_le16(SMB2_CREATE_HE) +#define SMB2_CLOSE cpu_to_le16(SMB2_CLOSE_HE) +#define SMB2_FLUSH cpu_to_le16(SMB2_FLUSH_HE) +#define SMB2_READ cpu_to_le16(SMB2_READ_HE) +#define SMB2_WRITE cpu_to_le16(SMB2_WRITE_HE) +#define SMB2_LOCK cpu_to_le16(SMB2_LOCK_HE) +#define SMB2_IOCTL cpu_to_le16(SMB2_IOCTL_HE) +#define SMB2_CANCEL cpu_to_le16(SMB2_CANCEL_HE) +#define SMB2_ECHO cpu_to_le16(SMB2_ECHO_HE) +#define SMB2_QUERY_DIRECTORY cpu_to_le16(SMB2_QUERY_DIRECTORY_HE) +#define SMB2_CHANGE_NOTIFY cpu_to_le16(SMB2_CHANGE_NOTIFY_HE) +#define SMB2_QUERY_INFO cpu_to_le16(SMB2_QUERY_INFO_HE) +#define SMB2_SET_INFO cpu_to_le16(SMB2_SET_INFO_HE) +#define SMB2_OPLOCK_BREAK cpu_to_le16(SMB2_OPLOCK_BREAK_HE) + +#define SMB2_INTERNAL_CMD cpu_to_le16(0xFFFF) + +#define NUMBER_OF_SMB2_COMMANDS 0x0013 + +/* + * SMB2 Header Definition + * + * "MBZ" : Must be Zero + * "BB" : BugBug, Something to check/review/analyze later + * "PDU" : "Protocol Data Unit" (ie a network "frame") + * + */ + +#define __SMB2_HEADER_STRUCTURE_SIZE 64 +#define SMB2_HEADER_STRUCTURE_SIZE \ + cpu_to_le16(__SMB2_HEADER_STRUCTURE_SIZE) + +#define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe) +#define SMB2_TRANSFORM_PROTO_NUM cpu_to_le32(0x424d53fd) +#define SMB2_COMPRESSION_TRANSFORM_ID cpu_to_le32(0x424d53fc) + +/* + * SMB2 flag definitions + */ +#define SMB2_FLAGS_SERVER_TO_REDIR cpu_to_le32(0x00000001) +#define SMB2_FLAGS_ASYNC_COMMAND cpu_to_le32(0x00000002) +#define SMB2_FLAGS_RELATED_OPERATIONS cpu_to_le32(0x00000004) +#define SMB2_FLAGS_SIGNED cpu_to_le32(0x00000008) +#define SMB2_FLAGS_PRIORITY_MASK cpu_to_le32(0x00000070) /* SMB3.1.1 */ +#define SMB2_FLAGS_DFS_OPERATIONS cpu_to_le32(0x10000000) +#define SMB2_FLAGS_REPLAY_OPERATION cpu_to_le32(0x20000000) /* SMB3 & up */ + +/* See MS-SMB2 section 2.2.1 */ +struct smb2_hdr { + __le32 ProtocolId; /* 0xFE 'S' 'M' 'B' */ + __le16 StructureSize; /* 64 */ + __le16 CreditCharge; /* MBZ */ + __le32 Status; /* Error from server */ + __le16 Command; + __le16 CreditRequest; /* CreditResponse */ + __le32 Flags; + __le32 NextCommand; + __le64 MessageId; + union { + struct { + __le32 ProcessId; + __le32 TreeId; + } __packed SyncId; + __le64 AsyncId; + } __packed Id; + __le64 SessionId; + __u8 Signature[16]; +} __packed; + +struct smb2_pdu { + struct smb2_hdr hdr; + __le16 StructureSize2; /* size of wct area (varies, request specific) */ +} __packed; + +#define SMB3_AES_CCM_NONCE 11 +#define SMB3_AES_GCM_NONCE 12 + +/* Transform flags (for 3.0 dialect this flag indicates CCM */ +#define TRANSFORM_FLAG_ENCRYPTED 0x0001 +struct smb2_transform_hdr { + __le32 ProtocolId; /* 0xFD 'S' 'M' 'B' */ + __u8 Signature[16]; + __u8 Nonce[16]; + __le32 OriginalMessageSize; + __u16 Reserved1; + __le16 Flags; /* EncryptionAlgorithm for 3.0, enc enabled for 3.1.1 */ + __le64 SessionId; +} __packed; + + +/* See MS-SMB2 2.2.42 */ +struct smb2_compression_transform_hdr_unchained { + __le32 ProtocolId; /* 0xFC 'S' 'M' 'B' */ + __le32 OriginalCompressedSegmentSize; + __le16 CompressionAlgorithm; + __le16 Flags; + __le16 Length; /* if chained it is length, else offset */ +} __packed; + +/* See MS-SMB2 2.2.42.1 */ +#define SMB2_COMPRESSION_FLAG_NONE 0x0000 +#define SMB2_COMPRESSION_FLAG_CHAINED 0x0001 + +struct compression_payload_header { + __le16 CompressionAlgorithm; + __le16 Flags; + __le32 Length; /* length of compressed playload including field below if present */ + /* __le32 OriginalPayloadSize; */ /* optional, present when LZNT1, LZ77, LZ77+Huffman */ +} __packed; + +/* See MS-SMB2 2.2.42.2 */ +struct smb2_compression_transform_hdr_chained { + __le32 ProtocolId; /* 0xFC 'S' 'M' 'B' */ + __le32 OriginalCompressedSegmentSize; + /* struct compression_payload_header[] */ +} __packed; + +/* See MS-SMB2 2.2.42.2.2 */ +struct compression_pattern_payload_v1 { + __le16 Pattern; + __le16 Reserved1; + __le16 Reserved2; + __le32 Repetitions; +} __packed; + +/* See MS-SMB2 section 2.2.9.2 */ +/* Context Types */ +#define SMB2_RESERVED_TREE_CONNECT_CONTEXT_ID 0x0000 +#define SMB2_REMOTED_IDENTITY_TREE_CONNECT_CONTEXT_ID cpu_to_le16(0x0001) + +struct tree_connect_contexts { + __le16 ContextType; + __le16 DataLength; + __le32 Reserved; + __u8 Data[]; +} __packed; + +/* Remoted identity tree connect context structures - see MS-SMB2 2.2.9.2.1 */ +struct smb3_blob_data { + __le16 BlobSize; + __u8 BlobData[]; +} __packed; + +/* Valid values for Attr */ +#define SE_GROUP_MANDATORY 0x00000001 +#define SE_GROUP_ENABLED_BY_DEFAULT 0x00000002 +#define SE_GROUP_ENABLED 0x00000004 +#define SE_GROUP_OWNER 0x00000008 +#define SE_GROUP_USE_FOR_DENY_ONLY 0x00000010 +#define SE_GROUP_INTEGRITY 0x00000020 +#define SE_GROUP_INTEGRITY_ENABLED 0x00000040 +#define SE_GROUP_RESOURCE 0x20000000 +#define SE_GROUP_LOGON_ID 0xC0000000 + +/* struct sid_attr_data is SidData array in BlobData format then le32 Attr */ + +struct sid_array_data { + __le16 SidAttrCount; + /* SidAttrList - array of sid_attr_data structs */ +} __packed; + +struct luid_attr_data { + +} __packed; + +/* + * struct privilege_data is the same as BLOB_DATA - see MS-SMB2 2.2.9.2.1.5 + * but with size of LUID_ATTR_DATA struct and BlobData set to LUID_ATTR DATA + */ + +struct privilege_array_data { + __le16 PrivilegeCount; + /* array of privilege_data structs */ +} __packed; + +struct remoted_identity_tcon_context { + __le16 TicketType; /* must be 0x0001 */ + __le16 TicketSize; /* total size of this struct */ + __le16 User; /* offset to SID_ATTR_DATA struct with user info */ + __le16 UserName; /* offset to null terminated Unicode username string */ + __le16 Domain; /* offset to null terminated Unicode domain name */ + __le16 Groups; /* offset to SID_ARRAY_DATA struct with group info */ + __le16 RestrictedGroups; /* similar to above */ + __le16 Privileges; /* offset to PRIVILEGE_ARRAY_DATA struct */ + __le16 PrimaryGroup; /* offset to SID_ARRAY_DATA struct */ + __le16 Owner; /* offset to BLOB_DATA struct */ + __le16 DefaultDacl; /* offset to BLOB_DATA struct */ + __le16 DeviceGroups; /* offset to SID_ARRAY_DATA struct */ + __le16 UserClaims; /* offset to BLOB_DATA struct */ + __le16 DeviceClaims; /* offset to BLOB_DATA struct */ + __u8 TicketInfo[]; /* variable length buf - remoted identity data */ +} __packed; + +struct smb2_tree_connect_req_extension { + __le32 TreeConnectContextOffset; + __le16 TreeConnectContextCount; + __u8 Reserved[10]; + __u8 PathName[]; /* variable sized array */ + /* followed by array of TreeConnectContexts */ +} __packed; + +/* Flags/Reserved for SMB3.1.1 */ +#define SMB2_TREE_CONNECT_FLAG_CLUSTER_RECONNECT cpu_to_le16(0x0001) +#define SMB2_TREE_CONNECT_FLAG_REDIRECT_TO_OWNER cpu_to_le16(0x0002) +#define SMB2_TREE_CONNECT_FLAG_EXTENSION_PRESENT cpu_to_le16(0x0004) + +struct smb2_tree_connect_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 9 */ + __le16 Flags; /* Flags in SMB3.1.1 */ + __le16 PathOffset; + __le16 PathLength; + __u8 Buffer[1]; /* variable length */ +} __packed; + +/* Possible ShareType values */ +#define SMB2_SHARE_TYPE_DISK 0x01 +#define SMB2_SHARE_TYPE_PIPE 0x02 +#define SMB2_SHARE_TYPE_PRINT 0x03 + +/* + * Possible ShareFlags - exactly one and only one of the first 4 caching flags + * must be set (any of the remaining, SHI1005, flags may be set individually + * or in combination. + */ +#define SMB2_SHAREFLAG_MANUAL_CACHING 0x00000000 +#define SMB2_SHAREFLAG_AUTO_CACHING 0x00000010 +#define SMB2_SHAREFLAG_VDO_CACHING 0x00000020 +#define SMB2_SHAREFLAG_NO_CACHING 0x00000030 +#define SHI1005_FLAGS_DFS 0x00000001 +#define SHI1005_FLAGS_DFS_ROOT 0x00000002 +#define SHI1005_FLAGS_RESTRICT_EXCLUSIVE_OPENS 0x00000100 +#define SHI1005_FLAGS_FORCE_SHARED_DELETE 0x00000200 +#define SHI1005_FLAGS_ALLOW_NAMESPACE_CACHING 0x00000400 +#define SHI1005_FLAGS_ACCESS_BASED_DIRECTORY_ENUM 0x00000800 +#define SHI1005_FLAGS_FORCE_LEVELII_OPLOCK 0x00001000 +#define SHI1005_FLAGS_ENABLE_HASH_V1 0x00002000 +#define SHI1005_FLAGS_ENABLE_HASH_V2 0x00004000 +#define SHI1005_FLAGS_ENCRYPT_DATA 0x00008000 +#define SMB2_SHAREFLAG_IDENTITY_REMOTING 0x00040000 /* 3.1.1 */ +#define SMB2_SHAREFLAG_COMPRESS_DATA 0x00100000 /* 3.1.1 */ +#define SHI1005_FLAGS_ALL 0x0014FF33 + +/* Possible share capabilities */ +#define SMB2_SHARE_CAP_DFS cpu_to_le32(0x00000008) /* all dialects */ +#define SMB2_SHARE_CAP_CONTINUOUS_AVAILABILITY cpu_to_le32(0x00000010) /* 3.0 */ +#define SMB2_SHARE_CAP_SCALEOUT cpu_to_le32(0x00000020) /* 3.0 */ +#define SMB2_SHARE_CAP_CLUSTER cpu_to_le32(0x00000040) /* 3.0 */ +#define SMB2_SHARE_CAP_ASYMMETRIC cpu_to_le32(0x00000080) /* 3.02 */ +#define SMB2_SHARE_CAP_REDIRECT_TO_OWNER cpu_to_le32(0x00000100) /* 3.1.1 */ + +struct smb2_tree_connect_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 16 */ + __u8 ShareType; /* see below */ + __u8 Reserved; + __le32 ShareFlags; /* see below */ + __le32 Capabilities; /* see below */ + __le32 MaximalAccess; +} __packed; + +struct smb2_tree_disconnect_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 4 */ + __le16 Reserved; +} __packed; + +struct smb2_tree_disconnect_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 4 */ + __le16 Reserved; +} __packed; + + +/* + * SMB2_NEGOTIATE_PROTOCOL See MS-SMB2 section 2.2.3 + */ +/* SecurityMode flags */ +#define SMB2_NEGOTIATE_SIGNING_ENABLED 0x0001 +#define SMB2_NEGOTIATE_SIGNING_ENABLED_LE cpu_to_le16(0x0001) +#define SMB2_NEGOTIATE_SIGNING_REQUIRED 0x0002 +#define SMB2_NEGOTIATE_SIGNING_REQUIRED_LE cpu_to_le16(0x0002) +#define SMB2_SEC_MODE_FLAGS_ALL 0x0003 + +/* Capabilities flags */ +#define SMB2_GLOBAL_CAP_DFS 0x00000001 +#define SMB2_GLOBAL_CAP_LEASING 0x00000002 /* Resp only New to SMB2.1 */ +#define SMB2_GLOBAL_CAP_LARGE_MTU 0X00000004 /* Resp only New to SMB2.1 */ +#define SMB2_GLOBAL_CAP_MULTI_CHANNEL 0x00000008 /* New to SMB3 */ +#define SMB2_GLOBAL_CAP_PERSISTENT_HANDLES 0x00000010 /* New to SMB3 */ +#define SMB2_GLOBAL_CAP_DIRECTORY_LEASING 0x00000020 /* New to SMB3 */ +#define SMB2_GLOBAL_CAP_ENCRYPTION 0x00000040 /* New to SMB3 */ +/* Internal types */ +#define SMB2_NT_FIND 0x00100000 +#define SMB2_LARGE_FILES 0x00200000 + +#define SMB2_CLIENT_GUID_SIZE 16 +#define SMB2_CREATE_GUID_SIZE 16 + +/* Dialects */ +#define SMB10_PROT_ID 0x0000 /* local only, not sent on wire w/CIFS negprot */ +#define SMB20_PROT_ID 0x0202 +#define SMB21_PROT_ID 0x0210 +#define SMB2X_PROT_ID 0x02FF +#define SMB30_PROT_ID 0x0300 +#define SMB302_PROT_ID 0x0302 +#define SMB311_PROT_ID 0x0311 +#define BAD_PROT_ID 0xFFFF + +#define SMB311_SALT_SIZE 32 +/* Hash Algorithm Types */ +#define SMB2_PREAUTH_INTEGRITY_SHA512 cpu_to_le16(0x0001) +#define SMB2_PREAUTH_HASH_SIZE 64 + +/* Negotiate Contexts - ContextTypes. See MS-SMB2 section 2.2.3.1 for details */ +#define SMB2_PREAUTH_INTEGRITY_CAPABILITIES cpu_to_le16(1) +#define SMB2_ENCRYPTION_CAPABILITIES cpu_to_le16(2) +#define SMB2_COMPRESSION_CAPABILITIES cpu_to_le16(3) +#define SMB2_NETNAME_NEGOTIATE_CONTEXT_ID cpu_to_le16(5) +#define SMB2_TRANSPORT_CAPABILITIES cpu_to_le16(6) +#define SMB2_RDMA_TRANSFORM_CAPABILITIES cpu_to_le16(7) +#define SMB2_SIGNING_CAPABILITIES cpu_to_le16(8) +#define SMB2_POSIX_EXTENSIONS_AVAILABLE cpu_to_le16(0x100) + +struct smb2_neg_context { + __le16 ContextType; + __le16 DataLength; + __le32 Reserved; + /* Followed by array of data. NOTE: some servers require padding to 8 byte boundary */ +} __packed; + +/* + * SaltLength that the server send can be zero, so the only three required + * fields (all __le16) end up six bytes total, so the minimum context data len + * in the response is six bytes which accounts for + * + * HashAlgorithmCount, SaltLength, and 1 HashAlgorithm. + */ +#define MIN_PREAUTH_CTXT_DATA_LEN 6 + +struct smb2_preauth_neg_context { + __le16 ContextType; /* 1 */ + __le16 DataLength; + __le32 Reserved; + __le16 HashAlgorithmCount; /* 1 */ + __le16 SaltLength; + __le16 HashAlgorithms; /* HashAlgorithms[0] since only one defined */ + __u8 Salt[SMB311_SALT_SIZE]; +} __packed; + +/* Encryption Algorithms Ciphers */ +#define SMB2_ENCRYPTION_AES128_CCM cpu_to_le16(0x0001) +#define SMB2_ENCRYPTION_AES128_GCM cpu_to_le16(0x0002) +#define SMB2_ENCRYPTION_AES256_CCM cpu_to_le16(0x0003) +#define SMB2_ENCRYPTION_AES256_GCM cpu_to_le16(0x0004) + +/* Min encrypt context data is one cipher so 2 bytes + 2 byte count field */ +#define MIN_ENCRYPT_CTXT_DATA_LEN 4 +struct smb2_encryption_neg_context { + __le16 ContextType; /* 2 */ + __le16 DataLength; + __le32 Reserved; + /* CipherCount usally 2, but can be 3 when AES256-GCM enabled */ + __le16 CipherCount; /* AES128-GCM and AES128-CCM by default */ + __le16 Ciphers[]; +} __packed; + +/* See MS-SMB2 2.2.3.1.3 */ +#define SMB3_COMPRESS_NONE cpu_to_le16(0x0000) +#define SMB3_COMPRESS_LZNT1 cpu_to_le16(0x0001) +#define SMB3_COMPRESS_LZ77 cpu_to_le16(0x0002) +#define SMB3_COMPRESS_LZ77_HUFF cpu_to_le16(0x0003) +/* Pattern scanning algorithm See MS-SMB2 3.1.4.4.1 */ +#define SMB3_COMPRESS_PATTERN cpu_to_le16(0x0004) /* Pattern_V1 */ + +/* Compression Flags */ +#define SMB2_COMPRESSION_CAPABILITIES_FLAG_NONE cpu_to_le32(0x00000000) +#define SMB2_COMPRESSION_CAPABILITIES_FLAG_CHAINED cpu_to_le32(0x00000001) + +struct smb2_compression_capabilities_context { + __le16 ContextType; /* 3 */ + __le16 DataLength; + __le32 Reserved; + __le16 CompressionAlgorithmCount; + __le16 Padding; + __le32 Flags; + __le16 CompressionAlgorithms[3]; + __u16 Pad; /* Some servers require pad to DataLen multiple of 8 */ + /* Check if pad needed */ +} __packed; + +/* + * For smb2_netname_negotiate_context_id See MS-SMB2 2.2.3.1.4. + * Its struct simply contains NetName, an array of Unicode characters + */ +struct smb2_netname_neg_context { + __le16 ContextType; /* 5 */ + __le16 DataLength; + __le32 Reserved; + __le16 NetName[]; /* hostname of target converted to UCS-2 */ +} __packed; + +/* + * For smb2_transport_capabilities context see MS-SMB2 2.2.3.1.5 + * and 2.2.4.1.5 + */ + +/* Flags */ +#define SMB2_ACCEPT_TRANSFORM_LEVEL_SECURITY 0x00000001 + +struct smb2_transport_capabilities_context { + __le16 ContextType; /* 6 */ + __le16 DataLength; + __u32 Reserved; + __le32 Flags; + __u32 Pad; +} __packed; + +/* + * For rdma transform capabilities context see MS-SMB2 2.2.3.1.6 + * and 2.2.4.1.6 + */ + +/* RDMA Transform IDs */ +#define SMB2_RDMA_TRANSFORM_NONE 0x0000 +#define SMB2_RDMA_TRANSFORM_ENCRYPTION 0x0001 +#define SMB2_RDMA_TRANSFORM_SIGNING 0x0002 + +struct smb2_rdma_transform_capabilities_context { + __le16 ContextType; /* 7 */ + __le16 DataLength; + __u32 Reserved; + __le16 TransformCount; + __u16 Reserved1; + __u32 Reserved2; + __le16 RDMATransformIds[]; +} __packed; + +/* + * For signing capabilities context see MS-SMB2 2.2.3.1.7 + * and 2.2.4.1.7 + */ + +/* Signing algorithms */ +#define SIGNING_ALG_HMAC_SHA256 0 +#define SIGNING_ALG_HMAC_SHA256_LE cpu_to_le16(0) +#define SIGNING_ALG_AES_CMAC 1 +#define SIGNING_ALG_AES_CMAC_LE cpu_to_le16(1) +#define SIGNING_ALG_AES_GMAC 2 +#define SIGNING_ALG_AES_GMAC_LE cpu_to_le16(2) + +struct smb2_signing_capabilities { + __le16 ContextType; /* 8 */ + __le16 DataLength; + __le32 Reserved; + __le16 SigningAlgorithmCount; + __le16 SigningAlgorithms[]; + /* Followed by padding to 8 byte boundary (required by some servers) */ +} __packed; + +#define POSIX_CTXT_DATA_LEN 16 +struct smb2_posix_neg_context { + __le16 ContextType; /* 0x100 */ + __le16 DataLength; + __le32 Reserved; + __u8 Name[16]; /* POSIX ctxt GUID 93AD25509CB411E7B42383DE968BCD7C */ +} __packed; + +struct smb2_negotiate_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 36 */ + __le16 DialectCount; + __le16 SecurityMode; + __le16 Reserved; /* MBZ */ + __le32 Capabilities; + __u8 ClientGUID[SMB2_CLIENT_GUID_SIZE]; + /* In SMB3.02 and earlier next three were MBZ le64 ClientStartTime */ + __le32 NegotiateContextOffset; /* SMB3.1.1 only. MBZ earlier */ + __le16 NegotiateContextCount; /* SMB3.1.1 only. MBZ earlier */ + __le16 Reserved2; + __le16 Dialects[]; +} __packed; + +struct smb2_negotiate_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 65 */ + __le16 SecurityMode; + __le16 DialectRevision; + __le16 NegotiateContextCount; /* Prior to SMB3.1.1 was Reserved & MBZ */ + __u8 ServerGUID[16]; + __le32 Capabilities; + __le32 MaxTransactSize; + __le32 MaxReadSize; + __le32 MaxWriteSize; + __le64 SystemTime; /* MBZ */ + __le64 ServerStartTime; + __le16 SecurityBufferOffset; + __le16 SecurityBufferLength; + __le32 NegotiateContextOffset; /* Pre:SMB3.1.1 was reserved/ignored */ + __u8 Buffer[1]; /* variable length GSS security buffer */ +} __packed; + + +/* + * SMB2_SESSION_SETUP See MS-SMB2 section 2.2.5 + */ +/* Flags */ +#define SMB2_SESSION_REQ_FLAG_BINDING 0x01 +#define SMB2_SESSION_REQ_FLAG_ENCRYPT_DATA 0x04 + +struct smb2_sess_setup_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 25 */ + __u8 Flags; + __u8 SecurityMode; + __le32 Capabilities; + __le32 Channel; + __le16 SecurityBufferOffset; + __le16 SecurityBufferLength; + __le64 PreviousSessionId; + __u8 Buffer[1]; /* variable length GSS security buffer */ +} __packed; + +/* Currently defined SessionFlags */ +#define SMB2_SESSION_FLAG_IS_GUEST 0x0001 +#define SMB2_SESSION_FLAG_IS_GUEST_LE cpu_to_le16(0x0001) +#define SMB2_SESSION_FLAG_IS_NULL 0x0002 +#define SMB2_SESSION_FLAG_IS_NULL_LE cpu_to_le16(0x0002) +#define SMB2_SESSION_FLAG_ENCRYPT_DATA 0x0004 +#define SMB2_SESSION_FLAG_ENCRYPT_DATA_LE cpu_to_le16(0x0004) + +struct smb2_sess_setup_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 9 */ + __le16 SessionFlags; + __le16 SecurityBufferOffset; + __le16 SecurityBufferLength; + __u8 Buffer[1]; /* variable length GSS security buffer */ +} __packed; + + +/* + * SMB2_LOGOFF See MS-SMB2 section 2.2.7 + */ +struct smb2_logoff_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 4 */ + __le16 Reserved; +} __packed; + +struct smb2_logoff_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 4 */ + __le16 Reserved; +} __packed; + + +/* + * SMB2_CLOSE See MS-SMB2 section 2.2.15 + */ +/* Currently defined values for close flags */ +#define SMB2_CLOSE_FLAG_POSTQUERY_ATTRIB cpu_to_le16(0x0001) +struct smb2_close_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 24 */ + __le16 Flags; + __le32 Reserved; + __le64 PersistentFileId; /* opaque endianness */ + __le64 VolatileFileId; /* opaque endianness */ +} __packed; + +/* + * Maximum size of a SMB2_CLOSE response is 64 (smb2 header) + 60 (data) + */ +#define MAX_SMB2_CLOSE_RESPONSE_SIZE 124 + +struct smb2_close_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* 60 */ + __le16 Flags; + __le32 Reserved; + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le64 AllocationSize; /* Beginning of FILE_STANDARD_INFO equivalent */ + __le64 EndOfFile; + __le32 Attributes; +} __packed; + + +/* + * SMB2_READ See MS-SMB2 section 2.2.19 + */ +/* For read request Flags field below, following flag is defined for SMB3.02 */ +#define SMB2_READFLAG_READ_UNBUFFERED 0x01 +#define SMB2_READFLAG_REQUEST_COMPRESSED 0x02 /* See MS-SMB2 2.2.19 */ + +/* Channel field for read and write: exactly one of following flags can be set*/ +#define SMB2_CHANNEL_NONE cpu_to_le32(0x00000000) +#define SMB2_CHANNEL_RDMA_V1 cpu_to_le32(0x00000001) +#define SMB2_CHANNEL_RDMA_V1_INVALIDATE cpu_to_le32(0x00000002) +#define SMB2_CHANNEL_RDMA_TRANSFORM cpu_to_le32(0x00000003) + +/* SMB2 read request without RFC1001 length at the beginning */ +struct smb2_read_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 49 */ + __u8 Padding; /* offset from start of SMB2 header to place read */ + __u8 Flags; /* MBZ unless SMB3.02 or later */ + __le32 Length; + __le64 Offset; + __le64 PersistentFileId; + __le64 VolatileFileId; + __le32 MinimumCount; + __le32 Channel; /* MBZ except for SMB3 or later */ + __le32 RemainingBytes; + __le16 ReadChannelInfoOffset; + __le16 ReadChannelInfoLength; + __u8 Buffer[1]; +} __packed; + +/* Read flags */ +#define SMB2_READFLAG_RESPONSE_NONE cpu_to_le32(0x00000000) +#define SMB2_READFLAG_RESPONSE_RDMA_TRANSFORM cpu_to_le32(0x00000001) + +struct smb2_read_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 17 */ + __u8 DataOffset; + __u8 Reserved; + __le32 DataLength; + __le32 DataRemaining; + __le32 Flags; + __u8 Buffer[1]; +} __packed; + + +/* + * SMB2_WRITE See MS-SMB2 section 2.2.21 + */ +/* For write request Flags field below the following flags are defined: */ +#define SMB2_WRITEFLAG_WRITE_THROUGH 0x00000001 /* SMB2.1 or later */ +#define SMB2_WRITEFLAG_WRITE_UNBUFFERED 0x00000002 /* SMB3.02 or later */ + +struct smb2_write_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 49 */ + __le16 DataOffset; /* offset from start of SMB2 header to write data */ + __le32 Length; + __le64 Offset; + __le64 PersistentFileId; /* opaque endianness */ + __le64 VolatileFileId; /* opaque endianness */ + __le32 Channel; /* MBZ unless SMB3.02 or later */ + __le32 RemainingBytes; + __le16 WriteChannelInfoOffset; + __le16 WriteChannelInfoLength; + __le32 Flags; + __u8 Buffer[1]; +} __packed; + +struct smb2_write_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 17 */ + __u8 DataOffset; + __u8 Reserved; + __le32 DataLength; + __le32 DataRemaining; + __u32 Reserved2; + __u8 Buffer[1]; +} __packed; + + +/* + * SMB2_FLUSH See MS-SMB2 section 2.2.17 + */ +struct smb2_flush_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 24 */ + __le16 Reserved1; + __le32 Reserved2; + __le64 PersistentFileId; + __le64 VolatileFileId; +} __packed; + +struct smb2_flush_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; + __le16 Reserved; +} __packed; + + +/* + * SMB2_NOTIFY See MS-SMB2 section 2.2.35 + */ +/* notify flags */ +#define SMB2_WATCH_TREE 0x0001 + +/* notify completion filter flags. See MS-FSCC 2.6 and MS-SMB2 2.2.35 */ +#define FILE_NOTIFY_CHANGE_FILE_NAME 0x00000001 +#define FILE_NOTIFY_CHANGE_DIR_NAME 0x00000002 +#define FILE_NOTIFY_CHANGE_ATTRIBUTES 0x00000004 +#define FILE_NOTIFY_CHANGE_SIZE 0x00000008 +#define FILE_NOTIFY_CHANGE_LAST_WRITE 0x00000010 +#define FILE_NOTIFY_CHANGE_LAST_ACCESS 0x00000020 +#define FILE_NOTIFY_CHANGE_CREATION 0x00000040 +#define FILE_NOTIFY_CHANGE_EA 0x00000080 +#define FILE_NOTIFY_CHANGE_SECURITY 0x00000100 +#define FILE_NOTIFY_CHANGE_STREAM_NAME 0x00000200 +#define FILE_NOTIFY_CHANGE_STREAM_SIZE 0x00000400 +#define FILE_NOTIFY_CHANGE_STREAM_WRITE 0x00000800 + +/* SMB2 Notify Action Flags */ +#define FILE_ACTION_ADDED 0x00000001 +#define FILE_ACTION_REMOVED 0x00000002 +#define FILE_ACTION_MODIFIED 0x00000003 +#define FILE_ACTION_RENAMED_OLD_NAME 0x00000004 +#define FILE_ACTION_RENAMED_NEW_NAME 0x00000005 +#define FILE_ACTION_ADDED_STREAM 0x00000006 +#define FILE_ACTION_REMOVED_STREAM 0x00000007 +#define FILE_ACTION_MODIFIED_STREAM 0x00000008 +#define FILE_ACTION_REMOVED_BY_DELETE 0x00000009 + +struct smb2_change_notify_req { + struct smb2_hdr hdr; + __le16 StructureSize; + __le16 Flags; + __le32 OutputBufferLength; + __le64 PersistentFileId; /* opaque endianness */ + __le64 VolatileFileId; /* opaque endianness */ + __le32 CompletionFilter; + __u32 Reserved; +} __packed; + +struct smb2_change_notify_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 9 */ + __le16 OutputBufferOffset; + __le32 OutputBufferLength; + __u8 Buffer[1]; /* array of file notify structs */ +} __packed; + + +/* + * SMB2_CREATE See MS-SMB2 section 2.2.13 + */ +/* Oplock levels */ +#define SMB2_OPLOCK_LEVEL_NONE 0x00 +#define SMB2_OPLOCK_LEVEL_II 0x01 +#define SMB2_OPLOCK_LEVEL_EXCLUSIVE 0x08 +#define SMB2_OPLOCK_LEVEL_BATCH 0x09 +#define SMB2_OPLOCK_LEVEL_LEASE 0xFF +/* Non-spec internal type */ +#define SMB2_OPLOCK_LEVEL_NOCHANGE 0x99 + +/* Impersonation Levels. See MS-WPO section 9.7 and MSDN-IMPERS */ +#define IL_ANONYMOUS cpu_to_le32(0x00000000) +#define IL_IDENTIFICATION cpu_to_le32(0x00000001) +#define IL_IMPERSONATION cpu_to_le32(0x00000002) +#define IL_DELEGATE cpu_to_le32(0x00000003) + +/* File Attrubutes */ +#define FILE_ATTRIBUTE_READONLY 0x00000001 +#define FILE_ATTRIBUTE_HIDDEN 0x00000002 +#define FILE_ATTRIBUTE_SYSTEM 0x00000004 +#define FILE_ATTRIBUTE_DIRECTORY 0x00000010 +#define FILE_ATTRIBUTE_ARCHIVE 0x00000020 +#define FILE_ATTRIBUTE_NORMAL 0x00000080 +#define FILE_ATTRIBUTE_TEMPORARY 0x00000100 +#define FILE_ATTRIBUTE_SPARSE_FILE 0x00000200 +#define FILE_ATTRIBUTE_REPARSE_POINT 0x00000400 +#define FILE_ATTRIBUTE_COMPRESSED 0x00000800 +#define FILE_ATTRIBUTE_OFFLINE 0x00001000 +#define FILE_ATTRIBUTE_NOT_CONTENT_INDEXED 0x00002000 +#define FILE_ATTRIBUTE_ENCRYPTED 0x00004000 +#define FILE_ATTRIBUTE_INTEGRITY_STREAM 0x00008000 +#define FILE_ATTRIBUTE_NO_SCRUB_DATA 0x00020000 +#define FILE_ATTRIBUTE__MASK 0x00007FB7 + +#define FILE_ATTRIBUTE_READONLY_LE cpu_to_le32(0x00000001) +#define FILE_ATTRIBUTE_HIDDEN_LE cpu_to_le32(0x00000002) +#define FILE_ATTRIBUTE_SYSTEM_LE cpu_to_le32(0x00000004) +#define FILE_ATTRIBUTE_DIRECTORY_LE cpu_to_le32(0x00000010) +#define FILE_ATTRIBUTE_ARCHIVE_LE cpu_to_le32(0x00000020) +#define FILE_ATTRIBUTE_NORMAL_LE cpu_to_le32(0x00000080) +#define FILE_ATTRIBUTE_TEMPORARY_LE cpu_to_le32(0x00000100) +#define FILE_ATTRIBUTE_SPARSE_FILE_LE cpu_to_le32(0x00000200) +#define FILE_ATTRIBUTE_REPARSE_POINT_LE cpu_to_le32(0x00000400) +#define FILE_ATTRIBUTE_COMPRESSED_LE cpu_to_le32(0x00000800) +#define FILE_ATTRIBUTE_OFFLINE_LE cpu_to_le32(0x00001000) +#define FILE_ATTRIBUTE_NOT_CONTENT_INDEXED_LE cpu_to_le32(0x00002000) +#define FILE_ATTRIBUTE_ENCRYPTED_LE cpu_to_le32(0x00004000) +#define FILE_ATTRIBUTE_INTEGRITY_STREAM_LE cpu_to_le32(0x00008000) +#define FILE_ATTRIBUTE_NO_SCRUB_DATA_LE cpu_to_le32(0x00020000) +#define FILE_ATTRIBUTE_MASK_LE cpu_to_le32(0x00007FB7) + +/* Desired Access Flags */ +#define FILE_READ_DATA_LE cpu_to_le32(0x00000001) +#define FILE_LIST_DIRECTORY_LE cpu_to_le32(0x00000001) +#define FILE_WRITE_DATA_LE cpu_to_le32(0x00000002) +#define FILE_APPEND_DATA_LE cpu_to_le32(0x00000004) +#define FILE_ADD_SUBDIRECTORY_LE cpu_to_le32(0x00000004) +#define FILE_READ_EA_LE cpu_to_le32(0x00000008) +#define FILE_WRITE_EA_LE cpu_to_le32(0x00000010) +#define FILE_EXECUTE_LE cpu_to_le32(0x00000020) +#define FILE_DELETE_CHILD_LE cpu_to_le32(0x00000040) +#define FILE_READ_ATTRIBUTES_LE cpu_to_le32(0x00000080) +#define FILE_WRITE_ATTRIBUTES_LE cpu_to_le32(0x00000100) +#define FILE_DELETE_LE cpu_to_le32(0x00010000) +#define FILE_READ_CONTROL_LE cpu_to_le32(0x00020000) +#define FILE_WRITE_DAC_LE cpu_to_le32(0x00040000) +#define FILE_WRITE_OWNER_LE cpu_to_le32(0x00080000) +#define FILE_SYNCHRONIZE_LE cpu_to_le32(0x00100000) +#define FILE_ACCESS_SYSTEM_SECURITY_LE cpu_to_le32(0x01000000) +#define FILE_MAXIMAL_ACCESS_LE cpu_to_le32(0x02000000) +#define FILE_GENERIC_ALL_LE cpu_to_le32(0x10000000) +#define FILE_GENERIC_EXECUTE_LE cpu_to_le32(0x20000000) +#define FILE_GENERIC_WRITE_LE cpu_to_le32(0x40000000) +#define FILE_GENERIC_READ_LE cpu_to_le32(0x80000000) +#define DESIRED_ACCESS_MASK cpu_to_le32(0xF21F01FF) + + +#define FILE_READ_DESIRED_ACCESS_LE (FILE_READ_DATA_LE | \ + FILE_READ_EA_LE | \ + FILE_GENERIC_READ_LE) +#define FILE_WRITE_DESIRE_ACCESS_LE (FILE_WRITE_DATA_LE | \ + FILE_APPEND_DATA_LE | \ + FILE_WRITE_EA_LE | \ + FILE_WRITE_ATTRIBUTES_LE | \ + FILE_GENERIC_WRITE_LE) + +/* ShareAccess Flags */ +#define FILE_SHARE_READ_LE cpu_to_le32(0x00000001) +#define FILE_SHARE_WRITE_LE cpu_to_le32(0x00000002) +#define FILE_SHARE_DELETE_LE cpu_to_le32(0x00000004) +#define FILE_SHARE_ALL_LE cpu_to_le32(0x00000007) + +/* CreateDisposition Flags */ +#define FILE_SUPERSEDE_LE cpu_to_le32(0x00000000) +#define FILE_OPEN_LE cpu_to_le32(0x00000001) +#define FILE_CREATE_LE cpu_to_le32(0x00000002) +#define FILE_OPEN_IF_LE cpu_to_le32(0x00000003) +#define FILE_OVERWRITE_LE cpu_to_le32(0x00000004) +#define FILE_OVERWRITE_IF_LE cpu_to_le32(0x00000005) +#define FILE_CREATE_MASK_LE cpu_to_le32(0x00000007) + +#define FILE_READ_RIGHTS (FILE_READ_DATA | FILE_READ_EA \ + | FILE_READ_ATTRIBUTES) +#define FILE_WRITE_RIGHTS (FILE_WRITE_DATA | FILE_APPEND_DATA \ + | FILE_WRITE_EA | FILE_WRITE_ATTRIBUTES) +#define FILE_EXEC_RIGHTS (FILE_EXECUTE) + +/* CreateOptions Flags */ +#define FILE_DIRECTORY_FILE_LE cpu_to_le32(0x00000001) +/* same as #define CREATE_NOT_FILE_LE cpu_to_le32(0x00000001) */ +#define FILE_WRITE_THROUGH_LE cpu_to_le32(0x00000002) +#define FILE_SEQUENTIAL_ONLY_LE cpu_to_le32(0x00000004) +#define FILE_NO_INTERMEDIATE_BUFFERING_LE cpu_to_le32(0x00000008) +#define FILE_NON_DIRECTORY_FILE_LE cpu_to_le32(0x00000040) +#define FILE_COMPLETE_IF_OPLOCKED_LE cpu_to_le32(0x00000100) +#define FILE_NO_EA_KNOWLEDGE_LE cpu_to_le32(0x00000200) +#define FILE_RANDOM_ACCESS_LE cpu_to_le32(0x00000800) +#define FILE_DELETE_ON_CLOSE_LE cpu_to_le32(0x00001000) +#define FILE_OPEN_BY_FILE_ID_LE cpu_to_le32(0x00002000) +#define FILE_OPEN_FOR_BACKUP_INTENT_LE cpu_to_le32(0x00004000) +#define FILE_NO_COMPRESSION_LE cpu_to_le32(0x00008000) +#define FILE_OPEN_REPARSE_POINT_LE cpu_to_le32(0x00200000) +#define FILE_OPEN_NO_RECALL_LE cpu_to_le32(0x00400000) +#define CREATE_OPTIONS_MASK_LE cpu_to_le32(0x00FFFFFF) + +#define FILE_READ_RIGHTS_LE (FILE_READ_DATA_LE | FILE_READ_EA_LE \ + | FILE_READ_ATTRIBUTES_LE) +#define FILE_WRITE_RIGHTS_LE (FILE_WRITE_DATA_LE | FILE_APPEND_DATA_LE \ + | FILE_WRITE_EA_LE | FILE_WRITE_ATTRIBUTES_LE) +#define FILE_EXEC_RIGHTS_LE (FILE_EXECUTE_LE) + +/* Create Context Values */ +#define SMB2_CREATE_EA_BUFFER "ExtA" /* extended attributes */ +#define SMB2_CREATE_SD_BUFFER "SecD" /* security descriptor */ +#define SMB2_CREATE_DURABLE_HANDLE_REQUEST "DHnQ" +#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT "DHnC" +#define SMB2_CREATE_ALLOCATION_SIZE "AISi" +#define SMB2_CREATE_QUERY_MAXIMAL_ACCESS_REQUEST "MxAc" +#define SMB2_CREATE_TIMEWARP_REQUEST "TWrp" +#define SMB2_CREATE_QUERY_ON_DISK_ID "QFid" +#define SMB2_CREATE_REQUEST_LEASE "RqLs" +#define SMB2_CREATE_DURABLE_HANDLE_REQUEST_V2 "DH2Q" +#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT_V2 "DH2C" +#define SMB2_CREATE_TAG_POSIX "\x93\xAD\x25\x50\x9C\xB4\x11\xE7\xB4\x23\x83\xDE\x96\x8B\xCD\x7C" + +/* Flag (SMB3 open response) values */ +#define SMB2_CREATE_FLAG_REPARSEPOINT 0x01 + +struct create_context { + __le32 Next; + __le16 NameOffset; + __le16 NameLength; + __le16 Reserved; + __le16 DataOffset; + __le32 DataLength; + __u8 Buffer[]; +} __packed; + +struct smb2_create_req { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 57 */ + __u8 SecurityFlags; + __u8 RequestedOplockLevel; + __le32 ImpersonationLevel; + __le64 SmbCreateFlags; + __le64 Reserved; + __le32 DesiredAccess; + __le32 FileAttributes; + __le32 ShareAccess; + __le32 CreateDisposition; + __le32 CreateOptions; + __le16 NameOffset; + __le16 NameLength; + __le32 CreateContextsOffset; + __le32 CreateContextsLength; + __u8 Buffer[]; +} __packed; + +struct smb2_create_rsp { + struct smb2_hdr hdr; + __le16 StructureSize; /* Must be 89 */ + __u8 OplockLevel; + __u8 Flags; /* 0x01 if reparse point */ + __le32 CreateAction; + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le64 AllocationSize; + __le64 EndofFile; + __le32 FileAttributes; + __le32 Reserved2; + __le64 PersistentFileId; + __le64 VolatileFileId; + __le32 CreateContextsOffset; + __le32 CreateContextsLength; + __u8 Buffer[1]; +} __packed; + + +#endif /* _COMMON_SMB2PDU_H */ diff --git a/fs/smbfs_common/smbfsctl.h b/fs/smbfs_common/smbfsctl.h index d01e8c9d7a31..926f87cd6af0 100644 --- a/fs/smbfs_common/smbfsctl.h +++ b/fs/smbfs_common/smbfsctl.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: LGPL-2.1+ */ /* - * fs/cifs/smbfsctl.h: SMB, CIFS, SMB2 FSCTL definitions + * SMB, CIFS, SMB2 FSCTL definitions * * Copyright (c) International Business Machines Corp., 2002,2013 * Author(s): Steve French (sfrench@us.ibm.com) diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 60d6951915f4..bb44ff4c5cc6 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -16,6 +16,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/blkdev.h> #include <linux/fs.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> @@ -179,8 +180,8 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) /* Check the filesystem does not extend beyond the end of the block device */ msblk->bytes_used = le64_to_cpu(sblk->bytes_used); - if (msblk->bytes_used < 0 || msblk->bytes_used > - i_size_read(sb->s_bdev->bd_inode)) + if (msblk->bytes_used < 0 || + msblk->bytes_used > bdev_nr_bytes(sb->s_bdev)) goto failed_mount; /* Check block size for sanity */ diff --git a/fs/super.c b/fs/super.c index bcef3a6f4c4b..3bfc0f8fbd5b 100644 --- a/fs/super.c +++ b/fs/super.c @@ -476,6 +476,8 @@ void generic_shutdown_super(struct super_block *sb) spin_unlock(&sb_lock); up_write(&sb->s_umount); if (sb->s_bdi != &noop_backing_dev_info) { + if (sb->s_iflags & SB_I_PERSB_BDI) + bdi_unregister(sb->s_bdi); bdi_put(sb->s_bdi); sb->s_bdi = &noop_backing_dev_info; } @@ -1562,6 +1564,7 @@ int super_setup_bdi_name(struct super_block *sb, char *fmt, ...) } WARN_ON(sb->s_bdi != &noop_backing_dev_info); sb->s_bdi = bdi; + sb->s_iflags |= SB_I_PERSB_BDI; return 0; } diff --git a/fs/sync.c b/fs/sync.c index 1373a610dc78..3ce8e2137f31 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -3,6 +3,7 @@ * High-level sync()-related operations */ +#include <linux/blkdev.h> #include <linux/kernel.h> #include <linux/file.h> #include <linux/fs.h> @@ -22,25 +23,6 @@ SYNC_FILE_RANGE_WAIT_AFTER) /* - * Do the filesystem syncing work. For simple filesystems - * writeback_inodes_sb(sb) just dirties buffers with inodes so we have to - * submit IO for these buffers via __sync_blockdev(). This also speeds up the - * wait == 1 case since in that case write_inode() functions do - * sync_dirty_buffer() and thus effectively write one block at a time. - */ -static int __sync_filesystem(struct super_block *sb, int wait) -{ - if (wait) - sync_inodes_sb(sb); - else - writeback_inodes_sb(sb, WB_REASON_SYNC); - - if (sb->s_op->sync_fs) - sb->s_op->sync_fs(sb, wait); - return __sync_blockdev(sb->s_bdev, wait); -} - -/* * Write out and wait upon all dirty data associated with this * superblock. Filesystem data as well as the underlying block * device. Takes the superblock lock. @@ -61,10 +43,25 @@ int sync_filesystem(struct super_block *sb) if (sb_rdonly(sb)) return 0; - ret = __sync_filesystem(sb, 0); + /* + * Do the filesystem syncing work. For simple filesystems + * writeback_inodes_sb(sb) just dirties buffers with inodes so we have + * to submit I/O for these buffers via sync_blockdev(). This also + * speeds up the wait == 1 case since in that case write_inode() + * methods call sync_dirty_buffer() and thus effectively write one block + * at a time. + */ + writeback_inodes_sb(sb, WB_REASON_SYNC); + if (sb->s_op->sync_fs) + sb->s_op->sync_fs(sb, 0); + ret = sync_blockdev_nowait(sb->s_bdev); if (ret < 0) return ret; - return __sync_filesystem(sb, 1); + + sync_inodes_sb(sb); + if (sb->s_op->sync_fs) + sb->s_op->sync_fs(sb, 1); + return sync_blockdev(sb->s_bdev); } EXPORT_SYMBOL(sync_filesystem); @@ -81,21 +78,6 @@ static void sync_fs_one_sb(struct super_block *sb, void *arg) sb->s_op->sync_fs(sb, *(int *)arg); } -static void fdatawrite_one_bdev(struct block_device *bdev, void *arg) -{ - filemap_fdatawrite(bdev->bd_inode->i_mapping); -} - -static void fdatawait_one_bdev(struct block_device *bdev, void *arg) -{ - /* - * We keep the error status of individual mapping so that - * applications can catch the writeback error using fsync(2). - * See filemap_fdatawait_keep_errors() for details. - */ - filemap_fdatawait_keep_errors(bdev->bd_inode->i_mapping); -} - /* * Sync everything. We start by waking flusher threads so that most of * writeback runs on all devices in parallel. Then we sync all inodes reliably @@ -114,8 +96,8 @@ void ksys_sync(void) iterate_supers(sync_inodes_one_sb, NULL); iterate_supers(sync_fs_one_sb, &nowait); iterate_supers(sync_fs_one_sb, &wait); - iterate_bdevs(fdatawrite_one_bdev, NULL); - iterate_bdevs(fdatawait_one_bdev, NULL); + sync_bdevs(false); + sync_bdevs(true); if (unlikely(laptop_mode)) laptop_sync_completion(); } @@ -136,10 +118,10 @@ static void do_sync_work(struct work_struct *work) */ iterate_supers(sync_inodes_one_sb, &nowait); iterate_supers(sync_fs_one_sb, &nowait); - iterate_bdevs(fdatawrite_one_bdev, NULL); + sync_bdevs(false); iterate_supers(sync_inodes_one_sb, &nowait); iterate_supers(sync_fs_one_sb, &nowait); - iterate_bdevs(fdatawrite_one_bdev, NULL); + sync_bdevs(false); printk("Emergency Sync complete\n"); kfree(work); } diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 59dffd5ca517..b6b6796e1616 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -56,8 +56,7 @@ int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) kobject_get_ownership(kobj, &uid, &gid); - kn = kernfs_create_dir_ns(parent, kobject_name(kobj), - S_IRWXU | S_IRUGO | S_IXUGO, uid, gid, + kn = kernfs_create_dir_ns(parent, kobject_name(kobj), 0755, uid, gid, kobj, ns); if (IS_ERR(kn)) { if (PTR_ERR(kn) == -EEXIST) diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index d019d6ac6ad0..42dcf96881b6 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -45,6 +45,9 @@ static int sysfs_kf_seq_show(struct seq_file *sf, void *v) ssize_t count; char *buf; + if (WARN_ON_ONCE(!ops->show)) + return -EINVAL; + /* acquire buffer and ensure that it's >= PAGE_SIZE and clear */ count = seq_get_buf(sf, &buf); if (count < PAGE_SIZE) { @@ -53,15 +56,9 @@ static int sysfs_kf_seq_show(struct seq_file *sf, void *v) } memset(buf, 0, PAGE_SIZE); - /* - * Invoke show(). Control may reach here via seq file lseek even - * if @ops->show() isn't implemented. - */ - if (ops->show) { - count = ops->show(kobj, of->kn->priv, buf); - if (count < 0) - return count; - } + count = ops->show(kobj, of->kn->priv, buf); + if (count < 0) + return count; /* * The code works fine with PAGE_SIZE return but it's likely to @@ -255,59 +252,74 @@ static const struct kernfs_ops sysfs_bin_kfops_mmap = { }; int sysfs_add_file_mode_ns(struct kernfs_node *parent, - const struct attribute *attr, bool is_bin, - umode_t mode, kuid_t uid, kgid_t gid, const void *ns) + const struct attribute *attr, umode_t mode, kuid_t uid, + kgid_t gid, const void *ns) { + struct kobject *kobj = parent->priv; + const struct sysfs_ops *sysfs_ops = kobj->ktype->sysfs_ops; struct lock_class_key *key = NULL; - const struct kernfs_ops *ops; + const struct kernfs_ops *ops = NULL; struct kernfs_node *kn; - loff_t size; - - if (!is_bin) { - struct kobject *kobj = parent->priv; - const struct sysfs_ops *sysfs_ops = kobj->ktype->sysfs_ops; - - /* every kobject with an attribute needs a ktype assigned */ - if (WARN(!sysfs_ops, KERN_ERR - "missing sysfs attribute operations for kobject: %s\n", - kobject_name(kobj))) - return -EINVAL; - - if (sysfs_ops->show && sysfs_ops->store) { - if (mode & SYSFS_PREALLOC) - ops = &sysfs_prealloc_kfops_rw; - else - ops = &sysfs_file_kfops_rw; - } else if (sysfs_ops->show) { - if (mode & SYSFS_PREALLOC) - ops = &sysfs_prealloc_kfops_ro; - else - ops = &sysfs_file_kfops_ro; - } else if (sysfs_ops->store) { - if (mode & SYSFS_PREALLOC) - ops = &sysfs_prealloc_kfops_wo; - else - ops = &sysfs_file_kfops_wo; - } else - ops = &sysfs_file_kfops_empty; - - size = PAGE_SIZE; + + /* every kobject with an attribute needs a ktype assigned */ + if (WARN(!sysfs_ops, KERN_ERR + "missing sysfs attribute operations for kobject: %s\n", + kobject_name(kobj))) + return -EINVAL; + + if (mode & SYSFS_PREALLOC) { + if (sysfs_ops->show && sysfs_ops->store) + ops = &sysfs_prealloc_kfops_rw; + else if (sysfs_ops->show) + ops = &sysfs_prealloc_kfops_ro; + else if (sysfs_ops->store) + ops = &sysfs_prealloc_kfops_wo; } else { - struct bin_attribute *battr = (void *)attr; - - if (battr->mmap) - ops = &sysfs_bin_kfops_mmap; - else if (battr->read && battr->write) - ops = &sysfs_bin_kfops_rw; - else if (battr->read) - ops = &sysfs_bin_kfops_ro; - else if (battr->write) - ops = &sysfs_bin_kfops_wo; - else - ops = &sysfs_file_kfops_empty; - - size = battr->size; + if (sysfs_ops->show && sysfs_ops->store) + ops = &sysfs_file_kfops_rw; + else if (sysfs_ops->show) + ops = &sysfs_file_kfops_ro; + else if (sysfs_ops->store) + ops = &sysfs_file_kfops_wo; + } + + if (!ops) + ops = &sysfs_file_kfops_empty; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + if (!attr->ignore_lockdep) + key = attr->key ?: (struct lock_class_key *)&attr->skey; +#endif + + kn = __kernfs_create_file(parent, attr->name, mode & 0777, uid, gid, + PAGE_SIZE, ops, (void *)attr, ns, key); + if (IS_ERR(kn)) { + if (PTR_ERR(kn) == -EEXIST) + sysfs_warn_dup(parent, attr->name); + return PTR_ERR(kn); } + return 0; +} + +int sysfs_add_bin_file_mode_ns(struct kernfs_node *parent, + const struct bin_attribute *battr, umode_t mode, + kuid_t uid, kgid_t gid, const void *ns) +{ + const struct attribute *attr = &battr->attr; + struct lock_class_key *key = NULL; + const struct kernfs_ops *ops; + struct kernfs_node *kn; + + if (battr->mmap) + ops = &sysfs_bin_kfops_mmap; + else if (battr->read && battr->write) + ops = &sysfs_bin_kfops_rw; + else if (battr->read) + ops = &sysfs_bin_kfops_ro; + else if (battr->write) + ops = &sysfs_bin_kfops_wo; + else + ops = &sysfs_file_kfops_empty; #ifdef CONFIG_DEBUG_LOCK_ALLOC if (!attr->ignore_lockdep) @@ -315,7 +327,7 @@ int sysfs_add_file_mode_ns(struct kernfs_node *parent, #endif kn = __kernfs_create_file(parent, attr->name, mode & 0777, uid, gid, - size, ops, (void *)attr, ns, key); + battr->size, ops, (void *)attr, ns, key); if (IS_ERR(kn)) { if (PTR_ERR(kn) == -EEXIST) sysfs_warn_dup(parent, attr->name); @@ -340,9 +352,7 @@ int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr, return -EINVAL; kobject_get_ownership(kobj, &uid, &gid); - return sysfs_add_file_mode_ns(kobj->sd, attr, false, attr->mode, - uid, gid, ns); - + return sysfs_add_file_mode_ns(kobj->sd, attr, attr->mode, uid, gid, ns); } EXPORT_SYMBOL_GPL(sysfs_create_file_ns); @@ -385,8 +395,8 @@ int sysfs_add_file_to_group(struct kobject *kobj, return -ENOENT; kobject_get_ownership(kobj, &uid, &gid); - error = sysfs_add_file_mode_ns(parent, attr, false, - attr->mode, uid, gid, NULL); + error = sysfs_add_file_mode_ns(parent, attr, attr->mode, uid, gid, + NULL); kernfs_put(parent); return error; @@ -555,8 +565,8 @@ int sysfs_create_bin_file(struct kobject *kobj, return -EINVAL; kobject_get_ownership(kobj, &uid, &gid); - return sysfs_add_file_mode_ns(kobj->sd, &attr->attr, true, - attr->attr.mode, uid, gid, NULL); + return sysfs_add_bin_file_mode_ns(kobj->sd, attr, attr->attr.mode, uid, + gid, NULL); } EXPORT_SYMBOL_GPL(sysfs_create_bin_file); diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index f29d62004527..eeb0e3099421 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -61,8 +61,8 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj, (*attr)->name, mode); mode &= SYSFS_PREALLOC | 0664; - error = sysfs_add_file_mode_ns(parent, *attr, false, - mode, uid, gid, NULL); + error = sysfs_add_file_mode_ns(parent, *attr, mode, uid, + gid, NULL); if (unlikely(error)) break; } @@ -90,10 +90,9 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj, (*bin_attr)->attr.name, mode); mode &= SYSFS_PREALLOC | 0664; - error = sysfs_add_file_mode_ns(parent, - &(*bin_attr)->attr, true, - mode, - uid, gid, NULL); + error = sysfs_add_bin_file_mode_ns(parent, *bin_attr, + mode, uid, gid, + NULL); if (error) break; } @@ -340,8 +339,8 @@ int sysfs_merge_group(struct kobject *kobj, kobject_get_ownership(kobj, &uid, &gid); for ((i = 0, attr = grp->attrs); *attr && !error; (++i, ++attr)) - error = sysfs_add_file_mode_ns(parent, *attr, false, - (*attr)->mode, uid, gid, NULL); + error = sysfs_add_file_mode_ns(parent, *attr, (*attr)->mode, + uid, gid, NULL); if (error) { while (--i >= 0) kernfs_remove_by_name(parent, (*--attr)->name); diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 0050cc0c0236..3f28c9af5756 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -28,9 +28,11 @@ void sysfs_warn_dup(struct kernfs_node *parent, const char *name); * file.c */ int sysfs_add_file_mode_ns(struct kernfs_node *parent, - const struct attribute *attr, bool is_bin, - umode_t amode, kuid_t uid, kgid_t gid, - const void *ns); + const struct attribute *attr, umode_t amode, kuid_t uid, + kgid_t gid, const void *ns); +int sysfs_add_bin_file_mode_ns(struct kernfs_node *parent, + const struct bin_attribute *battr, umode_t mode, + kuid_t uid, kgid_t gid, const void *ns); /* * symlink.c diff --git a/fs/sysv/super.c b/fs/sysv/super.c index cc8e2ed155c8..d1def0771a40 100644 --- a/fs/sysv/super.c +++ b/fs/sysv/super.c @@ -474,10 +474,8 @@ static int v7_fill_super(struct super_block *sb, void *data, int silent) struct sysv_sb_info *sbi; struct buffer_head *bh; - if (440 != sizeof (struct v7_super_block)) - panic("V7 FS: bad super-block size"); - if (64 != sizeof (struct sysv_inode)) - panic("sysv fs: bad i-node size"); + BUILD_BUG_ON(sizeof(struct v7_super_block) != 440); + BUILD_BUG_ON(sizeof(struct sysv_inode) != 64); sbi = kzalloc(sizeof(struct sysv_sb_info), GFP_KERNEL); if (!sbi) diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 1261e8b41edb..925a621b432e 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -432,7 +432,8 @@ static struct dentry *__create_dir(const char *name, struct dentry *parent, if (unlikely(!inode)) return failed_creating(dentry); - inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; + /* Do not set bits for OTH */ + inode->i_mode = S_IFDIR | S_IRWXU | S_IRUSR| S_IRGRP | S_IXUSR | S_IXGRP; inode->i_op = ops; inode->i_fop = &simple_dir_operations; diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c index 22be7aeb96c4..c57b46a352d8 100644 --- a/fs/ubifs/crypto.c +++ b/fs/ubifs/crypto.c @@ -82,5 +82,4 @@ const struct fscrypt_operations ubifs_crypt_operations = { .get_context = ubifs_crypt_get_context, .set_context = ubifs_crypt_set_context, .empty_dir = ubifs_crypt_empty_dir, - .max_namelen = UBIFS_MAX_NLEN, }; diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c index f1094cdcd6cd..46d697172197 100644 --- a/fs/udf/lowlevel.c +++ b/fs/udf/lowlevel.c @@ -47,8 +47,7 @@ unsigned int udf_get_last_session(struct super_block *sb) unsigned long udf_get_last_block(struct super_block *sb) { - struct block_device *bdev = sb->s_bdev; - struct cdrom_device_info *cdi = disk_to_cdi(bdev->bd_disk); + struct cdrom_device_info *cdi = disk_to_cdi(sb->s_bdev->bd_disk); unsigned long lblock = 0; /* @@ -56,7 +55,7 @@ unsigned long udf_get_last_block(struct super_block *sb) * Try using the device size... */ if (!cdi || cdrom_get_last_written(cdi, &lblock) || lblock == 0) - lblock = i_size_read(bdev->bd_inode) >> sb->s_blocksize_bits; + lblock = sb_bdev_nr_blocks(sb); if (lblock) return lblock - 1; diff --git a/fs/udf/super.c b/fs/udf/super.c index b2d7c57d0688..34247fba6df9 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -1175,8 +1175,7 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index) struct udf_inode_info *vati; uint32_t pos; struct virtualAllocationTable20 *vat20; - sector_t blocks = i_size_read(sb->s_bdev->bd_inode) >> - sb->s_blocksize_bits; + sector_t blocks = sb_bdev_nr_blocks(sb); udf_find_vat_block(sb, p_index, type1_index, sbi->s_last_block); if (!sbi->s_vat_inode && @@ -1838,8 +1837,7 @@ static int udf_check_anchor_block(struct super_block *sb, sector_t block, int ret; if (UDF_QUERY_FLAG(sb, UDF_FLAG_VARCONV) && - udf_fixed_to_variable(block) >= - i_size_read(sb->s_bdev->bd_inode) >> sb->s_blocksize_bits) + udf_fixed_to_variable(block) >= sb_bdev_nr_blocks(sb)) return -EAGAIN; bh = udf_read_tagged(sb, block, block, &ident); @@ -1901,8 +1899,7 @@ static int udf_scan_anchors(struct super_block *sb, sector_t *lastblock, last[last_count++] = *lastblock - 152; for (i = 0; i < last_count; i++) { - if (last[i] >= i_size_read(sb->s_bdev->bd_inode) >> - sb->s_blocksize_bits) + if (last[i] >= sb_bdev_nr_blocks(sb)) continue; ret = udf_check_anchor_block(sb, last[i], fileset); if (ret != -EAGAIN) { diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 003f0d31743e..22bf14ab2d16 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1827,9 +1827,15 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx, if (mode_wp && mode_dontwake) return -EINVAL; - ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start, - uffdio_wp.range.len, mode_wp, - &ctx->mmap_changing); + if (mmget_not_zero(ctx->mm)) { + ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start, + uffdio_wp.range.len, mode_wp, + &ctx->mmap_changing); + mmput(ctx->mm); + } else { + return -ESRCH; + } + if (ret) return ret; diff --git a/fs/vboxsf/super.c b/fs/vboxsf/super.c index 4f5e59f06284..37dd3fe5b1e9 100644 --- a/fs/vboxsf/super.c +++ b/fs/vboxsf/super.c @@ -21,10 +21,7 @@ #define VBOXSF_SUPER_MAGIC 0x786f4256 /* 'VBox' little endian */ -#define VBSF_MOUNT_SIGNATURE_BYTE_0 ('\000') -#define VBSF_MOUNT_SIGNATURE_BYTE_1 ('\377') -#define VBSF_MOUNT_SIGNATURE_BYTE_2 ('\376') -#define VBSF_MOUNT_SIGNATURE_BYTE_3 ('\375') +static const unsigned char VBSF_MOUNT_SIGNATURE[4] = "\000\377\376\375"; static int follow_symlinks; module_param(follow_symlinks, int, 0444); @@ -386,12 +383,7 @@ fail_nomem: static int vboxsf_parse_monolithic(struct fs_context *fc, void *data) { - unsigned char *options = data; - - if (options && options[0] == VBSF_MOUNT_SIGNATURE_BYTE_0 && - options[1] == VBSF_MOUNT_SIGNATURE_BYTE_1 && - options[2] == VBSF_MOUNT_SIGNATURE_BYTE_2 && - options[3] == VBSF_MOUNT_SIGNATURE_BYTE_3) { + if (data && !memcmp(data, VBSF_MOUNT_SIGNATURE, 4)) { vbg_err("vboxsf: Old binary mount data not supported, remove obsolete mount.vboxsf and/or update your VBoxService.\n"); return -EINVAL; } diff --git a/fs/verity/enable.c b/fs/verity/enable.c index 77e159a0346b..60a4372aa4d7 100644 --- a/fs/verity/enable.c +++ b/fs/verity/enable.c @@ -177,7 +177,7 @@ static int build_merkle_tree(struct file *filp, * (level 0) and ascending to the root node (level 'num_levels - 1'). * Then at the end (level 'num_levels'), calculate the root hash. */ - blocks = (inode->i_size + params->block_size - 1) >> + blocks = ((u64)inode->i_size + params->block_size - 1) >> params->log_blocksize; for (level = 0; level <= params->num_levels; level++) { err = build_merkle_tree_level(filp, level, blocks, params, diff --git a/fs/verity/open.c b/fs/verity/open.c index 60ff8af7219f..92df87f5fa38 100644 --- a/fs/verity/open.c +++ b/fs/verity/open.c @@ -89,7 +89,7 @@ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params, */ /* Compute number of levels and the number of blocks in each level */ - blocks = (inode->i_size + params->block_size - 1) >> log_blocksize; + blocks = ((u64)inode->i_size + params->block_size - 1) >> log_blocksize; pr_debug("Data is %lld bytes (%llu blocks)\n", inode->i_size, blocks); while (blocks > 1) { if (params->num_levels >= FS_VERITY_MAX_LEVELS) { diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index 54da6d717a06..b987dc2c6851 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -72,10 +72,6 @@ kmem_zalloc(size_t size, xfs_km_flags_t flags) /* * Zone interfaces */ - -#define kmem_zone kmem_cache -#define kmem_zone_t struct kmem_cache - static inline struct page * kmem_to_page(void *addr) { diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index 005abfd9fd34..d7d875cef07a 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -850,7 +850,7 @@ xfs_ag_shrink_space( if (err2 != -ENOSPC) goto resv_err; - __xfs_bmap_add_free(*tpp, args.fsbno, delta, NULL, true); + __xfs_free_extent_later(*tpp, args.fsbno, delta, NULL, true); /* * Roll the transaction before trying to re-init the per-ag diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h index 4c6f9045baca..3f597cad2c33 100644 --- a/fs/xfs/libxfs/xfs_ag.h +++ b/fs/xfs/libxfs/xfs_ag.h @@ -116,23 +116,29 @@ void xfs_perag_put(struct xfs_perag *pag); /* * Perag iteration APIs - * - * XXX: for_each_perag_range() usage really needs an iterator to clean up when - * we terminate at end_agno because we may have taken a reference to the perag - * beyond end_agno. Right now callers have to be careful to catch and clean that - * up themselves. This is not necessary for the callers of for_each_perag() and - * for_each_perag_from() because they terminate at sb_agcount where there are - * no perag structures in tree beyond end_agno. */ -#define for_each_perag_range(mp, next_agno, end_agno, pag) \ - for ((pag) = xfs_perag_get((mp), (next_agno)); \ - (pag) != NULL && (next_agno) <= (end_agno); \ - (next_agno) = (pag)->pag_agno + 1, \ - xfs_perag_put(pag), \ - (pag) = xfs_perag_get((mp), (next_agno))) +static inline struct xfs_perag * +xfs_perag_next( + struct xfs_perag *pag, + xfs_agnumber_t *agno, + xfs_agnumber_t end_agno) +{ + struct xfs_mount *mp = pag->pag_mount; + + *agno = pag->pag_agno + 1; + xfs_perag_put(pag); + if (*agno > end_agno) + return NULL; + return xfs_perag_get(mp, *agno); +} + +#define for_each_perag_range(mp, agno, end_agno, pag) \ + for ((pag) = xfs_perag_get((mp), (agno)); \ + (pag) != NULL; \ + (pag) = xfs_perag_next((pag), &(agno), (end_agno))) -#define for_each_perag_from(mp, next_agno, pag) \ - for_each_perag_range((mp), (next_agno), (mp)->m_sb.sb_agcount, (pag)) +#define for_each_perag_from(mp, agno, pag) \ + for_each_perag_range((mp), (agno), (mp)->m_sb.sb_agcount - 1, (pag)) #define for_each_perag(mp, agno, pag) \ diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index 2aa2b3484c28..fe94058d4e9e 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -91,7 +91,8 @@ xfs_ag_resv_critical( trace_xfs_ag_resv_critical(pag, type, avail); /* Critically low if less than 10% or max btree height remains. */ - return XFS_TEST_ERROR(avail < orig / 10 || avail < XFS_BTREE_MAXLEVELS, + return XFS_TEST_ERROR(avail < orig / 10 || + avail < pag->pag_mount->m_agbtree_maxlevels, pag->pag_mount, XFS_ERRTAG_AG_RESV_CRITICAL); } diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 95157f5a5a6c..353e53b892e6 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -27,7 +27,7 @@ #include "xfs_ag_resv.h" #include "xfs_bmap.h" -extern kmem_zone_t *xfs_bmap_free_item_zone; +struct kmem_cache *xfs_extfree_item_cache; struct workqueue_struct *xfs_alloc_wq; @@ -426,8 +426,8 @@ xfs_alloc_fix_len( */ STATIC int /* error code */ xfs_alloc_fixup_trees( - xfs_btree_cur_t *cnt_cur, /* cursor for by-size btree */ - xfs_btree_cur_t *bno_cur, /* cursor for by-block btree */ + struct xfs_btree_cur *cnt_cur, /* cursor for by-size btree */ + struct xfs_btree_cur *bno_cur, /* cursor for by-block btree */ xfs_agblock_t fbno, /* starting block of free extent */ xfs_extlen_t flen, /* length of free extent */ xfs_agblock_t rbno, /* starting block of returned extent */ @@ -488,8 +488,8 @@ xfs_alloc_fixup_trees( struct xfs_btree_block *bnoblock; struct xfs_btree_block *cntblock; - bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]); - cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]); + bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_levels[0].bp); + cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_levels[0].bp); if (XFS_IS_CORRUPT(mp, bnoblock->bb_numrecs != @@ -1200,8 +1200,8 @@ xfs_alloc_ag_vextent_exact( xfs_alloc_arg_t *args) /* allocation argument structure */ { struct xfs_agf __maybe_unused *agf = args->agbp->b_addr; - xfs_btree_cur_t *bno_cur;/* by block-number btree cursor */ - xfs_btree_cur_t *cnt_cur;/* by count btree cursor */ + struct xfs_btree_cur *bno_cur;/* by block-number btree cursor */ + struct xfs_btree_cur *cnt_cur;/* by count btree cursor */ int error; xfs_agblock_t fbno; /* start block of found extent */ xfs_extlen_t flen; /* length of found extent */ @@ -1512,7 +1512,7 @@ xfs_alloc_ag_vextent_lastblock( * than minlen. */ if (*len || args->alignment > 1) { - acur->cnt->bc_ptrs[0] = 1; + acur->cnt->bc_levels[0].ptr = 1; do { error = xfs_alloc_get_rec(acur->cnt, bno, len, &i); if (error) @@ -1658,8 +1658,8 @@ xfs_alloc_ag_vextent_size( xfs_alloc_arg_t *args) /* allocation argument structure */ { struct xfs_agf *agf = args->agbp->b_addr; - xfs_btree_cur_t *bno_cur; /* cursor for bno btree */ - xfs_btree_cur_t *cnt_cur; /* cursor for cnt btree */ + struct xfs_btree_cur *bno_cur; /* cursor for bno btree */ + struct xfs_btree_cur *cnt_cur; /* cursor for cnt btree */ int error; /* error result */ xfs_agblock_t fbno; /* start of found freespace */ xfs_extlen_t flen; /* length of found freespace */ @@ -2190,14 +2190,15 @@ xfs_free_ag_extent( */ /* - * Compute and fill in value of m_ag_maxlevels. + * Compute and fill in value of m_alloc_maxlevels. */ void xfs_alloc_compute_maxlevels( xfs_mount_t *mp) /* file system mount structure */ { - mp->m_ag_maxlevels = xfs_btree_compute_maxlevels(mp->m_alloc_mnr, + mp->m_alloc_maxlevels = xfs_btree_compute_maxlevels(mp->m_alloc_mnr, (mp->m_sb.sb_agblocks + 1) / 2); + ASSERT(mp->m_alloc_maxlevels <= xfs_allocbt_maxlevels_ondisk()); } /* @@ -2255,14 +2256,14 @@ xfs_alloc_min_freelist( const uint8_t *levels = pag ? pag->pagf_levels : fake_levels; unsigned int min_free; - ASSERT(mp->m_ag_maxlevels > 0); + ASSERT(mp->m_alloc_maxlevels > 0); /* space needed by-bno freespace btree */ min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1, - mp->m_ag_maxlevels); + mp->m_alloc_maxlevels); /* space needed by-size freespace btree */ min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1, - mp->m_ag_maxlevels); + mp->m_alloc_maxlevels); /* space needed reverse mapping used space btree */ if (xfs_has_rmapbt(mp)) min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1, @@ -2439,7 +2440,7 @@ xfs_agfl_reset( /* * Defer an AGFL block free. This is effectively equivalent to - * xfs_bmap_add_free() with some special handling particular to AGFL blocks. + * xfs_free_extent_later() with some special handling particular to AGFL blocks. * * Deferring AGFL frees helps prevent log reservation overruns due to too many * allocation operations in a transaction. AGFL frees are prone to this problem @@ -2458,21 +2459,74 @@ xfs_defer_agfl_block( struct xfs_mount *mp = tp->t_mountp; struct xfs_extent_free_item *new; /* new element */ - ASSERT(xfs_bmap_free_item_zone != NULL); + ASSERT(xfs_extfree_item_cache != NULL); ASSERT(oinfo != NULL); - new = kmem_cache_alloc(xfs_bmap_free_item_zone, + new = kmem_cache_zalloc(xfs_extfree_item_cache, GFP_KERNEL | __GFP_NOFAIL); new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno); new->xefi_blockcount = 1; - new->xefi_oinfo = *oinfo; - new->xefi_skip_discard = false; + new->xefi_owner = oinfo->oi_owner; trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1); xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &new->xefi_list); } +/* + * Add the extent to the list of extents to be free at transaction end. + * The list is maintained sorted (by block number). + */ +void +__xfs_free_extent_later( + struct xfs_trans *tp, + xfs_fsblock_t bno, + xfs_filblks_t len, + const struct xfs_owner_info *oinfo, + bool skip_discard) +{ + struct xfs_extent_free_item *new; /* new element */ +#ifdef DEBUG + struct xfs_mount *mp = tp->t_mountp; + xfs_agnumber_t agno; + xfs_agblock_t agbno; + + ASSERT(bno != NULLFSBLOCK); + ASSERT(len > 0); + ASSERT(len <= MAXEXTLEN); + ASSERT(!isnullstartblock(bno)); + agno = XFS_FSB_TO_AGNO(mp, bno); + agbno = XFS_FSB_TO_AGBNO(mp, bno); + ASSERT(agno < mp->m_sb.sb_agcount); + ASSERT(agbno < mp->m_sb.sb_agblocks); + ASSERT(len < mp->m_sb.sb_agblocks); + ASSERT(agbno + len <= mp->m_sb.sb_agblocks); +#endif + ASSERT(xfs_extfree_item_cache != NULL); + + new = kmem_cache_zalloc(xfs_extfree_item_cache, + GFP_KERNEL | __GFP_NOFAIL); + new->xefi_startblock = bno; + new->xefi_blockcount = (xfs_extlen_t)len; + if (skip_discard) + new->xefi_flags |= XFS_EFI_SKIP_DISCARD; + if (oinfo) { + ASSERT(oinfo->oi_offset == 0); + + if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK) + new->xefi_flags |= XFS_EFI_ATTR_FORK; + if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK) + new->xefi_flags |= XFS_EFI_BMBT_BLOCK; + new->xefi_owner = oinfo->oi_owner; + } else { + new->xefi_owner = XFS_RMAP_OWN_NULL; + } + trace_xfs_bmap_free_defer(tp->t_mountp, + XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0, + XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len); + xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &new->xefi_list); +} + #ifdef DEBUG /* * Check if an AGF has a free extent record whose length is equal to @@ -2903,13 +2957,16 @@ xfs_agf_verify( if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 || be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 || - be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > mp->m_ag_maxlevels || - be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > mp->m_ag_maxlevels) + be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > + mp->m_alloc_maxlevels || + be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > + mp->m_alloc_maxlevels) return __this_address; if (xfs_has_rmapbt(mp) && (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 || - be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > mp->m_rmap_maxlevels)) + be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > + mp->m_rmap_maxlevels)) return __this_address; if (xfs_has_rmapbt(mp) && @@ -3495,3 +3552,20 @@ xfs_agfl_walk( return 0; } + +int __init +xfs_extfree_intent_init_cache(void) +{ + xfs_extfree_item_cache = kmem_cache_create("xfs_extfree_intent", + sizeof(struct xfs_extent_free_item), + 0, 0, NULL); + + return xfs_extfree_item_cache != NULL ? 0 : -ENOMEM; +} + +void +xfs_extfree_intent_destroy_cache(void) +{ + kmem_cache_destroy(xfs_extfree_item_cache); + xfs_extfree_item_cache = NULL; +} diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index df4aefaf0046..1c14a0b1abea 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -98,7 +98,7 @@ unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp, struct xfs_perag *pag); /* - * Compute and fill in value of m_ag_maxlevels. + * Compute and fill in value of m_alloc_maxlevels. */ void xfs_alloc_compute_maxlevels( @@ -248,4 +248,40 @@ xfs_buf_to_agfl_bno( return bp->b_addr; } +void __xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno, + xfs_filblks_t len, const struct xfs_owner_info *oinfo, + bool skip_discard); + +/* + * List of extents to be free "later". + * The list is kept sorted on xbf_startblock. + */ +struct xfs_extent_free_item { + struct list_head xefi_list; + uint64_t xefi_owner; + xfs_fsblock_t xefi_startblock;/* starting fs block number */ + xfs_extlen_t xefi_blockcount;/* number of blocks in extent */ + unsigned int xefi_flags; +}; + +#define XFS_EFI_SKIP_DISCARD (1U << 0) /* don't issue discard */ +#define XFS_EFI_ATTR_FORK (1U << 1) /* freeing attr fork block */ +#define XFS_EFI_BMBT_BLOCK (1U << 2) /* freeing bmap btree block */ + +static inline void +xfs_free_extent_later( + struct xfs_trans *tp, + xfs_fsblock_t bno, + xfs_filblks_t len, + const struct xfs_owner_info *oinfo) +{ + __xfs_free_extent_later(tp, bno, len, oinfo, false); +} + + +extern struct kmem_cache *xfs_extfree_item_cache; + +int __init xfs_extfree_intent_init_cache(void); +void xfs_extfree_intent_destroy_cache(void); + #endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 6746fd735550..8c9f73cc0bee 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -20,6 +20,7 @@ #include "xfs_trans.h" #include "xfs_ag.h" +static struct kmem_cache *xfs_allocbt_cur_cache; STATIC struct xfs_btree_cur * xfs_allocbt_dup_cursor( @@ -316,7 +317,7 @@ xfs_allocbt_verify( if (pag && pag->pagf_init) { if (level >= pag->pagf_levels[btnum]) return __this_address; - } else if (level >= mp->m_ag_maxlevels) + } else if (level >= mp->m_alloc_maxlevels) return __this_address; return xfs_btree_sblock_verify(bp, mp->m_alloc_mxr[level != 0]); @@ -477,12 +478,8 @@ xfs_allocbt_init_common( ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT); - cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL); - - cur->bc_tp = tp; - cur->bc_mp = mp; - cur->bc_btnum = btnum; - cur->bc_blocklog = mp->m_sb.sb_blocklog; + cur = xfs_btree_alloc_cursor(mp, tp, btnum, mp->m_alloc_maxlevels, + xfs_allocbt_cur_cache); cur->bc_ag.abt.active = false; if (btnum == XFS_BTNUM_CNT) { @@ -571,6 +568,17 @@ xfs_allocbt_commit_staged_btree( } } +/* Calculate number of records in an alloc btree block. */ +static inline unsigned int +xfs_allocbt_block_maxrecs( + unsigned int blocklen, + bool leaf) +{ + if (leaf) + return blocklen / sizeof(xfs_alloc_rec_t); + return blocklen / (sizeof(xfs_alloc_key_t) + sizeof(xfs_alloc_ptr_t)); +} + /* * Calculate number of records in an alloc btree block. */ @@ -581,10 +589,26 @@ xfs_allocbt_maxrecs( int leaf) { blocklen -= XFS_ALLOC_BLOCK_LEN(mp); + return xfs_allocbt_block_maxrecs(blocklen, leaf); +} - if (leaf) - return blocklen / sizeof(xfs_alloc_rec_t); - return blocklen / (sizeof(xfs_alloc_key_t) + sizeof(xfs_alloc_ptr_t)); +/* Free space btrees are at their largest when every other block is free. */ +#define XFS_MAX_FREESP_RECORDS ((XFS_MAX_AG_BLOCKS + 1) / 2) + +/* Compute the max possible height for free space btrees. */ +unsigned int +xfs_allocbt_maxlevels_ondisk(void) +{ + unsigned int minrecs[2]; + unsigned int blocklen; + + blocklen = min(XFS_MIN_BLOCKSIZE - XFS_BTREE_SBLOCK_LEN, + XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN); + + minrecs[0] = xfs_allocbt_block_maxrecs(blocklen, true) / 2; + minrecs[1] = xfs_allocbt_block_maxrecs(blocklen, false) / 2; + + return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_FREESP_RECORDS); } /* Calculate the freespace btree size for some records. */ @@ -595,3 +619,22 @@ xfs_allocbt_calc_size( { return xfs_btree_calc_size(mp->m_alloc_mnr, len); } + +int __init +xfs_allocbt_init_cur_cache(void) +{ + xfs_allocbt_cur_cache = kmem_cache_create("xfs_bnobt_cur", + xfs_btree_cur_sizeof(xfs_allocbt_maxlevels_ondisk()), + 0, 0, NULL); + + if (!xfs_allocbt_cur_cache) + return -ENOMEM; + return 0; +} + +void +xfs_allocbt_destroy_cur_cache(void) +{ + kmem_cache_destroy(xfs_allocbt_cur_cache); + xfs_allocbt_cur_cache = NULL; +} diff --git a/fs/xfs/libxfs/xfs_alloc_btree.h b/fs/xfs/libxfs/xfs_alloc_btree.h index 2f6b816aaf9f..45df893ef6bb 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.h +++ b/fs/xfs/libxfs/xfs_alloc_btree.h @@ -60,4 +60,9 @@ extern xfs_extlen_t xfs_allocbt_calc_size(struct xfs_mount *mp, void xfs_allocbt_commit_staged_btree(struct xfs_btree_cur *cur, struct xfs_trans *tp, struct xfs_buf *agbp); +unsigned int xfs_allocbt_maxlevels_ondisk(void); + +int __init xfs_allocbt_init_cur_cache(void); +void xfs_allocbt_destroy_cur_cache(void); + #endif /* __XFS_ALLOC_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index e1d11e314228..014daa8c542d 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -770,7 +770,7 @@ xfs_attr_fork_remove( ASSERT(ip->i_afp->if_nextents == 0); xfs_idestroy_fork(ip->i_afp); - kmem_cache_free(xfs_ifork_zone, ip->i_afp); + kmem_cache_free(xfs_ifork_cache, ip->i_afp); ip->i_afp = NULL; ip->i_forkoff = 0; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index b48230f1a361..4dccd4d90622 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -37,8 +37,7 @@ #include "xfs_icache.h" #include "xfs_iomap.h" - -kmem_zone_t *xfs_bmap_free_item_zone; +struct kmem_cache *xfs_bmap_intent_cache; /* * Miscellaneous helper functions @@ -93,6 +92,7 @@ xfs_bmap_compute_maxlevels( maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs; } mp->m_bm_maxlevels[whichfork] = level; + ASSERT(mp->m_bm_maxlevels[whichfork] <= xfs_bmbt_maxlevels_ondisk()); } unsigned int @@ -239,11 +239,11 @@ xfs_bmap_get_bp( if (!cur) return NULL; - for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) { - if (!cur->bc_bufs[i]) + for (i = 0; i < cur->bc_maxlevels; i++) { + if (!cur->bc_levels[i].bp) break; - if (xfs_buf_daddr(cur->bc_bufs[i]) == bno) - return cur->bc_bufs[i]; + if (xfs_buf_daddr(cur->bc_levels[i].bp) == bno) + return cur->bc_levels[i].bp; } /* Chase down all the log items to see if the bp is there */ @@ -316,7 +316,7 @@ xfs_check_block( */ STATIC void xfs_bmap_check_leaf_extents( - xfs_btree_cur_t *cur, /* btree cursor or null */ + struct xfs_btree_cur *cur, /* btree cursor or null */ xfs_inode_t *ip, /* incore inode pointer */ int whichfork) /* data or attr fork */ { @@ -522,56 +522,6 @@ xfs_bmap_validate_ret( #endif /* DEBUG */ /* - * bmap free list manipulation functions - */ - -/* - * Add the extent to the list of extents to be free at transaction end. - * The list is maintained sorted (by block number). - */ -void -__xfs_bmap_add_free( - struct xfs_trans *tp, - xfs_fsblock_t bno, - xfs_filblks_t len, - const struct xfs_owner_info *oinfo, - bool skip_discard) -{ - struct xfs_extent_free_item *new; /* new element */ -#ifdef DEBUG - struct xfs_mount *mp = tp->t_mountp; - xfs_agnumber_t agno; - xfs_agblock_t agbno; - - ASSERT(bno != NULLFSBLOCK); - ASSERT(len > 0); - ASSERT(len <= MAXEXTLEN); - ASSERT(!isnullstartblock(bno)); - agno = XFS_FSB_TO_AGNO(mp, bno); - agbno = XFS_FSB_TO_AGBNO(mp, bno); - ASSERT(agno < mp->m_sb.sb_agcount); - ASSERT(agbno < mp->m_sb.sb_agblocks); - ASSERT(len < mp->m_sb.sb_agblocks); - ASSERT(agbno + len <= mp->m_sb.sb_agblocks); -#endif - ASSERT(xfs_bmap_free_item_zone != NULL); - - new = kmem_cache_alloc(xfs_bmap_free_item_zone, - GFP_KERNEL | __GFP_NOFAIL); - new->xefi_startblock = bno; - new->xefi_blockcount = (xfs_extlen_t)len; - if (oinfo) - new->xefi_oinfo = *oinfo; - else - new->xefi_oinfo = XFS_RMAP_OINFO_SKIP_UPDATE; - new->xefi_skip_discard = skip_discard; - trace_xfs_bmap_free_defer(tp->t_mountp, - XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0, - XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len); - xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &new->xefi_list); -} - -/* * Inode fork format manipulation functions */ @@ -625,12 +575,12 @@ xfs_bmap_btree_to_extents( if ((error = xfs_btree_check_block(cur, cblock, 0, cbp))) return error; xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork); - xfs_bmap_add_free(cur->bc_tp, cbno, 1, &oinfo); + xfs_free_extent_later(cur->bc_tp, cbno, 1, &oinfo); ip->i_nblocks--; xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L); xfs_trans_binval(tp, cbp); - if (cur->bc_bufs[0] == cbp) - cur->bc_bufs[0] = NULL; + if (cur->bc_levels[0].bp == cbp) + cur->bc_levels[0].bp = NULL; xfs_iroot_realloc(ip, -1, whichfork); ASSERT(ifp->if_broot == NULL); ifp->if_format = XFS_DINODE_FMT_EXTENTS; @@ -925,7 +875,7 @@ xfs_bmap_add_attrfork_btree( int *flags) /* inode logging flags */ { struct xfs_btree_block *block = ip->i_df.if_broot; - xfs_btree_cur_t *cur; /* btree cursor */ + struct xfs_btree_cur *cur; /* btree cursor */ int error; /* error return value */ xfs_mount_t *mp; /* file system mount struct */ int stat; /* newroot status */ @@ -968,7 +918,7 @@ xfs_bmap_add_attrfork_extents( struct xfs_inode *ip, /* incore inode pointer */ int *flags) /* inode logging flags */ { - xfs_btree_cur_t *cur; /* bmap btree cursor */ + struct xfs_btree_cur *cur; /* bmap btree cursor */ int error; /* error return value */ if (ip->i_df.if_nextents * sizeof(struct xfs_bmbt_rec) <= @@ -1988,11 +1938,11 @@ xfs_bmap_add_extent_unwritten_real( xfs_inode_t *ip, /* incore inode pointer */ int whichfork, struct xfs_iext_cursor *icur, - xfs_btree_cur_t **curp, /* if *curp is null, not a btree */ + struct xfs_btree_cur **curp, /* if *curp is null, not a btree */ xfs_bmbt_irec_t *new, /* new data to add to file extents */ int *logflagsp) /* inode logging flags */ { - xfs_btree_cur_t *cur; /* btree cursor */ + struct xfs_btree_cur *cur; /* btree cursor */ int error; /* error return value */ int i; /* temp state */ struct xfs_ifork *ifp; /* inode fork pointer */ @@ -5045,7 +4995,7 @@ xfs_bmap_del_extent_real( xfs_inode_t *ip, /* incore inode pointer */ xfs_trans_t *tp, /* current transaction pointer */ struct xfs_iext_cursor *icur, - xfs_btree_cur_t *cur, /* if null, not a btree */ + struct xfs_btree_cur *cur, /* if null, not a btree */ xfs_bmbt_irec_t *del, /* data to remove from extents */ int *logflagsp, /* inode logging flags */ int whichfork, /* data or attr fork */ @@ -5296,7 +5246,7 @@ xfs_bmap_del_extent_real( if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) { xfs_refcount_decrease_extent(tp, del); } else { - __xfs_bmap_add_free(tp, del->br_startblock, + __xfs_free_extent_later(tp, del->br_startblock, del->br_blockcount, NULL, (bflags & XFS_BMAPI_NODISCARD) || del->br_state == XFS_EXT_UNWRITTEN); @@ -6189,7 +6139,7 @@ __xfs_bmap_add( bmap->br_blockcount, bmap->br_state); - bi = kmem_alloc(sizeof(struct xfs_bmap_intent), KM_NOFS); + bi = kmem_cache_alloc(xfs_bmap_intent_cache, GFP_NOFS | __GFP_NOFAIL); INIT_LIST_HEAD(&bi->bi_list); bi->bi_type = type; bi->bi_owner = ip; @@ -6300,3 +6250,20 @@ xfs_bmap_validate_extent( return __this_address; return NULL; } + +int __init +xfs_bmap_intent_init_cache(void) +{ + xfs_bmap_intent_cache = kmem_cache_create("xfs_bmap_intent", + sizeof(struct xfs_bmap_intent), + 0, 0, NULL); + + return xfs_bmap_intent_cache != NULL ? 0 : -ENOMEM; +} + +void +xfs_bmap_intent_destroy_cache(void) +{ + kmem_cache_destroy(xfs_bmap_intent_cache); + xfs_bmap_intent_cache = NULL; +} diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 67641f669918..03d9aaf87413 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -13,8 +13,6 @@ struct xfs_inode; struct xfs_mount; struct xfs_trans; -extern kmem_zone_t *xfs_bmap_free_item_zone; - /* * Argument structure for xfs_bmap_alloc. */ @@ -44,19 +42,6 @@ struct xfs_bmalloca { int flags; }; -/* - * List of extents to be free "later". - * The list is kept sorted on xbf_startblock. - */ -struct xfs_extent_free_item -{ - xfs_fsblock_t xefi_startblock;/* starting fs block number */ - xfs_extlen_t xefi_blockcount;/* number of blocks in extent */ - bool xefi_skip_discard; - struct list_head xefi_list; - struct xfs_owner_info xefi_oinfo; /* extent owner */ -}; - #define XFS_BMAP_MAX_NMAP 4 /* @@ -189,9 +174,6 @@ unsigned int xfs_bmap_compute_attr_offset(struct xfs_mount *mp); int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); void xfs_bmap_local_to_extents_empty(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork); -void __xfs_bmap_add_free(struct xfs_trans *tp, xfs_fsblock_t bno, - xfs_filblks_t len, const struct xfs_owner_info *oinfo, - bool skip_discard); void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork); int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip, xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork); @@ -239,16 +221,6 @@ int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp, struct xfs_iext_cursor *icur, struct xfs_btree_cur **curp, struct xfs_bmbt_irec *new, int *logflagsp); -static inline void -xfs_bmap_add_free( - struct xfs_trans *tp, - xfs_fsblock_t bno, - xfs_filblks_t len, - const struct xfs_owner_info *oinfo) -{ - __xfs_bmap_add_free(tp, bno, len, oinfo, false); -} - enum xfs_bmap_intent_type { XFS_BMAP_MAP = 1, XFS_BMAP_UNMAP, @@ -257,8 +229,8 @@ enum xfs_bmap_intent_type { struct xfs_bmap_intent { struct list_head bi_list; enum xfs_bmap_intent_type bi_type; - struct xfs_inode *bi_owner; int bi_whichfork; + struct xfs_inode *bi_owner; struct xfs_bmbt_irec bi_bmap; }; @@ -290,4 +262,9 @@ int xfs_bmapi_remap(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock, int flags); +extern struct kmem_cache *xfs_bmap_intent_cache; + +int __init xfs_bmap_intent_init_cache(void); +void xfs_bmap_intent_destroy_cache(void); + #endif /* __XFS_BMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 72444b8b38a6..453309fc85f2 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -22,6 +22,8 @@ #include "xfs_trace.h" #include "xfs_rmap.h" +static struct kmem_cache *xfs_bmbt_cur_cache; + /* * Convert on-disk form of btree root to in-memory form. */ @@ -286,7 +288,7 @@ xfs_bmbt_free_block( struct xfs_owner_info oinfo; xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork); - xfs_bmap_add_free(cur->bc_tp, fsbno, 1, &oinfo); + xfs_free_extent_later(cur->bc_tp, fsbno, 1, &oinfo); ip->i_nblocks--; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); @@ -552,13 +554,9 @@ xfs_bmbt_init_cursor( struct xfs_btree_cur *cur; ASSERT(whichfork != XFS_COW_FORK); - cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL); - - cur->bc_tp = tp; - cur->bc_mp = mp; + cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_BMAP, + mp->m_bm_maxlevels[whichfork], xfs_bmbt_cur_cache); cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1; - cur->bc_btnum = XFS_BTNUM_BMAP; - cur->bc_blocklog = mp->m_sb.sb_blocklog; cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_bmbt_2); cur->bc_ops = &xfs_bmbt_ops; @@ -575,6 +573,17 @@ xfs_bmbt_init_cursor( return cur; } +/* Calculate number of records in a block mapping btree block. */ +static inline unsigned int +xfs_bmbt_block_maxrecs( + unsigned int blocklen, + bool leaf) +{ + if (leaf) + return blocklen / sizeof(xfs_bmbt_rec_t); + return blocklen / (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)); +} + /* * Calculate number of records in a bmap btree block. */ @@ -585,10 +594,24 @@ xfs_bmbt_maxrecs( int leaf) { blocklen -= XFS_BMBT_BLOCK_LEN(mp); + return xfs_bmbt_block_maxrecs(blocklen, leaf); +} - if (leaf) - return blocklen / sizeof(xfs_bmbt_rec_t); - return blocklen / (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t)); +/* Compute the max possible height for block mapping btrees. */ +unsigned int +xfs_bmbt_maxlevels_ondisk(void) +{ + unsigned int minrecs[2]; + unsigned int blocklen; + + blocklen = min(XFS_MIN_BLOCKSIZE - XFS_BTREE_SBLOCK_LEN, + XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN); + + minrecs[0] = xfs_bmbt_block_maxrecs(blocklen, true) / 2; + minrecs[1] = xfs_bmbt_block_maxrecs(blocklen, false) / 2; + + /* One extra level for the inode root. */ + return xfs_btree_compute_maxlevels(minrecs, MAXEXTNUM) + 1; } /* @@ -654,3 +677,22 @@ xfs_bmbt_calc_size( { return xfs_btree_calc_size(mp->m_bmap_dmnr, len); } + +int __init +xfs_bmbt_init_cur_cache(void) +{ + xfs_bmbt_cur_cache = kmem_cache_create("xfs_bmbt_cur", + xfs_btree_cur_sizeof(xfs_bmbt_maxlevels_ondisk()), + 0, 0, NULL); + + if (!xfs_bmbt_cur_cache) + return -ENOMEM; + return 0; +} + +void +xfs_bmbt_destroy_cur_cache(void) +{ + kmem_cache_destroy(xfs_bmbt_cur_cache); + xfs_bmbt_cur_cache = NULL; +} diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h index 729e3bc569be..3e7a40a83835 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.h +++ b/fs/xfs/libxfs/xfs_bmap_btree.h @@ -110,4 +110,9 @@ extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, extern unsigned long long xfs_bmbt_calc_size(struct xfs_mount *mp, unsigned long long len); +unsigned int xfs_bmbt_maxlevels_ondisk(void); + +int __init xfs_bmbt_init_cur_cache(void); +void xfs_bmbt_destroy_cur_cache(void); + #endif /* __XFS_BMAP_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 298395481713..b4e19aacb9de 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -22,11 +22,11 @@ #include "xfs_log.h" #include "xfs_btree_staging.h" #include "xfs_ag.h" - -/* - * Cursor allocation zone. - */ -kmem_zone_t *xfs_btree_cur_zone; +#include "xfs_alloc_btree.h" +#include "xfs_ialloc_btree.h" +#include "xfs_bmap_btree.h" +#include "xfs_rmap_btree.h" +#include "xfs_refcount_btree.h" /* * Btree magic numbers. @@ -367,8 +367,8 @@ xfs_btree_del_cursor( * way we won't have initialized all the entries down to 0. */ for (i = 0; i < cur->bc_nlevels; i++) { - if (cur->bc_bufs[i]) - xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]); + if (cur->bc_levels[i].bp) + xfs_trans_brelse(cur->bc_tp, cur->bc_levels[i].bp); else if (!error) break; } @@ -379,7 +379,7 @@ xfs_btree_del_cursor( kmem_free(cur->bc_ops); if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS) && cur->bc_ag.pag) xfs_perag_put(cur->bc_ag.pag); - kmem_cache_free(xfs_btree_cur_zone, cur); + kmem_cache_free(cur->bc_cache, cur); } /* @@ -388,14 +388,14 @@ xfs_btree_del_cursor( */ int /* error */ xfs_btree_dup_cursor( - xfs_btree_cur_t *cur, /* input cursor */ - xfs_btree_cur_t **ncur) /* output cursor */ + struct xfs_btree_cur *cur, /* input cursor */ + struct xfs_btree_cur **ncur) /* output cursor */ { struct xfs_buf *bp; /* btree block's buffer pointer */ int error; /* error return value */ int i; /* level number of btree block */ xfs_mount_t *mp; /* mount structure for filesystem */ - xfs_btree_cur_t *new; /* new cursor value */ + struct xfs_btree_cur *new; /* new cursor value */ xfs_trans_t *tp; /* transaction pointer, can be NULL */ tp = cur->bc_tp; @@ -415,9 +415,9 @@ xfs_btree_dup_cursor( * For each level current, re-get the buffer and copy the ptr value. */ for (i = 0; i < new->bc_nlevels; i++) { - new->bc_ptrs[i] = cur->bc_ptrs[i]; - new->bc_ra[i] = cur->bc_ra[i]; - bp = cur->bc_bufs[i]; + new->bc_levels[i].ptr = cur->bc_levels[i].ptr; + new->bc_levels[i].ra = cur->bc_levels[i].ra; + bp = cur->bc_levels[i].bp; if (bp) { error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, xfs_buf_daddr(bp), mp->m_bsize, @@ -429,7 +429,7 @@ xfs_btree_dup_cursor( return error; } } - new->bc_bufs[i] = bp; + new->bc_levels[i].bp = bp; } *ncur = new; return 0; @@ -681,7 +681,7 @@ xfs_btree_get_block( return xfs_btree_get_iroot(cur); } - *bpp = cur->bc_bufs[level]; + *bpp = cur->bc_levels[level].bp; return XFS_BUF_TO_BLOCK(*bpp); } @@ -691,7 +691,7 @@ xfs_btree_get_block( */ STATIC int /* success=1, failure=0 */ xfs_btree_firstrec( - xfs_btree_cur_t *cur, /* btree cursor */ + struct xfs_btree_cur *cur, /* btree cursor */ int level) /* level to change */ { struct xfs_btree_block *block; /* generic btree block pointer */ @@ -711,7 +711,7 @@ xfs_btree_firstrec( /* * Set the ptr value to 1, that's the first record/key. */ - cur->bc_ptrs[level] = 1; + cur->bc_levels[level].ptr = 1; return 1; } @@ -721,7 +721,7 @@ xfs_btree_firstrec( */ STATIC int /* success=1, failure=0 */ xfs_btree_lastrec( - xfs_btree_cur_t *cur, /* btree cursor */ + struct xfs_btree_cur *cur, /* btree cursor */ int level) /* level to change */ { struct xfs_btree_block *block; /* generic btree block pointer */ @@ -741,7 +741,7 @@ xfs_btree_lastrec( /* * Set the ptr value to numrecs, that's the last record/key. */ - cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs); + cur->bc_levels[level].ptr = be16_to_cpu(block->bb_numrecs); return 1; } @@ -922,11 +922,11 @@ xfs_btree_readahead( (lev == cur->bc_nlevels - 1)) return 0; - if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev]) + if ((cur->bc_levels[lev].ra | lr) == cur->bc_levels[lev].ra) return 0; - cur->bc_ra[lev] |= lr; - block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]); + cur->bc_levels[lev].ra |= lr; + block = XFS_BUF_TO_BLOCK(cur->bc_levels[lev].bp); if (cur->bc_flags & XFS_BTREE_LONG_PTRS) return xfs_btree_readahead_lblock(cur, lr, block); @@ -985,28 +985,28 @@ xfs_btree_readahead_ptr( */ STATIC void xfs_btree_setbuf( - xfs_btree_cur_t *cur, /* btree cursor */ + struct xfs_btree_cur *cur, /* btree cursor */ int lev, /* level in btree */ struct xfs_buf *bp) /* new buffer to set */ { struct xfs_btree_block *b; /* btree block */ - if (cur->bc_bufs[lev]) - xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]); - cur->bc_bufs[lev] = bp; - cur->bc_ra[lev] = 0; + if (cur->bc_levels[lev].bp) + xfs_trans_brelse(cur->bc_tp, cur->bc_levels[lev].bp); + cur->bc_levels[lev].bp = bp; + cur->bc_levels[lev].ra = 0; b = XFS_BUF_TO_BLOCK(bp); if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK)) - cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA; + cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA; if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK)) - cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA; + cur->bc_levels[lev].ra |= XFS_BTCUR_RIGHTRA; } else { if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK)) - cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA; + cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA; if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK)) - cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA; + cur->bc_levels[lev].ra |= XFS_BTCUR_RIGHTRA; } } @@ -1548,7 +1548,7 @@ xfs_btree_increment( #endif /* We're done if we remain in the block after the increment. */ - if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block)) + if (++cur->bc_levels[level].ptr <= xfs_btree_get_numrecs(block)) goto out1; /* Fail if we just went off the right edge of the tree. */ @@ -1571,7 +1571,7 @@ xfs_btree_increment( goto error0; #endif - if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block)) + if (++cur->bc_levels[lev].ptr <= xfs_btree_get_numrecs(block)) break; /* Read-ahead the right block for the next loop. */ @@ -1598,14 +1598,14 @@ xfs_btree_increment( for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) { union xfs_btree_ptr *ptrp; - ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block); + ptrp = xfs_btree_ptr_addr(cur, cur->bc_levels[lev].ptr, block); --lev; error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp); if (error) goto error0; xfs_btree_setbuf(cur, lev, bp); - cur->bc_ptrs[lev] = 1; + cur->bc_levels[lev].ptr = 1; } out1: *stat = 1; @@ -1641,7 +1641,7 @@ xfs_btree_decrement( xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA); /* We're done if we remain in the block after the decrement. */ - if (--cur->bc_ptrs[level] > 0) + if (--cur->bc_levels[level].ptr > 0) goto out1; /* Get a pointer to the btree block. */ @@ -1665,7 +1665,7 @@ xfs_btree_decrement( * Stop when we don't go off the left edge of a block. */ for (lev = level + 1; lev < cur->bc_nlevels; lev++) { - if (--cur->bc_ptrs[lev] > 0) + if (--cur->bc_levels[lev].ptr > 0) break; /* Read-ahead the left block for the next loop. */ xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA); @@ -1691,13 +1691,13 @@ xfs_btree_decrement( for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) { union xfs_btree_ptr *ptrp; - ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block); + ptrp = xfs_btree_ptr_addr(cur, cur->bc_levels[lev].ptr, block); --lev; error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp); if (error) goto error0; xfs_btree_setbuf(cur, lev, bp); - cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block); + cur->bc_levels[lev].ptr = xfs_btree_get_numrecs(block); } out1: *stat = 1; @@ -1735,7 +1735,7 @@ xfs_btree_lookup_get_block( * * Otherwise throw it away and get a new one. */ - bp = cur->bc_bufs[level]; + bp = cur->bc_levels[level].bp; error = xfs_btree_ptr_to_daddr(cur, pp, &daddr); if (error) return error; @@ -1864,7 +1864,7 @@ xfs_btree_lookup( return -EFSCORRUPTED; } - cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE; + cur->bc_levels[0].ptr = dir != XFS_LOOKUP_LE; *stat = 0; return 0; } @@ -1916,7 +1916,7 @@ xfs_btree_lookup( if (error) goto error0; - cur->bc_ptrs[level] = keyno; + cur->bc_levels[level].ptr = keyno; } } @@ -1933,7 +1933,7 @@ xfs_btree_lookup( !xfs_btree_ptr_is_null(cur, &ptr)) { int i; - cur->bc_ptrs[0] = keyno; + cur->bc_levels[0].ptr = keyno; error = xfs_btree_increment(cur, 0, &i); if (error) goto error0; @@ -1944,7 +1944,7 @@ xfs_btree_lookup( } } else if (dir == XFS_LOOKUP_LE && diff > 0) keyno--; - cur->bc_ptrs[0] = keyno; + cur->bc_levels[0].ptr = keyno; /* Return if we succeeded or not. */ if (keyno == 0 || keyno > xfs_btree_get_numrecs(block)) @@ -2104,7 +2104,7 @@ __xfs_btree_updkeys( if (error) return error; #endif - ptr = cur->bc_ptrs[level]; + ptr = cur->bc_levels[level].ptr; nlkey = xfs_btree_key_addr(cur, ptr, block); nhkey = xfs_btree_high_key_addr(cur, ptr, block); if (!force_all && @@ -2171,7 +2171,7 @@ xfs_btree_update_keys( if (error) return error; #endif - ptr = cur->bc_ptrs[level]; + ptr = cur->bc_levels[level].ptr; kp = xfs_btree_key_addr(cur, ptr, block); xfs_btree_copy_keys(cur, kp, &key, 1); xfs_btree_log_keys(cur, bp, ptr, ptr); @@ -2205,7 +2205,7 @@ xfs_btree_update( goto error0; #endif /* Get the address of the rec to be updated. */ - ptr = cur->bc_ptrs[0]; + ptr = cur->bc_levels[0].ptr; rp = xfs_btree_rec_addr(cur, ptr, block); /* Fill in the new contents and log them. */ @@ -2280,7 +2280,7 @@ xfs_btree_lshift( * If the cursor entry is the one that would be moved, don't * do it... it's too complicated. */ - if (cur->bc_ptrs[level] <= 1) + if (cur->bc_levels[level].ptr <= 1) goto out0; /* Set up the left neighbor as "left". */ @@ -2414,7 +2414,7 @@ xfs_btree_lshift( goto error0; /* Slide the cursor value left one. */ - cur->bc_ptrs[level]--; + cur->bc_levels[level].ptr--; *stat = 1; return 0; @@ -2476,7 +2476,7 @@ xfs_btree_rshift( * do it... it's too complicated. */ lrecs = xfs_btree_get_numrecs(left); - if (cur->bc_ptrs[level] >= lrecs) + if (cur->bc_levels[level].ptr >= lrecs) goto out0; /* Set up the right neighbor as "right". */ @@ -2664,7 +2664,7 @@ __xfs_btree_split( */ lrecs = xfs_btree_get_numrecs(left); rrecs = lrecs / 2; - if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1) + if ((lrecs & 1) && cur->bc_levels[level].ptr <= rrecs + 1) rrecs++; src_index = (lrecs - rrecs + 1); @@ -2760,9 +2760,9 @@ __xfs_btree_split( * If it's just pointing past the last entry in left, then we'll * insert there, so don't change anything in that case. */ - if (cur->bc_ptrs[level] > lrecs + 1) { + if (cur->bc_levels[level].ptr > lrecs + 1) { xfs_btree_setbuf(cur, level, rbp); - cur->bc_ptrs[level] -= lrecs; + cur->bc_levels[level].ptr -= lrecs; } /* * If there are more levels, we'll need another cursor which refers @@ -2772,7 +2772,7 @@ __xfs_btree_split( error = xfs_btree_dup_cursor(cur, curp); if (error) goto error0; - (*curp)->bc_ptrs[level + 1]++; + (*curp)->bc_levels[level + 1].ptr++; } *ptrp = rptr; *stat = 1; @@ -2933,7 +2933,8 @@ xfs_btree_new_iroot( be16_add_cpu(&block->bb_level, 1); xfs_btree_set_numrecs(block, 1); cur->bc_nlevels++; - cur->bc_ptrs[level + 1] = 1; + ASSERT(cur->bc_nlevels <= cur->bc_maxlevels); + cur->bc_levels[level + 1].ptr = 1; kp = xfs_btree_key_addr(cur, 1, block); ckp = xfs_btree_key_addr(cur, 1, cblock); @@ -3094,8 +3095,9 @@ xfs_btree_new_root( /* Fix up the cursor. */ xfs_btree_setbuf(cur, cur->bc_nlevels, nbp); - cur->bc_ptrs[cur->bc_nlevels] = nptr; + cur->bc_levels[cur->bc_nlevels].ptr = nptr; cur->bc_nlevels++; + ASSERT(cur->bc_nlevels <= cur->bc_maxlevels); *stat = 1; return 0; error0: @@ -3152,7 +3154,7 @@ xfs_btree_make_block_unfull( return error; if (*stat) { - *oindex = *index = cur->bc_ptrs[level]; + *oindex = *index = cur->bc_levels[level].ptr; return 0; } @@ -3167,7 +3169,7 @@ xfs_btree_make_block_unfull( return error; - *index = cur->bc_ptrs[level]; + *index = cur->bc_levels[level].ptr; return 0; } @@ -3214,7 +3216,7 @@ xfs_btree_insrec( } /* If we're off the left edge, return failure. */ - ptr = cur->bc_ptrs[level]; + ptr = cur->bc_levels[level].ptr; if (ptr == 0) { *stat = 0; return 0; @@ -3557,7 +3559,7 @@ xfs_btree_kill_iroot( if (error) return error; - cur->bc_bufs[level - 1] = NULL; + cur->bc_levels[level - 1].bp = NULL; be16_add_cpu(&block->bb_level, -1); xfs_trans_log_inode(cur->bc_tp, ip, XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork)); @@ -3590,8 +3592,8 @@ xfs_btree_kill_root( if (error) return error; - cur->bc_bufs[level] = NULL; - cur->bc_ra[level] = 0; + cur->bc_levels[level].bp = NULL; + cur->bc_levels[level].ra = 0; cur->bc_nlevels--; return 0; @@ -3650,7 +3652,7 @@ xfs_btree_delrec( tcur = NULL; /* Get the index of the entry being deleted, check for nothing there. */ - ptr = cur->bc_ptrs[level]; + ptr = cur->bc_levels[level].ptr; if (ptr == 0) { *stat = 0; return 0; @@ -3960,7 +3962,7 @@ xfs_btree_delrec( xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); tcur = NULL; if (level == 0) - cur->bc_ptrs[0]++; + cur->bc_levels[0].ptr++; *stat = 1; return 0; @@ -4097,9 +4099,9 @@ xfs_btree_delrec( * cursor to the left block, and fix up the index. */ if (bp != lbp) { - cur->bc_bufs[level] = lbp; - cur->bc_ptrs[level] += lrecs; - cur->bc_ra[level] = 0; + cur->bc_levels[level].bp = lbp; + cur->bc_levels[level].ptr += lrecs; + cur->bc_levels[level].ra = 0; } /* * If we joined with the right neighbor and there's a level above @@ -4119,16 +4121,16 @@ xfs_btree_delrec( * We can't use decrement because it would change the next level up. */ if (level > 0) - cur->bc_ptrs[level]--; + cur->bc_levels[level].ptr--; /* * We combined blocks, so we have to update the parent keys if the - * btree supports overlapped intervals. However, bc_ptrs[level + 1] - * points to the old block so that the caller knows which record to - * delete. Therefore, the caller must be savvy enough to call updkeys - * for us if we return stat == 2. The other exit points from this - * function don't require deletions further up the tree, so they can - * call updkeys directly. + * btree supports overlapped intervals. However, + * bc_levels[level + 1].ptr points to the old block so that the caller + * knows which record to delete. Therefore, the caller must be savvy + * enough to call updkeys for us if we return stat == 2. The other + * exit points from this function don't require deletions further up + * the tree, so they can call updkeys directly. */ /* Return value means the next level up has something to do. */ @@ -4182,7 +4184,7 @@ xfs_btree_delete( if (i == 0) { for (level = 1; level < cur->bc_nlevels; level++) { - if (cur->bc_ptrs[level] == 0) { + if (cur->bc_levels[level].ptr == 0) { error = xfs_btree_decrement(cur, level, &i); if (error) goto error0; @@ -4213,7 +4215,7 @@ xfs_btree_get_rec( int error; /* error return value */ #endif - ptr = cur->bc_ptrs[0]; + ptr = cur->bc_levels[0].ptr; block = xfs_btree_get_block(cur, 0, &bp); #ifdef DEBUG @@ -4512,21 +4514,76 @@ xfs_btree_sblock_verify( } /* - * Calculate the number of btree levels needed to store a given number of - * records in a short-format btree. + * For the given limits on leaf and keyptr records per block, calculate the + * height of the tree needed to index the number of leaf records. */ -uint +unsigned int xfs_btree_compute_maxlevels( - uint *limits, - unsigned long len) + const unsigned int *limits, + unsigned long long records) +{ + unsigned long long level_blocks = howmany_64(records, limits[0]); + unsigned int height = 1; + + while (level_blocks > 1) { + level_blocks = howmany_64(level_blocks, limits[1]); + height++; + } + + return height; +} + +/* + * For the given limits on leaf and keyptr records per block, calculate the + * number of blocks needed to index the given number of leaf records. + */ +unsigned long long +xfs_btree_calc_size( + const unsigned int *limits, + unsigned long long records) +{ + unsigned long long level_blocks = howmany_64(records, limits[0]); + unsigned long long blocks = level_blocks; + + while (level_blocks > 1) { + level_blocks = howmany_64(level_blocks, limits[1]); + blocks += level_blocks; + } + + return blocks; +} + +/* + * Given a number of available blocks for the btree to consume with records and + * pointers, calculate the height of the tree needed to index all the records + * that space can hold based on the number of pointers each interior node + * holds. + * + * We start by assuming a single level tree consumes a single block, then track + * the number of blocks each node level consumes until we no longer have space + * to store the next node level. At this point, we are indexing all the leaf + * blocks in the space, and there's no more free space to split the tree any + * further. That's our maximum btree height. + */ +unsigned int +xfs_btree_space_to_height( + const unsigned int *limits, + unsigned long long leaf_blocks) { - uint level; - unsigned long maxblocks; + unsigned long long node_blocks = limits[1]; + unsigned long long blocks_left = leaf_blocks - 1; + unsigned int height = 1; + + if (leaf_blocks < 1) + return 0; + + while (node_blocks < blocks_left) { + blocks_left -= node_blocks; + node_blocks *= limits[1]; + height++; + } - maxblocks = (len + limits[0] - 1) / limits[0]; - for (level = 1; maxblocks > 1; level++) - maxblocks = (maxblocks + limits[1] - 1) / limits[1]; - return level; + return height; } /* @@ -4661,23 +4718,25 @@ xfs_btree_overlapped_query_range( if (error) goto out; #endif - cur->bc_ptrs[level] = 1; + cur->bc_levels[level].ptr = 1; while (level < cur->bc_nlevels) { block = xfs_btree_get_block(cur, level, &bp); /* End of node, pop back towards the root. */ - if (cur->bc_ptrs[level] > be16_to_cpu(block->bb_numrecs)) { + if (cur->bc_levels[level].ptr > + be16_to_cpu(block->bb_numrecs)) { pop_up: if (level < cur->bc_nlevels - 1) - cur->bc_ptrs[level + 1]++; + cur->bc_levels[level + 1].ptr++; level++; continue; } if (level == 0) { /* Handle a leaf node. */ - recp = xfs_btree_rec_addr(cur, cur->bc_ptrs[0], block); + recp = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr, + block); cur->bc_ops->init_high_key_from_rec(&rec_hkey, recp); ldiff = cur->bc_ops->diff_two_keys(cur, &rec_hkey, @@ -4700,14 +4759,15 @@ pop_up: /* Record is larger than high key; pop. */ goto pop_up; } - cur->bc_ptrs[level]++; + cur->bc_levels[level].ptr++; continue; } /* Handle an internal node. */ - lkp = xfs_btree_key_addr(cur, cur->bc_ptrs[level], block); - hkp = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level], block); - pp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[level], block); + lkp = xfs_btree_key_addr(cur, cur->bc_levels[level].ptr, block); + hkp = xfs_btree_high_key_addr(cur, cur->bc_levels[level].ptr, + block); + pp = xfs_btree_ptr_addr(cur, cur->bc_levels[level].ptr, block); ldiff = cur->bc_ops->diff_two_keys(cur, hkp, low_key); hdiff = cur->bc_ops->diff_two_keys(cur, high_key, lkp); @@ -4730,13 +4790,13 @@ pop_up: if (error) goto out; #endif - cur->bc_ptrs[level] = 1; + cur->bc_levels[level].ptr = 1; continue; } else if (hdiff < 0) { /* The low key is larger than the upper range; pop. */ goto pop_up; } - cur->bc_ptrs[level]++; + cur->bc_levels[level].ptr++; } out: @@ -4747,13 +4807,14 @@ out: * with a zero-results range query, so release the buffers if we * failed to return any results. */ - if (cur->bc_bufs[0] == NULL) { + if (cur->bc_levels[0].bp == NULL) { for (i = 0; i < cur->bc_nlevels; i++) { - if (cur->bc_bufs[i]) { - xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]); - cur->bc_bufs[i] = NULL; - cur->bc_ptrs[i] = 0; - cur->bc_ra[i] = 0; + if (cur->bc_levels[i].bp) { + xfs_trans_brelse(cur->bc_tp, + cur->bc_levels[i].bp); + cur->bc_levels[i].bp = NULL; + cur->bc_levels[i].ptr = 0; + cur->bc_levels[i].ra = 0; } } } @@ -4816,29 +4877,6 @@ xfs_btree_query_all( return xfs_btree_simple_query_range(cur, &low_key, &high_key, fn, priv); } -/* - * Calculate the number of blocks needed to store a given number of records - * in a short-format (per-AG metadata) btree. - */ -unsigned long long -xfs_btree_calc_size( - uint *limits, - unsigned long long len) -{ - int level; - int maxrecs; - unsigned long long rval; - - maxrecs = limits[0]; - for (level = 0, rval = 0; len > 1; level++) { - len += maxrecs - 1; - do_div(len, maxrecs); - maxrecs = limits[1]; - rval += len; - } - return rval; -} - static int xfs_btree_count_blocks_helper( struct xfs_btree_cur *cur, @@ -4915,7 +4953,7 @@ xfs_btree_has_more_records( block = xfs_btree_get_block(cur, 0, &bp); /* There are still records in this block. */ - if (cur->bc_ptrs[0] < xfs_btree_get_numrecs(block)) + if (cur->bc_levels[0].ptr < xfs_btree_get_numrecs(block)) return true; /* There are more record blocks. */ @@ -4924,3 +4962,42 @@ xfs_btree_has_more_records( else return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK); } + +/* Set up all the btree cursor caches. */ +int __init +xfs_btree_init_cur_caches(void) +{ + int error; + + error = xfs_allocbt_init_cur_cache(); + if (error) + return error; + error = xfs_inobt_init_cur_cache(); + if (error) + goto err; + error = xfs_bmbt_init_cur_cache(); + if (error) + goto err; + error = xfs_rmapbt_init_cur_cache(); + if (error) + goto err; + error = xfs_refcountbt_init_cur_cache(); + if (error) + goto err; + + return 0; +err: + xfs_btree_destroy_cur_caches(); + return error; +} + +/* Destroy all the btree cursor caches, if they've been allocated. */ +void +xfs_btree_destroy_cur_caches(void) +{ + xfs_allocbt_destroy_cur_cache(); + xfs_inobt_destroy_cur_cache(); + xfs_bmbt_destroy_cur_cache(); + xfs_rmapbt_destroy_cur_cache(); + xfs_refcountbt_destroy_cur_cache(); +} diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 4eaf8517f850..22d9f411fde6 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -13,8 +13,6 @@ struct xfs_trans; struct xfs_ifork; struct xfs_perag; -extern kmem_zone_t *xfs_btree_cur_zone; - /* * Generic key, ptr and record wrapper structures. * @@ -92,8 +90,6 @@ uint32_t xfs_btree_magic(int crc, xfs_btnum_t btnum); #define XFS_BTREE_STATS_ADD(cur, stat, val) \ XFS_STATS_ADD_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat, val) -#define XFS_BTREE_MAXLEVELS 9 /* max of all btrees */ - struct xfs_btree_ops { /* size of the key and record structures */ size_t key_len; @@ -181,18 +177,18 @@ union xfs_btree_irec { /* Per-AG btree information. */ struct xfs_btree_cur_ag { - struct xfs_perag *pag; + struct xfs_perag *pag; union { struct xfs_buf *agbp; struct xbtree_afakeroot *afake; /* for staging cursor */ }; union { struct { - unsigned long nr_ops; /* # record updates */ - int shape_changes; /* # of extent splits */ + unsigned int nr_ops; /* # record updates */ + unsigned int shape_changes; /* # of extent splits */ } refc; struct { - bool active; /* allocation cursor state */ + bool active; /* allocation cursor state */ } abt; }; }; @@ -212,26 +208,35 @@ struct xfs_btree_cur_ino { #define XFS_BTCUR_BMBT_INVALID_OWNER (1 << 1) }; +struct xfs_btree_level { + /* buffer pointer */ + struct xfs_buf *bp; + + /* key/record number */ + uint16_t ptr; + + /* readahead info */ +#define XFS_BTCUR_LEFTRA (1 << 0) /* left sibling has been read-ahead */ +#define XFS_BTCUR_RIGHTRA (1 << 1) /* right sibling has been read-ahead */ + uint16_t ra; +}; + /* * Btree cursor structure. * This collects all information needed by the btree code in one place. */ -typedef struct xfs_btree_cur +struct xfs_btree_cur { struct xfs_trans *bc_tp; /* transaction we're in, if any */ struct xfs_mount *bc_mp; /* file system mount struct */ const struct xfs_btree_ops *bc_ops; - uint bc_flags; /* btree features - below */ + struct kmem_cache *bc_cache; /* cursor cache */ + unsigned int bc_flags; /* btree features - below */ + xfs_btnum_t bc_btnum; /* identifies which btree type */ union xfs_btree_irec bc_rec; /* current insert/search record value */ - struct xfs_buf *bc_bufs[XFS_BTREE_MAXLEVELS]; /* buf ptr per level */ - int bc_ptrs[XFS_BTREE_MAXLEVELS]; /* key/record # */ - uint8_t bc_ra[XFS_BTREE_MAXLEVELS]; /* readahead bits */ -#define XFS_BTCUR_LEFTRA 1 /* left sibling has been read-ahead */ -#define XFS_BTCUR_RIGHTRA 2 /* right sibling has been read-ahead */ - uint8_t bc_nlevels; /* number of levels in the tree */ - uint8_t bc_blocklog; /* log2(blocksize) of btree blocks */ - xfs_btnum_t bc_btnum; /* identifies which btree type */ - int bc_statoff; /* offset of btre stats array */ + uint8_t bc_nlevels; /* number of levels in the tree */ + uint8_t bc_maxlevels; /* maximum levels for this btree type */ + int bc_statoff; /* offset of btree stats array */ /* * Short btree pointers need an agno to be able to turn the pointers @@ -243,7 +248,21 @@ typedef struct xfs_btree_cur struct xfs_btree_cur_ag bc_ag; struct xfs_btree_cur_ino bc_ino; }; -} xfs_btree_cur_t; + + /* Must be at the end of the struct! */ + struct xfs_btree_level bc_levels[]; +}; + +/* + * Compute the size of a btree cursor that can handle a btree of a given + * height. The bc_levels array handles node and leaf blocks, so its size + * is exactly nlevels. + */ +static inline size_t +xfs_btree_cur_sizeof(unsigned int nlevels) +{ + return struct_size((struct xfs_btree_cur *)NULL, bc_levels, nlevels); +} /* cursor flags */ #define XFS_BTREE_LONG_PTRS (1<<0) /* pointers are 64bits long */ @@ -258,7 +277,6 @@ typedef struct xfs_btree_cur */ #define XFS_BTREE_STAGING (1<<5) - #define XFS_BTREE_NOERROR 0 #define XFS_BTREE_ERROR 1 @@ -309,7 +327,7 @@ xfs_btree_check_sptr( */ void xfs_btree_del_cursor( - xfs_btree_cur_t *cur, /* btree cursor */ + struct xfs_btree_cur *cur, /* btree cursor */ int error); /* del because of error */ /* @@ -318,8 +336,8 @@ xfs_btree_del_cursor( */ int /* error */ xfs_btree_dup_cursor( - xfs_btree_cur_t *cur, /* input cursor */ - xfs_btree_cur_t **ncur);/* output cursor */ + struct xfs_btree_cur *cur, /* input cursor */ + struct xfs_btree_cur **ncur);/* output cursor */ /* * Compute first and last byte offsets for the fields given. @@ -460,8 +478,12 @@ xfs_failaddr_t xfs_btree_lblock_v5hdr_verify(struct xfs_buf *bp, xfs_failaddr_t xfs_btree_lblock_verify(struct xfs_buf *bp, unsigned int max_recs); -uint xfs_btree_compute_maxlevels(uint *limits, unsigned long len); -unsigned long long xfs_btree_calc_size(uint *limits, unsigned long long len); +unsigned int xfs_btree_compute_maxlevels(const unsigned int *limits, + unsigned long long records); +unsigned long long xfs_btree_calc_size(const unsigned int *limits, + unsigned long long records); +unsigned int xfs_btree_space_to_height(const unsigned int *limits, + unsigned long long blocks); /* * Return codes for the query range iterator function are 0 to continue @@ -527,7 +549,7 @@ struct xfs_ifork *xfs_btree_ifork_ptr(struct xfs_btree_cur *cur); /* Does this cursor point to the last block in the given level? */ static inline bool xfs_btree_islastblock( - xfs_btree_cur_t *cur, + struct xfs_btree_cur *cur, int level) { struct xfs_btree_block *block; @@ -558,4 +580,27 @@ void xfs_btree_copy_keys(struct xfs_btree_cur *cur, union xfs_btree_key *dst_key, const union xfs_btree_key *src_key, int numkeys); +static inline struct xfs_btree_cur * +xfs_btree_alloc_cursor( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_btnum_t btnum, + uint8_t maxlevels, + struct kmem_cache *cache) +{ + struct xfs_btree_cur *cur; + + cur = kmem_cache_zalloc(cache, GFP_NOFS | __GFP_NOFAIL); + cur->bc_tp = tp; + cur->bc_mp = mp; + cur->bc_btnum = btnum; + cur->bc_maxlevels = maxlevels; + cur->bc_cache = cache; + + return cur; +} + +int __init xfs_btree_init_cur_caches(void); +void xfs_btree_destroy_cur_caches(void); + #endif /* __XFS_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c index ac9e80152b5c..dd75e208b543 100644 --- a/fs/xfs/libxfs/xfs_btree_staging.c +++ b/fs/xfs/libxfs/xfs_btree_staging.c @@ -657,12 +657,12 @@ xfs_btree_bload_compute_geometry( * checking levels 0 and 1 here, so set bc_nlevels such that the btree * code doesn't interpret either as the root level. */ - cur->bc_nlevels = XFS_BTREE_MAXLEVELS - 1; + cur->bc_nlevels = cur->bc_maxlevels - 1; xfs_btree_bload_ensure_slack(cur, &bbl->leaf_slack, 0); xfs_btree_bload_ensure_slack(cur, &bbl->node_slack, 1); bbl->nr_records = nr_this_level = nr_records; - for (cur->bc_nlevels = 1; cur->bc_nlevels < XFS_BTREE_MAXLEVELS;) { + for (cur->bc_nlevels = 1; cur->bc_nlevels <= cur->bc_maxlevels;) { uint64_t level_blocks; uint64_t dontcare64; unsigned int level = cur->bc_nlevels - 1; @@ -703,6 +703,7 @@ xfs_btree_bload_compute_geometry( * block-based btree level. */ cur->bc_nlevels++; + ASSERT(cur->bc_nlevels <= cur->bc_maxlevels); xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level, &avg_per_block, &level_blocks, &dontcare64); @@ -718,13 +719,14 @@ xfs_btree_bload_compute_geometry( /* Otherwise, we need another level of btree. */ cur->bc_nlevels++; + ASSERT(cur->bc_nlevels <= cur->bc_maxlevels); } nr_blocks += level_blocks; nr_this_level = level_blocks; } - if (cur->bc_nlevels == XFS_BTREE_MAXLEVELS) + if (cur->bc_nlevels > cur->bc_maxlevels) return -EOVERFLOW; bbl->btree_height = cur->bc_nlevels; diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index c062e2c85178..dd7a2dbce1d1 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -72,7 +72,7 @@ STATIC int xfs_da3_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *save_blk); -kmem_zone_t *xfs_da_state_zone; /* anchor for state struct zone */ +struct kmem_cache *xfs_da_state_cache; /* anchor for dir/attr state */ /* * Allocate a dir-state structure. @@ -84,7 +84,7 @@ xfs_da_state_alloc( { struct xfs_da_state *state; - state = kmem_cache_zalloc(xfs_da_state_zone, GFP_NOFS | __GFP_NOFAIL); + state = kmem_cache_zalloc(xfs_da_state_cache, GFP_NOFS | __GFP_NOFAIL); state->args = args; state->mp = args->dp->i_mount; return state; @@ -113,7 +113,7 @@ xfs_da_state_free(xfs_da_state_t *state) #ifdef DEBUG memset((char *)state, 0, sizeof(*state)); #endif /* DEBUG */ - kmem_cache_free(xfs_da_state_zone, state); + kmem_cache_free(xfs_da_state_cache, state); } static inline int xfs_dabuf_nfsb(struct xfs_mount *mp, int whichfork) diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h index ad5dd324631a..0faf7d9ac241 100644 --- a/fs/xfs/libxfs/xfs_da_btree.h +++ b/fs/xfs/libxfs/xfs_da_btree.h @@ -9,7 +9,6 @@ struct xfs_inode; struct xfs_trans; -struct zone; /* * Directory/attribute geometry information. There will be one of these for each @@ -227,6 +226,6 @@ void xfs_da3_node_hdr_from_disk(struct xfs_mount *mp, void xfs_da3_node_hdr_to_disk(struct xfs_mount *mp, struct xfs_da_intnode *to, struct xfs_da3_icnode_hdr *from); -extern struct kmem_zone *xfs_da_state_zone; +extern struct kmem_cache *xfs_da_state_cache; #endif /* __XFS_DA_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index eff4a127188e..0805ade2d300 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -18,6 +18,12 @@ #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_log.h" +#include "xfs_rmap.h" +#include "xfs_refcount.h" +#include "xfs_bmap.h" +#include "xfs_alloc.h" + +static struct kmem_cache *xfs_defer_pending_cache; /* * Deferred Operations in XFS @@ -232,23 +238,20 @@ xfs_defer_trans_abort( } } -/* Roll a transaction so we can do some deferred op processing. */ -STATIC int -xfs_defer_trans_roll( - struct xfs_trans **tpp) +/* + * Capture resources that the caller said not to release ("held") when the + * transaction commits. Caller is responsible for zero-initializing @dres. + */ +static int +xfs_defer_save_resources( + struct xfs_defer_resources *dres, + struct xfs_trans *tp) { - struct xfs_trans *tp = *tpp; struct xfs_buf_log_item *bli; struct xfs_inode_log_item *ili; struct xfs_log_item *lip; - struct xfs_buf *bplist[XFS_DEFER_OPS_NR_BUFS]; - struct xfs_inode *iplist[XFS_DEFER_OPS_NR_INODES]; - unsigned int ordered = 0; /* bitmap */ - int bpcount = 0, ipcount = 0; - int i; - int error; - BUILD_BUG_ON(NBBY * sizeof(ordered) < XFS_DEFER_OPS_NR_BUFS); + BUILD_BUG_ON(NBBY * sizeof(dres->dr_ordered) < XFS_DEFER_OPS_NR_BUFS); list_for_each_entry(lip, &tp->t_items, li_trans) { switch (lip->li_type) { @@ -256,28 +259,29 @@ xfs_defer_trans_roll( bli = container_of(lip, struct xfs_buf_log_item, bli_item); if (bli->bli_flags & XFS_BLI_HOLD) { - if (bpcount >= XFS_DEFER_OPS_NR_BUFS) { + if (dres->dr_bufs >= XFS_DEFER_OPS_NR_BUFS) { ASSERT(0); return -EFSCORRUPTED; } if (bli->bli_flags & XFS_BLI_ORDERED) - ordered |= (1U << bpcount); + dres->dr_ordered |= + (1U << dres->dr_bufs); else xfs_trans_dirty_buf(tp, bli->bli_buf); - bplist[bpcount++] = bli->bli_buf; + dres->dr_bp[dres->dr_bufs++] = bli->bli_buf; } break; case XFS_LI_INODE: ili = container_of(lip, struct xfs_inode_log_item, ili_item); if (ili->ili_lock_flags == 0) { - if (ipcount >= XFS_DEFER_OPS_NR_INODES) { + if (dres->dr_inos >= XFS_DEFER_OPS_NR_INODES) { ASSERT(0); return -EFSCORRUPTED; } xfs_trans_log_inode(tp, ili->ili_inode, XFS_ILOG_CORE); - iplist[ipcount++] = ili->ili_inode; + dres->dr_ip[dres->dr_inos++] = ili->ili_inode; } break; default: @@ -285,7 +289,43 @@ xfs_defer_trans_roll( } } - trace_xfs_defer_trans_roll(tp, _RET_IP_); + return 0; +} + +/* Attach the held resources to the transaction. */ +static void +xfs_defer_restore_resources( + struct xfs_trans *tp, + struct xfs_defer_resources *dres) +{ + unsigned short i; + + /* Rejoin the joined inodes. */ + for (i = 0; i < dres->dr_inos; i++) + xfs_trans_ijoin(tp, dres->dr_ip[i], 0); + + /* Rejoin the buffers and dirty them so the log moves forward. */ + for (i = 0; i < dres->dr_bufs; i++) { + xfs_trans_bjoin(tp, dres->dr_bp[i]); + if (dres->dr_ordered & (1U << i)) + xfs_trans_ordered_buf(tp, dres->dr_bp[i]); + xfs_trans_bhold(tp, dres->dr_bp[i]); + } +} + +/* Roll a transaction so we can do some deferred op processing. */ +STATIC int +xfs_defer_trans_roll( + struct xfs_trans **tpp) +{ + struct xfs_defer_resources dres = { }; + int error; + + error = xfs_defer_save_resources(&dres, *tpp); + if (error) + return error; + + trace_xfs_defer_trans_roll(*tpp, _RET_IP_); /* * Roll the transaction. Rolling always given a new transaction (even @@ -295,22 +335,11 @@ xfs_defer_trans_roll( * happened. */ error = xfs_trans_roll(tpp); - tp = *tpp; - /* Rejoin the joined inodes. */ - for (i = 0; i < ipcount; i++) - xfs_trans_ijoin(tp, iplist[i], 0); - - /* Rejoin the buffers and dirty them so the log moves forward. */ - for (i = 0; i < bpcount; i++) { - xfs_trans_bjoin(tp, bplist[i]); - if (ordered & (1U << i)) - xfs_trans_ordered_buf(tp, bplist[i]); - xfs_trans_bhold(tp, bplist[i]); - } + xfs_defer_restore_resources(*tpp, &dres); if (error) - trace_xfs_defer_trans_roll_error(tp, error); + trace_xfs_defer_trans_roll_error(*tpp, error); return error; } @@ -342,7 +371,7 @@ xfs_defer_cancel_list( ops->cancel_item(pwi); } ASSERT(dfp->dfp_count == 0); - kmem_free(dfp); + kmem_cache_free(xfs_defer_pending_cache, dfp); } } @@ -439,7 +468,7 @@ xfs_defer_finish_one( /* Done with the dfp, free it. */ list_del(&dfp->dfp_list); - kmem_free(dfp); + kmem_cache_free(xfs_defer_pending_cache, dfp); out: if (ops->finish_cleanup) ops->finish_cleanup(tp, state, error); @@ -573,8 +602,8 @@ xfs_defer_add( dfp = NULL; } if (!dfp) { - dfp = kmem_alloc(sizeof(struct xfs_defer_pending), - KM_NOFS); + dfp = kmem_cache_zalloc(xfs_defer_pending_cache, + GFP_NOFS | __GFP_NOFAIL); dfp->dfp_type = type; dfp->dfp_intent = NULL; dfp->dfp_done = NULL; @@ -627,10 +656,11 @@ xfs_defer_move( */ static struct xfs_defer_capture * xfs_defer_ops_capture( - struct xfs_trans *tp, - struct xfs_inode *capture_ip) + struct xfs_trans *tp) { struct xfs_defer_capture *dfc; + unsigned short i; + int error; if (list_empty(&tp->t_dfops)) return NULL; @@ -654,27 +684,48 @@ xfs_defer_ops_capture( /* Preserve the log reservation size. */ dfc->dfc_logres = tp->t_log_res; + error = xfs_defer_save_resources(&dfc->dfc_held, tp); + if (error) { + /* + * Resource capture should never fail, but if it does, we + * still have to shut down the log and release things + * properly. + */ + xfs_force_shutdown(tp->t_mountp, SHUTDOWN_CORRUPT_INCORE); + } + /* - * Grab an extra reference to this inode and attach it to the capture - * structure. + * Grab extra references to the inodes and buffers because callers are + * expected to release their held references after we commit the + * transaction. */ - if (capture_ip) { - ihold(VFS_I(capture_ip)); - dfc->dfc_capture_ip = capture_ip; + for (i = 0; i < dfc->dfc_held.dr_inos; i++) { + ASSERT(xfs_isilocked(dfc->dfc_held.dr_ip[i], XFS_ILOCK_EXCL)); + ihold(VFS_I(dfc->dfc_held.dr_ip[i])); } + for (i = 0; i < dfc->dfc_held.dr_bufs; i++) + xfs_buf_hold(dfc->dfc_held.dr_bp[i]); + return dfc; } /* Release all resources that we used to capture deferred ops. */ void -xfs_defer_ops_release( +xfs_defer_ops_capture_free( struct xfs_mount *mp, struct xfs_defer_capture *dfc) { + unsigned short i; + xfs_defer_cancel_list(mp, &dfc->dfc_dfops); - if (dfc->dfc_capture_ip) - xfs_irele(dfc->dfc_capture_ip); + + for (i = 0; i < dfc->dfc_held.dr_bufs; i++) + xfs_buf_relse(dfc->dfc_held.dr_bp[i]); + + for (i = 0; i < dfc->dfc_held.dr_inos; i++) + xfs_irele(dfc->dfc_held.dr_ip[i]); + kmem_free(dfc); } @@ -689,24 +740,21 @@ xfs_defer_ops_release( int xfs_defer_ops_capture_and_commit( struct xfs_trans *tp, - struct xfs_inode *capture_ip, struct list_head *capture_list) { struct xfs_mount *mp = tp->t_mountp; struct xfs_defer_capture *dfc; int error; - ASSERT(!capture_ip || xfs_isilocked(capture_ip, XFS_ILOCK_EXCL)); - /* If we don't capture anything, commit transaction and exit. */ - dfc = xfs_defer_ops_capture(tp, capture_ip); + dfc = xfs_defer_ops_capture(tp); if (!dfc) return xfs_trans_commit(tp); /* Commit the transaction and add the capture structure to the list. */ error = xfs_trans_commit(tp); if (error) { - xfs_defer_ops_release(mp, dfc); + xfs_defer_ops_capture_free(mp, dfc); return error; } @@ -724,17 +772,19 @@ void xfs_defer_ops_continue( struct xfs_defer_capture *dfc, struct xfs_trans *tp, - struct xfs_inode **captured_ipp) + struct xfs_defer_resources *dres) { ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY)); /* Lock and join the captured inode to the new transaction. */ - if (dfc->dfc_capture_ip) { - xfs_ilock(dfc->dfc_capture_ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, dfc->dfc_capture_ip, 0); - } - *captured_ipp = dfc->dfc_capture_ip; + if (dfc->dfc_held.dr_inos == 2) + xfs_lock_two_inodes(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL, + dfc->dfc_held.dr_ip[1], XFS_ILOCK_EXCL); + else if (dfc->dfc_held.dr_inos == 1) + xfs_ilock(dfc->dfc_held.dr_ip[0], XFS_ILOCK_EXCL); + xfs_defer_restore_resources(tp, &dfc->dfc_held); + memcpy(dres, &dfc->dfc_held, sizeof(struct xfs_defer_resources)); /* Move captured dfops chain and state to the transaction. */ list_splice_init(&dfc->dfc_dfops, &tp->t_dfops); @@ -742,3 +792,82 @@ xfs_defer_ops_continue( kmem_free(dfc); } + +/* Release the resources captured and continued during recovery. */ +void +xfs_defer_resources_rele( + struct xfs_defer_resources *dres) +{ + unsigned short i; + + for (i = 0; i < dres->dr_inos; i++) { + xfs_iunlock(dres->dr_ip[i], XFS_ILOCK_EXCL); + xfs_irele(dres->dr_ip[i]); + dres->dr_ip[i] = NULL; + } + + for (i = 0; i < dres->dr_bufs; i++) { + xfs_buf_relse(dres->dr_bp[i]); + dres->dr_bp[i] = NULL; + } + + dres->dr_inos = 0; + dres->dr_bufs = 0; + dres->dr_ordered = 0; +} + +static inline int __init +xfs_defer_init_cache(void) +{ + xfs_defer_pending_cache = kmem_cache_create("xfs_defer_pending", + sizeof(struct xfs_defer_pending), + 0, 0, NULL); + + return xfs_defer_pending_cache != NULL ? 0 : -ENOMEM; +} + +static inline void +xfs_defer_destroy_cache(void) +{ + kmem_cache_destroy(xfs_defer_pending_cache); + xfs_defer_pending_cache = NULL; +} + +/* Set up caches for deferred work items. */ +int __init +xfs_defer_init_item_caches(void) +{ + int error; + + error = xfs_defer_init_cache(); + if (error) + return error; + error = xfs_rmap_intent_init_cache(); + if (error) + goto err; + error = xfs_refcount_intent_init_cache(); + if (error) + goto err; + error = xfs_bmap_intent_init_cache(); + if (error) + goto err; + error = xfs_extfree_intent_init_cache(); + if (error) + goto err; + + return 0; +err: + xfs_defer_destroy_item_caches(); + return error; +} + +/* Destroy all the deferred work item caches, if they've been allocated. */ +void +xfs_defer_destroy_item_caches(void) +{ + xfs_extfree_intent_destroy_cache(); + xfs_bmap_intent_destroy_cache(); + xfs_refcount_intent_destroy_cache(); + xfs_rmap_intent_destroy_cache(); + xfs_defer_destroy_cache(); +} diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 05472f71fffe..7bb8a31ad65b 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -65,6 +65,30 @@ extern const struct xfs_defer_op_type xfs_extent_free_defer_type; extern const struct xfs_defer_op_type xfs_agfl_free_defer_type; /* + * Deferred operation item relogging limits. + */ +#define XFS_DEFER_OPS_NR_INODES 2 /* join up to two inodes */ +#define XFS_DEFER_OPS_NR_BUFS 2 /* join up to two buffers */ + +/* Resources that must be held across a transaction roll. */ +struct xfs_defer_resources { + /* held buffers */ + struct xfs_buf *dr_bp[XFS_DEFER_OPS_NR_BUFS]; + + /* inodes with no unlock flags */ + struct xfs_inode *dr_ip[XFS_DEFER_OPS_NR_INODES]; + + /* number of held buffers */ + unsigned short dr_bufs; + + /* bitmap of ordered buffers */ + unsigned short dr_ordered; + + /* number of held inodes */ + unsigned short dr_inos; +}; + +/* * This structure enables a dfops user to detach the chain of deferred * operations from a transaction so that they can be continued later. */ @@ -83,11 +107,7 @@ struct xfs_defer_capture { /* Log reservation saved from the transaction. */ unsigned int dfc_logres; - /* - * An inode reference that must be maintained to complete the deferred - * work. - */ - struct xfs_inode *dfc_capture_ip; + struct xfs_defer_resources dfc_held; }; /* @@ -95,9 +115,14 @@ struct xfs_defer_capture { * This doesn't normally happen except log recovery. */ int xfs_defer_ops_capture_and_commit(struct xfs_trans *tp, - struct xfs_inode *capture_ip, struct list_head *capture_list); + struct list_head *capture_list); void xfs_defer_ops_continue(struct xfs_defer_capture *d, struct xfs_trans *tp, - struct xfs_inode **captured_ipp); -void xfs_defer_ops_release(struct xfs_mount *mp, struct xfs_defer_capture *d); + struct xfs_defer_resources *dres); +void xfs_defer_ops_capture_free(struct xfs_mount *mp, + struct xfs_defer_capture *d); +void xfs_defer_resources_rele(struct xfs_defer_resources *dres); + +int __init xfs_defer_init_item_caches(void); +void xfs_defer_destroy_item_caches(void); #endif /* __XFS_DEFER_H__ */ diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index deeb74becabc..15a362e2f5ea 100644 --- a/fs/xfs/libxfs/xfs_dquot_buf.c +++ b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -22,7 +22,7 @@ xfs_calc_dquots_per_chunk( unsigned int nbblks) /* basic block units */ { ASSERT(nbblks > 0); - return BBTOB(nbblks) / sizeof(xfs_dqblk_t); + return BBTOB(nbblks) / sizeof(struct xfs_dqblk); } /* @@ -127,7 +127,7 @@ xfs_dqblk_repair( * Typically, a repair is only requested by quotacheck. */ ASSERT(id != -1); - memset(dqb, 0, sizeof(xfs_dqblk_t)); + memset(dqb, 0, sizeof(struct xfs_dqblk)); dqb->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); dqb->dd_diskdq.d_version = XFS_DQUOT_VERSION; diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 2d7057b7984b..d665c04e69dd 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -184,7 +184,7 @@ typedef struct xfs_sb { * Superblock - on disk version. Must match the in core version above. * Must be padded to 64 bit alignment. */ -typedef struct xfs_dsb { +struct xfs_dsb { __be32 sb_magicnum; /* magic number == XFS_SB_MAGIC */ __be32 sb_blocksize; /* logical block size, bytes */ __be64 sb_dblocks; /* number of data blocks */ @@ -263,7 +263,7 @@ typedef struct xfs_dsb { uuid_t sb_meta_uuid; /* metadata file system unique id */ /* must be padded to 64 bit alignment */ -} xfs_dsb_t; +}; /* * Misc. Flags - warning - these will be cleared by xfs_repair unless @@ -780,7 +780,7 @@ static inline time64_t xfs_bigtime_to_unix(uint64_t ondisk_seconds) * padding field for v3 inodes. */ #define XFS_DINODE_MAGIC 0x494e /* 'IN' */ -typedef struct xfs_dinode { +struct xfs_dinode { __be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */ __be16 di_mode; /* mode and type of file */ __u8 di_version; /* inode version */ @@ -825,7 +825,7 @@ typedef struct xfs_dinode { uuid_t di_uuid; /* UUID of the filesystem */ /* structure must be padded to 64 bit alignment */ -} xfs_dinode_t; +}; #define XFS_DINODE_CRC_OFF offsetof(struct xfs_dinode, di_crc) @@ -1215,7 +1215,7 @@ struct xfs_disk_dquot { * This is what goes on disk. This is separated from the xfs_disk_dquot because * carrying the unnecessary padding would be a waste of memory. */ -typedef struct xfs_dqblk { +struct xfs_dqblk { struct xfs_disk_dquot dd_diskdq; /* portion living incore as well */ char dd_fill[4];/* filling for posterity */ @@ -1225,7 +1225,7 @@ typedef struct xfs_dqblk { __be32 dd_crc; /* checksum */ __be64 dd_lsn; /* last modification in log */ uuid_t dd_uuid; /* location information */ -} xfs_dqblk_t; +}; #define XFS_DQUOT_CRC_OFF offsetof(struct xfs_dqblk, dd_crc) diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index bde2b4c64dbe..c43877c8a279 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -268,6 +268,8 @@ typedef struct xfs_fsop_resblks { */ #define XFS_MIN_AG_BYTES (1ULL << 24) /* 16 MB */ #define XFS_MAX_AG_BYTES (1ULL << 40) /* 1 TB */ +#define XFS_MAX_AG_BLOCKS (XFS_MAX_AG_BYTES / XFS_MIN_BLOCKSIZE) +#define XFS_MAX_CRC_AG_BLOCKS (XFS_MAX_AG_BYTES / XFS_MIN_CRC_BLOCKSIZE) /* keep the maximum size under 2^31 by a small amount */ #define XFS_MAX_LOG_BYTES \ diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 994ad783d407..b418fe0c0679 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -1827,7 +1827,7 @@ xfs_difree_inode_chunk( if (!xfs_inobt_issparse(rec->ir_holemask)) { /* not sparse, calculate extent info directly */ - xfs_bmap_add_free(tp, XFS_AGB_TO_FSB(mp, agno, sagbno), + xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, sagbno), M_IGEO(mp)->ialloc_blks, &XFS_RMAP_OINFO_INODES); return; @@ -1872,7 +1872,7 @@ xfs_difree_inode_chunk( ASSERT(agbno % mp->m_sb.sb_spino_align == 0); ASSERT(contigblk % mp->m_sb.sb_spino_align == 0); - xfs_bmap_add_free(tp, XFS_AGB_TO_FSB(mp, agno, agbno), + xfs_free_extent_later(tp, XFS_AGB_TO_FSB(mp, agno, agbno), contigblk, &XFS_RMAP_OINFO_INODES); /* reset range to current bit and carry on... */ @@ -2793,6 +2793,7 @@ xfs_ialloc_setup_geometry( inodes = (1LL << XFS_INO_AGINO_BITS(mp)) >> XFS_INODES_PER_CHUNK_LOG; igeo->inobt_maxlevels = xfs_btree_compute_maxlevels(igeo->inobt_mnr, inodes); + ASSERT(igeo->inobt_maxlevels <= xfs_iallocbt_maxlevels_ondisk()); /* * Set the maximum inode count for this filesystem, being careful not diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 27190840c5d8..b2ad2fdc40f5 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -22,6 +22,8 @@ #include "xfs_rmap.h" #include "xfs_ag.h" +static struct kmem_cache *xfs_inobt_cur_cache; + STATIC int xfs_inobt_get_minrecs( struct xfs_btree_cur *cur, @@ -432,10 +434,8 @@ xfs_inobt_init_common( { struct xfs_btree_cur *cur; - cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL); - cur->bc_tp = tp; - cur->bc_mp = mp; - cur->bc_btnum = btnum; + cur = xfs_btree_alloc_cursor(mp, tp, btnum, + M_IGEO(mp)->inobt_maxlevels, xfs_inobt_cur_cache); if (btnum == XFS_BTNUM_INO) { cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_ibt_2); cur->bc_ops = &xfs_inobt_ops; @@ -444,8 +444,6 @@ xfs_inobt_init_common( cur->bc_ops = &xfs_finobt_ops; } - cur->bc_blocklog = mp->m_sb.sb_blocklog; - if (xfs_has_crc(mp)) cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; @@ -530,6 +528,17 @@ xfs_inobt_commit_staged_btree( } } +/* Calculate number of records in an inode btree block. */ +static inline unsigned int +xfs_inobt_block_maxrecs( + unsigned int blocklen, + bool leaf) +{ + if (leaf) + return blocklen / sizeof(xfs_inobt_rec_t); + return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t)); +} + /* * Calculate number of records in an inobt btree block. */ @@ -540,10 +549,54 @@ xfs_inobt_maxrecs( int leaf) { blocklen -= XFS_INOBT_BLOCK_LEN(mp); + return xfs_inobt_block_maxrecs(blocklen, leaf); +} - if (leaf) - return blocklen / sizeof(xfs_inobt_rec_t); - return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t)); +/* + * Maximum number of inode btree records per AG. Pretend that we can fill an + * entire AG completely full of inodes except for the AG headers. + */ +#define XFS_MAX_INODE_RECORDS \ + ((XFS_MAX_AG_BYTES - (4 * BBSIZE)) / XFS_DINODE_MIN_SIZE) / \ + XFS_INODES_PER_CHUNK + +/* Compute the max possible height for the inode btree. */ +static inline unsigned int +xfs_inobt_maxlevels_ondisk(void) +{ + unsigned int minrecs[2]; + unsigned int blocklen; + + blocklen = min(XFS_MIN_BLOCKSIZE - XFS_BTREE_SBLOCK_LEN, + XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN); + + minrecs[0] = xfs_inobt_block_maxrecs(blocklen, true) / 2; + minrecs[1] = xfs_inobt_block_maxrecs(blocklen, false) / 2; + + return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_INODE_RECORDS); +} + +/* Compute the max possible height for the free inode btree. */ +static inline unsigned int +xfs_finobt_maxlevels_ondisk(void) +{ + unsigned int minrecs[2]; + unsigned int blocklen; + + blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN; + + minrecs[0] = xfs_inobt_block_maxrecs(blocklen, true) / 2; + minrecs[1] = xfs_inobt_block_maxrecs(blocklen, false) / 2; + + return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_INODE_RECORDS); +} + +/* Compute the max possible height for either inode btree. */ +unsigned int +xfs_iallocbt_maxlevels_ondisk(void) +{ + return max(xfs_inobt_maxlevels_ondisk(), + xfs_finobt_maxlevels_ondisk()); } /* @@ -761,3 +814,22 @@ xfs_iallocbt_calc_size( { return xfs_btree_calc_size(M_IGEO(mp)->inobt_mnr, len); } + +int __init +xfs_inobt_init_cur_cache(void) +{ + xfs_inobt_cur_cache = kmem_cache_create("xfs_inobt_cur", + xfs_btree_cur_sizeof(xfs_inobt_maxlevels_ondisk()), + 0, 0, NULL); + + if (!xfs_inobt_cur_cache) + return -ENOMEM; + return 0; +} + +void +xfs_inobt_destroy_cur_cache(void) +{ + kmem_cache_destroy(xfs_inobt_cur_cache); + xfs_inobt_cur_cache = NULL; +} diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h index 8a322d402e61..26451cb76b98 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.h +++ b/fs/xfs/libxfs/xfs_ialloc_btree.h @@ -75,4 +75,9 @@ int xfs_inobt_cur(struct xfs_mount *mp, struct xfs_trans *tp, void xfs_inobt_commit_staged_btree(struct xfs_btree_cur *cur, struct xfs_trans *tp, struct xfs_buf *agbp); +unsigned int xfs_iallocbt_maxlevels_ondisk(void); + +int __init xfs_inobt_init_cur_cache(void); +void xfs_inobt_destroy_cur_cache(void); + #endif /* __XFS_IALLOC_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 3932b4ebf903..cae9708c8587 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -51,9 +51,9 @@ xfs_inode_buf_verify( agno = xfs_daddr_to_agno(mp, xfs_buf_daddr(bp)); ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock; for (i = 0; i < ni; i++) { - int di_ok; - xfs_dinode_t *dip; - xfs_agino_t unlinked_ino; + struct xfs_dinode *dip; + xfs_agino_t unlinked_ino; + int di_ok; dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog)); unlinked_ino = be32_to_cpu(dip->di_next_unlinked); diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 1d174909f9bd..9149f4f796fc 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -26,7 +26,7 @@ #include "xfs_types.h" #include "xfs_errortag.h" -kmem_zone_t *xfs_ifork_zone; +struct kmem_cache *xfs_ifork_cache; void xfs_init_local_fork( @@ -67,10 +67,10 @@ xfs_init_local_fork( */ STATIC int xfs_iformat_local( - xfs_inode_t *ip, - xfs_dinode_t *dip, - int whichfork, - int size) + struct xfs_inode *ip, + struct xfs_dinode *dip, + int whichfork, + int size) { /* * If the size is unreasonable, then something @@ -162,8 +162,8 @@ xfs_iformat_extents( */ STATIC int xfs_iformat_btree( - xfs_inode_t *ip, - xfs_dinode_t *dip, + struct xfs_inode *ip, + struct xfs_dinode *dip, int whichfork) { struct xfs_mount *mp = ip->i_mount; @@ -284,7 +284,7 @@ xfs_ifork_alloc( { struct xfs_ifork *ifp; - ifp = kmem_cache_zalloc(xfs_ifork_zone, GFP_NOFS | __GFP_NOFAIL); + ifp = kmem_cache_zalloc(xfs_ifork_cache, GFP_NOFS | __GFP_NOFAIL); ifp->if_format = format; ifp->if_nextents = nextents; return ifp; @@ -325,7 +325,7 @@ xfs_iformat_attr_fork( } if (error) { - kmem_cache_free(xfs_ifork_zone, ip->i_afp); + kmem_cache_free(xfs_ifork_cache, ip->i_afp); ip->i_afp = NULL; } return error; @@ -580,8 +580,8 @@ xfs_iextents_copy( */ void xfs_iflush_fork( - xfs_inode_t *ip, - xfs_dinode_t *dip, + struct xfs_inode *ip, + struct xfs_dinode *dip, struct xfs_inode_log_item *iip, int whichfork) { @@ -676,7 +676,7 @@ xfs_ifork_init_cow( if (ip->i_cowfp) return; - ip->i_cowfp = kmem_cache_zalloc(xfs_ifork_zone, + ip->i_cowfp = kmem_cache_zalloc(xfs_ifork_cache, GFP_NOFS | __GFP_NOFAIL); ip->i_cowfp->if_format = XFS_DINODE_FMT_EXTENTS; } diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index a6f7897b6887..3d64a3acb0ed 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -221,7 +221,7 @@ static inline bool xfs_iext_peek_prev_extent(struct xfs_ifork *ifp, xfs_iext_get_extent((ifp), (ext), (got)); \ xfs_iext_next((ifp), (ext))) -extern struct kmem_zone *xfs_ifork_zone; +extern struct kmem_cache *xfs_ifork_cache; extern void xfs_ifork_init_cow(struct xfs_inode *ip); diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index e5d767a7fc5d..327ba25e9e17 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -24,6 +24,8 @@ #include "xfs_rmap.h" #include "xfs_ag.h" +struct kmem_cache *xfs_refcount_intent_cache; + /* Allowable refcount adjustment amounts. */ enum xfs_refc_adjust_op { XFS_REFCOUNT_ADJUST_INCREASE = 1, @@ -916,8 +918,7 @@ xfs_refcount_adjust_extents( struct xfs_btree_cur *cur, xfs_agblock_t *agbno, xfs_extlen_t *aglen, - enum xfs_refc_adjust_op adj, - struct xfs_owner_info *oinfo) + enum xfs_refc_adjust_op adj) { struct xfs_refcount_irec ext, tmp; int error; @@ -974,8 +975,8 @@ xfs_refcount_adjust_extents( fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.pag->pag_agno, tmp.rc_startblock); - xfs_bmap_add_free(cur->bc_tp, fsbno, - tmp.rc_blockcount, oinfo); + xfs_free_extent_later(cur->bc_tp, fsbno, + tmp.rc_blockcount, NULL); } (*agbno) += tmp.rc_blockcount; @@ -1019,8 +1020,8 @@ xfs_refcount_adjust_extents( fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.pag->pag_agno, ext.rc_startblock); - xfs_bmap_add_free(cur->bc_tp, fsbno, ext.rc_blockcount, - oinfo); + xfs_free_extent_later(cur->bc_tp, fsbno, + ext.rc_blockcount, NULL); } skip: @@ -1048,8 +1049,7 @@ xfs_refcount_adjust( xfs_extlen_t aglen, xfs_agblock_t *new_agbno, xfs_extlen_t *new_aglen, - enum xfs_refc_adjust_op adj, - struct xfs_owner_info *oinfo) + enum xfs_refc_adjust_op adj) { bool shape_changed; int shape_changes = 0; @@ -1092,8 +1092,7 @@ xfs_refcount_adjust( cur->bc_ag.refc.shape_changes++; /* Now that we've taken care of the ends, adjust the middle extents */ - error = xfs_refcount_adjust_extents(cur, new_agbno, new_aglen, - adj, oinfo); + error = xfs_refcount_adjust_extents(cur, new_agbno, new_aglen, adj); if (error) goto out_error; @@ -1188,12 +1187,12 @@ xfs_refcount_finish_one( switch (type) { case XFS_REFCOUNT_INCREASE: error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno, - new_len, XFS_REFCOUNT_ADJUST_INCREASE, NULL); + new_len, XFS_REFCOUNT_ADJUST_INCREASE); *new_fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno); break; case XFS_REFCOUNT_DECREASE: error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno, - new_len, XFS_REFCOUNT_ADJUST_DECREASE, NULL); + new_len, XFS_REFCOUNT_ADJUST_DECREASE); *new_fsb = XFS_AGB_TO_FSB(mp, pag->pag_agno, new_agbno); break; case XFS_REFCOUNT_ALLOC_COW: @@ -1235,8 +1234,8 @@ __xfs_refcount_add( type, XFS_FSB_TO_AGBNO(tp->t_mountp, startblock), blockcount); - ri = kmem_alloc(sizeof(struct xfs_refcount_intent), - KM_NOFS); + ri = kmem_cache_alloc(xfs_refcount_intent_cache, + GFP_NOFS | __GFP_NOFAIL); INIT_LIST_HEAD(&ri->ri_list); ri->ri_type = type; ri->ri_startblock = startblock; @@ -1742,7 +1741,7 @@ xfs_refcount_recover_cow_leftovers( rr->rr_rrec.rc_blockcount); /* Free the block. */ - xfs_bmap_add_free(tp, fsb, rr->rr_rrec.rc_blockcount, NULL); + xfs_free_extent_later(tp, fsb, rr->rr_rrec.rc_blockcount, NULL); error = xfs_trans_commit(tp); if (error) @@ -1782,3 +1781,20 @@ xfs_refcount_has_record( return xfs_btree_has_record(cur, &low, &high, exists); } + +int __init +xfs_refcount_intent_init_cache(void) +{ + xfs_refcount_intent_cache = kmem_cache_create("xfs_refc_intent", + sizeof(struct xfs_refcount_intent), + 0, 0, NULL); + + return xfs_refcount_intent_cache != NULL ? 0 : -ENOMEM; +} + +void +xfs_refcount_intent_destroy_cache(void) +{ + kmem_cache_destroy(xfs_refcount_intent_cache); + xfs_refcount_intent_cache = NULL; +} diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index 02cb3aa405be..9eb01edbd89d 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -32,8 +32,8 @@ enum xfs_refcount_intent_type { struct xfs_refcount_intent { struct list_head ri_list; enum xfs_refcount_intent_type ri_type; - xfs_fsblock_t ri_startblock; xfs_extlen_t ri_blockcount; + xfs_fsblock_t ri_startblock; }; void xfs_refcount_increase_extent(struct xfs_trans *tp, @@ -83,4 +83,9 @@ extern void xfs_refcount_btrec_to_irec(const union xfs_btree_rec *rec, extern int xfs_refcount_insert(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec, int *stat); +extern struct kmem_cache *xfs_refcount_intent_cache; + +int __init xfs_refcount_intent_init_cache(void); +void xfs_refcount_intent_destroy_cache(void); + #endif /* __XFS_REFCOUNT_H__ */ diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 1ef9b99962ab..d14c1720b0fb 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -21,6 +21,8 @@ #include "xfs_rmap.h" #include "xfs_ag.h" +static struct kmem_cache *xfs_refcountbt_cur_cache; + static struct xfs_btree_cur * xfs_refcountbt_dup_cursor( struct xfs_btree_cur *cur) @@ -322,11 +324,8 @@ xfs_refcountbt_init_common( ASSERT(pag->pag_agno < mp->m_sb.sb_agcount); - cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL); - cur->bc_tp = tp; - cur->bc_mp = mp; - cur->bc_btnum = XFS_BTNUM_REFC; - cur->bc_blocklog = mp->m_sb.sb_blocklog; + cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_REFC, + mp->m_refc_maxlevels, xfs_refcountbt_cur_cache); cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_refcbt_2); cur->bc_flags |= XFS_BTREE_CRC_BLOCKS; @@ -396,6 +395,18 @@ xfs_refcountbt_commit_staged_btree( xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_refcountbt_ops); } +/* Calculate number of records in a refcount btree block. */ +static inline unsigned int +xfs_refcountbt_block_maxrecs( + unsigned int blocklen, + bool leaf) +{ + if (leaf) + return blocklen / sizeof(struct xfs_refcount_rec); + return blocklen / (sizeof(struct xfs_refcount_key) + + sizeof(xfs_refcount_ptr_t)); +} + /* * Calculate the number of records in a refcount btree block. */ @@ -405,11 +416,22 @@ xfs_refcountbt_maxrecs( bool leaf) { blocklen -= XFS_REFCOUNT_BLOCK_LEN; + return xfs_refcountbt_block_maxrecs(blocklen, leaf); +} - if (leaf) - return blocklen / sizeof(struct xfs_refcount_rec); - return blocklen / (sizeof(struct xfs_refcount_key) + - sizeof(xfs_refcount_ptr_t)); +/* Compute the max possible height of the maximally sized refcount btree. */ +unsigned int +xfs_refcountbt_maxlevels_ondisk(void) +{ + unsigned int minrecs[2]; + unsigned int blocklen; + + blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN; + + minrecs[0] = xfs_refcountbt_block_maxrecs(blocklen, true) / 2; + minrecs[1] = xfs_refcountbt_block_maxrecs(blocklen, false) / 2; + + return xfs_btree_compute_maxlevels(minrecs, XFS_MAX_CRC_AG_BLOCKS); } /* Compute the maximum height of a refcount btree. */ @@ -417,8 +439,14 @@ void xfs_refcountbt_compute_maxlevels( struct xfs_mount *mp) { + if (!xfs_has_reflink(mp)) { + mp->m_refc_maxlevels = 0; + return; + } + mp->m_refc_maxlevels = xfs_btree_compute_maxlevels( mp->m_refc_mnr, mp->m_sb.sb_agblocks); + ASSERT(mp->m_refc_maxlevels <= xfs_refcountbt_maxlevels_ondisk()); } /* Calculate the refcount btree size for some records. */ @@ -488,3 +516,22 @@ xfs_refcountbt_calc_reserves( return error; } + +int __init +xfs_refcountbt_init_cur_cache(void) +{ + xfs_refcountbt_cur_cache = kmem_cache_create("xfs_refcbt_cur", + xfs_btree_cur_sizeof(xfs_refcountbt_maxlevels_ondisk()), + 0, 0, NULL); + + if (!xfs_refcountbt_cur_cache) + return -ENOMEM; + return 0; +} + +void +xfs_refcountbt_destroy_cur_cache(void) +{ + kmem_cache_destroy(xfs_refcountbt_cur_cache); + xfs_refcountbt_cur_cache = NULL; +} diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h index bd9ed9e1e41f..d66b37259bed 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.h +++ b/fs/xfs/libxfs/xfs_refcount_btree.h @@ -65,4 +65,9 @@ extern int xfs_refcountbt_calc_reserves(struct xfs_mount *mp, void xfs_refcountbt_commit_staged_btree(struct xfs_btree_cur *cur, struct xfs_trans *tp, struct xfs_buf *agbp); +unsigned int xfs_refcountbt_maxlevels_ondisk(void); + +int __init xfs_refcountbt_init_cur_cache(void); +void xfs_refcountbt_destroy_cur_cache(void); + #endif /* __XFS_REFCOUNT_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index f45929b1b94a..cd322174dbff 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -24,6 +24,8 @@ #include "xfs_inode.h" #include "xfs_ag.h" +struct kmem_cache *xfs_rmap_intent_cache; + /* * Lookup the first record less than or equal to [bno, len, owner, offset] * in the btree given by cur. @@ -2485,7 +2487,7 @@ __xfs_rmap_add( bmap->br_blockcount, bmap->br_state); - ri = kmem_alloc(sizeof(struct xfs_rmap_intent), KM_NOFS); + ri = kmem_cache_alloc(xfs_rmap_intent_cache, GFP_NOFS | __GFP_NOFAIL); INIT_LIST_HEAD(&ri->ri_list); ri->ri_type = type; ri->ri_owner = owner; @@ -2779,3 +2781,20 @@ const struct xfs_owner_info XFS_RMAP_OINFO_REFC = { const struct xfs_owner_info XFS_RMAP_OINFO_COW = { .oi_owner = XFS_RMAP_OWN_COW, }; + +int __init +xfs_rmap_intent_init_cache(void) +{ + xfs_rmap_intent_cache = kmem_cache_create("xfs_rmap_intent", + sizeof(struct xfs_rmap_intent), + 0, 0, NULL); + + return xfs_rmap_intent_cache != NULL ? 0 : -ENOMEM; +} + +void +xfs_rmap_intent_destroy_cache(void) +{ + kmem_cache_destroy(xfs_rmap_intent_cache); + xfs_rmap_intent_cache = NULL; +} diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index fd67904ed446..b718ebeda372 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -159,8 +159,8 @@ enum xfs_rmap_intent_type { struct xfs_rmap_intent { struct list_head ri_list; enum xfs_rmap_intent_type ri_type; - uint64_t ri_owner; int ri_whichfork; + uint64_t ri_owner; struct xfs_bmbt_irec ri_bmap; }; @@ -215,4 +215,9 @@ extern const struct xfs_owner_info XFS_RMAP_OINFO_INODES; extern const struct xfs_owner_info XFS_RMAP_OINFO_REFC; extern const struct xfs_owner_info XFS_RMAP_OINFO_COW; +extern struct kmem_cache *xfs_rmap_intent_cache; + +int __init xfs_rmap_intent_init_cache(void); +void xfs_rmap_intent_destroy_cache(void); + #endif /* __XFS_RMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index b7dbbfb3aeed..69e104d0277f 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -22,6 +22,8 @@ #include "xfs_ag.h" #include "xfs_ag_resv.h" +static struct kmem_cache *xfs_rmapbt_cur_cache; + /* * Reverse map btree. * @@ -451,13 +453,10 @@ xfs_rmapbt_init_common( { struct xfs_btree_cur *cur; - cur = kmem_cache_zalloc(xfs_btree_cur_zone, GFP_NOFS | __GFP_NOFAIL); - cur->bc_tp = tp; - cur->bc_mp = mp; /* Overlapping btree; 2 keys per pointer. */ - cur->bc_btnum = XFS_BTNUM_RMAP; + cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_RMAP, + mp->m_rmap_maxlevels, xfs_rmapbt_cur_cache); cur->bc_flags = XFS_BTREE_CRC_BLOCKS | XFS_BTREE_OVERLAPPING; - cur->bc_blocklog = mp->m_sb.sb_blocklog; cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2); cur->bc_ops = &xfs_rmapbt_ops; @@ -522,6 +521,18 @@ xfs_rmapbt_commit_staged_btree( xfs_btree_commit_afakeroot(cur, tp, agbp, &xfs_rmapbt_ops); } +/* Calculate number of records in a reverse mapping btree block. */ +static inline unsigned int +xfs_rmapbt_block_maxrecs( + unsigned int blocklen, + bool leaf) +{ + if (leaf) + return blocklen / sizeof(struct xfs_rmap_rec); + return blocklen / + (2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rmap_ptr_t)); +} + /* * Calculate number of records in an rmap btree block. */ @@ -531,11 +542,33 @@ xfs_rmapbt_maxrecs( int leaf) { blocklen -= XFS_RMAP_BLOCK_LEN; + return xfs_rmapbt_block_maxrecs(blocklen, leaf); +} - if (leaf) - return blocklen / sizeof(struct xfs_rmap_rec); - return blocklen / - (2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rmap_ptr_t)); +/* Compute the max possible height for reverse mapping btrees. */ +unsigned int +xfs_rmapbt_maxlevels_ondisk(void) +{ + unsigned int minrecs[2]; + unsigned int blocklen; + + blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN; + + minrecs[0] = xfs_rmapbt_block_maxrecs(blocklen, true) / 2; + minrecs[1] = xfs_rmapbt_block_maxrecs(blocklen, false) / 2; + + /* + * Compute the asymptotic maxlevels for an rmapbt on any reflink fs. + * + * On a reflink filesystem, each AG block can have up to 2^32 (per the + * refcount record format) owners, which means that theoretically we + * could face up to 2^64 rmap records. However, we're likely to run + * out of blocks in the AG long before that happens, which means that + * we must compute the max height based on what the btree will look + * like if it consumes almost all the blocks in the AG due to maximal + * sharing factor. + */ + return xfs_btree_space_to_height(minrecs, XFS_MAX_CRC_AG_BLOCKS); } /* Compute the maximum height of an rmap btree. */ @@ -543,26 +576,36 @@ void xfs_rmapbt_compute_maxlevels( struct xfs_mount *mp) { - /* - * On a non-reflink filesystem, the maximum number of rmap - * records is the number of blocks in the AG, hence the max - * rmapbt height is log_$maxrecs($agblocks). However, with - * reflink each AG block can have up to 2^32 (per the refcount - * record format) owners, which means that theoretically we - * could face up to 2^64 rmap records. - * - * That effectively means that the max rmapbt height must be - * XFS_BTREE_MAXLEVELS. "Fortunately" we'll run out of AG - * blocks to feed the rmapbt long before the rmapbt reaches - * maximum height. The reflink code uses ag_resv_critical to - * disallow reflinking when less than 10% of the per-AG metadata - * block reservation since the fallback is a regular file copy. - */ - if (xfs_has_reflink(mp)) - mp->m_rmap_maxlevels = XFS_BTREE_MAXLEVELS; - else + if (!xfs_has_rmapbt(mp)) { + mp->m_rmap_maxlevels = 0; + return; + } + + if (xfs_has_reflink(mp)) { + /* + * Compute the asymptotic maxlevels for an rmap btree on a + * filesystem that supports reflink. + * + * On a reflink filesystem, each AG block can have up to 2^32 + * (per the refcount record format) owners, which means that + * theoretically we could face up to 2^64 rmap records. + * However, we're likely to run out of blocks in the AG long + * before that happens, which means that we must compute the + * max height based on what the btree will look like if it + * consumes almost all the blocks in the AG due to maximal + * sharing factor. + */ + mp->m_rmap_maxlevels = xfs_btree_space_to_height(mp->m_rmap_mnr, + mp->m_sb.sb_agblocks); + } else { + /* + * If there's no block sharing, compute the maximum rmapbt + * height assuming one rmap record per AG block. + */ mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels( mp->m_rmap_mnr, mp->m_sb.sb_agblocks); + } + ASSERT(mp->m_rmap_maxlevels <= xfs_rmapbt_maxlevels_ondisk()); } /* Calculate the refcount btree size for some records. */ @@ -633,3 +676,22 @@ xfs_rmapbt_calc_reserves( return error; } + +int __init +xfs_rmapbt_init_cur_cache(void) +{ + xfs_rmapbt_cur_cache = kmem_cache_create("xfs_rmapbt_cur", + xfs_btree_cur_sizeof(xfs_rmapbt_maxlevels_ondisk()), + 0, 0, NULL); + + if (!xfs_rmapbt_cur_cache) + return -ENOMEM; + return 0; +} + +void +xfs_rmapbt_destroy_cur_cache(void) +{ + kmem_cache_destroy(xfs_rmapbt_cur_cache); + xfs_rmapbt_cur_cache = NULL; +} diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h index f2eee6572af4..3244715dd111 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.h +++ b/fs/xfs/libxfs/xfs_rmap_btree.h @@ -59,4 +59,9 @@ extern xfs_extlen_t xfs_rmapbt_max_size(struct xfs_mount *mp, extern int xfs_rmapbt_calc_reserves(struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_perag *pag, xfs_extlen_t *ask, xfs_extlen_t *used); +unsigned int xfs_rmapbt_maxlevels_ondisk(void); + +int __init xfs_rmapbt_init_cur_cache(void); +void xfs_rmapbt_destroy_cur_cache(void); + #endif /* __XFS_RMAP_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index e58349be78bd..f4e84aa1d50a 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -495,7 +495,7 @@ xfs_sb_quota_from_disk(struct xfs_sb *sbp) static void __xfs_sb_from_disk( struct xfs_sb *to, - xfs_dsb_t *from, + struct xfs_dsb *from, bool convert_xquota) { to->sb_magicnum = be32_to_cpu(from->sb_magicnum); @@ -571,7 +571,7 @@ __xfs_sb_from_disk( void xfs_sb_from_disk( struct xfs_sb *to, - xfs_dsb_t *from) + struct xfs_dsb *from) { __xfs_sb_from_disk(to, from, true); } diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index 5e300daa2559..6f83d9b306ee 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -70,7 +70,7 @@ xfs_allocfree_log_count( { uint blocks; - blocks = num_ops * 2 * (2 * mp->m_ag_maxlevels - 1); + blocks = num_ops * 2 * (2 * mp->m_alloc_maxlevels - 1); if (xfs_has_rmapbt(mp)) blocks += num_ops * (2 * mp->m_rmap_maxlevels - 1); if (xfs_has_reflink(mp)) @@ -814,6 +814,19 @@ xfs_trans_resv_calc( struct xfs_mount *mp, struct xfs_trans_resv *resp) { + unsigned int rmap_maxlevels = mp->m_rmap_maxlevels; + + /* + * In the early days of rmap+reflink, we always set the rmap maxlevels + * to 9 even if the AG was small enough that it would never grow to + * that height. Transaction reservation sizes influence the minimum + * log size calculation, which influences the size of the log that mkfs + * creates. Use the old value here to ensure that newly formatted + * small filesystems will mount on older kernels. + */ + if (xfs_has_rmapbt(mp) && xfs_has_reflink(mp)) + mp->m_rmap_maxlevels = XFS_OLD_REFLINK_RMAP_MAXLEVELS; + /* * The following transactions are logged in physical format and * require a permanent reservation on space. @@ -916,4 +929,7 @@ xfs_trans_resv_calc( resp->tr_clearagi.tr_logres = xfs_calc_clear_agi_bucket_reservation(mp); resp->tr_growrtzero.tr_logres = xfs_calc_growrtzero_reservation(mp); resp->tr_growrtfree.tr_logres = xfs_calc_growrtfree_reservation(mp); + + /* Put everything back the way it was. This goes at the end. */ + mp->m_rmap_maxlevels = rmap_maxlevels; } diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h index 50332be34388..87b31c69a773 100644 --- a/fs/xfs/libxfs/xfs_trans_space.h +++ b/fs/xfs/libxfs/xfs_trans_space.h @@ -17,6 +17,13 @@ /* Adding one rmap could split every level up to the top of the tree. */ #define XFS_RMAPADD_SPACE_RES(mp) ((mp)->m_rmap_maxlevels) +/* + * Note that we historically set m_rmap_maxlevels to 9 when reflink is enabled, + * so we must preserve this behavior to avoid changing the transaction space + * reservations and minimum log size calculations for existing filesystems. + */ +#define XFS_OLD_REFLINK_RMAP_MAXLEVELS 9 + /* Blocks we might need to add "b" rmaps to a tree. */ #define XFS_NRMAPADD_SPACE_RES(mp, b)\ (((b + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) - 1) / \ @@ -74,7 +81,7 @@ #define XFS_DIOSTRAT_SPACE_RES(mp, v) \ (XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + (v)) #define XFS_GROWFS_SPACE_RES(mp) \ - (2 * (mp)->m_ag_maxlevels) + (2 * (mp)->m_alloc_maxlevels) #define XFS_GROWFSRT_SPACE_RES(mp,b) \ ((b) + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK)) #define XFS_LINK_SPACE_RES(mp,nl) \ diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index ae3c9f6e2c69..bed798792226 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -555,11 +555,11 @@ xchk_agf( xchk_block_set_corrupt(sc, sc->sa.agf_bp); level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]); - if (level <= 0 || level > XFS_BTREE_MAXLEVELS) + if (level <= 0 || level > mp->m_alloc_maxlevels) xchk_block_set_corrupt(sc, sc->sa.agf_bp); level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]); - if (level <= 0 || level > XFS_BTREE_MAXLEVELS) + if (level <= 0 || level > mp->m_alloc_maxlevels) xchk_block_set_corrupt(sc, sc->sa.agf_bp); if (xfs_has_rmapbt(mp)) { @@ -568,7 +568,7 @@ xchk_agf( xchk_block_set_corrupt(sc, sc->sa.agf_bp); level = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]); - if (level <= 0 || level > XFS_BTREE_MAXLEVELS) + if (level <= 0 || level > mp->m_rmap_maxlevels) xchk_block_set_corrupt(sc, sc->sa.agf_bp); } @@ -578,7 +578,7 @@ xchk_agf( xchk_block_set_corrupt(sc, sc->sa.agf_bp); level = be32_to_cpu(agf->agf_refcount_level); - if (level <= 0 || level > XFS_BTREE_MAXLEVELS) + if (level <= 0 || level > mp->m_refc_maxlevels) xchk_block_set_corrupt(sc, sc->sa.agf_bp); } @@ -850,6 +850,7 @@ xchk_agi( struct xfs_mount *mp = sc->mp; struct xfs_agi *agi; struct xfs_perag *pag; + struct xfs_ino_geometry *igeo = M_IGEO(sc->mp); xfs_agnumber_t agno = sc->sm->sm_agno; xfs_agblock_t agbno; xfs_agblock_t eoag; @@ -880,7 +881,7 @@ xchk_agi( xchk_block_set_corrupt(sc, sc->sa.agi_bp); level = be32_to_cpu(agi->agi_level); - if (level <= 0 || level > XFS_BTREE_MAXLEVELS) + if (level <= 0 || level > igeo->inobt_maxlevels) xchk_block_set_corrupt(sc, sc->sa.agi_bp); if (xfs_has_finobt(mp)) { @@ -889,7 +890,7 @@ xchk_agi( xchk_block_set_corrupt(sc, sc->sa.agi_bp); level = be32_to_cpu(agi->agi_free_level); - if (level <= 0 || level > XFS_BTREE_MAXLEVELS) + if (level <= 0 || level > igeo->inobt_maxlevels) xchk_block_set_corrupt(sc, sc->sa.agi_bp); } diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c index 0f8deee66f15..d7bfed52f4cd 100644 --- a/fs/xfs/scrub/agheader_repair.c +++ b/fs/xfs/scrub/agheader_repair.c @@ -122,7 +122,7 @@ xrep_check_btree_root( xfs_agnumber_t agno = sc->sm->sm_agno; return xfs_verify_agbno(mp, agno, fab->root) && - fab->height <= XFS_BTREE_MAXLEVELS; + fab->height <= fab->maxlevels; } /* @@ -339,18 +339,22 @@ xrep_agf( [XREP_AGF_BNOBT] = { .rmap_owner = XFS_RMAP_OWN_AG, .buf_ops = &xfs_bnobt_buf_ops, + .maxlevels = sc->mp->m_alloc_maxlevels, }, [XREP_AGF_CNTBT] = { .rmap_owner = XFS_RMAP_OWN_AG, .buf_ops = &xfs_cntbt_buf_ops, + .maxlevels = sc->mp->m_alloc_maxlevels, }, [XREP_AGF_RMAPBT] = { .rmap_owner = XFS_RMAP_OWN_AG, .buf_ops = &xfs_rmapbt_buf_ops, + .maxlevels = sc->mp->m_rmap_maxlevels, }, [XREP_AGF_REFCOUNTBT] = { .rmap_owner = XFS_RMAP_OWN_REFC, .buf_ops = &xfs_refcountbt_buf_ops, + .maxlevels = sc->mp->m_refc_maxlevels, }, [XREP_AGF_END] = { .buf_ops = NULL, @@ -881,10 +885,12 @@ xrep_agi( [XREP_AGI_INOBT] = { .rmap_owner = XFS_RMAP_OWN_INOBT, .buf_ops = &xfs_inobt_buf_ops, + .maxlevels = M_IGEO(sc->mp)->inobt_maxlevels, }, [XREP_AGI_FINOBT] = { .rmap_owner = XFS_RMAP_OWN_INOBT, .buf_ops = &xfs_finobt_buf_ops, + .maxlevels = M_IGEO(sc->mp)->inobt_maxlevels, }, [XREP_AGI_END] = { .buf_ops = NULL diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c index d6d24c866bc4..b89bf9de9b1c 100644 --- a/fs/xfs/scrub/bitmap.c +++ b/fs/xfs/scrub/bitmap.c @@ -222,21 +222,21 @@ out: * 1 2 3 * * Pretend for this example that each leaf block has 100 btree records. For - * the first btree record, we'll observe that bc_ptrs[0] == 1, so we record - * that we saw block 1. Then we observe that bc_ptrs[1] == 1, so we record - * block 4. The list is [1, 4]. + * the first btree record, we'll observe that bc_levels[0].ptr == 1, so we + * record that we saw block 1. Then we observe that bc_levels[1].ptr == 1, so + * we record block 4. The list is [1, 4]. * - * For the second btree record, we see that bc_ptrs[0] == 2, so we exit the - * loop. The list remains [1, 4]. + * For the second btree record, we see that bc_levels[0].ptr == 2, so we exit + * the loop. The list remains [1, 4]. * * For the 101st btree record, we've moved onto leaf block 2. Now - * bc_ptrs[0] == 1 again, so we record that we saw block 2. We see that - * bc_ptrs[1] == 2, so we exit the loop. The list is now [1, 4, 2]. + * bc_levels[0].ptr == 1 again, so we record that we saw block 2. We see that + * bc_levels[1].ptr == 2, so we exit the loop. The list is now [1, 4, 2]. * - * For the 102nd record, bc_ptrs[0] == 2, so we continue. + * For the 102nd record, bc_levels[0].ptr == 2, so we continue. * - * For the 201st record, we've moved on to leaf block 3. bc_ptrs[0] == 1, so - * we add 3 to the list. Now it is [1, 4, 2, 3]. + * For the 201st record, we've moved on to leaf block 3. + * bc_levels[0].ptr == 1, so we add 3 to the list. Now it is [1, 4, 2, 3]. * * For the 300th record we just exit, with the list being [1, 4, 2, 3]. */ @@ -256,7 +256,7 @@ xbitmap_set_btcur_path( int i; int error; - for (i = 0; i < cur->bc_nlevels && cur->bc_ptrs[i] == 1; i++) { + for (i = 0; i < cur->bc_nlevels && cur->bc_levels[i].ptr == 1; i++) { xfs_btree_get_block(cur, i, &bp); if (!bp) continue; diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 017da9ceaee9..a4cbbc346f60 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -402,7 +402,7 @@ xchk_bmapbt_rec( * the root since the verifiers don't do that. */ if (xfs_has_crc(bs->cur->bc_mp) && - bs->cur->bc_ptrs[0] == 1) { + bs->cur->bc_levels[0].ptr == 1) { for (i = 0; i < bs->cur->bc_nlevels - 1; i++) { block = xfs_btree_get_block(bs->cur, i, &bp); owner = be64_to_cpu(block->bb_u.l.bb_owner); diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c index eccb855dc904..39dd46f038fe 100644 --- a/fs/xfs/scrub/btree.c +++ b/fs/xfs/scrub/btree.c @@ -136,14 +136,14 @@ xchk_btree_rec( struct xfs_buf *bp; block = xfs_btree_get_block(cur, 0, &bp); - rec = xfs_btree_rec_addr(cur, cur->bc_ptrs[0], block); + rec = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr, block); trace_xchk_btree_rec(bs->sc, cur, 0); /* If this isn't the first record, are they in order? */ - if (!bs->firstrec && !cur->bc_ops->recs_inorder(cur, &bs->lastrec, rec)) + if (cur->bc_levels[0].ptr > 1 && + !cur->bc_ops->recs_inorder(cur, &bs->lastrec, rec)) xchk_btree_set_corrupt(bs->sc, cur, 0); - bs->firstrec = false; memcpy(&bs->lastrec, rec, cur->bc_ops->rec_len); if (cur->bc_nlevels == 1) @@ -152,7 +152,7 @@ xchk_btree_rec( /* Is this at least as large as the parent low key? */ cur->bc_ops->init_key_from_rec(&key, rec); keyblock = xfs_btree_get_block(cur, 1, &bp); - keyp = xfs_btree_key_addr(cur, cur->bc_ptrs[1], keyblock); + keyp = xfs_btree_key_addr(cur, cur->bc_levels[1].ptr, keyblock); if (cur->bc_ops->diff_two_keys(cur, &key, keyp) < 0) xchk_btree_set_corrupt(bs->sc, cur, 1); @@ -161,7 +161,7 @@ xchk_btree_rec( /* Is this no larger than the parent high key? */ cur->bc_ops->init_high_key_from_rec(&hkey, rec); - keyp = xfs_btree_high_key_addr(cur, cur->bc_ptrs[1], keyblock); + keyp = xfs_btree_high_key_addr(cur, cur->bc_levels[1].ptr, keyblock); if (cur->bc_ops->diff_two_keys(cur, keyp, &hkey) < 0) xchk_btree_set_corrupt(bs->sc, cur, 1); } @@ -183,23 +183,22 @@ xchk_btree_key( struct xfs_buf *bp; block = xfs_btree_get_block(cur, level, &bp); - key = xfs_btree_key_addr(cur, cur->bc_ptrs[level], block); + key = xfs_btree_key_addr(cur, cur->bc_levels[level].ptr, block); trace_xchk_btree_key(bs->sc, cur, level); /* If this isn't the first key, are they in order? */ - if (!bs->firstkey[level] && - !cur->bc_ops->keys_inorder(cur, &bs->lastkey[level], key)) + if (cur->bc_levels[level].ptr > 1 && + !cur->bc_ops->keys_inorder(cur, &bs->lastkey[level - 1], key)) xchk_btree_set_corrupt(bs->sc, cur, level); - bs->firstkey[level] = false; - memcpy(&bs->lastkey[level], key, cur->bc_ops->key_len); + memcpy(&bs->lastkey[level - 1], key, cur->bc_ops->key_len); if (level + 1 >= cur->bc_nlevels) return; /* Is this at least as large as the parent low key? */ keyblock = xfs_btree_get_block(cur, level + 1, &bp); - keyp = xfs_btree_key_addr(cur, cur->bc_ptrs[level + 1], keyblock); + keyp = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr, keyblock); if (cur->bc_ops->diff_two_keys(cur, key, keyp) < 0) xchk_btree_set_corrupt(bs->sc, cur, level); @@ -207,8 +206,9 @@ xchk_btree_key( return; /* Is this no larger than the parent high key? */ - key = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level], block); - keyp = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level + 1], keyblock); + key = xfs_btree_high_key_addr(cur, cur->bc_levels[level].ptr, block); + keyp = xfs_btree_high_key_addr(cur, cur->bc_levels[level + 1].ptr, + keyblock); if (cur->bc_ops->diff_two_keys(cur, keyp, key) < 0) xchk_btree_set_corrupt(bs->sc, cur, level); } @@ -291,7 +291,7 @@ xchk_btree_block_check_sibling( /* Compare upper level pointer to sibling pointer. */ pblock = xfs_btree_get_block(ncur, level + 1, &pbp); - pp = xfs_btree_ptr_addr(ncur, ncur->bc_ptrs[level + 1], pblock); + pp = xfs_btree_ptr_addr(ncur, ncur->bc_levels[level + 1].ptr, pblock); if (!xchk_btree_ptr_ok(bs, level + 1, pp)) goto out; if (pbp) @@ -596,7 +596,7 @@ xchk_btree_block_keys( /* Obtain the parent's copy of the keys for this block. */ parent_block = xfs_btree_get_block(cur, level + 1, &bp); - parent_keys = xfs_btree_key_addr(cur, cur->bc_ptrs[level + 1], + parent_keys = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr, parent_block); if (cur->bc_ops->diff_two_keys(cur, &block_keys, parent_keys) != 0) @@ -607,7 +607,7 @@ xchk_btree_block_keys( /* Get high keys */ high_bk = xfs_btree_high_key_from_key(cur, &block_keys); - high_pk = xfs_btree_high_key_addr(cur, cur->bc_ptrs[level + 1], + high_pk = xfs_btree_high_key_addr(cur, cur->bc_levels[level + 1].ptr, parent_block); if (cur->bc_ops->diff_two_keys(cur, high_bk, high_pk) != 0) @@ -627,35 +627,39 @@ xchk_btree( const struct xfs_owner_info *oinfo, void *private) { - struct xchk_btree bs = { - .cur = cur, - .scrub_rec = scrub_fn, - .oinfo = oinfo, - .firstrec = true, - .private = private, - .sc = sc, - }; union xfs_btree_ptr ptr; + struct xchk_btree *bs; union xfs_btree_ptr *pp; union xfs_btree_rec *recp; struct xfs_btree_block *block; - int level; struct xfs_buf *bp; struct check_owner *co; struct check_owner *n; - int i; + size_t cur_sz; + int level; int error = 0; - /* Initialize scrub state */ - for (i = 0; i < XFS_BTREE_MAXLEVELS; i++) - bs.firstkey[i] = true; - INIT_LIST_HEAD(&bs.to_check); - - /* Don't try to check a tree with a height we can't handle. */ - if (cur->bc_nlevels > XFS_BTREE_MAXLEVELS) { + /* + * Allocate the btree scrub context from the heap, because this + * structure can get rather large. Don't let a caller feed us a + * totally absurd size. + */ + cur_sz = xchk_btree_sizeof(cur->bc_nlevels); + if (cur_sz > PAGE_SIZE) { xchk_btree_set_corrupt(sc, cur, 0); - goto out; + return 0; } + bs = kmem_zalloc(cur_sz, KM_NOFS | KM_MAYFAIL); + if (!bs) + return -ENOMEM; + bs->cur = cur; + bs->scrub_rec = scrub_fn; + bs->oinfo = oinfo; + bs->private = private; + bs->sc = sc; + + /* Initialize scrub state */ + INIT_LIST_HEAD(&bs->to_check); /* * Load the root of the btree. The helper function absorbs @@ -663,79 +667,82 @@ xchk_btree( */ level = cur->bc_nlevels - 1; cur->bc_ops->init_ptr_from_cur(cur, &ptr); - if (!xchk_btree_ptr_ok(&bs, cur->bc_nlevels, &ptr)) + if (!xchk_btree_ptr_ok(bs, cur->bc_nlevels, &ptr)) goto out; - error = xchk_btree_get_block(&bs, level, &ptr, &block, &bp); + error = xchk_btree_get_block(bs, level, &ptr, &block, &bp); if (error || !block) goto out; - cur->bc_ptrs[level] = 1; + cur->bc_levels[level].ptr = 1; while (level < cur->bc_nlevels) { block = xfs_btree_get_block(cur, level, &bp); if (level == 0) { /* End of leaf, pop back towards the root. */ - if (cur->bc_ptrs[level] > + if (cur->bc_levels[level].ptr > be16_to_cpu(block->bb_numrecs)) { - xchk_btree_block_keys(&bs, level, block); + xchk_btree_block_keys(bs, level, block); if (level < cur->bc_nlevels - 1) - cur->bc_ptrs[level + 1]++; + cur->bc_levels[level + 1].ptr++; level++; continue; } /* Records in order for scrub? */ - xchk_btree_rec(&bs); + xchk_btree_rec(bs); /* Call out to the record checker. */ - recp = xfs_btree_rec_addr(cur, cur->bc_ptrs[0], block); - error = bs.scrub_rec(&bs, recp); + recp = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr, + block); + error = bs->scrub_rec(bs, recp); if (error) break; if (xchk_should_terminate(sc, &error) || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) break; - cur->bc_ptrs[level]++; + cur->bc_levels[level].ptr++; continue; } /* End of node, pop back towards the root. */ - if (cur->bc_ptrs[level] > be16_to_cpu(block->bb_numrecs)) { - xchk_btree_block_keys(&bs, level, block); + if (cur->bc_levels[level].ptr > + be16_to_cpu(block->bb_numrecs)) { + xchk_btree_block_keys(bs, level, block); if (level < cur->bc_nlevels - 1) - cur->bc_ptrs[level + 1]++; + cur->bc_levels[level + 1].ptr++; level++; continue; } /* Keys in order for scrub? */ - xchk_btree_key(&bs, level); + xchk_btree_key(bs, level); /* Drill another level deeper. */ - pp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[level], block); - if (!xchk_btree_ptr_ok(&bs, level, pp)) { - cur->bc_ptrs[level]++; + pp = xfs_btree_ptr_addr(cur, cur->bc_levels[level].ptr, block); + if (!xchk_btree_ptr_ok(bs, level, pp)) { + cur->bc_levels[level].ptr++; continue; } level--; - error = xchk_btree_get_block(&bs, level, pp, &block, &bp); + error = xchk_btree_get_block(bs, level, pp, &block, &bp); if (error || !block) goto out; - cur->bc_ptrs[level] = 1; + cur->bc_levels[level].ptr = 1; } out: /* Process deferred owner checks on btree blocks. */ - list_for_each_entry_safe(co, n, &bs.to_check, list) { - if (!error && bs.cur) - error = xchk_btree_check_block_owner(&bs, - co->level, co->daddr); + list_for_each_entry_safe(co, n, &bs->to_check, list) { + if (!error && bs->cur) + error = xchk_btree_check_block_owner(bs, co->level, + co->daddr); list_del(&co->list); kmem_free(co); } + kmem_free(bs); return error; } diff --git a/fs/xfs/scrub/btree.h b/fs/xfs/scrub/btree.h index b7d2fc01fbf9..da61a53a0b61 100644 --- a/fs/xfs/scrub/btree.h +++ b/fs/xfs/scrub/btree.h @@ -39,11 +39,22 @@ struct xchk_btree { /* internal scrub state */ union xfs_btree_rec lastrec; - bool firstrec; - union xfs_btree_key lastkey[XFS_BTREE_MAXLEVELS]; - bool firstkey[XFS_BTREE_MAXLEVELS]; struct list_head to_check; + + /* this element must come last! */ + union xfs_btree_key lastkey[]; }; + +/* + * Calculate the size of a xchk_btree structure. There are nlevels-1 slots for + * keys because we track leaf records separately in lastrec. + */ +static inline size_t +xchk_btree_sizeof(unsigned int nlevels) +{ + return struct_size((struct xchk_btree *)NULL, lastkey, nlevels - 1); +} + int xchk_btree(struct xfs_scrub *sc, struct xfs_btree_cur *cur, xchk_btree_rec_fn scrub_fn, const struct xfs_owner_info *oinfo, void *private); diff --git a/fs/xfs/scrub/dabtree.c b/fs/xfs/scrub/dabtree.c index 8a52514bc1ff..b962cfbbd92b 100644 --- a/fs/xfs/scrub/dabtree.c +++ b/fs/xfs/scrub/dabtree.c @@ -473,7 +473,7 @@ xchk_da_btree( xchk_da_btree_rec_fn scrub_fn, void *private) { - struct xchk_da_btree ds = {}; + struct xchk_da_btree *ds; struct xfs_mount *mp = sc->mp; struct xfs_da_state_blk *blks; struct xfs_da_node_entry *key; @@ -486,32 +486,35 @@ xchk_da_btree( return 0; /* Set up initial da state. */ - ds.dargs.dp = sc->ip; - ds.dargs.whichfork = whichfork; - ds.dargs.trans = sc->tp; - ds.dargs.op_flags = XFS_DA_OP_OKNOENT; - ds.state = xfs_da_state_alloc(&ds.dargs); - ds.sc = sc; - ds.private = private; + ds = kmem_zalloc(sizeof(struct xchk_da_btree), KM_NOFS | KM_MAYFAIL); + if (!ds) + return -ENOMEM; + ds->dargs.dp = sc->ip; + ds->dargs.whichfork = whichfork; + ds->dargs.trans = sc->tp; + ds->dargs.op_flags = XFS_DA_OP_OKNOENT; + ds->state = xfs_da_state_alloc(&ds->dargs); + ds->sc = sc; + ds->private = private; if (whichfork == XFS_ATTR_FORK) { - ds.dargs.geo = mp->m_attr_geo; - ds.lowest = 0; - ds.highest = 0; + ds->dargs.geo = mp->m_attr_geo; + ds->lowest = 0; + ds->highest = 0; } else { - ds.dargs.geo = mp->m_dir_geo; - ds.lowest = ds.dargs.geo->leafblk; - ds.highest = ds.dargs.geo->freeblk; + ds->dargs.geo = mp->m_dir_geo; + ds->lowest = ds->dargs.geo->leafblk; + ds->highest = ds->dargs.geo->freeblk; } - blkno = ds.lowest; + blkno = ds->lowest; level = 0; /* Find the root of the da tree, if present. */ - blks = ds.state->path.blk; - error = xchk_da_btree_block(&ds, level, blkno); + blks = ds->state->path.blk; + error = xchk_da_btree_block(ds, level, blkno); if (error) goto out_state; /* - * We didn't find a block at ds.lowest, which means that there's + * We didn't find a block at ds->lowest, which means that there's * no LEAF1/LEAFN tree (at least not where it's supposed to be), * so jump out now. */ @@ -523,16 +526,16 @@ xchk_da_btree( /* Handle leaf block. */ if (blks[level].magic != XFS_DA_NODE_MAGIC) { /* End of leaf, pop back towards the root. */ - if (blks[level].index >= ds.maxrecs[level]) { + if (blks[level].index >= ds->maxrecs[level]) { if (level > 0) blks[level - 1].index++; - ds.tree_level++; + ds->tree_level++; level--; continue; } /* Dispatch record scrubbing. */ - error = scrub_fn(&ds, level); + error = scrub_fn(ds, level); if (error) break; if (xchk_should_terminate(sc, &error) || @@ -545,17 +548,17 @@ xchk_da_btree( /* End of node, pop back towards the root. */ - if (blks[level].index >= ds.maxrecs[level]) { + if (blks[level].index >= ds->maxrecs[level]) { if (level > 0) blks[level - 1].index++; - ds.tree_level++; + ds->tree_level++; level--; continue; } /* Hashes in order for scrub? */ - key = xchk_da_btree_node_entry(&ds, level); - error = xchk_da_btree_hash(&ds, level, &key->hashval); + key = xchk_da_btree_node_entry(ds, level); + error = xchk_da_btree_hash(ds, level, &key->hashval); if (error) goto out; @@ -564,11 +567,11 @@ xchk_da_btree( level++; if (level >= XFS_DA_NODE_MAXDEPTH) { /* Too deep! */ - xchk_da_set_corrupt(&ds, level - 1); + xchk_da_set_corrupt(ds, level - 1); break; } - ds.tree_level--; - error = xchk_da_btree_block(&ds, level, blkno); + ds->tree_level--; + error = xchk_da_btree_block(ds, level, blkno); if (error) goto out; if (blks[level].bp == NULL) @@ -587,6 +590,7 @@ out: } out_state: - xfs_da_state_free(ds.state); + xfs_da_state_free(ds->state); + kmem_free(ds); return error; } diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h index 3bb152d52a07..840f74ec431c 100644 --- a/fs/xfs/scrub/repair.h +++ b/fs/xfs/scrub/repair.h @@ -44,6 +44,9 @@ struct xrep_find_ag_btree { /* in: buffer ops */ const struct xfs_buf_ops *buf_ops; + /* in: maximum btree height */ + unsigned int maxlevels; + /* out: the highest btree block found and the tree height */ xfs_agblock_t root; unsigned int height; diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 51e4c61916d2..8d528d35b725 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -461,15 +461,10 @@ xfs_scrub_metadata( struct file *file, struct xfs_scrub_metadata *sm) { - struct xfs_scrub sc = { - .file = file, - .sm = sm, - }; + struct xfs_scrub *sc; struct xfs_mount *mp = XFS_I(file_inode(file))->i_mount; int error = 0; - sc.mp = mp; - BUILD_BUG_ON(sizeof(meta_scrub_ops) != (sizeof(struct xchk_meta_ops) * XFS_SCRUB_TYPE_NR)); @@ -489,59 +484,68 @@ xfs_scrub_metadata( xchk_experimental_warning(mp); - sc.ops = &meta_scrub_ops[sm->sm_type]; - sc.sick_mask = xchk_health_mask_for_scrub_type(sm->sm_type); + sc = kmem_zalloc(sizeof(struct xfs_scrub), KM_NOFS | KM_MAYFAIL); + if (!sc) { + error = -ENOMEM; + goto out; + } + + sc->mp = mp; + sc->file = file; + sc->sm = sm; + sc->ops = &meta_scrub_ops[sm->sm_type]; + sc->sick_mask = xchk_health_mask_for_scrub_type(sm->sm_type); retry_op: /* * When repairs are allowed, prevent freezing or readonly remount while * scrub is running with a real transaction. */ if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) { - error = mnt_want_write_file(sc.file); + error = mnt_want_write_file(sc->file); if (error) - goto out; + goto out_sc; } /* Set up for the operation. */ - error = sc.ops->setup(&sc); + error = sc->ops->setup(sc); if (error) goto out_teardown; /* Scrub for errors. */ - error = sc.ops->scrub(&sc); - if (!(sc.flags & XCHK_TRY_HARDER) && error == -EDEADLOCK) { + error = sc->ops->scrub(sc); + if (!(sc->flags & XCHK_TRY_HARDER) && error == -EDEADLOCK) { /* * Scrubbers return -EDEADLOCK to mean 'try harder'. * Tear down everything we hold, then set up again with * preparation for worst-case scenarios. */ - error = xchk_teardown(&sc, 0); + error = xchk_teardown(sc, 0); if (error) - goto out; - sc.flags |= XCHK_TRY_HARDER; + goto out_sc; + sc->flags |= XCHK_TRY_HARDER; goto retry_op; } else if (error || (sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)) goto out_teardown; - xchk_update_health(&sc); + xchk_update_health(sc); - if ((sc.sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && - !(sc.flags & XREP_ALREADY_FIXED)) { + if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && + !(sc->flags & XREP_ALREADY_FIXED)) { bool needs_fix; /* Let debug users force us into the repair routines. */ if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) - sc.sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; - needs_fix = (sc.sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | - XFS_SCRUB_OFLAG_XCORRUPT | - XFS_SCRUB_OFLAG_PREEN)); + needs_fix = (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | + XFS_SCRUB_OFLAG_XCORRUPT | + XFS_SCRUB_OFLAG_PREEN)); /* * If userspace asked for a repair but it wasn't necessary, * report that back to userspace. */ if (!needs_fix) { - sc.sm->sm_flags |= XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED; + sc->sm->sm_flags |= XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED; goto out_nofix; } @@ -549,26 +553,28 @@ retry_op: * If it's broken, userspace wants us to fix it, and we haven't * already tried to fix it, then attempt a repair. */ - error = xrep_attempt(&sc); + error = xrep_attempt(sc); if (error == -EAGAIN) { /* * Either the repair function succeeded or it couldn't * get all the resources it needs; either way, we go * back to the beginning and call the scrub function. */ - error = xchk_teardown(&sc, 0); + error = xchk_teardown(sc, 0); if (error) { xrep_failure(mp); - goto out; + goto out_sc; } goto retry_op; } } out_nofix: - xchk_postmortem(&sc); + xchk_postmortem(sc); out_teardown: - error = xchk_teardown(&sc, error); + error = xchk_teardown(sc, error); +out_sc: + kmem_free(sc); out: trace_xchk_done(XFS_I(file_inode(file)), sm, error); if (error == -EFSCORRUPTED || error == -EFSBADCRC) { diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c index c0ef53fe6611..b5f94676c37c 100644 --- a/fs/xfs/scrub/trace.c +++ b/fs/xfs/scrub/trace.c @@ -21,13 +21,14 @@ xchk_btree_cur_fsbno( struct xfs_btree_cur *cur, int level) { - if (level < cur->bc_nlevels && cur->bc_bufs[level]) + if (level < cur->bc_nlevels && cur->bc_levels[level].bp) return XFS_DADDR_TO_FSB(cur->bc_mp, - xfs_buf_daddr(cur->bc_bufs[level])); - if (level == cur->bc_nlevels - 1 && cur->bc_flags & XFS_BTREE_LONG_PTRS) + xfs_buf_daddr(cur->bc_levels[level].bp)); + + if (level == cur->bc_nlevels - 1 && + (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)) return XFS_INO_TO_FSB(cur->bc_mp, cur->bc_ino.ip->i_ino); - if (!(cur->bc_flags & XFS_BTREE_LONG_PTRS)) - return XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_ag.pag->pag_agno, 0); + return NULLFSBLOCK; } diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index a7bbb84f91a7..93ece6df02e3 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -348,7 +348,7 @@ TRACE_EVENT(xchk_btree_op_error, __entry->level = level; __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); - __entry->ptr = cur->bc_ptrs[level]; + __entry->ptr = cur->bc_levels[level].ptr; __entry->error = error; __entry->ret_ip = ret_ip; ), @@ -389,7 +389,7 @@ TRACE_EVENT(xchk_ifork_btree_op_error, __entry->type = sc->sm->sm_type; __entry->btnum = cur->bc_btnum; __entry->level = level; - __entry->ptr = cur->bc_ptrs[level]; + __entry->ptr = cur->bc_levels[level].ptr; __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); __entry->error = error; @@ -431,7 +431,7 @@ TRACE_EVENT(xchk_btree_error, __entry->level = level; __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); - __entry->ptr = cur->bc_ptrs[level]; + __entry->ptr = cur->bc_levels[level].ptr; __entry->ret_ip = ret_ip; ), TP_printk("dev %d:%d type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS", @@ -471,7 +471,7 @@ TRACE_EVENT(xchk_ifork_btree_error, __entry->level = level; __entry->agno = XFS_FSB_TO_AGNO(cur->bc_mp, fsbno); __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); - __entry->ptr = cur->bc_ptrs[level]; + __entry->ptr = cur->bc_levels[level].ptr; __entry->ret_ip = ret_ip; ), TP_printk("dev %d:%d ino 0x%llx fork %s type %s btree %s level %d ptr %d agno 0x%x agbno 0x%x ret_ip %pS", @@ -511,7 +511,7 @@ DECLARE_EVENT_CLASS(xchk_sbtree_class, __entry->bno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno); __entry->level = level; __entry->nlevels = cur->bc_nlevels; - __entry->ptr = cur->bc_ptrs[level]; + __entry->ptr = cur->bc_levels[level].ptr; ), TP_printk("dev %d:%d type %s btree %s agno 0x%x agbno 0x%x level %d nlevels %d ptr %d", MAJOR(__entry->dev), MINOR(__entry->dev), diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 34fc6148032a..c8c15c3c3147 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -82,6 +82,7 @@ xfs_end_ioend( struct iomap_ioend *ioend) { struct xfs_inode *ip = XFS_I(ioend->io_inode); + struct xfs_mount *mp = ip->i_mount; xfs_off_t offset = ioend->io_offset; size_t size = ioend->io_size; unsigned int nofs_flag; @@ -97,18 +98,26 @@ xfs_end_ioend( /* * Just clean up the in-memory structures if the fs has been shut down. */ - if (xfs_is_shutdown(ip->i_mount)) { + if (xfs_is_shutdown(mp)) { error = -EIO; goto done; } /* - * Clean up any COW blocks on an I/O error. + * Clean up all COW blocks and underlying data fork delalloc blocks on + * I/O error. The delalloc punch is required because this ioend was + * mapped to blocks in the COW fork and the associated pages are no + * longer dirty. If we don't remove delalloc blocks here, they become + * stale and can corrupt free space accounting on unmount. */ error = blk_status_to_errno(ioend->io_bio->bi_status); if (unlikely(error)) { - if (ioend->io_flags & IOMAP_F_SHARED) + if (ioend->io_flags & IOMAP_F_SHARED) { xfs_reflink_cancel_cow_range(ip, offset, size, true); + xfs_bmap_punch_delalloc_range(ip, + XFS_B_TO_FSBT(mp, offset), + XFS_B_TO_FSB(mp, size)); + } goto done; } diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index 2b5da6218977..27265771f247 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -390,7 +390,7 @@ out_destroy_fork: /* kill the in-core attr fork before we drop the inode lock */ if (dp->i_afp) { xfs_idestroy_fork(dp->i_afp); - kmem_cache_free(xfs_ifork_zone, dp->i_afp); + kmem_cache_free(xfs_ifork_cache, dp->i_afp); dp->i_afp = NULL; } if (lock_mode) diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 03159970133f..e1f4d7d5a011 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -25,8 +25,8 @@ #include "xfs_log_priv.h" #include "xfs_log_recover.h" -kmem_zone_t *xfs_bui_zone; -kmem_zone_t *xfs_bud_zone; +struct kmem_cache *xfs_bui_cache; +struct kmem_cache *xfs_bud_cache; static const struct xfs_item_ops xfs_bui_item_ops; @@ -39,7 +39,7 @@ STATIC void xfs_bui_item_free( struct xfs_bui_log_item *buip) { - kmem_cache_free(xfs_bui_zone, buip); + kmem_cache_free(xfs_bui_cache, buip); } /* @@ -138,7 +138,7 @@ xfs_bui_init( { struct xfs_bui_log_item *buip; - buip = kmem_cache_zalloc(xfs_bui_zone, GFP_KERNEL | __GFP_NOFAIL); + buip = kmem_cache_zalloc(xfs_bui_cache, GFP_KERNEL | __GFP_NOFAIL); xfs_log_item_init(mp, &buip->bui_item, XFS_LI_BUI, &xfs_bui_item_ops); buip->bui_format.bui_nextents = XFS_BUI_MAX_FAST_EXTENTS; @@ -198,7 +198,7 @@ xfs_bud_item_release( struct xfs_bud_log_item *budp = BUD_ITEM(lip); xfs_bui_release(budp->bud_buip); - kmem_cache_free(xfs_bud_zone, budp); + kmem_cache_free(xfs_bud_cache, budp); } static const struct xfs_item_ops xfs_bud_item_ops = { @@ -215,7 +215,7 @@ xfs_trans_get_bud( { struct xfs_bud_log_item *budp; - budp = kmem_cache_zalloc(xfs_bud_zone, GFP_KERNEL | __GFP_NOFAIL); + budp = kmem_cache_zalloc(xfs_bud_cache, GFP_KERNEL | __GFP_NOFAIL); xfs_log_item_init(tp->t_mountp, &budp->bud_item, XFS_LI_BUD, &xfs_bud_item_ops); budp->bud_buip = buip; @@ -384,7 +384,7 @@ xfs_bmap_update_finish_item( bmap->bi_bmap.br_blockcount = count; return -EAGAIN; } - kmem_free(bmap); + kmem_cache_free(xfs_bmap_intent_cache, bmap); return error; } @@ -404,7 +404,7 @@ xfs_bmap_update_cancel_item( struct xfs_bmap_intent *bmap; bmap = container_of(item, struct xfs_bmap_intent, bi_list); - kmem_free(bmap); + kmem_cache_free(xfs_bmap_intent_cache, bmap); } const struct xfs_defer_op_type xfs_bmap_update_defer_type = { @@ -532,7 +532,7 @@ xfs_bui_item_recover( * Commit transaction, which frees the transaction and saves the inode * for later replay activities. */ - error = xfs_defer_ops_capture_and_commit(tp, ip, capture_list); + error = xfs_defer_ops_capture_and_commit(tp, capture_list); if (error) goto err_unlock; diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h index b9be62f8bd52..3fafd3881a0b 100644 --- a/fs/xfs/xfs_bmap_item.h +++ b/fs/xfs/xfs_bmap_item.h @@ -25,7 +25,7 @@ /* kernel only BUI/BUD definitions */ struct xfs_mount; -struct kmem_zone; +struct kmem_cache; /* * Max number of extents in fast allocation path. @@ -65,7 +65,7 @@ struct xfs_bud_log_item { struct xfs_bud_log_format bud_format; }; -extern struct kmem_zone *xfs_bui_zone; -extern struct kmem_zone *xfs_bud_zone; +extern struct kmem_cache *xfs_bui_cache; +extern struct kmem_cache *xfs_bud_cache; #endif /* __XFS_BMAP_ITEM_H__ */ diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 5fa6cd947dd4..631c5a61d89b 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -20,7 +20,7 @@ #include "xfs_error.h" #include "xfs_ag.h" -static kmem_zone_t *xfs_buf_zone; +static struct kmem_cache *xfs_buf_cache; /* * Locking orders @@ -220,7 +220,7 @@ _xfs_buf_alloc( int i; *bpp = NULL; - bp = kmem_cache_zalloc(xfs_buf_zone, GFP_NOFS | __GFP_NOFAIL); + bp = kmem_cache_zalloc(xfs_buf_cache, GFP_NOFS | __GFP_NOFAIL); /* * We don't want certain flags to appear in b_flags unless they are @@ -247,7 +247,7 @@ _xfs_buf_alloc( */ error = xfs_buf_get_maps(bp, nmaps); if (error) { - kmem_cache_free(xfs_buf_zone, bp); + kmem_cache_free(xfs_buf_cache, bp); return error; } @@ -307,7 +307,7 @@ xfs_buf_free( kmem_free(bp->b_addr); xfs_buf_free_maps(bp); - kmem_cache_free(xfs_buf_zone, bp); + kmem_cache_free(xfs_buf_cache, bp); } static int @@ -2258,12 +2258,12 @@ xfs_buf_delwri_pushbuf( int __init xfs_buf_init(void) { - xfs_buf_zone = kmem_cache_create("xfs_buf", sizeof(struct xfs_buf), 0, + xfs_buf_cache = kmem_cache_create("xfs_buf", sizeof(struct xfs_buf), 0, SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); - if (!xfs_buf_zone) + if (!xfs_buf_cache) goto out; return 0; @@ -2275,7 +2275,7 @@ xfs_buf_init(void) void xfs_buf_terminate(void) { - kmem_cache_destroy(xfs_buf_zone); + kmem_cache_destroy(xfs_buf_cache); } void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index b1ab100c09e1..a7a8e4528881 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -23,7 +23,7 @@ #include "xfs_log.h" -kmem_zone_t *xfs_buf_item_zone; +struct kmem_cache *xfs_buf_item_cache; static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip) { @@ -804,7 +804,7 @@ xfs_buf_item_init( return 0; } - bip = kmem_cache_zalloc(xfs_buf_item_zone, GFP_KERNEL | __GFP_NOFAIL); + bip = kmem_cache_zalloc(xfs_buf_item_cache, GFP_KERNEL | __GFP_NOFAIL); xfs_log_item_init(mp, &bip->bli_item, XFS_LI_BUF, &xfs_buf_item_ops); bip->bli_buf = bp; @@ -825,7 +825,7 @@ xfs_buf_item_init( map_size = DIV_ROUND_UP(chunks, NBWORD); if (map_size > XFS_BLF_DATAMAP_SIZE) { - kmem_cache_free(xfs_buf_item_zone, bip); + kmem_cache_free(xfs_buf_item_cache, bip); xfs_err(mp, "buffer item dirty bitmap (%u uints) too small to reflect %u bytes!", map_size, @@ -1002,7 +1002,7 @@ xfs_buf_item_free( { xfs_buf_item_free_format(bip); kmem_free(bip->bli_item.li_lv_shadow); - kmem_cache_free(xfs_buf_item_zone, bip); + kmem_cache_free(xfs_buf_item_cache, bip); } /* diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 50aa0f5ef959..e11e9ef2338f 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -71,6 +71,6 @@ static inline void xfs_buf_dquot_io_fail(struct xfs_buf *bp) void xfs_buf_iodone(struct xfs_buf *); bool xfs_buf_log_check_iovec(struct xfs_log_iovec *iovec); -extern kmem_zone_t *xfs_buf_item_zone; +extern struct kmem_cache *xfs_buf_item_cache; #endif /* __XFS_BUF_ITEM_H__ */ diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index a476c7ef5d53..70ca5751b13e 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -603,7 +603,7 @@ xlog_recover_do_inode_buffer( inodes_per_buf = BBTOB(bp->b_length) >> mp->m_sb.sb_inodelog; for (i = 0; i < inodes_per_buf; i++) { next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + - offsetof(xfs_dinode_t, di_next_unlinked); + offsetof(struct xfs_dinode, di_next_unlinked); while (next_unlinked_offset >= (reg_buf_offset + reg_buf_bytes)) { diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index c15d61d47a06..e48ae227bb11 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -38,8 +38,8 @@ * otherwise by the lowest id first, see xfs_dqlock2. */ -struct kmem_zone *xfs_qm_dqtrxzone; -static struct kmem_zone *xfs_qm_dqzone; +struct kmem_cache *xfs_dqtrx_cache; +static struct kmem_cache *xfs_dquot_cache; static struct lock_class_key xfs_dquot_group_class; static struct lock_class_key xfs_dquot_project_class; @@ -57,7 +57,7 @@ xfs_qm_dqdestroy( mutex_destroy(&dqp->q_qlock); XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot); - kmem_cache_free(xfs_qm_dqzone, dqp); + kmem_cache_free(xfs_dquot_cache, dqp); } /* @@ -458,7 +458,7 @@ xfs_dquot_alloc( { struct xfs_dquot *dqp; - dqp = kmem_cache_zalloc(xfs_qm_dqzone, GFP_KERNEL | __GFP_NOFAIL); + dqp = kmem_cache_zalloc(xfs_dquot_cache, GFP_KERNEL | __GFP_NOFAIL); dqp->q_type = type; dqp->q_id = id; @@ -471,7 +471,7 @@ xfs_dquot_alloc( * Offset of dquot in the (fixed sized) dquot chunk. */ dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) * - sizeof(xfs_dqblk_t); + sizeof(struct xfs_dqblk); /* * Because we want to use a counting completion, complete @@ -1363,22 +1363,22 @@ xfs_dqlock2( int __init xfs_qm_init(void) { - xfs_qm_dqzone = kmem_cache_create("xfs_dquot", + xfs_dquot_cache = kmem_cache_create("xfs_dquot", sizeof(struct xfs_dquot), 0, 0, NULL); - if (!xfs_qm_dqzone) + if (!xfs_dquot_cache) goto out; - xfs_qm_dqtrxzone = kmem_cache_create("xfs_dqtrx", + xfs_dqtrx_cache = kmem_cache_create("xfs_dqtrx", sizeof(struct xfs_dquot_acct), 0, 0, NULL); - if (!xfs_qm_dqtrxzone) - goto out_free_dqzone; + if (!xfs_dqtrx_cache) + goto out_free_dquot_cache; return 0; -out_free_dqzone: - kmem_cache_destroy(xfs_qm_dqzone); +out_free_dquot_cache: + kmem_cache_destroy(xfs_dquot_cache); out: return -ENOMEM; } @@ -1386,8 +1386,8 @@ out: void xfs_qm_exit(void) { - kmem_cache_destroy(xfs_qm_dqtrxzone); - kmem_cache_destroy(xfs_qm_dqzone); + kmem_cache_destroy(xfs_dqtrx_cache); + kmem_cache_destroy(xfs_dquot_cache); } /* diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 3f8a0713573a..47ef9c9c5c17 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -25,8 +25,8 @@ #include "xfs_log_priv.h" #include "xfs_log_recover.h" -kmem_zone_t *xfs_efi_zone; -kmem_zone_t *xfs_efd_zone; +struct kmem_cache *xfs_efi_cache; +struct kmem_cache *xfs_efd_cache; static const struct xfs_item_ops xfs_efi_item_ops; @@ -43,7 +43,7 @@ xfs_efi_item_free( if (efip->efi_format.efi_nextents > XFS_EFI_MAX_FAST_EXTENTS) kmem_free(efip); else - kmem_cache_free(xfs_efi_zone, efip); + kmem_cache_free(xfs_efi_cache, efip); } /* @@ -161,7 +161,7 @@ xfs_efi_init( ((nextents - 1) * sizeof(xfs_extent_t))); efip = kmem_zalloc(size, 0); } else { - efip = kmem_cache_zalloc(xfs_efi_zone, + efip = kmem_cache_zalloc(xfs_efi_cache, GFP_KERNEL | __GFP_NOFAIL); } @@ -241,7 +241,7 @@ xfs_efd_item_free(struct xfs_efd_log_item *efdp) if (efdp->efd_format.efd_nextents > XFS_EFD_MAX_FAST_EXTENTS) kmem_free(efdp); else - kmem_cache_free(xfs_efd_zone, efdp); + kmem_cache_free(xfs_efd_cache, efdp); } /* @@ -333,7 +333,7 @@ xfs_trans_get_efd( (nextents - 1) * sizeof(struct xfs_extent), 0); } else { - efdp = kmem_cache_zalloc(xfs_efd_zone, + efdp = kmem_cache_zalloc(xfs_efd_cache, GFP_KERNEL | __GFP_NOFAIL); } @@ -474,15 +474,21 @@ xfs_extent_free_finish_item( struct list_head *item, struct xfs_btree_cur **state) { + struct xfs_owner_info oinfo = { }; struct xfs_extent_free_item *free; int error; free = container_of(item, struct xfs_extent_free_item, xefi_list); + oinfo.oi_owner = free->xefi_owner; + if (free->xefi_flags & XFS_EFI_ATTR_FORK) + oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK; + if (free->xefi_flags & XFS_EFI_BMBT_BLOCK) + oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK; error = xfs_trans_free_extent(tp, EFD_ITEM(done), free->xefi_startblock, free->xefi_blockcount, - &free->xefi_oinfo, free->xefi_skip_discard); - kmem_free(free); + &oinfo, free->xefi_flags & XFS_EFI_SKIP_DISCARD); + kmem_cache_free(xfs_extfree_item_cache, free); return error; } @@ -502,7 +508,7 @@ xfs_extent_free_cancel_item( struct xfs_extent_free_item *free; free = container_of(item, struct xfs_extent_free_item, xefi_list); - kmem_free(free); + kmem_cache_free(xfs_extfree_item_cache, free); } const struct xfs_defer_op_type xfs_extent_free_defer_type = { @@ -525,6 +531,7 @@ xfs_agfl_free_finish_item( struct list_head *item, struct xfs_btree_cur **state) { + struct xfs_owner_info oinfo = { }; struct xfs_mount *mp = tp->t_mountp; struct xfs_efd_log_item *efdp = EFD_ITEM(done); struct xfs_extent_free_item *free; @@ -539,13 +546,13 @@ xfs_agfl_free_finish_item( ASSERT(free->xefi_blockcount == 1); agno = XFS_FSB_TO_AGNO(mp, free->xefi_startblock); agbno = XFS_FSB_TO_AGBNO(mp, free->xefi_startblock); + oinfo.oi_owner = free->xefi_owner; trace_xfs_agfl_free_deferred(mp, agno, 0, agbno, free->xefi_blockcount); error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); if (!error) - error = xfs_free_agfl_block(tp, agno, agbno, agbp, - &free->xefi_oinfo); + error = xfs_free_agfl_block(tp, agno, agbno, agbp, &oinfo); /* * Mark the transaction dirty, even on error. This ensures the @@ -564,7 +571,7 @@ xfs_agfl_free_finish_item( extp->ext_len = free->xefi_blockcount; efdp->efd_next_extent++; - kmem_free(free); + kmem_cache_free(xfs_extfree_item_cache, free); return error; } @@ -637,7 +644,7 @@ xfs_efi_item_recover( } - return xfs_defer_ops_capture_and_commit(tp, NULL, capture_list); + return xfs_defer_ops_capture_and_commit(tp, capture_list); abort_error: xfs_trans_cancel(tp); diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h index cd2860c875bf..186d0f2137f1 100644 --- a/fs/xfs/xfs_extfree_item.h +++ b/fs/xfs/xfs_extfree_item.h @@ -9,7 +9,7 @@ /* kernel only EFI/EFD definitions */ struct xfs_mount; -struct kmem_zone; +struct kmem_cache; /* * Max number of extents in fast allocation path. @@ -69,7 +69,7 @@ struct xfs_efd_log_item { */ #define XFS_EFD_MAX_FAST_EXTENTS 16 -extern struct kmem_zone *xfs_efi_zone; -extern struct kmem_zone *xfs_efd_zone; +extern struct kmem_cache *xfs_efi_cache; +extern struct kmem_cache *xfs_efd_cache; #endif /* __XFS_EXTFREE_ITEM_H__ */ diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 7aa943edfc02..27594738b0d1 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -259,7 +259,7 @@ xfs_file_dio_read( ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED); if (ret) return ret; - ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0); + ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, 0); xfs_iunlock(ip, XFS_IOLOCK_SHARED); return ret; @@ -569,7 +569,7 @@ xfs_file_dio_write_aligned( } trace_xfs_file_direct_write(iocb, from); ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops, - &xfs_dio_write_ops, 0); + &xfs_dio_write_ops, 0, 0); out_unlock: if (iolock) xfs_iunlock(ip, iolock); @@ -647,7 +647,7 @@ retry_exclusive: trace_xfs_file_direct_write(iocb, from); ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops, - &xfs_dio_write_ops, flags); + &xfs_dio_write_ops, flags, 0); /* * Retry unaligned I/O with exclusive blocking semantics if the DIO @@ -1452,7 +1452,7 @@ const struct file_operations xfs_file_operations = { .write_iter = xfs_file_write_iter, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, - .iopoll = iomap_dio_iopoll, + .iopoll = iocb_bio_iopoll, .unlocked_ioctl = xfs_file_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = xfs_file_compat_ioctl, diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index f2210d927481..e1472004170e 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -77,10 +77,10 @@ xfs_inode_alloc( * XXX: If this didn't occur in transactions, we could drop GFP_NOFAIL * and return NULL here on ENOMEM. */ - ip = kmem_cache_alloc(xfs_inode_zone, GFP_KERNEL | __GFP_NOFAIL); + ip = kmem_cache_alloc(xfs_inode_cache, GFP_KERNEL | __GFP_NOFAIL); if (inode_init_always(mp->m_super, VFS_I(ip))) { - kmem_cache_free(xfs_inode_zone, ip); + kmem_cache_free(xfs_inode_cache, ip); return NULL; } @@ -130,11 +130,11 @@ xfs_inode_free_callback( if (ip->i_afp) { xfs_idestroy_fork(ip->i_afp); - kmem_cache_free(xfs_ifork_zone, ip->i_afp); + kmem_cache_free(xfs_ifork_cache, ip->i_afp); } if (ip->i_cowfp) { xfs_idestroy_fork(ip->i_cowfp); - kmem_cache_free(xfs_ifork_zone, ip->i_cowfp); + kmem_cache_free(xfs_ifork_cache, ip->i_cowfp); } if (ip->i_itemp) { ASSERT(!test_bit(XFS_LI_IN_AIL, @@ -143,7 +143,7 @@ xfs_inode_free_callback( ip->i_itemp = NULL; } - kmem_cache_free(xfs_inode_zone, ip); + kmem_cache_free(xfs_inode_cache, ip); } static void diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c index 017904a34c02..508e184e3b8f 100644 --- a/fs/xfs/xfs_icreate_item.c +++ b/fs/xfs/xfs_icreate_item.c @@ -20,7 +20,7 @@ #include "xfs_ialloc.h" #include "xfs_trace.h" -kmem_zone_t *xfs_icreate_zone; /* inode create item zone */ +struct kmem_cache *xfs_icreate_cache; /* inode create item */ static inline struct xfs_icreate_item *ICR_ITEM(struct xfs_log_item *lip) { @@ -63,7 +63,7 @@ STATIC void xfs_icreate_item_release( struct xfs_log_item *lip) { - kmem_cache_free(xfs_icreate_zone, ICR_ITEM(lip)); + kmem_cache_free(xfs_icreate_cache, ICR_ITEM(lip)); } static const struct xfs_item_ops xfs_icreate_item_ops = { @@ -97,7 +97,7 @@ xfs_icreate_log( { struct xfs_icreate_item *icp; - icp = kmem_cache_zalloc(xfs_icreate_zone, GFP_KERNEL | __GFP_NOFAIL); + icp = kmem_cache_zalloc(xfs_icreate_cache, GFP_KERNEL | __GFP_NOFAIL); xfs_log_item_init(tp->t_mountp, &icp->ic_item, XFS_LI_ICREATE, &xfs_icreate_item_ops); diff --git a/fs/xfs/xfs_icreate_item.h b/fs/xfs/xfs_icreate_item.h index a50d0b01e15a..64992823108a 100644 --- a/fs/xfs/xfs_icreate_item.h +++ b/fs/xfs/xfs_icreate_item.h @@ -12,7 +12,7 @@ struct xfs_icreate_item { struct xfs_icreate_log ic_format; }; -extern kmem_zone_t *xfs_icreate_zone; /* inode create item zone */ +extern struct kmem_cache *xfs_icreate_cache; /* inode create item */ void xfs_icreate_log(struct xfs_trans *tp, xfs_agnumber_t agno, xfs_agblock_t agbno, unsigned int count, diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index a4f6f034fb81..64b9bf334806 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -36,7 +36,7 @@ #include "xfs_reflink.h" #include "xfs_ag.h" -kmem_zone_t *xfs_inode_zone; +struct kmem_cache *xfs_inode_cache; /* * Used in xfs_itruncate_extents(). This is the maximum number of extents @@ -564,8 +564,6 @@ xfs_lock_two_inodes( struct xfs_inode *ip1, uint ip1_mode) { - struct xfs_inode *temp; - uint mode_temp; int attempts = 0; struct xfs_log_item *lp; @@ -578,12 +576,8 @@ xfs_lock_two_inodes( ASSERT(ip0->i_ino != ip1->i_ino); if (ip0->i_ino > ip1->i_ino) { - temp = ip0; - ip0 = ip1; - ip1 = temp; - mode_temp = ip0_mode; - ip0_mode = ip1_mode; - ip1_mode = mode_temp; + swap(ip0, ip1); + swap(ip0_mode, ip1_mode); } again: diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index b21b177832d1..e635a3d64cba 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -504,7 +504,7 @@ static inline void xfs_setup_existing_inode(struct xfs_inode *ip) void xfs_irele(struct xfs_inode *ip); -extern struct kmem_zone *xfs_inode_zone; +extern struct kmem_cache *xfs_inode_cache; /* The default CoW extent size hint. */ #define XFS_DEFAULT_COWEXTSZ_HINT 32 diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 0659d19c211e..90d8e591baf8 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -21,7 +21,7 @@ #include <linux/iversion.h> -kmem_zone_t *xfs_ili_zone; /* inode log item zone */ +struct kmem_cache *xfs_ili_cache; /* inode log item */ static inline struct xfs_inode_log_item *INODE_ITEM(struct xfs_log_item *lip) { @@ -672,7 +672,7 @@ xfs_inode_item_init( struct xfs_inode_log_item *iip; ASSERT(ip->i_itemp == NULL); - iip = ip->i_itemp = kmem_cache_zalloc(xfs_ili_zone, + iip = ip->i_itemp = kmem_cache_zalloc(xfs_ili_cache, GFP_KERNEL | __GFP_NOFAIL); iip->ili_inode = ip; @@ -694,7 +694,7 @@ xfs_inode_item_destroy( ip->i_itemp = NULL; kmem_free(iip->ili_item.li_lv_shadow); - kmem_cache_free(xfs_ili_zone, iip); + kmem_cache_free(xfs_ili_cache, iip); } diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 403b45ab9aa2..1a302000d604 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -47,6 +47,6 @@ extern void xfs_iflush_abort(struct xfs_inode *); extern int xfs_inode_item_format_convert(xfs_log_iovec_t *, struct xfs_inode_log_format *); -extern struct kmem_zone *xfs_ili_zone; +extern struct kmem_cache *xfs_ili_cache; #endif /* __XFS_INODE_ITEM_H__ */ diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 0c795dc093ef..174cd8950cb6 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1547,7 +1547,7 @@ xfs_ioc_getbmap( if (bmx.bmv_count > ULONG_MAX / recsize) return -ENOMEM; - buf = kvzalloc(bmx.bmv_count * sizeof(*buf), GFP_KERNEL); + buf = kvcalloc(bmx.bmv_count, sizeof(*buf), GFP_KERNEL); if (!buf) return -ENOMEM; @@ -1601,11 +1601,11 @@ xfs_ioc_getfsmap( */ count = min_t(unsigned int, head.fmh_count, 131072 / sizeof(struct fsmap)); - recs = kvzalloc(count * sizeof(struct fsmap), GFP_KERNEL); + recs = kvcalloc(count, sizeof(struct fsmap), GFP_KERNEL); if (!recs) { count = min_t(unsigned int, head.fmh_count, PAGE_SIZE / sizeof(struct fsmap)); - recs = kvzalloc(count * sizeof(struct fsmap), GFP_KERNEL); + recs = kvcalloc(count, sizeof(struct fsmap), GFP_KERNEL); if (!recs) return -ENOMEM; } diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index f6cd2d4aa770..89fec9a18c34 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -21,7 +21,7 @@ #include "xfs_sb.h" #include "xfs_health.h" -kmem_zone_t *xfs_log_ticket_zone; +struct kmem_cache *xfs_log_ticket_cache; /* Local miscellaneous function prototypes */ STATIC struct xlog * @@ -3487,7 +3487,7 @@ xfs_log_ticket_put( { ASSERT(atomic_read(&ticket->t_ref) > 0); if (atomic_dec_and_test(&ticket->t_ref)) - kmem_cache_free(xfs_log_ticket_zone, ticket); + kmem_cache_free(xfs_log_ticket_cache, ticket); } xlog_ticket_t * @@ -3611,7 +3611,7 @@ xlog_ticket_alloc( struct xlog_ticket *tic; int unit_res; - tic = kmem_cache_zalloc(xfs_log_ticket_zone, GFP_NOFS | __GFP_NOFAIL); + tic = kmem_cache_zalloc(xfs_log_ticket_cache, GFP_NOFS | __GFP_NOFAIL); unit_res = xlog_calc_unit_res(log, unit_bytes); diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 844fbeec3545..23103d68423c 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -497,7 +497,7 @@ xlog_recover_cancel(struct xlog *); extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead, char *dp, int size); -extern kmem_zone_t *xfs_log_ticket_zone; +extern struct kmem_cache *xfs_log_ticket_cache; struct xlog_ticket * xlog_ticket_alloc( struct xlog *log, diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 10562ecbd9ea..53366cc0bc9e 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2466,11 +2466,11 @@ xlog_finish_defer_ops( { struct xfs_defer_capture *dfc, *next; struct xfs_trans *tp; - struct xfs_inode *ip; int error = 0; list_for_each_entry_safe(dfc, next, capture_list, dfc_list) { struct xfs_trans_res resv; + struct xfs_defer_resources dres; /* * Create a new transaction reservation from the captured @@ -2494,13 +2494,9 @@ xlog_finish_defer_ops( * from recovering a single intent item. */ list_del_init(&dfc->dfc_list); - xfs_defer_ops_continue(dfc, tp, &ip); - + xfs_defer_ops_continue(dfc, tp, &dres); error = xfs_trans_commit(tp); - if (ip) { - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_irele(ip); - } + xfs_defer_resources_rele(&dres); if (error) return error; } @@ -2520,7 +2516,7 @@ xlog_abort_defer_ops( list_for_each_entry_safe(dfc, next, capture_list, dfc_list) { list_del_init(&dfc->dfc_list); - xfs_defer_ops_release(mp, dfc); + xfs_defer_ops_capture_free(mp, dfc); } } /* diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 06dac09eddbd..359109b6f0d3 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -567,6 +567,18 @@ xfs_mount_setup_inode_geom( xfs_ialloc_setup_geometry(mp); } +/* Compute maximum possible height for per-AG btree types for this fs. */ +static inline void +xfs_agbtree_compute_maxlevels( + struct xfs_mount *mp) +{ + unsigned int levels; + + levels = max(mp->m_alloc_maxlevels, M_IGEO(mp)->inobt_maxlevels); + levels = max(levels, mp->m_rmap_maxlevels); + mp->m_agbtree_maxlevels = max(levels, mp->m_refc_maxlevels); +} + /* * This function does the following on an initial mount of a file system: * - reads the superblock from disk and init the mount struct @@ -638,6 +650,8 @@ xfs_mountfs( xfs_rmapbt_compute_maxlevels(mp); xfs_refcountbt_compute_maxlevels(mp); + xfs_agbtree_compute_maxlevels(mp); + /* * Check if sb_agblocks is aligned at stripe boundary. If sb_agblocks * is NOT aligned turn off m_dalign since allocator alignment is within diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index e091f3b3fa15..00720a02e761 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -128,10 +128,11 @@ typedef struct xfs_mount { uint m_rmap_mnr[2]; /* min rmap btree records */ uint m_refc_mxr[2]; /* max refc btree records */ uint m_refc_mnr[2]; /* min refc btree records */ - uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */ - uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */ + uint m_alloc_maxlevels; /* max alloc btree levels */ + uint m_bm_maxlevels[2]; /* max bmap btree levels */ uint m_rmap_maxlevels; /* max rmap btree levels */ uint m_refc_maxlevels; /* max refcount btree level */ + unsigned int m_agbtree_maxlevels; /* max level of all AG btrees */ xfs_extlen_t m_ag_prealloc_blocks; /* reserved ag blocks */ uint m_alloc_set_aside; /* space we can't use */ uint m_ag_max_usable; /* max space per AG */ diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index 34c3b16f834f..f85e3b07ab44 100644 --- a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c @@ -219,7 +219,7 @@ _xfs_mru_cache_list_insert( * When destroying or reaping, all the elements that were migrated to the reap * list need to be deleted. For each element this involves removing it from the * data store, removing it from the reap list, calling the client's free - * function and deleting the element from the element zone. + * function and deleting the element from the element cache. * * We get called holding the mru->lock, which we drop and then reacquire. * Sparse need special help with this to tell it we know what we are doing. diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 5608066d6e53..32ac8d9c8940 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -850,7 +850,7 @@ xfs_qm_reset_dqcounts( */ #ifdef DEBUG j = (int)XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) / - sizeof(xfs_dqblk_t); + sizeof(struct xfs_dqblk); ASSERT(mp->m_quotainfo->qi_dqperchunk == j); #endif dqb = bp->b_addr; diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 442a0f97a9d4..5bb12717ea28 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -11,7 +11,7 @@ struct xfs_inode; -extern struct kmem_zone *xfs_qm_dqtrxzone; +extern struct kmem_cache *xfs_dqtrx_cache; /* * Number of bmaps that we ask from bmapi when doing a quotacheck. diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 46904b793bd4..d3da67772d57 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -21,8 +21,8 @@ #include "xfs_log_priv.h" #include "xfs_log_recover.h" -kmem_zone_t *xfs_cui_zone; -kmem_zone_t *xfs_cud_zone; +struct kmem_cache *xfs_cui_cache; +struct kmem_cache *xfs_cud_cache; static const struct xfs_item_ops xfs_cui_item_ops; @@ -38,7 +38,7 @@ xfs_cui_item_free( if (cuip->cui_format.cui_nextents > XFS_CUI_MAX_FAST_EXTENTS) kmem_free(cuip); else - kmem_cache_free(xfs_cui_zone, cuip); + kmem_cache_free(xfs_cui_cache, cuip); } /* @@ -143,7 +143,7 @@ xfs_cui_init( cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents), 0); else - cuip = kmem_cache_zalloc(xfs_cui_zone, + cuip = kmem_cache_zalloc(xfs_cui_cache, GFP_KERNEL | __GFP_NOFAIL); xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops); @@ -204,7 +204,7 @@ xfs_cud_item_release( struct xfs_cud_log_item *cudp = CUD_ITEM(lip); xfs_cui_release(cudp->cud_cuip); - kmem_cache_free(xfs_cud_zone, cudp); + kmem_cache_free(xfs_cud_cache, cudp); } static const struct xfs_item_ops xfs_cud_item_ops = { @@ -221,7 +221,7 @@ xfs_trans_get_cud( { struct xfs_cud_log_item *cudp; - cudp = kmem_cache_zalloc(xfs_cud_zone, GFP_KERNEL | __GFP_NOFAIL); + cudp = kmem_cache_zalloc(xfs_cud_cache, GFP_KERNEL | __GFP_NOFAIL); xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD, &xfs_cud_item_ops); cudp->cud_cuip = cuip; @@ -384,7 +384,7 @@ xfs_refcount_update_finish_item( refc->ri_blockcount = new_aglen; return -EAGAIN; } - kmem_free(refc); + kmem_cache_free(xfs_refcount_intent_cache, refc); return error; } @@ -404,7 +404,7 @@ xfs_refcount_update_cancel_item( struct xfs_refcount_intent *refc; refc = container_of(item, struct xfs_refcount_intent, ri_list); - kmem_free(refc); + kmem_cache_free(xfs_refcount_intent_cache, refc); } const struct xfs_defer_op_type xfs_refcount_update_defer_type = { @@ -557,7 +557,7 @@ xfs_cui_item_recover( } xfs_refcount_finish_one_cleanup(tp, rcur, error); - return xfs_defer_ops_capture_and_commit(tp, NULL, capture_list); + return xfs_defer_ops_capture_and_commit(tp, capture_list); abort_error: xfs_refcount_finish_one_cleanup(tp, rcur, error); diff --git a/fs/xfs/xfs_refcount_item.h b/fs/xfs/xfs_refcount_item.h index f4f2e836540b..eb0ab13682d0 100644 --- a/fs/xfs/xfs_refcount_item.h +++ b/fs/xfs/xfs_refcount_item.h @@ -25,7 +25,7 @@ /* kernel only CUI/CUD definitions */ struct xfs_mount; -struct kmem_zone; +struct kmem_cache; /* * Max number of extents in fast allocation path. @@ -68,7 +68,7 @@ struct xfs_cud_log_item { struct xfs_cud_log_format cud_format; }; -extern struct kmem_zone *xfs_cui_zone; -extern struct kmem_zone *xfs_cud_zone; +extern struct kmem_cache *xfs_cui_cache; +extern struct kmem_cache *xfs_cud_cache; #endif /* __XFS_REFCOUNT_ITEM_H__ */ diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 76355f293488..cb0edb1d68ef 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -484,7 +484,7 @@ xfs_reflink_cancel_cow_blocks( xfs_refcount_free_cow_extent(*tpp, del.br_startblock, del.br_blockcount); - xfs_bmap_add_free(*tpp, del.br_startblock, + xfs_free_extent_later(*tpp, del.br_startblock, del.br_blockcount, NULL); /* Roll the transaction */ diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 5f0695980467..c3966b4c58ef 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -21,8 +21,8 @@ #include "xfs_log_priv.h" #include "xfs_log_recover.h" -kmem_zone_t *xfs_rui_zone; -kmem_zone_t *xfs_rud_zone; +struct kmem_cache *xfs_rui_cache; +struct kmem_cache *xfs_rud_cache; static const struct xfs_item_ops xfs_rui_item_ops; @@ -38,7 +38,7 @@ xfs_rui_item_free( if (ruip->rui_format.rui_nextents > XFS_RUI_MAX_FAST_EXTENTS) kmem_free(ruip); else - kmem_cache_free(xfs_rui_zone, ruip); + kmem_cache_free(xfs_rui_cache, ruip); } /* @@ -141,7 +141,7 @@ xfs_rui_init( if (nextents > XFS_RUI_MAX_FAST_EXTENTS) ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), 0); else - ruip = kmem_cache_zalloc(xfs_rui_zone, + ruip = kmem_cache_zalloc(xfs_rui_cache, GFP_KERNEL | __GFP_NOFAIL); xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops); @@ -227,7 +227,7 @@ xfs_rud_item_release( struct xfs_rud_log_item *rudp = RUD_ITEM(lip); xfs_rui_release(rudp->rud_ruip); - kmem_cache_free(xfs_rud_zone, rudp); + kmem_cache_free(xfs_rud_cache, rudp); } static const struct xfs_item_ops xfs_rud_item_ops = { @@ -244,7 +244,7 @@ xfs_trans_get_rud( { struct xfs_rud_log_item *rudp; - rudp = kmem_cache_zalloc(xfs_rud_zone, GFP_KERNEL | __GFP_NOFAIL); + rudp = kmem_cache_zalloc(xfs_rud_cache, GFP_KERNEL | __GFP_NOFAIL); xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD, &xfs_rud_item_ops); rudp->rud_ruip = ruip; @@ -427,7 +427,7 @@ xfs_rmap_update_finish_item( rmap->ri_bmap.br_startoff, rmap->ri_bmap.br_startblock, rmap->ri_bmap.br_blockcount, rmap->ri_bmap.br_state, state); - kmem_free(rmap); + kmem_cache_free(xfs_rmap_intent_cache, rmap); return error; } @@ -447,7 +447,7 @@ xfs_rmap_update_cancel_item( struct xfs_rmap_intent *rmap; rmap = container_of(item, struct xfs_rmap_intent, ri_list); - kmem_free(rmap); + kmem_cache_free(xfs_rmap_intent_cache, rmap); } const struct xfs_defer_op_type xfs_rmap_update_defer_type = { @@ -587,7 +587,7 @@ xfs_rui_item_recover( } xfs_rmap_finish_one_cleanup(tp, rcur, error); - return xfs_defer_ops_capture_and_commit(tp, NULL, capture_list); + return xfs_defer_ops_capture_and_commit(tp, capture_list); abort_error: xfs_rmap_finish_one_cleanup(tp, rcur, error); diff --git a/fs/xfs/xfs_rmap_item.h b/fs/xfs/xfs_rmap_item.h index 31e6cdfff71f..802e5119eaca 100644 --- a/fs/xfs/xfs_rmap_item.h +++ b/fs/xfs/xfs_rmap_item.h @@ -28,7 +28,7 @@ /* kernel only RUI/RUD definitions */ struct xfs_mount; -struct kmem_zone; +struct kmem_cache; /* * Max number of extents in fast allocation path. @@ -68,7 +68,7 @@ struct xfs_rud_log_item { struct xfs_rud_log_format rud_format; }; -extern struct kmem_zone *xfs_rui_zone; -extern struct kmem_zone *xfs_rud_zone; +extern struct kmem_cache *xfs_rui_cache; +extern struct kmem_cache *xfs_rud_cache; #endif /* __XFS_RMAP_ITEM_H__ */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index c4e0cd1c1c8c..e21459f9923a 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -37,6 +37,7 @@ #include "xfs_reflink.h" #include "xfs_pwork.h" #include "xfs_ag.h" +#include "xfs_defer.h" #include <linux/magic.h> #include <linux/fs_context.h> @@ -1951,196 +1952,194 @@ static struct file_system_type xfs_fs_type = { MODULE_ALIAS_FS("xfs"); STATIC int __init -xfs_init_zones(void) +xfs_init_caches(void) { - xfs_log_ticket_zone = kmem_cache_create("xfs_log_ticket", + int error; + + xfs_log_ticket_cache = kmem_cache_create("xfs_log_ticket", sizeof(struct xlog_ticket), 0, 0, NULL); - if (!xfs_log_ticket_zone) + if (!xfs_log_ticket_cache) goto out; - xfs_bmap_free_item_zone = kmem_cache_create("xfs_bmap_free_item", - sizeof(struct xfs_extent_free_item), - 0, 0, NULL); - if (!xfs_bmap_free_item_zone) - goto out_destroy_log_ticket_zone; + error = xfs_btree_init_cur_caches(); + if (error) + goto out_destroy_log_ticket_cache; - xfs_btree_cur_zone = kmem_cache_create("xfs_btree_cur", - sizeof(struct xfs_btree_cur), - 0, 0, NULL); - if (!xfs_btree_cur_zone) - goto out_destroy_bmap_free_item_zone; + error = xfs_defer_init_item_caches(); + if (error) + goto out_destroy_btree_cur_cache; - xfs_da_state_zone = kmem_cache_create("xfs_da_state", + xfs_da_state_cache = kmem_cache_create("xfs_da_state", sizeof(struct xfs_da_state), 0, 0, NULL); - if (!xfs_da_state_zone) - goto out_destroy_btree_cur_zone; + if (!xfs_da_state_cache) + goto out_destroy_defer_item_cache; - xfs_ifork_zone = kmem_cache_create("xfs_ifork", + xfs_ifork_cache = kmem_cache_create("xfs_ifork", sizeof(struct xfs_ifork), 0, 0, NULL); - if (!xfs_ifork_zone) - goto out_destroy_da_state_zone; + if (!xfs_ifork_cache) + goto out_destroy_da_state_cache; - xfs_trans_zone = kmem_cache_create("xfs_trans", + xfs_trans_cache = kmem_cache_create("xfs_trans", sizeof(struct xfs_trans), 0, 0, NULL); - if (!xfs_trans_zone) - goto out_destroy_ifork_zone; + if (!xfs_trans_cache) + goto out_destroy_ifork_cache; /* - * The size of the zone allocated buf log item is the maximum + * The size of the cache-allocated buf log item is the maximum * size possible under XFS. This wastes a little bit of memory, * but it is much faster. */ - xfs_buf_item_zone = kmem_cache_create("xfs_buf_item", + xfs_buf_item_cache = kmem_cache_create("xfs_buf_item", sizeof(struct xfs_buf_log_item), 0, 0, NULL); - if (!xfs_buf_item_zone) - goto out_destroy_trans_zone; + if (!xfs_buf_item_cache) + goto out_destroy_trans_cache; - xfs_efd_zone = kmem_cache_create("xfs_efd_item", + xfs_efd_cache = kmem_cache_create("xfs_efd_item", (sizeof(struct xfs_efd_log_item) + (XFS_EFD_MAX_FAST_EXTENTS - 1) * sizeof(struct xfs_extent)), 0, 0, NULL); - if (!xfs_efd_zone) - goto out_destroy_buf_item_zone; + if (!xfs_efd_cache) + goto out_destroy_buf_item_cache; - xfs_efi_zone = kmem_cache_create("xfs_efi_item", + xfs_efi_cache = kmem_cache_create("xfs_efi_item", (sizeof(struct xfs_efi_log_item) + (XFS_EFI_MAX_FAST_EXTENTS - 1) * sizeof(struct xfs_extent)), 0, 0, NULL); - if (!xfs_efi_zone) - goto out_destroy_efd_zone; + if (!xfs_efi_cache) + goto out_destroy_efd_cache; - xfs_inode_zone = kmem_cache_create("xfs_inode", + xfs_inode_cache = kmem_cache_create("xfs_inode", sizeof(struct xfs_inode), 0, (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT), xfs_fs_inode_init_once); - if (!xfs_inode_zone) - goto out_destroy_efi_zone; + if (!xfs_inode_cache) + goto out_destroy_efi_cache; - xfs_ili_zone = kmem_cache_create("xfs_ili", + xfs_ili_cache = kmem_cache_create("xfs_ili", sizeof(struct xfs_inode_log_item), 0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); - if (!xfs_ili_zone) - goto out_destroy_inode_zone; + if (!xfs_ili_cache) + goto out_destroy_inode_cache; - xfs_icreate_zone = kmem_cache_create("xfs_icr", + xfs_icreate_cache = kmem_cache_create("xfs_icr", sizeof(struct xfs_icreate_item), 0, 0, NULL); - if (!xfs_icreate_zone) - goto out_destroy_ili_zone; + if (!xfs_icreate_cache) + goto out_destroy_ili_cache; - xfs_rud_zone = kmem_cache_create("xfs_rud_item", + xfs_rud_cache = kmem_cache_create("xfs_rud_item", sizeof(struct xfs_rud_log_item), 0, 0, NULL); - if (!xfs_rud_zone) - goto out_destroy_icreate_zone; + if (!xfs_rud_cache) + goto out_destroy_icreate_cache; - xfs_rui_zone = kmem_cache_create("xfs_rui_item", + xfs_rui_cache = kmem_cache_create("xfs_rui_item", xfs_rui_log_item_sizeof(XFS_RUI_MAX_FAST_EXTENTS), 0, 0, NULL); - if (!xfs_rui_zone) - goto out_destroy_rud_zone; + if (!xfs_rui_cache) + goto out_destroy_rud_cache; - xfs_cud_zone = kmem_cache_create("xfs_cud_item", + xfs_cud_cache = kmem_cache_create("xfs_cud_item", sizeof(struct xfs_cud_log_item), 0, 0, NULL); - if (!xfs_cud_zone) - goto out_destroy_rui_zone; + if (!xfs_cud_cache) + goto out_destroy_rui_cache; - xfs_cui_zone = kmem_cache_create("xfs_cui_item", + xfs_cui_cache = kmem_cache_create("xfs_cui_item", xfs_cui_log_item_sizeof(XFS_CUI_MAX_FAST_EXTENTS), 0, 0, NULL); - if (!xfs_cui_zone) - goto out_destroy_cud_zone; + if (!xfs_cui_cache) + goto out_destroy_cud_cache; - xfs_bud_zone = kmem_cache_create("xfs_bud_item", + xfs_bud_cache = kmem_cache_create("xfs_bud_item", sizeof(struct xfs_bud_log_item), 0, 0, NULL); - if (!xfs_bud_zone) - goto out_destroy_cui_zone; + if (!xfs_bud_cache) + goto out_destroy_cui_cache; - xfs_bui_zone = kmem_cache_create("xfs_bui_item", + xfs_bui_cache = kmem_cache_create("xfs_bui_item", xfs_bui_log_item_sizeof(XFS_BUI_MAX_FAST_EXTENTS), 0, 0, NULL); - if (!xfs_bui_zone) - goto out_destroy_bud_zone; + if (!xfs_bui_cache) + goto out_destroy_bud_cache; return 0; - out_destroy_bud_zone: - kmem_cache_destroy(xfs_bud_zone); - out_destroy_cui_zone: - kmem_cache_destroy(xfs_cui_zone); - out_destroy_cud_zone: - kmem_cache_destroy(xfs_cud_zone); - out_destroy_rui_zone: - kmem_cache_destroy(xfs_rui_zone); - out_destroy_rud_zone: - kmem_cache_destroy(xfs_rud_zone); - out_destroy_icreate_zone: - kmem_cache_destroy(xfs_icreate_zone); - out_destroy_ili_zone: - kmem_cache_destroy(xfs_ili_zone); - out_destroy_inode_zone: - kmem_cache_destroy(xfs_inode_zone); - out_destroy_efi_zone: - kmem_cache_destroy(xfs_efi_zone); - out_destroy_efd_zone: - kmem_cache_destroy(xfs_efd_zone); - out_destroy_buf_item_zone: - kmem_cache_destroy(xfs_buf_item_zone); - out_destroy_trans_zone: - kmem_cache_destroy(xfs_trans_zone); - out_destroy_ifork_zone: - kmem_cache_destroy(xfs_ifork_zone); - out_destroy_da_state_zone: - kmem_cache_destroy(xfs_da_state_zone); - out_destroy_btree_cur_zone: - kmem_cache_destroy(xfs_btree_cur_zone); - out_destroy_bmap_free_item_zone: - kmem_cache_destroy(xfs_bmap_free_item_zone); - out_destroy_log_ticket_zone: - kmem_cache_destroy(xfs_log_ticket_zone); + out_destroy_bud_cache: + kmem_cache_destroy(xfs_bud_cache); + out_destroy_cui_cache: + kmem_cache_destroy(xfs_cui_cache); + out_destroy_cud_cache: + kmem_cache_destroy(xfs_cud_cache); + out_destroy_rui_cache: + kmem_cache_destroy(xfs_rui_cache); + out_destroy_rud_cache: + kmem_cache_destroy(xfs_rud_cache); + out_destroy_icreate_cache: + kmem_cache_destroy(xfs_icreate_cache); + out_destroy_ili_cache: + kmem_cache_destroy(xfs_ili_cache); + out_destroy_inode_cache: + kmem_cache_destroy(xfs_inode_cache); + out_destroy_efi_cache: + kmem_cache_destroy(xfs_efi_cache); + out_destroy_efd_cache: + kmem_cache_destroy(xfs_efd_cache); + out_destroy_buf_item_cache: + kmem_cache_destroy(xfs_buf_item_cache); + out_destroy_trans_cache: + kmem_cache_destroy(xfs_trans_cache); + out_destroy_ifork_cache: + kmem_cache_destroy(xfs_ifork_cache); + out_destroy_da_state_cache: + kmem_cache_destroy(xfs_da_state_cache); + out_destroy_defer_item_cache: + xfs_defer_destroy_item_caches(); + out_destroy_btree_cur_cache: + xfs_btree_destroy_cur_caches(); + out_destroy_log_ticket_cache: + kmem_cache_destroy(xfs_log_ticket_cache); out: return -ENOMEM; } STATIC void -xfs_destroy_zones(void) +xfs_destroy_caches(void) { /* * Make sure all delayed rcu free are flushed before we * destroy caches. */ rcu_barrier(); - kmem_cache_destroy(xfs_bui_zone); - kmem_cache_destroy(xfs_bud_zone); - kmem_cache_destroy(xfs_cui_zone); - kmem_cache_destroy(xfs_cud_zone); - kmem_cache_destroy(xfs_rui_zone); - kmem_cache_destroy(xfs_rud_zone); - kmem_cache_destroy(xfs_icreate_zone); - kmem_cache_destroy(xfs_ili_zone); - kmem_cache_destroy(xfs_inode_zone); - kmem_cache_destroy(xfs_efi_zone); - kmem_cache_destroy(xfs_efd_zone); - kmem_cache_destroy(xfs_buf_item_zone); - kmem_cache_destroy(xfs_trans_zone); - kmem_cache_destroy(xfs_ifork_zone); - kmem_cache_destroy(xfs_da_state_zone); - kmem_cache_destroy(xfs_btree_cur_zone); - kmem_cache_destroy(xfs_bmap_free_item_zone); - kmem_cache_destroy(xfs_log_ticket_zone); + kmem_cache_destroy(xfs_bui_cache); + kmem_cache_destroy(xfs_bud_cache); + kmem_cache_destroy(xfs_cui_cache); + kmem_cache_destroy(xfs_cud_cache); + kmem_cache_destroy(xfs_rui_cache); + kmem_cache_destroy(xfs_rud_cache); + kmem_cache_destroy(xfs_icreate_cache); + kmem_cache_destroy(xfs_ili_cache); + kmem_cache_destroy(xfs_inode_cache); + kmem_cache_destroy(xfs_efi_cache); + kmem_cache_destroy(xfs_efd_cache); + kmem_cache_destroy(xfs_buf_item_cache); + kmem_cache_destroy(xfs_trans_cache); + kmem_cache_destroy(xfs_ifork_cache); + kmem_cache_destroy(xfs_da_state_cache); + xfs_defer_destroy_item_caches(); + xfs_btree_destroy_cur_caches(); + kmem_cache_destroy(xfs_log_ticket_cache); } STATIC int __init @@ -2233,13 +2232,13 @@ init_xfs_fs(void) if (error) goto out; - error = xfs_init_zones(); + error = xfs_init_caches(); if (error) goto out_destroy_hp; error = xfs_init_workqueues(); if (error) - goto out_destroy_zones; + goto out_destroy_caches; error = xfs_mru_cache_init(); if (error) @@ -2314,8 +2313,8 @@ init_xfs_fs(void) xfs_mru_cache_uninit(); out_destroy_wq: xfs_destroy_workqueues(); - out_destroy_zones: - xfs_destroy_zones(); + out_destroy_caches: + xfs_destroy_caches(); out_destroy_hp: xfs_cpu_hotplug_destroy(); out: @@ -2338,7 +2337,7 @@ exit_xfs_fs(void) xfs_buf_terminate(); xfs_mru_cache_uninit(); xfs_destroy_workqueues(); - xfs_destroy_zones(); + xfs_destroy_caches(); xfs_uuid_table_free(); xfs_cpu_hotplug_destroy(); } diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index 18dc5eca6c04..8608f804388f 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -105,7 +105,7 @@ bug_on_assert_show( struct kobject *kobject, char *buf) { - return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.bug_on_assert ? 1 : 0); + return sysfs_emit(buf, "%d\n", xfs_globals.bug_on_assert); } XFS_SYSFS_ATTR_RW(bug_on_assert); @@ -135,7 +135,7 @@ log_recovery_delay_show( struct kobject *kobject, char *buf) { - return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.log_recovery_delay); + return sysfs_emit(buf, "%d\n", xfs_globals.log_recovery_delay); } XFS_SYSFS_ATTR_RW(log_recovery_delay); @@ -165,7 +165,7 @@ mount_delay_show( struct kobject *kobject, char *buf) { - return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.mount_delay); + return sysfs_emit(buf, "%d\n", xfs_globals.mount_delay); } XFS_SYSFS_ATTR_RW(mount_delay); @@ -188,7 +188,7 @@ always_cow_show( struct kobject *kobject, char *buf) { - return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.always_cow); + return sysfs_emit(buf, "%d\n", xfs_globals.always_cow); } XFS_SYSFS_ATTR_RW(always_cow); @@ -224,7 +224,7 @@ pwork_threads_show( struct kobject *kobject, char *buf) { - return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.pwork_threads); + return sysfs_emit(buf, "%d\n", xfs_globals.pwork_threads); } XFS_SYSFS_ATTR_RW(pwork_threads); #endif /* DEBUG */ @@ -327,7 +327,7 @@ log_head_lsn_show( block = log->l_curr_block; spin_unlock(&log->l_icloglock); - return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, block); + return sysfs_emit(buf, "%d:%d\n", cycle, block); } XFS_SYSFS_ATTR_RO(log_head_lsn); @@ -341,7 +341,7 @@ log_tail_lsn_show( struct xlog *log = to_xlog(kobject); xlog_crack_atomic_lsn(&log->l_tail_lsn, &cycle, &block); - return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, block); + return sysfs_emit(buf, "%d:%d\n", cycle, block); } XFS_SYSFS_ATTR_RO(log_tail_lsn); @@ -356,7 +356,7 @@ reserve_grant_head_show( struct xlog *log = to_xlog(kobject); xlog_crack_grant_head(&log->l_reserve_head.grant, &cycle, &bytes); - return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, bytes); + return sysfs_emit(buf, "%d:%d\n", cycle, bytes); } XFS_SYSFS_ATTR_RO(reserve_grant_head); @@ -370,7 +370,7 @@ write_grant_head_show( struct xlog *log = to_xlog(kobject); xlog_crack_grant_head(&log->l_write_head.grant, &cycle, &bytes); - return snprintf(buf, PAGE_SIZE, "%d:%d\n", cycle, bytes); + return sysfs_emit(buf, "%d:%d\n", cycle, bytes); } XFS_SYSFS_ATTR_RO(write_grant_head); @@ -425,7 +425,7 @@ max_retries_show( else retries = cfg->max_retries; - return snprintf(buf, PAGE_SIZE, "%d\n", retries); + return sysfs_emit(buf, "%d\n", retries); } static ssize_t @@ -466,7 +466,7 @@ retry_timeout_seconds_show( else timeout = jiffies_to_msecs(cfg->retry_timeout) / MSEC_PER_SEC; - return snprintf(buf, PAGE_SIZE, "%d\n", timeout); + return sysfs_emit(buf, "%d\n", timeout); } static ssize_t @@ -504,7 +504,7 @@ fail_at_unmount_show( { struct xfs_mount *mp = err_to_mp(kobject); - return snprintf(buf, PAGE_SIZE, "%d\n", mp->m_fail_unmount); + return sysfs_emit(buf, "%d\n", mp->m_fail_unmount); } static ssize_t diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 1033a95fbf8e..4a8076ef8cb4 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -2476,7 +2476,7 @@ DECLARE_EVENT_CLASS(xfs_btree_cur_class, __entry->btnum = cur->bc_btnum; __entry->level = level; __entry->nlevels = cur->bc_nlevels; - __entry->ptr = cur->bc_ptrs[level]; + __entry->ptr = cur->bc_levels[level].ptr; __entry->daddr = bp ? xfs_buf_daddr(bp) : -1; ), TP_printk("dev %d:%d btree %s level %d/%d ptr %d daddr 0x%llx", diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 67dec11e34c7..234a9d9c2f43 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -25,7 +25,7 @@ #include "xfs_dquot.h" #include "xfs_icache.h" -kmem_zone_t *xfs_trans_zone; +struct kmem_cache *xfs_trans_cache; #if defined(CONFIG_TRACEPOINTS) static void @@ -76,7 +76,7 @@ xfs_trans_free( if (!(tp->t_flags & XFS_TRANS_NO_WRITECOUNT)) sb_end_intwrite(tp->t_mountp->m_super); xfs_trans_free_dqinfo(tp); - kmem_cache_free(xfs_trans_zone, tp); + kmem_cache_free(xfs_trans_cache, tp); } /* @@ -95,7 +95,7 @@ xfs_trans_dup( trace_xfs_trans_dup(tp, _RET_IP_); - ntp = kmem_cache_zalloc(xfs_trans_zone, GFP_KERNEL | __GFP_NOFAIL); + ntp = kmem_cache_zalloc(xfs_trans_cache, GFP_KERNEL | __GFP_NOFAIL); /* * Initialize the new transaction structure. @@ -263,7 +263,7 @@ xfs_trans_alloc( * by doing GFP_KERNEL allocations inside sb_start_intwrite(). */ retry: - tp = kmem_cache_zalloc(xfs_trans_zone, GFP_KERNEL | __GFP_NOFAIL); + tp = kmem_cache_zalloc(xfs_trans_cache, GFP_KERNEL | __GFP_NOFAIL); if (!(flags & XFS_TRANS_NO_WRITECOUNT)) sb_start_intwrite(mp->m_super); xfs_trans_set_context(tp); @@ -477,7 +477,7 @@ STATIC void xfs_trans_apply_sb_deltas( xfs_trans_t *tp) { - xfs_dsb_t *sbp; + struct xfs_dsb *sbp; struct xfs_buf *bp; int whole = 0; @@ -541,14 +541,14 @@ xfs_trans_apply_sb_deltas( /* * Log the whole thing, the fields are noncontiguous. */ - xfs_trans_log_buf(tp, bp, 0, sizeof(xfs_dsb_t) - 1); + xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb) - 1); else /* * Since all the modifiable fields are contiguous, we * can get away with this. */ - xfs_trans_log_buf(tp, bp, offsetof(xfs_dsb_t, sb_icount), - offsetof(xfs_dsb_t, sb_frextents) + + xfs_trans_log_buf(tp, bp, offsetof(struct xfs_dsb, sb_icount), + offsetof(struct xfs_dsb, sb_frextents) + sizeof(sbp->sb_frextents) - 1); } diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 50da47f23a07..a487b264a9eb 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -113,12 +113,6 @@ void xfs_log_item_init(struct xfs_mount *mp, struct xfs_log_item *item, #define XFS_ITEM_FLUSHING 3 /* - * Deferred operation item relogging limits. - */ -#define XFS_DEFER_OPS_NR_INODES 2 /* join up to two inodes */ -#define XFS_DEFER_OPS_NR_BUFS 2 /* join up to two buffers */ - -/* * This is the structure maintained for every active transaction. */ typedef struct xfs_trans { @@ -243,7 +237,7 @@ void xfs_trans_buf_set_type(struct xfs_trans *, struct xfs_buf *, void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp, struct xfs_buf *src_bp); -extern kmem_zone_t *xfs_trans_zone; +extern struct kmem_cache *xfs_trans_cache; static inline struct xfs_log_item * xfs_trans_item_relog( diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 3872ce671411..9ba7e6b9bed3 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -846,7 +846,7 @@ STATIC void xfs_trans_alloc_dqinfo( xfs_trans_t *tp) { - tp->t_dqinfo = kmem_cache_zalloc(xfs_qm_dqtrxzone, + tp->t_dqinfo = kmem_cache_zalloc(xfs_dqtrx_cache, GFP_KERNEL | __GFP_NOFAIL); } @@ -856,6 +856,6 @@ xfs_trans_free_dqinfo( { if (!tp->t_dqinfo) return; - kmem_cache_free(xfs_qm_dqtrxzone, tp->t_dqinfo); + kmem_cache_free(xfs_dqtrx_cache, tp->t_dqinfo); tp->t_dqinfo = NULL; } diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index ddc346a9df9b..259ee2bda492 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -852,7 +852,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from) ret = zonefs_file_dio_append(iocb, from); else ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops, - &zonefs_write_dio_ops, 0); + &zonefs_write_dio_ops, 0, 0); if (zi->i_ztype == ZONEFS_ZTYPE_SEQ && (ret > 0 || ret == -EIOCBQUEUED)) { if (ret > 0) @@ -987,7 +987,7 @@ static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) } file_accessed(iocb->ki_filp); ret = iomap_dio_rw(iocb, to, &zonefs_iomap_ops, - &zonefs_read_dio_ops, 0); + &zonefs_read_dio_ops, 0, 0); } else { ret = generic_file_read_iter(iocb, to); if (ret == -EIO) @@ -1128,7 +1128,7 @@ static const struct file_operations zonefs_file_operations = { .write_iter = zonefs_file_write_iter, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, - .iopoll = iomap_dio_iopoll, + .iopoll = iocb_bio_iopoll, }; static struct kmem_cache *zonefs_inode_cachep; |