Diffstat (limited to 'fs')
761 files changed, 32756 insertions, 18310 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c index 8482f2d11606..31c010372660 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -247,7 +247,7 @@ static int v9fs_xattr_get_acl(struct dentry *dentry, const char *name, if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) return v9fs_remote_get_acl(dentry, name, buffer, size, type); - acl = v9fs_get_cached_acl(dentry->d_inode, type); + acl = v9fs_get_cached_acl(d_inode(dentry), type); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl == NULL) @@ -285,7 +285,7 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name, int retval; struct posix_acl *acl; struct v9fs_session_info *v9ses; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); if (strcmp(name, "") != 0) return -EINVAL; diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 620d93489539..8aa56bb6e861 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -320,31 +320,21 @@ fail_option_alloc: struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, const char *dev_name, char *data) { - int retval = -EINVAL; struct p9_fid *fid; - int rc; + int rc = -ENOMEM; v9ses->uname = kstrdup(V9FS_DEFUSER, GFP_KERNEL); if (!v9ses->uname) - return ERR_PTR(-ENOMEM); + goto err_names; v9ses->aname = kstrdup(V9FS_DEFANAME, GFP_KERNEL); - if (!v9ses->aname) { - kfree(v9ses->uname); - return ERR_PTR(-ENOMEM); - } + if (!v9ses->aname) + goto err_names; init_rwsem(&v9ses->rename_sem); rc = bdi_setup_and_register(&v9ses->bdi, "9p"); - if (rc) { - kfree(v9ses->aname); - kfree(v9ses->uname); - return ERR_PTR(rc); - } - - spin_lock(&v9fs_sessionlist_lock); - list_add(&v9ses->slist, &v9fs_sessionlist); - spin_unlock(&v9fs_sessionlist_lock); + if (rc) + goto err_names; v9ses->uid = INVALID_UID; v9ses->dfltuid = V9FS_DEFUID; @@ -352,10 +342,9 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, v9ses->clnt = p9_client_create(dev_name, data); if (IS_ERR(v9ses->clnt)) { - retval = PTR_ERR(v9ses->clnt); - v9ses->clnt = NULL; + rc = PTR_ERR(v9ses->clnt); p9_debug(P9_DEBUG_ERROR, "problem initializing 9p client\n"); - goto error; + goto err_bdi; } v9ses->flags = V9FS_ACCESS_USER; @@ -368,10 +357,8 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, } rc = v9fs_parse_options(v9ses, data); - if (rc < 0) { - retval = rc; - goto error; - } + if (rc < 0) + goto err_clnt; v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ; @@ -405,10 +392,9 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, fid = p9_client_attach(v9ses->clnt, NULL, v9ses->uname, INVALID_UID, v9ses->aname); if (IS_ERR(fid)) { - retval = PTR_ERR(fid); - fid = NULL; + rc = PTR_ERR(fid); p9_debug(P9_DEBUG_ERROR, "cannot attach\n"); - goto error; + goto err_clnt; } if ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_SINGLE) @@ -420,12 +406,20 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, /* register the session for caching */ v9fs_cache_session_get_cookie(v9ses); #endif + spin_lock(&v9fs_sessionlist_lock); + list_add(&v9ses->slist, &v9fs_sessionlist); + spin_unlock(&v9fs_sessionlist_lock); return fid; -error: +err_clnt: + p9_client_destroy(v9ses->clnt); +err_bdi: bdi_destroy(&v9ses->bdi); - return ERR_PTR(retval); +err_names: + kfree(v9ses->uname); + kfree(v9ses->aname); + return ERR_PTR(rc); } /** diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index 099c7712631c..0923f2cf3c80 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -78,7 +78,6 @@ enum p9_cache_modes { * @cache: cache mode of type &p9_cache_modes * @cachetag: the tag of the cache associated with this session 
* @fscache: session cookie associated with FS-Cache - * @options: copy of options string given by user * @uname: string user name to mount hierarchy as * @aname: mount specifier for remote hierarchy * @maxdata: maximum data to be sent/recvd per protocol message @@ -150,8 +149,6 @@ extern int v9fs_vfs_unlink(struct inode *i, struct dentry *d); extern int v9fs_vfs_rmdir(struct inode *i, struct dentry *d); extern int v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry); -extern void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, - void *p); extern struct inode *v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, struct super_block *sb, int new); diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h index b83ebfbf3fdc..5a0db6dec8d1 100644 --- a/fs/9p/v9fs_vfs.h +++ b/fs/9p/v9fs_vfs.h @@ -68,14 +68,10 @@ int v9fs_file_open(struct inode *inode, struct file *file); void v9fs_inode2stat(struct inode *inode, struct p9_wstat *stat); int v9fs_uflags2omode(int uflags, int extended); -ssize_t v9fs_file_readn(struct file *, char *, char __user *, u32, u64); -ssize_t v9fs_fid_readn(struct p9_fid *, char *, char __user *, u32, u64); void v9fs_blank_wstat(struct p9_wstat *wstat); int v9fs_vfs_setattr_dotl(struct dentry *, struct iattr *); int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end, int datasync); -ssize_t v9fs_file_write_internal(struct inode *, struct p9_fid *, - const char __user *, size_t, loff_t *, int); int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode); int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode); static inline void v9fs_invalidate_inode_attr(struct inode *inode) diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index eb14e055ea83..e9e04376c52c 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -33,7 +33,7 @@ #include <linux/pagemap.h> #include <linux/idr.h> #include <linux/sched.h> -#include <linux/aio.h> +#include <linux/uio.h> #include <net/9p/9p.h> #include <net/9p/client.h> @@ -51,12 +51,11 @@ */ static int v9fs_fid_readpage(struct p9_fid *fid, struct page *page) { - int retval; - loff_t offset; - char *buffer; - struct inode *inode; + struct inode *inode = page->mapping->host; + struct bio_vec bvec = {.bv_page = page, .bv_len = PAGE_SIZE}; + struct iov_iter to; + int retval, err; - inode = page->mapping->host; p9_debug(P9_DEBUG_VFS, "\n"); BUG_ON(!PageLocked(page)); @@ -65,16 +64,16 @@ static int v9fs_fid_readpage(struct p9_fid *fid, struct page *page) if (retval == 0) return retval; - buffer = kmap(page); - offset = page_offset(page); + iov_iter_bvec(&to, ITER_BVEC | READ, &bvec, 1, PAGE_SIZE); - retval = v9fs_fid_readn(fid, buffer, NULL, PAGE_CACHE_SIZE, offset); - if (retval < 0) { + retval = p9_client_read(fid, page_offset(page), &to, &err); + if (err) { v9fs_uncache_page(inode, page); + retval = err; goto done; } - memset(buffer + retval, 0, PAGE_CACHE_SIZE - retval); + zero_user(page, retval, PAGE_SIZE - retval); flush_dcache_page(page); SetPageUptodate(page); @@ -82,7 +81,6 @@ static int v9fs_fid_readpage(struct p9_fid *fid, struct page *page) retval = 0; done: - kunmap(page); unlock_page(page); return retval; } @@ -161,41 +159,32 @@ static void v9fs_invalidate_page(struct page *page, unsigned int offset, static int v9fs_vfs_writepage_locked(struct page *page) { - char *buffer; - int retval, len; - loff_t offset, size; - mm_segment_t old_fs; - struct v9fs_inode *v9inode; struct inode *inode = page->mapping->host; + struct 
v9fs_inode *v9inode = V9FS_I(inode); + loff_t size = i_size_read(inode); + struct iov_iter from; + struct bio_vec bvec; + int err, len; - v9inode = V9FS_I(inode); - size = i_size_read(inode); if (page->index == size >> PAGE_CACHE_SHIFT) len = size & ~PAGE_CACHE_MASK; else len = PAGE_CACHE_SIZE; - set_page_writeback(page); - - buffer = kmap(page); - offset = page_offset(page); + bvec.bv_page = page; + bvec.bv_offset = 0; + bvec.bv_len = len; + iov_iter_bvec(&from, ITER_BVEC | WRITE, &bvec, 1, len); - old_fs = get_fs(); - set_fs(get_ds()); /* We should have writeback_fid always set */ BUG_ON(!v9inode->writeback_fid); - retval = v9fs_file_write_internal(inode, - v9inode->writeback_fid, - (__force const char __user *)buffer, - len, &offset, 0); - if (retval > 0) - retval = 0; + set_page_writeback(page); + + p9_client_write(v9inode->writeback_fid, page_offset(page), &from, &err); - set_fs(old_fs); - kunmap(page); end_page_writeback(page); - return retval; + return err; } static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc) @@ -241,11 +230,8 @@ static int v9fs_launder_page(struct page *page) /** * v9fs_direct_IO - 9P address space operation for direct I/O - * @rw: direction (read or write) * @iocb: target I/O control block - * @iov: array of vectors that define I/O buffer * @pos: offset in file to begin the operation - * @nr_segs: size of iovec array * * The presence of v9fs_direct_IO() in the address space ops vector * allowes open() O_DIRECT flags which would have failed otherwise. @@ -259,18 +245,23 @@ static int v9fs_launder_page(struct page *page) * */ static ssize_t -v9fs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t pos) +v9fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t pos) { - /* - * FIXME - * Now that we do caching with cache mode enabled, We need - * to support direct IO - */ - p9_debug(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%pD) off/no(%lld/%lu) EINVAL\n", - iocb->ki_filp, - (long long)pos, iter->nr_segs); - - return -EINVAL; + struct file *file = iocb->ki_filp; + ssize_t n; + int err = 0; + if (iov_iter_rw(iter) == WRITE) { + n = p9_client_write(file->private_data, pos, iter, &err); + if (n) { + struct inode *inode = file_inode(file); + loff_t i_size = i_size_read(inode); + if (pos + n > i_size) + inode_add_bytes(inode, pos + n - i_size); + } + } else { + n = p9_client_read(file->private_data, pos, iter, &err); + } + return n ? 
n : err; } static int v9fs_write_begin(struct file *filp, struct address_space *mapping, diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c index a345b2d659cc..bd456c668d39 100644 --- a/fs/9p/vfs_dentry.c +++ b/fs/9p/vfs_dentry.c @@ -53,7 +53,7 @@ static int v9fs_cached_dentry_delete(const struct dentry *dentry) dentry, dentry); /* Don't cache negative dentries */ - if (!dentry->d_inode) + if (d_really_is_negative(dentry)) return 1; return 0; } @@ -83,7 +83,7 @@ static int v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags) if (flags & LOOKUP_RCU) return -ECHILD; - inode = dentry->d_inode; + inode = d_inode(dentry); if (!inode) goto out_valid; diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 4f1151088ebe..5cc00e56206e 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -33,6 +33,7 @@ #include <linux/inet.h> #include <linux/idr.h> #include <linux/slab.h> +#include <linux/uio.h> #include <net/9p/9p.h> #include <net/9p/client.h> @@ -115,6 +116,7 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx) int buflen; int reclen = 0; struct p9_rdir *rdir; + struct kvec kvec; p9_debug(P9_DEBUG_VFS, "name %pD\n", file); fid = file->private_data; @@ -124,16 +126,23 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx) rdir = v9fs_alloc_rdir_buf(file, buflen); if (!rdir) return -ENOMEM; + kvec.iov_base = rdir->buf; + kvec.iov_len = buflen; while (1) { if (rdir->tail == rdir->head) { - err = v9fs_file_readn(file, rdir->buf, NULL, - buflen, ctx->pos); - if (err <= 0) + struct iov_iter to; + int n; + iov_iter_kvec(&to, READ | ITER_KVEC, &kvec, 1, buflen); + n = p9_client_read(file->private_data, ctx->pos, &to, + &err); + if (err) return err; + if (n == 0) + return 0; rdir->head = 0; - rdir->tail = err; + rdir->tail = n; } while (rdir->head < rdir->tail) { p9stat_init(&st); diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index b40133796b87..1ef16bd8280b 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -36,6 +36,8 @@ #include <linux/utsname.h> #include <asm/uaccess.h> #include <linux/idr.h> +#include <linux/uio.h> +#include <linux/slab.h> #include <net/9p/9p.h> #include <net/9p/client.h> @@ -149,7 +151,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) { struct p9_flock flock; struct p9_fid *fid; - uint8_t status; + uint8_t status = P9_LOCK_ERROR; int res = 0; unsigned char fl_type; @@ -194,7 +196,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) for (;;) { res = p9_client_lock_dotl(fid, &flock, &status); if (res < 0) - break; + goto out_unlock; if (status != P9_LOCK_BLOCKED) break; @@ -212,14 +214,16 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) case P9_LOCK_BLOCKED: res = -EAGAIN; break; + default: + WARN_ONCE(1, "unknown lock status code: %d\n", status); + /* fallthough */ case P9_LOCK_ERROR: case P9_LOCK_GRACE: res = -ENOLCK; break; - default: - BUG(); } +out_unlock: /* * incase server returned error for lock request, revert * it locally @@ -285,6 +289,7 @@ static int v9fs_file_getlock(struct file *filp, struct file_lock *fl) fl->fl_end = glock.start + glock.length - 1; fl->fl_pid = glock.proc_id; } + kfree(glock.client_id); return res; } @@ -364,63 +369,6 @@ out_err: } /** - * v9fs_fid_readn - read from a fid - * @fid: fid to read - * @data: data buffer to read data into - * @udata: user data buffer to read data into - * @count: size of buffer - * @offset: offset at which to read data - * - */ -ssize_t -v9fs_fid_readn(struct p9_fid 
*fid, char *data, char __user *udata, u32 count, - u64 offset) -{ - int n, total, size; - - p9_debug(P9_DEBUG_VFS, "fid %d offset %llu count %d\n", - fid->fid, (long long unsigned)offset, count); - n = 0; - total = 0; - size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ; - do { - n = p9_client_read(fid, data, udata, offset, count); - if (n <= 0) - break; - - if (data) - data += n; - if (udata) - udata += n; - - offset += n; - count -= n; - total += n; - } while (count > 0 && n == size); - - if (n < 0) - total = n; - - return total; -} - -/** - * v9fs_file_readn - read from a file - * @filp: file pointer to read - * @data: data buffer to read data into - * @udata: user data buffer to read data into - * @count: size of buffer - * @offset: offset at which to read data - * - */ -ssize_t -v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count, - u64 offset) -{ - return v9fs_fid_readn(filp->private_data, data, udata, count, offset); -} - -/** * v9fs_file_read - read from a file * @filp: file pointer to read * @udata: user data buffer to read data into @@ -430,69 +378,22 @@ v9fs_file_readn(struct file *filp, char *data, char __user *udata, u32 count, */ static ssize_t -v9fs_file_read(struct file *filp, char __user *udata, size_t count, - loff_t * offset) +v9fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { - int ret; - struct p9_fid *fid; - size_t size; - - p9_debug(P9_DEBUG_VFS, "count %zu offset %lld\n", count, *offset); - fid = filp->private_data; + struct p9_fid *fid = iocb->ki_filp->private_data; + int ret, err; - size = fid->iounit ? fid->iounit : fid->clnt->msize - P9_IOHDRSZ; - if (count > size) - ret = v9fs_file_readn(filp, NULL, udata, count, *offset); - else - ret = p9_client_read(fid, NULL, udata, *offset, count); + p9_debug(P9_DEBUG_VFS, "count %zu offset %lld\n", + iov_iter_count(to), iocb->ki_pos); - if (ret > 0) - *offset += ret; + ret = p9_client_read(fid, iocb->ki_pos, to, &err); + if (!ret) + return err; + iocb->ki_pos += ret; return ret; } -ssize_t -v9fs_file_write_internal(struct inode *inode, struct p9_fid *fid, - const char __user *data, size_t count, - loff_t *offset, int invalidate) -{ - int n; - loff_t i_size; - size_t total = 0; - loff_t origin = *offset; - unsigned long pg_start, pg_end; - - p9_debug(P9_DEBUG_VFS, "data %p count %d offset %x\n", - data, (int)count, (int)*offset); - - do { - n = p9_client_write(fid, NULL, data+total, origin+total, count); - if (n <= 0) - break; - count -= n; - total += n; - } while (count > 0); - - if (invalidate && (total > 0)) { - pg_start = origin >> PAGE_CACHE_SHIFT; - pg_end = (origin + total - 1) >> PAGE_CACHE_SHIFT; - if (inode->i_mapping && inode->i_mapping->nrpages) - invalidate_inode_pages2_range(inode->i_mapping, - pg_start, pg_end); - *offset += total; - i_size = i_size_read(inode); - if (*offset > i_size) { - inode_add_bytes(inode, *offset - i_size); - i_size_write(inode, *offset); - } - } - if (n < 0) - return n; - - return total; -} - /** * v9fs_file_write - write to a file * @filp: file pointer to write @@ -502,35 +403,39 @@ v9fs_file_write_internal(struct inode *inode, struct p9_fid *fid, * */ static ssize_t -v9fs_file_write(struct file *filp, const char __user * data, - size_t count, loff_t *offset) +v9fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { - ssize_t retval = 0; - loff_t origin = *offset; - - - retval = generic_write_checks(filp, &origin, &count, 0); - if (retval) - goto out; + struct file *file = iocb->ki_filp; + ssize_t retval; + loff_t origin; + 
int err = 0; - retval = -EINVAL; - if ((ssize_t) count < 0) - goto out; - retval = 0; - if (!count) - goto out; + retval = generic_write_checks(iocb, from); + if (retval <= 0) + return retval; - retval = v9fs_file_write_internal(file_inode(filp), - filp->private_data, - data, count, &origin, 1); - /* update offset on successful write */ - if (retval > 0) - *offset = origin; -out: - return retval; + origin = iocb->ki_pos; + retval = p9_client_write(file->private_data, iocb->ki_pos, from, &err); + if (retval > 0) { + struct inode *inode = file_inode(file); + loff_t i_size; + unsigned long pg_start, pg_end; + pg_start = origin >> PAGE_CACHE_SHIFT; + pg_end = (origin + retval - 1) >> PAGE_CACHE_SHIFT; + if (inode->i_mapping && inode->i_mapping->nrpages) + invalidate_inode_pages2_range(inode->i_mapping, + pg_start, pg_end); + iocb->ki_pos += retval; + i_size = i_size_read(inode); + if (iocb->ki_pos > i_size) { + inode_add_bytes(inode, iocb->ki_pos - i_size); + i_size_write(inode, iocb->ki_pos); + } + return retval; + } + return err; } - static int v9fs_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) { @@ -657,44 +562,6 @@ out_unlock: return VM_FAULT_NOPAGE; } -static ssize_t -v9fs_direct_read(struct file *filp, char __user *udata, size_t count, - loff_t *offsetp) -{ - loff_t size, offset; - struct inode *inode; - struct address_space *mapping; - - offset = *offsetp; - mapping = filp->f_mapping; - inode = mapping->host; - if (!count) - return 0; - size = i_size_read(inode); - if (offset < size) - filemap_write_and_wait_range(mapping, offset, - offset + count - 1); - - return v9fs_file_read(filp, udata, count, offsetp); -} - -/** - * v9fs_cached_file_read - read from a file - * @filp: file pointer to read - * @data: user data buffer to read data into - * @count: size of buffer - * @offset: offset at which to read data - * - */ -static ssize_t -v9fs_cached_file_read(struct file *filp, char __user *data, size_t count, - loff_t *offset) -{ - if (filp->f_flags & O_DIRECT) - return v9fs_direct_read(filp, data, count, offset); - return new_sync_read(filp, data, count, offset); -} - /** * v9fs_mmap_file_read - read from a file * @filp: file pointer to read @@ -704,84 +571,12 @@ v9fs_cached_file_read(struct file *filp, char __user *data, size_t count, * */ static ssize_t -v9fs_mmap_file_read(struct file *filp, char __user *data, size_t count, - loff_t *offset) +v9fs_mmap_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { /* TODO: Check if there are dirty pages */ - return v9fs_file_read(filp, data, count, offset); -} - -static ssize_t -v9fs_direct_write(struct file *filp, const char __user * data, - size_t count, loff_t *offsetp) -{ - loff_t offset; - ssize_t retval; - struct inode *inode; - struct address_space *mapping; - - offset = *offsetp; - mapping = filp->f_mapping; - inode = mapping->host; - if (!count) - return 0; - - mutex_lock(&inode->i_mutex); - retval = filemap_write_and_wait_range(mapping, offset, - offset + count - 1); - if (retval) - goto err_out; - /* - * After a write we want buffered reads to be sure to go to disk to get - * the new data. We invalidate clean cached page from the region we're - * about to write. 
We do this *before* the write so that if we fail - * here we fall back to buffered write - */ - if (mapping->nrpages) { - pgoff_t pg_start = offset >> PAGE_CACHE_SHIFT; - pgoff_t pg_end = (offset + count - 1) >> PAGE_CACHE_SHIFT; - - retval = invalidate_inode_pages2_range(mapping, - pg_start, pg_end); - /* - * If a page can not be invalidated, fall back - * to buffered write. - */ - if (retval) { - if (retval == -EBUSY) - goto buff_write; - goto err_out; - } - } - retval = v9fs_file_write(filp, data, count, offsetp); -err_out: - mutex_unlock(&inode->i_mutex); - return retval; - -buff_write: - mutex_unlock(&inode->i_mutex); - return new_sync_write(filp, data, count, offsetp); -} - -/** - * v9fs_cached_file_write - write to a file - * @filp: file pointer to write - * @data: data buffer to write data from - * @count: size of buffer - * @offset: offset at which to write data - * - */ -static ssize_t -v9fs_cached_file_write(struct file *filp, const char __user * data, - size_t count, loff_t *offset) -{ - - if (filp->f_flags & O_DIRECT) - return v9fs_direct_write(filp, data, count, offset); - return new_sync_write(filp, data, count, offset); + return v9fs_file_read_iter(iocb, to); } - /** * v9fs_mmap_file_write - write to a file * @filp: file pointer to write @@ -791,14 +586,13 @@ v9fs_cached_file_write(struct file *filp, const char __user * data, * */ static ssize_t -v9fs_mmap_file_write(struct file *filp, const char __user *data, - size_t count, loff_t *offset) +v9fs_mmap_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { /* * TODO: invalidate mmaps on filp's inode between * offset and offset+count */ - return v9fs_file_write(filp, data, count, offset); + return v9fs_file_write_iter(iocb, from); } static void v9fs_mmap_vm_close(struct vm_area_struct *vma) @@ -843,8 +637,6 @@ static const struct vm_operations_struct v9fs_mmap_file_vm_ops = { const struct file_operations v9fs_cached_file_operations = { .llseek = generic_file_llseek, - .read = v9fs_cached_file_read, - .write = v9fs_cached_file_write, .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, .open = v9fs_file_open, @@ -856,8 +648,6 @@ const struct file_operations v9fs_cached_file_operations = { const struct file_operations v9fs_cached_file_operations_dotl = { .llseek = generic_file_llseek, - .read = v9fs_cached_file_read, - .write = v9fs_cached_file_write, .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, .open = v9fs_file_open, @@ -870,8 +660,8 @@ const struct file_operations v9fs_cached_file_operations_dotl = { const struct file_operations v9fs_file_operations = { .llseek = generic_file_llseek, - .read = v9fs_file_read, - .write = v9fs_file_write, + .read_iter = v9fs_file_read_iter, + .write_iter = v9fs_file_write_iter, .open = v9fs_file_open, .release = v9fs_dir_release, .lock = v9fs_file_lock, @@ -881,8 +671,8 @@ const struct file_operations v9fs_file_operations = { const struct file_operations v9fs_file_operations_dotl = { .llseek = generic_file_llseek, - .read = v9fs_file_read, - .write = v9fs_file_write, + .read_iter = v9fs_file_read_iter, + .write_iter = v9fs_file_write_iter, .open = v9fs_file_open, .release = v9fs_dir_release, .lock = v9fs_file_lock_dotl, @@ -893,8 +683,8 @@ const struct file_operations v9fs_file_operations_dotl = { const struct file_operations v9fs_mmap_file_operations = { .llseek = generic_file_llseek, - .read = v9fs_mmap_file_read, - .write = v9fs_mmap_file_write, + .read_iter = v9fs_mmap_file_read_iter, + .write_iter = v9fs_mmap_file_write_iter, 
.open = v9fs_file_open, .release = v9fs_dir_release, .lock = v9fs_file_lock, @@ -904,8 +694,8 @@ const struct file_operations v9fs_mmap_file_operations = { const struct file_operations v9fs_mmap_file_operations_dotl = { .llseek = generic_file_llseek, - .read = v9fs_mmap_file_read, - .write = v9fs_mmap_file_write, + .read_iter = v9fs_mmap_file_read_iter, + .write_iter = v9fs_mmap_file_write_iter, .open = v9fs_file_open, .release = v9fs_dir_release, .lock = v9fs_file_lock_dotl, diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 3662f1d1d9cf..b1dc51888048 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -540,8 +540,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb, unlock_new_inode(inode); return inode; error: - unlock_new_inode(inode); - iput(inode); + iget_failed(inode); return ERR_PTR(retval); } @@ -595,7 +594,7 @@ static int v9fs_remove(struct inode *dir, struct dentry *dentry, int flags) dir, dentry, flags); v9ses = v9fs_inode2v9ses(dir); - inode = dentry->d_inode; + inode = d_inode(dentry); dfid = v9fs_fid_lookup(dentry->d_parent); if (IS_ERR(dfid)) { retval = PTR_ERR(dfid); @@ -864,7 +863,7 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, } /* Only creates */ - if (!(flags & O_CREAT) || dentry->d_inode) + if (!(flags & O_CREAT) || d_really_is_positive(dentry)) return finish_no_open(file, res); err = 0; @@ -881,7 +880,7 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, } v9fs_invalidate_inode_attr(dir); - v9inode = V9FS_I(dentry->d_inode); + v9inode = V9FS_I(d_inode(dentry)); mutex_lock(&v9inode->v_mutex); if ((v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) && !v9inode->writeback_fid && @@ -908,7 +907,7 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, file->private_data = fid; if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) - v9fs_cache_inode_set_cookie(dentry->d_inode, file); + v9fs_cache_inode_set_cookie(d_inode(dentry), file); *opened |= FILE_CREATED; out: @@ -969,8 +968,8 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, p9_debug(P9_DEBUG_VFS, "\n"); retval = 0; - old_inode = old_dentry->d_inode; - new_inode = new_dentry->d_inode; + old_inode = d_inode(old_dentry); + new_inode = d_inode(new_dentry); v9ses = v9fs_inode2v9ses(old_inode); oldfid = v9fs_fid_lookup(old_dentry); if (IS_ERR(oldfid)) @@ -1061,7 +1060,7 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); v9ses = v9fs_dentry2v9ses(dentry); if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { - generic_fillattr(dentry->d_inode, stat); + generic_fillattr(d_inode(dentry), stat); return 0; } fid = v9fs_fid_lookup(dentry); @@ -1072,8 +1071,8 @@ v9fs_vfs_getattr(struct vfsmount *mnt, struct dentry *dentry, if (IS_ERR(st)) return PTR_ERR(st); - v9fs_stat2inode(st, dentry->d_inode, dentry->d_inode->i_sb); - generic_fillattr(dentry->d_inode, stat); + v9fs_stat2inode(st, d_inode(dentry), d_inode(dentry)->i_sb); + generic_fillattr(d_inode(dentry), stat); p9stat_free(st); kfree(st); @@ -1095,7 +1094,7 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) struct p9_wstat wstat; p9_debug(P9_DEBUG_VFS, "\n"); - retval = inode_change_ok(dentry->d_inode, iattr); + retval = inode_change_ok(d_inode(dentry), iattr); if (retval) return retval; @@ -1128,20 +1127,20 @@ static int v9fs_vfs_setattr(struct dentry *dentry, struct iattr *iattr) /* Write all dirty data */ if (d_is_reg(dentry)) - 
filemap_write_and_wait(dentry->d_inode->i_mapping); + filemap_write_and_wait(d_inode(dentry)->i_mapping); retval = p9_client_wstat(fid, &wstat); if (retval < 0) return retval; if ((iattr->ia_valid & ATTR_SIZE) && - iattr->ia_size != i_size_read(dentry->d_inode)) - truncate_setsize(dentry->d_inode, iattr->ia_size); + iattr->ia_size != i_size_read(d_inode(dentry))) + truncate_setsize(d_inode(dentry), iattr->ia_size); - v9fs_invalidate_inode_attr(dentry->d_inode); + v9fs_invalidate_inode_attr(d_inode(dentry)); - setattr_copy(dentry->d_inode, iattr); - mark_inode_dirty(dentry->d_inode); + setattr_copy(d_inode(dentry), iattr); + mark_inode_dirty(d_inode(dentry)); return 0; } @@ -1224,100 +1223,43 @@ ino_t v9fs_qid2ino(struct p9_qid *qid) } /** - * v9fs_readlink - read a symlink's location (internal version) + * v9fs_vfs_follow_link - follow a symlink path * @dentry: dentry for symlink - * @buffer: buffer to load symlink location into - * @buflen: length of buffer - * + * @cookie: place to pass the data to put_link() */ -static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen) +static const char *v9fs_vfs_follow_link(struct dentry *dentry, void **cookie) { - int retval; - - struct v9fs_session_info *v9ses; - struct p9_fid *fid; + struct v9fs_session_info *v9ses = v9fs_dentry2v9ses(dentry); + struct p9_fid *fid = v9fs_fid_lookup(dentry); struct p9_wstat *st; + char *res; + + p9_debug(P9_DEBUG_VFS, "%pd\n", dentry); - p9_debug(P9_DEBUG_VFS, " %pd\n", dentry); - retval = -EPERM; - v9ses = v9fs_dentry2v9ses(dentry); - fid = v9fs_fid_lookup(dentry); if (IS_ERR(fid)) - return PTR_ERR(fid); + return ERR_CAST(fid); if (!v9fs_proto_dotu(v9ses)) - return -EBADF; + return ERR_PTR(-EBADF); st = p9_client_stat(fid); if (IS_ERR(st)) - return PTR_ERR(st); + return ERR_CAST(st); if (!(st->mode & P9_DMSYMLINK)) { - retval = -EINVAL; - goto done; + p9stat_free(st); + kfree(st); + return ERR_PTR(-EINVAL); } + res = st->extension; + st->extension = NULL; + if (strlen(res) >= PATH_MAX) + res[PATH_MAX - 1] = '\0'; - /* copy extension buffer into buffer */ - retval = min(strlen(st->extension)+1, (size_t)buflen); - memcpy(buffer, st->extension, retval); - - p9_debug(P9_DEBUG_VFS, "%pd -> %s (%.*s)\n", - dentry, st->extension, buflen, buffer); - -done: p9stat_free(st); kfree(st); - return retval; -} - -/** - * v9fs_vfs_follow_link - follow a symlink path - * @dentry: dentry for symlink - * @nd: nameidata - * - */ - -static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - int len = 0; - char *link = __getname(); - - p9_debug(P9_DEBUG_VFS, "%pd\n", dentry); - - if (!link) - link = ERR_PTR(-ENOMEM); - else { - len = v9fs_readlink(dentry, link, PATH_MAX); - - if (len < 0) { - __putname(link); - link = ERR_PTR(len); - } else - link[min(len, PATH_MAX-1)] = 0; - } - nd_set_link(nd, link); - - return NULL; -} - -/** - * v9fs_vfs_put_link - release a symlink path - * @dentry: dentry for symlink - * @nd: nameidata - * @p: unused - * - */ - -void -v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p) -{ - char *s = nd_get_link(nd); - - p9_debug(P9_DEBUG_VFS, " %pd %s\n", - dentry, IS_ERR(s) ? 
"<error>" : s); - if (!IS_ERR(s)) - __putname(s); + return *cookie = res; } /** @@ -1370,6 +1312,8 @@ v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) return v9fs_vfs_mkspecial(dir, dentry, P9_DMSYMLINK, symname); } +#define U32_MAX_DIGITS 10 + /** * v9fs_vfs_link - create a hardlink * @old_dentry: dentry for file to link to @@ -1383,7 +1327,7 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { int retval; - char *name; + char name[1 + U32_MAX_DIGITS + 2]; /* sign + number + \n + \0 */ struct p9_fid *oldfid; p9_debug(P9_DEBUG_VFS, " %lu,%pd,%pd\n", @@ -1393,20 +1337,12 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, if (IS_ERR(oldfid)) return PTR_ERR(oldfid); - name = __getname(); - if (unlikely(!name)) { - retval = -ENOMEM; - goto clunk_fid; - } - sprintf(name, "%d\n", oldfid->fid); retval = v9fs_vfs_mkspecial(dir, dentry, P9_DMLINK, name); - __putname(name); if (!retval) { - v9fs_refresh_inode(oldfid, old_dentry->d_inode); + v9fs_refresh_inode(oldfid, d_inode(old_dentry)); v9fs_invalidate_inode_attr(dir); } -clunk_fid: p9_client_clunk(oldfid); return retval; } @@ -1425,7 +1361,7 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rde { struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir); int retval; - char *name; + char name[2 + U32_MAX_DIGITS + 1 + U32_MAX_DIGITS + 1]; u32 perm; p9_debug(P9_DEBUG_VFS, " %lu,%pd mode: %hx MAJOR: %u MINOR: %u\n", @@ -1435,26 +1371,16 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rde if (!new_valid_dev(rdev)) return -EINVAL; - name = __getname(); - if (!name) - return -ENOMEM; /* build extension */ if (S_ISBLK(mode)) sprintf(name, "b %u %u", MAJOR(rdev), MINOR(rdev)); else if (S_ISCHR(mode)) sprintf(name, "c %u %u", MAJOR(rdev), MINOR(rdev)); - else if (S_ISFIFO(mode)) - *name = 0; - else if (S_ISSOCK(mode)) + else *name = 0; - else { - __putname(name); - return -EINVAL; - } perm = unixmode2p9mode(v9ses, mode); retval = v9fs_vfs_mkspecial(dir, dentry, perm, name); - __putname(name); return retval; } @@ -1530,7 +1456,7 @@ static const struct inode_operations v9fs_file_inode_operations = { static const struct inode_operations v9fs_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = v9fs_vfs_follow_link, - .put_link = v9fs_vfs_put_link, + .put_link = kfree_put_link, .getattr = v9fs_vfs_getattr, .setattr = v9fs_vfs_setattr, }; diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 6054c16b8fae..e8aa57dc8d6d 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -149,8 +149,7 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb, unlock_new_inode(inode); return inode; error: - unlock_new_inode(inode); - iput(inode); + iget_failed(inode); return ERR_PTR(retval); } @@ -265,7 +264,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, } /* Only creates */ - if (!(flags & O_CREAT) || dentry->d_inode) + if (!(flags & O_CREAT) || d_really_is_positive(dentry)) return finish_no_open(file, res); v9ses = v9fs_inode2v9ses(dir); @@ -481,7 +480,7 @@ v9fs_vfs_getattr_dotl(struct vfsmount *mnt, struct dentry *dentry, p9_debug(P9_DEBUG_VFS, "dentry: %p\n", dentry); v9ses = v9fs_dentry2v9ses(dentry); if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { - generic_fillattr(dentry->d_inode, stat); + generic_fillattr(d_inode(dentry), stat); return 0; } fid = v9fs_fid_lookup(dentry); @@ -496,8 +495,8 @@ v9fs_vfs_getattr_dotl(struct vfsmount *mnt, 
struct dentry *dentry, if (IS_ERR(st)) return PTR_ERR(st); - v9fs_stat2inode_dotl(st, dentry->d_inode); - generic_fillattr(dentry->d_inode, stat); + v9fs_stat2inode_dotl(st, d_inode(dentry)); + generic_fillattr(d_inode(dentry), stat); /* Change block size to what the server returned */ stat->blksize = st->st_blksize; @@ -557,7 +556,7 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) int retval; struct p9_fid *fid; struct p9_iattr_dotl p9attr; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); p9_debug(P9_DEBUG_VFS, "\n"); @@ -795,10 +794,10 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, if (IS_ERR(fid)) return PTR_ERR(fid); - v9fs_refresh_inode_dotl(fid, old_dentry->d_inode); + v9fs_refresh_inode_dotl(fid, d_inode(old_dentry)); } - ihold(old_dentry->d_inode); - d_instantiate(dentry, old_dentry->d_inode); + ihold(d_inode(old_dentry)); + d_instantiate(dentry, d_inode(old_dentry)); return err; } @@ -905,41 +904,24 @@ error: /** * v9fs_vfs_follow_link_dotl - follow a symlink path * @dentry: dentry for symlink - * @nd: nameidata - * + * @cookie: place to pass the data to put_link() */ -static void * -v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd) +static const char * +v9fs_vfs_follow_link_dotl(struct dentry *dentry, void **cookie) { - int retval; - struct p9_fid *fid; - char *link = __getname(); + struct p9_fid *fid = v9fs_fid_lookup(dentry); char *target; + int retval; p9_debug(P9_DEBUG_VFS, "%pd\n", dentry); - if (!link) { - link = ERR_PTR(-ENOMEM); - goto ndset; - } - fid = v9fs_fid_lookup(dentry); - if (IS_ERR(fid)) { - __putname(link); - link = ERR_CAST(fid); - goto ndset; - } + if (IS_ERR(fid)) + return ERR_CAST(fid); retval = p9_client_readlink(fid, &target); - if (!retval) { - strcpy(link, target); - kfree(target); - goto ndset; - } - __putname(link); - link = ERR_PTR(retval); -ndset: - nd_set_link(nd, link); - return NULL; + if (retval) + return ERR_PTR(retval); + return *cookie = target; } int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode) @@ -1006,7 +988,7 @@ const struct inode_operations v9fs_file_inode_operations_dotl = { const struct inode_operations v9fs_symlink_inode_operations_dotl = { .readlink = generic_readlink, .follow_link = v9fs_vfs_follow_link_dotl, - .put_link = v9fs_vfs_put_link, + .put_link = kfree_put_link, .getattr = v9fs_vfs_getattr_dotl, .setattr = v9fs_vfs_setattr_dotl, .setxattr = generic_setxattr, diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 0afd0382822b..bf495cedec26 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -130,11 +130,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, fid = v9fs_session_init(v9ses, dev_name, data); if (IS_ERR(fid)) { retval = PTR_ERR(fid); - /* - * we need to call session_close to tear down some - * of the data structure setup by session_init - */ - goto close_session; + goto free_session; } sb = sget(fs_type, NULL, v9fs_set_super, flags, v9ses); @@ -168,8 +164,8 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, retval = PTR_ERR(st); goto release_sb; } - root->d_inode->i_ino = v9fs_qid2ino(&st->qid); - v9fs_stat2inode_dotl(st, root->d_inode); + d_inode(root)->i_ino = v9fs_qid2ino(&st->qid); + v9fs_stat2inode_dotl(st, d_inode(root)); kfree(st); } else { struct p9_wstat *st = NULL; @@ -179,8 +175,8 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, goto release_sb; } - root->d_inode->i_ino = 
v9fs_qid2ino(&st->qid); - v9fs_stat2inode(st, root->d_inode, sb); + d_inode(root)->i_ino = v9fs_qid2ino(&st->qid); + v9fs_stat2inode(st, d_inode(root), sb); p9stat_free(st); kfree(st); @@ -195,8 +191,8 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, clunk_fid: p9_client_clunk(fid); -close_session: v9fs_session_close(v9ses); +free_session: kfree(v9ses); return ERR_PTR(retval); diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c index f95e01e058e4..0cf44b6cccd6 100644 --- a/fs/9p/xattr.c +++ b/fs/9p/xattr.c @@ -15,6 +15,7 @@ #include <linux/module.h> #include <linux/fs.h> #include <linux/sched.h> +#include <linux/uio.h> #include <net/9p/9p.h> #include <net/9p/client.h> @@ -25,50 +26,34 @@ ssize_t v9fs_fid_xattr_get(struct p9_fid *fid, const char *name, void *buffer, size_t buffer_size) { ssize_t retval; - int msize, read_count; - u64 offset = 0, attr_size; + u64 attr_size; struct p9_fid *attr_fid; + struct kvec kvec = {.iov_base = buffer, .iov_len = buffer_size}; + struct iov_iter to; + int err; + + iov_iter_kvec(&to, READ | ITER_KVEC, &kvec, 1, buffer_size); attr_fid = p9_client_xattrwalk(fid, name, &attr_size); if (IS_ERR(attr_fid)) { retval = PTR_ERR(attr_fid); p9_debug(P9_DEBUG_VFS, "p9_client_attrwalk failed %zd\n", retval); - attr_fid = NULL; - goto error; - } - if (!buffer_size) { - /* request to get the attr_size */ - retval = attr_size; - goto error; + return retval; } if (attr_size > buffer_size) { - retval = -ERANGE; - goto error; - } - msize = attr_fid->clnt->msize; - while (attr_size) { - if (attr_size > (msize - P9_IOHDRSZ)) - read_count = msize - P9_IOHDRSZ; + if (!buffer_size) /* request to get the attr_size */ + retval = attr_size; else - read_count = attr_size; - read_count = p9_client_read(attr_fid, ((char *)buffer)+offset, - NULL, offset, read_count); - if (read_count < 0) { - /* error in xattr read */ - retval = read_count; - goto error; - } - offset += read_count; - attr_size -= read_count; + retval = -ERANGE; + } else { + iov_iter_truncate(&to, attr_size); + retval = p9_client_read(attr_fid, 0, &to, &err); + if (err) + retval = err; } - /* Total read xattr bytes */ - retval = offset; -error: - if (attr_fid) - p9_client_clunk(attr_fid); + p9_client_clunk(attr_fid); return retval; - } @@ -120,8 +105,11 @@ int v9fs_xattr_set(struct dentry *dentry, const char *name, int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name, const void *value, size_t value_len, int flags) { - u64 offset = 0; - int retval, msize, write_count; + struct kvec kvec = {.iov_base = (void *)value, .iov_len = value_len}; + struct iov_iter from; + int retval; + + iov_iter_kvec(&from, WRITE | ITER_KVEC, &kvec, 1, value_len); p9_debug(P9_DEBUG_VFS, "name = %s value_len = %zu flags = %d\n", name, value_len, flags); @@ -135,29 +123,11 @@ int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name, * On success fid points to xattr */ retval = p9_client_xattrcreate(fid, name, value_len, flags); - if (retval < 0) { + if (retval < 0) p9_debug(P9_DEBUG_VFS, "p9_client_xattrcreate failed %d\n", retval); - goto err; - } - msize = fid->clnt->msize; - while (value_len) { - if (value_len > (msize - P9_IOHDRSZ)) - write_count = msize - P9_IOHDRSZ; - else - write_count = value_len; - write_count = p9_client_write(fid, ((char *)value)+offset, - NULL, offset, write_count); - if (write_count < 0) { - /* error in xattr write */ - retval = write_count; - goto err; - } - offset += write_count; - value_len -= write_count; - } - retval = 0; -err: + else + p9_client_write(fid, 0, &from, &retval); 
p9_client_clunk(fid); return retval; } diff --git a/fs/Kconfig b/fs/Kconfig index ec35851e5b71..011f43365d7b 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -32,6 +32,7 @@ source "fs/gfs2/Kconfig" source "fs/ocfs2/Kconfig" source "fs/btrfs/Kconfig" source "fs/nilfs2/Kconfig" +source "fs/f2fs/Kconfig" config FS_DAX bool "Direct Access (DAX) support" @@ -217,7 +218,6 @@ source "fs/pstore/Kconfig" source "fs/sysv/Kconfig" source "fs/ufs/Kconfig" source "fs/exofs/Kconfig" -source "fs/f2fs/Kconfig" endif # MISC_FILESYSTEMS diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 270c48148f79..2d0cbbd14cfc 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -27,9 +27,6 @@ config COMPAT_BINFMT_ELF bool depends on COMPAT && BINFMT_ELF -config ARCH_BINFMT_ELF_RANDOMIZE_PIE - bool - config ARCH_BINFMT_ELF_STATE bool diff --git a/fs/Makefile b/fs/Makefile index a88ac4838c9e..cb20e4bf2303 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -115,9 +115,9 @@ obj-$(CONFIG_AFS_FS) += afs/ obj-$(CONFIG_NILFS2_FS) += nilfs2/ obj-$(CONFIG_BEFS_FS) += befs/ obj-$(CONFIG_HOSTFS) += hostfs/ -obj-$(CONFIG_HPPFS) += hppfs/ obj-$(CONFIG_CACHEFILES) += cachefiles/ obj-$(CONFIG_DEBUG_FS) += debugfs/ +obj-$(CONFIG_TRACING) += tracefs/ obj-$(CONFIG_OCFS2_FS) += ocfs2/ obj-$(CONFIG_BTRFS_FS) += btrfs/ obj-$(CONFIG_GFS2_FS) += gfs2/ diff --git a/fs/adfs/dir_fplus.c b/fs/adfs/dir_fplus.c index f2ba88ab4aed..82d14cdf70f9 100644 --- a/fs/adfs/dir_fplus.c +++ b/fs/adfs/dir_fplus.c @@ -61,6 +61,7 @@ adfs_fplus_read(struct super_block *sb, unsigned int id, unsigned int sz, struct kcalloc(size, sizeof(struct buffer_head *), GFP_KERNEL); if (!bh_fplus) { + ret = -ENOMEM; adfs_error(sb, "not enough memory for" " dir object %X (%d blocks)", id, size); goto out; diff --git a/fs/adfs/file.c b/fs/adfs/file.c index 07c9edce5aa7..46c0d5671cd5 100644 --- a/fs/adfs/file.c +++ b/fs/adfs/file.c @@ -23,11 +23,9 @@ const struct file_operations adfs_file_operations = { .llseek = generic_file_llseek, - .read = new_sync_read, .read_iter = generic_file_read_iter, .mmap = generic_file_mmap, .fsync = generic_file_fsync, - .write = new_sync_write, .write_iter = generic_file_write_iter, .splice_read = generic_file_splice_read, }; diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index b9acadafa4a1..335055d828e4 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -298,7 +298,7 @@ out: int adfs_notify_change(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct super_block *sb = inode->i_sb; unsigned int ia_valid = attr->ia_valid; int error; diff --git a/fs/adfs/super.c b/fs/adfs/super.c index 9852bdf34d76..4d4a0df8344f 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -242,7 +242,7 @@ static struct kmem_cache *adfs_inode_cachep; static struct inode *adfs_alloc_inode(struct super_block *sb) { struct adfs_inode_info *ei; - ei = (struct adfs_inode_info *)kmem_cache_alloc(adfs_inode_cachep, GFP_KERNEL); + ei = kmem_cache_alloc(adfs_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; @@ -316,7 +316,7 @@ static struct adfs_discmap *adfs_read_map(struct super_block *sb, struct adfs_di dm = kmalloc(nzones * sizeof(*dm), GFP_KERNEL); if (dm == NULL) { adfs_error(sb, "not enough memory"); - return NULL; + return ERR_PTR(-ENOMEM); } for (zone = 0; zone < nzones; zone++, map_addr++) { @@ -349,7 +349,7 @@ error_free: brelse(dm[zone].dm_bh); kfree(dm); - return NULL; + return ERR_PTR(-EIO); } static inline unsigned long adfs_discsize(struct adfs_discrecord *dr, int 
block_bits) @@ -370,6 +370,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent) unsigned char *b_data; struct adfs_sb_info *asb; struct inode *root; + int ret = -EINVAL; sb->s_flags |= MS_NODIRATIME; @@ -391,6 +392,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent) sb_set_blocksize(sb, BLOCK_SIZE); if (!(bh = sb_bread(sb, ADFS_DISCRECORD / BLOCK_SIZE))) { adfs_error(sb, "unable to read superblock"); + ret = -EIO; goto error; } @@ -400,6 +402,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent) if (!silent) printk("VFS: Can't find an adfs filesystem on dev " "%s.\n", sb->s_id); + ret = -EINVAL; goto error_free_bh; } @@ -412,6 +415,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent) if (!silent) printk("VPS: Can't find an adfs filesystem on dev " "%s.\n", sb->s_id); + ret = -EINVAL; goto error_free_bh; } @@ -421,11 +425,13 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent) if (!bh) { adfs_error(sb, "couldn't read superblock on " "2nd try."); + ret = -EIO; goto error; } b_data = bh->b_data + (ADFS_DISCRECORD % sb->s_blocksize); if (adfs_checkbblk(b_data)) { adfs_error(sb, "disc record mismatch, very weird!"); + ret = -EINVAL; goto error_free_bh; } dr = (struct adfs_discrecord *)(b_data + ADFS_DR_OFFSET); @@ -433,6 +439,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent) if (!silent) printk(KERN_ERR "VFS: Unsupported blocksize on dev " "%s.\n", sb->s_id); + ret = -EINVAL; goto error; } @@ -447,10 +454,12 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent) asb->s_size = adfs_discsize(dr, sb->s_blocksize_bits); asb->s_version = dr->format_version; asb->s_log2sharesize = dr->log2sharesize; - + asb->s_map = adfs_read_map(sb, dr); - if (!asb->s_map) + if (IS_ERR(asb->s_map)) { + ret = PTR_ERR(asb->s_map); goto error_free_bh; + } brelse(bh); @@ -499,6 +508,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent) brelse(asb->s_map[i].dm_bh); kfree(asb->s_map); adfs_error(sb, "get root inode failed\n"); + ret = -EIO; goto error; } return 0; @@ -508,7 +518,7 @@ error_free_bh: error: sb->s_fs_info = NULL; kfree(asb); - return -EINVAL; + return ret; } static struct dentry *adfs_mount(struct file_system_type *fs_type, diff --git a/fs/affs/affs.h b/fs/affs/affs.h index c8764bd7497d..c69a87eaf57d 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -64,7 +64,7 @@ struct affs_inode_info { /* short cut to get to the affs specific inode data */ static inline struct affs_inode_info *AFFS_I(struct inode *inode) { - return list_entry(inode, struct affs_inode_info, vfs_inode); + return container_of(inode, struct affs_inode_info, vfs_inode); } /* @@ -106,18 +106,22 @@ struct affs_sb_info { spinlock_t work_lock; /* protects sb_work and work_queued */ }; -#define SF_INTL 0x0001 /* International filesystem. */ -#define SF_BM_VALID 0x0002 /* Bitmap is valid. 
*/ -#define SF_IMMUTABLE 0x0004 /* Protection bits cannot be changed */ -#define SF_QUIET 0x0008 /* chmod errors will be not reported */ -#define SF_SETUID 0x0010 /* Ignore Amiga uid */ -#define SF_SETGID 0x0020 /* Ignore Amiga gid */ -#define SF_SETMODE 0x0040 /* Ignore Amiga protection bits */ -#define SF_MUFS 0x0100 /* Use MUFS uid/gid mapping */ -#define SF_OFS 0x0200 /* Old filesystem */ -#define SF_PREFIX 0x0400 /* Buffer for prefix is allocated */ -#define SF_VERBOSE 0x0800 /* Talk about fs when mounting */ -#define SF_NO_TRUNCATE 0x1000 /* Don't truncate filenames */ +#define AFFS_MOUNT_SF_INTL 0x0001 /* International filesystem. */ +#define AFFS_MOUNT_SF_BM_VALID 0x0002 /* Bitmap is valid. */ +#define AFFS_MOUNT_SF_IMMUTABLE 0x0004 /* Protection bits cannot be changed */ +#define AFFS_MOUNT_SF_QUIET 0x0008 /* chmod errors will be not reported */ +#define AFFS_MOUNT_SF_SETUID 0x0010 /* Ignore Amiga uid */ +#define AFFS_MOUNT_SF_SETGID 0x0020 /* Ignore Amiga gid */ +#define AFFS_MOUNT_SF_SETMODE 0x0040 /* Ignore Amiga protection bits */ +#define AFFS_MOUNT_SF_MUFS 0x0100 /* Use MUFS uid/gid mapping */ +#define AFFS_MOUNT_SF_OFS 0x0200 /* Old filesystem */ +#define AFFS_MOUNT_SF_PREFIX 0x0400 /* Buffer for prefix is allocated */ +#define AFFS_MOUNT_SF_VERBOSE 0x0800 /* Talk about fs when mounting */ +#define AFFS_MOUNT_SF_NO_TRUNCATE 0x1000 /* Don't truncate filenames */ + +#define affs_clear_opt(o, opt) (o &= ~AFFS_MOUNT_##opt) +#define affs_set_opt(o, opt) (o |= AFFS_MOUNT_##opt) +#define affs_test_opt(o, opt) ((o) & AFFS_MOUNT_##opt) /* short cut to get to the affs specific sb data */ static inline struct affs_sb_info *AFFS_SB(struct super_block *sb) diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index 388da1ea815d..5fa92bc790ef 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -138,9 +138,9 @@ affs_fix_dcache(struct inode *inode, u32 entry_ino) static int affs_remove_link(struct dentry *dentry) { - struct inode *dir, *inode = dentry->d_inode; + struct inode *dir, *inode = d_inode(dentry); struct super_block *sb = inode->i_sb; - struct buffer_head *bh = NULL, *link_bh = NULL; + struct buffer_head *bh, *link_bh = NULL; u32 link_ino, ino; int retval; @@ -268,11 +268,11 @@ affs_remove_header(struct dentry *dentry) struct buffer_head *bh = NULL; int retval; - dir = dentry->d_parent->d_inode; + dir = d_inode(dentry->d_parent); sb = dir->i_sb; retval = -ENOENT; - inode = dentry->d_inode; + inode = d_inode(dentry); if (!inode) goto done; @@ -471,9 +471,9 @@ affs_warning(struct super_block *sb, const char *function, const char *fmt, ...) bool affs_nofilenametruncate(const struct dentry *dentry) { - struct inode *inode = dentry->d_inode; - return AFFS_SB(inode->i_sb)->s_flags & SF_NO_TRUNCATE; + struct inode *inode = d_inode(dentry); + return affs_test_opt(AFFS_SB(inode->i_sb)->s_flags, SF_NO_TRUNCATE); } /* Check if the name is valid for a affs object. 
*/ diff --git a/fs/affs/file.c b/fs/affs/file.c index a91795e01a7f..659c579c4588 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -12,7 +12,7 @@ * affs regular file handling primitives */ -#include <linux/aio.h> +#include <linux/uio.h> #include "affs.h" static struct buffer_head *affs_get_extblock_slow(struct inode *inode, u32 ext); @@ -389,8 +389,7 @@ static void affs_write_failed(struct address_space *mapping, loff_t to) } static ssize_t -affs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, - loff_t offset) +affs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -398,15 +397,15 @@ affs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, size_t count = iov_iter_count(iter); ssize_t ret; - if (rw == WRITE) { + if (iov_iter_rw(iter) == WRITE) { loff_t size = offset + count; if (AFFS_I(inode)->mmu_private < size) return 0; } - ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, affs_get_block); - if (ret < 0 && (rw & WRITE)) + ret = blockdev_direct_IO(iocb, inode, iter, offset, affs_get_block); + if (ret < 0 && iov_iter_rw(iter) == WRITE) affs_write_failed(mapping, offset + count); return ret; } @@ -915,7 +914,7 @@ affs_truncate(struct inode *inode) if (inode->i_size) { AFFS_I(inode)->i_blkcnt = last_blk + 1; AFFS_I(inode)->i_extcnt = ext + 1; - if (AFFS_SB(sb)->s_flags & SF_OFS) { + if (affs_test_opt(AFFS_SB(sb)->s_flags, SF_OFS)) { struct buffer_head *bh = affs_bread_ino(inode, last_blk, 0); u32 tmp; if (IS_ERR(bh)) { @@ -969,9 +968,7 @@ int affs_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) } const struct file_operations affs_file_operations = { .llseek = generic_file_llseek, - .read = new_sync_read, .read_iter = generic_file_read_iter, - .write = new_sync_write, .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .open = affs_file_open, diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 6f34510449e8..17349500592d 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -66,23 +66,23 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino) AFFS_I(inode)->i_lastalloc = 0; AFFS_I(inode)->i_pa_cnt = 0; - if (sbi->s_flags & SF_SETMODE) + if (affs_test_opt(sbi->s_flags, SF_SETMODE)) inode->i_mode = sbi->s_mode; else inode->i_mode = prot_to_mode(prot); id = be16_to_cpu(tail->uid); - if (id == 0 || sbi->s_flags & SF_SETUID) + if (id == 0 || affs_test_opt(sbi->s_flags, SF_SETUID)) inode->i_uid = sbi->s_uid; - else if (id == 0xFFFF && sbi->s_flags & SF_MUFS) + else if (id == 0xFFFF && affs_test_opt(sbi->s_flags, SF_MUFS)) i_uid_write(inode, 0); else i_uid_write(inode, id); id = be16_to_cpu(tail->gid); - if (id == 0 || sbi->s_flags & SF_SETGID) + if (id == 0 || affs_test_opt(sbi->s_flags, SF_SETGID)) inode->i_gid = sbi->s_gid; - else if (id == 0xFFFF && sbi->s_flags & SF_MUFS) + else if (id == 0xFFFF && affs_test_opt(sbi->s_flags, SF_MUFS)) i_gid_write(inode, 0); else i_gid_write(inode, id); @@ -94,7 +94,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino) /* fall through */ case ST_USERDIR: if (be32_to_cpu(tail->stype) == ST_USERDIR || - sbi->s_flags & SF_SETMODE) { + affs_test_opt(sbi->s_flags, SF_SETMODE)) { if (inode->i_mode & S_IRUSR) inode->i_mode |= S_IXUSR; if (inode->i_mode & S_IRGRP) @@ -133,7 +133,8 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino) } if (tail->link_chain) set_nlink(inode, 2); - inode->i_mapping->a_ops = (sbi->s_flags & SF_OFS) ? 
&affs_aops_ofs : &affs_aops; + inode->i_mapping->a_ops = affs_test_opt(sbi->s_flags, SF_OFS) ? + &affs_aops_ofs : &affs_aops; inode->i_op = &affs_file_inode_operations; inode->i_fop = &affs_file_operations; break; @@ -190,15 +191,15 @@ affs_write_inode(struct inode *inode, struct writeback_control *wbc) if (!(inode->i_ino == AFFS_SB(sb)->s_root_block)) { uid = i_uid_read(inode); gid = i_gid_read(inode); - if (AFFS_SB(sb)->s_flags & SF_MUFS) { + if (affs_test_opt(AFFS_SB(sb)->s_flags, SF_MUFS)) { if (uid == 0 || uid == 0xFFFF) uid = uid ^ ~0; if (gid == 0 || gid == 0xFFFF) gid = gid ^ ~0; } - if (!(AFFS_SB(sb)->s_flags & SF_SETUID)) + if (!affs_test_opt(AFFS_SB(sb)->s_flags, SF_SETUID)) tail->uid = cpu_to_be16(uid); - if (!(AFFS_SB(sb)->s_flags & SF_SETGID)) + if (!affs_test_opt(AFFS_SB(sb)->s_flags, SF_SETGID)) tail->gid = cpu_to_be16(gid); } } @@ -212,7 +213,7 @@ affs_write_inode(struct inode *inode, struct writeback_control *wbc) int affs_notify_change(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int error; pr_debug("notify_change(%lu,0x%x)\n", inode->i_ino, attr->ia_valid); @@ -221,11 +222,14 @@ affs_notify_change(struct dentry *dentry, struct iattr *attr) if (error) goto out; - if (((attr->ia_valid & ATTR_UID) && (AFFS_SB(inode->i_sb)->s_flags & SF_SETUID)) || - ((attr->ia_valid & ATTR_GID) && (AFFS_SB(inode->i_sb)->s_flags & SF_SETGID)) || + if (((attr->ia_valid & ATTR_UID) && + affs_test_opt(AFFS_SB(inode->i_sb)->s_flags, SF_SETUID)) || + ((attr->ia_valid & ATTR_GID) && + affs_test_opt(AFFS_SB(inode->i_sb)->s_flags, SF_SETGID)) || ((attr->ia_valid & ATTR_MODE) && - (AFFS_SB(inode->i_sb)->s_flags & (SF_SETMODE | SF_IMMUTABLE)))) { - if (!(AFFS_SB(inode->i_sb)->s_flags & SF_QUIET)) + (AFFS_SB(inode->i_sb)->s_flags & + (AFFS_MOUNT_SF_SETMODE | AFFS_MOUNT_SF_IMMUTABLE)))) { + if (!affs_test_opt(AFFS_SB(inode->i_sb)->s_flags, SF_QUIET)) error = -EPERM; goto out; } @@ -342,7 +346,7 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3 { struct super_block *sb = dir->i_sb; struct buffer_head *inode_bh = NULL; - struct buffer_head *bh = NULL; + struct buffer_head *bh; u32 block = 0; int retval; diff --git a/fs/affs/namei.c b/fs/affs/namei.c index ffb7bd82c2a5..181e05b46e72 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -53,7 +53,8 @@ affs_intl_toupper(int ch) static inline toupper_t affs_get_toupper(struct super_block *sb) { - return AFFS_SB(sb)->s_flags & SF_INTL ? affs_intl_toupper : affs_toupper; + return affs_test_opt(AFFS_SB(sb)->s_flags, SF_INTL) ? + affs_intl_toupper : affs_toupper; } /* @@ -250,7 +251,7 @@ int affs_unlink(struct inode *dir, struct dentry *dentry) { pr_debug("%s(dir=%lu, %lu \"%pd\")\n", __func__, dir->i_ino, - dentry->d_inode->i_ino, dentry); + d_inode(dentry)->i_ino, dentry); return affs_remove_header(dentry); } @@ -275,7 +276,8 @@ affs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) inode->i_op = &affs_file_inode_operations; inode->i_fop = &affs_file_operations; - inode->i_mapping->a_ops = (AFFS_SB(sb)->s_flags & SF_OFS) ? &affs_aops_ofs : &affs_aops; + inode->i_mapping->a_ops = affs_test_opt(AFFS_SB(sb)->s_flags, SF_OFS) ? 
+ &affs_aops_ofs : &affs_aops; error = affs_add_entry(dir, inode, dentry, ST_FILE); if (error) { clear_nlink(inode); @@ -318,7 +320,7 @@ int affs_rmdir(struct inode *dir, struct dentry *dentry) { pr_debug("%s(dir=%lu, %lu \"%pd\")\n", __func__, dir->i_ino, - dentry->d_inode->i_ino, dentry); + d_inode(dentry)->i_ino, dentry); return affs_remove_header(dentry); } @@ -401,7 +403,7 @@ err: int affs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); pr_debug("%s(%lu, %lu, \"%pd\")\n", __func__, inode->i_ino, dir->i_ino, dentry); @@ -428,13 +430,13 @@ affs_rename(struct inode *old_dir, struct dentry *old_dentry, return retval; /* Unlink destination if it already exists */ - if (new_dentry->d_inode) { + if (d_really_is_positive(new_dentry)) { retval = affs_remove_header(new_dentry); if (retval) return retval; } - bh = affs_bread(sb, old_dentry->d_inode->i_ino); + bh = affs_bread(sb, d_inode(old_dentry)->i_ino); if (!bh) return -EIO; diff --git a/fs/affs/super.c b/fs/affs/super.c index 4cf0e9113fb6..3f89c9e05b40 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -227,22 +227,22 @@ parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved, if (match_octal(&args[0], &option)) return 0; *mode = option & 0777; - *mount_opts |= SF_SETMODE; + affs_set_opt(*mount_opts, SF_SETMODE); break; case Opt_mufs: - *mount_opts |= SF_MUFS; + affs_set_opt(*mount_opts, SF_MUFS); break; case Opt_notruncate: - *mount_opts |= SF_NO_TRUNCATE; + affs_set_opt(*mount_opts, SF_NO_TRUNCATE); break; case Opt_prefix: *prefix = match_strdup(&args[0]); if (!*prefix) return 0; - *mount_opts |= SF_PREFIX; + affs_set_opt(*mount_opts, SF_PREFIX); break; case Opt_protect: - *mount_opts |= SF_IMMUTABLE; + affs_set_opt(*mount_opts, SF_IMMUTABLE); break; case Opt_reserved: if (match_int(&args[0], reserved)) @@ -258,7 +258,7 @@ parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved, *gid = make_kgid(current_user_ns(), option); if (!gid_valid(*gid)) return 0; - *mount_opts |= SF_SETGID; + affs_set_opt(*mount_opts, SF_SETGID); break; case Opt_setuid: if (match_int(&args[0], &option)) @@ -266,10 +266,10 @@ parse_options(char *options, kuid_t *uid, kgid_t *gid, int *mode, int *reserved, *uid = make_kuid(current_user_ns(), option); if (!uid_valid(*uid)) return 0; - *mount_opts |= SF_SETUID; + affs_set_opt(*mount_opts, SF_SETUID); break; case Opt_verbose: - *mount_opts |= SF_VERBOSE; + affs_set_opt(*mount_opts, SF_VERBOSE); break; case Opt_volume: { char *vol = match_strdup(&args[0]); @@ -435,30 +435,31 @@ got_root: case MUFS_FS: case MUFS_INTLFFS: case MUFS_DCFFS: - sbi->s_flags |= SF_MUFS; + affs_set_opt(sbi->s_flags, SF_MUFS); /* fall thru */ case FS_INTLFFS: case FS_DCFFS: - sbi->s_flags |= SF_INTL; + affs_set_opt(sbi->s_flags, SF_INTL); break; case MUFS_FFS: - sbi->s_flags |= SF_MUFS; + affs_set_opt(sbi->s_flags, SF_MUFS); break; case FS_FFS: break; case MUFS_OFS: - sbi->s_flags |= SF_MUFS; + affs_set_opt(sbi->s_flags, SF_MUFS); /* fall thru */ case FS_OFS: - sbi->s_flags |= SF_OFS; + affs_set_opt(sbi->s_flags, SF_OFS); sb->s_flags |= MS_NOEXEC; break; case MUFS_DCOFS: case MUFS_INTLOFS: - sbi->s_flags |= SF_MUFS; + affs_set_opt(sbi->s_flags, SF_MUFS); case FS_DCOFS: case FS_INTLOFS: - sbi->s_flags |= SF_INTL | SF_OFS; + affs_set_opt(sbi->s_flags, SF_INTL); + affs_set_opt(sbi->s_flags, SF_OFS); sb->s_flags |= MS_NOEXEC; break; default: @@ -467,7 +468,7 @@ got_root: return 
-EINVAL; } - if (mount_flags & SF_VERBOSE) { + if (affs_test_opt(mount_flags, SF_VERBOSE)) { u8 len = AFFS_ROOT_TAIL(sb, root_bh)->disk_name[0]; pr_notice("Mounting volume \"%.*s\": Type=%.3s\\%c, Blocksize=%d\n", len > 31 ? 31 : len, @@ -478,7 +479,7 @@ got_root: sb->s_flags |= MS_NODEV | MS_NOSUID; sbi->s_data_blksize = sb->s_blocksize; - if (sbi->s_flags & SF_OFS) + if (affs_test_opt(sbi->s_flags, SF_OFS)) sbi->s_data_blksize -= 24; tmp_flags = sb->s_flags; @@ -493,7 +494,7 @@ got_root: if (IS_ERR(root_inode)) return PTR_ERR(root_inode); - if (AFFS_SB(sb)->s_flags & SF_INTL) + if (affs_test_opt(AFFS_SB(sb)->s_flags, SF_INTL)) sb->s_d_op = &affs_intl_dentry_operations; else sb->s_d_op = &affs_dentry_operations; @@ -520,10 +521,14 @@ affs_remount(struct super_block *sb, int *flags, char *data) int root_block; unsigned long mount_flags; int res = 0; - char *new_opts = kstrdup(data, GFP_KERNEL); + char *new_opts; char volume[32]; char *prefix = NULL; + new_opts = kstrdup(data, GFP_KERNEL); + if (!new_opts) + return -ENOMEM; + pr_debug("%s(flags=0x%x,opts=\"%s\")\n", __func__, *flags, data); sync_filesystem(sb); diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c index f39b71c3981e..ea5b69a18ba9 100644 --- a/fs/affs/symlink.c +++ b/fs/affs/symlink.c @@ -16,14 +16,12 @@ static int affs_symlink_readpage(struct file *file, struct page *page) struct inode *inode = page->mapping->host; char *link = kmap(page); struct slink_front *lf; - int err; int i, j; char c; char lc; pr_debug("follow_link(ino=%lu)\n", inode->i_ino); - err = -EIO; bh = affs_bread(inode->i_sb, inode->i_ino); if (!bh) goto fail; @@ -66,7 +64,7 @@ fail: SetPageError(page); kunmap(page); unlock_page(page); - return err; + return -EIO; } const struct address_space_operations affs_symlink_aops = { diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 4ec35e9130e1..e10e17788f06 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -505,7 +505,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry, _enter("{%x:%u},%p{%pd},", vnode->fid.vid, vnode->fid.vnode, dentry, dentry); - ASSERTCMP(dentry->d_inode, ==, NULL); + ASSERTCMP(d_inode(dentry), ==, NULL); if (dentry->d_name.len >= AFSNAMEMAX) { _leave(" = -ENAMETOOLONG"); @@ -563,8 +563,8 @@ success: _leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%u }", fid.vnode, fid.unique, - dentry->d_inode->i_ino, - dentry->d_inode->i_generation); + d_inode(dentry)->i_ino, + d_inode(dentry)->i_generation); return NULL; } @@ -586,9 +586,9 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) if (flags & LOOKUP_RCU) return -ECHILD; - vnode = AFS_FS_I(dentry->d_inode); + vnode = AFS_FS_I(d_inode(dentry)); - if (dentry->d_inode) + if (d_really_is_positive(dentry)) _enter("{v={%x:%u} n=%pd fl=%lx},", vnode->fid.vid, vnode->fid.vnode, dentry, vnode->flags); @@ -601,7 +601,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) /* lock down the parent dentry so we can peer at it */ parent = dget_parent(dentry); - dir = AFS_FS_I(parent->d_inode); + dir = AFS_FS_I(d_inode(parent)); /* validate the parent directory */ if (test_bit(AFS_VNODE_MODIFIED, &dir->flags)) @@ -623,9 +623,9 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) switch (ret) { case 0: /* the filename maps to something */ - if (!dentry->d_inode) + if (d_really_is_negative(dentry)) goto out_bad; - if (is_bad_inode(dentry->d_inode)) { + if (is_bad_inode(d_inode(dentry))) { printk("kAFS: afs_d_revalidate: %pd2 has bad inode\n", dentry); goto out_bad; @@ -647,7 +647,7 @@ static 
int afs_d_revalidate(struct dentry *dentry, unsigned int flags) _debug("%pd: file deleted (uq %u -> %u I:%u)", dentry, fid.unique, vnode->fid.unique, - dentry->d_inode->i_generation); + d_inode(dentry)->i_generation); spin_lock(&vnode->lock); set_bit(AFS_VNODE_DELETED, &vnode->flags); spin_unlock(&vnode->lock); @@ -658,7 +658,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) case -ENOENT: /* the filename is unknown */ _debug("%pd: dirent not found", dentry); - if (dentry->d_inode) + if (d_really_is_positive(dentry)) goto not_found; goto out_valid; @@ -703,9 +703,9 @@ static int afs_d_delete(const struct dentry *dentry) if (dentry->d_flags & DCACHE_NFSFS_RENAMED) goto zap; - if (dentry->d_inode && - (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dentry->d_inode)->flags) || - test_bit(AFS_VNODE_PSEUDODIR, &AFS_FS_I(dentry->d_inode)->flags))) + if (d_really_is_positive(dentry) && + (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(d_inode(dentry))->flags) || + test_bit(AFS_VNODE_PSEUDODIR, &AFS_FS_I(d_inode(dentry))->flags))) goto zap; _leave(" = 0 [keep]"); @@ -814,8 +814,8 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry) if (ret < 0) goto rmdir_error; - if (dentry->d_inode) { - vnode = AFS_FS_I(dentry->d_inode); + if (d_really_is_positive(dentry)) { + vnode = AFS_FS_I(d_inode(dentry)); clear_nlink(&vnode->vfs_inode); set_bit(AFS_VNODE_DELETED, &vnode->flags); afs_discard_callback_on_delete(vnode); @@ -856,8 +856,8 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) goto error; } - if (dentry->d_inode) { - vnode = AFS_FS_I(dentry->d_inode); + if (d_really_is_positive(dentry)) { + vnode = AFS_FS_I(d_inode(dentry)); /* make sure we have a callback promise on the victim */ ret = afs_validate(vnode, key); @@ -869,7 +869,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) if (ret < 0) goto remove_error; - if (dentry->d_inode) { + if (d_really_is_positive(dentry)) { /* if the file wasn't deleted due to excess hard links, the * fileserver will break the callback promise on the file - if * it had one - before it returns to us, and if it was deleted, @@ -879,7 +879,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) * or it was outstanding on a different server, then it won't * break it either... 
*/ - vnode = AFS_FS_I(dentry->d_inode); + vnode = AFS_FS_I(d_inode(dentry)); if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) _debug("AFS_VNODE_DELETED"); if (test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) @@ -977,7 +977,7 @@ static int afs_link(struct dentry *from, struct inode *dir, struct key *key; int ret; - vnode = AFS_FS_I(from->d_inode); + vnode = AFS_FS_I(d_inode(from)); dvnode = AFS_FS_I(dir); _enter("{%x:%u},{%x:%u},{%pd}", @@ -1089,7 +1089,7 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, struct key *key; int ret; - vnode = AFS_FS_I(old_dentry->d_inode); + vnode = AFS_FS_I(d_inode(old_dentry)); orig_dvnode = AFS_FS_I(old_dir); new_dvnode = AFS_FS_I(new_dir); diff --git a/fs/afs/file.c b/fs/afs/file.c index 932ce07948b3..999bc3caec92 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -31,8 +31,6 @@ const struct file_operations afs_file_operations = { .open = afs_open, .release = afs_release, .llseek = generic_file_llseek, - .read = new_sync_read, - .write = new_sync_write, .read_iter = generic_file_read_iter, .write_iter = afs_file_write, .mmap = generic_file_readonly_mmap, diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 8a1d38ef0fc2..e06f5a23352a 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -379,7 +379,7 @@ int afs_getattr(struct vfsmount *mnt, struct dentry *dentry, { struct inode *inode; - inode = dentry->d_inode; + inode = d_inode(dentry); _enter("{ ino=%lu v=%u }", inode->i_ino, inode->i_generation); @@ -458,7 +458,7 @@ void afs_evict_inode(struct inode *inode) */ int afs_setattr(struct dentry *dentry, struct iattr *attr) { - struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode); + struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry)); struct key *key; int ret; diff --git a/fs/afs/misc.c b/fs/afs/misc.c index 0dd4dafee10b..91ea1aa0d8b3 100644 --- a/fs/afs/misc.c +++ b/fs/afs/misc.c @@ -22,9 +22,12 @@ int afs_abort_to_error(u32 abort_code) { switch (abort_code) { + /* low errno codes inserted into abort namespace */ case 13: return -EACCES; case 27: return -EFBIG; case 30: return -EROFS; + + /* VICE "special error" codes; 101 - 111 */ case VSALVAGE: return -EIO; case VNOVNODE: return -ENOENT; case VNOVOL: return -ENOMEDIUM; @@ -36,11 +39,18 @@ int afs_abort_to_error(u32 abort_code) case VOVERQUOTA: return -EDQUOT; case VBUSY: return -EBUSY; case VMOVED: return -ENXIO; - case 0x2f6df0a: return -EWOULDBLOCK; + + /* Unified AFS error table; ET "uae" == 0x2f6df00 */ + case 0x2f6df00: return -EPERM; + case 0x2f6df01: return -ENOENT; + case 0x2f6df04: return -EIO; + case 0x2f6df0a: return -EAGAIN; + case 0x2f6df0b: return -ENOMEM; case 0x2f6df0c: return -EACCES; case 0x2f6df0f: return -EBUSY; case 0x2f6df10: return -EEXIST; case 0x2f6df11: return -EXDEV; + case 0x2f6df12: return -ENODEV; case 0x2f6df13: return -ENOTDIR; case 0x2f6df14: return -EISDIR; case 0x2f6df15: return -EINVAL; @@ -54,8 +64,12 @@ int afs_abort_to_error(u32 abort_code) case 0x2f6df23: return -ENAMETOOLONG; case 0x2f6df24: return -ENOLCK; case 0x2f6df26: return -ENOTEMPTY; + case 0x2f6df28: return -EWOULDBLOCK; + case 0x2f6df69: return -ENOTCONN; + case 0x2f6df6c: return -ETIMEDOUT; case 0x2f6df78: return -EDQUOT; + /* RXKAD abort codes; from include/rxrpc/packet.h. 
ET "RXK" == 0x1260B00 */ case RXKADINCONSISTENCY: return -EPROTO; case RXKADPACKETSHORT: return -EPROTO; case RXKADLEVELFAIL: return -EKEYREJECTED; diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 938c5ab06d5a..ccd0b212e82a 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -134,7 +134,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt) _enter("{%pd}", mntpt); - BUG_ON(!mntpt->d_inode); + BUG_ON(!d_inode(mntpt)); ret = -ENOMEM; devname = (char *) get_zeroed_page(GFP_KERNEL); @@ -145,7 +145,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt) if (!options) goto error_no_options; - vnode = AFS_FS_I(mntpt->d_inode); + vnode = AFS_FS_I(d_inode(mntpt)); if (test_bit(AFS_VNODE_PSEUDODIR, &vnode->flags)) { /* if the directory is a pseudo directory, use the d_name */ static const char afs_root_cell[] = ":root.cell."; @@ -169,14 +169,14 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt) } } else { /* read the contents of the AFS special symlink */ - loff_t size = i_size_read(mntpt->d_inode); + loff_t size = i_size_read(d_inode(mntpt)); char *buf; ret = -EINVAL; if (size > PAGE_SIZE - 1) goto error_no_page; - page = read_mapping_page(mntpt->d_inode->i_mapping, 0, NULL); + page = read_mapping_page(d_inode(mntpt)->i_mapping, 0, NULL); if (IS_ERR(page)) { ret = PTR_ERR(page); goto error_no_page; diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index dbc732e9a5c0..b50642870a43 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -85,7 +85,7 @@ int afs_open_socket(void) return -ENOMEM; } - ret = sock_create_kern(AF_RXRPC, SOCK_DGRAM, PF_INET, &socket); + ret = sock_create_kern(&init_net, AF_RXRPC, SOCK_DGRAM, PF_INET, &socket); if (ret < 0) { destroy_workqueue(afs_async_calls); _leave(" = %d [socket]", ret); @@ -770,15 +770,12 @@ static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb, void afs_send_empty_reply(struct afs_call *call) { struct msghdr msg; - struct kvec iov[1]; _enter(""); - iov[0].iov_base = NULL; - iov[0].iov_len = 0; msg.msg_name = NULL; msg.msg_namelen = 0; - iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, iov, 0, 0); /* WTF? 
*/ + iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, NULL, 0, 0); msg.msg_control = NULL; msg.msg_controllen = 0; msg.msg_flags = 0; diff --git a/fs/afs/super.c b/fs/afs/super.c index c4861557e385..1fb4a5129f7d 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -529,7 +529,7 @@ static void afs_destroy_inode(struct inode *inode) static int afs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct afs_volume_status vs; - struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode); + struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry)); struct key *key; int ret; diff --git a/fs/afs/write.c b/fs/afs/write.c index c13cb08964ed..0714abcd7f32 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -14,7 +14,6 @@ #include <linux/pagemap.h> #include <linux/writeback.h> #include <linux/pagevec.h> -#include <linux/aio.h> #include "internal.h" static int afs_write_back_from_locked_page(struct afs_writeback *wb, diff --git a/fs/aio.c b/fs/aio.c --- a/fs/aio.c +++ b/fs/aio.c @@ -77,6 +77,11 @@ struct kioctx_cpu { unsigned reqs_available; }; +struct ctx_rq_wait { + struct completion comp; + atomic_t count; +}; + struct kioctx { struct percpu_ref users; atomic_t dead; @@ -115,7 +120,7 @@ struct kioctx { /* * signals when all in-flight requests are done */ - struct completion *requests_done; + struct ctx_rq_wait *rq_wait; struct { /* @@ -151,6 +156,38 @@ struct kioctx { unsigned id; }; +/* + * We use ki_cancel == KIOCB_CANCELLED to indicate that a kiocb has been either + * cancelled or completed (this makes a certain amount of sense because + * successful cancellation - io_cancel() - does deliver the completion to + * userspace). + * + * And since most things don't implement kiocb cancellation and we'd really like + * kiocb completion to be lockless when possible, we use ki_cancel to + * synchronize cancellation and completion - we only set it to KIOCB_CANCELLED + * with xchg() or cmpxchg(), see batch_complete_aio() and kiocb_cancel(). + */ +#define KIOCB_CANCELLED ((void *) (~0ULL)) + +struct aio_kiocb { + struct kiocb common; + + struct kioctx *ki_ctx; + kiocb_cancel_fn *ki_cancel; + + struct iocb __user *ki_user_iocb; /* user's aiocb */ + __u64 ki_user_data; /* user's data for completion */ + + struct list_head ki_list; /* the aio core uses this + * for cancellation */ + + /* + * If the aio_resfd field of the userspace iocb is not zero, + * this is the underlying eventfd context to deliver events to.
+ */ + struct eventfd_ctx *ki_eventfd; +}; + /*------ sysctl variables----*/ static DEFINE_SPINLOCK(aio_nr_lock); unsigned long aio_nr; /* current system wide number of aio requests */ @@ -220,7 +257,7 @@ static int __init aio_setup(void) if (IS_ERR(aio_mnt)) panic("Failed to create aio fs mount."); - kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); + kiocb_cachep = KMEM_CACHE(aio_kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); pr_debug("sizeof(struct page) = %zu\n", sizeof(struct page)); @@ -278,11 +315,11 @@ static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma) return 0; } -static void aio_ring_remap(struct file *file, struct vm_area_struct *vma) +static int aio_ring_remap(struct file *file, struct vm_area_struct *vma) { struct mm_struct *mm = vma->vm_mm; struct kioctx_table *table; - int i; + int i, res = -EINVAL; spin_lock(&mm->ioctx_lock); rcu_read_lock(); @@ -292,13 +329,17 @@ static void aio_ring_remap(struct file *file, struct vm_area_struct *vma) ctx = table->table[i]; if (ctx && ctx->aio_ring_file == file) { - ctx->user_id = ctx->mmap_base = vma->vm_start; + if (!atomic_read(&ctx->dead)) { + ctx->user_id = ctx->mmap_base = vma->vm_start; + res = 0; + } break; } } rcu_read_unlock(); spin_unlock(&mm->ioctx_lock); + return res; } static const struct file_operations aio_ring_fops = { @@ -480,8 +521,9 @@ static int aio_setup_ring(struct kioctx *ctx) #define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event)) #define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE) -void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel) +void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel) { + struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, common); struct kioctx *ctx = req->ki_ctx; unsigned long flags; @@ -496,7 +538,7 @@ void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel) } EXPORT_SYMBOL(kiocb_set_cancel_fn); -static int kiocb_cancel(struct kiocb *kiocb) +static int kiocb_cancel(struct aio_kiocb *kiocb) { kiocb_cancel_fn *old, *cancel; @@ -514,7 +556,7 @@ static int kiocb_cancel(struct kiocb *kiocb) cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED); } while (cancel != old); - return cancel(kiocb); + return cancel(&kiocb->common); } static void free_ioctx(struct work_struct *work) @@ -535,8 +577,8 @@ static void free_ioctx_reqs(struct percpu_ref *ref) struct kioctx *ctx = container_of(ref, struct kioctx, reqs); /* At this point we know that there are no any in-flight requests */ - if (ctx->requests_done) - complete(ctx->requests_done); + if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count)) + complete(&ctx->rq_wait->comp); INIT_WORK(&ctx->free_work, free_ioctx); schedule_work(&ctx->free_work); @@ -550,13 +592,13 @@ static void free_ioctx_reqs(struct percpu_ref *ref) static void free_ioctx_users(struct percpu_ref *ref) { struct kioctx *ctx = container_of(ref, struct kioctx, users); - struct kiocb *req; + struct aio_kiocb *req; spin_lock_irq(&ctx->ctx_lock); while (!list_empty(&ctx->active_reqs)) { req = list_first_entry(&ctx->active_reqs, - struct kiocb, ki_list); + struct aio_kiocb, ki_list); list_del_init(&req->ki_list); kiocb_cancel(req); @@ -655,8 +697,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) nr_events *= 2; /* Prevent overflows */ - if ((nr_events > (0x10000000U / sizeof(struct io_event))) || - (nr_events > (0x10000000U / sizeof(struct kiocb)))) { + if (nr_events 
> (0x10000000U / sizeof(struct io_event))) { pr_debug("ENOMEM: nr_events too high\n"); return ERR_PTR(-EINVAL); } @@ -727,6 +768,9 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) err_cleanup: aio_nr_sub(ctx->max_reqs); err_ctx: + atomic_set(&ctx->dead, 1); + if (ctx->mmap_size) + vm_munmap(ctx->mmap_base, ctx->mmap_size); aio_free_ring(ctx); err: mutex_unlock(&ctx->ring_lock); @@ -744,15 +788,16 @@ err: * the rapid destruction of the kioctx. */ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx, - struct completion *requests_done) + struct ctx_rq_wait *wait) { struct kioctx_table *table; - if (atomic_xchg(&ctx->dead, 1)) + spin_lock(&mm->ioctx_lock); + if (atomic_xchg(&ctx->dead, 1)) { + spin_unlock(&mm->ioctx_lock); return -EINVAL; + } - - spin_lock(&mm->ioctx_lock); table = rcu_dereference_raw(mm->ioctx_table); WARN_ON(ctx != table->table[ctx->id]); table->table[ctx->id] = NULL; @@ -773,27 +818,11 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx, if (ctx->mmap_size) vm_munmap(ctx->mmap_base, ctx->mmap_size); - ctx->requests_done = requests_done; + ctx->rq_wait = wait; percpu_ref_kill(&ctx->users); return 0; } -/* wait_on_sync_kiocb: - * Waits on the given sync kiocb to complete. - */ -ssize_t wait_on_sync_kiocb(struct kiocb *req) -{ - while (!req->ki_ctx) { - set_current_state(TASK_UNINTERRUPTIBLE); - if (req->ki_ctx) - break; - io_schedule(); - } - __set_current_state(TASK_RUNNING); - return req->ki_user_data; -} -EXPORT_SYMBOL(wait_on_sync_kiocb); - /* * exit_aio: called when the last user of mm goes away. At this point, there is * no way for any new requests to be submited or any of the io_* syscalls to be @@ -805,18 +834,24 @@ EXPORT_SYMBOL(wait_on_sync_kiocb); void exit_aio(struct mm_struct *mm) { struct kioctx_table *table = rcu_dereference_raw(mm->ioctx_table); - int i; + struct ctx_rq_wait wait; + int i, skipped; if (!table) return; + atomic_set(&wait.count, table->nr); + init_completion(&wait.comp); + + skipped = 0; for (i = 0; i < table->nr; ++i) { struct kioctx *ctx = table->table[i]; - struct completion requests_done = - COMPLETION_INITIALIZER_ONSTACK(requests_done); - if (!ctx) + if (!ctx) { + skipped++; continue; + } + /* * We don't need to bother with munmap() here - exit_mmap(mm) * is coming and it'll unmap everything. And we simply can't, @@ -825,10 +860,12 @@ void exit_aio(struct mm_struct *mm) * that it needs to unmap the area, just set it to 0. */ ctx->mmap_size = 0; - kill_ioctx(mm, ctx, &requests_done); + kill_ioctx(mm, ctx, &wait); + } + if (!atomic_sub_and_test(skipped, &wait.count)) { /* Wait until all IO for the context are done. */ - wait_for_completion(&requests_done); + wait_for_completion(&wait.comp); } RCU_INIT_POINTER(mm->ioctx_table, NULL); @@ -948,9 +985,9 @@ static void user_refill_reqs_available(struct kioctx *ctx) * Allocate a slot for an aio request. * Returns NULL if no requests are free. 
*/ -static inline struct kiocb *aio_get_req(struct kioctx *ctx) +static inline struct aio_kiocb *aio_get_req(struct kioctx *ctx) { - struct kiocb *req; + struct aio_kiocb *req; if (!get_reqs_available(ctx)) { user_refill_reqs_available(ctx); @@ -971,10 +1008,10 @@ out_put: return NULL; } -static void kiocb_free(struct kiocb *req) +static void kiocb_free(struct aio_kiocb *req) { - if (req->ki_filp) - fput(req->ki_filp); + if (req->common.ki_filp) + fput(req->common.ki_filp); if (req->ki_eventfd != NULL) eventfd_ctx_put(req->ki_eventfd); kmem_cache_free(kiocb_cachep, req); @@ -1010,8 +1047,9 @@ out: /* aio_complete * Called when the io request on the given iocb is complete. */ -void aio_complete(struct kiocb *iocb, long res, long res2) +static void aio_complete(struct kiocb *kiocb, long res, long res2) { + struct aio_kiocb *iocb = container_of(kiocb, struct aio_kiocb, common); struct kioctx *ctx = iocb->ki_ctx; struct aio_ring *ring; struct io_event *ev_page, *event; @@ -1025,13 +1063,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2) * ref, no other paths have a way to get another ref * - the sync task helpfully left a reference to itself in the iocb */ - if (is_sync_kiocb(iocb)) { - iocb->ki_user_data = res; - smp_wmb(); - iocb->ki_ctx = ERR_PTR(-EXDEV); - wake_up_process(iocb->ki_obj.tsk); - return; - } + BUG_ON(is_sync_kiocb(kiocb)); if (iocb->ki_list.next) { unsigned long flags; @@ -1057,7 +1089,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2) ev_page = kmap_atomic(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); event = ev_page + pos % AIO_EVENTS_PER_PAGE; - event->obj = (u64)(unsigned long)iocb->ki_obj.user; + event->obj = (u64)(unsigned long)iocb->ki_user_iocb; event->data = iocb->ki_user_data; event->res = res; event->res2 = res2; @@ -1066,7 +1098,7 @@ void aio_complete(struct kiocb *iocb, long res, long res2) flush_dcache_page(ctx->ring_pages[pos / AIO_EVENTS_PER_PAGE]); pr_debug("%p[%u]: %p: %p %Lx %lx %lx\n", - ctx, tail, iocb, iocb->ki_obj.user, iocb->ki_user_data, + ctx, tail, iocb, iocb->ki_user_iocb, iocb->ki_user_data, res, res2); /* after flagging the request as done, we @@ -1113,7 +1145,6 @@ void aio_complete(struct kiocb *iocb, long res, long res2) percpu_ref_put(&ctx->reqs); } -EXPORT_SYMBOL(aio_complete); /* aio_read_events_ring * Pull an event off of the ioctx's event ring. Returns the number of @@ -1313,15 +1344,17 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) { struct kioctx *ioctx = lookup_ioctx(ctx); if (likely(NULL != ioctx)) { - struct completion requests_done = - COMPLETION_INITIALIZER_ONSTACK(requests_done); + struct ctx_rq_wait wait; int ret; + init_completion(&wait.comp); + atomic_set(&wait.count, 1); + /* Pass requests_done to kill_ioctx() where it can be set * in a thread-safe way. If we try to set it here then we have * a race condition if two io_destroy() called simultaneously. */ - ret = kill_ioctx(current->mm, ioctx, &requests_done); + ret = kill_ioctx(current->mm, ioctx, &wait); percpu_ref_put(&ioctx->users); /* Wait until all IO for the context are done. Otherwise kernel @@ -1329,7 +1362,7 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) * is destroyed. 
*/ if (!ret) - wait_for_completion(&requests_done); + wait_for_completion(&wait.comp); return ret; } @@ -1337,50 +1370,21 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) return -EINVAL; } -typedef ssize_t (aio_rw_op)(struct kiocb *, const struct iovec *, - unsigned long, loff_t); typedef ssize_t (rw_iter_op)(struct kiocb *, struct iov_iter *); -static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb, - int rw, char __user *buf, - unsigned long *nr_segs, - struct iovec **iovec, - bool compat) +static int aio_setup_vectored_rw(int rw, char __user *buf, size_t len, + struct iovec **iovec, + bool compat, + struct iov_iter *iter) { - ssize_t ret; - - *nr_segs = kiocb->ki_nbytes; - #ifdef CONFIG_COMPAT if (compat) - ret = compat_rw_copy_check_uvector(rw, + return compat_import_iovec(rw, (struct compat_iovec __user *)buf, - *nr_segs, UIO_FASTIOV, *iovec, iovec); - else + len, UIO_FASTIOV, iovec, iter); #endif - ret = rw_copy_check_uvector(rw, - (struct iovec __user *)buf, - *nr_segs, UIO_FASTIOV, *iovec, iovec); - if (ret < 0) - return ret; - - /* ki_nbytes now reflect bytes instead of segs */ - kiocb->ki_nbytes = ret; - return 0; -} - -static ssize_t aio_setup_single_vector(struct kiocb *kiocb, - int rw, char __user *buf, - unsigned long *nr_segs, - struct iovec *iovec) -{ - if (unlikely(!access_ok(!rw, buf, kiocb->ki_nbytes))) - return -EFAULT; - - iovec->iov_base = buf; - iovec->iov_len = kiocb->ki_nbytes; - *nr_segs = 1; - return 0; + return import_iovec(rw, (struct iovec __user *)buf, + len, UIO_FASTIOV, iovec, iter); } /* @@ -1388,14 +1392,12 @@ static ssize_t aio_setup_single_vector(struct kiocb *kiocb, * Performs the initial checks and io submission. */ static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode, - char __user *buf, bool compat) + char __user *buf, size_t len, bool compat) { struct file *file = req->ki_filp; ssize_t ret; - unsigned long nr_segs; int rw; fmode_t mode; - aio_rw_op *rw_op; rw_iter_op *iter_op; struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; struct iov_iter iter; @@ -1405,7 +1407,6 @@ static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode, case IOCB_CMD_PREADV: mode = FMODE_READ; rw = READ; - rw_op = file->f_op->aio_read; iter_op = file->f_op->read_iter; goto rw_common; @@ -1413,51 +1414,40 @@ static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode, case IOCB_CMD_PWRITEV: mode = FMODE_WRITE; rw = WRITE; - rw_op = file->f_op->aio_write; iter_op = file->f_op->write_iter; goto rw_common; rw_common: if (unlikely(!(file->f_mode & mode))) return -EBADF; - if (!rw_op && !iter_op) + if (!iter_op) return -EINVAL; - ret = (opcode == IOCB_CMD_PREADV || - opcode == IOCB_CMD_PWRITEV) - ? aio_setup_vectored_rw(req, rw, buf, &nr_segs, - &iovec, compat) - : aio_setup_single_vector(req, rw, buf, &nr_segs, - iovec); + if (opcode == IOCB_CMD_PREADV || opcode == IOCB_CMD_PWRITEV) + ret = aio_setup_vectored_rw(rw, buf, len, + &iovec, compat, &iter); + else { + ret = import_single_range(rw, buf, len, iovec, &iter); + iovec = NULL; + } if (!ret) - ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes); + ret = rw_verify_area(rw, file, &req->ki_pos, + iov_iter_count(&iter)); if (ret < 0) { - if (iovec != inline_vecs) - kfree(iovec); + kfree(iovec); return ret; } - req->ki_nbytes = ret; - - /* XXX: move/kill - rw_verify_area()? 
*/ - /* This matches the pread()/pwrite() logic */ - if (req->ki_pos < 0) { - ret = -EINVAL; - break; - } + len = ret; if (rw == WRITE) file_start_write(file); - if (iter_op) { - iov_iter_init(&iter, rw, iovec, nr_segs, req->ki_nbytes); - ret = iter_op(req, &iter); - } else { - ret = rw_op(req, iovec, nr_segs, req->ki_pos); - } + ret = iter_op(req, &iter); if (rw == WRITE) file_end_write(file); + kfree(iovec); break; case IOCB_CMD_FDSYNC: @@ -1479,9 +1469,6 @@ rw_common: return -EINVAL; } - if (iovec != inline_vecs) - kfree(iovec); - if (ret != -EIOCBQUEUED) { /* * There's no easy way to restart the syscall since other AIO's @@ -1500,7 +1487,7 @@ rw_common: static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, struct iocb *iocb, bool compat) { - struct kiocb *req; + struct aio_kiocb *req; ssize_t ret; /* enforce forwards compatibility on users */ @@ -1523,11 +1510,14 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, if (unlikely(!req)) return -EAGAIN; - req->ki_filp = fget(iocb->aio_fildes); - if (unlikely(!req->ki_filp)) { + req->common.ki_filp = fget(iocb->aio_fildes); + if (unlikely(!req->common.ki_filp)) { ret = -EBADF; goto out_put_req; } + req->common.ki_pos = iocb->aio_offset; + req->common.ki_complete = aio_complete; + req->common.ki_flags = iocb_flags(req->common.ki_filp); if (iocb->aio_flags & IOCB_FLAG_RESFD) { /* @@ -1542,6 +1532,8 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, req->ki_eventfd = NULL; goto out_put_req; } + + req->common.ki_flags |= IOCB_EVENTFD; } ret = put_user(KIOCB_KEY, &user_iocb->aio_key); @@ -1550,13 +1542,12 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, goto out_put_req; } - req->ki_obj.user = user_iocb; + req->ki_user_iocb = user_iocb; req->ki_user_data = iocb->aio_data; - req->ki_pos = iocb->aio_offset; - req->ki_nbytes = iocb->aio_nbytes; - ret = aio_run_iocb(req, iocb->aio_lio_opcode, + ret = aio_run_iocb(&req->common, iocb->aio_lio_opcode, (char __user *)(unsigned long)iocb->aio_buf, + iocb->aio_nbytes, compat); if (ret) goto out_put_req; @@ -1643,10 +1634,10 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr, /* lookup_kiocb * Finds a given iocb for cancellation. */ -static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, - u32 key) +static struct aio_kiocb * +lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, u32 key) { - struct list_head *pos; + struct aio_kiocb *kiocb; assert_spin_locked(&ctx->ctx_lock); @@ -1654,9 +1645,8 @@ static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb, return NULL; /* TODO: use a hash or array, this sucks. 
*/ - list_for_each(pos, &ctx->active_reqs) { - struct kiocb *kiocb = list_kiocb(pos); - if (kiocb->ki_obj.user == iocb) + list_for_each_entry(kiocb, &ctx->active_reqs, ki_list) { + if (kiocb->ki_user_iocb == iocb) return kiocb; } return NULL; @@ -1676,7 +1666,7 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, struct io_event __user *, result) { struct kioctx *ctx; - struct kiocb *kiocb; + struct aio_kiocb *kiocb; u32 key; int ret; diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 8e98cf954bab..c37149b929be 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -213,7 +213,7 @@ void autofs4_clean_ino(struct autofs_info *); static inline int autofs_prepare_pipe(struct file *pipe) { - if (!pipe->f_op->write) + if (!(pipe->f_mode & FMODE_CAN_WRITE)) return -EINVAL; if (!S_ISFIFO(file_inode(pipe)->i_mode)) return -EINVAL; @@ -235,12 +235,7 @@ static inline u32 autofs4_get_dev(struct autofs_sb_info *sbi) static inline u64 autofs4_get_ino(struct autofs_sb_info *sbi) { - return sbi->sb->s_root->d_inode->i_ino; -} - -static inline int simple_positive(struct dentry *dentry) -{ - return dentry->d_inode && !d_unhashed(dentry); + return d_inode(sbi->sb->s_root)->i_ino; } static inline void __autofs4_add_expiring(struct dentry *dentry) diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 11dd118f75e2..1cebc3c52fa5 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -374,7 +374,7 @@ static struct dentry *should_expire(struct dentry *dentry, return NULL; } - if (dentry->d_inode && d_is_symlink(dentry)) { + if (d_really_is_positive(dentry) && d_is_symlink(dentry)) { DPRINTK("checking symlink %p %pd", dentry, dentry); /* * A symlink can't be "busy" in the usual sense so diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 1c55388ae633..a3ae0b2aeb5a 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -71,7 +71,7 @@ void autofs4_kill_sb(struct super_block *sb) static int autofs4_show_options(struct seq_file *m, struct dentry *root) { struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb); - struct inode *root_inode = root->d_sb->s_root->d_inode; + struct inode *root_inode = d_inode(root->d_sb->s_root); if (!sbi) return 0; @@ -352,8 +352,8 @@ struct inode *autofs4_get_inode(struct super_block *sb, umode_t mode) inode->i_mode = mode; if (sb->s_root) { - inode->i_uid = sb->s_root->d_inode->i_uid; - inode->i_gid = sb->s_root->d_inode->i_gid; + inode->i_uid = d_inode(sb->s_root)->i_uid; + inode->i_gid = d_inode(sb->s_root)->i_gid; } inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_ino = get_next_ino(); diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 7e44fdd03e2d..c6d7d3dbd52a 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -240,7 +240,7 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry, spin_lock(&expiring->d_lock); /* We've already been dentry_iput or unlinked */ - if (!expiring->d_inode) + if (d_really_is_negative(expiring)) goto next; qstr = &expiring->d_name; @@ -371,7 +371,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path) * having d_mountpoint() true, so there's no need to call back * to the daemon. 
*/ - if (dentry->d_inode && d_is_symlink(dentry)) { + if (d_really_is_positive(dentry) && d_is_symlink(dentry)) { spin_unlock(&sbi->fs_lock); goto done; } @@ -459,7 +459,7 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) return 0; if (d_mountpoint(dentry)) return 0; - inode = ACCESS_ONCE(dentry->d_inode); + inode = d_inode_rcu(dentry); if (inode && S_ISLNK(inode->i_mode)) return -EISDIR; if (list_empty(&dentry->d_subdirs)) @@ -485,7 +485,7 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) * an incorrect ELOOP error return. */ if ((!d_mountpoint(dentry) && !simple_empty(dentry)) || - (dentry->d_inode && d_is_symlink(dentry))) + (d_really_is_positive(dentry) && d_is_symlink(dentry))) status = -EISDIR; } spin_unlock(&sbi->fs_lock); @@ -625,8 +625,8 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry) } dput(ino->dentry); - dentry->d_inode->i_size = 0; - clear_nlink(dentry->d_inode); + d_inode(dentry)->i_size = 0; + clear_nlink(d_inode(dentry)); dir->i_mtime = CURRENT_TIME; @@ -719,8 +719,8 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) atomic_dec(&p_ino->count); } dput(ino->dentry); - dentry->d_inode->i_size = 0; - clear_nlink(dentry->d_inode); + d_inode(dentry)->i_size = 0; + clear_nlink(d_inode(dentry)); if (dir->i_nlink) drop_nlink(dir); @@ -839,7 +839,7 @@ static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p) */ int is_autofs4_dentry(struct dentry *dentry) { - return dentry && dentry->d_inode && + return dentry && d_really_is_positive(dentry) && dentry->d_op == &autofs4_dentry_operations && dentry->d_fsdata != NULL; } diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c index 1e8ea192be2b..da0c33481bc0 100644 --- a/fs/autofs4/symlink.c +++ b/fs/autofs4/symlink.c @@ -12,14 +12,13 @@ #include "autofs_i.h" -static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *autofs4_follow_link(struct dentry *dentry, void **cookie) { struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); if (ino && !autofs4_oz_mode(sbi)) ino->last_used = jiffies; - nd_set_link(nd, dentry->d_inode->i_private); - return NULL; + return d_inode(dentry)->i_private; } const struct inode_operations autofs4_symlink_inode_operations = { diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 116fd38ee472..35b755e79c2d 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -70,7 +70,7 @@ static int autofs4_write(struct autofs_sb_info *sbi, mutex_lock(&sbi->pipe_mutex); while (bytes && - (wr = file->f_op->write(file,data,bytes,&file->f_pos)) > 0) { + (wr = __vfs_write(file,data,bytes,&file->f_pos)) > 0) { data += wr; bytes -= wr; } @@ -322,7 +322,7 @@ static int validate_request(struct autofs_wait_queue **wait, * continue on and create a new request. */ if (!IS_ROOT(dentry)) { - if (dentry->d_inode && d_unhashed(dentry)) { + if (d_really_is_positive(dentry) && d_unhashed(dentry)) { struct dentry *parent = dentry->d_parent; new = d_lookup(parent, &dentry->d_name); if (new) @@ -364,7 +364,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, if (pid == 0 || tgid == 0) return -ENOENT; - if (!dentry->d_inode) { + if (d_really_is_negative(dentry)) { /* * A wait for a negative dentry is invalid for certain * cases. 
A direct or offset mount "always" has its mount diff --git a/fs/befs/befs.h b/fs/befs/befs.h index 3a7813ab8c95..35d19e8731e3 100644 --- a/fs/befs/befs.h +++ b/fs/befs/befs.h @@ -19,16 +19,16 @@ typedef u64 befs_blocknr_t; * BeFS in memory structures */ -typedef struct befs_mount_options { +struct befs_mount_options { kgid_t gid; kuid_t uid; int use_gid; int use_uid; int debug; char *iocharset; -} befs_mount_options; +}; -typedef struct befs_sb_info { +struct befs_sb_info { u32 magic1; u32 block_size; u32 block_shift; @@ -52,12 +52,11 @@ typedef struct befs_sb_info { befs_inode_addr indices; u32 magic3; - befs_mount_options mount_opts; + struct befs_mount_options mount_opts; struct nls_table *nls; +}; -} befs_sb_info; - -typedef struct befs_inode_info { +struct befs_inode_info { u32 i_flags; u32 i_type; @@ -71,8 +70,7 @@ typedef struct befs_inode_info { } i_data; struct inode vfs_inode; - -} befs_inode_info; +}; enum befs_err { BEFS_OK, @@ -105,16 +103,16 @@ void befs_dump_index_node(const struct super_block *sb, befs_btree_nodehead *); /* Gets a pointer to the private portion of the super_block * structure from the public part */ -static inline befs_sb_info * +static inline struct befs_sb_info * BEFS_SB(const struct super_block *super) { - return (befs_sb_info *) super->s_fs_info; + return (struct befs_sb_info *) super->s_fs_info; } -static inline befs_inode_info * +static inline struct befs_inode_info * BEFS_I(const struct inode *inode) { - return list_entry(inode, struct befs_inode_info, vfs_inode); + return container_of(inode, struct befs_inode_info, vfs_inode); } static inline befs_blocknr_t diff --git a/fs/befs/btree.c b/fs/befs/btree.c index 0826e91dacda..22c166280883 100644 --- a/fs/befs/btree.c +++ b/fs/befs/btree.c @@ -137,8 +137,8 @@ static int befs_bt_read_super(struct super_block *sb, befs_data_stream * ds, befs_btree_super * sup) { - struct buffer_head *bh = NULL; - befs_disk_btree_super *od_sup = NULL; + struct buffer_head *bh; + befs_disk_btree_super *od_sup; befs_debug(sb, "---> %s", __func__); @@ -250,7 +250,7 @@ int befs_btree_find(struct super_block *sb, befs_data_stream * ds, const char *key, befs_off_t * value) { - struct befs_btree_node *this_node = NULL; + struct befs_btree_node *this_node; befs_btree_super bt_super; befs_off_t node_off; int res; diff --git a/fs/befs/datastream.c b/fs/befs/datastream.c index 1e8e0b8d8836..ebd50718659f 100644 --- a/fs/befs/datastream.c +++ b/fs/befs/datastream.c @@ -168,7 +168,7 @@ befs_count_blocks(struct super_block * sb, befs_data_stream * ds) befs_blocknr_t blocks; befs_blocknr_t datablocks; /* File data blocks */ befs_blocknr_t metablocks; /* FS metadata blocks */ - befs_sb_info *befs_sb = BEFS_SB(sb); + struct befs_sb_info *befs_sb = BEFS_SB(sb); befs_debug(sb, "---> %s", __func__); @@ -428,7 +428,7 @@ befs_find_brun_dblindirect(struct super_block *sb, struct buffer_head *indir_block; befs_block_run indir_run; befs_disk_inode_addr *iaddr_array = NULL; - befs_sb_info *befs_sb = BEFS_SB(sb); + struct befs_sb_info *befs_sb = BEFS_SB(sb); befs_blocknr_t indir_start_blk = data->max_indirect_range >> befs_sb->block_shift; diff --git a/fs/befs/io.c b/fs/befs/io.c index 0408a3d601d0..7a5b4ec21c56 100644 --- a/fs/befs/io.c +++ b/fs/befs/io.c @@ -28,7 +28,7 @@ befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr) { struct buffer_head *bh = NULL; befs_blocknr_t block = 0; - befs_sb_info *befs_sb = BEFS_SB(sb); + struct befs_sb_info *befs_sb = BEFS_SB(sb); befs_debug(sb, "---> Enter %s " "[%u, %hu, %hu]", __func__, 
iaddr.allocation_group, diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index e089f1985fca..46aedacfa6a8 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -42,8 +42,7 @@ static struct inode *befs_iget(struct super_block *, unsigned long); static struct inode *befs_alloc_inode(struct super_block *sb); static void befs_destroy_inode(struct inode *inode); static void befs_destroy_inodecache(void); -static void *befs_follow_link(struct dentry *, struct nameidata *); -static void *befs_fast_follow_link(struct dentry *, struct nameidata *); +static const char *befs_follow_link(struct dentry *, void **); static int befs_utf2nls(struct super_block *sb, const char *in, int in_len, char **out, int *out_len); static int befs_nls2utf(struct super_block *sb, const char *in, int in_len, @@ -51,7 +50,7 @@ static int befs_nls2utf(struct super_block *sb, const char *in, int in_len, static void befs_put_super(struct super_block *); static int befs_remount(struct super_block *, int *, char *); static int befs_statfs(struct dentry *, struct kstatfs *); -static int parse_options(char *, befs_mount_options *); +static int parse_options(char *, struct befs_mount_options *); static const struct super_operations befs_sops = { .alloc_inode = befs_alloc_inode, /* allocate a new inode */ @@ -80,11 +79,6 @@ static const struct address_space_operations befs_aops = { .bmap = befs_bmap, }; -static const struct inode_operations befs_fast_symlink_inode_operations = { - .readlink = generic_readlink, - .follow_link = befs_fast_follow_link, -}; - static const struct inode_operations befs_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = befs_follow_link, @@ -304,9 +298,8 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) { struct buffer_head *bh = NULL; befs_inode *raw_inode = NULL; - - befs_sb_info *befs_sb = BEFS_SB(sb); - befs_inode_info *befs_ino = NULL; + struct befs_sb_info *befs_sb = BEFS_SB(sb); + struct befs_inode_info *befs_ino = NULL; struct inode *inode; long ret = -EIO; @@ -404,10 +397,12 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) inode->i_op = &befs_dir_inode_operations; inode->i_fop = &befs_dir_operations; } else if (S_ISLNK(inode->i_mode)) { - if (befs_ino->i_flags & BEFS_LONG_SYMLINK) + if (befs_ino->i_flags & BEFS_LONG_SYMLINK) { inode->i_op = &befs_symlink_inode_operations; - else - inode->i_op = &befs_fast_symlink_inode_operations; + } else { + inode->i_link = befs_ino->i_data.symlink; + inode->i_op = &simple_symlink_inode_operations; + } } else { befs_error(sb, "Inode %lu is not a regular file, " "directory or symlink. THAT IS WRONG! BeFS has no " @@ -468,43 +463,31 @@ befs_destroy_inodecache(void) * The data stream become link name. Unless the LONG_SYMLINK * flag is set. 
*/ -static void * -befs_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char * +befs_follow_link(struct dentry *dentry, void **cookie) { struct super_block *sb = dentry->d_sb; - befs_inode_info *befs_ino = BEFS_I(dentry->d_inode); + struct befs_inode_info *befs_ino = BEFS_I(d_inode(dentry)); befs_data_stream *data = &befs_ino->i_data.ds; befs_off_t len = data->size; char *link; if (len == 0) { befs_error(sb, "Long symlink with illegal length"); - link = ERR_PTR(-EIO); - } else { - befs_debug(sb, "Follow long symlink"); - - link = kmalloc(len, GFP_NOFS); - if (!link) { - link = ERR_PTR(-ENOMEM); - } else if (befs_read_lsymlink(sb, data, link, len) != len) { - kfree(link); - befs_error(sb, "Failed to read entire long symlink"); - link = ERR_PTR(-EIO); - } else { - link[len - 1] = '\0'; - } + return ERR_PTR(-EIO); } - nd_set_link(nd, link); - return NULL; -} - + befs_debug(sb, "Follow long symlink"); -static void * -befs_fast_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - befs_inode_info *befs_ino = BEFS_I(dentry->d_inode); - nd_set_link(nd, befs_ino->i_data.symlink); - return NULL; + link = kmalloc(len, GFP_NOFS); + if (!link) + return ERR_PTR(-ENOMEM); + if (befs_read_lsymlink(sb, data, link, len) != len) { + kfree(link); + befs_error(sb, "Failed to read entire long symlink"); + return ERR_PTR(-EIO); + } + link[len - 1] = '\0'; + return *cookie = link; } /* @@ -669,7 +652,7 @@ static const match_table_t befs_tokens = { }; static int -parse_options(char *options, befs_mount_options * opts) +parse_options(char *options, struct befs_mount_options *opts) { char *p; substring_t args[MAX_OPT_ARGS]; @@ -769,7 +752,7 @@ static int befs_fill_super(struct super_block *sb, void *data, int silent) { struct buffer_head *bh; - befs_sb_info *befs_sb; + struct befs_sb_info *befs_sb; befs_super_block *disk_sb; struct inode *root; long ret = -EINVAL; diff --git a/fs/befs/super.c b/fs/befs/super.c index ca40f828f64d..aeafc4d84278 100644 --- a/fs/befs/super.c +++ b/fs/befs/super.c @@ -24,7 +24,7 @@ int befs_load_sb(struct super_block *sb, befs_super_block * disk_sb) { - befs_sb_info *befs_sb = BEFS_SB(sb); + struct befs_sb_info *befs_sb = BEFS_SB(sb); /* Check the byte order of the filesystem */ if (disk_sb->fs_byte_order == BEFS_BYTEORDER_NATIVE_LE) @@ -59,7 +59,7 @@ befs_load_sb(struct super_block *sb, befs_super_block * disk_sb) int befs_check_sb(struct super_block *sb) { - befs_sb_info *befs_sb = BEFS_SB(sb); + struct befs_sb_info *befs_sb = BEFS_SB(sb); /* Check magic headers of super block */ if ((befs_sb->magic1 != BEFS_SUPER_MAGIC1) diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index 08063ae0a17c..3ec6113146c0 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -86,7 +86,7 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, inode = new_inode(s); if (!inode) - return -ENOSPC; + return -ENOMEM; mutex_lock(&info->bfs_lock); ino = find_first_zero_bit(info->si_imap, info->si_lasti + 1); if (ino > info->si_lasti) { @@ -153,7 +153,7 @@ static struct dentry *bfs_lookup(struct inode *dir, struct dentry *dentry, static int bfs_link(struct dentry *old, struct inode *dir, struct dentry *new) { - struct inode *inode = old->d_inode; + struct inode *inode = d_inode(old); struct bfs_sb_info *info = BFS_SB(inode->i_sb); int err; @@ -176,7 +176,7 @@ static int bfs_link(struct dentry *old, struct inode *dir, static int bfs_unlink(struct inode *dir, struct dentry *dentry) { int error = -ENOENT; - struct inode *inode = dentry->d_inode; + struct inode *inode = 
d_inode(dentry); struct buffer_head *bh; struct bfs_dirent *de; struct bfs_sb_info *info = BFS_SB(inode->i_sb); @@ -216,7 +216,7 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry, int error = -ENOENT; old_bh = new_bh = NULL; - old_inode = old_dentry->d_inode; + old_inode = d_inode(old_dentry); if (S_ISDIR(old_inode->i_mode)) return -EINVAL; @@ -231,7 +231,7 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto end_rename; error = -EPERM; - new_inode = new_dentry->d_inode; + new_inode = d_inode(new_dentry); new_bh = bfs_find_entry(new_dir, new_dentry->d_name.name, new_dentry->d_name.len, &new_de); @@ -293,7 +293,7 @@ static int bfs_add_entry(struct inode *dir, const unsigned char *name, for (block = sblock; block <= eblock; block++) { bh = sb_bread(dir->i_sb, block); if (!bh) - return -ENOSPC; + return -EIO; for (off = 0; off < BFS_BSIZE; off += BFS_DIRENT_SIZE) { de = (struct bfs_dirent *)(bh->b_data + off); if (!de->ino) { diff --git a/fs/bfs/file.c b/fs/bfs/file.c index e7f88ace1a25..97f1b5160155 100644 --- a/fs/bfs/file.c +++ b/fs/bfs/file.c @@ -23,9 +23,7 @@ const struct file_operations bfs_file_operations = { .llseek = generic_file_llseek, - .read = new_sync_read, .read_iter = generic_file_read_iter, - .write = new_sync_write, .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .splice_read = generic_file_splice_read, diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 90bc079d9982..fdcb4d69f430 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -15,6 +15,7 @@ #include <linux/buffer_head.h> #include <linux/vfs.h> #include <linux/writeback.h> +#include <linux/uio.h> #include <asm/uaccess.h> #include "bfs.h" diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 995986b8e36b..6b659967898e 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -31,6 +31,7 @@ #include <linux/security.h> #include <linux/random.h> #include <linux/elf.h> +#include <linux/elf-randomize.h> #include <linux/utsname.h> #include <linux/coredump.h> #include <linux/sched.h> @@ -862,6 +863,7 @@ static int load_elf_binary(struct linux_binprm *bprm) i < loc->elf_ex.e_phnum; i++, elf_ppnt++) { int elf_prot = 0, elf_flags; unsigned long k, vaddr; + unsigned long total_size = 0; if (elf_ppnt->p_type != PT_LOAD) continue; @@ -909,25 +911,20 @@ static int load_elf_binary(struct linux_binprm *bprm) * default mmap base, as well as whatever program they * might try to exec. This is because the brk will * follow the loader, and is not movable. */ -#ifdef CONFIG_ARCH_BINFMT_ELF_RANDOMIZE_PIE - /* Memory randomization might have been switched off - * in runtime via sysctl or explicit setting of - * personality flags. - * If that is the case, retain the original non-zero - * load_bias value in order to establish proper - * non-randomized mappings. - */ + load_bias = ELF_ET_DYN_BASE - vaddr; if (current->flags & PF_RANDOMIZE) - load_bias = 0; - else - load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); -#else - load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr); -#endif + load_bias += arch_mmap_rnd(); + load_bias = ELF_PAGESTART(load_bias); + total_size = total_mapping_size(elf_phdata, + loc->elf_ex.e_phnum); + if (!total_size) { + retval = -EINVAL; + goto out_free_dentry; + } } error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt, - elf_prot, elf_flags, 0); + elf_prot, elf_flags, total_size); if (BAD_ADDR(error)) { retval = IS_ERR((void *)error) ? 
PTR_ERR((void*)error) : -EINVAL; @@ -1053,15 +1050,13 @@ static int load_elf_binary(struct linux_binprm *bprm) current->mm->end_data = end_data; current->mm->start_stack = bprm->p; -#ifdef arch_randomize_brk if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) { current->mm->brk = current->mm->start_brk = arch_randomize_brk(current->mm); -#ifdef CONFIG_COMPAT_BRK +#ifdef compat_brk_randomized current->brk_randomized = 1; #endif } -#endif if (current->personality & MMAP_PAGE_ZERO) { /* Why this, you ask??? Well SVr4 maps page 0 as read-only, @@ -1535,7 +1530,7 @@ static int fill_files_note(struct memelfnote *note) file = vma->vm_file; if (!file) continue; - filename = d_path(&file->f_path, name_curpos, remaining); + filename = file_path(file, name_curpos, remaining); if (IS_ERR(filename)) { if (PTR_ERR(filename) == -ENAMETOOLONG) { vfree(data); @@ -1545,7 +1540,7 @@ static int fill_files_note(struct memelfnote *note) continue; } - /* d_path() fills at the end, move name down */ + /* file_path() fills at the end, move name down */ /* n = strlen(filename) + 1: */ n = (name_curpos + remaining) - filename; remaining = filename - name_curpos; diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 97aff2879cda..78f005f37847 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -9,6 +9,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> #include <linux/sched.h> @@ -521,9 +522,8 @@ static int parse_command(const char __user *buffer, size_t count) static void entry_status(Node *e, char *page) { - char *dp; - char *status = "disabled"; - const char *flags = "flags: "; + char *dp = page; + const char *status = "disabled"; if (test_bit(Enabled, &e->flags)) status = "enabled"; @@ -533,12 +533,10 @@ static void entry_status(Node *e, char *page) return; } - sprintf(page, "%s\ninterpreter %s\n", status, e->interpreter); - dp = page + strlen(page); + dp += sprintf(dp, "%s\ninterpreter %s\n", status, e->interpreter); /* print the special flags */ - sprintf(dp, "%s", flags); - dp += strlen(flags); + dp += sprintf(dp, "flags: "); if (e->flags & MISC_FMT_PRESERVE_ARGV0) *dp++ = 'P'; if (e->flags & MISC_FMT_OPEN_BINARY) @@ -550,21 +548,11 @@ static void entry_status(Node *e, char *page) if (!test_bit(Magic, &e->flags)) { sprintf(dp, "extension .%s\n", e->magic); } else { - int i; - - sprintf(dp, "offset %i\nmagic ", e->offset); - dp = page + strlen(page); - for (i = 0; i < e->size; i++) { - sprintf(dp, "%02x", 0xff & (int) (e->magic[i])); - dp += 2; - } + dp += sprintf(dp, "offset %i\nmagic ", e->offset); + dp = bin2hex(dp, e->magic, e->size); if (e->mask) { - sprintf(dp, "\nmask "); - dp += 6; - for (i = 0; i < e->size; i++) { - sprintf(dp, "%02x", 0xff & (int) (e->mask[i])); - dp += 2; - } + dp += sprintf(dp, "\nmask "); + dp = bin2hex(dp, e->mask, e->size); } *dp++ = '\n'; *dp = '\0'; @@ -603,7 +591,7 @@ static void kill_node(Node *e) write_unlock(&entries_lock); if (dentry) { - drop_nlink(dentry->d_inode); + drop_nlink(d_inode(dentry)); d_drop(dentry); dput(dentry); simple_release_fs(&bm_mnt, &entry_count); @@ -650,11 +638,11 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer, case 3: /* Delete this handler. 
*/ root = dget(file->f_path.dentry->d_sb->s_root); - mutex_lock(&root->d_inode->i_mutex); + mutex_lock(&d_inode(root)->i_mutex); kill_node(e); - mutex_unlock(&root->d_inode->i_mutex); + mutex_unlock(&d_inode(root)->i_mutex); dput(root); break; default: @@ -687,14 +675,14 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, return PTR_ERR(e); root = dget(sb->s_root); - mutex_lock(&root->d_inode->i_mutex); + mutex_lock(&d_inode(root)->i_mutex); dentry = lookup_one_len(e->name, root, strlen(e->name)); err = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out; err = -EEXIST; - if (dentry->d_inode) + if (d_really_is_positive(dentry)) goto out2; inode = bm_get_inode(sb, S_IFREG | 0644); @@ -723,7 +711,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, out2: dput(dentry); out: - mutex_unlock(&root->d_inode->i_mutex); + mutex_unlock(&d_inode(root)->i_mutex); dput(root); if (err) { @@ -766,12 +754,12 @@ static ssize_t bm_status_write(struct file *file, const char __user *buffer, case 3: /* Delete all handlers. */ root = dget(file->f_path.dentry->d_sb->s_root); - mutex_lock(&root->d_inode->i_mutex); + mutex_lock(&d_inode(root)->i_mutex); while (!list_empty(&entries)) kill_node(list_entry(entries.next, Node, list)); - mutex_unlock(&root->d_inode->i_mutex); + mutex_unlock(&d_inode(root)->i_mutex); dput(root); break; default: diff --git a/fs/block_dev.c b/fs/block_dev.c index 975266be67d3..198243717da5 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -14,6 +14,7 @@ #include <linux/device_cgroup.h> #include <linux/highmem.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/module.h> #include <linux/blkpg.h> #include <linux/magic.h> @@ -27,7 +28,6 @@ #include <linux/namei.h> #include <linux/log2.h> #include <linux/cleancache.h> -#include <linux/aio.h> #include <asm/uaccess.h> #include "internal.h" @@ -43,7 +43,7 @@ static inline struct bdev_inode *BDEV_I(struct inode *inode) return container_of(inode, struct bdev_inode, vfs_inode); } -inline struct block_device *I_BDEV(struct inode *inode) +struct block_device *I_BDEV(struct inode *inode) { return &BDEV_I(inode)->bdev; } @@ -147,15 +147,17 @@ blkdev_get_block(struct inode *inode, sector_t iblock, } static ssize_t -blkdev_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, - loff_t offset) +blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; - return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iter, - offset, blkdev_get_block, - NULL, NULL, 0); + if (IS_DAX(inode)) + return dax_do_io(iocb, inode, iter, offset, blkdev_get_block, + NULL, DIO_SKIP_DIO_COUNT); + return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter, offset, + blkdev_get_block, NULL, NULL, + DIO_SKIP_DIO_COUNT); } int __sync_blockdev(struct block_device *bdev, int wait) @@ -378,7 +380,7 @@ int bdev_read_page(struct block_device *bdev, sector_t sector, struct page *page) { const struct block_device_operations *ops = bdev->bd_disk->fops; - if (!ops->rw_page) + if (!ops->rw_page || bdev_get_integrity(bdev)) return -EOPNOTSUPP; return ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ); } @@ -409,7 +411,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, int result; int rw = (wbc->sync_mode == WB_SYNC_ALL) ? 
WRITE_SYNC : WRITE; const struct block_device_operations *ops = bdev->bd_disk->fops; - if (!ops->rw_page) + if (!ops->rw_page || bdev_get_integrity(bdev)) return -EOPNOTSUPP; set_page_writeback(page); result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, rw); @@ -444,6 +446,12 @@ long bdev_direct_access(struct block_device *bdev, sector_t sector, long avail; const struct block_device_operations *ops = bdev->bd_disk->fops; + /* + * The device driver is allowed to sleep, in order to make the + * memory directly accessible. + */ + might_sleep(); + if (size < 0) return size; if (!ops->direct_access) @@ -548,7 +556,8 @@ static struct file_system_type bd_type = { .kill_sb = kill_anon_super, }; -static struct super_block *blockdev_superblock __read_mostly; +struct super_block *blockdev_superblock __read_mostly; +EXPORT_SYMBOL_GPL(blockdev_superblock); void __init bdev_cache_init(void) { @@ -689,11 +698,6 @@ static struct block_device *bd_acquire(struct inode *inode) return bdev; } -int sb_is_blkdev_sb(struct super_block *sb) -{ - return sb == blockdev_superblock; -} - /* Call when you free inode */ void bd_forget(struct inode *inode) @@ -1175,6 +1179,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_disk = disk; bdev->bd_queue = disk->queue; bdev->bd_contains = bdev; + bdev->bd_inode->i_flags = disk->fops->direct_access ? S_DAX : 0; if (!partno) { ret = -ENXIO; bdev->bd_part = disk_get_part(disk, partno); @@ -1598,9 +1603,22 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; + struct inode *bd_inode = file->f_mapping->host; + loff_t size = i_size_read(bd_inode); struct blk_plug plug; ssize_t ret; + if (bdev_read_only(I_BDEV(bd_inode))) + return -EPERM; + + if (!iov_iter_count(from)) + return 0; + + if (iocb->ki_pos >= size) + return -ENOSPC; + + iov_iter_truncate(from, size - iocb->ki_pos); + blk_start_plug(&plug); ret = __generic_file_write_iter(iocb, from); if (ret > 0) { @@ -1660,8 +1678,6 @@ const struct file_operations def_blk_fops = { .open = blkdev_open, .release = blkdev_close, .llseek = block_llseek, - .read = new_sync_read, - .write = new_sync_write, .read_iter = blkdev_read_iter, .write_iter = blkdev_write_iter, .mmap = generic_file_mmap, @@ -1708,7 +1724,7 @@ struct block_device *lookup_bdev(const char *pathname) if (error) return ERR_PTR(error); - inode = path.dentry->d_inode; + inode = d_backing_inode(path.dentry); error = -ENOTBLK; if (!S_ISBLK(inode->i_mode)) goto fail; diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 4dabeb893b7c..1ce06c849a86 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -85,9 +85,10 @@ BTRFS_WORK_HELPER(extent_refs_helper); BTRFS_WORK_HELPER(scrub_helper); BTRFS_WORK_HELPER(scrubwrc_helper); BTRFS_WORK_HELPER(scrubnc_helper); +BTRFS_WORK_HELPER(scrubparity_helper); static struct __btrfs_workqueue * -__btrfs_alloc_workqueue(const char *name, int flags, int max_active, +__btrfs_alloc_workqueue(const char *name, unsigned int flags, int max_active, int thresh) { struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS); @@ -132,7 +133,7 @@ static inline void __btrfs_destroy_workqueue(struct __btrfs_workqueue *wq); struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, - int flags, + unsigned int flags, int max_active, int thresh) { diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 
e386c29ef1f6..b0b093b6afec 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -64,9 +64,11 @@ BTRFS_WORK_HELPER_PROTO(extent_refs_helper); BTRFS_WORK_HELPER_PROTO(scrub_helper); BTRFS_WORK_HELPER_PROTO(scrubwrc_helper); BTRFS_WORK_HELPER_PROTO(scrubnc_helper); +BTRFS_WORK_HELPER_PROTO(scrubparity_helper); + struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, - int flags, + unsigned int flags, int max_active, int thresh); void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t helper, diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index f55721ff9385..802fabb30e15 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -250,8 +250,12 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, * the first item to check. But sometimes, we may enter it with * slot==nritems. In that case, go to the next leaf before we continue. */ - if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) - ret = btrfs_next_old_leaf(root, path, time_seq); + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + if (time_seq == (u64)-1) + ret = btrfs_next_leaf(root, path); + else + ret = btrfs_next_old_leaf(root, path, time_seq); + } while (!ret && count < total_refs) { eb = path->nodes[0]; @@ -291,7 +295,10 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, eie = NULL; } next: - ret = btrfs_next_old_item(root, path, time_seq); + if (time_seq == (u64)-1) + ret = btrfs_next_item(root, path); + else + ret = btrfs_next_old_item(root, path, time_seq); } if (ret > 0) @@ -334,6 +341,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, if (path->search_commit_root) root_level = btrfs_header_level(root->commit_root); + else if (time_seq == (u64)-1) + root_level = btrfs_header_level(root->node); else root_level = btrfs_old_root_level(root, time_seq); @@ -343,7 +352,12 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, } path->lowest_level = level; - ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq); + if (time_seq == (u64)-1) + ret = btrfs_search_slot(NULL, root, &ref->key_for_search, path, + 0, 0); + else + ret = btrfs_search_old_slot(root, &ref->key_for_search, path, + time_seq); /* root node has been locked, we can release @subvol_srcu safely here */ srcu_read_unlock(&fs_info->subvol_srcu, index); @@ -491,7 +505,9 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info, BUG_ON(!ref->wanted_disk_byte); eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte, 0); - if (!eb || !extent_buffer_uptodate(eb)) { + if (IS_ERR(eb)) { + return PTR_ERR(eb); + } else if (!extent_buffer_uptodate(eb)) { free_extent_buffer(eb); return -EIO; } @@ -507,7 +523,7 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info, } /* - * merge two lists of backrefs and adjust counts accordingly + * merge backrefs and adjust counts accordingly * * mode = 1: merge identical keys, if key is set * FIXME: if we add more keys in __add_prelim_ref, we can merge more here. 
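A note on the IS_ERR() conversions running through these backref.c hunks: read_tree_block() now returns an ERR_PTR instead of NULL on failure, so callers must separate "error encoded in the pointer" from "buffer read but not uptodate". For readers unfamiliar with the idiom, here is a minimal userspace sketch of the convention; it mirrors <linux/err.h> but is simplified, and read_block() is a made-up stand-in, not a kernel function.

    #include <errno.h>
    #include <stdio.h>

    #define MAX_ERRNO 4095  /* same bound the kernel uses */

    static inline void *ERR_PTR(long error)
    {
            return (void *)error;           /* stash -errno in the pointer */
    }

    static inline long PTR_ERR(const void *ptr)
    {
            return (long)ptr;               /* recover the -errno */
    }

    static inline int IS_ERR(const void *ptr)
    {
            /* the top MAX_ERRNO addresses are never valid pointers */
            return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
    }

    /* caller shaped like the reworked read_tree_block() users above */
    static void *read_block(int fail)
    {
            static char buf[16];

            if (fail)
                    return ERR_PTR(-EIO);
            return buf;
    }

    int main(void)
    {
            void *eb = read_block(1);

            if (IS_ERR(eb))
                    printf("read failed: %ld\n", PTR_ERR(eb));
            return 0;
    }

The distinction matters in the hunks above because only the !IS_ERR() path owns a buffer that must be released with free_extent_buffer().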
@@ -535,9 +551,9 @@ static void __merge_refs(struct list_head *head, int mode) ref2 = list_entry(pos2, struct __prelim_ref, list); + if (!ref_for_same_block(ref1, ref2)) + continue; if (mode == 1) { - if (!ref_for_same_block(ref1, ref2)) - continue; if (!ref1->parent && ref2->parent) { xchg = ref1; ref1 = ref2; @@ -572,8 +588,8 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, struct list_head *prefs, u64 *total_refs, u64 inum) { + struct btrfs_delayed_ref_node *node; struct btrfs_delayed_extent_op *extent_op = head->extent_op; - struct rb_node *n = &head->node.rb_node; struct btrfs_key key; struct btrfs_key op_key = {0}; int sgn; @@ -583,12 +599,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, btrfs_disk_key_to_cpu(&op_key, &extent_op->key); spin_lock(&head->lock); - n = rb_first(&head->ref_root); - while (n) { - struct btrfs_delayed_ref_node *node; - node = rb_entry(n, struct btrfs_delayed_ref_node, - rb_node); - n = rb_next(n); + list_for_each_entry(node, &head->ref_list, list) { if (node->seq > seq) continue; @@ -880,6 +891,13 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, * indirect refs to their parent bytenr. * When roots are found, they're added to the roots list * + * NOTE: This can return values > 0 + * + * If time_seq is set to (u64)-1, it will not search delayed_refs, and behave + * much like trans == NULL case, the difference only lies in it will not + * commit root. + * The special case is for qgroup to search roots in commit_transaction(). + * * FIXME some caching might speed things up */ static int find_parent_nodes(struct btrfs_trans_handle *trans, @@ -918,6 +936,9 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, path->skip_locking = 1; } + if (time_seq == (u64)-1) + path->skip_locking = 1; + /* * grab both a lock on the path and a lock on the delayed ref head. * We need both to get a consistent picture of how the refs look @@ -932,9 +953,10 @@ again: BUG_ON(ret == 0); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - if (trans && likely(trans->type != __TRANS_DUMMY)) { + if (trans && likely(trans->type != __TRANS_DUMMY) && + time_seq != (u64)-1) { #else - if (trans) { + if (trans && time_seq != (u64)-1) { #endif /* * look if there are updates for this ref queued and lock the @@ -1032,7 +1054,10 @@ again: eb = read_tree_block(fs_info->extent_root, ref->parent, 0); - if (!eb || !extent_buffer_uptodate(eb)) { + if (IS_ERR(eb)) { + ret = PTR_ERR(eb); + goto out; + } else if (!extent_buffer_uptodate(eb)) { free_extent_buffer(eb); ret = -EIO; goto out; @@ -1198,6 +1223,19 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, return ret; } +/** + * btrfs_check_shared - tell us whether an extent is shared + * + * @trans: optional trans handle + * + * btrfs_check_shared uses the backref walking code but will short + * circuit as soon as it finds a root or inode that doesn't match the + * one passed in. This provides a significant performance benefit for + * callers (such as fiemap) which want to know whether the extent is + * shared but do not need a ref count. + * + * Return: 0 if extent is not shared, 1 if it is shared, < 0 on error. 
+ */ int btrfs_check_shared(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 root_objectid, u64 inum, u64 bytenr) @@ -1206,7 +1244,7 @@ int btrfs_check_shared(struct btrfs_trans_handle *trans, struct ulist *roots = NULL; struct ulist_iterator uiter; struct ulist_node *node; - struct seq_list elem = {}; + struct seq_list elem = SEQ_LIST_INIT(elem); int ret = 0; tmp = ulist_alloc(GFP_NOFS); @@ -1226,11 +1264,13 @@ int btrfs_check_shared(struct btrfs_trans_handle *trans, ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, tmp, roots, NULL, root_objectid, inum); if (ret == BACKREF_FOUND_SHARED) { + /* this is the only condition under which we return 1 */ ret = 1; break; } if (ret < 0 && ret != -ENOENT) break; + ret = 0; node = ulist_next(tmp, &uiter); if (!node) break; @@ -1610,7 +1650,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, struct ulist *roots = NULL; struct ulist_node *ref_node = NULL; struct ulist_node *root_node = NULL; - struct seq_list tree_mod_seq_elem = {}; + struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem); struct ulist_iterator ref_uiter; struct ulist_iterator root_uiter; diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index de5e4f2adfea..81220b2203c6 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -44,6 +44,8 @@ #define BTRFS_INODE_IN_DELALLOC_LIST 9 #define BTRFS_INODE_READDIO_NEED_LOCK 10 #define BTRFS_INODE_HAS_PROPS 11 +/* DIO is ready to submit */ +#define BTRFS_INODE_DIO_READY 12 /* * The following 3 bits are meant only for the btree inode. * When any of them is set, it means an error happened while writing an @@ -66,7 +68,11 @@ struct btrfs_inode { */ struct btrfs_key location; - /* Lock for counters */ + /* + * Lock for counters and all fields used to determine if the inode is in + * the log or not (last_trans, last_sub_trans, last_log_commit, + * logged_trans). 
+ */ spinlock_t lock; /* the extent_tree has caches of all the extent mappings to disk */ @@ -250,6 +256,9 @@ static inline bool btrfs_is_free_space_inode(struct inode *inode) static inline int btrfs_inode_in_log(struct inode *inode, u64 generation) { + int ret = 0; + + spin_lock(&BTRFS_I(inode)->lock); if (BTRFS_I(inode)->logged_trans == generation && BTRFS_I(inode)->last_sub_trans <= BTRFS_I(inode)->last_log_commit && @@ -263,9 +272,10 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation) */ smp_mb(); if (list_empty(&BTRFS_I(inode)->extent_tree.modified_extents)) - return 1; + ret = 1; } - return 0; + spin_unlock(&BTRFS_I(inode)->lock); + return ret; } #define BTRFS_DIO_ORIG_BIO_SUBMITTED 0x1 diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index d897ef803b3b..ce7dec88f4b8 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -2990,8 +2990,8 @@ static void __btrfsic_submit_bio(int rw, struct bio *bio) (unsigned long long)bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev); - mapped_datav = kmalloc(sizeof(*mapped_datav) * bio->bi_vcnt, - GFP_NOFS); + mapped_datav = kmalloc_array(bio->bi_vcnt, + sizeof(*mapped_datav), GFP_NOFS); if (!mapped_datav) goto leave; cur_bytenr = dev_bytenr; @@ -3241,8 +3241,5 @@ void btrfsic_unmount(struct btrfs_root *root, mutex_unlock(&btrfsic_mutex); - if (is_vmalloc_addr(state)) - vfree(state); - else - kfree(state); + kvfree(state); } diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index e9df8862012c..ce62324c78e7 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -622,7 +622,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, cb->orig_bio = bio; nr_pages = DIV_ROUND_UP(compressed_len, PAGE_CACHE_SIZE); - cb->compressed_pages = kzalloc(sizeof(struct page *) * nr_pages, + cb->compressed_pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); if (!cb->compressed_pages) goto fail1; @@ -750,7 +750,7 @@ static int comp_num_workspace[BTRFS_COMPRESS_TYPES]; static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES]; static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES]; -static struct btrfs_compress_op *btrfs_compress_op[] = { +static const struct btrfs_compress_op * const btrfs_compress_op[] = { &btrfs_zlib_compress, &btrfs_lzo_compress, }; diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index d181f70caae0..13a4dc0436c9 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -77,7 +77,7 @@ struct btrfs_compress_op { size_t srclen, size_t destlen); }; -extern struct btrfs_compress_op btrfs_zlib_compress; -extern struct btrfs_compress_op btrfs_lzo_compress; +extern const struct btrfs_compress_op btrfs_zlib_compress; +extern const struct btrfs_compress_op btrfs_lzo_compress; #endif diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 6d67f32e648d..54114b4887dd 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -578,7 +578,7 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, if (!tree_mod_need_log(fs_info, eb)) return 0; - tm_list = kzalloc(nr_items * sizeof(struct tree_mod_elem *), flags); + tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), flags); if (!tm_list) return -ENOMEM; @@ -677,7 +677,7 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, if (log_removal && btrfs_header_level(old_root) > 0) { nritems = btrfs_header_nritems(old_root); - tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *), + tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), flags); if 
(!tm_list) { ret = -ENOMEM; @@ -814,7 +814,7 @@ tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) return 0; - tm_list = kzalloc(nr_items * 2 * sizeof(struct tree_mod_elem *), + tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *), GFP_NOFS); if (!tm_list) return -ENOMEM; @@ -905,8 +905,7 @@ tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) return 0; nritems = btrfs_header_nritems(eb); - tm_list = kzalloc(nritems * sizeof(struct tree_mod_elem *), - GFP_NOFS); + tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS); if (!tm_list) return -ENOMEM; @@ -1073,7 +1072,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, ret = btrfs_dec_ref(trans, root, buf, 1); BUG_ON(ret); /* -ENOMEM */ } - clean_tree_block(trans, root, buf); + clean_tree_block(trans, root->fs_info, buf); *last_ref = 1; } return 0; @@ -1440,8 +1439,9 @@ get_old_root(struct btrfs_root *root, u64 time_seq) btrfs_tree_read_unlock(eb_root); free_extent_buffer(eb_root); old = read_tree_block(root, logical, 0); - if (WARN_ON(!old || !extent_buffer_uptodate(old))) { - free_extent_buffer(old); + if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) { + if (!IS_ERR(old)) + free_extent_buffer(old); btrfs_warn(root->fs_info, "failed to read tree block %llu from get_old_root", logical); } else { @@ -1678,7 +1678,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, continue; } - cur = btrfs_find_tree_block(root, blocknr); + cur = btrfs_find_tree_block(root->fs_info, blocknr); if (cur) uptodate = btrfs_buffer_uptodate(cur, gen, 0); else @@ -1686,7 +1686,9 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, if (!cur || !uptodate) { if (!cur) { cur = read_tree_block(root, blocknr, gen); - if (!cur || !extent_buffer_uptodate(cur)) { + if (IS_ERR(cur)) { + return PTR_ERR(cur); + } else if (!extent_buffer_uptodate(cur)) { free_extent_buffer(cur); return -EIO; } @@ -1865,8 +1867,9 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root, eb = read_tree_block(root, btrfs_node_blockptr(parent, slot), btrfs_node_ptr_generation(parent, slot)); - if (eb && !extent_buffer_uptodate(eb)) { - free_extent_buffer(eb); + if (IS_ERR(eb) || !extent_buffer_uptodate(eb)) { + if (!IS_ERR(eb)) + free_extent_buffer(eb); eb = NULL; } @@ -1943,7 +1946,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, path->locks[level] = 0; path->nodes[level] = NULL; - clean_tree_block(trans, root, mid); + clean_tree_block(trans, root->fs_info, mid); btrfs_tree_unlock(mid); /* once for the path */ free_extent_buffer(mid); @@ -1997,7 +2000,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, if (wret < 0 && wret != -ENOSPC) ret = wret; if (btrfs_header_nritems(right) == 0) { - clean_tree_block(trans, root, right); + clean_tree_block(trans, root->fs_info, right); btrfs_tree_unlock(right); del_ptr(root, path, level + 1, pslot + 1); root_sub_used(root, right->len); @@ -2041,7 +2044,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BUG_ON(wret == 1); } if (btrfs_header_nritems(mid) == 0) { - clean_tree_block(trans, root, mid); + clean_tree_block(trans, root->fs_info, mid); btrfs_tree_unlock(mid); del_ptr(root, path, level + 1, pslot); root_sub_used(root, mid->len); @@ -2259,7 +2262,7 @@ static void reada_for_search(struct btrfs_root *root, search = btrfs_node_blockptr(node, slot); blocksize = root->nodesize; - eb 
= btrfs_find_tree_block(root, search); + eb = btrfs_find_tree_block(root->fs_info, search); if (eb) { free_extent_buffer(eb); return; @@ -2319,7 +2322,7 @@ static noinline void reada_for_balance(struct btrfs_root *root, if (slot > 0) { block1 = btrfs_node_blockptr(parent, slot - 1); gen = btrfs_node_ptr_generation(parent, slot - 1); - eb = btrfs_find_tree_block(root, block1); + eb = btrfs_find_tree_block(root->fs_info, block1); /* * if we get -eagain from btrfs_buffer_uptodate, we * don't want to return eagain here. That will loop @@ -2332,7 +2335,7 @@ static noinline void reada_for_balance(struct btrfs_root *root, if (slot + 1 < nritems) { block2 = btrfs_node_blockptr(parent, slot + 1); gen = btrfs_node_ptr_generation(parent, slot + 1); - eb = btrfs_find_tree_block(root, block2); + eb = btrfs_find_tree_block(root->fs_info, block2); if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0) block2 = 0; free_extent_buffer(eb); @@ -2450,7 +2453,7 @@ read_block_for_search(struct btrfs_trans_handle *trans, blocknr = btrfs_node_blockptr(b, slot); gen = btrfs_node_ptr_generation(b, slot); - tmp = btrfs_find_tree_block(root, blocknr); + tmp = btrfs_find_tree_block(root->fs_info, blocknr); if (tmp) { /* first we do an atomic uptodate check */ if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { @@ -2495,7 +2498,7 @@ read_block_for_search(struct btrfs_trans_handle *trans, ret = -EAGAIN; tmp = read_tree_block(root, blocknr, 0); - if (tmp) { + if (!IS_ERR(tmp)) { /* * If the read above didn't mark this buffer up to date, * it will never end up being up to date. Set ret to EIO now @@ -3126,7 +3129,8 @@ again: * higher levels * */ -static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path, +static void fixup_low_keys(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, struct btrfs_disk_key *key, int level) { int i; @@ -3137,7 +3141,7 @@ static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path, if (!path->nodes[i]) break; t = path->nodes[i]; - tree_mod_log_set_node_key(root->fs_info, t, tslot, 1); + tree_mod_log_set_node_key(fs_info, t, tslot, 1); btrfs_set_node_key(t, key, tslot); btrfs_mark_buffer_dirty(path->nodes[i]); if (tslot != 0) @@ -3151,7 +3155,8 @@ static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path, * This function isn't completely safe. 
It's the caller's responsibility * that the new key won't break the order */ -void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path, +void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, struct btrfs_key *new_key) { struct btrfs_disk_key disk_key; @@ -3173,7 +3178,7 @@ void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path, btrfs_set_item_key(eb, &disk_key, slot); btrfs_mark_buffer_dirty(eb); if (slot == 0) - fixup_low_keys(root, path, &disk_key, 1); + fixup_low_keys(fs_info, path, &disk_key, 1); } /* @@ -3692,7 +3697,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, if (left_nritems) btrfs_mark_buffer_dirty(left); else - clean_tree_block(trans, root, left); + clean_tree_block(trans, root->fs_info, left); btrfs_mark_buffer_dirty(right); @@ -3704,7 +3709,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, if (path->slots[0] >= left_nritems) { path->slots[0] -= left_nritems; if (btrfs_header_nritems(path->nodes[0]) == 0) - clean_tree_block(trans, root, path->nodes[0]); + clean_tree_block(trans, root->fs_info, path->nodes[0]); btrfs_tree_unlock(path->nodes[0]); free_extent_buffer(path->nodes[0]); path->nodes[0] = right; @@ -3928,10 +3933,10 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, if (right_nritems) btrfs_mark_buffer_dirty(right); else - clean_tree_block(trans, root, right); + clean_tree_block(trans, root->fs_info, right); btrfs_item_key(right, &disk_key, 0); - fixup_low_keys(root, path, &disk_key, 1); + fixup_low_keys(root->fs_info, path, &disk_key, 1); /* then fixup the leaf pointer in the path */ if (path->slots[0] < push_items) { @@ -4168,6 +4173,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, int mid; int slot; struct extent_buffer *right; + struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; int wret; int split; @@ -4271,10 +4277,10 @@ again: btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV); btrfs_set_header_owner(right, root->root_key.objectid); btrfs_set_header_level(right, 0); - write_extent_buffer(right, root->fs_info->fsid, + write_extent_buffer(right, fs_info->fsid, btrfs_header_fsid(), BTRFS_FSID_SIZE); - write_extent_buffer(right, root->fs_info->chunk_tree_uuid, + write_extent_buffer(right, fs_info->chunk_tree_uuid, btrfs_header_chunk_tree_uuid(right), BTRFS_UUID_SIZE); @@ -4297,7 +4303,7 @@ again: path->nodes[0] = right; path->slots[0] = 0; if (path->slots[1] == 0) - fixup_low_keys(root, path, &disk_key, 1); + fixup_low_keys(fs_info, path, &disk_key, 1); } btrfs_mark_buffer_dirty(right); return ret; @@ -4615,7 +4621,7 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path, btrfs_set_disk_key_offset(&disk_key, offset + size_diff); btrfs_set_item_key(leaf, &disk_key, slot); if (slot == 0) - fixup_low_keys(root, path, &disk_key, 1); + fixup_low_keys(root->fs_info, path, &disk_key, 1); } item = btrfs_item_nr(slot); @@ -4716,7 +4722,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, if (path->slots[0] == 0) { btrfs_cpu_key_to_disk(&disk_key, cpu_key); - fixup_low_keys(root, path, &disk_key, 1); + fixup_low_keys(root->fs_info, path, &disk_key, 1); } btrfs_unlock_up_safe(path, 1); @@ -4888,7 +4894,7 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_disk_key disk_key; btrfs_node_key(parent, &disk_key, 0); - fixup_low_keys(root, path, &disk_key, level + 1); + fixup_low_keys(root->fs_info, path, 
&disk_key, level + 1); } btrfs_mark_buffer_dirty(parent); } @@ -4981,7 +4987,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_header_level(leaf, 0); } else { btrfs_set_path_blocking(path); - clean_tree_block(trans, root, leaf); + clean_tree_block(trans, root->fs_info, leaf); btrfs_del_leaf(trans, root, path, leaf); } } else { @@ -4990,7 +4996,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_disk_key disk_key; btrfs_item_key(leaf, &disk_key, 0); - fixup_low_keys(root, path, &disk_key, 1); + fixup_low_keys(root->fs_info, path, &disk_key, 1); } /* delete the leaf if it is mostly empty */ diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f9c89cae39ee..aac314e14188 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -174,7 +174,7 @@ struct btrfs_ordered_sum; /* csum types */ #define BTRFS_CSUM_TYPE_CRC32 0 -static int btrfs_csum_sizes[] = { 4, 0 }; +static int btrfs_csum_sizes[] = { 4 }; /* four bytes for CRC32 */ #define BTRFS_EMPTY_DIR_SIZE 0 @@ -1061,6 +1061,12 @@ struct btrfs_block_group_item { __le64 flags; } __attribute__ ((__packed__)); +#define BTRFS_QGROUP_LEVEL_SHIFT 48 +static inline u64 btrfs_qgroup_level(u64 qgroupid) +{ + return qgroupid >> BTRFS_QGROUP_LEVEL_SHIFT; +} + /* * is subvolume quota turned on? */ @@ -1256,6 +1262,20 @@ struct btrfs_caching_control { atomic_t count; }; +struct btrfs_io_ctl { + void *cur, *orig; + struct page *page; + struct page **pages; + struct btrfs_root *root; + struct inode *inode; + unsigned long size; + int index; + int num_pages; + int entries; + int bitmaps; + unsigned check_crcs:1; +}; + struct btrfs_block_group_cache { struct btrfs_key key; struct btrfs_block_group_item item; @@ -1321,6 +1341,9 @@ struct btrfs_block_group_cache { /* For dirty block groups */ struct list_head dirty_list; + struct list_head io_list; + + struct btrfs_io_ctl io_ctl; }; /* delayed seq elem */ @@ -1329,6 +1352,8 @@ struct seq_list { u64 seq; }; +#define SEQ_LIST_INIT(name) { .list = LIST_HEAD_INIT((name).list), .seq = 0 } + enum btrfs_orphan_cleanup_state { ORPHAN_CLEANUP_STARTED = 1, ORPHAN_CLEANUP_DONE = 2, @@ -1472,6 +1497,12 @@ struct btrfs_fs_info { struct mutex chunk_mutex; struct mutex volume_mutex; + /* + * this is taken to make sure we don't set block groups ro after + * the free space cache has been allocated on them + */ + struct mutex ro_block_group_mutex; + /* this is used during read/modify/write to make sure * no two ios are trying to mod the same stripe at the same * time @@ -1513,6 +1544,7 @@ struct btrfs_fs_info { spinlock_t delayed_iput_lock; struct list_head delayed_iputs; + struct rw_semaphore delayed_iput_sem; /* this protects tree_mod_seq_list */ spinlock_t tree_mod_seq_lock; @@ -1587,10 +1619,7 @@ struct btrfs_fs_info { struct task_struct *cleaner_kthread; int thread_pool_size; - struct kobject super_kobj; struct kobject *space_info_kobj; - struct kobject *device_dir_kobj; - struct completion kobj_unregister; int do_barriers; int closing; int log_root_recovering; @@ -1666,6 +1695,7 @@ struct btrfs_fs_info { struct btrfs_workqueue *scrub_workers; struct btrfs_workqueue *scrub_wr_completion_workers; struct btrfs_workqueue *scrub_nocow_workers; + struct btrfs_workqueue *scrub_parity_workers; #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY u32 check_integrity_print_mask; @@ -1703,7 +1733,7 @@ struct btrfs_fs_info { /* list of dirty qgroups to be written at next commit */ struct list_head dirty_qgroups; - /* used by btrfs_qgroup_record_ref for an 
efficient tree traversal */ + /* used by qgroup for an efficient tree traversal */ u64 qgroup_seq; /* qgroup rescan items */ @@ -1748,6 +1778,7 @@ struct btrfs_fs_info { spinlock_t unused_bgs_lock; struct list_head unused_bgs; struct mutex unused_bg_unpin_mutex; + struct mutex delete_unused_bgs_mutex; /* For btrfs to record security options */ struct security_mnt_opts security_opts; @@ -3295,6 +3326,9 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) } /* extent-tree.c */ + +u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes); + static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root, unsigned num_items) { @@ -3385,6 +3419,8 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, u64 owner, u64 offset, int no_quota); +int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans, + struct btrfs_root *root); int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_setup_space_cache(struct btrfs_trans_handle *trans, @@ -3417,10 +3453,11 @@ enum btrfs_reserve_flush_enum { BTRFS_RESERVE_FLUSH_ALL, }; -int btrfs_check_data_free_space(struct inode *inode, u64 bytes); +int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes); void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root); +void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, struct inode *inode); void btrfs_orphan_release_metadata(struct inode *inode); @@ -3440,6 +3477,7 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root, unsigned short type); void btrfs_free_block_rsv(struct btrfs_root *root, struct btrfs_block_rsv *rsv); +void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv); int btrfs_block_rsv_add(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes, enum btrfs_reserve_flush_enum flush); @@ -3477,6 +3515,9 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, int __get_raid_index(u64 flags); int btrfs_start_write_no_snapshoting(struct btrfs_root *root); void btrfs_end_write_no_snapshoting(struct btrfs_root *root); +void check_system_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const u64 type); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot); @@ -3486,7 +3527,8 @@ int btrfs_previous_item(struct btrfs_root *root, int type); int btrfs_previous_extent_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid); -void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path, +void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, struct btrfs_key *new_key); struct extent_buffer *btrfs_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); @@ -4011,6 +4053,7 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) 
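The SEQ_LIST_INIT() macro introduced above replaces `struct seq_list elem = {};` at its call sites (see btrfs_check_shared and iterate_extent_inodes earlier in this diff). A list_head is only a valid empty list when it points at itself; zero-initialization leaves next/prev NULL, which is presumably why the macro spells the initialization out. A minimal userspace sketch of the idiom, with list_head and LIST_HEAD_INIT simplified from <linux/list.h>:

    #include <stdio.h>

    struct list_head {
            struct list_head *next, *prev;
    };

    #define LIST_HEAD_INIT(name) { &(name), &(name) }

    struct seq_list {
            struct list_head list;
            unsigned long long seq;
    };

    #define SEQ_LIST_INIT(name) { .list = LIST_HEAD_INIT((name).list), .seq = 0 }

    static int list_empty(const struct list_head *head)
    {
            return head->next == head;
    }

    int main(void)
    {
            /* what "= {}" amounts to: next == prev == NULL, unusable */
            struct seq_list zeroed = { { 0, 0 }, 0 };
            /* the initializer the patch switches to: head points at itself */
            struct seq_list elem = SEQ_LIST_INIT(elem);

            printf("zeroed self-referential? %d\n",
                   zeroed.list.next == &zeroed.list);           /* 0 */
            printf("elem is a valid empty list: %d\n",
                   list_empty(&elem.list));                     /* 1 */
            return 0;
    }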
#ifdef CONFIG_BTRFS_ASSERT +__cold static inline void assfail(char *expr, char *file, int line) { pr_err("BTRFS: assertion failed: %s, file: %s, line: %d", @@ -4026,10 +4069,12 @@ static inline void assfail(char *expr, char *file, int line) #define btrfs_assert() __printf(5, 6) +__cold void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...); +__cold void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *function, unsigned int line, int errno); @@ -4072,11 +4117,17 @@ static inline int __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag) * Call btrfs_abort_transaction as early as possible when an error condition is * detected, that way the exact line number is reported. */ - #define btrfs_abort_transaction(trans, root, errno) \ do { \ - __btrfs_abort_transaction(trans, root, __func__, \ - __LINE__, errno); \ + /* Report first abort since mount */ \ + if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ + &((root)->fs_info->fs_state))) { \ + WARN(1, KERN_DEBUG \ + "BTRFS: Transaction aborted (error %d)\n", \ + (errno)); \ + } \ + __btrfs_abort_transaction((trans), (root), __func__, \ + __LINE__, (errno)); \ } while (0) #define btrfs_std_error(fs_info, errno) \ @@ -4093,6 +4144,7 @@ do { \ } while (0) __printf(5, 6) +__cold void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...); @@ -4180,7 +4232,8 @@ int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb, static inline int is_fstree(u64 rootid) { if (rootid == BTRFS_FS_TREE_OBJECTID || - (s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID) + ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID && + !btrfs_qgroup_level(rootid))) return 1; return 0; } diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 82f0c7c95474..a2ae42720a6a 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1383,7 +1383,7 @@ out: static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, - struct btrfs_root *root, int nr) + struct btrfs_fs_info *fs_info, int nr) { struct btrfs_async_delayed_work *async_work; @@ -1399,7 +1399,7 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root, btrfs_async_run_delayed_root, NULL, NULL); async_work->nr = nr; - btrfs_queue_work(root->fs_info->delayed_workers, &async_work->work); + btrfs_queue_work(fs_info->delayed_workers, &async_work->work); return 0; } @@ -1426,6 +1426,7 @@ static int could_end_wait(struct btrfs_delayed_root *delayed_root, int seq) void btrfs_balance_delayed_items(struct btrfs_root *root) { struct btrfs_delayed_root *delayed_root; + struct btrfs_fs_info *fs_info = root->fs_info; delayed_root = btrfs_get_delayed_root(root); @@ -1438,7 +1439,7 @@ void btrfs_balance_delayed_items(struct btrfs_root *root) seq = atomic_read(&delayed_root->items_seq); - ret = btrfs_wq_run_delayed_node(delayed_root, root, 0); + ret = btrfs_wq_run_delayed_node(delayed_root, fs_info, 0); if (ret) return; @@ -1447,7 +1448,7 @@ void btrfs_balance_delayed_items(struct btrfs_root *root) return; } - btrfs_wq_run_delayed_node(delayed_root, root, BTRFS_DELAYED_BATCH); + btrfs_wq_run_delayed_node(delayed_root, fs_info, BTRFS_DELAYED_BATCH); } /* Will return 0 or -ENOMEM */ @@ -1801,6 +1802,8 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) set_nlink(inode, btrfs_stack_inode_nlink(inode_item)); inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item)); 
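The btrfs_abort_transaction() rework above emits its WARN only for the first abort since mount, keyed on BTRFS_FS_STATE_TRANS_ABORTED: test_and_set_bit() returns the bit's previous value, so exactly one caller observes it clear. A sketch of that warn-once pattern; the names here are illustrative stand-ins, and this toy test_and_set_bit() is not atomic, unlike the kernel's.

    #include <stdio.h>

    static unsigned long fs_state;
    #define TRANS_ABORTED 0  /* stand-in for BTRFS_FS_STATE_TRANS_ABORTED */

    /* non-atomic stand-in for the kernel's atomic test_and_set_bit() */
    static int test_and_set_bit(int nr, unsigned long *addr)
    {
            unsigned long old = *addr;

            *addr |= 1UL << nr;
            return (old >> nr) & 1;
    }

    static void abort_transaction(int err)
    {
            /* only the first caller sees the bit clear and reports */
            if (!test_and_set_bit(TRANS_ABORTED, &fs_state))
                    fprintf(stderr, "transaction aborted (error %d)\n", err);
            /* ...the real macro then calls __btrfs_abort_transaction()... */
    }

    int main(void)
    {
            abort_transaction(-28);
            abort_transaction(-28);  /* silent: bit already set */
            return 0;
    }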
BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item); + BTRFS_I(inode)->last_trans = btrfs_stack_inode_transid(inode_item); + inode->i_version = btrfs_stack_inode_sequence(inode_item); inode->i_rdev = 0; *rdev = btrfs_stack_inode_rdev(inode_item); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 6d16bea94e1c..ac3e81da6d4e 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -22,6 +22,7 @@ #include "ctree.h" #include "delayed-ref.h" #include "transaction.h" +#include "qgroup.h" struct kmem_cache *btrfs_delayed_ref_head_cachep; struct kmem_cache *btrfs_delayed_tree_ref_cachep; @@ -84,87 +85,6 @@ static int comp_data_refs(struct btrfs_delayed_data_ref *ref2, return 0; } -/* - * entries in the rb tree are ordered by the byte number of the extent, - * type of the delayed backrefs and content of delayed backrefs. - */ -static int comp_entry(struct btrfs_delayed_ref_node *ref2, - struct btrfs_delayed_ref_node *ref1, - bool compare_seq) -{ - if (ref1->bytenr < ref2->bytenr) - return -1; - if (ref1->bytenr > ref2->bytenr) - return 1; - if (ref1->is_head && ref2->is_head) - return 0; - if (ref2->is_head) - return -1; - if (ref1->is_head) - return 1; - if (ref1->type < ref2->type) - return -1; - if (ref1->type > ref2->type) - return 1; - if (ref1->no_quota > ref2->no_quota) - return 1; - if (ref1->no_quota < ref2->no_quota) - return -1; - /* merging of sequenced refs is not allowed */ - if (compare_seq) { - if (ref1->seq < ref2->seq) - return -1; - if (ref1->seq > ref2->seq) - return 1; - } - if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY || - ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) { - return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2), - btrfs_delayed_node_to_tree_ref(ref1), - ref1->type); - } else if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY || - ref1->type == BTRFS_SHARED_DATA_REF_KEY) { - return comp_data_refs(btrfs_delayed_node_to_data_ref(ref2), - btrfs_delayed_node_to_data_ref(ref1)); - } - BUG(); - return 0; -} - -/* - * insert a new ref into the rbtree. This returns any existing refs - * for the same (bytenr,parent) tuple, or NULL if the new node was properly - * inserted. 
- */ -static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root, - struct rb_node *node) -{ - struct rb_node **p = &root->rb_node; - struct rb_node *parent_node = NULL; - struct btrfs_delayed_ref_node *entry; - struct btrfs_delayed_ref_node *ins; - int cmp; - - ins = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - while (*p) { - parent_node = *p; - entry = rb_entry(parent_node, struct btrfs_delayed_ref_node, - rb_node); - - cmp = comp_entry(entry, ins, 1); - if (cmp < 0) - p = &(*p)->rb_left; - else if (cmp > 0) - p = &(*p)->rb_right; - else - return entry; - } - - rb_link_node(node, parent_node, p); - rb_insert_color(node, root); - return NULL; -} - /* insert a new ref to head ref rbtree */ static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root, struct rb_node *node) @@ -268,7 +188,7 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, rb_erase(&head->href_node, &delayed_refs->href_root); } else { assert_spin_locked(&head->lock); - rb_erase(&ref->rb_node, &head->ref_root); + list_del(&ref->list); } ref->in_tree = 0; btrfs_put_delayed_ref(ref); @@ -277,99 +197,6 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, trans->delayed_ref_updates--; } -static int merge_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head, - struct btrfs_delayed_ref_node *ref, u64 seq) -{ - struct rb_node *node; - int mod = 0; - int done = 0; - - node = rb_next(&ref->rb_node); - while (!done && node) { - struct btrfs_delayed_ref_node *next; - - next = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - node = rb_next(node); - if (seq && next->seq >= seq) - break; - if (comp_entry(ref, next, 0)) - continue; - - if (ref->action == next->action) { - mod = next->ref_mod; - } else { - if (ref->ref_mod < next->ref_mod) { - struct btrfs_delayed_ref_node *tmp; - - tmp = ref; - ref = next; - next = tmp; - done = 1; - } - mod = -next->ref_mod; - } - - drop_delayed_ref(trans, delayed_refs, head, next); - ref->ref_mod += mod; - if (ref->ref_mod == 0) { - drop_delayed_ref(trans, delayed_refs, head, ref); - done = 1; - } else { - /* - * You can't have multiples of the same ref on a tree - * block. - */ - WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY || - ref->type == BTRFS_SHARED_BLOCK_REF_KEY); - } - } - return done; -} - -void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head) -{ - struct rb_node *node; - u64 seq = 0; - - assert_spin_locked(&head->lock); - /* - * We don't have too much refs to merge in the case of delayed data - * refs. 
- */ - if (head->is_data) - return; - - spin_lock(&fs_info->tree_mod_seq_lock); - if (!list_empty(&fs_info->tree_mod_seq_list)) { - struct seq_list *elem; - - elem = list_first_entry(&fs_info->tree_mod_seq_list, - struct seq_list, list); - seq = elem->seq; - } - spin_unlock(&fs_info->tree_mod_seq_lock); - - node = rb_first(&head->ref_root); - while (node) { - struct btrfs_delayed_ref_node *ref; - - ref = rb_entry(node, struct btrfs_delayed_ref_node, - rb_node); - /* We can't merge refs that are outside of our seq count */ - if (seq && ref->seq >= seq) - break; - if (merge_ref(trans, delayed_refs, head, ref, seq)) - node = rb_first(&head->ref_root); - else - node = rb_next(&ref->rb_node); - } -} - int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, u64 seq) @@ -443,45 +270,71 @@ again: } /* - * helper function to update an extent delayed ref in the - * rbtree. existing and update must both have the same - * bytenr and parent + * Helper to insert the ref_node to the tail or merge with tail. * - * This may free existing if the update cancels out whatever - * operation it was doing. + * Return 0 for insert. + * Return >0 for merge. */ -static noinline void -update_existing_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head, - struct btrfs_delayed_ref_node *existing, - struct btrfs_delayed_ref_node *update) +static int +add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_root *root, + struct btrfs_delayed_ref_head *href, + struct btrfs_delayed_ref_node *ref) { - if (update->action != existing->action) { - /* - * this is effectively undoing either an add or a - * drop. We decrement the ref_mod, and if it goes - * down to zero we just delete the entry without - * every changing the extent allocation tree. - */ - existing->ref_mod--; - if (existing->ref_mod == 0) - drop_delayed_ref(trans, delayed_refs, head, existing); - else - WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || - existing->type == BTRFS_SHARED_BLOCK_REF_KEY); + struct btrfs_delayed_ref_node *exist; + int mod; + int ret = 0; + + spin_lock(&href->lock); + /* Check whether we can merge the tail node with ref */ + if (list_empty(&href->ref_list)) + goto add_tail; + exist = list_entry(href->ref_list.prev, struct btrfs_delayed_ref_node, + list); + /* No need to compare bytenr nor is_head */ + if (exist->type != ref->type || exist->no_quota != ref->no_quota || + exist->seq != ref->seq) + goto add_tail; + + if ((exist->type == BTRFS_TREE_BLOCK_REF_KEY || + exist->type == BTRFS_SHARED_BLOCK_REF_KEY) && + comp_tree_refs(btrfs_delayed_node_to_tree_ref(exist), + btrfs_delayed_node_to_tree_ref(ref), + ref->type)) + goto add_tail; + if ((exist->type == BTRFS_EXTENT_DATA_REF_KEY || + exist->type == BTRFS_SHARED_DATA_REF_KEY) && + comp_data_refs(btrfs_delayed_node_to_data_ref(exist), + btrfs_delayed_node_to_data_ref(ref))) + goto add_tail; + + /* Now we are sure we can merge */ + ret = 1; + if (exist->action == ref->action) { + mod = ref->ref_mod; } else { - WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || - existing->type == BTRFS_SHARED_BLOCK_REF_KEY); - /* - * the action on the existing ref matches - * the action on the ref we're trying to add. 
- * Bump the ref_mod by one so the backref that - * is eventually added/removed has the correct - * reference count - */ - existing->ref_mod += update->ref_mod; + /* Need to change action */ + if (exist->ref_mod < ref->ref_mod) { + exist->action = ref->action; + mod = -exist->ref_mod; + exist->ref_mod = ref->ref_mod; + } else + mod = -ref->ref_mod; } + exist->ref_mod += mod; + + /* remove existing tail if its ref_mod is zero */ + if (exist->ref_mod == 0) + drop_delayed_ref(trans, root, href, exist); + spin_unlock(&href->lock); + return ret; + +add_tail: + list_add_tail(&ref->list, &href->ref_list); + atomic_inc(&root->num_entries); + trans->delayed_ref_updates++; + spin_unlock(&href->lock); + return ret; } /* @@ -489,11 +342,13 @@ update_existing_ref(struct btrfs_trans_handle *trans, * existing and update must have the same bytenr */ static noinline void -update_existing_head_ref(struct btrfs_delayed_ref_node *existing, +update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_node *existing, struct btrfs_delayed_ref_node *update) { struct btrfs_delayed_ref_head *existing_ref; struct btrfs_delayed_ref_head *ref; + int old_ref_mod; existing_ref = btrfs_delayed_node_to_head(existing); ref = btrfs_delayed_node_to_head(update); @@ -541,7 +396,20 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, * only need the lock for this case cause we could be processing it * currently, for refs we just added we know we're a-ok. */ + old_ref_mod = existing_ref->total_ref_mod; existing->ref_mod += update->ref_mod; + existing_ref->total_ref_mod += update->ref_mod; + + /* + * If we are going to from a positive ref mod to a negative or vice + * versa we need to make sure to adjust pending_csums accordingly. + */ + if (existing_ref->is_data) { + if (existing_ref->total_ref_mod >= 0 && old_ref_mod < 0) + delayed_refs->pending_csums -= existing->num_bytes; + if (existing_ref->total_ref_mod < 0 && old_ref_mod >= 0) + delayed_refs->pending_csums += existing->num_bytes; + } spin_unlock(&existing_ref->lock); } @@ -553,12 +421,14 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing, static noinline struct btrfs_delayed_ref_head * add_delayed_ref_head(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *ref, u64 bytenr, - u64 num_bytes, int action, int is_data) + struct btrfs_delayed_ref_node *ref, + struct btrfs_qgroup_extent_record *qrecord, + u64 bytenr, u64 num_bytes, int action, int is_data) { struct btrfs_delayed_ref_head *existing; struct btrfs_delayed_ref_head *head_ref = NULL; struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_qgroup_extent_record *qexisting; int count_mod = 1; int must_insert_reserved = 0; @@ -603,8 +473,21 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, head_ref = btrfs_delayed_node_to_head(ref); head_ref->must_insert_reserved = must_insert_reserved; head_ref->is_data = is_data; - head_ref->ref_root = RB_ROOT; + INIT_LIST_HEAD(&head_ref->ref_list); head_ref->processing = 0; + head_ref->total_ref_mod = count_mod; + + /* Record qgroup extent info if provided */ + if (qrecord) { + qrecord->bytenr = bytenr; + qrecord->num_bytes = num_bytes; + qrecord->old_roots = NULL; + + qexisting = btrfs_qgroup_insert_dirty_extent(delayed_refs, + qrecord); + if (qexisting) + kfree(qrecord); + } spin_lock_init(&head_ref->lock); mutex_init(&head_ref->mutex); @@ -614,7 +497,7 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, existing = htree_insert(&delayed_refs->href_root, 
&head_ref->href_node); if (existing) { - update_existing_head_ref(&existing->node, ref); + update_existing_head_ref(delayed_refs, &existing->node, ref); /* * we've updated the existing ref, free the newly * allocated ref @@ -622,6 +505,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); head_ref = existing; } else { + if (is_data && count_mod < 0) + delayed_refs->pending_csums += num_bytes; delayed_refs->num_heads++; delayed_refs->num_heads_ready++; atomic_inc(&delayed_refs->num_entries); @@ -641,10 +526,10 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, int no_quota) { - struct btrfs_delayed_ref_node *existing; struct btrfs_delayed_tree_ref *full_ref; struct btrfs_delayed_ref_root *delayed_refs; u64 seq = 0; + int ret; if (action == BTRFS_ADD_DELAYED_EXTENT) action = BTRFS_ADD_DELAYED_REF; @@ -675,21 +560,14 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, trace_add_delayed_tree_ref(ref, full_ref, action); - spin_lock(&head_ref->lock); - existing = tree_insert(&head_ref->ref_root, &ref->rb_node); - if (existing) { - update_existing_ref(trans, delayed_refs, head_ref, existing, - ref); - /* - * we've updated the existing ref, free the newly - * allocated ref - */ + ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref); + + /* + * XXX: memory should be freed at the same level allocated. + * But bad practice is anywhere... Follow it now. Need cleanup. + */ + if (ret > 0) kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref); - } else { - atomic_inc(&delayed_refs->num_entries); - trans->delayed_ref_updates++; - } - spin_unlock(&head_ref->lock); } /* @@ -703,10 +581,10 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, u64 num_bytes, u64 parent, u64 ref_root, u64 owner, u64 offset, int action, int no_quota) { - struct btrfs_delayed_ref_node *existing; struct btrfs_delayed_data_ref *full_ref; struct btrfs_delayed_ref_root *delayed_refs; u64 seq = 0; + int ret; if (action == BTRFS_ADD_DELAYED_EXTENT) action = BTRFS_ADD_DELAYED_REF; @@ -740,21 +618,10 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, trace_add_delayed_data_ref(ref, full_ref, action); - spin_lock(&head_ref->lock); - existing = tree_insert(&head_ref->ref_root, &ref->rb_node); - if (existing) { - update_existing_ref(trans, delayed_refs, head_ref, existing, - ref); - /* - * we've updated the existing ref, free the newly - * allocated ref - */ + ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref); + + if (ret > 0) kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref); - } else { - atomic_inc(&delayed_refs->num_entries); - trans->delayed_ref_updates++; - } - spin_unlock(&head_ref->lock); } /* @@ -772,6 +639,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_tree_ref *ref; struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_qgroup_extent_record *record = NULL; if (!is_fstree(ref_root) || !fs_info->quota_enabled) no_quota = 0; @@ -782,9 +650,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, return -ENOMEM; head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); - if (!head_ref) { - kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); - return -ENOMEM; + if (!head_ref) + goto free_ref; + + if (fs_info->quota_enabled && is_fstree(ref_root)) { + record = kmalloc(sizeof(*record), GFP_NOFS); + if (!record) + goto free_head_ref; } 
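The total_ref_mod bookkeeping added above (update_existing_head_ref and add_delayed_ref_head) adjusts delayed_refs->pending_csums only when a data extent's net ref count crosses zero: csum items need deleting exactly when the extent becomes a net free, and that work must be un-counted if later refs cancel the free out. A simplified sketch of the sign-crossing rule; the structures are illustrative stand-ins, not the kernel's.

    #include <stdio.h>

    struct head_ref {
            long long total_ref_mod;        /* net effect of queued refs */
            unsigned long long num_bytes;
            int is_data;
    };

    static unsigned long long pending_csums;

    static void apply_ref_mod(struct head_ref *h, int mod)
    {
            long long old = h->total_ref_mod;

            h->total_ref_mod += mod;
            if (!h->is_data)
                    return;                 /* only data extents carry csums */
            if (h->total_ref_mod < 0 && old >= 0)
                    pending_csums += h->num_bytes;  /* net free: csums to drop */
            if (h->total_ref_mod >= 0 && old < 0)
                    pending_csums -= h->num_bytes;  /* free cancelled again */
    }

    int main(void)
    {
            struct head_ref h = { 1, 4096, 1 };

            apply_ref_mod(&h, -2);  /* 1 -> -1: crosses zero, accounted */
            apply_ref_mod(&h, -1);  /* -1 -> -2: already counted, no-op */
            printf("pending_csums = %llu\n", pending_csums);
            return 0;
    }

This is why the merge path only touches pending_csums on a sign change rather than on every ref_mod update: intermediate updates to an already-negative (or already-positive) head leave the csum work unchanged.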
head_ref->extent_op = extent_op; @@ -796,7 +668,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, * insert both the head node and the new ref without dropping * the spin lock */ - head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, + head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, bytenr, num_bytes, action, 0); add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr, @@ -805,6 +677,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, spin_unlock(&delayed_refs->lock); return 0; + +free_head_ref: + kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); +free_ref: + kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); + + return -ENOMEM; } /* @@ -821,6 +700,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_data_ref *ref; struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_qgroup_extent_record *record = NULL; if (!is_fstree(ref_root) || !fs_info->quota_enabled) no_quota = 0; @@ -836,6 +716,16 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, return -ENOMEM; } + if (fs_info->quota_enabled && is_fstree(ref_root)) { + record = kmalloc(sizeof(*record), GFP_NOFS); + if (!record) { + kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); + kmem_cache_free(btrfs_delayed_ref_head_cachep, + head_ref); + return -ENOMEM; + } + } + head_ref->extent_op = extent_op; delayed_refs = &trans->transaction->delayed_refs; @@ -845,7 +735,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, * insert both the head node and the new ref without dropping * the spin lock */ - head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, + head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, bytenr, num_bytes, action, 1); add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr, @@ -873,9 +763,9 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, - num_bytes, BTRFS_UPDATE_DELAYED_HEAD, - extent_op->is_data); + add_delayed_ref_head(fs_info, trans, &head_ref->node, NULL, bytenr, + num_bytes, BTRFS_UPDATE_DELAYED_HEAD, + extent_op->is_data); spin_unlock(&delayed_refs->lock); return 0; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index a764e2340d48..13fb5e6090fe 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -24,9 +24,25 @@ #define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */ #define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */ +/* + * XXX: Qu: I really hate the design that ref_head and tree/data ref shares the + * same ref_node structure. + * Ref_head is in a higher logic level than tree/data ref, and duplicated + * bytenr/num_bytes in ref_node is really a waste or memory, they should be + * referred from ref_head. + * This gets more disgusting after we use list to store tree/data ref in + * ref_head. Must clean this mess up later. + */ struct btrfs_delayed_ref_node { + /* + * ref_head use rb tree, stored in ref_root->href. + * indexed by bytenr + */ struct rb_node rb_node; + /*data/tree ref use list, stored in ref_head->ref_list. 
*/ + struct list_head list; + /* the starting bytenr of the extent */ u64 bytenr; @@ -83,11 +99,19 @@ struct btrfs_delayed_ref_head { struct mutex mutex; spinlock_t lock; - struct rb_root ref_root; + struct list_head ref_list; struct rb_node href_node; struct btrfs_delayed_extent_op *extent_op; + + /* + * This is used to track the final ref_mod from all the refs associated + * with this head ref, this is not adjusted as delayed refs are run, + * this is meant to track if we need to do the csum accounting or not. + */ + int total_ref_mod; + /* * when a new extent is allocated, it is just reserved in memory * The actual extent isn't inserted into the extent allocation tree @@ -124,6 +148,9 @@ struct btrfs_delayed_ref_root { /* head ref rbtree */ struct rb_root href_root; + /* dirty extent records */ + struct rb_root dirty_extent_root; + /* this spin lock protects the rbtree and the entries inside */ spinlock_t lock; @@ -138,6 +165,8 @@ struct btrfs_delayed_ref_root { /* total number of head nodes ready for processing */ unsigned long num_heads_ready; + u64 pending_csums; + /* * set when the tree is flushing before a transaction commit, * used by the throttling code to decide if new updates need @@ -146,6 +175,14 @@ struct btrfs_delayed_ref_root { int flushing; u64 run_delayed_start; + + /* + * To make qgroup to skip given root. + * This is for snapshot, as btrfs_qgroup_inherit() will manully + * modify counters for snapshot and its source, so we should skip + * the snapshot in new_root/old_roots or it will get calculated twice + */ + u64 qgroup_to_skip; }; extern struct kmem_cache *btrfs_delayed_ref_head_cachep; diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 5ec03d999c37..564a7de17d99 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -376,6 +376,10 @@ int btrfs_dev_replace_start(struct btrfs_root *root, WARN_ON(!tgt_device); dev_replace->tgtdev = tgt_device; + ret = btrfs_kobj_add_device(tgt_device->fs_devices, tgt_device); + if (ret) + btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret); + printk_in_rcu(KERN_INFO "BTRFS: dev_replace from %s (devid %llu) to %s started\n", src_device->missing ? "<missing disk>" : @@ -583,8 +587,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&uuid_mutex); /* replace the sysfs entry */ - btrfs_kobj_rm_device(fs_info, src_device); - btrfs_kobj_add_device(fs_info, tgt_device); + btrfs_kobj_rm_device(fs_info->fs_devices, src_device); btrfs_rm_dev_replace_free_srcdev(fs_info, src_device); /* write back the superblocks */ @@ -670,8 +673,8 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: srcdev = dev_replace->srcdev; - args->status.progress_1000 = div64_u64(dev_replace->cursor_left, - div64_u64(btrfs_device_get_total_bytes(srcdev), 1000)); + args->status.progress_1000 = div_u64(dev_replace->cursor_left, + div_u64(btrfs_device_get_total_bytes(srcdev), 1000)); break; } btrfs_dev_replace_unlock(dev_replace); @@ -806,7 +809,7 @@ static int btrfs_dev_replace_kthread(void *data) btrfs_dev_replace_status(fs_info, status_args); progress = status_args->status.progress_1000; kfree(status_args); - do_div(progress, 10); + progress = div_u64(progress, 10); printk_in_rcu(KERN_INFO "BTRFS: continuing dev_replace from %s (devid %llu) to %s @%u%%\n", dev_replace->srcdev->missing ? 
"<missing disk>" : diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 639f2663ed3f..f556c3732c2c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -54,7 +54,7 @@ #include <asm/cpufeature.h> #endif -static struct extent_io_ops btree_extent_io_ops; +static const struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); static void free_fs_root(struct btrfs_root *root); static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, @@ -274,10 +274,11 @@ void btrfs_csum_final(u32 crc, char *result) * compute the csum for a btree block, and either verify it or write it * into the csum field of the block. */ -static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, +static int csum_tree_block(struct btrfs_fs_info *fs_info, + struct extent_buffer *buf, int verify) { - u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); + u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); char *result = NULL; unsigned long len; unsigned long cur_len; @@ -302,7 +303,7 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, offset += cur_len; } if (csum_size > sizeof(inline_result)) { - result = kzalloc(csum_size * sizeof(char), GFP_NOFS); + result = kzalloc(csum_size, GFP_NOFS); if (!result) return 1; } else { @@ -321,7 +322,7 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, printk_ratelimited(KERN_WARNING "BTRFS: %s checksum verify failed on %llu wanted %X found %X " "level %d\n", - root->fs_info->sb->s_id, buf->start, + fs_info->sb->s_id, buf->start, val, found, btrfs_header_level(buf)); if (result != (char *)&inline_result) kfree(result); @@ -418,12 +419,6 @@ static int btrfs_check_super_csum(char *raw_disk_sb) if (memcmp(raw_disk_sb, result, csum_size)) ret = 1; - - if (ret && btrfs_super_generation(disk_sb) < 10) { - printk(KERN_WARNING - "BTRFS: super block crcs don't match, older mkfs detected\n"); - ret = 0; - } } if (csum_type >= ARRAY_SIZE(btrfs_csum_sizes)) { @@ -501,7 +496,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, * we only fill in the checksum field in the first page of a multi-page block */ -static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) +static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page) { u64 start = page_offset(page); u64 found_start; @@ -513,14 +508,14 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) found_start = btrfs_header_bytenr(eb); if (WARN_ON(found_start != start || !PageUptodate(page))) return 0; - csum_tree_block(root, eb, 0); + csum_tree_block(fs_info, eb, 0); return 0; } -static int check_tree_block_fsid(struct btrfs_root *root, +static int check_tree_block_fsid(struct btrfs_fs_info *fs_info, struct extent_buffer *eb) { - struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; u8 fsid[BTRFS_UUID_SIZE]; int ret = 1; @@ -640,7 +635,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, ret = -EIO; goto err; } - if (check_tree_block_fsid(root, eb)) { + if (check_tree_block_fsid(root->fs_info, eb)) { printk_ratelimited(KERN_ERR "BTRFS (device %s): bad fsid on block %llu\n", eb->fs_info->sb->s_id, eb->start); ret = -EIO; @@ -657,7 +652,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb, found_level); - ret = csum_tree_block(root, eb, 1); + ret = 
csum_tree_block(root->fs_info, eb, 1); if (ret) { ret = -EIO; goto err; @@ -882,7 +877,7 @@ static int btree_csum_one_bio(struct bio *bio) bio_for_each_segment_all(bvec, bio, i) { root = BTRFS_I(bvec->bv_page->mapping->host)->root; - ret = csum_dirty_buffer(root, bvec->bv_page); + ret = csum_dirty_buffer(root->fs_info, bvec->bv_page); if (ret) break; } @@ -1119,10 +1114,10 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, return 0; } -struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, +struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr) { - return find_extent_buffer(root->fs_info, bytenr); + return find_extent_buffer(fs_info, bytenr); } struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, @@ -1154,22 +1149,21 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, buf = btrfs_find_create_tree_block(root, bytenr); if (!buf) - return NULL; + return ERR_PTR(-ENOMEM); ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); if (ret) { free_extent_buffer(buf); - return NULL; + return ERR_PTR(ret); } return buf; } -void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, +void clean_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, struct extent_buffer *buf) { - struct btrfs_fs_info *fs_info = root->fs_info; - if (btrfs_header_generation(buf) == fs_info->running_transaction->transid) { btrfs_assert_tree_locked(buf); @@ -1515,20 +1509,19 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, generation = btrfs_root_generation(&root->root_item); root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), generation); - if (!root->node) { - ret = -ENOMEM; + if (IS_ERR(root->node)) { + ret = PTR_ERR(root->node); goto find_fail; } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) { ret = -EIO; - goto read_fail; + free_extent_buffer(root->node); + goto find_fail; } root->commit_root = btrfs_root_node(root); out: btrfs_free_path(path); return root; -read_fail: - free_extent_buffer(root->node); find_fail: kfree(root); alloc_fail: @@ -1751,13 +1744,14 @@ static void end_workqueue_fn(struct btrfs_work *work) bio->bi_private = end_io_wq->private; bio->bi_end_io = end_io_wq->end_io; kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq); - bio_endio_nodec(bio, error); + bio_endio(bio, error); } static int cleaner_kthread(void *arg) { struct btrfs_root *root = arg; int again; + struct btrfs_trans_handle *trans; do { again = 0; @@ -1779,7 +1773,6 @@ static int cleaner_kthread(void *arg) } btrfs_run_delayed_iputs(root); - btrfs_delete_unused_bgs(root->fs_info); again = btrfs_clean_one_deleted_snapshot(root); mutex_unlock(&root->fs_info->cleaner_mutex); @@ -1788,6 +1781,16 @@ static int cleaner_kthread(void *arg) * needn't do anything special here. */ btrfs_run_defrag_inodes(root->fs_info); + + /* + * Acquires fs_info->delete_unused_bgs_mutex to avoid racing + * with relocation (btrfs_relocate_chunk) and relocation + * acquires fs_info->cleaner_mutex (btrfs_relocate_block_group) + * after acquiring fs_info->delete_unused_bgs_mutex. So we + * can't hold, nor need to, fs_info->cleaner_mutex when deleting + * unused block groups. 
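+ * Relocation's lock order is therefore delete_unused_bgs_mutex -> + * cleaner_mutex; taking them in the opposite order here would be an + * ABBA deadlock.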
+ */ + btrfs_delete_unused_bgs(root->fs_info); sleep: if (!try_to_freeze() && !again) { set_current_state(TASK_INTERRUPTIBLE); @@ -1796,6 +1799,34 @@ sleep: __set_current_state(TASK_RUNNING); } } while (!kthread_should_stop()); + + /* + * Transaction kthread is stopped before us and wakes us up. + * However we might have started a new transaction and COWed some + * tree blocks when deleting unused block groups for example. So + * make sure we commit the transaction we started to have a clean + * shutdown when evicting the btree inode - if it has dirty pages + * when we do the final iput() on it, eviction will trigger a + * writeback for it which will fail with null pointer dereferences + * since work queues and other resources were already released and + * destroyed by the time the iput/eviction/writeback is made. + */ + trans = btrfs_attach_transaction(root); + if (IS_ERR(trans)) { + if (PTR_ERR(trans) != -ENOENT) + btrfs_err(root->fs_info, + "cleaner transaction attach returned %ld", + PTR_ERR(trans)); + } else { + int ret; + + ret = btrfs_commit_transaction(trans, root); + if (ret) + btrfs_err(root->fs_info, + "cleaner open transaction commit returned %d", + ret); + } + return 0; } @@ -2146,6 +2177,271 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) } } +static void btrfs_init_scrub(struct btrfs_fs_info *fs_info) +{ + mutex_init(&fs_info->scrub_lock); + atomic_set(&fs_info->scrubs_running, 0); + atomic_set(&fs_info->scrub_pause_req, 0); + atomic_set(&fs_info->scrubs_paused, 0); + atomic_set(&fs_info->scrub_cancel_req, 0); + init_waitqueue_head(&fs_info->scrub_pause_wait); + fs_info->scrub_workers_refcnt = 0; +} + +static void btrfs_init_balance(struct btrfs_fs_info *fs_info) +{ + spin_lock_init(&fs_info->balance_lock); + mutex_init(&fs_info->balance_mutex); + atomic_set(&fs_info->balance_running, 0); + atomic_set(&fs_info->balance_pause_req, 0); + atomic_set(&fs_info->balance_cancel_req, 0); + fs_info->balance_ctl = NULL; + init_waitqueue_head(&fs_info->balance_wait_q); +} + +static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info, + struct btrfs_root *tree_root) +{ + fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID; + set_nlink(fs_info->btree_inode, 1); + /* + * we set the i_size on the btree inode to the max possible int. 
+ * the real end of the address space is determined by all of + * the devices in the system + */ + fs_info->btree_inode->i_size = OFFSET_MAX; + fs_info->btree_inode->i_mapping->a_ops = &btree_aops; + + RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node); + extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree, + fs_info->btree_inode->i_mapping); + BTRFS_I(fs_info->btree_inode)->io_tree.track_uptodate = 0; + extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree); + + BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops; + + BTRFS_I(fs_info->btree_inode)->root = tree_root; + memset(&BTRFS_I(fs_info->btree_inode)->location, 0, + sizeof(struct btrfs_key)); + set_bit(BTRFS_INODE_DUMMY, + &BTRFS_I(fs_info->btree_inode)->runtime_flags); + btrfs_insert_inode_hash(fs_info->btree_inode); +} + +static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info) +{ + fs_info->dev_replace.lock_owner = 0; + atomic_set(&fs_info->dev_replace.nesting_level, 0); + mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount); + mutex_init(&fs_info->dev_replace.lock_management_lock); + mutex_init(&fs_info->dev_replace.lock); + init_waitqueue_head(&fs_info->replace_wait); +} + +static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info) +{ + spin_lock_init(&fs_info->qgroup_lock); + mutex_init(&fs_info->qgroup_ioctl_lock); + fs_info->qgroup_tree = RB_ROOT; + fs_info->qgroup_op_tree = RB_ROOT; + INIT_LIST_HEAD(&fs_info->dirty_qgroups); + fs_info->qgroup_seq = 1; + fs_info->quota_enabled = 0; + fs_info->pending_quota_state = 0; + fs_info->qgroup_ulist = NULL; + mutex_init(&fs_info->qgroup_rescan_lock); +} + +static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, + struct btrfs_fs_devices *fs_devices) +{ + int max_active = fs_info->thread_pool_size; + unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND; + + fs_info->workers = + btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI, + max_active, 16); + + fs_info->delalloc_workers = + btrfs_alloc_workqueue("delalloc", flags, max_active, 2); + + fs_info->flush_workers = + btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0); + + fs_info->caching_workers = + btrfs_alloc_workqueue("cache", flags, max_active, 0); + + /* + * a higher idle thresh on the submit workers makes it much more + * likely that bios will be sent down in a sane order to the + * devices + */ + fs_info->submit_workers = + btrfs_alloc_workqueue("submit", flags, + min_t(u64, fs_devices->num_devices, + max_active), 64); + + fs_info->fixup_workers = + btrfs_alloc_workqueue("fixup", flags, 1, 0); + + /* + * endios are largely parallel and should have a very + * low idle thresh + */ + fs_info->endio_workers = + btrfs_alloc_workqueue("endio", flags, max_active, 4); + fs_info->endio_meta_workers = + btrfs_alloc_workqueue("endio-meta", flags, max_active, 4); + fs_info->endio_meta_write_workers = + btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2); + fs_info->endio_raid56_workers = + btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4); + fs_info->endio_repair_workers = + btrfs_alloc_workqueue("endio-repair", flags, 1, 0); + fs_info->rmw_workers = + btrfs_alloc_workqueue("rmw", flags, max_active, 2); + fs_info->endio_write_workers = + btrfs_alloc_workqueue("endio-write", flags, max_active, 2); + fs_info->endio_freespace_worker = + btrfs_alloc_workqueue("freespace-write", flags, max_active, 0); + fs_info->delayed_workers = + btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0); + fs_info->readahead_workers = 
btrfs_alloc_workqueue("readahead", flags, max_active, 2); + fs_info->qgroup_rescan_workers = + btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0); + fs_info->extent_workers = + btrfs_alloc_workqueue("extent-refs", flags, + min_t(u64, fs_devices->num_devices, + max_active), 8); + + if (!(fs_info->workers && fs_info->delalloc_workers && + fs_info->submit_workers && fs_info->flush_workers && + fs_info->endio_workers && fs_info->endio_meta_workers && + fs_info->endio_meta_write_workers && + fs_info->endio_repair_workers && + fs_info->endio_write_workers && fs_info->endio_raid56_workers && + fs_info->endio_freespace_worker && fs_info->rmw_workers && + fs_info->caching_workers && fs_info->readahead_workers && + fs_info->fixup_workers && fs_info->delayed_workers && + fs_info->extent_workers && + fs_info->qgroup_rescan_workers)) { + return -ENOMEM; + } + + return 0; +} + +static int btrfs_replay_log(struct btrfs_fs_info *fs_info, + struct btrfs_fs_devices *fs_devices) +{ + int ret; + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *log_tree_root; + struct btrfs_super_block *disk_super = fs_info->super_copy; + u64 bytenr = btrfs_super_log_root(disk_super); + + if (fs_devices->rw_devices == 0) { + printk(KERN_WARNING "BTRFS: log replay required " + "on RO media\n"); + return -EIO; + } + + log_tree_root = btrfs_alloc_root(fs_info); + if (!log_tree_root) + return -ENOMEM; + + __setup_root(tree_root->nodesize, tree_root->sectorsize, + tree_root->stripesize, log_tree_root, fs_info, + BTRFS_TREE_LOG_OBJECTID); + + log_tree_root->node = read_tree_block(tree_root, bytenr, + fs_info->generation + 1); + if (IS_ERR(log_tree_root->node)) { + printk(KERN_ERR "BTRFS: failed to read log tree\n"); + ret = PTR_ERR(log_tree_root->node); + kfree(log_tree_root); + return ret; + } else if (!extent_buffer_uptodate(log_tree_root->node)) { + printk(KERN_ERR "BTRFS: failed to read log tree\n"); + free_extent_buffer(log_tree_root->node); + kfree(log_tree_root); + return -EIO; + } + /* returns with log_tree_root freed on success */ + ret = btrfs_recover_log_trees(log_tree_root); + if (ret) { + btrfs_error(tree_root->fs_info, ret, + "Failed to recover log tree"); + free_extent_buffer(log_tree_root->node); + kfree(log_tree_root); + return ret; + } + + if (fs_info->sb->s_flags & MS_RDONLY) { + ret = btrfs_commit_super(tree_root); + if (ret) + return ret; + } + + return 0; +} + +static int btrfs_read_roots(struct btrfs_fs_info *fs_info, + struct btrfs_root *tree_root) +{ + struct btrfs_root *root; + struct btrfs_key location; + int ret; + + location.objectid = BTRFS_EXTENT_TREE_OBJECTID; + location.type = BTRFS_ROOT_ITEM_KEY; + location.offset = 0; + + root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(root)) + return PTR_ERR(root); + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->extent_root = root; + + location.objectid = BTRFS_DEV_TREE_OBJECTID; + root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(root)) + return PTR_ERR(root); + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->dev_root = root; + btrfs_init_devices_late(fs_info); + + location.objectid = BTRFS_CSUM_TREE_OBJECTID; + root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(root)) + return PTR_ERR(root); + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->csum_root = root; + + location.objectid = BTRFS_QUOTA_TREE_OBJECTID; + root = btrfs_read_tree_root(tree_root, &location); + if (!IS_ERR(root)) { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->quota_enabled = 1; + 
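+ /* unlike the trees above, a missing quota tree is not an error: quotas simply stay disabled */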
fs_info->pending_quota_state = 1; + fs_info->quota_root = root; + } + + location.objectid = BTRFS_UUID_TREE_OBJECTID; + root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + if (ret != -ENOENT) + return ret; + } else { + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->uuid_root = root; + } + + return 0; +} + int open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, char *options) @@ -2160,21 +2456,12 @@ int open_ctree(struct super_block *sb, struct btrfs_super_block *disk_super; struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *tree_root; - struct btrfs_root *extent_root; - struct btrfs_root *csum_root; struct btrfs_root *chunk_root; - struct btrfs_root *dev_root; - struct btrfs_root *quota_root; - struct btrfs_root *uuid_root; - struct btrfs_root *log_tree_root; int ret; int err = -EINVAL; int num_backups_tried = 0; int backup_index = 0; int max_active; - int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND; - bool create_uuid_tree; - bool check_uuid_tree; tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info); chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info); @@ -2241,13 +2528,14 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->qgroup_op_lock); spin_lock_init(&fs_info->buffer_lock); spin_lock_init(&fs_info->unused_bgs_lock); - mutex_init(&fs_info->unused_bg_unpin_mutex); rwlock_init(&fs_info->tree_mod_log_lock); + mutex_init(&fs_info->unused_bg_unpin_mutex); + mutex_init(&fs_info->delete_unused_bgs_mutex); mutex_init(&fs_info->reloc_mutex); mutex_init(&fs_info->delalloc_root_mutex); seqlock_init(&fs_info->profiles_lock); + init_rwsem(&fs_info->delayed_iput_sem); - init_completion(&fs_info->kobj_unregister); INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); INIT_LIST_HEAD(&fs_info->space_info); INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); @@ -2276,7 +2564,7 @@ int open_ctree(struct super_block *sb, fs_info->free_chunk_space = 0; fs_info->tree_mod_log = RB_ROOT; fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; - fs_info->avg_delayed_ref_runtime = div64_u64(NSEC_PER_SEC, 64); + fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */ /* readahead state */ INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT); spin_lock_init(&fs_info->reada_lock); @@ -2294,55 +2582,18 @@ int open_ctree(struct super_block *sb, } btrfs_init_delayed_root(fs_info->delayed_root); - mutex_init(&fs_info->scrub_lock); - atomic_set(&fs_info->scrubs_running, 0); - atomic_set(&fs_info->scrub_pause_req, 0); - atomic_set(&fs_info->scrubs_paused, 0); - atomic_set(&fs_info->scrub_cancel_req, 0); - init_waitqueue_head(&fs_info->replace_wait); - init_waitqueue_head(&fs_info->scrub_pause_wait); - fs_info->scrub_workers_refcnt = 0; + btrfs_init_scrub(fs_info); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY fs_info->check_integrity_print_mask = 0; #endif - - spin_lock_init(&fs_info->balance_lock); - mutex_init(&fs_info->balance_mutex); - atomic_set(&fs_info->balance_running, 0); - atomic_set(&fs_info->balance_pause_req, 0); - atomic_set(&fs_info->balance_cancel_req, 0); - fs_info->balance_ctl = NULL; - init_waitqueue_head(&fs_info->balance_wait_q); + btrfs_init_balance(fs_info); btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work); sb->s_blocksize = 4096; sb->s_blocksize_bits = blksize_bits(4096); sb->s_bdi = &fs_info->bdi; - fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID; - set_nlink(fs_info->btree_inode, 1); - /* - * we set the i_size on the btree inode to the max 
possible int. - * the real end of the address space is determined by all of - * the devices in the system - */ - fs_info->btree_inode->i_size = OFFSET_MAX; - fs_info->btree_inode->i_mapping->a_ops = &btree_aops; - - RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node); - extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree, - fs_info->btree_inode->i_mapping); - BTRFS_I(fs_info->btree_inode)->io_tree.track_uptodate = 0; - extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree); - - BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops; - - BTRFS_I(fs_info->btree_inode)->root = tree_root; - memset(&BTRFS_I(fs_info->btree_inode)->location, 0, - sizeof(struct btrfs_key)); - set_bit(BTRFS_INODE_DUMMY, - &BTRFS_I(fs_info->btree_inode)->runtime_flags); - btrfs_insert_inode_hash(fs_info->btree_inode); + btrfs_init_btree_inode(fs_info, tree_root); spin_lock_init(&fs_info->block_group_cache_lock); fs_info->block_group_cache_tree = RB_ROOT; @@ -2363,26 +2614,14 @@ int open_ctree(struct super_block *sb, mutex_init(&fs_info->transaction_kthread_mutex); mutex_init(&fs_info->cleaner_mutex); mutex_init(&fs_info->volume_mutex); + mutex_init(&fs_info->ro_block_group_mutex); init_rwsem(&fs_info->commit_root_sem); init_rwsem(&fs_info->cleanup_work_sem); init_rwsem(&fs_info->subvol_sem); sema_init(&fs_info->uuid_tree_rescan_sem, 1); - fs_info->dev_replace.lock_owner = 0; - atomic_set(&fs_info->dev_replace.nesting_level, 0); - mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount); - mutex_init(&fs_info->dev_replace.lock_management_lock); - mutex_init(&fs_info->dev_replace.lock); - spin_lock_init(&fs_info->qgroup_lock); - mutex_init(&fs_info->qgroup_ioctl_lock); - fs_info->qgroup_tree = RB_ROOT; - fs_info->qgroup_op_tree = RB_ROOT; - INIT_LIST_HEAD(&fs_info->dirty_qgroups); - fs_info->qgroup_seq = 1; - fs_info->quota_enabled = 0; - fs_info->pending_quota_state = 0; - fs_info->qgroup_ulist = NULL; - mutex_init(&fs_info->qgroup_rescan_lock); + btrfs_init_dev_replace_locks(fs_info); + btrfs_init_qgroup(fs_info); btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); btrfs_init_free_cluster(&fs_info->data_alloc_cluster); @@ -2554,75 +2793,9 @@ int open_ctree(struct super_block *sb, max_active = fs_info->thread_pool_size; - fs_info->workers = - btrfs_alloc_workqueue("worker", flags | WQ_HIGHPRI, - max_active, 16); - - fs_info->delalloc_workers = - btrfs_alloc_workqueue("delalloc", flags, max_active, 2); - - fs_info->flush_workers = - btrfs_alloc_workqueue("flush_delalloc", flags, max_active, 0); - - fs_info->caching_workers = - btrfs_alloc_workqueue("cache", flags, max_active, 0); - - /* - * a higher idle thresh on the submit workers makes it much more - * likely that bios will be send down in a sane order to the - * devices - */ - fs_info->submit_workers = - btrfs_alloc_workqueue("submit", flags, - min_t(u64, fs_devices->num_devices, - max_active), 64); - - fs_info->fixup_workers = - btrfs_alloc_workqueue("fixup", flags, 1, 0); - - /* - * endios are largely parallel and should have a very - * low idle thresh - */ - fs_info->endio_workers = - btrfs_alloc_workqueue("endio", flags, max_active, 4); - fs_info->endio_meta_workers = - btrfs_alloc_workqueue("endio-meta", flags, max_active, 4); - fs_info->endio_meta_write_workers = - btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2); - fs_info->endio_raid56_workers = - btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4); - fs_info->endio_repair_workers = - btrfs_alloc_workqueue("endio-repair", flags, 1, 0); - 
fs_info->rmw_workers = - btrfs_alloc_workqueue("rmw", flags, max_active, 2); - fs_info->endio_write_workers = - btrfs_alloc_workqueue("endio-write", flags, max_active, 2); - fs_info->endio_freespace_worker = - btrfs_alloc_workqueue("freespace-write", flags, max_active, 0); - fs_info->delayed_workers = - btrfs_alloc_workqueue("delayed-meta", flags, max_active, 0); - fs_info->readahead_workers = - btrfs_alloc_workqueue("readahead", flags, max_active, 2); - fs_info->qgroup_rescan_workers = - btrfs_alloc_workqueue("qgroup-rescan", flags, 1, 0); - fs_info->extent_workers = - btrfs_alloc_workqueue("extent-refs", flags, - min_t(u64, fs_devices->num_devices, - max_active), 8); - - if (!(fs_info->workers && fs_info->delalloc_workers && - fs_info->submit_workers && fs_info->flush_workers && - fs_info->endio_workers && fs_info->endio_meta_workers && - fs_info->endio_meta_write_workers && - fs_info->endio_repair_workers && - fs_info->endio_write_workers && fs_info->endio_raid56_workers && - fs_info->endio_freespace_worker && fs_info->rmw_workers && - fs_info->caching_workers && fs_info->readahead_workers && - fs_info->fixup_workers && fs_info->delayed_workers && - fs_info->extent_workers && - fs_info->qgroup_rescan_workers)) { - err = -ENOMEM; + ret = btrfs_init_workqueues(fs_info, fs_devices); + if (ret) { + err = ret; goto fail_sb_buffer; } @@ -2665,10 +2838,11 @@ int open_ctree(struct super_block *sb, chunk_root->node = read_tree_block(chunk_root, btrfs_super_chunk_root(disk_super), generation); - if (!chunk_root->node || - !test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { + if (IS_ERR(chunk_root->node) || + !extent_buffer_uptodate(chunk_root->node)) { printk(KERN_ERR "BTRFS: failed to read chunk root on %s\n", sb->s_id); + chunk_root->node = NULL; goto fail_tree_roots; } btrfs_set_root_node(&chunk_root->root_item, chunk_root->node); @@ -2688,7 +2862,7 @@ int open_ctree(struct super_block *sb, * keep the device that is marked to be the target device for the * dev_replace procedure */ - btrfs_close_extra_devices(fs_info, fs_devices, 0); + btrfs_close_extra_devices(fs_devices, 0); if (!fs_devices->latest_bdev) { printk(KERN_ERR "BTRFS: failed to read devices on %s\n", @@ -2702,11 +2876,11 @@ retry_root_backup: tree_root->node = read_tree_block(tree_root, btrfs_super_root(disk_super), generation); - if (!tree_root->node || - !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) { + if (IS_ERR(tree_root->node) || + !extent_buffer_uptodate(tree_root->node)) { printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n", sb->s_id); - + tree_root->node = NULL; goto recovery_tree_root; } @@ -2714,61 +2888,9 @@ retry_root_backup: tree_root->commit_root = btrfs_root_node(tree_root); btrfs_set_root_refs(&tree_root->root_item, 1); - location.objectid = BTRFS_EXTENT_TREE_OBJECTID; - location.type = BTRFS_ROOT_ITEM_KEY; - location.offset = 0; - - extent_root = btrfs_read_tree_root(tree_root, &location); - if (IS_ERR(extent_root)) { - ret = PTR_ERR(extent_root); - goto recovery_tree_root; - } - set_bit(BTRFS_ROOT_TRACK_DIRTY, &extent_root->state); - fs_info->extent_root = extent_root; - - location.objectid = BTRFS_DEV_TREE_OBJECTID; - dev_root = btrfs_read_tree_root(tree_root, &location); - if (IS_ERR(dev_root)) { - ret = PTR_ERR(dev_root); - goto recovery_tree_root; - } - set_bit(BTRFS_ROOT_TRACK_DIRTY, &dev_root->state); - fs_info->dev_root = dev_root; - btrfs_init_devices_late(fs_info); - - location.objectid = BTRFS_CSUM_TREE_OBJECTID; - csum_root = btrfs_read_tree_root(tree_root, 
&location); - if (IS_ERR(csum_root)) { - ret = PTR_ERR(csum_root); + ret = btrfs_read_roots(fs_info, tree_root); + if (ret) goto recovery_tree_root; - } - set_bit(BTRFS_ROOT_TRACK_DIRTY, &csum_root->state); - fs_info->csum_root = csum_root; - - location.objectid = BTRFS_QUOTA_TREE_OBJECTID; - quota_root = btrfs_read_tree_root(tree_root, &location); - if (!IS_ERR(quota_root)) { - set_bit(BTRFS_ROOT_TRACK_DIRTY, &quota_root->state); - fs_info->quota_enabled = 1; - fs_info->pending_quota_state = 1; - fs_info->quota_root = quota_root; - } - - location.objectid = BTRFS_UUID_TREE_OBJECTID; - uuid_root = btrfs_read_tree_root(tree_root, &location); - if (IS_ERR(uuid_root)) { - ret = PTR_ERR(uuid_root); - if (ret != -ENOENT) - goto recovery_tree_root; - create_uuid_tree = true; - check_uuid_tree = false; - } else { - set_bit(BTRFS_ROOT_TRACK_DIRTY, &uuid_root->state); - fs_info->uuid_root = uuid_root; - create_uuid_tree = false; - check_uuid_tree = - generation != btrfs_super_uuid_tree_generation(disk_super); - } fs_info->generation = generation; fs_info->last_trans_committed = generation; @@ -2792,12 +2914,24 @@ retry_root_backup: goto fail_block_groups; } - btrfs_close_extra_devices(fs_info, fs_devices, 1); + btrfs_close_extra_devices(fs_devices, 1); + + ret = btrfs_sysfs_add_fsid(fs_devices, NULL); + if (ret) { + pr_err("BTRFS: failed to init sysfs fsid interface: %d\n", ret); + goto fail_block_groups; + } + + ret = btrfs_sysfs_add_device(fs_devices); + if (ret) { + pr_err("BTRFS: failed to init sysfs device interface: %d\n", ret); + goto fail_fsdev_sysfs; + } ret = btrfs_sysfs_add_one(fs_info); if (ret) { pr_err("BTRFS: failed to init sysfs interface: %d\n", ret); - goto fail_block_groups; + goto fail_fsdev_sysfs; } ret = btrfs_init_space_info(fs_info); @@ -2806,7 +2940,7 @@ retry_root_backup: goto fail_sysfs; } - ret = btrfs_read_block_groups(extent_root); + ret = btrfs_read_block_groups(fs_info->extent_root); if (ret) { printk(KERN_ERR "BTRFS: Failed to read block groups: %d\n", ret); goto fail_sysfs; @@ -2864,48 +2998,11 @@ retry_root_backup: /* do not make disk changes in broken FS */ if (btrfs_super_log_root(disk_super) != 0) { - u64 bytenr = btrfs_super_log_root(disk_super); - - if (fs_devices->rw_devices == 0) { - printk(KERN_WARNING "BTRFS: log replay required " - "on RO media\n"); - err = -EIO; - goto fail_qgroup; - } - - log_tree_root = btrfs_alloc_root(fs_info); - if (!log_tree_root) { - err = -ENOMEM; - goto fail_qgroup; - } - - __setup_root(nodesize, sectorsize, stripesize, - log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID); - - log_tree_root->node = read_tree_block(tree_root, bytenr, - generation + 1); - if (!log_tree_root->node || - !extent_buffer_uptodate(log_tree_root->node)) { - printk(KERN_ERR "BTRFS: failed to read log tree\n"); - free_extent_buffer(log_tree_root->node); - kfree(log_tree_root); - goto fail_qgroup; - } - /* returns with log_tree_root freed on success */ - ret = btrfs_recover_log_trees(log_tree_root); + ret = btrfs_replay_log(fs_info, fs_devices); if (ret) { - btrfs_error(tree_root->fs_info, ret, - "Failed to recover log tree"); - free_extent_buffer(log_tree_root->node); - kfree(log_tree_root); + err = ret; goto fail_qgroup; } - - if (sb->s_flags & MS_RDONLY) { - ret = btrfs_commit_super(tree_root); - if (ret) - goto fail_qgroup; - } } ret = btrfs_find_orphan_roots(tree_root); @@ -2966,7 +3063,7 @@ retry_root_backup: btrfs_qgroup_rescan_resume(fs_info); - if (create_uuid_tree) { + if (!fs_info->uuid_root) { pr_info("BTRFS: creating UUID tree\n"); ret = 
btrfs_create_uuid_tree(fs_info); if (ret) { @@ -2975,8 +3072,9 @@ retry_root_backup: close_ctree(tree_root); return ret; } - } else if (check_uuid_tree || - btrfs_test_opt(tree_root, RESCAN_UUID_TREE)) { + } else if (btrfs_test_opt(tree_root, RESCAN_UUID_TREE) || + fs_info->generation != + btrfs_super_uuid_tree_generation(disk_super)) { pr_info("BTRFS: checking UUID tree\n"); ret = btrfs_check_uuid_tree(fs_info); if (ret) { @@ -3011,6 +3109,9 @@ fail_cleaner: fail_sysfs: btrfs_sysfs_remove_one(fs_info); +fail_fsdev_sysfs: + btrfs_sysfs_remove_fsid(fs_info->fs_devices); + fail_block_groups: btrfs_put_block_group_cache(fs_info); btrfs_free_block_groups(fs_info); @@ -3225,11 +3326,8 @@ static int write_dev_supers(struct btrfs_device *device, */ static void btrfs_end_empty_barrier(struct bio *bio, int err) { - if (err) { - if (err == -EOPNOTSUPP) - set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); + if (err) clear_bit(BIO_UPTODATE, &bio->bi_flags); - } if (bio->bi_private) complete(bio->bi_private); bio_put(bio); @@ -3257,11 +3355,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait) wait_for_completion(&device->flush_wait); - if (bio_flagged(bio, BIO_EOPNOTSUPP)) { - printk_in_rcu("BTRFS: disabling barriers on dev %s\n", - rcu_str_deref(device->name)); - device->nobarriers = 1; - } else if (!bio_flagged(bio, BIO_UPTODATE)) { + if (!bio_flagged(bio, BIO_UPTODATE)) { ret = -EIO; btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_FLUSH_ERRS); @@ -3668,7 +3762,7 @@ void close_ctree(struct btrfs_root *root) if (!(fs_info->sb->s_flags & MS_RDONLY)) { ret = btrfs_commit_super(root); if (ret) - btrfs_err(root->fs_info, "commit super ret %d", ret); + btrfs_err(fs_info, "commit super ret %d", ret); } if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) @@ -3680,14 +3774,15 @@ void close_ctree(struct btrfs_root *root) fs_info->closing = 2; smp_mb(); - btrfs_free_qgroup_config(root->fs_info); + btrfs_free_qgroup_config(fs_info); if (percpu_counter_sum(&fs_info->delalloc_bytes)) { - btrfs_info(root->fs_info, "at unmount delalloc count %lld", + btrfs_info(fs_info, "at unmount delalloc count %lld", percpu_counter_sum(&fs_info->delalloc_bytes)); } btrfs_sysfs_remove_one(fs_info); + btrfs_sysfs_remove_fsid(fs_info->fs_devices); btrfs_free_fs_roots(fs_info); @@ -3723,7 +3818,7 @@ void close_ctree(struct btrfs_root *root) btrfs_free_stripe_hash_table(fs_info); - btrfs_free_block_rsv(root, root->orphan_block_rsv); + __btrfs_free_block_rsv(root->orphan_block_rsv); root->orphan_block_rsv = NULL; lock_chunks(root); @@ -4016,6 +4111,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, while ((node = rb_first(&delayed_refs->href_root)) != NULL) { struct btrfs_delayed_ref_head *head; + struct btrfs_delayed_ref_node *tmp; bool pin_bytes = false; head = rb_entry(node, struct btrfs_delayed_ref_head, @@ -4031,11 +4127,10 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, continue; } spin_lock(&head->lock); - while ((node = rb_first(&head->ref_root)) != NULL) { - ref = rb_entry(node, struct btrfs_delayed_ref_node, - rb_node); + list_for_each_entry_safe_reverse(ref, tmp, &head->ref_list, + list) { ref->in_tree = 0; - rb_erase(&ref->rb_node, &head->ref_root); + list_del(&ref->list); atomic_dec(&delayed_refs->num_entries); btrfs_put_delayed_ref(ref); } @@ -4134,7 +4229,7 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root, clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); while (start <= end) { - eb = btrfs_find_tree_block(root, start); + eb = 
btrfs_find_tree_block(root->fs_info, start); start += root->nodesize; if (!eb) continue; @@ -4285,7 +4380,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root) return 0; } -static struct extent_io_ops btree_extent_io_ops = { +static const struct extent_io_ops btree_extent_io_ops = { .readpage_end_io_hook = btree_readpage_end_io_hook, .readpage_io_failed_hook = btree_io_failed_hook, .submit_bio_hook = btree_submit_bio_hook, diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 27d44c0fd236..d4cbfeeeedd4 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -52,7 +52,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, u64 bytenr); void clean_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *buf); + struct btrfs_fs_info *fs_info, struct extent_buffer *buf); int open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, char *options); @@ -61,7 +61,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root, int max_mirrors); struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); int btrfs_commit_super(struct btrfs_root *root); -struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, +struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr); struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root, struct btrfs_key *location); diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 37d164540c3a..8d052209f473 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -152,7 +152,7 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, static struct dentry *btrfs_get_parent(struct dentry *child) { - struct inode *dir = child->d_inode; + struct inode *dir = d_inode(child); struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_path *path; struct extent_buffer *leaf; @@ -220,8 +220,8 @@ fail: static int btrfs_get_name(struct dentry *parent, char *name, struct dentry *child) { - struct inode *inode = child->d_inode; - struct inode *dir = parent->d_inode; + struct inode *inode = d_inode(child); + struct inode *dir = d_inode(parent); struct btrfs_path *path; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_inode_ref *iref; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 8b353ad02f03..07204bf601ed 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -79,11 +79,10 @@ static int update_block_group(struct btrfs_trans_handle *trans, u64 num_bytes, int alloc); static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, + struct btrfs_delayed_ref_node *node, u64 parent, u64 root_objectid, u64 owner_objectid, u64 owner_offset, int refs_to_drop, - struct btrfs_delayed_extent_op *extra_op, - int no_quota); + struct btrfs_delayed_extent_op *extra_op); static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, struct extent_buffer *leaf, struct btrfs_extent_item *ei); @@ -1967,10 +1966,9 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u64 num_bytes, + struct btrfs_delayed_ref_node *node, u64 parent, u64 root_objectid, u64 owner, u64 offset, int refs_to_add, - int no_quota, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_fs_info *fs_info = root->fs_info; @@ 
-1978,9 +1976,11 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_extent_item *item; struct btrfs_key key; + u64 bytenr = node->bytenr; + u64 num_bytes = node->num_bytes; u64 refs; int ret; - enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_ADD_EXCL; + int no_quota = node->no_quota; path = btrfs_alloc_path(); if (!path) @@ -1996,26 +1996,8 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, bytenr, num_bytes, parent, root_objectid, owner, offset, refs_to_add, extent_op); - if ((ret < 0 && ret != -EAGAIN) || (!ret && no_quota)) + if ((ret < 0 && ret != -EAGAIN) || !ret) goto out; - /* - * Ok we were able to insert an inline extent and it appears to be a new - * reference, deal with the qgroup accounting. - */ - if (!ret && !no_quota) { - ASSERT(root->fs_info->quota_enabled); - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - item = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_item); - if (btrfs_extent_refs(leaf, item) > (u64)refs_to_add) - type = BTRFS_QGROUP_OPER_ADD_SHARED; - btrfs_release_path(path); - - ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, - bytenr, num_bytes, type, 0); - goto out; - } /* * Ok we had -EAGAIN which means we didn't have space to insert and @@ -2026,8 +2008,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); refs = btrfs_extent_refs(leaf, item); - if (refs) - type = BTRFS_QGROUP_OPER_ADD_SHARED; btrfs_set_extent_refs(leaf, item, refs + refs_to_add); if (extent_op) __run_delayed_extent_op(extent_op, leaf, item); @@ -2035,13 +2015,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); - if (!no_quota) { - ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, - bytenr, num_bytes, type, 0); - if (ret) - goto out; - } - path->reada = 1; path->leave_spinning = 1; /* now insert the actual backref */ @@ -2087,17 +2060,15 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, ref->objectid, ref->offset, &ins, node->ref_mod); } else if (node->action == BTRFS_ADD_DELAYED_REF) { - ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, - node->num_bytes, parent, + ret = __btrfs_inc_extent_ref(trans, root, node, parent, ref_root, ref->objectid, ref->offset, node->ref_mod, - node->no_quota, extent_op); + extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { - ret = __btrfs_free_extent(trans, root, node->bytenr, - node->num_bytes, parent, + ret = __btrfs_free_extent(trans, root, node, parent, ref_root, ref->objectid, ref->offset, node->ref_mod, - extent_op, node->no_quota); + extent_op); } else { BUG(); } @@ -2255,15 +2226,14 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, ref->level, &ins, node->no_quota); } else if (node->action == BTRFS_ADD_DELAYED_REF) { - ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, - node->num_bytes, parent, ref_root, - ref->level, 0, 1, node->no_quota, + ret = __btrfs_inc_extent_ref(trans, root, node, + parent, ref_root, + ref->level, 0, 1, extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { - ret = __btrfs_free_extent(trans, root, node->bytenr, - node->num_bytes, parent, ref_root, - ref->level, 0, 1, extent_op, - node->no_quota); + ret = __btrfs_free_extent(trans, root, node, + parent, ref_root, + ref->level, 0, 1, 
extent_op); } else { BUG(); } @@ -2323,28 +2293,27 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, return ret; } -static noinline struct btrfs_delayed_ref_node * +static inline struct btrfs_delayed_ref_node * select_delayed_ref(struct btrfs_delayed_ref_head *head) { - struct rb_node *node; - struct btrfs_delayed_ref_node *ref, *last = NULL;; + struct btrfs_delayed_ref_node *ref; + + if (list_empty(&head->ref_list)) + return NULL; /* - * select delayed ref of type BTRFS_ADD_DELAYED_REF first. - * this prevents ref count from going down to zero when - * there still are pending delayed ref. + * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first. + * This prevents the ref count from going down to zero, which would delete + * the extent item from the extent tree, while there are still references + * to add; those adds would fail because they could not find the extent item. */ - node = rb_first(&head->ref_root); - while (node) { - ref = rb_entry(node, struct btrfs_delayed_ref_node, - rb_node); + list_for_each_entry(ref, &head->ref_list, list) { if (ref->action == BTRFS_ADD_DELAYED_REF) return ref; - else if (last == NULL) - last = ref; - node = rb_next(node); } - return last; + + return list_entry(head->ref_list.next, struct btrfs_delayed_ref_node, + list); } /* @@ -2396,16 +2365,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, } } - /* - * We need to try and merge add/drops of the same ref since we - * can run into issues with relocate dropping the implicit ref - * and then it being added back again before the drop can - * finish. If we merged anything we need to re-loop so we can - * get a good ref. - */ spin_lock(&locked_ref->lock); - btrfs_merge_delayed_refs(trans, fs_info, delayed_refs, - locked_ref); /* * locked_ref is the head node, so we have to go one @@ -2482,7 +2442,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, spin_unlock(&locked_ref->lock); spin_lock(&delayed_refs->lock); spin_lock(&locked_ref->lock); - if (rb_first(&locked_ref->ref_root) || + if (!list_empty(&locked_ref->ref_list) || locked_ref->extent_op) { spin_unlock(&locked_ref->lock); spin_unlock(&delayed_refs->lock); @@ -2496,7 +2456,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, } else { actual_count++; ref->in_tree = 0; - rb_erase(&ref->rb_node, &locked_ref->ref_root); + list_del(&ref->list); } atomic_dec(&delayed_refs->num_entries); @@ -2538,6 +2498,12 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, * list before we release it. */ if (btrfs_delayed_ref_is_head(ref)) { + if (locked_ref->is_data && + locked_ref->total_ref_mod < 0) { + spin_lock(&delayed_refs->lock); + delayed_refs->pending_csums -= ref->num_bytes; + spin_unlock(&delayed_refs->lock); + } btrfs_delayed_ref_unlock(locked_ref); locked_ref = NULL; } @@ -2561,8 +2527,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, */ spin_lock(&delayed_refs->lock); avg = fs_info->avg_delayed_ref_runtime * 3 + runtime; - avg = div64_u64(avg, 4); - fs_info->avg_delayed_ref_runtime = avg; + fs_info->avg_delayed_ref_runtime = avg >> 2; /* div by 4 */ spin_unlock(&delayed_refs->lock); } return 0; @@ -2624,7 +2589,26 @@ static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads) * We don't ever fill up leaves all the way so multiply by 2 just to be * closer to what we're really going to want to use. 
*/ - return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root)); + return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root)); +} + +/* + * Takes the number of bytes to be csummed and figures out how many leaves it + * would require to store the csums for that many bytes. + */ +u64 btrfs_csum_bytes_to_leaves(struct btrfs_root *root, u64 csum_bytes) +{ + u64 csum_size; + u64 num_csums_per_leaf; + u64 num_csums; + + csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item); + num_csums_per_leaf = div64_u64(csum_size, + (u64)btrfs_super_csum_size(root->fs_info->super_copy)); + num_csums = div64_u64(csum_bytes, root->sectorsize); + num_csums += num_csums_per_leaf - 1; + num_csums = div64_u64(num_csums, num_csums_per_leaf); + return num_csums; } int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, @@ -2632,7 +2616,9 @@ { struct btrfs_block_rsv *global_rsv; u64 num_heads = trans->transaction->delayed_refs.num_heads_ready; - u64 num_bytes; + u64 csum_bytes = trans->transaction->delayed_refs.pending_csums; + u64 num_dirty_bgs = trans->transaction->num_dirty_bgs; + u64 num_bytes, num_dirty_bgs_bytes; int ret = 0; num_bytes = btrfs_calc_trans_metadata_size(root, 1); @@ -2640,17 +2626,22 @@ if (num_heads > 1) num_bytes += (num_heads - 1) * root->nodesize; num_bytes <<= 1; + num_bytes += btrfs_csum_bytes_to_leaves(root, csum_bytes) * root->nodesize; + num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(root, + num_dirty_bgs); global_rsv = &root->fs_info->global_block_rsv; /* * If we can't allocate any more chunks let's make sure we have _lots_ of * wiggle room since running delayed refs can create more delayed refs. 
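+ * When the space info is full, the shifts below double both num_bytes + * and num_dirty_bgs_bytes to provide that margin.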
*/ - if (global_rsv->space_info->full) + if (global_rsv->space_info->full) { + num_dirty_bgs_bytes <<= 1; num_bytes <<= 1; + } spin_lock(&global_rsv->lock); - if (global_rsv->reserved <= num_bytes) + if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes) ret = 1; spin_unlock(&global_rsv->lock); return ret; @@ -2833,9 +2824,6 @@ again: goto again; } out: - ret = btrfs_delayed_qgroup_accounting(trans, root->fs_info); - if (ret) - return ret; assert_qgroups_uptodate(trans); return 0; } @@ -2874,7 +2862,6 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref; struct btrfs_delayed_data_ref *data_ref; struct btrfs_delayed_ref_root *delayed_refs; - struct rb_node *node; int ret = 0; delayed_refs = &trans->transaction->delayed_refs; @@ -2903,11 +2890,7 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, spin_unlock(&delayed_refs->lock); spin_lock(&head->lock); - node = rb_first(&head->ref_root); - while (node) { - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - node = rb_next(node); - + list_for_each_entry(ref, &head->ref_list, list) { /* If it's a shared ref we know a cross reference exists */ if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { ret = 1; @@ -3147,10 +3130,8 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans, bi = btrfs_item_ptr_offset(leaf, path->slots[0]); write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item)); btrfs_mark_buffer_dirty(leaf); - btrfs_release_path(path); fail: - if (ret) - btrfs_abort_transaction(trans, root, ret); + btrfs_release_path(path); return ret; } @@ -3193,7 +3174,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group, struct inode *inode = NULL; u64 alloc_hint = 0; int dcs = BTRFS_DC_ERROR; - int num_pages = 0; + u64 num_pages = 0; int retries = 0; int ret = 0; @@ -3267,15 +3248,14 @@ again: if (ret) goto out_put; - ret = btrfs_truncate_free_space_cache(root, trans, inode); + ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode); if (ret) goto out_put; } spin_lock(&block_group->lock); if (block_group->cached != BTRFS_CACHE_FINISHED || - !btrfs_test_opt(root, SPACE_CACHE) || - block_group->delalloc_bytes) { + !btrfs_test_opt(root, SPACE_CACHE)) { /* * don't bother trying to write stuff out _if_ * a) we're not cached, @@ -3293,14 +3273,14 @@ again: * taking up quite a bit since it's not folded into the other space * cache. */ - num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024); + num_pages = div_u64(block_group->key.offset, 256 * 1024 * 1024); if (!num_pages) num_pages = 1; num_pages *= 16; num_pages *= PAGE_CACHE_SIZE; - ret = btrfs_check_data_free_space(inode, num_pages); + ret = btrfs_check_data_free_space(inode, num_pages, num_pages); if (ret) goto out_put; @@ -3351,16 +3331,188 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans, return 0; } +/* + * transaction commit does final block group cache writeback during a + * critical section where nothing is allowed to change the FS. This is + * required in order for the cache to actually match the block group, + * but can introduce a lot of latency into the commit. + * + * So, btrfs_start_dirty_block_groups is here to kick off block group + * cache IO. There's a chance we'll have to redo some of it if the + * block group changes again during the commit, but it greatly reduces + * the commit latency by getting rid of the easy block groups while + * we're still allowing others to join the commit. 
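+ * Block groups dirtied again by the delayed refs this kicks off are + * collected by a single loop back at the end of this function before + * the commit picks up whatever is left.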
+ */ +int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_block_group_cache *cache; + struct btrfs_transaction *cur_trans = trans->transaction; + int ret = 0; + int should_put; + struct btrfs_path *path = NULL; + LIST_HEAD(dirty); + struct list_head *io = &cur_trans->io_bgs; + int num_started = 0; + int loops = 0; + + spin_lock(&cur_trans->dirty_bgs_lock); + if (list_empty(&cur_trans->dirty_bgs)) { + spin_unlock(&cur_trans->dirty_bgs_lock); + return 0; + } + list_splice_init(&cur_trans->dirty_bgs, &dirty); + spin_unlock(&cur_trans->dirty_bgs_lock); + +again: + /* + * make sure all the block groups on our dirty list actually + * exist + */ + btrfs_create_pending_block_groups(trans, root); + + if (!path) { + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + } + + /* + * cache_write_mutex is here only to save us from balance or automatic + * removal of empty block groups deleting this block group while we are + * writing out the cache + */ + mutex_lock(&trans->transaction->cache_write_mutex); + while (!list_empty(&dirty)) { + cache = list_first_entry(&dirty, + struct btrfs_block_group_cache, + dirty_list); + /* + * this can happen if something re-dirties a block + * group that is already under IO. Just wait for it to + * finish and then do it all again + */ + if (!list_empty(&cache->io_list)) { + list_del_init(&cache->io_list); + btrfs_wait_cache_io(root, trans, cache, + &cache->io_ctl, path, + cache->key.objectid); + btrfs_put_block_group(cache); + } + + + /* + * btrfs_wait_cache_io uses the cache->dirty_list to decide + * if it should update the cache_state. Don't delete + * until after we wait. + * + * Since we're not running in the commit critical section + * we need the dirty_bgs_lock to protect from update_block_group + */ + spin_lock(&cur_trans->dirty_bgs_lock); + list_del_init(&cache->dirty_list); + spin_unlock(&cur_trans->dirty_bgs_lock); + + should_put = 1; + + cache_save_setup(cache, trans, path); + + if (cache->disk_cache_state == BTRFS_DC_SETUP) { + cache->io_ctl.inode = NULL; + ret = btrfs_write_out_cache(root, trans, cache, path); + if (ret == 0 && cache->io_ctl.inode) { + num_started++; + should_put = 0; + + /* + * the cache_write_mutex is protecting + * the io_list + */ + list_add_tail(&cache->io_list, io); + } else { + /* + * if we failed to write the cache, the + * generation will be bad and life goes on + */ + ret = 0; + } + } + if (!ret) { + ret = write_one_cache_group(trans, root, path, cache); + /* + * Our block group might still be attached to the list + * of new block groups in the transaction handle of some + * other task (struct btrfs_trans_handle->new_bgs). This + * means its block group item isn't yet in the extent + * tree. If this happens ignore the error, as we will + * try again later in the critical section of the + * transaction commit. + */ + if (ret == -ENOENT) { + ret = 0; + spin_lock(&cur_trans->dirty_bgs_lock); + if (list_empty(&cache->dirty_list)) { + list_add_tail(&cache->dirty_list, + &cur_trans->dirty_bgs); + btrfs_get_block_group(cache); + } + spin_unlock(&cur_trans->dirty_bgs_lock); + } else if (ret) { + btrfs_abort_transaction(trans, root, ret); + } + } + + /* if it's not on the io list, we need to put the block group */ + if (should_put) + btrfs_put_block_group(cache); + + if (ret) + break; + + /* + * Avoid blocking other tasks for too long. It might even save + * us from writing caches for block groups that are going to be + * removed. 
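+ * Dropping and immediately retaking cache_write_mutex below gives any + * waiting task a window to acquire it between iterations.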
+ */ + mutex_unlock(&trans->transaction->cache_write_mutex); + mutex_lock(&trans->transaction->cache_write_mutex); + } + mutex_unlock(&trans->transaction->cache_write_mutex); + + /* + * go through delayed refs for all the stuff we've just kicked off + * and then loop back (just once) + */ + ret = btrfs_run_delayed_refs(trans, root, 0); + if (!ret && loops == 0) { + loops++; + spin_lock(&cur_trans->dirty_bgs_lock); + list_splice_init(&cur_trans->dirty_bgs, &dirty); + /* + * dirty_bgs_lock protects us from concurrent block group + * deletes too (not just cache_write_mutex). + */ + if (!list_empty(&dirty)) { + spin_unlock(&cur_trans->dirty_bgs_lock); + goto again; + } + spin_unlock(&cur_trans->dirty_bgs_lock); + } + + btrfs_free_path(path); + return ret; +} + int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_block_group_cache *cache; struct btrfs_transaction *cur_trans = trans->transaction; int ret = 0; + int should_put; struct btrfs_path *path; - - if (list_empty(&cur_trans->dirty_bgs)) - return 0; + struct list_head *io = &cur_trans->io_bgs; + int num_started = 0; path = btrfs_alloc_path(); if (!path) @@ -3376,16 +3528,64 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, cache = list_first_entry(&cur_trans->dirty_bgs, struct btrfs_block_group_cache, dirty_list); + + /* + * this can happen if cache_save_setup re-dirties a block + * group that is already under IO. Just wait for it to + * finish and then do it all again + */ + if (!list_empty(&cache->io_list)) { + list_del_init(&cache->io_list); + btrfs_wait_cache_io(root, trans, cache, + &cache->io_ctl, path, + cache->key.objectid); + btrfs_put_block_group(cache); + } + + /* + * don't remove from the dirty list until after we've waited + * on any pending IO + */ list_del_init(&cache->dirty_list); - if (cache->disk_cache_state == BTRFS_DC_CLEAR) - cache_save_setup(cache, trans, path); - if (!ret) - ret = btrfs_run_delayed_refs(trans, root, - (unsigned long) -1); - if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) - btrfs_write_out_cache(root, trans, cache, path); + should_put = 1; + + cache_save_setup(cache, trans, path); + if (!ret) + ret = btrfs_run_delayed_refs(trans, root, (unsigned long) -1); + + if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) { + cache->io_ctl.inode = NULL; + ret = btrfs_write_out_cache(root, trans, cache, path); + if (ret == 0 && cache->io_ctl.inode) { + num_started++; + should_put = 0; + list_add_tail(&cache->io_list, io); + } else { + /* + * if we failed to write the cache, the + * generation will be bad and life goes on + */ + ret = 0; + } + } + if (!ret) { ret = write_one_cache_group(trans, root, path, cache); + if (ret) + btrfs_abort_transaction(trans, root, ret); + } + + /* if it's not on the io list, we need to put the block group */ + if (should_put) + btrfs_put_block_group(cache); + } + + while (!list_empty(io)) { + cache = list_first_entry(io, struct btrfs_block_group_cache, + io_list); + list_del_init(&cache->io_list); + btrfs_wait_cache_io(root, trans, cache, + &cache->io_ctl, path, cache->key.objectid); btrfs_put_block_group(cache); } @@ -3445,7 +3645,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->disk_total += total_bytes * factor; found->bytes_used += bytes_used; found->disk_used += bytes_used * factor; - found->full = 0; + if (total_bytes > 0) + found->full = 0; spin_unlock(&found->lock); *space_info = found; return 0; @@ -3473,7 +3674,10 @@ static int update_space_info(struct 
btrfs_fs_info *info, u64 flags, found->bytes_reserved = 0; found->bytes_readonly = 0; found->bytes_may_use = 0; - found->full = 0; + if (total_bytes > 0) + found->full = 0; + else + found->full = 1; found->force_alloc = CHUNK_ALLOC_NO_FORCE; found->chunk_alloc = 0; found->flush = 0; @@ -3635,19 +3839,21 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) * This will check the space that the inode allocates from to make sure we have * enough space for bytes. */ -int btrfs_check_data_free_space(struct inode *inode, u64 bytes) +int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes) { struct btrfs_space_info *data_sinfo; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_fs_info *fs_info = root->fs_info; u64 used; - int ret = 0, committed = 0, alloc_chunk = 1; + int ret = 0; + int need_commit = 2; + int have_pinned_space; /* make sure bytes are sectorsize aligned */ bytes = ALIGN(bytes, root->sectorsize); if (btrfs_is_free_space_inode(inode)) { - committed = 1; + need_commit = 0; ASSERT(current->journal_info); } @@ -3669,7 +3875,7 @@ again: * if we don't have enough free bytes in this space then we need * to alloc a new chunk. */ - if (!data_sinfo->full && alloc_chunk) { + if (!data_sinfo->full) { u64 alloc_target; data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; @@ -3697,8 +3903,10 @@ alloc: if (ret < 0) { if (ret != -ENOSPC) return ret; - else + else { + have_pinned_space = 1; goto commit_trans; + } } if (!data_sinfo) @@ -3709,26 +3917,42 @@ alloc: /* * If we don't have enough pinned space to deal with this - * allocation don't bother committing the transaction. + * allocation, and no chunk was removed in the current transaction, + * don't bother committing the transaction. */ - if (percpu_counter_compare(&data_sinfo->total_bytes_pinned, - bytes) < 0) - committed = 1; + have_pinned_space = percpu_counter_compare( + &data_sinfo->total_bytes_pinned, + used + bytes - data_sinfo->total_bytes); spin_unlock(&data_sinfo->lock); /* commit the current transaction and try again */ commit_trans: - if (!committed && + if (need_commit && !atomic_read(&root->fs_info->open_ioctl_trans)) { - committed = 1; + need_commit--; + + if (need_commit > 0) + btrfs_wait_ordered_roots(fs_info, -1); trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_commit_transaction(trans, root); - if (ret) - return ret; - goto again; + if (have_pinned_space >= 0 || + trans->transaction->have_free_bgs || + need_commit > 0) { + ret = btrfs_commit_transaction(trans, root); + if (ret) + return ret; + /* + * make sure that all running delayed iputs are + * done + */ + down_write(&root->fs_info->delayed_iput_sem); + up_write(&root->fs_info->delayed_iput_sem); + goto again; + } else { + btrfs_end_transaction(trans, root); + } } trace_btrfs_space_reservation(root->fs_info, @@ -3736,12 +3960,16 @@ commit_trans: data_sinfo->flags, bytes, 1); return -ENOSPC; } + ret = btrfs_qgroup_reserve(root, write_bytes); + if (ret) + goto out; data_sinfo->bytes_may_use += bytes; trace_btrfs_space_reservation(root->fs_info, "space_info", data_sinfo->flags, bytes, 1); +out: spin_unlock(&data_sinfo->lock); - return 0; + return ret; } /* @@ -3819,7 +4047,7 @@ static int should_alloc_chunk(struct btrfs_root *root, return 1; } -static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type) +static u64 get_profile_num_devs(struct btrfs_root *root, u64 type) { u64 num_dev; @@ -3833,24 +4061,43 @@ static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type) else num_dev
= 1; /* DUP or single */ - /* metadata for updaing devices and chunk tree */ - return btrfs_calc_trans_metadata_size(root, num_dev + 1); + return num_dev; } -static void check_system_chunk(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 type) +/* + * Reserve, in the system space info, the space necessary for allocating a + * chunk or for removing a chunk. + */ +void check_system_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 type) { struct btrfs_space_info *info; u64 left; u64 thresh; + int ret = 0; + u64 num_devs; + + /* + * Needed because we can end up allocating a system chunk and for an + * atomic and race free space reservation in the chunk block reserve. + */ + ASSERT(mutex_is_locked(&root->fs_info->chunk_mutex)); info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM); spin_lock(&info->lock); left = info->total_bytes - info->bytes_used - info->bytes_pinned - - info->bytes_reserved - info->bytes_readonly; + info->bytes_reserved - info->bytes_readonly - + info->bytes_may_use; spin_unlock(&info->lock); - thresh = get_system_chunk_thresh(root, type); + num_devs = get_profile_num_devs(root, type); + + /* num_devs device items to update and 1 chunk item to add or remove */ + thresh = btrfs_calc_trunc_metadata_size(root, num_devs) + + btrfs_calc_trans_metadata_size(root, 1); + if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) { btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu", left, thresh, type); @@ -3861,7 +4108,21 @@ static void check_system_chunk(struct btrfs_trans_handle *trans, u64 flags; flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0); - btrfs_alloc_chunk(trans, root, flags); + /* + * Ignore failure to create system chunk. We might end up not + * needing it, as we might not need to COW all nodes/leafs from + * the paths we visit in the chunk tree (they were already COWed + * or created in the current transaction for example). + */ + ret = btrfs_alloc_chunk(trans, root, flags); + } + + if (!ret) { + ret = btrfs_block_rsv_add(root->fs_info->chunk_root, + &root->fs_info->chunk_block_rsv, + thresh, BTRFS_RESERVE_NO_FLUSH); + if (!ret) + trans->chunk_bytes_reserved += thresh; } } @@ -3966,6 +4227,24 @@ out: space_info->chunk_alloc = 0; spin_unlock(&space_info->lock); mutex_unlock(&fs_info->chunk_mutex); + /* + * When we allocate a new chunk we reserve space in the chunk block + * reserve to make sure we can COW nodes/leafs in the chunk tree or + * add new nodes/leafs to it if we end up needing to do it when + * inserting the chunk item and updating device items as part of the + * second phase of chunk allocation, performed by + * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a + * large number of new block groups to create in our transaction + * handle's new_bgs list to avoid exhausting the chunk block reserve + * in extreme cases - like having a single transaction create many new + * block groups when starting to write out the free space caches of all + * the block groups that were made dirty during the lifetime of the + * transaction.
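+ * (The 2MB threshold in the check below caps how much chunk_bytes_reserved + * may accumulate before we create the pending block groups and release the + * chunk reservation.)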
+ */ + if (trans->chunk_bytes_reserved >= (2 * 1024 * 1024ull)) { + btrfs_create_pending_block_groups(trans, trans->root); + btrfs_trans_release_chunk_metadata(trans); + } return ret; } @@ -4298,8 +4577,13 @@ out: static inline int need_do_async_reclaim(struct btrfs_space_info *space_info, struct btrfs_fs_info *fs_info, u64 used) { - return (used >= div_factor_fine(space_info->total_bytes, 98) && - !btrfs_fs_closing(fs_info) && + u64 thresh = div_factor_fine(space_info->total_bytes, 98); + + /* If we're just plain full then async reclaim just slows us down. */ + if (space_info->bytes_used >= thresh) + return 0; + + return (used >= thresh && !btrfs_fs_closing(fs_info) && !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)); } @@ -4354,10 +4638,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) if (!btrfs_need_do_async_reclaim(space_info, fs_info, flush_state)) return; - } while (flush_state <= COMMIT_TRANS); - - if (btrfs_need_do_async_reclaim(space_info, fs_info, flush_state)) - queue_work(system_unbound_wq, work); + } while (flush_state < COMMIT_TRANS); } void btrfs_init_async_reclaim_work(struct work_struct *work) @@ -4700,6 +4981,11 @@ void btrfs_free_block_rsv(struct btrfs_root *root, kfree(rsv); } +void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv) +{ + kfree(rsv); +} + int btrfs_block_rsv_add(struct btrfs_root *root, struct btrfs_block_rsv *block_rsv, u64 num_bytes, enum btrfs_reserve_flush_enum flush) @@ -4812,10 +5098,10 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) * csum_size * 2; - num_bytes += div64_u64(data_used + meta_used, 50); + num_bytes += div_u64(data_used + meta_used, 50); if (num_bytes * 3 > meta_used) - num_bytes = div64_u64(meta_used, 3); + num_bytes = div_u64(meta_used, 3); return ALIGN(num_bytes, fs_info->extent_root->nodesize << 10); } @@ -4912,6 +5198,24 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, trans->bytes_reserved = 0; } +/* + * To be called after all the new block groups attached to the transaction + * handle have been created (btrfs_create_pending_block_groups()). 
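+ * It returns whatever is left of trans->chunk_bytes_reserved to the + * fs_info->chunk_block_rsv reserve that check_system_chunk() filled.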
+ */ +void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->root->fs_info; + + if (!trans->chunk_bytes_reserved) + return; + + WARN_ON_ONCE(!list_empty(&trans->new_bgs)); + + block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL, + trans->chunk_bytes_reserved); + trans->chunk_bytes_reserved = 0; +} + /* Can only return 0 or -ENOSPC */ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, struct inode *inode) @@ -4998,8 +5302,6 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root, u64 qgroup_reserved) { btrfs_block_rsv_release(root, rsv, (u64)-1); - if (qgroup_reserved) - btrfs_qgroup_free(root, qgroup_reserved); } /** @@ -5066,30 +5368,18 @@ static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes, int reserve) { struct btrfs_root *root = BTRFS_I(inode)->root; - u64 csum_size; - int num_csums_per_leaf; - int num_csums; - int old_csums; + u64 old_csums, num_csums; if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM && BTRFS_I(inode)->csum_bytes == 0) return 0; - old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); + old_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes); if (reserve) BTRFS_I(inode)->csum_bytes += num_bytes; else BTRFS_I(inode)->csum_bytes -= num_bytes; - csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item); - num_csums_per_leaf = (int)div64_u64(csum_size, - sizeof(struct btrfs_csum_item) + - sizeof(struct btrfs_disk_key)); - num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize); - num_csums = num_csums + num_csums_per_leaf - 1; - num_csums = num_csums / num_csums_per_leaf; - - old_csums = old_csums + num_csums_per_leaf - 1; - old_csums = old_csums / num_csums_per_leaf; + num_csums = btrfs_csum_bytes_to_leaves(root, BTRFS_I(inode)->csum_bytes); /* No change, no need to reserve more */ if (old_csums == num_csums) @@ -5163,8 +5453,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) spin_unlock(&BTRFS_I(inode)->lock); if (root->fs_info->quota_enabled) { - ret = btrfs_qgroup_reserve(root, num_bytes + - nr_extents * root->nodesize); + ret = btrfs_qgroup_reserve(root, nr_extents * root->nodesize); if (ret) goto out_fail; } @@ -5172,8 +5461,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); if (unlikely(ret)) { if (root->fs_info->quota_enabled) - btrfs_qgroup_free(root, num_bytes + - nr_extents * root->nodesize); + btrfs_qgroup_free(root, nr_extents * root->nodesize); goto out_fail; } @@ -5290,10 +5578,6 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) trace_btrfs_space_reservation(root->fs_info, "delalloc", btrfs_ino(inode), to_free, 0); - if (root->fs_info->quota_enabled) { - btrfs_qgroup_free(root, num_bytes + - dropped * root->nodesize); - } btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, to_free); @@ -5318,7 +5602,7 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) { int ret; - ret = btrfs_check_data_free_space(inode, num_bytes); + ret = btrfs_check_data_free_space(inode, num_bytes, num_bytes); if (ret) return ret; @@ -5390,14 +5674,6 @@ static int update_block_group(struct btrfs_trans_handle *trans, if (!alloc && cache->cached == BTRFS_CACHE_NO) cache_block_group(cache, 1); - spin_lock(&trans->transaction->dirty_bgs_lock); - if (list_empty(&cache->dirty_list)) { - list_add_tail(&cache->dirty_list, - 
&trans->transaction->dirty_bgs); - btrfs_get_block_group(cache); - } - spin_unlock(&trans->transaction->dirty_bgs_lock); - byte_in_group = bytenr - cache->key.objectid; WARN_ON(byte_in_group > cache->key.offset); @@ -5446,6 +5722,16 @@ static int update_block_group(struct btrfs_trans_handle *trans, spin_unlock(&info->unused_bgs_lock); } } + + spin_lock(&trans->transaction->dirty_bgs_lock); + if (list_empty(&cache->dirty_list)) { + list_add_tail(&cache->dirty_list, + &trans->transaction->dirty_bgs); + trans->transaction->num_dirty_bgs++; + btrfs_get_block_group(cache); + } + spin_unlock(&trans->transaction->dirty_bgs_lock); + btrfs_put_block_group(cache); total -= num_bytes; bytenr += num_bytes; @@ -5834,11 +6120,10 @@ static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes, static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, + struct btrfs_delayed_ref_node *node, u64 parent, u64 root_objectid, u64 owner_objectid, u64 owner_offset, int refs_to_drop, - struct btrfs_delayed_extent_op *extent_op, - int no_quota) + struct btrfs_delayed_extent_op *extent_op) { struct btrfs_key key; struct btrfs_path *path; @@ -5852,10 +6137,12 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, int extent_slot = 0; int found_extent = 0; int num_to_del = 1; + int no_quota = node->no_quota; u32 item_size; u64 refs; + u64 bytenr = node->bytenr; + u64 num_bytes = node->num_bytes; int last_ref = 0; - enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_SUB_EXCL; bool skinny_metadata = btrfs_fs_incompat(root->fs_info, SKINNY_METADATA); @@ -6036,7 +6323,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, refs -= refs_to_drop; if (refs > 0) { - type = BTRFS_QGROUP_OPER_SUB_SHARED; if (extent_op) __run_delayed_extent_op(extent_op, leaf, ei); /* @@ -6098,18 +6384,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } btrfs_release_path(path); - /* Deal with the quota accounting */ - if (!ret && last_ref && !no_quota) { - int mod_seq = 0; - - if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID && - type == BTRFS_QGROUP_OPER_SUB_SHARED) - mod_seq = 1; - - ret = btrfs_qgroup_record_ref(trans, info, root_objectid, - bytenr, num_bytes, type, - mod_seq); - } out: btrfs_free_path(path); return ret; @@ -6135,7 +6409,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, goto out_delayed_unlock; spin_lock(&head->lock); - if (rb_first(&head->ref_root)) + if (!list_empty(&head->ref_list)) goto out; if (head->extent_op) { @@ -6956,15 +7230,15 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root, return -ENOSPC; } - if (btrfs_test_opt(root, DISCARD)) - ret = btrfs_discard_extent(root, start, len, NULL); - if (pin) pin_down_extent(root, cache, start, len, 1); else { + if (btrfs_test_opt(root, DISCARD)) + ret = btrfs_discard_extent(root, start, len, NULL); btrfs_add_free_space(cache, start, len); btrfs_update_reserved_bytes(cache, len, RESERVE_FREE, delalloc); } + btrfs_put_block_group(cache); trace_btrfs_reserved_extent_free(root, start, len); @@ -7045,13 +7319,6 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); - /* Always set parent to 0 here since its exclusive anyway. 
*/ - ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, - ins->objectid, ins->offset, - BTRFS_QGROUP_OPER_ADD_EXCL, 0); - if (ret) - return ret; - ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", @@ -7095,9 +7362,9 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, ins, size); if (ret) { + btrfs_free_path(path); btrfs_free_and_pin_reserved_extent(root, ins->objectid, root->nodesize); - btrfs_free_path(path); return ret; } @@ -7133,14 +7400,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_free_path(path); - if (!no_quota) { - ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, - ins->objectid, num_bytes, - BTRFS_QGROUP_OPER_ADD_EXCL, 0); - if (ret) - return ret; - } - ret = update_block_group(trans, root, ins->objectid, root->nodesize, 1); if (ret) { /* -ENOENT, logic error */ @@ -7217,7 +7476,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_header_generation(buf, trans->transid); btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); btrfs_tree_lock(buf); - clean_tree_block(trans, root, buf); + clean_tree_block(trans, root->fs_info, buf); clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); btrfs_set_lock_blocking(buf); @@ -7311,7 +7570,7 @@ static void unuse_block_rsv(struct btrfs_fs_info *fs_info, * returns the key for the extent through ins, and a tree buffer for * the first block of the extent through buf. * - * returns the tree buffer or NULL. + * returns the tree buffer or an ERR_PTR on error. */ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -7322,6 +7581,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, struct btrfs_key ins; struct btrfs_block_rsv *block_rsv; struct extent_buffer *buf; + struct btrfs_delayed_extent_op *extent_op; u64 flags = 0; int ret; u32 blocksize = root->nodesize; @@ -7342,13 +7602,14 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, ret = btrfs_reserve_extent(root, blocksize, blocksize, empty_size, hint, &ins, 0, 0); - if (ret) { - unuse_block_rsv(root->fs_info, block_rsv, blocksize); - return ERR_PTR(ret); - } + if (ret) + goto out_unuse; buf = btrfs_init_new_buffer(trans, root, ins.objectid, level); - BUG_ON(IS_ERR(buf)); /* -ENOMEM */ + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + goto out_free_reserved; + } if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { if (parent == 0) @@ -7358,9 +7619,11 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, BUG_ON(parent > 0); if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { - struct btrfs_delayed_extent_op *extent_op; extent_op = btrfs_alloc_delayed_extent_op(); - BUG_ON(!extent_op); /* -ENOMEM */ + if (!extent_op) { + ret = -ENOMEM; + goto out_free_buf; + } if (key) memcpy(&extent_op->key, key, sizeof(extent_op->key)); else @@ -7375,13 +7638,24 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, extent_op->level = level; ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, - ins.objectid, - ins.offset, parent, root_objectid, - level, BTRFS_ADD_DELAYED_EXTENT, - extent_op, 0); - BUG_ON(ret); /* -ENOMEM */ + ins.objectid, ins.offset, + parent, root_objectid, level, + BTRFS_ADD_DELAYED_EXTENT, + 
extent_op, 0); + if (ret) + goto out_free_delayed; } return buf; + +out_free_delayed: + btrfs_free_delayed_extent_op(extent_op); +out_free_buf: + free_extent_buffer(buf); +out_free_reserved: + btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 0); +out_unuse: + unuse_block_rsv(root->fs_info, block_rsv, blocksize); + return ERR_PTR(ret); } struct walk_control { @@ -7482,12 +7756,18 @@ reada: wc->reada_slot = slot; } +/* + * TODO: Modify the related functions to add the related nodes/leaves to + * dirty_extent_root for later qgroup accounting. + * + * Currently, this function does nothing. + */ static int account_leaf_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *eb) { int nr = btrfs_header_nritems(eb); - int i, extent_type, ret; + int i, extent_type; struct btrfs_key key; struct btrfs_file_extent_item *fi; u64 bytenr, num_bytes; @@ -7510,13 +7790,6 @@ static int account_leaf_items(struct btrfs_trans_handle *trans, continue; num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi); - - ret = btrfs_qgroup_record_ref(trans, root->fs_info, - root->objectid, - bytenr, num_bytes, - BTRFS_QGROUP_OPER_SUB_SUBTREE, 0); - if (ret) - return ret; } return 0; } @@ -7585,6 +7858,8 @@ static int adjust_slots_upwards(struct btrfs_root *root, /* * root_eb is the subtree root and is locked before this function is called. + * TODO: Modify this function to mark all (including completely shared nodes) + * in dirty_extent_root to allow them to get accounted in qgroup. */ static int account_shared_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -7647,7 +7922,11 @@ walk_down: child_gen = btrfs_node_ptr_generation(eb, parent_slot); eb = read_tree_block(root, child_bytenr, child_gen); - if (!eb || !extent_buffer_uptodate(eb)) { + if (IS_ERR(eb)) { + ret = PTR_ERR(eb); + goto out; + } else if (!extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); ret = -EIO; goto out; } @@ -7658,16 +7937,6 @@ walk_down: btrfs_tree_read_lock(eb); btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); path->locks[level] = BTRFS_READ_LOCK_BLOCKING; - - ret = btrfs_qgroup_record_ref(trans, root->fs_info, - root->objectid, - child_bytenr, - root->nodesize, - BTRFS_QGROUP_OPER_SUB_SUBTREE, - 0); - if (ret) - goto out; - } if (level == 0) { @@ -7815,7 +8084,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); blocksize = root->nodesize; - next = btrfs_find_tree_block(root, bytenr); + next = btrfs_find_tree_block(root->fs_info, bytenr); if (!next) { next = btrfs_find_create_tree_block(root, bytenr); if (!next) @@ -7878,7 +8147,9 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, if (reada && level == 1) reada_walk_down(trans, root, wc, path); next = read_tree_block(root, bytenr, generation); - if (!next || !extent_buffer_uptodate(next)) { + if (IS_ERR(next)) { + return PTR_ERR(next); + } else if (!extent_buffer_uptodate(next)) { free_extent_buffer(next); return -EIO; } @@ -8016,7 +8287,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, btrfs_set_lock_blocking(eb); path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING; } - clean_tree_block(trans, root, eb); + clean_tree_block(trans, root->fs_info, eb); } if (eb == root->node) { @@ -8260,24 +8531,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root, goto out_end_trans; } - /* - * Qgroup update accounting is run from - * delayed ref handling.
This usually works - * out because delayed refs are normally the - * only way qgroup updates are added. However, - * we may have added updates during our tree - * walk so run qgroups here to make sure we - * don't lose any updates. - */ - ret = btrfs_delayed_qgroup_accounting(trans, - root->fs_info); - if (ret) - printk_ratelimited(KERN_ERR "BTRFS: Failure %d " - "running qgroup updates " - "during snapshot delete. " - "Quota is out of sync, " - "rescan required.\n", ret); - btrfs_end_transaction_throttle(trans, tree_root); if (!for_reloc && btrfs_need_cleaner_sleep(root)) { pr_debug("BTRFS: drop snapshot early exit\n"); @@ -8331,14 +8584,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } root_dropped = true; out_end_trans: - ret = btrfs_delayed_qgroup_accounting(trans, tree_root->fs_info); - if (ret) - printk_ratelimited(KERN_ERR "BTRFS: Failure %d " - "running qgroup updates " - "during snapshot delete. " - "Quota is out of sync, " - "rescan required.\n", ret); - btrfs_end_transaction_throttle(trans, tree_root); out_free: kfree(wc); @@ -8533,10 +8778,48 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, BUG_ON(cache->ro); +again: trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); + /* + * we're not allowed to set block groups readonly after the dirty + * block groups cache has started writing. If it already started, + * back off and let this transaction commit + */ + mutex_lock(&root->fs_info->ro_block_group_mutex); + if (trans->transaction->dirty_bg_run) { + u64 transid = trans->transid; + + mutex_unlock(&root->fs_info->ro_block_group_mutex); + btrfs_end_transaction(trans, root); + + ret = btrfs_wait_for_commit(root, transid); + if (ret) + return ret; + goto again; + } + + /* + * if we are changing raid levels, try to allocate a corresponding + * block group with the new raid level. 
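+ * The new block group gives relocation somewhere to place this group's + * data once the group has been set read-only.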
+ */ + alloc_flags = update_block_group_flags(root, cache->flags); + if (alloc_flags != cache->flags) { + ret = do_chunk_alloc(trans, root, alloc_flags, + CHUNK_ALLOC_FORCE); + /* + * ENOSPC is allowed here, we may have enough space + * already allocated at the new raid level to + * carry on + */ + if (ret == -ENOSPC) + ret = 0; + if (ret < 0) + goto out; + } + ret = set_block_group_ro(cache, 0); if (!ret) goto out; @@ -8549,8 +8832,11 @@ int btrfs_set_block_group_ro(struct btrfs_root *root, out: if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) { alloc_flags = update_block_group_flags(root, cache->flags); + lock_chunks(root->fs_info->chunk_root); check_system_chunk(trans, root, alloc_flags); + unlock_chunks(root->fs_info->chunk_root); } + mutex_unlock(&root->fs_info->ro_block_group_mutex); btrfs_end_transaction(trans, root); return ret; @@ -8720,7 +9006,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) min_free <<= 1; } else if (index == BTRFS_RAID_RAID0) { dev_min = fs_devices->rw_devices; - do_div(min_free, dev_min); + min_free = div64_u64(min_free, dev_min); } /* We need to do this so that we can look at pending chunks */ @@ -8992,6 +9278,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) INIT_LIST_HEAD(&cache->bg_list); INIT_LIST_HEAD(&cache->ro_list); INIT_LIST_HEAD(&cache->dirty_list); + INIT_LIST_HEAD(&cache->io_list); btrfs_init_free_space_ctl(cache); atomic_set(&cache->trimming, 0); @@ -9247,6 +9534,19 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, free_excluded_extents(root, cache); + /* + * Call to ensure the corresponding space_info object is created and + * assigned to our block group, but don't update its counters just yet. + * We want our bg to be added to the rbtree with its ->space_info set. + */ + ret = update_space_info(root->fs_info, cache->flags, 0, 0, + &cache->space_info); + if (ret) { + btrfs_remove_free_space_cache(cache); + btrfs_put_block_group(cache); + return ret; + } + ret = btrfs_add_block_group_cache(root->fs_info, cache); if (ret) { btrfs_remove_free_space_cache(cache); @@ -9254,6 +9554,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, return ret; } + /* + * Now that our block group has its ->space_info set and is inserted in + * the rbtree, update the space info's counters. 
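+ * (The earlier update_space_info() call was made with size and bytes_used + * both 0 for exactly this reason.)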
+ */ ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, &cache->space_info); if (ret) { @@ -9355,7 +9659,38 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, goto out; } + /* + * get the inode first so any iput calls done for the io_list + * aren't the final iput (no unlinks allowed now) + */ inode = lookup_free_space_inode(tree_root, block_group, path); + + mutex_lock(&trans->transaction->cache_write_mutex); + /* + * make sure our free space cache IO is done before removing the + * free space inode + */ + spin_lock(&trans->transaction->dirty_bgs_lock); + if (!list_empty(&block_group->io_list)) { + list_del_init(&block_group->io_list); + + WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode); + + spin_unlock(&trans->transaction->dirty_bgs_lock); + btrfs_wait_cache_io(root, trans, block_group, + &block_group->io_ctl, path, + block_group->key.objectid); + btrfs_put_block_group(block_group); + spin_lock(&trans->transaction->dirty_bgs_lock); + } + + if (!list_empty(&block_group->dirty_list)) { + list_del_init(&block_group->dirty_list); + btrfs_put_block_group(block_group); + } + spin_unlock(&trans->transaction->dirty_bgs_lock); + mutex_unlock(&trans->transaction->cache_write_mutex); + if (!IS_ERR(inode)) { ret = btrfs_orphan_add(trans, inode); if (ret) { @@ -9448,18 +9783,29 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, spin_lock(&trans->transaction->dirty_bgs_lock); if (!list_empty(&block_group->dirty_list)) { - list_del_init(&block_group->dirty_list); - btrfs_put_block_group(block_group); + WARN_ON(1); + } + if (!list_empty(&block_group->io_list)) { + WARN_ON(1); } spin_unlock(&trans->transaction->dirty_bgs_lock); - btrfs_remove_free_space_cache(block_group); spin_lock(&block_group->space_info->lock); list_del_init(&block_group->ro_list); + + if (btrfs_test_opt(root, ENOSPC_DEBUG)) { + WARN_ON(block_group->space_info->total_bytes + < block_group->key.offset); + WARN_ON(block_group->space_info->bytes_readonly + < block_group->key.offset); + WARN_ON(block_group->space_info->disk_total + < block_group->key.offset * factor); + } block_group->space_info->total_bytes -= block_group->key.offset; block_group->space_info->bytes_readonly -= block_group->key.offset; block_group->space_info->disk_total -= block_group->key.offset * factor; + spin_unlock(&block_group->space_info->lock); memcpy(&key, &block_group->key, sizeof(key)); @@ -9574,6 +9920,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) } spin_unlock(&fs_info->unused_bgs_lock); + mutex_lock(&root->fs_info->delete_unused_bgs_mutex); + /* Don't want to race with allocators so take the groups_sem */ down_write(&space_info->groups_sem); spin_lock(&block_group->lock); @@ -9647,8 +9995,18 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) mutex_unlock(&fs_info->unused_bg_unpin_mutex); /* Reset pinned so btrfs_put_block_group doesn't complain */ + spin_lock(&space_info->lock); + spin_lock(&block_group->lock); + + space_info->bytes_pinned -= block_group->pinned; + space_info->bytes_readonly += block_group->pinned; + percpu_counter_add(&space_info->total_bytes_pinned, + -block_group->pinned); block_group->pinned = 0; + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); + /* * Btrfs_remove_chunk will abort the transaction if things go * horribly wrong.
@@ -9658,6 +10016,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) end_trans: btrfs_end_transaction(trans, root); next: + mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); btrfs_put_block_group(block_group); spin_lock(&fs_info->unused_bgs_lock); } diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h new file mode 100644 index 000000000000..e69de29bb2d1 --- /dev/null +++ b/fs/btrfs/extent-tree.h diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d688cfe5d496..02d05817cbdf 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1277,7 +1277,12 @@ int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, gfp_t mask) { - return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask); + int wake = 0; + + if (bits & EXTENT_LOCKED) + wake = 1; + + return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask); } int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, @@ -2767,8 +2772,6 @@ static int __must_check submit_one_bio(int rw, struct bio *bio, else btrfsic_submit_bio(rw, bio); - if (bio_flagged(bio, BIO_EOPNOTSUPP)) - ret = -EOPNOTSUPP; bio_put(bio); return ret; } @@ -4492,6 +4495,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) flags |= FIEMAP_EXTENT_ENCODED; + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + flags |= FIEMAP_EXTENT_UNWRITTEN; free_extent_map(em); em = NULL; @@ -4514,8 +4519,11 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } ret = fiemap_fill_next_extent(fieinfo, em_start, disko, em_len, flags); - if (ret) + if (ret) { + if (ret == 1) + ret = 0; goto out_free; + } } out_free: free_extent_map(em); @@ -4557,36 +4565,37 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb) do { index--; page = eb->pages[index]; - if (page && mapped) { + if (!page) + continue; + if (mapped) spin_lock(&page->mapping->private_lock); + /* + * We do this since we'll remove the pages after we've + * removed the eb from the radix tree, so we could race + * and have this page now attached to the new eb. So + * only clear page_private if it's still connected to + * this eb. + */ + if (PagePrivate(page) && + page->private == (unsigned long)eb) { + BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); + BUG_ON(PageDirty(page)); + BUG_ON(PageWriteback(page)); /* - * We do this since we'll remove the pages after we've - * removed the eb from the radix tree, so we could race - * and have this page now attached to the new eb. So - * only clear page_private if it's still connected to - * this eb. + * We need to make sure we haven't been attached + * to a new eb.
- */ - ClearPagePrivate(page); - set_page_private(page, 0); - /* One for the page private */ - page_cache_release(page); - } - spin_unlock(&page->mapping->private_lock); - - } - if (page) { - /* One for when we alloced the page */ + ClearPagePrivate(page); + set_page_private(page, 0); + /* One for the page private */ page_cache_release(page); } + + if (mapped) + spin_unlock(&page->mapping->private_lock); + + /* One for when we alloced the page */ + page_cache_release(page); } while (index != 0); } @@ -4768,6 +4777,25 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, start >> PAGE_CACHE_SHIFT); if (eb && atomic_inc_not_zero(&eb->refs)) { rcu_read_unlock(); + /* + * Lock our eb's refs_lock to avoid races with + * free_extent_buffer. When we get our eb it might be flagged + * with EXTENT_BUFFER_STALE and another task running + * free_extent_buffer might have seen that flag set, + * eb->refs == 2, that the buffer isn't under IO (dirty and + * writeback flags not set) and it's still in the tree (flag + * EXTENT_BUFFER_TREE_REF set), therefore being in the process + * of decrementing the extent buffer's reference count twice. + * So here we could race and increment the eb's reference count, + * clear its stale flag, mark it as dirty and drop our reference + * before the other task finishes executing free_extent_buffer, + * which would later result in an attempt to free an extent + * buffer that is dirty. + */ + if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) { + spin_lock(&eb->refs_lock); + spin_unlock(&eb->refs_lock); + } mark_extent_buffer_accessed(eb, NULL); return eb; } @@ -4867,6 +4895,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, mark_extent_buffer_accessed(exists, p); goto free_eb; } + exists = NULL; /* * Do this so attach doesn't complain and we need to @@ -4930,12 +4959,12 @@ again: return eb; free_eb: + WARN_ON(!atomic_dec_and_test(&eb->refs)); for (i = 0; i < num_pages; i++) { if (eb->pages[i]) unlock_page(eb->pages[i]); } - WARN_ON(!atomic_dec_and_test(&eb->refs)); btrfs_release_extent_buffer(eb); return exists; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 695b0ccfb755..c668f36898d3 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -97,7 +97,7 @@ struct extent_io_tree { u64 dirty_bytes; int track_uptodate; spinlock_t lock; - struct extent_io_ops *ops; + const struct extent_io_ops *ops; }; struct extent_state { diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 84a2d1868271..58ece6558430 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -185,8 +185,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, nblocks = bio->bi_iter.bi_size >> inode->i_sb->s_blocksize_bits; if (!dst) { if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { - btrfs_bio->csum_allocated = kmalloc(nblocks * csum_size, - GFP_NOFS); + btrfs_bio->csum_allocated = kmalloc_array(nblocks, + csum_size, GFP_NOFS); if (!btrfs_bio->csum_allocated) { btrfs_free_path(path); return -ENOMEM; @@ -553,7 +553,7 @@ static noinline void truncate_one_csum(struct btrfs_root *root, btrfs_truncate_item(root, path, new_size, 0); key->offset = end_byte; - btrfs_set_item_key_safe(root, path, key); + btrfs_set_item_key_safe(root->fs_info, path, key); } else { BUG(); } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 30982bbd31c3..b823fac91c92 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -24,7 +24,6 @@ #include <linux/string.h> #include <linux/backing-dev.h> #include <linux/mpage.h> -#include 
<linux/aio.h> #include <linux/falloc.h> #include <linux/swap.h> #include <linux/writeback.h> @@ -32,6 +31,7 @@ #include <linux/compat.h> #include <linux/slab.h> #include <linux/btrfs.h> +#include <linux/uio.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -273,11 +273,7 @@ void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info) defrag = rb_entry(node, struct inode_defrag, rb_node); kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - if (need_resched()) { - spin_unlock(&fs_info->defrag_inodes_lock); - cond_resched(); - spin_lock(&fs_info->defrag_inodes_lock); - } + cond_resched_lock(&fs_info->defrag_inodes_lock); node = rb_first(&fs_info->defrag_inodes); } @@ -868,7 +864,7 @@ next_slot: memcpy(&new_key, &key, sizeof(new_key)); new_key.offset = end; - btrfs_set_item_key_safe(root, path, &new_key); + btrfs_set_item_key_safe(root->fs_info, path, &new_key); extent_offset += end - key.offset; btrfs_set_file_extent_offset(leaf, fi, extent_offset); @@ -1126,7 +1122,7 @@ again: ino, bytenr, orig_offset, &other_start, &other_end)) { new_key.offset = end; - btrfs_set_item_key_safe(root, path, &new_key); + btrfs_set_item_key_safe(root->fs_info, path, &new_key); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); btrfs_set_file_extent_generation(leaf, fi, @@ -1160,7 +1156,7 @@ again: trans->transid); path->slots[0]++; new_key.offset = start; - btrfs_set_item_key_safe(root, path, &new_key); + btrfs_set_item_key_safe(root->fs_info, path, &new_key); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -1485,7 +1481,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, PAGE_CACHE_SIZE / (sizeof(struct page *))); nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied); nrptrs = max(nrptrs, 8); - pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); + pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL); if (!pages) return -ENOMEM; @@ -1514,7 +1510,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, } reserve_bytes = num_pages << PAGE_CACHE_SHIFT; - ret = btrfs_check_data_free_space(inode, reserve_bytes); + ret = btrfs_check_data_free_space(inode, reserve_bytes, write_bytes); if (ret == -ENOSPC && (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) { @@ -1635,8 +1631,8 @@ again: btrfs_end_write_no_snapshoting(root); if (only_release_metadata && copied > 0) { - u64 lockstart = round_down(pos, root->sectorsize); - u64 lockend = lockstart + + lockstart = round_down(pos, root->sectorsize); + lockend = lockstart + (dirty_pages << PAGE_CACHE_SHIFT) - 1; set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, @@ -1739,28 +1735,20 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, u64 start_pos; u64 end_pos; ssize_t num_written = 0; - ssize_t err = 0; - size_t count = iov_iter_count(from); bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host); - loff_t pos = iocb->ki_pos; + ssize_t err; + loff_t pos; + size_t count; mutex_lock(&inode->i_mutex); - - current->backing_dev_info = inode_to_bdi(inode); - err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); - if (err) { + err = generic_write_checks(iocb, from); + if (err <= 0) { mutex_unlock(&inode->i_mutex); - goto out; - } - - if (count == 0) { - mutex_unlock(&inode->i_mutex); - goto out; + return err; } - iov_iter_truncate(from, count); - - err = file_remove_suid(file); + current->backing_dev_info = inode_to_bdi(inode); + err = file_remove_privs(file); if (err) { 
mutex_unlock(&inode->i_mutex); goto out; @@ -1786,6 +1774,8 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, */ update_time_for_write(inode); + pos = iocb->ki_pos; + count = iov_iter_count(from); start_pos = round_down(pos, root->sectorsize); if (start_pos > i_size_read(inode)) { /* Expand hole size to cover write data, preventing empty gap */ @@ -1800,7 +1790,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, if (sync) atomic_inc(&BTRFS_I(inode)->sync_writers); - if (file->f_flags & O_DIRECT) { + if (iocb->ki_flags & IOCB_DIRECT) { num_written = __btrfs_direct_write(iocb, from, pos); } else { num_written = __btrfs_buffered_write(file, from, pos); @@ -1815,7 +1805,9 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, * otherwise subsequent syncs to a file that's been synced in this * transaction will appear to have already occured. */ + spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->last_sub_trans = root->log_transid; + spin_unlock(&BTRFS_I(inode)->lock); if (num_written > 0) { err = generic_write_sync(file, pos, num_written); if (err < 0) @@ -1870,12 +1862,13 @@ static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end) int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { struct dentry *dentry = file->f_path.dentry; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; struct btrfs_log_ctx ctx; int ret = 0; bool full_sync = 0; + const u64 len = end - start + 1; trace_btrfs_sync_file(file, datasync); @@ -1904,7 +1897,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * all extents are persisted and the respective file extent * items are in the fs/subvol btree. */ - ret = btrfs_wait_ordered_range(inode, start, end - start + 1); + ret = btrfs_wait_ordered_range(inode, start, len); } else { /* * Start any new ordered operations before starting to log the @@ -1976,8 +1969,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ smp_mb(); if (btrfs_inode_in_log(inode, root->fs_info->generation) || - (full_sync && BTRFS_I(inode)->last_trans <= - root->fs_info->last_trans_committed)) { + (BTRFS_I(inode)->last_trans <= + root->fs_info->last_trans_committed && + (full_sync || + !btrfs_have_ordered_extents_in_range(inode, start, len)))) { /* * We'v had everything committed since the last time we were * modified so clear this flag in case it was set for whatever @@ -2168,7 +2163,7 @@ static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode, u64 num_bytes; key.offset = offset; - btrfs_set_item_key_safe(root, path, &key); + btrfs_set_item_key_safe(root->fs_info, path, &key); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end - @@ -2551,7 +2546,6 @@ static long btrfs_fallocate(struct file *file, int mode, { struct inode *inode = file_inode(file); struct extent_state *cached_state = NULL; - struct btrfs_root *root = BTRFS_I(inode)->root; u64 cur_offset; u64 last_byte; u64 alloc_start; @@ -2576,14 +2570,9 @@ static long btrfs_fallocate(struct file *file, int mode, * Make sure we have enough space before we do the * allocation. 
*/ - ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start); + ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start, alloc_end - alloc_start); if (ret) return ret; - if (root->fs_info->quota_enabled) { - ret = btrfs_qgroup_reserve(root, alloc_end - alloc_start); - if (ret) - goto out_reserve_fail; - } mutex_lock(&inode->i_mutex); ret = inode_newsize_ok(inode, alloc_end); @@ -2673,23 +2662,35 @@ static long btrfs_fallocate(struct file *file, int mode, 1 << inode->i_blkbits, offset + len, &alloc_hint); - - if (ret < 0) { - free_extent_map(em); - break; - } } else if (actual_end > inode->i_size && !(mode & FALLOC_FL_KEEP_SIZE)) { + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + /* * We didn't need to allocate any more space, but we * still extended the size of the file so we need to - * update i_size. + * update i_size and the inode item. */ - inode->i_ctime = CURRENT_TIME; - i_size_write(inode, actual_end); - btrfs_ordered_update_i_size(inode, actual_end, NULL); + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + } else { + inode->i_ctime = CURRENT_TIME; + i_size_write(inode, actual_end); + btrfs_ordered_update_i_size(inode, actual_end, + NULL); + ret = btrfs_update_inode(trans, root, inode); + if (ret) + btrfs_end_transaction(trans, root); + else + ret = btrfs_end_transaction(trans, + root); + } } free_extent_map(em); + if (ret < 0) + break; cur_offset = last_byte; if (cur_offset >= alloc_end) { @@ -2701,9 +2702,6 @@ static long btrfs_fallocate(struct file *file, int mode, &cached_state, GFP_NOFS); out: mutex_unlock(&inode->i_mutex); - if (root->fs_info->quota_enabled) - btrfs_qgroup_free(root, alloc_end - alloc_start); -out_reserve_fail: /* Let go of our reservation. 
*/ btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); return ret; } @@ -2806,8 +2804,6 @@ out: const struct file_operations btrfs_file_operations = { .llseek = btrfs_file_llseek, - .read = new_sync_read, - .write = new_sync_write, .read_iter = generic_file_read_iter, .splice_read = generic_file_splice_read, .write_iter = btrfs_file_write_iter, diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index a71978578fa7..fb5a6b1c62a6 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -85,7 +85,8 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root, } mapping_set_gfp_mask(inode->i_mapping, - mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); + mapping_gfp_mask(inode->i_mapping) & + ~(__GFP_FS | __GFP_HIGHMEM)); return inode; } @@ -170,13 +171,13 @@ static int __create_free_space_inode(struct btrfs_root *root, key.objectid = BTRFS_FREE_SPACE_OBJECTID; key.offset = offset; key.type = 0; - ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(struct btrfs_free_space_header)); if (ret < 0) { btrfs_release_path(path); return ret; } + leaf = path->nodes[0]; header = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_free_space_header); @@ -225,9 +226,39 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_root *root, int btrfs_truncate_free_space_cache(struct btrfs_root *root, struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, struct inode *inode) { int ret = 0; + struct btrfs_path *path = btrfs_alloc_path(); + bool locked = false; + + if (!path) { + ret = -ENOMEM; + goto fail; + } + + if (block_group) { + locked = true; + mutex_lock(&trans->transaction->cache_write_mutex); + if (!list_empty(&block_group->io_list)) { + list_del_init(&block_group->io_list); + + btrfs_wait_cache_io(root, trans, block_group, + &block_group->io_ctl, path, + block_group->key.objectid); + btrfs_put_block_group(block_group); + } + + /* + * now that we've truncated the cache away, it's no longer + * set up or written + */ + spin_lock(&block_group->lock); + block_group->disk_cache_state = BTRFS_DC_CLEAR; + spin_unlock(&block_group->lock); + } + btrfs_free_path(path); btrfs_i_size_write(inode, 0); truncate_pagecache(inode, 0); @@ -235,15 +266,19 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, /* * We don't need an orphan item because truncating the free space cache * will never be split across transactions.
+ * We don't need to check for -EAGAIN because we're a free space + * cache inode */ ret = btrfs_truncate_inode_items(trans, root, inode, 0, BTRFS_EXTENT_DATA_KEY); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - return ret; - } + if (ret) + goto fail; ret = btrfs_update_inode(trans, root, inode); + +fail: + if (locked) + mutex_unlock(&trans->transaction->cache_write_mutex); if (ret) btrfs_abort_transaction(trans, root, ret); @@ -269,18 +304,7 @@ static int readahead_cache(struct inode *inode) return 0; } -struct io_ctl { - void *cur, *orig; - struct page *page; - struct page **pages; - struct btrfs_root *root; - unsigned long size; - int index; - int num_pages; - unsigned check_crcs:1; -}; - -static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode, +static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode, struct btrfs_root *root, int write) { int num_pages; @@ -296,45 +320,46 @@ static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode, (num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) return -ENOSPC; - memset(io_ctl, 0, sizeof(struct io_ctl)); + memset(io_ctl, 0, sizeof(struct btrfs_io_ctl)); - io_ctl->pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS); + io_ctl->pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS); if (!io_ctl->pages) return -ENOMEM; io_ctl->num_pages = num_pages; io_ctl->root = root; io_ctl->check_crcs = check_crcs; + io_ctl->inode = inode; return 0; } -static void io_ctl_free(struct io_ctl *io_ctl) +static void io_ctl_free(struct btrfs_io_ctl *io_ctl) { kfree(io_ctl->pages); + io_ctl->pages = NULL; } -static void io_ctl_unmap_page(struct io_ctl *io_ctl) +static void io_ctl_unmap_page(struct btrfs_io_ctl *io_ctl) { if (io_ctl->cur) { - kunmap(io_ctl->page); io_ctl->cur = NULL; io_ctl->orig = NULL; } } -static void io_ctl_map_page(struct io_ctl *io_ctl, int clear) +static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear) { ASSERT(io_ctl->index < io_ctl->num_pages); io_ctl->page = io_ctl->pages[io_ctl->index++]; - io_ctl->cur = kmap(io_ctl->page); + io_ctl->cur = page_address(io_ctl->page); io_ctl->orig = io_ctl->cur; io_ctl->size = PAGE_CACHE_SIZE; if (clear) memset(io_ctl->cur, 0, PAGE_CACHE_SIZE); } -static void io_ctl_drop_pages(struct io_ctl *io_ctl) +static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl) { int i; @@ -349,7 +374,7 @@ static void io_ctl_drop_pages(struct io_ctl *io_ctl) } } -static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode, +static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, struct inode *inode, int uptodate) { struct page *page; @@ -383,7 +408,7 @@ static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode, return 0; } -static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation) +static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation) { __le64 *val; @@ -406,7 +431,7 @@ static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation) io_ctl->cur += sizeof(u64); } -static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation) +static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation) { __le64 *gen; @@ -435,7 +460,7 @@ static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation) return 0; } -static void io_ctl_set_crc(struct io_ctl *io_ctl, int index) +static void io_ctl_set_crc(struct btrfs_io_ctl *io_ctl, int index) { u32 *tmp; u32 crc = ~(u32)0; @@ -453,13 +478,12 @@ static void io_ctl_set_crc(struct io_ctl *io_ctl, int index) 
PAGE_CACHE_SIZE - offset); btrfs_csum_final(crc, (char *)&crc); io_ctl_unmap_page(io_ctl); - tmp = kmap(io_ctl->pages[0]); + tmp = page_address(io_ctl->pages[0]); tmp += index; *tmp = crc; - kunmap(io_ctl->pages[0]); } -static int io_ctl_check_crc(struct io_ctl *io_ctl, int index) +static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index) { u32 *tmp, val; u32 crc = ~(u32)0; @@ -473,10 +497,9 @@ static int io_ctl_check_crc(struct io_ctl *io_ctl, int index) if (index == 0) offset = sizeof(u32) * io_ctl->num_pages; - tmp = kmap(io_ctl->pages[0]); + tmp = page_address(io_ctl->pages[0]); tmp += index; val = *tmp; - kunmap(io_ctl->pages[0]); io_ctl_map_page(io_ctl, 0); crc = btrfs_csum_data(io_ctl->orig + offset, crc, @@ -492,7 +515,7 @@ static int io_ctl_check_crc(struct io_ctl *io_ctl, int index) return 0; } -static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes, +static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes, void *bitmap) { struct btrfs_free_space_entry *entry; @@ -522,7 +545,7 @@ static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes, return 0; } -static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap) +static int io_ctl_add_bitmap(struct btrfs_io_ctl *io_ctl, void *bitmap) { if (!io_ctl->cur) return -ENOSPC; @@ -545,7 +568,7 @@ static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap) return 0; } -static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl) +static void io_ctl_zero_remaining_pages(struct btrfs_io_ctl *io_ctl) { /* * If we're not on the boundary we know we've modified the page and we @@ -562,7 +585,7 @@ static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl) } } -static int io_ctl_read_entry(struct io_ctl *io_ctl, +static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl, struct btrfs_free_space *entry, u8 *type) { struct btrfs_free_space_entry *e; @@ -589,7 +612,7 @@ static int io_ctl_read_entry(struct io_ctl *io_ctl, return 0; } -static int io_ctl_read_bitmap(struct io_ctl *io_ctl, +static int io_ctl_read_bitmap(struct btrfs_io_ctl *io_ctl, struct btrfs_free_space *entry) { int ret; @@ -648,7 +671,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, { struct btrfs_free_space_header *header; struct extent_buffer *leaf; - struct io_ctl io_ctl; + struct btrfs_io_ctl io_ctl; struct btrfs_key key; struct btrfs_free_space *e, *n; LIST_HEAD(bitmaps); @@ -877,7 +900,7 @@ out: } static noinline_for_stack -int write_cache_extent_entries(struct io_ctl *io_ctl, +int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl, struct btrfs_free_space_ctl *ctl, struct btrfs_block_group_cache *block_group, int *entries, int *bitmaps, @@ -885,6 +908,7 @@ int write_cache_extent_entries(struct io_ctl *io_ctl, { int ret; struct btrfs_free_cluster *cluster = NULL; + struct btrfs_free_cluster *cluster_locked = NULL; struct rb_node *node = rb_first(&ctl->free_space_offset); struct btrfs_trim_range *trim_entry; @@ -896,6 +920,8 @@ int write_cache_extent_entries(struct io_ctl *io_ctl, } if (!node && cluster) { + cluster_locked = cluster; + spin_lock(&cluster_locked->lock); node = rb_first(&cluster->root); cluster = NULL; } @@ -919,9 +945,15 @@ int write_cache_extent_entries(struct io_ctl *io_ctl, node = rb_next(node); if (!node && cluster) { node = rb_first(&cluster->root); + cluster_locked = cluster; + spin_lock(&cluster_locked->lock); cluster = NULL; } } + if (cluster_locked) { + spin_unlock(&cluster_locked->lock); + cluster_locked = NULL; + } /* * Make sure we 
don't miss any range that was removed from our rbtree @@ -939,6 +971,8 @@ int write_cache_extent_entries(struct io_ctl *io_ctl, return 0; fail: + if (cluster_locked) + spin_unlock(&cluster_locked->lock); return -ENOSPC; } @@ -1000,7 +1034,7 @@ fail: static noinline_for_stack int write_pinned_extent_entries(struct btrfs_root *root, struct btrfs_block_group_cache *block_group, - struct io_ctl *io_ctl, + struct btrfs_io_ctl *io_ctl, int *entries) { u64 start, extent_start, extent_end, len; @@ -1050,7 +1084,7 @@ write_pinned_extent_entries(struct btrfs_root *root, } static noinline_for_stack int -write_bitmap_entries(struct io_ctl *io_ctl, struct list_head *bitmap_list) +write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list) { struct list_head *pos, *n; int ret; @@ -1083,10 +1117,7 @@ static int flush_dirty_cache(struct inode *inode) } static void noinline_for_stack -cleanup_write_cache_enospc(struct inode *inode, - struct io_ctl *io_ctl, - struct extent_state **cached_state, - struct list_head *bitmap_list) +cleanup_bitmap_list(struct list_head *bitmap_list) { struct list_head *pos, *n; @@ -1095,12 +1126,85 @@ cleanup_write_cache_enospc(struct inode *inode, list_entry(pos, struct btrfs_free_space, list); list_del_init(&entry->list); } +} + +static void noinline_for_stack +cleanup_write_cache_enospc(struct inode *inode, + struct btrfs_io_ctl *io_ctl, + struct extent_state **cached_state, + struct list_head *bitmap_list) +{ io_ctl_drop_pages(io_ctl); unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, cached_state, GFP_NOFS); } +int btrfs_wait_cache_io(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, + struct btrfs_io_ctl *io_ctl, + struct btrfs_path *path, u64 offset) +{ + int ret; + struct inode *inode = io_ctl->inode; + + if (!inode) + return 0; + + if (block_group) + root = root->fs_info->tree_root; + + /* Flush the dirty pages in the cache file. */ + ret = flush_dirty_cache(inode); + if (ret) + goto out; + + /* Update the cache item to tell everyone this cache file is valid. */ + ret = update_cache_item(trans, root, inode, path, offset, + io_ctl->entries, io_ctl->bitmaps); +out: + io_ctl_free(io_ctl); + if (ret) { + invalidate_inode_pages2(inode->i_mapping); + BTRFS_I(inode)->generation = 0; + if (block_group) { +#ifdef DEBUG + btrfs_err(root->fs_info, + "failed to write free space cache for block group %llu", + block_group->key.objectid); +#endif + } + } + btrfs_update_inode(trans, root, inode); + + if (block_group) { + /* the dirty list is protected by the dirty_bgs_lock */ + spin_lock(&trans->transaction->dirty_bgs_lock); + + /* the disk_cache_state is protected by the block group lock */ + spin_lock(&block_group->lock); + + /* + * only mark this as written if we didn't get put back on + * the dirty list while waiting for IO. 
Otherwise our + * cache state won't be right, and we won't get written again + */ + if (!ret && list_empty(&block_group->dirty_list)) + block_group->disk_cache_state = BTRFS_DC_WRITTEN; + else if (ret) + block_group->disk_cache_state = BTRFS_DC_ERROR; + + spin_unlock(&block_group->lock); + spin_unlock(&trans->transaction->dirty_bgs_lock); + io_ctl->inode = NULL; + iput(inode); + } + + return ret; + +} + /** * __btrfs_write_out_cache - write out cached info to an inode * @root - the root the inode belongs to @@ -1112,27 +1216,29 @@ cleanup_write_cache_enospc(struct inode *inode, * * This function writes out a free space cache struct to disk for quick recovery * on mount. This will return 0 if it was successfull in writing the cache out, - * and -1 if it was not. + * or an errno if it was not. */ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, struct btrfs_free_space_ctl *ctl, struct btrfs_block_group_cache *block_group, + struct btrfs_io_ctl *io_ctl, struct btrfs_trans_handle *trans, struct btrfs_path *path, u64 offset) { struct extent_state *cached_state = NULL; - struct io_ctl io_ctl; LIST_HEAD(bitmap_list); int entries = 0; int bitmaps = 0; int ret; + int must_iput = 0; if (!i_size_read(inode)) - return -1; + return -EIO; - ret = io_ctl_init(&io_ctl, inode, root, 1); + WARN_ON(io_ctl->pages); + ret = io_ctl_init(io_ctl, inode, root, 1); if (ret) - return -1; + return ret; if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) { down_write(&block_group->data_rwsem); @@ -1143,55 +1249,59 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, up_write(&block_group->data_rwsem); BTRFS_I(inode)->generation = 0; ret = 0; + must_iput = 1; goto out; } spin_unlock(&block_group->lock); } /* Lock all pages first so we can lock the extent safely. */ - io_ctl_prepare_pages(&io_ctl, inode, 0); + ret = io_ctl_prepare_pages(io_ctl, inode, 0); + if (ret) + goto out; lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 0, &cached_state); - io_ctl_set_generation(&io_ctl, trans->transid); + io_ctl_set_generation(io_ctl, trans->transid); mutex_lock(&ctl->cache_writeout_mutex); /* Write out the extent entries in the free space cache */ - ret = write_cache_extent_entries(&io_ctl, ctl, + spin_lock(&ctl->tree_lock); + ret = write_cache_extent_entries(io_ctl, ctl, block_group, &entries, &bitmaps, &bitmap_list); - if (ret) { - mutex_unlock(&ctl->cache_writeout_mutex); - goto out_nospc; - } + if (ret) + goto out_nospc_locked; /* * Some spaces that are freed in the current transaction are pinned, * they will be added into free space cache after the transaction is * committed, we shouldn't lose them. + * + * If this changes while we are working we'll get added back to + * the dirty list and redo it. No locking needed */ - ret = write_pinned_extent_entries(root, block_group, &io_ctl, &entries); - if (ret) { - mutex_unlock(&ctl->cache_writeout_mutex); - goto out_nospc; - } + ret = write_pinned_extent_entries(root, block_group, io_ctl, &entries); + if (ret) + goto out_nospc_locked; /* * At last, we write out all the bitmaps and keep cache_writeout_mutex * locked while doing it because a concurrent trim can be manipulating * or freeing the bitmap. 
*/ - ret = write_bitmap_entries(&io_ctl, &bitmap_list); + ret = write_bitmap_entries(io_ctl, &bitmap_list); + spin_unlock(&ctl->tree_lock); mutex_unlock(&ctl->cache_writeout_mutex); if (ret) goto out_nospc; /* Zero out the rest of the pages just to make sure */ - io_ctl_zero_remaining_pages(&io_ctl); + io_ctl_zero_remaining_pages(io_ctl); /* Everything is written out, now we dirty the pages in the file. */ - ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages, + ret = btrfs_dirty_pages(root, inode, io_ctl->pages, io_ctl->num_pages, 0, i_size_read(inode), &cached_state); if (ret) goto out_nospc; @@ -1202,30 +1312,44 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, * Release the pages and unlock the extent, we will flush * them out later */ - io_ctl_drop_pages(&io_ctl); + io_ctl_drop_pages(io_ctl); unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, &cached_state, GFP_NOFS); - /* Flush the dirty pages in the cache file. */ - ret = flush_dirty_cache(inode); + /* + * at this point the pages are under IO and we're happy. + * The caller is responsible for waiting on them and updating + * the cache and the inode. + */ + io_ctl->entries = entries; + io_ctl->bitmaps = bitmaps; + + ret = btrfs_fdatawrite_range(inode, 0, (u64)-1); if (ret) goto out; - /* Update the cache item to tell everyone this cache file is valid. */ - ret = update_cache_item(trans, root, inode, path, offset, - entries, bitmaps); + return 0; + out: - io_ctl_free(&io_ctl); + io_ctl->inode = NULL; + io_ctl_free(io_ctl); if (ret) { invalidate_inode_pages2(inode->i_mapping); BTRFS_I(inode)->generation = 0; } btrfs_update_inode(trans, root, inode); + if (must_iput) + iput(inode); return ret; +out_nospc_locked: + cleanup_bitmap_list(&bitmap_list); + spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); + out_nospc: - cleanup_write_cache_enospc(inode, &io_ctl, &cached_state, &bitmap_list); + cleanup_write_cache_enospc(inode, io_ctl, &cached_state, &bitmap_list); if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) up_write(&block_group->data_rwsem); @@ -1241,7 +1365,6 @@ int btrfs_write_out_cache(struct btrfs_root *root, struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct inode *inode; int ret = 0; - enum btrfs_disk_cache_state dcs = BTRFS_DC_WRITTEN; root = root->fs_info->tree_root; @@ -1250,34 +1373,34 @@ int btrfs_write_out_cache(struct btrfs_root *root, spin_unlock(&block_group->lock); return 0; } - - if (block_group->delalloc_bytes) { - block_group->disk_cache_state = BTRFS_DC_WRITTEN; - spin_unlock(&block_group->lock); - return 0; - } spin_unlock(&block_group->lock); inode = lookup_free_space_inode(root, block_group, path); if (IS_ERR(inode)) return 0; - ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans, + ret = __btrfs_write_out_cache(root, inode, ctl, block_group, + &block_group->io_ctl, trans, path, block_group->key.objectid); if (ret) { - dcs = BTRFS_DC_ERROR; - ret = 0; #ifdef DEBUG btrfs_err(root->fs_info, "failed to write free space cache for block group %llu", block_group->key.objectid); #endif + spin_lock(&block_group->lock); + block_group->disk_cache_state = BTRFS_DC_ERROR; + spin_unlock(&block_group->lock); + + block_group->io_ctl.inode = NULL; + iput(inode); } - spin_lock(&block_group->lock); - block_group->disk_cache_state = dcs; - spin_unlock(&block_group->lock); - iput(inode); + /* + * if ret == 0 the caller is expected to call btrfs_wait_cache_io + * to wait for IO and put 
the inode + */ + return ret; } @@ -1298,11 +1421,11 @@ static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset) { u64 bitmap_start; - u64 bytes_per_bitmap; + u32 bytes_per_bitmap; bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit; bitmap_start = offset - ctl->start; - bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap); + bitmap_start = div_u64(bitmap_start, bytes_per_bitmap); bitmap_start *= bytes_per_bitmap; bitmap_start += ctl->start; @@ -1521,10 +1644,10 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) u64 bitmap_bytes; u64 extent_bytes; u64 size = block_group->key.offset; - u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; - int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); + u32 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; + u32 max_bitmaps = div_u64(size + bytes_per_bg - 1, bytes_per_bg); - max_bitmaps = max(max_bitmaps, 1); + max_bitmaps = max_t(u32, max_bitmaps, 1); ASSERT(ctl->total_bitmaps <= max_bitmaps); @@ -1537,7 +1660,7 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) max_bytes = MAX_CACHE_BYTES_PER_GIG; else max_bytes = MAX_CACHE_BYTES_PER_GIG * - div64_u64(size, 1024 * 1024 * 1024); + div_u64(size, 1024 * 1024 * 1024); /* * we want to account for 1 more bitmap than what we have so we can make @@ -1552,14 +1675,14 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) } /* - * we want the extent entry threshold to always be at most 1/2 the maxw + * we want the extent entry threshold to always be at most 1/2 the max * bytes we can have, or whatever is less than that. */ extent_bytes = max_bytes - bitmap_bytes; - extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2)); + extent_bytes = min_t(u64, extent_bytes, max_bytes >> 1); ctl->extents_thresh = - div64_u64(extent_bytes, (sizeof(struct btrfs_free_space))); + div_u64(extent_bytes, sizeof(struct btrfs_free_space)); } static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, @@ -1673,7 +1796,7 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, */ if (*bytes >= align) { tmp = entry->offset - ctl->start + align - 1; - do_div(tmp, align); + tmp = div64_u64(tmp, align); tmp = tmp * align + ctl->start; align_off = tmp - entry->offset; } else { @@ -2402,11 +2525,8 @@ static void __btrfs_remove_free_space_cache_locked( } else { free_bitmap(ctl, info); } - if (need_resched()) { - spin_unlock(&ctl->tree_lock); - cond_resched(); - spin_lock(&ctl->tree_lock); - } + + cond_resched_lock(&ctl->tree_lock); } } @@ -2431,11 +2551,8 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) WARN_ON(cluster->block_group != block_group); __btrfs_return_cluster_to_free_space(block_group, cluster); - if (need_resched()) { - spin_unlock(&ctl->tree_lock); - cond_resched(); - spin_lock(&ctl->tree_lock); - } + + cond_resched_lock(&ctl->tree_lock); } __btrfs_remove_free_space_cache_locked(ctl); spin_unlock(&ctl->tree_lock); @@ -3346,13 +3463,29 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root, { struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; int ret; + struct btrfs_io_ctl io_ctl; + bool release_metadata = true; if (!btrfs_test_opt(root, INODE_MAP_CACHE)) return 0; - ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0); + memset(&io_ctl, 0, sizeof(io_ctl)); + ret = __btrfs_write_out_cache(root, inode, ctl, NULL, &io_ctl, + trans, path, 0); + if (!ret) { + /* + * At this point writepages() didn't error out, so our metadata + * reservation is 
released when the writeback finishes, at + * inode.c:btrfs_finish_ordered_io(), regardless of it finishing + * with or without an error. + */ + release_metadata = false; + ret = btrfs_wait_cache_io(root, trans, NULL, &io_ctl, path, 0); + } + if (ret) { - btrfs_delalloc_release_metadata(inode, inode->i_size); + if (release_metadata) + btrfs_delalloc_release_metadata(inode, inode->i_size); #ifdef DEBUG btrfs_err(root->fs_info, "failed to write free ino cache for root %llu", diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 88b2238a0aed..a16a029ad3b1 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -48,6 +48,8 @@ struct btrfs_free_space_op { struct btrfs_free_space *info); }; +struct btrfs_io_ctl; + struct inode *lookup_free_space_inode(struct btrfs_root *root, struct btrfs_block_group_cache *block_group, struct btrfs_path *path); @@ -60,14 +62,19 @@ int btrfs_check_trunc_cache_free_space(struct btrfs_root *root, struct btrfs_block_rsv *rsv); int btrfs_truncate_free_space_cache(struct btrfs_root *root, struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, struct inode *inode); int load_free_space_cache(struct btrfs_fs_info *fs_info, struct btrfs_block_group_cache *block_group); +int btrfs_wait_cache_io(struct btrfs_root *root, + struct btrfs_trans_handle *trans, + struct btrfs_block_group_cache *block_group, + struct btrfs_io_ctl *io_ctl, + struct btrfs_path *path, u64 offset); int btrfs_write_out_cache(struct btrfs_root *root, struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group, struct btrfs_path *path); - struct inode *lookup_free_ino_inode(struct btrfs_root *root, struct btrfs_path *path); int create_free_ino_inode(struct btrfs_root *root, diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 74faea3a516e..d4a582ac3f73 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -246,6 +246,7 @@ void btrfs_unpin_free_ino(struct btrfs_root *root) { struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; struct rb_root *rbroot = &root->free_ino_pinned->free_space_offset; + spinlock_t *rbroot_lock = &root->free_ino_pinned->tree_lock; struct btrfs_free_space *info; struct rb_node *n; u64 count; @@ -254,24 +255,30 @@ void btrfs_unpin_free_ino(struct btrfs_root *root) return; while (1) { + bool add_to_ctl = true; + + spin_lock(rbroot_lock); n = rb_first(rbroot); - if (!n) + if (!n) { + spin_unlock(rbroot_lock); break; + } info = rb_entry(n, struct btrfs_free_space, offset_index); BUG_ON(info->bitmap); /* Logic error */ if (info->offset > root->ino_cache_progress) - goto free; + add_to_ctl = false; else if (info->offset + info->bytes > root->ino_cache_progress) count = root->ino_cache_progress - info->offset + 1; else count = info->bytes; - __btrfs_add_free_space(ctl, info->offset, count); -free: rb_erase(&info->offset_index, rbroot); - kfree(info); + spin_unlock(rbroot_lock); + if (add_to_ctl) + __btrfs_add_free_space(ctl, info->offset, count); + kmem_cache_free(btrfs_free_space_cachep, info); } } @@ -456,7 +463,7 @@ again: } if (i_size_read(inode) > 0) { - ret = btrfs_truncate_free_space_cache(root, trans, inode); + ret = btrfs_truncate_free_space_cache(root, trans, NULL, inode); if (ret) { if (ret != -ENOSPC) btrfs_abort_transaction(trans, root, ret); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d2e732d7af52..e33dff356460 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -32,7 +32,6 @@ #include <linux/writeback.h> #include <linux/statfs.h> #include 
<linux/compat.h> -#include <linux/aio.h> #include <linux/bit_spinlock.h> #include <linux/xattr.h> #include <linux/posix_acl.h> @@ -43,6 +42,7 @@ #include <linux/btrfs.h> #include <linux/blkdev.h> #include <linux/posix_acl_xattr.h> +#include <linux/uio.h> #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -59,6 +59,7 @@ #include "backref.h" #include "hash.h" #include "props.h" +#include "qgroup.h" struct btrfs_iget_args { struct btrfs_key *location; @@ -470,7 +471,7 @@ again: */ if (inode_need_compress(inode)) { WARN_ON(pages); - pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); if (!pages) { /* just bail out to the uncompressed code */ goto cont; @@ -752,7 +753,6 @@ retry: } goto out_free; } - /* * here we're doing allocation and writeback of the * compressed pages @@ -3110,6 +3110,8 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root) if (empty) return; + down_read(&fs_info->delayed_iput_sem); + spin_lock(&fs_info->delayed_iput_lock); list_splice_init(&fs_info->delayed_iputs, &list); spin_unlock(&fs_info->delayed_iput_lock); @@ -3120,6 +3122,8 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root) iput(delayed->inode); kfree(delayed); } + + up_read(&root->fs_info->delayed_iput_sem); } /* @@ -3628,25 +3632,28 @@ static void btrfs_read_locked_inode(struct inode *inode) BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item); + inode->i_version = btrfs_inode_sequence(leaf, inode_item); + inode->i_generation = BTRFS_I(inode)->generation; + inode->i_rdev = 0; + rdev = btrfs_inode_rdev(leaf, inode_item); + + BTRFS_I(inode)->index_cnt = (u64)-1; + BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); + +cache_index: /* * If we were modified in the current generation and evicted from memory * and then re-read we need to do a full sync since we don't have any * idea about which extents were modified before we were evicted from * cache. + * + * This is required for both inode re-read from disk and delayed inode + * in delayed_nodes_tree. 
*/ if (BTRFS_I(inode)->last_trans == root->fs_info->generation) set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); - inode->i_version = btrfs_inode_sequence(leaf, inode_item); - inode->i_generation = BTRFS_I(inode)->generation; - inode->i_rdev = 0; - rdev = btrfs_inode_rdev(leaf, inode_item); - - BTRFS_I(inode)->index_cnt = (u64)-1; - BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); - -cache_index: path->slots[0]++; if (inode->i_nlink != 1 || path->slots[0] >= btrfs_header_nritems(leaf)) @@ -4016,16 +4023,16 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) { struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_trans_handle *trans; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int ret; trans = __unlink_start_trans(dir); if (IS_ERR(trans)) return PTR_ERR(trans); - btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0); + btrfs_record_unlink_dir(trans, dir, d_inode(dentry), 0); - ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, + ret = btrfs_unlink_inode(trans, root, dir, d_inode(dentry), dentry->d_name.name, dentry->d_name.len); if (ret) goto out; @@ -4124,7 +4131,7 @@ out: static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int err = 0; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_trans_handle *trans; @@ -4151,7 +4158,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) goto out; /* now the directory is empty */ - err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, + err = btrfs_unlink_inode(trans, root, dir, d_inode(dentry), dentry->d_name.name, dentry->d_name.len); if (!err) btrfs_i_size_write(inode, 0); @@ -4162,6 +4169,21 @@ out: return err; } +static int truncate_space_check(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytes_deleted) +{ + int ret; + + bytes_deleted = btrfs_csum_bytes_to_leaves(root, bytes_deleted); + ret = btrfs_block_rsv_add(root, &root->fs_info->trans_block_rsv, + bytes_deleted, BTRFS_RESERVE_NO_FLUSH); + if (!ret) + trans->bytes_reserved += bytes_deleted; + return ret; + +} + /* * this can truncate away extent items, csum items and directory items. * It starts at a high offset and removes keys until it can't find @@ -4187,7 +4209,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u64 extent_num_bytes = 0; u64 extent_offset = 0; u64 item_end = 0; - u64 last_size = (u64)-1; + u64 last_size = new_size; u32 found_type = (u8)-1; int found_extent; int del_item; @@ -4197,9 +4219,21 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, int ret; int err = 0; u64 ino = btrfs_ino(inode); + u64 bytes_deleted = 0; + bool be_nice = 0; + bool should_throttle = 0; + bool should_end = 0; BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); + /* + * for non-free space inodes and ref cows, we want to back off from + * time to time + */ + if (!btrfs_is_free_space_inode(inode) && + test_bit(BTRFS_ROOT_REF_COWS, &root->state)) + be_nice = 1; + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -4229,6 +4263,19 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, key.type = (u8)-1; search_again: + /* + * with a 16K leaf size and 128MB extents, you can actually queue + * up a huge file in a single leaf. 
Most of the time that + * bytes_deleted is > 0, it will be huge by the time we get here + */ + if (be_nice && bytes_deleted > 32 * 1024 * 1024) { + if (btrfs_should_end_transaction(trans, root)) { + err = -EAGAIN; + goto error; + } + } + + path->leave_spinning = 1; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) { @@ -4371,22 +4418,39 @@ delete: } else { break; } + should_throttle = 0; + if (found_extent && (test_bit(BTRFS_ROOT_REF_COWS, &root->state) || root == root->fs_info->tree_root)) { btrfs_set_path_blocking(path); + bytes_deleted += extent_num_bytes; ret = btrfs_free_extent(trans, root, extent_start, extent_num_bytes, 0, btrfs_header_owner(leaf), ino, extent_offset, 0); BUG_ON(ret); + if (btrfs_should_throttle_delayed_refs(trans, root)) + btrfs_async_run_delayed_refs(root, + trans->delayed_ref_updates * 2, 0); + if (be_nice) { + if (truncate_space_check(trans, root, + extent_num_bytes)) { + should_end = 1; + } + if (btrfs_should_throttle_delayed_refs(trans, + root)) { + should_throttle = 1; + } + } } if (found_type == BTRFS_INODE_ITEM_KEY) break; if (path->slots[0] == 0 || - path->slots[0] != pending_del_slot) { + path->slots[0] != pending_del_slot || + should_throttle || should_end) { if (pending_del_nr) { ret = btrfs_del_items(trans, root, path, pending_del_slot, @@ -4399,6 +4463,23 @@ delete: pending_del_nr = 0; } btrfs_release_path(path); + if (should_throttle) { + unsigned long updates = trans->delayed_ref_updates; + if (updates) { + trans->delayed_ref_updates = 0; + ret = btrfs_run_delayed_refs(trans, root, updates * 2); + if (ret && !err) + err = ret; + } + } + /* + * if we failed to refill our space rsv, bail out + * and let the transaction restart + */ + if (should_end) { + err = -EAGAIN; + goto error; + } goto search_again; } else { path->slots[0]--; @@ -4412,10 +4493,20 @@ out: btrfs_abort_transaction(trans, root, ret); } error: - if (last_size != (u64)-1 && - root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) btrfs_ordered_update_i_size(inode, last_size, NULL); + btrfs_free_path(path); + + if (be_nice && bytes_deleted > 32 * 1024 * 1024) { + unsigned long updates = trans->delayed_ref_updates; + if (updates) { + trans->delayed_ref_updates = 0; + ret = btrfs_run_delayed_refs(trans, root, updates * 2); + if (ret && !err) + err = ret; + } + } return err; } @@ -4826,7 +4917,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct btrfs_root *root = BTRFS_I(inode)->root; int err; @@ -4894,24 +4985,41 @@ static void evict_inode_truncate_pages(struct inode *inode) } write_unlock(&map_tree->lock); + /* + * Keep looping until we have no more ranges in the io tree. + * We can have ongoing bios started by readpages (called from readahead) + * that have their endio callback (extent_io.c:end_bio_extent_readpage) + * still in progress (unlocked the pages in the bio but did not yet + * unlock the ranges in the io tree). Therefore some + * ranges can still be locked and eviction can start, because before + * submitting those bios, which are executed by a separate task (work + * queue kthread), inode references (inode->i_count) were not taken + * (which would be dropped in the end io callback of each bio). 
+ * Therefore here we effectively end up waiting for those bios and + * anyone else holding locked ranges without having bumped the inode's + * reference count - if we don't do it, when they access the inode's + * io_tree to unlock a range it may be too late, leading to a + * use-after-free issue. + */ spin_lock(&io_tree->lock); while (!RB_EMPTY_ROOT(&io_tree->state)) { struct extent_state *state; struct extent_state *cached_state = NULL; + u64 start; + u64 end; node = rb_first(&io_tree->state); state = rb_entry(node, struct extent_state, rb_node); - atomic_inc(&state->refs); + start = state->start; + end = state->end; spin_unlock(&io_tree->lock); - lock_extent_bits(io_tree, state->start, state->end, - 0, &cached_state); - clear_extent_bit(io_tree, state->start, state->end, + lock_extent_bits(io_tree, start, end, 0, &cached_state); + clear_extent_bit(io_tree, start, end, EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1, &cached_state, GFP_NOFS); - free_extent_state(state); cond_resched(); spin_lock(&io_tree->lock); @@ -4924,6 +5032,7 @@ void btrfs_evict_inode(struct inode *inode) struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_block_rsv *rsv, *global_rsv; + int steal_from_global = 0; u64 min_size = btrfs_calc_trunc_metadata_size(root, 1); int ret; @@ -4991,9 +5100,20 @@ void btrfs_evict_inode(struct inode *inode) * hard as possible to get this to work. */ if (ret) - ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size); + steal_from_global++; + else + steal_from_global = 0; + ret = 0; - if (ret) { + /* + * steal_from_global == 0: we reserved stuff, hooray! + * steal_from_global == 1: we didn't reserve stuff, boo! + * steal_from_global == 2: we've committed, still not a lot of + * room but maybe we'll have room in the global reserve this + * time. + * steal_from_global == 3: abandon all hope! + */ + if (steal_from_global > 2) { btrfs_warn(root->fs_info, "Could not get space for a delete, will truncate on mount %d", ret); @@ -5009,10 +5129,40 @@ void btrfs_evict_inode(struct inode *inode) goto no_delete; } + /* + * We can't just steal from the global reserve, we need to make + * sure there is room to do it, if not we need to commit and try + * again. + */ + if (steal_from_global) { + if (!btrfs_check_space_for_delayed_refs(trans, root)) + ret = btrfs_block_rsv_migrate(global_rsv, rsv, + min_size); + else + ret = -ENOSPC; + } + + /* + * Couldn't steal from the global reserve, we have too much + * pending stuff built up, commit the transaction and try it + * again. 
+ */ + if (ret) { + ret = btrfs_commit_transaction(trans, root); + if (ret) { + btrfs_orphan_del(NULL, inode); + btrfs_free_block_rsv(root, rsv); + goto no_delete; + } + continue; + } else { + steal_from_global = 0; + } + trans->block_rsv = rsv; ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); - if (ret != -ENOSPC) + if (ret != -ENOSPC && ret != -EAGAIN) break; trans->block_rsv = &root->fs_info->trans_block_rsv; @@ -5416,10 +5566,10 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) static int btrfs_dentry_delete(const struct dentry *dentry) { struct btrfs_root *root; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); if (!inode && !IS_ROOT(dentry)) - inode = dentry->d_parent->d_inode; + inode = d_inode(dentry->d_parent); if (inode) { root = BTRFS_I(inode)->root; @@ -6226,7 +6376,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, { struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); u64 index; int err; int drop_inode = 0; @@ -7396,6 +7546,7 @@ unlock: current->journal_info = outstanding_extents; btrfs_free_reserved_data_space(inode, len); + set_bit(BTRFS_INODE_DIO_READY, &BTRFS_I(inode)->runtime_flags); } /* @@ -7721,8 +7872,6 @@ static void btrfs_endio_direct_write(struct bio *bio, int err) struct bio *dio_bio; int ret; - if (err) - goto out_done; again: ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, &ordered_offset, @@ -7745,7 +7894,6 @@ out_test: ordered = NULL; goto again; } -out_done: dio_bio = dip->dio_bio; kfree(dip); @@ -8013,9 +8161,8 @@ out_err: static void btrfs_submit_direct(int rw, struct bio *dio_bio, struct inode *inode, loff_t file_offset) { - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_dio_private *dip; - struct bio *io_bio; + struct btrfs_dio_private *dip = NULL; + struct bio *io_bio = NULL; struct btrfs_io_bio *btrfs_bio; int skip_sum; int write = rw & REQ_WRITE; @@ -8032,7 +8179,7 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, dip = kzalloc(sizeof(*dip), GFP_NOFS); if (!dip) { ret = -ENOMEM; - goto free_io_bio; + goto free_ordered; } dip->private = dio_bio->bi_private; @@ -8060,28 +8207,58 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, if (btrfs_bio->end_io) btrfs_bio->end_io(btrfs_bio, ret); -free_io_bio: - bio_put(io_bio); free_ordered: /* - * If this is a write, we need to clean up the reserved space and kill - * the ordered extent. + * If we arrived here it means we either failed to submit the dip, + * failed to clone the dio_bio, or failed to allocate the + * dip. If we cloned the dio_bio and allocated the dip, we can just + * call bio_endio against our io_bio so that we get proper resource + * cleanup if we fail to submit the dip; otherwise, we must do the + * same as btrfs_endio_direct_[write|read] because we can't call these + * callbacks - they require an allocated dip and a clone of dio_bio. 
*/ - if (write) { - struct btrfs_ordered_extent *ordered; - ordered = btrfs_lookup_ordered_extent(inode, file_offset); - if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) && - !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) - btrfs_free_reserved_extent(root, ordered->start, - ordered->disk_len, 1); - btrfs_put_ordered_extent(ordered); - btrfs_put_ordered_extent(ordered); + if (io_bio && dip) { + bio_endio(io_bio, ret); + /* + * The end io callbacks free our dip, do the final put on io_bio + * and all the cleanup and final put for dio_bio (through + * dio_end_io()). + */ + dip = NULL; + io_bio = NULL; + } else { + if (write) { + struct btrfs_ordered_extent *ordered; + + ordered = btrfs_lookup_ordered_extent(inode, + file_offset); + set_bit(BTRFS_ORDERED_IOERR, &ordered->flags); + /* + * Decrements our ref on the ordered extent and removes + * the ordered extent from the inode's ordered tree, + * doing all the proper resource cleanup such as for the + * reserved space and waking up any waiters for this + * ordered extent (through btrfs_remove_ordered_extent). + */ + btrfs_finish_ordered_io(ordered); + } else { + unlock_extent(&BTRFS_I(inode)->io_tree, file_offset, + file_offset + dio_bio->bi_iter.bi_size - 1); + } + clear_bit(BIO_UPTODATE, &dio_bio->bi_flags); + /* + * Releases and cleans up our dio_bio, no need to bio_put() + * nor bio_endio()/bio_io_error() against dio_bio. + */ + dio_end_io(dio_bio, ret); } - bio_endio(dio_bio, ret); + if (io_bio) + bio_put(io_bio); + kfree(dip); } -static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb, +static ssize_t check_direct_IO(struct btrfs_root *root, struct kiocb *iocb, const struct iov_iter *iter, loff_t offset) { int seg; @@ -8096,7 +8273,7 @@ static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *io goto out; /* If this is a write we don't need to check anymore */ - if (rw & WRITE) + if (iov_iter_rw(iter) == WRITE) return 0; /* * Check to make sure we don't have duplicate iov_base's in this @@ -8114,8 +8291,8 @@ out: return retval; } -static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, - struct iov_iter *iter, loff_t offset) +static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, + loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; @@ -8126,10 +8303,10 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, bool relock = false; ssize_t ret; - if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iter, offset)) + if (check_direct_IO(BTRFS_I(inode)->root, iocb, iter, offset)) return 0; - atomic_inc(&inode->i_dio_count); + inode_dio_begin(inode); smp_mb__after_atomic(); /* @@ -8144,7 +8321,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, filemap_fdatawrite_range(inode->i_mapping, offset, offset + count - 1); - if (rw & WRITE) { + if (iov_iter_rw(iter) == WRITE) { /* * If the write DIO is beyond the EOF, we need to update * the isize, but it is protected by i_mutex. 
So we can @@ -8169,26 +8346,35 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, current->journal_info = &outstanding_extents; } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags)) { - inode_dio_done(inode); + inode_dio_end(inode); flags = DIO_LOCKING | DIO_SKIP_HOLES; wakeup = false; } - ret = __blockdev_direct_IO(rw, iocb, inode, - BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, - iter, offset, btrfs_get_blocks_direct, NULL, - btrfs_submit_direct, flags); - if (rw & WRITE) { + ret = __blockdev_direct_IO(iocb, inode, + BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, + iter, offset, btrfs_get_blocks_direct, NULL, + btrfs_submit_direct, flags); + if (iov_iter_rw(iter) == WRITE) { current->journal_info = NULL; - if (ret < 0 && ret != -EIOCBQUEUED) - btrfs_delalloc_release_space(inode, count); - else if (ret >= 0 && (size_t)ret < count) + if (ret < 0 && ret != -EIOCBQUEUED) { + /* + * If the error comes from the submitting stage, + * btrfs_get_blocks_direct() has freed data space, + * and metadata space will be handled by + * finish_ordered_fn, don't do that again to make + * sure bytes_may_use is correct. + */ + if (!test_and_clear_bit(BTRFS_INODE_DIO_READY, + &BTRFS_I(inode)->runtime_flags)) + btrfs_delalloc_release_space(inode, count); + } else if (ret >= 0 && (size_t)ret < count) btrfs_delalloc_release_space(inode, count - (size_t)ret); } out: if (wakeup) - inode_dio_done(inode); + inode_dio_end(inode); if (relock) mutex_lock(&inode->i_mutex); @@ -8581,7 +8767,7 @@ static int btrfs_truncate(struct inode *inode) ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, BTRFS_EXTENT_DATA_KEY); - if (ret != -ENOSPC) { + if (ret != -ENOSPC && ret != -EAGAIN) { err = ret; break; } @@ -8875,7 +9061,7 @@ static int btrfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { u64 delalloc_bytes; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); u32 blocksize = inode->i_sb->s_blocksize; generic_fillattr(inode, stat); @@ -8896,8 +9082,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(old_dir)->root; struct btrfs_root *dest = BTRFS_I(new_dir)->root; - struct inode *new_inode = new_dentry->d_inode; - struct inode *old_inode = old_dentry->d_inode; + struct inode *new_inode = d_inode(new_dentry); + struct inode *old_inode = d_inode(old_dentry); struct timespec ctime = CURRENT_TIME; u64 index = 0; u64 root_objectid; @@ -9009,7 +9195,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, old_dentry->d_name.len); } else { ret = __btrfs_unlink_inode(trans, root, old_dir, - old_dentry->d_inode, + d_inode(old_dentry), old_dentry->d_name.name, old_dentry->d_name.len); if (!ret) @@ -9033,12 +9219,12 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, BUG_ON(new_inode->i_nlink == 0); } else { ret = btrfs_unlink_inode(trans, dest, new_dir, - new_dentry->d_inode, + d_inode(new_dentry), new_dentry->d_name.name, new_dentry->d_name.len); } if (!ret && new_inode->i_nlink == 0) - ret = btrfs_orphan_add(trans, new_dentry->d_inode); + ret = btrfs_orphan_add(trans, d_inode(new_dentry)); if (ret) { btrfs_abort_transaction(trans, root, ret); goto out_fail; @@ -9451,6 +9637,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, btrfs_end_transaction(trans, root); break; } + btrfs_drop_extent_cache(inode, cur_offset, cur_offset + ins.offset -1, 
0); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 74609b931ba5..0770c91586ca 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -87,7 +87,8 @@ struct btrfs_ioctl_received_subvol_args_32 { static int btrfs_clone(struct inode *src, struct inode *inode, - u64 off, u64 olen, u64 olen_aligned, u64 destoff); + u64 off, u64 olen, u64 olen_aligned, u64 destoff, + int no_time_update); /* Mask out flags that are inappropriate for the given type of inode. */ static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) @@ -456,6 +457,13 @@ static noinline int create_subvol(struct inode *dir, if (ret) return ret; + /* + * Don't create a subvolume whose level is not zero. Or qgroup will be + * screwed up since it assumes the subvolume qgroup's level to be 0. + */ + if (btrfs_qgroup_level(objectid)) + return -ENOSPC; + btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); /* * The same as the snapshot creation, please see the comment @@ -546,8 +554,8 @@ static noinline int create_subvol(struct inode *dir, key.offset = (u64)-1; new_root = btrfs_read_fs_root_no_name(root->fs_info, &key); if (IS_ERR(new_root)) { - btrfs_abort_transaction(trans, root, PTR_ERR(new_root)); ret = PTR_ERR(new_root); + btrfs_abort_transaction(trans, root, ret); goto fail; } @@ -717,7 +725,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (ret) goto fail; - inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); + inode = btrfs_lookup_dentry(d_inode(dentry->d_parent), dentry); if (IS_ERR(inode)) { ret = PTR_ERR(inode); goto fail; @@ -761,10 +769,10 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir) { int error; - if (!victim->d_inode) + if (d_really_is_negative(victim)) return -ENOENT; - BUG_ON(victim->d_parent->d_inode != dir); + BUG_ON(d_inode(victim->d_parent) != dir); audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE); error = inode_permission(dir, MAY_WRITE | MAY_EXEC); @@ -772,8 +780,8 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir) return error; if (IS_APPEND(dir)) return -EPERM; - if (check_sticky(dir, victim->d_inode) || IS_APPEND(victim->d_inode) || - IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode)) + if (check_sticky(dir, d_inode(victim)) || IS_APPEND(d_inode(victim)) || + IS_IMMUTABLE(d_inode(victim)) || IS_SWAPFILE(d_inode(victim))) return -EPERM; if (isdir) { if (!d_is_dir(victim)) @@ -792,7 +800,7 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir) /* copy of may_create in fs/namei.c() */ static inline int btrfs_may_create(struct inode *dir, struct dentry *child) { - if (child->d_inode) + if (d_really_is_positive(child)) return -EEXIST; if (IS_DEADDIR(dir)) return -ENOENT; @@ -810,7 +818,7 @@ static noinline int btrfs_mksubvol(struct path *parent, u64 *async_transid, bool readonly, struct btrfs_qgroup_inherit *inherit) { - struct inode *dir = parent->dentry->d_inode; + struct inode *dir = d_inode(parent->dentry); struct dentry *dentry; int error; @@ -824,7 +832,7 @@ static noinline int btrfs_mksubvol(struct path *parent, goto out_unlock; error = -EEXIST; - if (dentry->d_inode) + if (d_really_is_positive(dentry)) goto out_dput; error = btrfs_may_create(dir, dentry); @@ -1311,7 +1319,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, i = range->start >> PAGE_CACHE_SHIFT; } if (!max_to_defrag) - max_to_defrag = last_index + 1; + max_to_defrag = last_index - i + 1; /* * make writeback start from i, so the defrag range can be @@ -1361,7 
+1369,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, ra_index = max(i, ra_index); btrfs_force_ra(inode->i_mapping, ra, file, ra_index, cluster); - ra_index += max_cluster; + ra_index += cluster; } mutex_lock(&inode->i_mutex); @@ -1564,7 +1572,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, goto out_free; } - do_div(new_size, root->sectorsize); + new_size = div_u64(new_size, root->sectorsize); new_size *= root->sectorsize; printk_in_rcu(KERN_INFO "BTRFS: new size for %s is %llu\n", @@ -2264,10 +2272,7 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file, { struct btrfs_ioctl_ino_lookup_args *args; struct inode *inode; - int ret; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; + int ret = 0; args = memdup_user(argp, sizeof(*args)); if (IS_ERR(args)) @@ -2275,13 +2280,28 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file, inode = file_inode(file); + /* + * Unprivileged query to obtain the containing subvolume root id. The + * path is reset so it's consistent with btrfs_search_path_in_tree. + */ if (args->treeid == 0) args->treeid = BTRFS_I(inode)->root->root_key.objectid; + if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) { + args->name[0] = 0; + goto out; + } + + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out; + } + ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info, args->treeid, args->objectid, args->name); +out: if (ret == 0 && copy_to_user(argp, args, sizeof(*args))) ret = -EFAULT; @@ -2294,7 +2314,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, { struct dentry *parent = file->f_path.dentry; struct dentry *dentry; - struct inode *dir = parent->d_inode; + struct inode *dir = d_inode(parent); struct inode *inode; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *dest = NULL; @@ -2333,12 +2353,12 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, goto out_unlock_dir; } - if (!dentry->d_inode) { + if (d_really_is_negative(dentry)) { err = -ENOENT; goto out_dput; } - inode = dentry->d_inode; + inode = d_inode(dentry); dest = BTRFS_I(inode)->root; if (!capable(CAP_SYS_ADMIN)) { /* @@ -2403,11 +2423,9 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, "Attempt to delete subvolume %llu during send", dest->root_key.objectid); err = -EPERM; - goto out_dput; + goto out_unlock_inode; } - d_invalidate(dentry); - down_write(&root->fs_info->subvol_sem); err = may_destroy_subvol(dest); @@ -2498,9 +2516,10 @@ out_up_write: root_flags & ~BTRFS_ROOT_SUBVOL_DEAD); spin_unlock(&dest->root_item_lock); } +out_unlock_inode: mutex_unlock(&inode->i_mutex); if (!err) { - shrink_dcache_sb(root->fs_info->sb); + d_invalidate(dentry); btrfs_invalidate_inodes(dest); d_delete(dentry); ASSERT(dest->send_in_progress == 0); @@ -2747,14 +2766,11 @@ out: return ret; } -static struct page *extent_same_get_page(struct inode *inode, u64 off) +static struct page *extent_same_get_page(struct inode *inode, pgoff_t index) { struct page *page; - pgoff_t index; struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; - index = off >> PAGE_CACHE_SHIFT; - page = grab_cache_page(inode->i_mapping, index); if (!page) return NULL; @@ -2775,6 +2791,20 @@ static struct page *extent_same_get_page(struct inode *inode, u64 off) return page; } +static int gather_extent_pages(struct inode *inode, struct page **pages, + int num_pages, u64 off) +{ + int i; + pgoff_t index = off >> PAGE_CACHE_SHIFT; + + for (i = 0; i < num_pages; i++) { + pages[i] = extent_same_get_page(inode, index + i); + if 
(!pages[i]) + return -ENOMEM; + } + return 0; +} + static inline void lock_extent_range(struct inode *inode, u64 off, u64 len) { /* do any pending delalloc/csum calc on src, one way or @@ -2800,52 +2830,120 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len) } } -static void btrfs_double_unlock(struct inode *inode1, u64 loff1, - struct inode *inode2, u64 loff2, u64 len) +static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2) { - unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); - unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); - mutex_unlock(&inode1->i_mutex); mutex_unlock(&inode2->i_mutex); } -static void btrfs_double_lock(struct inode *inode1, u64 loff1, - struct inode *inode2, u64 loff2, u64 len) +static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2) +{ + if (inode1 < inode2) + swap(inode1, inode2); + + mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); + if (inode1 != inode2) + mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); +} + +static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, + struct inode *inode2, u64 loff2, u64 len) +{ + unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); + unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); +} + +static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1, + struct inode *inode2, u64 loff2, u64 len) { if (inode1 < inode2) { swap(inode1, inode2); swap(loff1, loff2); } - - mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); lock_extent_range(inode1, loff1, len); - if (inode1 != inode2) { - mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); + if (inode1 != inode2) lock_extent_range(inode2, loff2, len); +} + +struct cmp_pages { + int num_pages; + struct page **src_pages; + struct page **dst_pages; +}; + +static void btrfs_cmp_data_free(struct cmp_pages *cmp) +{ + int i; + struct page *pg; + + for (i = 0; i < cmp->num_pages; i++) { + pg = cmp->src_pages[i]; + if (pg) + page_cache_release(pg); + pg = cmp->dst_pages[i]; + if (pg) + page_cache_release(pg); + } + kfree(cmp->src_pages); + kfree(cmp->dst_pages); +} + +static int btrfs_cmp_data_prepare(struct inode *src, u64 loff, + struct inode *dst, u64 dst_loff, + u64 len, struct cmp_pages *cmp) +{ + int ret; + int num_pages = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT; + struct page **src_pgarr, **dst_pgarr; + + /* + * We must gather up all the pages before we initiate our + * extent locking. We use an array for the page pointers. Size + * of the array is bounded by len, which is in turn bounded by + * BTRFS_MAX_DEDUPE_LEN. 
+ */ + src_pgarr = kzalloc(num_pages * sizeof(struct page *), GFP_NOFS); + dst_pgarr = kzalloc(num_pages * sizeof(struct page *), GFP_NOFS); + if (!src_pgarr || !dst_pgarr) { + kfree(src_pgarr); + kfree(dst_pgarr); + return -ENOMEM; } + cmp->num_pages = num_pages; + cmp->src_pages = src_pgarr; + cmp->dst_pages = dst_pgarr; + + ret = gather_extent_pages(src, cmp->src_pages, cmp->num_pages, loff); + if (ret) + goto out; + + ret = gather_extent_pages(dst, cmp->dst_pages, cmp->num_pages, dst_loff); + +out: + if (ret) + btrfs_cmp_data_free(cmp); + return 0; } static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst, - u64 dst_loff, u64 len) + u64 dst_loff, u64 len, struct cmp_pages *cmp) { int ret = 0; + int i; struct page *src_page, *dst_page; unsigned int cmp_len = PAGE_CACHE_SIZE; void *addr, *dst_addr; + i = 0; while (len) { if (len < PAGE_CACHE_SIZE) cmp_len = len; - src_page = extent_same_get_page(src, loff); - if (!src_page) - return -EINVAL; - dst_page = extent_same_get_page(dst, dst_loff); - if (!dst_page) { - page_cache_release(src_page); - return -EINVAL; - } + BUG_ON(i >= cmp->num_pages); + + src_page = cmp->src_pages[i]; + dst_page = cmp->dst_pages[i]; + addr = kmap_atomic(src_page); dst_addr = kmap_atomic(dst_page); @@ -2857,26 +2955,30 @@ static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst, kunmap_atomic(addr); kunmap_atomic(dst_addr); - page_cache_release(src_page); - page_cache_release(dst_page); if (ret) break; - loff += cmp_len; - dst_loff += cmp_len; len -= cmp_len; + i++; } return ret; } -static int extent_same_check_offsets(struct inode *inode, u64 off, u64 len) +static int extent_same_check_offsets(struct inode *inode, u64 off, u64 *plen, + u64 olen) { + u64 len = *plen; u64 bs = BTRFS_I(inode)->root->fs_info->sb->s_blocksize; - if (off + len > inode->i_size || off + len < off) + if (off + olen > inode->i_size || off + olen < off) return -EINVAL; + + /* if we extend to eof, continue to block boundary */ + if (off + len == inode->i_size) + *plen = len = ALIGN(inode->i_size, bs) - off; + /* Check that we are block aligned - btrfs_clone() requires this */ if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs)) return -EINVAL; @@ -2884,28 +2986,67 @@ static int extent_same_check_offsets(struct inode *inode, u64 off, u64 len) return 0; } -static int btrfs_extent_same(struct inode *src, u64 loff, u64 len, +static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, struct inode *dst, u64 dst_loff) { int ret; + u64 len = olen; + struct cmp_pages cmp; + int same_inode = 0; + u64 same_lock_start = 0; + u64 same_lock_len = 0; - /* - * btrfs_clone() can't handle extents in the same file - * yet. Once that works, we can drop this check and replace it - * with a check for the same inode, but overlapping extents. - */ if (src == dst) - return -EINVAL; + same_inode = 1; + + if (len == 0) + return 0; - btrfs_double_lock(src, loff, dst, dst_loff, len); + if (same_inode) { + mutex_lock(&src->i_mutex); - ret = extent_same_check_offsets(src, loff, len); - if (ret) - goto out_unlock; + ret = extent_same_check_offsets(src, loff, &len, olen); + if (ret) + goto out_unlock; - ret = extent_same_check_offsets(dst, dst_loff, len); - if (ret) - goto out_unlock; + /* + * Single inode case wants the same checks, except we + * don't want our length pushed out past i_size as + * comparing that data range makes no sense. + * + * extent_same_check_offsets() will do this for an + * unaligned length at i_size, so catch it here and + * reject the request. 
+ * + * This effectively means we require aligned extents + * for the single-inode case, whereas the other cases + * allow an unaligned length so long as it ends at + * i_size. + */ + if (len != olen) { + ret = -EINVAL; + goto out_unlock; + } + + /* Check for overlapping ranges */ + if (dst_loff + len > loff && dst_loff < loff + len) { + ret = -EINVAL; + goto out_unlock; + } + + same_lock_start = min_t(u64, loff, dst_loff); + same_lock_len = max_t(u64, loff, dst_loff) + len - same_lock_start; + } else { + btrfs_double_inode_lock(src, dst); + + ret = extent_same_check_offsets(src, loff, &len, olen); + if (ret) + goto out_unlock; + + ret = extent_same_check_offsets(dst, dst_loff, &len, olen); + if (ret) + goto out_unlock; + } /* don't make the dst file partly checksummed */ if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != @@ -2914,12 +3055,32 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 len, goto out_unlock; } - ret = btrfs_cmp_data(src, loff, dst, dst_loff, len); + ret = btrfs_cmp_data_prepare(src, loff, dst, dst_loff, olen, &cmp); + if (ret) + goto out_unlock; + + if (same_inode) + lock_extent_range(src, same_lock_start, same_lock_len); + else + btrfs_double_extent_lock(src, loff, dst, dst_loff, len); + + /* pass original length for comparison so we stay within i_size */ + ret = btrfs_cmp_data(src, loff, dst, dst_loff, olen, &cmp); if (ret == 0) - ret = btrfs_clone(src, dst, loff, len, len, dst_loff); + ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1); + + if (same_inode) + unlock_extent(&BTRFS_I(src)->io_tree, same_lock_start, + same_lock_start + same_lock_len - 1); + else + btrfs_double_extent_unlock(src, loff, dst, dst_loff, len); + btrfs_cmp_data_free(&cmp); out_unlock: - btrfs_double_unlock(src, loff, dst, dst_loff, len); + if (same_inode) + mutex_unlock(&src->i_mutex); + else + btrfs_double_inode_unlock(src, dst); return ret; } @@ -2929,7 +3090,7 @@ out_unlock: static long btrfs_ioctl_file_extent_same(struct file *file, struct btrfs_ioctl_same_args __user *argp) { - struct btrfs_ioctl_same_args *same; + struct btrfs_ioctl_same_args *same = NULL; struct btrfs_ioctl_same_extent_info *info; struct inode *src = file_inode(file); u64 off; @@ -2959,6 +3120,7 @@ static long btrfs_ioctl_file_extent_same(struct file *file, if (IS_ERR(same)) { ret = PTR_ERR(same); + same = NULL; goto out; } @@ -3029,6 +3191,7 @@ static long btrfs_ioctl_file_extent_same(struct file *file, out: mnt_drop_write_file(file); + kfree(same); return ret; } @@ -3039,7 +3202,7 @@ out: static int check_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 disko) { - struct seq_list tree_mod_seq_elem = {}; + struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem); struct ulist *roots; struct ulist_iterator uiter; struct ulist_node *root_node = NULL; @@ -3071,13 +3234,15 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans, struct inode *inode, u64 endoff, const u64 destoff, - const u64 olen) + const u64 olen, + int no_time_update) { struct btrfs_root *root = BTRFS_I(inode)->root; int ret; inode_inc_iversion(inode); - inode->i_mtime = inode->i_ctime = CURRENT_TIME; + if (!no_time_update) + inode->i_mtime = inode->i_ctime = CURRENT_TIME; /* * We round up to the block size at eof when determining which * extents to clone above, but shouldn't round up the file size. 
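The extent-same path above acquires both inodes' i_mutex through btrfs_double_inode_lock(), which avoids an ABBA deadlock by always taking the two locks in a fixed order derived from the inode addresses (the kernel code additionally tags them with the I_MUTEX_PARENT/I_MUTEX_CHILD lockdep classes, and btrfs_double_extent_lock() applies the same ordering to the extent ranges, swapping the offsets together with the inodes). Below is a minimal user-space sketch of that address-ordering idiom; the struct, the function names, and the pthread harness are illustrative stand-ins, not kernel code.

#include <pthread.h>
#include <stdio.h>

struct object {
	pthread_mutex_t lock;
	int value;
};

/*
 * Lock two objects in a globally consistent order, mirroring
 * btrfs_double_inode_lock(): swap so the higher address is always
 * taken first. Two tasks double-locking the same pair then can
 * never each hold one lock while waiting for the other.
 */
static void double_lock(struct object *a, struct object *b)
{
	if (a < b) {			/* order by address */
		struct object *tmp = a;
		a = b;
		b = tmp;
	}
	pthread_mutex_lock(&a->lock);
	if (a != b)			/* tolerate a == b */
		pthread_mutex_lock(&b->lock);
}

static void double_unlock(struct object *a, struct object *b)
{
	pthread_mutex_unlock(&a->lock);
	if (a != b)
		pthread_mutex_unlock(&b->lock);
}

int main(void)
{
	struct object x = { PTHREAD_MUTEX_INITIALIZER, 1 };
	struct object y = { PTHREAD_MUTEX_INITIALIZER, 2 };

	/* Both argument orders acquire the same mutex first. */
	double_lock(&x, &y);
	x.value += y.value;
	double_unlock(&x, &y);

	double_lock(&y, &x);
	printf("x.value = %d\n", x.value);
	double_unlock(&y, &x);
	return 0;
}

Whether the higher or the lower address goes first is arbitrary; all that matters is that every caller agrees on the order.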
@@ -3162,13 +3327,13 @@ static void clone_update_extent_map(struct inode *inode, * @inode: Inode to clone to * @off: Offset within source to start clone from * @olen: Original length, passed by user, of range to clone - * @olen_aligned: Block-aligned value of olen, extent_same uses - * identical values here + * @olen_aligned: Block-aligned value of olen * @destoff: Offset within @inode to start clone + * @no_time_update: Whether to update mtime/ctime on the target inode */ static int btrfs_clone(struct inode *src, struct inode *inode, const u64 off, const u64 olen, const u64 olen_aligned, - const u64 destoff) + const u64 destoff, int no_time_update) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_path *path = NULL; @@ -3202,6 +3367,8 @@ static int btrfs_clone(struct inode *src, struct inode *inode, key.offset = off; while (1) { + u64 next_key_min_offset = key.offset + 1; + /* * note the key will change type as we walk through the * tree. @@ -3282,7 +3449,7 @@ process_slot: } else if (key.offset >= off + len) { break; } - + next_key_min_offset = key.offset + datal; size = btrfs_item_size_nr(leaf, slot); read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot), @@ -3421,6 +3588,20 @@ process_slot: u64 trim = 0; u64 aligned_end = 0; + /* + * Don't copy an inline extent into an offset + * greater than zero. Having an inline extent + * at such an offset results in chaos as btrfs + * isn't prepared for such cases. Just skip + * this case for the same reasons as commented + * at btrfs_ioctl_clone(). + */ + if (last_dest_end > 0) { + ret = -EOPNOTSUPP; + btrfs_end_transaction(trans, root); + goto out; + } + if (off > key.offset) { skip = off - key.offset; new_key.offset += skip; @@ -3490,14 +3671,15 @@ process_slot: root->sectorsize); ret = clone_finish_inode_update(trans, inode, last_dest_end, - destoff, olen); + destoff, olen, + no_time_update); if (ret) goto out; if (new_key.offset + datal >= destoff + len) break; } btrfs_release_path(path); - key.offset++; + key.offset = next_key_min_offset; } ret = 0; @@ -3528,7 +3710,7 @@ process_slot: clone_update_extent_map(inode, trans, NULL, last_dest_end, destoff + len - last_dest_end); ret = clone_finish_inode_update(trans, inode, destoff + len, - destoff, olen); + destoff, olen, no_time_update); } out: @@ -3626,6 +3808,11 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, if (off + len == src->i_size) len = ALIGN(src->i_size, bs) - off; + if (len == 0) { + ret = 0; + goto out_unlock; + } + /* verify the end result is block aligned */ if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) || !IS_ALIGNED(destoff, bs)) @@ -3660,7 +3847,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, lock_extent_range(inode, destoff, len); } - ret = btrfs_clone(src, inode, off, olen, len, destoff); + ret = btrfs_clone(src, inode, off, olen, len, destoff, 0); if (same_inode) { u64 lock_start = min_t(u64, off, destoff); @@ -4624,6 +4811,11 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) sa->src, sa->dst); } + /* update qgroup status and info */ + err = btrfs_run_qgroups(trans, root->fs_info); + if (err < 0) + btrfs_error(root->fs_info, ret, + "failed to update qgroup status and info\n"); err = btrfs_end_transaction(trans, root); if (err && !ret) ret = err; @@ -4669,8 +4861,7 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) /* FIXME: check if the IDs really exist */ if (sa->create) { - ret = btrfs_create_qgroup(trans, 
root->fs_info, sa->qgroupid, - NULL); + ret = btrfs_create_qgroup(trans, root->fs_info, sa->qgroupid); } else { ret = btrfs_remove_qgroup(trans, root->fs_info, sa->qgroupid); } diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 617553cdb7d3..a2f051347731 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -434,7 +434,7 @@ out: return ret; } -struct btrfs_compress_op btrfs_lzo_compress = { +const struct btrfs_compress_op btrfs_lzo_compress = { .alloc_workspace = lzo_alloc_workspace, .free_workspace = lzo_free_workspace, .compress_pages = lzo_compress_pages, diff --git a/fs/btrfs/math.h b/fs/btrfs/math.h index b7816cefbd13..1b10a3cd1195 100644 --- a/fs/btrfs/math.h +++ b/fs/btrfs/math.h @@ -28,8 +28,7 @@ static inline u64 div_factor(u64 num, int factor) if (factor == 10) return num; num *= factor; - do_div(num, 10); - return num; + return div_u64(num, 10); } static inline u64 div_factor_fine(u64 num, int factor) @@ -37,8 +36,7 @@ static inline u64 div_factor_fine(u64 num, int factor) if (factor == 100) return num; num *= factor; - do_div(num, 100); - return num; + return div_u64(num, 100); } #endif diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 157cc54fc634..52170cf1757e 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -198,9 +198,6 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, entry->file_offset = file_offset; entry->start = start; entry->len = len; - if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) && - !(type == BTRFS_ORDERED_NOCOW)) - entry->csum_bytes_left = disk_len; entry->disk_len = disk_len; entry->bytes_left = len; entry->inode = igrab(inode); @@ -286,10 +283,6 @@ void btrfs_add_ordered_sum(struct inode *inode, tree = &BTRFS_I(inode)->ordered_tree; spin_lock_irq(&tree->lock); list_add_tail(&sum->list, &entry->list); - WARN_ON(entry->csum_bytes_left < sum->len); - entry->csum_bytes_left -= sum->len; - if (entry->csum_bytes_left == 0) - wake_up(&entry->wait); spin_unlock_irq(&tree->lock); } @@ -509,7 +502,21 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans, wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags)); - list_add_tail(&ordered->trans_list, &trans->ordered); + /* + * If our ordered extent completed it means it updated the + * fs/subvol and csum trees already, so no need to make the + * current transaction's commit wait for it, as we end up + * holding memory unnecessarily and delaying the inode's iput + * until the transaction commit (we schedule an iput for the + * inode when the ordered extent's refcount drops to 0), which + * prevents it from being evictable until the transaction + * commits. 
+ */ + if (test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) + btrfs_put_ordered_extent(ordered); + else + list_add_tail(&ordered->trans_list, &trans->ordered); + spin_lock_irq(&log->log_extents_lock[index]); } spin_unlock_irq(&log->log_extents_lock[index]); @@ -545,6 +552,10 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) trace_btrfs_ordered_extent_put(entry->inode, entry); if (atomic_dec_and_test(&entry->refs)) { + ASSERT(list_empty(&entry->log_list)); + ASSERT(list_empty(&entry->trans_list)); + ASSERT(list_empty(&entry->root_extent_list)); + ASSERT(RB_EMPTY_NODE(&entry->rb_node)); if (entry->inode) btrfs_add_delayed_iput(entry->inode); while (!list_empty(&entry->list)) { @@ -572,6 +583,7 @@ void btrfs_remove_ordered_extent(struct inode *inode, spin_lock_irq(&tree->lock); node = &entry->rb_node; rb_erase(node, &tree->tree); + RB_CLEAR_NODE(node); if (tree->last == node) tree->last = NULL; set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); @@ -722,6 +734,7 @@ void btrfs_start_ordered_extent(struct inode *inode, int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) { int ret = 0; + int ret_wb = 0; u64 end; u64 orig_end; struct btrfs_ordered_extent *ordered; @@ -741,9 +754,14 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) if (ret) return ret; - ret = filemap_fdatawait_range(inode->i_mapping, start, orig_end); - if (ret) - return ret; + /* + * If we have a writeback error don't return immediately. Wait first + * for any ordered extents that haven't completed yet. This is to make + * sure no one can dirty the same page ranges and call writepages() + * before the ordered extents complete - to avoid failures (-EEXIST) + * when adding the new ordered extents to the ordered tree. + */ + ret_wb = filemap_fdatawait_range(inode->i_mapping, start, orig_end); end = orig_end; while (1) { @@ -767,7 +785,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) break; end--; } - return ret; + return ret_wb ? ret_wb : ret; } /* @@ -838,6 +856,20 @@ out: return entry; } +bool btrfs_have_ordered_extents_in_range(struct inode *inode, + u64 file_offset, + u64 len) +{ + struct btrfs_ordered_extent *oe; + + oe = btrfs_lookup_ordered_range(inode, file_offset, len); + if (oe) { + btrfs_put_ordered_extent(oe); + return true; + } + return false; +} + /* * lookup and return any extent before 'file_offset'. NULL is returned * if none is found diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index e96cd4ccd805..7176cc0fe43f 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -89,9 +89,6 @@ struct btrfs_ordered_extent { /* number of bytes that still need writing */ u64 bytes_left; - /* number of bytes that still need csumming */ - u64 csum_bytes_left; - /* * the end of the ordered extent which is behind it but * didn't update disk_i_size. 
Please see the comment of @@ -191,6 +188,9 @@ btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, u64 file_offset, u64 len); +bool btrfs_have_ordered_extents_in_range(struct inode *inode, + u64 file_offset, + u64 len); int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index 129b1dd28527..dca137b04095 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -425,3 +425,5 @@ static const char *prop_compression_extract(struct inode *inode) return NULL; } + + diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 058c79eecbfb..8a8202956576 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -34,6 +34,7 @@ #include "extent_io.h" #include "qgroup.h" + /* TODO XXX FIXME * - subvol delete -> delete when ref goes to 0? delete limits also? * - reorganize keys @@ -84,11 +85,42 @@ struct btrfs_qgroup { /* * temp variables for accounting operations + * Refer to qgroup_shared_accouting() for details. */ u64 old_refcnt; u64 new_refcnt; }; +static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq, + int mod) +{ + if (qg->old_refcnt < seq) + qg->old_refcnt = seq; + qg->old_refcnt += mod; +} + +static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq, + int mod) +{ + if (qg->new_refcnt < seq) + qg->new_refcnt = seq; + qg->new_refcnt += mod; +} + +static inline u64 btrfs_qgroup_get_old_refcnt(struct btrfs_qgroup *qg, u64 seq) +{ + if (qg->old_refcnt < seq) + return 0; + return qg->old_refcnt - seq; +} + +static inline u64 btrfs_qgroup_get_new_refcnt(struct btrfs_qgroup *qg, u64 seq) +{ + if (qg->new_refcnt < seq) + return 0; + return qg->new_refcnt - seq; +} + /* * glue structure to represent the relations between qgroups. 
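 *
 * Each instance of that glue structure is assumed to link one member
 * qgroup to one parent: the next_group list heads and ->group
 * pointers walked by the accounting code below are its fields.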
*/ @@ -644,9 +676,8 @@ out: } static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 qgroupid, - u64 flags, u64 max_rfer, u64 max_excl, - u64 rsv_rfer, u64 rsv_excl) + struct btrfs_root *root, + struct btrfs_qgroup *qgroup) { struct btrfs_path *path; struct btrfs_key key; @@ -657,7 +688,7 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, key.objectid = 0; key.type = BTRFS_QGROUP_LIMIT_KEY; - key.offset = qgroupid; + key.offset = qgroup->qgroupid; path = btrfs_alloc_path(); if (!path) @@ -673,11 +704,11 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, l = path->nodes[0]; slot = path->slots[0]; qgroup_limit = btrfs_item_ptr(l, slot, struct btrfs_qgroup_limit_item); - btrfs_set_qgroup_limit_flags(l, qgroup_limit, flags); - btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, max_rfer); - btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, max_excl); - btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, rsv_rfer); - btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, rsv_excl); + btrfs_set_qgroup_limit_flags(l, qgroup_limit, qgroup->lim_flags); + btrfs_set_qgroup_limit_max_rfer(l, qgroup_limit, qgroup->max_rfer); + btrfs_set_qgroup_limit_max_excl(l, qgroup_limit, qgroup->max_excl); + btrfs_set_qgroup_limit_rsv_rfer(l, qgroup_limit, qgroup->rsv_rfer); + btrfs_set_qgroup_limit_rsv_excl(l, qgroup_limit, qgroup->rsv_excl); btrfs_mark_buffer_dirty(l); @@ -967,6 +998,7 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans, fs_info->pending_quota_state = 0; quota_root = fs_info->quota_root; fs_info->quota_root = NULL; + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON; spin_unlock(&fs_info->qgroup_lock); btrfs_free_qgroup_config(fs_info); @@ -982,7 +1014,7 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans, list_del("a_root->dirty_list); btrfs_tree_lock(quota_root->node); - clean_tree_block(trans, tree_root, quota_root->node); + clean_tree_block(trans, tree_root->fs_info, quota_root->node); btrfs_tree_unlock(quota_root->node); btrfs_free_tree_block(trans, quota_root, quota_root->node, 0, 1); @@ -1001,6 +1033,110 @@ static void qgroup_dirty(struct btrfs_fs_info *fs_info, list_add(&qgroup->dirty, &fs_info->dirty_qgroups); } +/* + * The easy accounting, if we are adding/removing the only ref for an extent + * then this qgroup and all of the parent qgroups get their refrence and + * exclusive counts adjusted. + * + * Caller should hold fs_info->qgroup_lock. 
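+ *
+ * Hypothetical usage sketch: charging the only ref of a 16K extent
+ * owned by qgroup 257 (plus all of its parents) would be
+ *
+ *	__qgroup_excl_accounting(fs_info, tmp, 257, 16384, 1);
+ *
+ * and dropping that last ref passes sign == -1 instead.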
+ */ +static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, + struct ulist *tmp, u64 ref_root, + u64 num_bytes, int sign) +{ + struct btrfs_qgroup *qgroup; + struct btrfs_qgroup_list *glist; + struct ulist_node *unode; + struct ulist_iterator uiter; + int ret = 0; + + qgroup = find_qgroup_rb(fs_info, ref_root); + if (!qgroup) + goto out; + + qgroup->rfer += sign * num_bytes; + qgroup->rfer_cmpr += sign * num_bytes; + + WARN_ON(sign < 0 && qgroup->excl < num_bytes); + qgroup->excl += sign * num_bytes; + qgroup->excl_cmpr += sign * num_bytes; + if (sign > 0) + qgroup->reserved -= num_bytes; + + qgroup_dirty(fs_info, qgroup); + + /* Get all of the parent groups that contain this qgroup */ + list_for_each_entry(glist, &qgroup->groups, next_group) { + ret = ulist_add(tmp, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (ret < 0) + goto out; + } + + /* Iterate all of the parents and adjust their reference counts */ + ULIST_ITER_INIT(&uiter); + while ((unode = ulist_next(tmp, &uiter))) { + qgroup = u64_to_ptr(unode->aux); + qgroup->rfer += sign * num_bytes; + qgroup->rfer_cmpr += sign * num_bytes; + WARN_ON(sign < 0 && qgroup->excl < num_bytes); + qgroup->excl += sign * num_bytes; + if (sign > 0) + qgroup->reserved -= num_bytes; + qgroup->excl_cmpr += sign * num_bytes; + qgroup_dirty(fs_info, qgroup); + + /* Add any parents of the parents */ + list_for_each_entry(glist, &qgroup->groups, next_group) { + ret = ulist_add(tmp, glist->group->qgroupid, + ptr_to_u64(glist->group), GFP_ATOMIC); + if (ret < 0) + goto out; + } + } + ret = 0; +out: + return ret; +} + + +/* + * Quick path for updating qgroup with only excl refs. + * + * In that case, just update all parent will be enough. + * Or we needs to do a full rescan. + * Caller should also hold fs_info->qgroup_lock. + * + * Return 0 for quick update, return >0 for need to full rescan + * and mark INCONSISTENT flag. + * Return < 0 for other error. 
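+ *
+ * Example: when src has excl == rfer (everything it references is
+ * exclusive to it), adding or deleting the src -> dst relation only
+ * requires propagating src->excl through dst and dst's ancestors;
+ * any other state takes the >0 return and forces a full rescan.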
+ */ +static int quick_update_accounting(struct btrfs_fs_info *fs_info, + struct ulist *tmp, u64 src, u64 dst, + int sign) +{ + struct btrfs_qgroup *qgroup; + int ret = 1; + int err = 0; + + qgroup = find_qgroup_rb(fs_info, src); + if (!qgroup) + goto out; + if (qgroup->excl == qgroup->rfer) { + ret = 0; + err = __qgroup_excl_accounting(fs_info, tmp, dst, + qgroup->excl, sign); + if (err < 0) { + ret = err; + goto out; + } + } +out: + if (ret) + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + return ret; +} + int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 src, u64 dst) { @@ -1008,8 +1144,17 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, struct btrfs_qgroup *parent; struct btrfs_qgroup *member; struct btrfs_qgroup_list *list; + struct ulist *tmp; int ret = 0; + /* Check the level of src and dst first */ + if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst)) + return -EINVAL; + + tmp = ulist_alloc(GFP_NOFS); + if (!tmp) + return -ENOMEM; + mutex_lock(&fs_info->qgroup_ioctl_lock); quota_root = fs_info->quota_root; if (!quota_root) { @@ -1043,23 +1188,33 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, spin_lock(&fs_info->qgroup_lock); ret = add_relation_rb(quota_root->fs_info, src, dst); + if (ret < 0) { + spin_unlock(&fs_info->qgroup_lock); + goto out; + } + ret = quick_update_accounting(fs_info, tmp, src, dst, 1); spin_unlock(&fs_info->qgroup_lock); out: mutex_unlock(&fs_info->qgroup_ioctl_lock); + ulist_free(tmp); return ret; } -int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, +int __del_qgroup_relation(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 src, u64 dst) { struct btrfs_root *quota_root; struct btrfs_qgroup *parent; struct btrfs_qgroup *member; struct btrfs_qgroup_list *list; + struct ulist *tmp; int ret = 0; int err; - mutex_lock(&fs_info->qgroup_ioctl_lock); + tmp = ulist_alloc(GFP_NOFS); + if (!tmp) + return -ENOMEM; + quota_root = fs_info->quota_root; if (!quota_root) { ret = -EINVAL; @@ -1088,14 +1243,27 @@ exist: spin_lock(&fs_info->qgroup_lock); del_relation_rb(fs_info, src, dst); + ret = quick_update_accounting(fs_info, tmp, src, dst, -1); spin_unlock(&fs_info->qgroup_lock); out: + ulist_free(tmp); + return ret; +} + +int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 src, u64 dst) +{ + int ret = 0; + + mutex_lock(&fs_info->qgroup_ioctl_lock); + ret = __del_qgroup_relation(trans, fs_info, src, dst); mutex_unlock(&fs_info->qgroup_ioctl_lock); + return ret; } int btrfs_create_qgroup(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 qgroupid, char *name) + struct btrfs_fs_info *fs_info, u64 qgroupid) { struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; @@ -1133,6 +1301,7 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, { struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; + struct btrfs_qgroup_list *list; int ret = 0; mutex_lock(&fs_info->qgroup_ioctl_lock); @@ -1147,15 +1316,24 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, ret = -ENOENT; goto out; } else { - /* check if there are no relations to this qgroup */ - if (!list_empty(&qgroup->groups) || - !list_empty(&qgroup->members)) { + /* check if there are no children of this qgroup */ + if (!list_empty(&qgroup->members)) { ret = -EBUSY; goto out; } } ret = del_qgroup_item(trans, quota_root, qgroupid); + while (!list_empty(&qgroup->groups)) { + list = 
list_first_entry(&qgroup->groups, + struct btrfs_qgroup_list, next_group); + ret = __del_qgroup_relation(trans, fs_info, + qgroupid, + list->group->qgroupid); + if (ret) + goto out; + } + spin_lock(&fs_info->qgroup_lock); del_qgroup_rb(quota_root->fs_info, qgroupid); spin_unlock(&fs_info->qgroup_lock); @@ -1171,6 +1349,11 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; int ret = 0; + /* Sometimes we would want to clear the limit on this qgroup. + * To meet this requirement, we treat the -1 as a special value + * which tell kernel to clear the limit on this qgroup. + */ + const u64 CLEAR_VALUE = -1; mutex_lock(&fs_info->qgroup_ioctl_lock); quota_root = fs_info->quota_root; @@ -1184,307 +1367,140 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, ret = -ENOENT; goto out; } - ret = update_qgroup_limit_item(trans, quota_root, qgroupid, - limit->flags, limit->max_rfer, - limit->max_excl, limit->rsv_rfer, - limit->rsv_excl); - if (ret) { - fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; - btrfs_info(fs_info, "unable to update quota limit for %llu", - qgroupid); - } spin_lock(&fs_info->qgroup_lock); - qgroup->lim_flags = limit->flags; - qgroup->max_rfer = limit->max_rfer; - qgroup->max_excl = limit->max_excl; - qgroup->rsv_rfer = limit->rsv_rfer; - qgroup->rsv_excl = limit->rsv_excl; - spin_unlock(&fs_info->qgroup_lock); -out: - mutex_unlock(&fs_info->qgroup_ioctl_lock); - return ret; -} - -static int comp_oper_exist(struct btrfs_qgroup_operation *oper1, - struct btrfs_qgroup_operation *oper2) -{ - /* - * Ignore seq and type here, we're looking for any operation - * at all related to this extent on that root. - */ - if (oper1->bytenr < oper2->bytenr) - return -1; - if (oper1->bytenr > oper2->bytenr) - return 1; - if (oper1->ref_root < oper2->ref_root) - return -1; - if (oper1->ref_root > oper2->ref_root) - return 1; - return 0; -} - -static int qgroup_oper_exists(struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper) -{ - struct rb_node *n; - struct btrfs_qgroup_operation *cur; - int cmp; - - spin_lock(&fs_info->qgroup_op_lock); - n = fs_info->qgroup_op_tree.rb_node; - while (n) { - cur = rb_entry(n, struct btrfs_qgroup_operation, n); - cmp = comp_oper_exist(cur, oper); - if (cmp < 0) { - n = n->rb_right; - } else if (cmp) { - n = n->rb_left; + if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER) { + if (limit->max_rfer == CLEAR_VALUE) { + qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER; + limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER; + qgroup->max_rfer = 0; } else { - spin_unlock(&fs_info->qgroup_op_lock); - return -EEXIST; + qgroup->max_rfer = limit->max_rfer; } } - spin_unlock(&fs_info->qgroup_op_lock); - return 0; -} - -static int comp_oper(struct btrfs_qgroup_operation *oper1, - struct btrfs_qgroup_operation *oper2) -{ - if (oper1->bytenr < oper2->bytenr) - return -1; - if (oper1->bytenr > oper2->bytenr) - return 1; - if (oper1->seq < oper2->seq) - return -1; - if (oper1->seq > oper2->seq) - return 1; - if (oper1->ref_root < oper2->ref_root) - return -1; - if (oper1->ref_root > oper2->ref_root) - return 1; - if (oper1->type < oper2->type) - return -1; - if (oper1->type > oper2->type) - return 1; - return 0; -} - -static int insert_qgroup_oper(struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper) -{ - struct rb_node **p; - struct rb_node *parent = NULL; - struct btrfs_qgroup_operation *cur; - int cmp; - - spin_lock(&fs_info->qgroup_op_lock); - p = 
&fs_info->qgroup_op_tree.rb_node; - while (*p) { - parent = *p; - cur = rb_entry(parent, struct btrfs_qgroup_operation, n); - cmp = comp_oper(cur, oper); - if (cmp < 0) { - p = &(*p)->rb_right; - } else if (cmp) { - p = &(*p)->rb_left; + if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) { + if (limit->max_excl == CLEAR_VALUE) { + qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL; + limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL; + qgroup->max_excl = 0; } else { - spin_unlock(&fs_info->qgroup_op_lock); - return -EEXIST; + qgroup->max_excl = limit->max_excl; } } - rb_link_node(&oper->n, parent, p); - rb_insert_color(&oper->n, &fs_info->qgroup_op_tree); - spin_unlock(&fs_info->qgroup_op_lock); - return 0; -} - -/* - * Record a quota operation for processing later on. - * @trans: the transaction we are adding the delayed op to. - * @fs_info: the fs_info for this fs. - * @ref_root: the root of the reference we are acting on, - * @bytenr: the bytenr we are acting on. - * @num_bytes: the number of bytes in the reference. - * @type: the type of operation this is. - * @mod_seq: do we need to get a sequence number for looking up roots. - * - * We just add it to our trans qgroup_ref_list and carry on and process these - * operations in order at some later point. If the reference root isn't a fs - * root then we don't bother with doing anything. - * - * MUST BE HOLDING THE REF LOCK. - */ -int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 ref_root, - u64 bytenr, u64 num_bytes, - enum btrfs_qgroup_operation_type type, int mod_seq) -{ - struct btrfs_qgroup_operation *oper; - int ret; - - if (!is_fstree(ref_root) || !fs_info->quota_enabled) - return 0; - - oper = kmalloc(sizeof(*oper), GFP_NOFS); - if (!oper) - return -ENOMEM; - - oper->ref_root = ref_root; - oper->bytenr = bytenr; - oper->num_bytes = num_bytes; - oper->type = type; - oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq); - INIT_LIST_HEAD(&oper->elem.list); - oper->elem.seq = 0; - - trace_btrfs_qgroup_record_ref(oper); - - if (type == BTRFS_QGROUP_OPER_SUB_SUBTREE) { - /* - * If any operation for this bytenr/ref_root combo - * exists, then we know it's not exclusively owned and - * shouldn't be queued up. - * - * This also catches the case where we have a cloned - * extent that gets queued up multiple times during - * drop snapshot. 
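The '+' lines interleaved through this hunk give btrfs_limit_qgroup() its clear-on-(-1) semantics for each limit field. A hypothetical caller-side fragment (struct and flag names as used elsewhere in this patch; error handling elided):

	struct btrfs_qgroup_limit lim = {
		.flags = BTRFS_QGROUP_LIMIT_MAX_RFER,
		.max_rfer = (u64)-1,	/* CLEAR_VALUE: drop the limit */
	};

	/* clears MAX_RFER from qgroup->lim_flags and zeroes max_rfer */
	ret = btrfs_limit_qgroup(trans, fs_info, qgroupid, &lim);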
- */ - if (qgroup_oper_exists(fs_info, oper)) { - kfree(oper); - return 0; + if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER) { + if (limit->rsv_rfer == CLEAR_VALUE) { + qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER; + limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER; + qgroup->rsv_rfer = 0; + } else { + qgroup->rsv_rfer = limit->rsv_rfer; + } + } + if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL) { + if (limit->rsv_excl == CLEAR_VALUE) { + qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL; + limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL; + qgroup->rsv_excl = 0; + } else { + qgroup->rsv_excl = limit->rsv_excl; } } + qgroup->lim_flags |= limit->flags; + + spin_unlock(&fs_info->qgroup_lock); - ret = insert_qgroup_oper(fs_info, oper); + ret = update_qgroup_limit_item(trans, quota_root, qgroup); if (ret) { - /* Shouldn't happen so have an assert for developers */ - ASSERT(0); - kfree(oper); - return ret; + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + btrfs_info(fs_info, "unable to update quota limit for %llu", + qgroupid); } - list_add_tail(&oper->list, &trans->qgroup_ref_list); - if (mod_seq) - btrfs_get_tree_mod_seq(fs_info, &oper->elem); - - return 0; +out: + mutex_unlock(&fs_info->qgroup_ioctl_lock); + return ret; } -/* - * The easy accounting, if we are adding/removing the only ref for an extent - * then this qgroup and all of the parent qgroups get their refrence and - * exclusive counts adjusted. - */ -static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper) +int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) { - struct btrfs_qgroup *qgroup; - struct ulist *tmp; - struct btrfs_qgroup_list *glist; - struct ulist_node *unode; - struct ulist_iterator uiter; - int sign = 0; + struct btrfs_qgroup_extent_record *record; + struct btrfs_delayed_ref_root *delayed_refs; + struct rb_node *node; + u64 qgroup_to_skip; int ret = 0; - tmp = ulist_alloc(GFP_NOFS); - if (!tmp) - return -ENOMEM; + delayed_refs = &trans->transaction->delayed_refs; + qgroup_to_skip = delayed_refs->qgroup_to_skip; - spin_lock(&fs_info->qgroup_lock); - if (!fs_info->quota_root) - goto out; - qgroup = find_qgroup_rb(fs_info, oper->ref_root); - if (!qgroup) - goto out; - switch (oper->type) { - case BTRFS_QGROUP_OPER_ADD_EXCL: - sign = 1; - break; - case BTRFS_QGROUP_OPER_SUB_EXCL: - sign = -1; - break; - default: - ASSERT(0); - } - qgroup->rfer += sign * oper->num_bytes; - qgroup->rfer_cmpr += sign * oper->num_bytes; - - WARN_ON(sign < 0 && qgroup->excl < oper->num_bytes); - qgroup->excl += sign * oper->num_bytes; - qgroup->excl_cmpr += sign * oper->num_bytes; - - qgroup_dirty(fs_info, qgroup); - - /* Get all of the parent groups that contain this qgroup */ - list_for_each_entry(glist, &qgroup->groups, next_group) { - ret = ulist_add(tmp, glist->group->qgroupid, - ptr_to_u64(glist->group), GFP_ATOMIC); + /* + * No need to do lock, since this function will only be called in + * btrfs_commmit_transaction(). 
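+ *
+ * Every queued btrfs_qgroup_extent_record gets its old_roots ulist
+ * resolved here, before the delayed refs run; the commit later pairs
+ * that snapshot with a fresh btrfs_find_all_roots() result in
+ * btrfs_qgroup_account_extents() to compute the rfer/excl deltas.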
+ */ + node = rb_first(&delayed_refs->dirty_extent_root); + while (node) { + record = rb_entry(node, struct btrfs_qgroup_extent_record, + node); + ret = btrfs_find_all_roots(NULL, fs_info, record->bytenr, 0, + &record->old_roots); if (ret < 0) - goto out; + break; + if (qgroup_to_skip) + ulist_del(record->old_roots, qgroup_to_skip, 0); + node = rb_next(node); } + return ret; +} - /* Iterate all of the parents and adjust their reference counts */ - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(tmp, &uiter))) { - qgroup = u64_to_ptr(unode->aux); - qgroup->rfer += sign * oper->num_bytes; - qgroup->rfer_cmpr += sign * oper->num_bytes; - WARN_ON(sign < 0 && qgroup->excl < oper->num_bytes); - qgroup->excl += sign * oper->num_bytes; - qgroup->excl_cmpr += sign * oper->num_bytes; - qgroup_dirty(fs_info, qgroup); +struct btrfs_qgroup_extent_record +*btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_qgroup_extent_record *record) +{ + struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node; + struct rb_node *parent_node = NULL; + struct btrfs_qgroup_extent_record *entry; + u64 bytenr = record->bytenr; - /* Add any parents of the parents */ - list_for_each_entry(glist, &qgroup->groups, next_group) { - ret = ulist_add(tmp, glist->group->qgroupid, - ptr_to_u64(glist->group), GFP_ATOMIC); - if (ret < 0) - goto out; - } + while (*p) { + parent_node = *p; + entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record, + node); + if (bytenr < entry->bytenr) + p = &(*p)->rb_left; + else if (bytenr > entry->bytenr) + p = &(*p)->rb_right; + else + return entry; } - ret = 0; -out: - spin_unlock(&fs_info->qgroup_lock); - ulist_free(tmp); - return ret; + + rb_link_node(&record->node, parent_node, p); + rb_insert_color(&record->node, &delayed_refs->dirty_extent_root); + return NULL; } +#define UPDATE_NEW 0 +#define UPDATE_OLD 1 /* - * Walk all of the roots that pointed to our bytenr and adjust their refcnts as - * properly. + * Walk all of the roots that points to the bytenr and adjust their refcnts. */ -static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info, - u64 root_to_skip, struct ulist *tmp, - struct ulist *roots, struct ulist *qgroups, - u64 seq, int *old_roots, int rescan) +static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info, + struct ulist *roots, struct ulist *tmp, + struct ulist *qgroups, u64 seq, int update_old) { struct ulist_node *unode; struct ulist_iterator uiter; struct ulist_node *tmp_unode; struct ulist_iterator tmp_uiter; struct btrfs_qgroup *qg; - int ret; + int ret = 0; + if (!roots) + return 0; ULIST_ITER_INIT(&uiter); while ((unode = ulist_next(roots, &uiter))) { - /* We don't count our current root here */ - if (unode->val == root_to_skip) - continue; qg = find_qgroup_rb(fs_info, unode->val); if (!qg) continue; - /* - * We could have a pending removal of this same ref so we may - * not have actually found our ref root when doing - * btrfs_find_all_roots, so we need to keep track of how many - * old roots we find in case we removed ours and added a - * different one at the same time. I don't think this could - * happen in practice but that sort of thinking leads to pain - * and suffering and to the dark side. 
- */ - (*old_roots)++; ulist_reinit(tmp); ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg), @@ -1499,29 +1515,10 @@ static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info, struct btrfs_qgroup_list *glist; qg = u64_to_ptr(tmp_unode->aux); - /* - * We use this sequence number to keep from having to - * run the whole list and 0 out the refcnt every time. - * We basically use sequnce as the known 0 count and - * then add 1 everytime we see a qgroup. This is how we - * get how many of the roots actually point up to the - * upper level qgroups in order to determine exclusive - * counts. - * - * For rescan we want to set old_refcnt to seq so our - * exclusive calculations end up correct. - */ - if (rescan) - qg->old_refcnt = seq; - else if (qg->old_refcnt < seq) - qg->old_refcnt = seq + 1; + if (update_old) + btrfs_qgroup_update_old_refcnt(qg, seq, 1); else - qg->old_refcnt++; - - if (qg->new_refcnt < seq) - qg->new_refcnt = seq + 1; - else - qg->new_refcnt++; + btrfs_qgroup_update_new_refcnt(qg, seq, 1); list_for_each_entry(glist, &qg->groups, next_group) { ret = ulist_add(qgroups, glist->group->qgroupid, ptr_to_u64(glist->group), @@ -1540,161 +1537,46 @@ static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info, } /* - * We need to walk forward in our operation tree and account for any roots that - * were deleted after we made this operation. - */ -static int qgroup_account_deleted_refs(struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper, - struct ulist *tmp, - struct ulist *qgroups, u64 seq, - int *old_roots) -{ - struct ulist_node *unode; - struct ulist_iterator uiter; - struct btrfs_qgroup *qg; - struct btrfs_qgroup_operation *tmp_oper; - struct rb_node *n; - int ret; - - ulist_reinit(tmp); - - /* - * We only walk forward in the tree since we're only interested in - * removals that happened _after_ our operation. - */ - spin_lock(&fs_info->qgroup_op_lock); - n = rb_next(&oper->n); - spin_unlock(&fs_info->qgroup_op_lock); - if (!n) - return 0; - tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n); - while (tmp_oper->bytenr == oper->bytenr) { - /* - * If it's not a removal we don't care, additions work out - * properly with our refcnt tracking. - */ - if (tmp_oper->type != BTRFS_QGROUP_OPER_SUB_SHARED && - tmp_oper->type != BTRFS_QGROUP_OPER_SUB_EXCL) - goto next; - qg = find_qgroup_rb(fs_info, tmp_oper->ref_root); - if (!qg) - goto next; - ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg), - GFP_ATOMIC); - if (ret) { - if (ret < 0) - return ret; - /* - * We only want to increase old_roots if this qgroup is - * not already in the list of qgroups. If it is already - * there then that means it must have been re-added or - * the delete will be discarded because we had an - * existing ref that we haven't looked up yet. In this - * case we don't want to increase old_roots. So if ret - * == 1 then we know that this is the first time we've - * seen this qgroup and we can bump the old_roots. 
- */ - (*old_roots)++; - ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg), - GFP_ATOMIC); - if (ret < 0) - return ret; - } -next: - spin_lock(&fs_info->qgroup_op_lock); - n = rb_next(&tmp_oper->n); - spin_unlock(&fs_info->qgroup_op_lock); - if (!n) - break; - tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n); - } - - /* Ok now process the qgroups we found */ - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(tmp, &uiter))) { - struct btrfs_qgroup_list *glist; - - qg = u64_to_ptr(unode->aux); - if (qg->old_refcnt < seq) - qg->old_refcnt = seq + 1; - else - qg->old_refcnt++; - if (qg->new_refcnt < seq) - qg->new_refcnt = seq + 1; - else - qg->new_refcnt++; - list_for_each_entry(glist, &qg->groups, next_group) { - ret = ulist_add(qgroups, glist->group->qgroupid, - ptr_to_u64(glist->group), GFP_ATOMIC); - if (ret < 0) - return ret; - ret = ulist_add(tmp, glist->group->qgroupid, - ptr_to_u64(glist->group), GFP_ATOMIC); - if (ret < 0) - return ret; - } - } - return 0; -} - -/* Add refcnt for the newly added reference. */ -static int qgroup_calc_new_refcnt(struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper, - struct btrfs_qgroup *qgroup, - struct ulist *tmp, struct ulist *qgroups, - u64 seq) -{ - struct ulist_node *unode; - struct ulist_iterator uiter; - struct btrfs_qgroup *qg; - int ret; - - ulist_reinit(tmp); - ret = ulist_add(qgroups, qgroup->qgroupid, ptr_to_u64(qgroup), - GFP_ATOMIC); - if (ret < 0) - return ret; - ret = ulist_add(tmp, qgroup->qgroupid, ptr_to_u64(qgroup), - GFP_ATOMIC); - if (ret < 0) - return ret; - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(tmp, &uiter))) { - struct btrfs_qgroup_list *glist; - - qg = u64_to_ptr(unode->aux); - if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) { - if (qg->new_refcnt < seq) - qg->new_refcnt = seq + 1; - else - qg->new_refcnt++; - } else { - if (qg->old_refcnt < seq) - qg->old_refcnt = seq + 1; - else - qg->old_refcnt++; - } - list_for_each_entry(glist, &qg->groups, next_group) { - ret = ulist_add(tmp, glist->group->qgroupid, - ptr_to_u64(glist->group), GFP_ATOMIC); - if (ret < 0) - return ret; - ret = ulist_add(qgroups, glist->group->qgroupid, - ptr_to_u64(glist->group), GFP_ATOMIC); - if (ret < 0) - return ret; - } - } - return 0; -} - -/* - * This adjusts the counters for all referenced qgroups if need be. + * Update qgroup rfer/excl counters. + * Rfer update is easy, codes can explain themselves. + * + * Excl update is tricky, the update is split into 2 part. + * Part 1: Possible exclusive <-> sharing detect: + * | A | !A | + * ------------------------------------- + * B | * | - | + * ------------------------------------- + * !B | + | ** | + * ------------------------------------- + * + * Conditions: + * A: cur_old_roots < nr_old_roots (not exclusive before) + * !A: cur_old_roots == nr_old_roots (possible exclusive before) + * B: cur_new_roots < nr_new_roots (not exclusive now) + * !B: cur_new_roots == nr_new_roots (possible exclsuive now) + * + * Results: + * +: Possible sharing -> exclusive -: Possible exclusive -> sharing + * *: Definitely not changed. **: Possible unchanged. + * + * For !A and !B condition, the exception is cur_old/new_roots == 0 case. + * + * To make the logic clear, we first use condition A and B to split + * combination into 4 results. + * + * Then, for result "+" and "-", check old/new_roots == 0 case, as in them + * only on variant maybe 0. + * + * Lastly, check result **, since there are 2 variants maybe 0, split them + * again(2x2). 
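+ *
+ * For instance, cur_old_count == nr_old_roots == 0 together with
+ * cur_new_count == nr_new_roots == 1 is the none -> exclusive corner
+ * of that 2x2 split, and also the only case in which the extent's
+ * reserved bytes are released (see the end of the function body).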
+ * But this time we don't need to consider other things, the codes and logic + * is easy to understand now. */ -static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info, - u64 root_to_skip, u64 num_bytes, - struct ulist *qgroups, u64 seq, - int old_roots, int new_roots, int rescan) +static int qgroup_update_counters(struct btrfs_fs_info *fs_info, + struct ulist *qgroups, + u64 nr_old_roots, + u64 nr_new_roots, + u64 num_bytes, u64 seq) { struct ulist_node *unode; struct ulist_iterator uiter; @@ -1706,423 +1588,196 @@ static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info, bool dirty = false; qg = u64_to_ptr(unode->aux); - /* - * Wasn't referenced before but is now, add to the reference - * counters. - */ - if (qg->old_refcnt <= seq && qg->new_refcnt > seq) { + cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq); + cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq); + + /* Rfer update part */ + if (cur_old_count == 0 && cur_new_count > 0) { qg->rfer += num_bytes; qg->rfer_cmpr += num_bytes; dirty = true; } - - /* - * Was referenced before but isn't now, subtract from the - * reference counters. - */ - if (qg->old_refcnt > seq && qg->new_refcnt <= seq) { + if (cur_old_count > 0 && cur_new_count == 0) { qg->rfer -= num_bytes; qg->rfer_cmpr -= num_bytes; dirty = true; } - if (qg->old_refcnt < seq) - cur_old_count = 0; - else - cur_old_count = qg->old_refcnt - seq; - if (qg->new_refcnt < seq) - cur_new_count = 0; - else - cur_new_count = qg->new_refcnt - seq; + /* Excl update part */ + /* Exclusive/none -> shared case */ + if (cur_old_count == nr_old_roots && + cur_new_count < nr_new_roots) { + /* Exclusive -> shared */ + if (cur_old_count != 0) { + qg->excl -= num_bytes; + qg->excl_cmpr -= num_bytes; + dirty = true; + } + } - /* - * If our refcount was the same as the roots previously but our - * new count isn't the same as the number of roots now then we - * went from having a exclusive reference on this range to not. - */ - if (old_roots && cur_old_count == old_roots && - (cur_new_count != new_roots || new_roots == 0)) { - WARN_ON(cur_new_count != new_roots && new_roots == 0); - qg->excl -= num_bytes; - qg->excl_cmpr -= num_bytes; - dirty = true; + /* Shared -> exclusive/none case */ + if (cur_old_count < nr_old_roots && + cur_new_count == nr_new_roots) { + /* Shared->exclusive */ + if (cur_new_count != 0) { + qg->excl += num_bytes; + qg->excl_cmpr += num_bytes; + dirty = true; + } } - /* - * If we didn't reference all the roots before but now we do we - * have an exclusive reference to this range. 
- */ - if ((!old_roots || (old_roots && cur_old_count != old_roots)) - && cur_new_count == new_roots) { - qg->excl += num_bytes; - qg->excl_cmpr += num_bytes; - dirty = true; + /* Exclusive/none -> exclusive/none case */ + if (cur_old_count == nr_old_roots && + cur_new_count == nr_new_roots) { + if (cur_old_count == 0) { + /* None -> exclusive/none */ + + if (cur_new_count != 0) { + /* None -> exclusive */ + qg->excl += num_bytes; + qg->excl_cmpr += num_bytes; + dirty = true; + } + /* None -> none, nothing changed */ + } else { + /* Exclusive -> exclusive/none */ + + if (cur_new_count == 0) { + /* Exclusive -> none */ + qg->excl -= num_bytes; + qg->excl_cmpr -= num_bytes; + dirty = true; + } + /* Exclusive -> exclusive, nothing changed */ + } } + /* For exclusive extent, free its reserved bytes too */ + if (nr_old_roots == 0 && nr_new_roots == 1 && + cur_new_count == nr_new_roots) + qg->reserved -= num_bytes; if (dirty) qgroup_dirty(fs_info, qg); } return 0; } -/* - * If we removed a data extent and there were other references for that bytenr - * then we need to lookup all referenced roots to make sure we still don't - * reference this bytenr. If we do then we can just discard this operation. - */ -static int check_existing_refs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper) -{ - struct ulist *roots = NULL; - struct ulist_node *unode; - struct ulist_iterator uiter; - int ret = 0; - - ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, - oper->elem.seq, &roots); - if (ret < 0) - return ret; - ret = 0; - - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(roots, &uiter))) { - if (unode->val == oper->ref_root) { - ret = 1; - break; - } - } - ulist_free(roots); - btrfs_put_tree_mod_seq(fs_info, &oper->elem); - - return ret; -} - -/* - * If we share a reference across multiple roots then we may need to adjust - * various qgroups referenced and exclusive counters. The basic premise is this - * - * 1) We have seq to represent a 0 count. Instead of looping through all of the - * qgroups and resetting their refcount to 0 we just constantly bump this - * sequence number to act as the base reference count. This means that if - * anybody is equal to or below this sequence they were never referenced. We - * jack this sequence up by the number of roots we found each time in order to - * make sure we don't have any overlap. - * - * 2) We first search all the roots that reference the area _except_ the root - * we're acting on currently. This makes up the old_refcnt of all the qgroups - * before. - * - * 3) We walk all of the qgroups referenced by the root we are currently acting - * on, and will either adjust old_refcnt in the case of a removal or the - * new_refcnt in the case of an addition. - * - * 4) Finally we walk all the qgroups that are referenced by this range - * including the root we are acting on currently. We will adjust the counters - * based on the number of roots we had and will have after this operation. - * - * Take this example as an illustration - * - * [qgroup 1/0] - * / | \ - * [qg 0/0] [qg 0/1] [qg 0/2] - * \ | / - * [ extent ] - * - * Say we are adding a reference that is covered by qg 0/0. The first step - * would give a refcnt of 1 to qg 0/1 and 0/2 and a refcnt of 2 to qg 1/0 with - * old_roots being 2. Because it is adding new_roots will be 1. We then go - * through qg 0/0 which will get the new_refcnt set to 1 and add 1 to qg 1/0's - * new_refcnt, bringing it to 3. 
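The new btrfs_qgroup_insert_dirty_extent() above keys records by bytenr and returns the existing entry rather than inserting a duplicate. A sketch of the caller contract this implies (the allocation shape is an assumption; error handling elided):

	struct btrfs_qgroup_extent_record *record, *existing;

	record = kmalloc(sizeof(*record), GFP_NOFS);
	record->bytenr = bytenr;
	record->num_bytes = num_bytes;
	record->old_roots = NULL;	/* filled in at commit time */

	existing = btrfs_qgroup_insert_dirty_extent(delayed_refs, record);
	if (existing)			/* bytenr already tracked */
		kfree(record);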
We then walk through all of the qgroups, we - * notice that the old refcnt for qg 0/0 < the new refcnt, so we added a - * reference and thus must add the size to the referenced bytes. Everything - * else is the same so nothing else changes. - */ -static int qgroup_shared_accounting(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper) +int +btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + u64 bytenr, u64 num_bytes, + struct ulist *old_roots, struct ulist *new_roots) { - struct ulist *roots = NULL; - struct ulist *qgroups, *tmp; - struct btrfs_qgroup *qgroup; - struct seq_list elem = {}; + struct ulist *qgroups = NULL; + struct ulist *tmp = NULL; u64 seq; - int old_roots = 0; - int new_roots = 0; + u64 nr_new_roots = 0; + u64 nr_old_roots = 0; int ret = 0; - if (oper->elem.seq) { - ret = check_existing_refs(trans, fs_info, oper); - if (ret < 0) - return ret; - if (ret) - return 0; - } + if (new_roots) + nr_new_roots = new_roots->nnodes; + if (old_roots) + nr_old_roots = old_roots->nnodes; - qgroups = ulist_alloc(GFP_NOFS); - if (!qgroups) - return -ENOMEM; + if (!fs_info->quota_enabled) + goto out_free; + BUG_ON(!fs_info->quota_root); + qgroups = ulist_alloc(GFP_NOFS); + if (!qgroups) { + ret = -ENOMEM; + goto out_free; + } tmp = ulist_alloc(GFP_NOFS); if (!tmp) { - ulist_free(qgroups); - return -ENOMEM; + ret = -ENOMEM; + goto out_free; } - btrfs_get_tree_mod_seq(fs_info, &elem); - ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, elem.seq, - &roots); - btrfs_put_tree_mod_seq(fs_info, &elem); - if (ret < 0) { - ulist_free(qgroups); - ulist_free(tmp); - return ret; + mutex_lock(&fs_info->qgroup_rescan_lock); + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { + if (fs_info->qgroup_rescan_progress.objectid <= bytenr) { + mutex_unlock(&fs_info->qgroup_rescan_lock); + ret = 0; + goto out_free; + } } + mutex_unlock(&fs_info->qgroup_rescan_lock); + spin_lock(&fs_info->qgroup_lock); - qgroup = find_qgroup_rb(fs_info, oper->ref_root); - if (!qgroup) - goto out; seq = fs_info->qgroup_seq; - /* - * So roots is the list of all the roots currently pointing at the - * bytenr, including the ref we are adding if we are adding, or not if - * we are removing a ref. So we pass in the ref_root to skip that root - * in our calculations. We set old_refnct and new_refcnt cause who the - * hell knows what everything looked like before, and it doesn't matter - * except... - */ - ret = qgroup_calc_old_refcnt(fs_info, oper->ref_root, tmp, roots, qgroups, - seq, &old_roots, 0); - if (ret < 0) - goto out; - - /* - * Now adjust the refcounts of the qgroups that care about this - * reference, either the old_count in the case of removal or new_count - * in the case of an addition. - */ - ret = qgroup_calc_new_refcnt(fs_info, oper, qgroup, tmp, qgroups, - seq); + /* Update old refcnts using old_roots */ + ret = qgroup_update_refcnt(fs_info, old_roots, tmp, qgroups, seq, + UPDATE_OLD); if (ret < 0) goto out; - /* - * ...in the case of removals. If we had a removal before we got around - * to processing this operation then we need to find that guy and count - * his references as if they really existed so we don't end up screwing - * up the exclusive counts. Then whenever we go to process the delete - * everything will be grand and we can account for whatever exclusive - * changes need to be made there. 
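Condensed, the accounting flow that replaces all of this in btrfs_qgroup_account_extent() is a two-pass refcnt walk followed by a single counter update; a sketch with locking and error handling elided:

	seq = fs_info->qgroup_seq;
	/* stamp every qgroup reachable from the old roots... */
	qgroup_update_refcnt(fs_info, old_roots, tmp, qgroups, seq, UPDATE_OLD);
	/* ...and from the new roots */
	qgroup_update_refcnt(fs_info, new_roots, tmp, qgroups, seq, UPDATE_NEW);
	qgroup_update_counters(fs_info, qgroups, nr_old_roots, nr_new_roots,
			       num_bytes, seq);
	/* bump past the largest per-extent refcnt to avoid seq overlap */
	fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1;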
We also have to pass in old_roots so - * we have an accurate count of the roots as it pertains to this - * operations view of the world. - */ - ret = qgroup_account_deleted_refs(fs_info, oper, tmp, qgroups, seq, - &old_roots); + /* Update new refcnts using new_roots */ + ret = qgroup_update_refcnt(fs_info, new_roots, tmp, qgroups, seq, + UPDATE_NEW); if (ret < 0) goto out; - /* - * We are adding our root, need to adjust up the number of roots, - * otherwise old_roots is the number of roots we want. - */ - if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) { - new_roots = old_roots + 1; - } else { - new_roots = old_roots; - old_roots++; - } - fs_info->qgroup_seq += old_roots + 1; - + qgroup_update_counters(fs_info, qgroups, nr_old_roots, nr_new_roots, + num_bytes, seq); /* - * And now the magic happens, bless Arne for having a pretty elegant - * solution for this. + * Bump qgroup_seq to avoid seq overlap */ - qgroup_adjust_counters(fs_info, oper->ref_root, oper->num_bytes, - qgroups, seq, old_roots, new_roots, 0); + fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1; out: spin_unlock(&fs_info->qgroup_lock); - ulist_free(qgroups); - ulist_free(roots); +out_free: ulist_free(tmp); + ulist_free(qgroups); + ulist_free(old_roots); + ulist_free(new_roots); return ret; } -/* - * Process a reference to a shared subtree. This type of operation is - * queued during snapshot removal when we encounter extents which are - * shared between more than one root. - */ -static int qgroup_subtree_accounting(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper) -{ - struct ulist *roots = NULL; - struct ulist_node *unode; - struct ulist_iterator uiter; - struct btrfs_qgroup_list *glist; - struct ulist *parents; - int ret = 0; - int err; - struct btrfs_qgroup *qg; - u64 root_obj = 0; - struct seq_list elem = {}; - - parents = ulist_alloc(GFP_NOFS); - if (!parents) - return -ENOMEM; - - btrfs_get_tree_mod_seq(fs_info, &elem); - ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, - elem.seq, &roots); - btrfs_put_tree_mod_seq(fs_info, &elem); - if (ret < 0) - goto out; - - if (roots->nnodes != 1) - goto out; - - ULIST_ITER_INIT(&uiter); - unode = ulist_next(roots, &uiter); /* Only want 1 so no need to loop */ - /* - * If we find our ref root then that means all refs - * this extent has to the root have not yet been - * deleted. In that case, we do nothing and let the - * last ref for this bytenr drive our update. - * - * This can happen for example if an extent is - * referenced multiple times in a snapshot (clone, - * etc). If we are in the middle of snapshot removal, - * queued updates for such an extent will find the - * root if we have not yet finished removing the - * snapshot. - */ - if (unode->val == oper->ref_root) - goto out; - - root_obj = unode->val; - BUG_ON(!root_obj); - - spin_lock(&fs_info->qgroup_lock); - qg = find_qgroup_rb(fs_info, root_obj); - if (!qg) - goto out_unlock; - - qg->excl += oper->num_bytes; - qg->excl_cmpr += oper->num_bytes; - qgroup_dirty(fs_info, qg); - - /* - * Adjust counts for parent groups. First we find all - * parents, then in the 2nd loop we do the adjustment - * while adding parents of the parents to our ulist. 
- */ - list_for_each_entry(glist, &qg->groups, next_group) { - err = ulist_add(parents, glist->group->qgroupid, - ptr_to_u64(glist->group), GFP_ATOMIC); - if (err < 0) { - ret = err; - goto out_unlock; - } - } - - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(parents, &uiter))) { - qg = u64_to_ptr(unode->aux); - qg->excl += oper->num_bytes; - qg->excl_cmpr += oper->num_bytes; - qgroup_dirty(fs_info, qg); - - /* Add any parents of the parents */ - list_for_each_entry(glist, &qg->groups, next_group) { - err = ulist_add(parents, glist->group->qgroupid, - ptr_to_u64(glist->group), GFP_ATOMIC); - if (err < 0) { - ret = err; - goto out_unlock; - } - } - } - -out_unlock: - spin_unlock(&fs_info->qgroup_lock); - -out: - ulist_free(roots); - ulist_free(parents); - return ret; -} - -/* - * btrfs_qgroup_account_ref is called for every ref that is added to or deleted - * from the fs. First, all roots referencing the extent are searched, and - * then the space is accounted accordingly to the different roots. The - * accounting algorithm works in 3 steps documented inline. - */ -static int btrfs_qgroup_account(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper) +int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) { + struct btrfs_qgroup_extent_record *record; + struct btrfs_delayed_ref_root *delayed_refs; + struct ulist *new_roots = NULL; + struct rb_node *node; + u64 qgroup_to_skip; int ret = 0; - if (!fs_info->quota_enabled) - return 0; + delayed_refs = &trans->transaction->delayed_refs; + qgroup_to_skip = delayed_refs->qgroup_to_skip; + while ((node = rb_first(&delayed_refs->dirty_extent_root))) { + record = rb_entry(node, struct btrfs_qgroup_extent_record, + node); - BUG_ON(!fs_info->quota_root); - - mutex_lock(&fs_info->qgroup_rescan_lock); - if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { - if (fs_info->qgroup_rescan_progress.objectid <= oper->bytenr) { - mutex_unlock(&fs_info->qgroup_rescan_lock); - return 0; + if (!ret) { + /* + * Use (u64)-1 as time_seq to do special search, which + * doesn't lock tree or delayed_refs and search current + * root. It's safe inside commit_transaction(). + */ + ret = btrfs_find_all_roots(trans, fs_info, + record->bytenr, (u64)-1, &new_roots); + if (ret < 0) + goto cleanup; + if (qgroup_to_skip) + ulist_del(new_roots, qgroup_to_skip, 0); + ret = btrfs_qgroup_account_extent(trans, fs_info, + record->bytenr, record->num_bytes, + record->old_roots, new_roots); + record->old_roots = NULL; + new_roots = NULL; } - } - mutex_unlock(&fs_info->qgroup_rescan_lock); +cleanup: + ulist_free(record->old_roots); + ulist_free(new_roots); + new_roots = NULL; + rb_erase(node, &delayed_refs->dirty_extent_root); + kfree(record); - ASSERT(is_fstree(oper->ref_root)); - - trace_btrfs_qgroup_account(oper); - - switch (oper->type) { - case BTRFS_QGROUP_OPER_ADD_EXCL: - case BTRFS_QGROUP_OPER_SUB_EXCL: - ret = qgroup_excl_accounting(fs_info, oper); - break; - case BTRFS_QGROUP_OPER_ADD_SHARED: - case BTRFS_QGROUP_OPER_SUB_SHARED: - ret = qgroup_shared_accounting(trans, fs_info, oper); - break; - case BTRFS_QGROUP_OPER_SUB_SUBTREE: - ret = qgroup_subtree_accounting(trans, fs_info, oper); - break; - default: - ASSERT(0); - } - return ret; -} - -/* - * Needs to be called everytime we run delayed refs, even if there is an error - * in order to cleanup outstanding operations. 
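btrfs_qgroup_account_extents() above drains the dirty-extent tree at transaction commit. The core of one iteration, cleanup elided:

	/* time_seq == (u64)-1: search the current roots without taking
	 * tree or delayed-ref locks; only safe inside commit */
	ret = btrfs_find_all_roots(trans, fs_info, record->bytenr,
				   (u64)-1, &new_roots);
	if (!ret)
		ret = btrfs_qgroup_account_extent(trans, fs_info,
						  record->bytenr,
						  record->num_bytes,
						  record->old_roots,
						  new_roots);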
- */ -int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) -{ - struct btrfs_qgroup_operation *oper; - int ret = 0; - - while (!list_empty(&trans->qgroup_ref_list)) { - oper = list_first_entry(&trans->qgroup_ref_list, - struct btrfs_qgroup_operation, list); - list_del_init(&oper->list); - if (!ret || !trans->aborted) - ret = btrfs_qgroup_account(trans, fs_info, oper); - spin_lock(&fs_info->qgroup_op_lock); - rb_erase(&oper->n, &fs_info->qgroup_op_tree); - spin_unlock(&fs_info->qgroup_op_lock); - btrfs_put_tree_mod_seq(fs_info, &oper->elem); - kfree(oper); } return ret; } @@ -2156,6 +1811,10 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans, if (ret) fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + ret = update_qgroup_limit_item(trans, quota_root, qgroup); + if (ret) + fs_info->qgroup_flags |= + BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; spin_lock(&fs_info->qgroup_lock); } if (fs_info->quota_enabled) @@ -2219,6 +1878,11 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, ret = -EINVAL; goto out; } + + if ((srcgroup->qgroupid >> 48) <= (objectid >> 48)) { + ret = -EINVAL; + goto out; + } ++i_qgroups; } } @@ -2230,17 +1894,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, if (ret) goto out; - if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) { - ret = update_qgroup_limit_item(trans, quota_root, objectid, - inherit->lim.flags, - inherit->lim.max_rfer, - inherit->lim.max_excl, - inherit->lim.rsv_rfer, - inherit->lim.rsv_excl); - if (ret) - goto out; - } - if (srcid) { struct btrfs_root *srcroot; struct btrfs_key srckey; @@ -2286,6 +1939,22 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, goto unlock; } + if (inherit && inherit->flags & BTRFS_QGROUP_INHERIT_SET_LIMITS) { + dstgroup->lim_flags = inherit->lim.flags; + dstgroup->max_rfer = inherit->lim.max_rfer; + dstgroup->max_excl = inherit->lim.max_excl; + dstgroup->rsv_rfer = inherit->lim.rsv_rfer; + dstgroup->rsv_excl = inherit->lim.rsv_excl; + + ret = update_qgroup_limit_item(trans, quota_root, dstgroup); + if (ret) { + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + btrfs_info(fs_info, "unable to update quota limit for %llu", + dstgroup->qgroupid); + goto unlock; + } + } + if (srcid) { srcgroup = find_qgroup_rb(fs_info, srcid); if (!srcgroup) @@ -2302,6 +1971,14 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, dstgroup->excl_cmpr = level_size; srcgroup->excl = level_size; srcgroup->excl_cmpr = level_size; + + /* inherit the limit info */ + dstgroup->lim_flags = srcgroup->lim_flags; + dstgroup->max_rfer = srcgroup->max_rfer; + dstgroup->max_excl = srcgroup->max_excl; + dstgroup->rsv_rfer = srcgroup->rsv_rfer; + dstgroup->rsv_excl = srcgroup->rsv_excl; + qgroup_dirty(fs_info, dstgroup); qgroup_dirty(fs_info, srcgroup); } @@ -2358,12 +2035,6 @@ out: return ret; } -/* - * reserve some space for a qgroup and all its parents. The reservation takes - * place with start_transaction or dealloc_reserve, similar to ENOSPC - * accounting. If not enough space is available, EDQUOT is returned. - * We assume that the requested space is new for all qgroups. - */ int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) { struct btrfs_root *quota_root; @@ -2513,19 +2184,17 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) /* * returns < 0 on error, 0 when more leafs are to be scanned. - * returns 1 when done, 2 when done and FLAG_INCONSISTENT was cleared. + * returns 1 when done. 
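+ * (The rescan worker now clears FLAG_INCONSISTENT itself on any
+ * positive return, which is why the old "2" return value is gone.)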
*/ static int qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, - struct btrfs_trans_handle *trans, struct ulist *qgroups, - struct ulist *tmp, struct extent_buffer *scratch_leaf) + struct btrfs_trans_handle *trans, + struct extent_buffer *scratch_leaf) { struct btrfs_key found; struct ulist *roots = NULL; - struct seq_list tree_mod_seq_elem = {}; + struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem); u64 num_bytes; - u64 seq; - int new_roots; int slot; int ret; @@ -2575,33 +2244,15 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, else num_bytes = found.offset; - ulist_reinit(qgroups); ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0, &roots); if (ret < 0) goto out; - spin_lock(&fs_info->qgroup_lock); - seq = fs_info->qgroup_seq; - fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */ - - new_roots = 0; - ret = qgroup_calc_old_refcnt(fs_info, 0, tmp, roots, qgroups, - seq, &new_roots, 1); - if (ret < 0) { - spin_unlock(&fs_info->qgroup_lock); - ulist_free(roots); - goto out; - } - - ret = qgroup_adjust_counters(fs_info, 0, num_bytes, qgroups, - seq, 0, new_roots, 1); - if (ret < 0) { - spin_unlock(&fs_info->qgroup_lock); - ulist_free(roots); + /* For rescan, just pass old_roots as NULL */ + ret = btrfs_qgroup_account_extent(trans, fs_info, + found.objectid, num_bytes, NULL, roots); + if (ret < 0) goto out; - } - spin_unlock(&fs_info->qgroup_lock); - ulist_free(roots); } out: btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); @@ -2615,19 +2266,13 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) qgroup_rescan_work); struct btrfs_path *path; struct btrfs_trans_handle *trans = NULL; - struct ulist *tmp = NULL, *qgroups = NULL; struct extent_buffer *scratch_leaf = NULL; int err = -ENOMEM; + int ret = 0; path = btrfs_alloc_path(); if (!path) goto out; - qgroups = ulist_alloc(GFP_NOFS); - if (!qgroups) - goto out; - tmp = ulist_alloc(GFP_NOFS); - if (!tmp) - goto out; scratch_leaf = kmalloc(sizeof(*scratch_leaf), GFP_NOFS); if (!scratch_leaf) goto out; @@ -2643,7 +2288,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) err = -EINTR; } else { err = qgroup_rescan_leaf(fs_info, path, trans, - qgroups, tmp, scratch_leaf); + scratch_leaf); } if (err > 0) btrfs_commit_transaction(trans, fs_info->fs_root); @@ -2653,14 +2298,12 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) out: kfree(scratch_leaf); - ulist_free(qgroups); - ulist_free(tmp); btrfs_free_path(path); mutex_lock(&fs_info->qgroup_rescan_lock); fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; - if (err == 2 && + if (err > 0 && fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) { fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; } else if (err < 0) { @@ -2668,13 +2311,33 @@ out: } mutex_unlock(&fs_info->qgroup_rescan_lock); + /* + * only update status, since the previous part has alreay updated the + * qgroup info. + */ + trans = btrfs_start_transaction(fs_info->quota_root, 1); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + btrfs_err(fs_info, + "fail to start transaction for status update: %d\n", + err); + goto done; + } + ret = update_qgroup_status_item(trans, fs_info, fs_info->quota_root); + if (ret < 0) { + err = ret; + btrfs_err(fs_info, "fail to update qgroup status: %d\n", err); + } + btrfs_end_transaction(trans, fs_info->quota_root); + if (err >= 0) { btrfs_info(fs_info, "qgroup scan completed%s", - err == 2 ? 
" (inconsistency flag cleared)" : ""); + err > 0 ? " (inconsistency flag cleared)" : ""); } else { btrfs_err(fs_info, "qgroup scan failed with %d", err); } +done: complete_all(&fs_info->qgroup_rescan_completion); } @@ -2709,7 +2372,6 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid, mutex_unlock(&fs_info->qgroup_rescan_lock); goto err; } - fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_RESCAN; } diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 18cc68ca3090..6387dcfa354c 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -19,43 +19,18 @@ #ifndef __BTRFS_QGROUP__ #define __BTRFS_QGROUP__ +#include "ulist.h" +#include "delayed-ref.h" + /* - * A description of the operations, all of these operations only happen when we - * are adding the 1st reference for that subvolume in the case of adding space - * or on the last reference delete in the case of subtraction. The only - * exception is the last one, which is added for confusion. - * - * BTRFS_QGROUP_OPER_ADD_EXCL: adding bytes where this subvolume is the only - * one pointing at the bytes we are adding. This is called on the first - * allocation. - * - * BTRFS_QGROUP_OPER_ADD_SHARED: adding bytes where this bytenr is going to be - * shared between subvols. This is called on the creation of a ref that already - * has refs from a different subvolume, so basically reflink. - * - * BTRFS_QGROUP_OPER_SUB_EXCL: removing bytes where this subvolume is the only - * one referencing the range. - * - * BTRFS_QGROUP_OPER_SUB_SHARED: removing bytes where this subvolume shares with - * refs with other subvolumes. + * Record a dirty extent, and info qgroup to update quota on it + * TODO: Use kmem cache to alloc it. */ -enum btrfs_qgroup_operation_type { - BTRFS_QGROUP_OPER_ADD_EXCL, - BTRFS_QGROUP_OPER_ADD_SHARED, - BTRFS_QGROUP_OPER_SUB_EXCL, - BTRFS_QGROUP_OPER_SUB_SHARED, - BTRFS_QGROUP_OPER_SUB_SUBTREE, -}; - -struct btrfs_qgroup_operation { - u64 ref_root; +struct btrfs_qgroup_extent_record { + struct rb_node node; u64 bytenr; u64 num_bytes; - u64 seq; - enum btrfs_qgroup_operation_type type; - struct seq_list elem; - struct rb_node n; - struct list_head list; + struct ulist *old_roots; }; int btrfs_quota_enable(struct btrfs_trans_handle *trans, @@ -70,8 +45,7 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 src, u64 dst); int btrfs_create_qgroup(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 qgroupid, - char *name); + struct btrfs_fs_info *fs_info, u64 qgroupid); int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 qgroupid); int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, @@ -80,16 +54,18 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info); void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); struct btrfs_delayed_extent_op; -int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 ref_root, +int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +struct btrfs_qgroup_extent_record +*btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_qgroup_extent_record *record); +int +btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, - enum 
btrfs_qgroup_operation_type type, - int mod_seq); -int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); -void btrfs_remove_qgroup_operation(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper); + struct ulist *old_roots, struct ulist *new_roots); +int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); int btrfs_run_qgroups(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 5264858ed768..fa72068bd256 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -237,12 +237,8 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) } x = cmpxchg(&info->stripe_hash_table, NULL, table); - if (x) { - if (is_vmalloc_addr(x)) - vfree(x); - else - kfree(x); - } + if (x) + kvfree(x); return 0; } @@ -453,10 +449,7 @@ void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info) if (!info->stripe_hash_table) return; btrfs_clear_rbio_cache(info); - if (is_vmalloc_addr(info->stripe_hash_table)) - vfree(info->stripe_hash_table); - else - kfree(info->stripe_hash_table); + kvfree(info->stripe_hash_table); info->stripe_hash_table = NULL; } @@ -1807,8 +1800,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) int err; int i; - pointers = kzalloc(rbio->real_stripes * sizeof(void *), - GFP_NOFS); + pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); if (!pointers) { err = -ENOMEM; goto cleanup_io; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index d83085381bcc..88cbb5995667 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1847,8 +1847,10 @@ again: } eb = read_tree_block(dest, old_bytenr, old_ptr_gen); - if (!eb || !extent_buffer_uptodate(eb)) { - ret = (!eb) ? 
-ENOMEM : -EIO; + if (IS_ERR(eb)) { + ret = PTR_ERR(eb); + } else if (!extent_buffer_uptodate(eb)) { + ret = -EIO; free_extent_buffer(eb); break; } @@ -2002,7 +2004,9 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, bytenr = btrfs_node_blockptr(eb, path->slots[i]); eb = read_tree_block(root, bytenr, ptr_gen); - if (!eb || !extent_buffer_uptodate(eb)) { + if (IS_ERR(eb)) { + return PTR_ERR(eb); + } else if (!extent_buffer_uptodate(eb)) { free_extent_buffer(eb); return -EIO; } @@ -2710,7 +2714,10 @@ static int do_relocation(struct btrfs_trans_handle *trans, blocksize = root->nodesize; generation = btrfs_node_ptr_generation(upper->eb, slot); eb = read_tree_block(root, bytenr, generation); - if (!eb || !extent_buffer_uptodate(eb)) { + if (IS_ERR(eb)) { + err = PTR_ERR(eb); + goto next; + } else if (!extent_buffer_uptodate(eb)) { free_extent_buffer(eb); err = -EIO; goto next; @@ -2873,7 +2880,9 @@ static int get_tree_block_key(struct reloc_control *rc, BUG_ON(block->key_ready); eb = read_tree_block(rc->extent_root, block->bytenr, block->key.offset); - if (!eb || !extent_buffer_uptodate(eb)) { + if (IS_ERR(eb)) { + return PTR_ERR(eb); + } else if (!extent_buffer_uptodate(eb)) { free_extent_buffer(eb); return -EIO; } @@ -3027,7 +3036,7 @@ int prealloc_file_extent_cluster(struct inode *inode, mutex_lock(&inode->i_mutex); ret = btrfs_check_data_free_space(inode, cluster->end + - 1 - cluster->start); + 1 - cluster->start, 0); if (ret) goto out; @@ -3430,7 +3439,9 @@ static int block_use_full_backref(struct reloc_control *rc, } static int delete_block_group_cache(struct btrfs_fs_info *fs_info, - struct inode *inode, u64 ino) + struct btrfs_block_group_cache *block_group, + struct inode *inode, + u64 ino) { struct btrfs_key key; struct btrfs_root *root = fs_info->tree_root; @@ -3463,7 +3474,7 @@ truncate: goto out; } - ret = btrfs_truncate_free_space_cache(root, trans, inode); + ret = btrfs_truncate_free_space_cache(root, trans, block_group, inode); btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root); @@ -3509,6 +3520,7 @@ static int find_data_references(struct reloc_control *rc, */ if (ref_root == BTRFS_ROOT_TREE_OBJECTID) { ret = delete_block_group_cache(rc->extent_root->fs_info, + rc->block_group, NULL, ref_objectid); if (ret != -ENOENT) return ret; @@ -4037,7 +4049,7 @@ restart: if (trans && progress && err == -ENOSPC) { ret = btrfs_force_chunk_alloc(trans, rc->extent_root, rc->block_group->flags); - if (ret == 0) { + if (ret == 1) { err = 0; progress = 0; goto restart; @@ -4223,7 +4235,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) btrfs_free_path(path); if (!IS_ERR(inode)) - ret = delete_block_group_cache(fs_info, inode, 0); + ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0); else ret = PTR_ERR(inode); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index ec57687c9a4d..94db0fa5225a 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -964,9 +964,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) * the statistics. 
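The relocation.c hunks above replace NULL checks on read_tree_block() with IS_ERR()/PTR_ERR(), because the function now returns an error pointer instead of NULL on failure. For readers unfamiliar with the idiom, here is a minimal userspace sketch; ERR_PTR/IS_ERR are modeled after the kernel's, and read_block() is a hypothetical stand-in for read_tree_block():

#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO 4095

/* Errors live in the top MAX_ERRNO values of the address space, so a
 * single pointer return can carry either a valid object or a -errno. */
static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static char buffer[16];

/* Hypothetical stand-in: a buffer on success, an encoded -errno on
 * failure, never NULL. */
static void *read_block(int fail)
{
	return fail ? ERR_PTR(-EIO) : (void *)buffer;
}

int main(void)
{
	void *eb = read_block(1);

	if (IS_ERR(eb))
		printf("read failed: %ld\n", PTR_ERR(eb));	/* -5 (EIO) */
	else
		printf("read ok\n");
	return 0;
}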
*/ - sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS * - sizeof(*sblocks_for_recheck), - GFP_NOFS); + sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS, + sizeof(*sblocks_for_recheck), GFP_NOFS); if (!sblocks_for_recheck) { spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -2319,7 +2318,7 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity, unsigned long *bitmap, u64 start, u64 len) { - int offset; + u32 offset; int nsectors; int sectorsize = sparity->sctx->dev_root->sectorsize; @@ -2329,7 +2328,7 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity, } start -= sparity->logic_start; - offset = (int)do_div(start, sparity->stripe_len); + start = div_u64_rem(start, sparity->stripe_len, &offset); offset /= sectorsize; nsectors = (int)len / sectorsize; @@ -2612,8 +2611,8 @@ static int get_raid56_logic_offset(u64 physical, int num, int j = 0; u64 stripe_nr; u64 last_offset; - int stripe_index; - int rot; + u32 stripe_index; + u32 rot; last_offset = (physical - map->stripes[num].physical) * nr_data_stripes(map); @@ -2624,12 +2623,11 @@ static int get_raid56_logic_offset(u64 physical, int num, for (i = 0; i < nr_data_stripes(map); i++) { *offset = last_offset + i * map->stripe_len; - stripe_nr = *offset; - do_div(stripe_nr, map->stripe_len); - do_div(stripe_nr, nr_data_stripes(map)); + stripe_nr = div_u64(*offset, map->stripe_len); + stripe_nr = div_u64(stripe_nr, nr_data_stripes(map)); /* Work out the disk rotation on this stripe-set */ - rot = do_div(stripe_nr, map->num_stripes); + stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot); /* calculate which stripe this data locates */ rot += i; stripe_index = rot % map->num_stripes; @@ -2664,18 +2662,30 @@ static void scrub_free_parity(struct scrub_parity *sparity) kfree(sparity); } +static void scrub_parity_bio_endio_worker(struct btrfs_work *work) +{ + struct scrub_parity *sparity = container_of(work, struct scrub_parity, + work); + struct scrub_ctx *sctx = sparity->sctx; + + scrub_free_parity(sparity); + scrub_pending_bio_dec(sctx); +} + static void scrub_parity_bio_endio(struct bio *bio, int error) { struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private; - struct scrub_ctx *sctx = sparity->sctx; if (error) bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, sparity->nsectors); - scrub_free_parity(sparity); - scrub_pending_bio_dec(sctx); bio_put(bio); + + btrfs_init_work(&sparity->work, btrfs_scrubparity_helper, + scrub_parity_bio_endio_worker, NULL, NULL); + btrfs_queue_work(sparity->sctx->dev_root->fs_info->scrub_parity_workers, + &sparity->work); } static void scrub_parity_check_and_repair(struct scrub_parity *sparity) @@ -2995,10 +3005,9 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, int extent_mirror_num; int stop_loop = 0; - nstripes = length; physical = map->stripes[num].physical; offset = 0; - do_div(nstripes, map->stripe_len); + nstripes = div_u64(length, map->stripe_len); if (map->type & BTRFS_BLOCK_GROUP_RAID0) { offset = map->stripe_len * num; increment = map->stripe_len * map->num_stripes; @@ -3562,8 +3571,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, int is_dev_replace) { - int ret = 0; - int flags = WQ_FREEZABLE | WQ_UNBOUND; + unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND; int max_active = fs_info->thread_pool_size; if (fs_info->scrub_workers_refcnt == 0) { @@ -3575,27 +3583,36 @@ static noinline_for_stack int 
scrub_workers_get(struct btrfs_fs_info *fs_info, fs_info->scrub_workers = btrfs_alloc_workqueue("btrfs-scrub", flags, max_active, 4); - if (!fs_info->scrub_workers) { - ret = -ENOMEM; - goto out; - } + if (!fs_info->scrub_workers) + goto fail_scrub_workers; + fs_info->scrub_wr_completion_workers = btrfs_alloc_workqueue("btrfs-scrubwrc", flags, max_active, 2); - if (!fs_info->scrub_wr_completion_workers) { - ret = -ENOMEM; - goto out; - } + if (!fs_info->scrub_wr_completion_workers) + goto fail_scrub_wr_completion_workers; + fs_info->scrub_nocow_workers = btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0); - if (!fs_info->scrub_nocow_workers) { - ret = -ENOMEM; - goto out; - } + if (!fs_info->scrub_nocow_workers) + goto fail_scrub_nocow_workers; + fs_info->scrub_parity_workers = + btrfs_alloc_workqueue("btrfs-scrubparity", flags, + max_active, 2); + if (!fs_info->scrub_parity_workers) + goto fail_scrub_parity_workers; } ++fs_info->scrub_workers_refcnt; -out: - return ret; + return 0; + +fail_scrub_parity_workers: + btrfs_destroy_workqueue(fs_info->scrub_nocow_workers); +fail_scrub_nocow_workers: + btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers); +fail_scrub_wr_completion_workers: + btrfs_destroy_workqueue(fs_info->scrub_workers); +fail_scrub_workers: + return -ENOMEM; } static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info) @@ -3604,6 +3621,7 @@ static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info) btrfs_destroy_workqueue(fs_info->scrub_workers); btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers); btrfs_destroy_workqueue(fs_info->scrub_nocow_workers); + btrfs_destroy_workqueue(fs_info->scrub_parity_workers); } WARN_ON(fs_info->scrub_workers_refcnt < 0); } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index d6033f540cc7..aa72bfd28f7d 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -243,6 +243,7 @@ struct waiting_dir_move { * after this directory is moved, we can try to rmdir the ino rmdir_ino. */ u64 rmdir_ino; + bool orphanized; }; struct orphan_dir_info { @@ -1158,6 +1159,9 @@ struct backref_ctx { /* may be truncated in case it's the last extent in a file */ u64 extent_len; + /* data offset in the file extent item */ + u64 data_offset; + /* Just to check for bugs in backref resolving */ int found_itself; }; @@ -1221,7 +1225,7 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) if (ret < 0) return ret; - if (offset + bctx->extent_len > i_size) + if (offset + bctx->data_offset + bctx->extent_len > i_size) return 0; /* @@ -1363,6 +1367,19 @@ static int find_extent_clone(struct send_ctx *sctx, backref_ctx->cur_offset = data_offset; backref_ctx->found_itself = 0; backref_ctx->extent_len = num_bytes; + /* + * For non-compressed extents iterate_extent_inodes() gives us extent + * offsets that already take into account the data offset, but not for + * compressed extents, since the offset is logical and not relative to + * the physical extent locations. We must take this into account to + * avoid sending clone offsets that go beyond the source file's size, + * which would result in the clone ioctl failing with -EINVAL on the + * receiving end. + */ + if (compressed == BTRFS_COMPRESS_NONE) + backref_ctx->data_offset = 0; + else + backref_ctx->data_offset = btrfs_file_extent_offset(eb, fi); /* * The last extent of a file may be too large due to page alignment. @@ -1900,8 +1917,13 @@ static int did_overwrite_ref(struct send_ctx *sctx, goto out; } - /* we know that it is or will be overwritten. 
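scrub_workers_get() above trades the single out: label for a cascade of fail_* labels, so each failure point releases exactly what was set up before it and nothing else. A minimal sketch of the pattern, with illustrative names rather than the kernel workqueue API:

#include <stdio.h>
#include <stdlib.h>

struct workers { void *scrub, *wr_completion, *parity; };

/* Allocate in order; on failure, fall through the labels in reverse
 * order so only the already-acquired resources are freed. */
static int workers_get(struct workers *w)
{
	w->scrub = malloc(64);
	if (!w->scrub)
		goto fail_scrub;
	w->wr_completion = malloc(64);
	if (!w->wr_completion)
		goto fail_wr_completion;
	w->parity = malloc(64);
	if (!w->parity)
		goto fail_parity;
	return 0;

fail_parity:
	free(w->wr_completion);
fail_wr_completion:
	free(w->scrub);
fail_scrub:
	return -1;	/* stands in for -ENOMEM */
}

int main(void)
{
	struct workers w;

	if (workers_get(&w) == 0) {
		printf("all worker pools allocated\n");
		free(w.parity);
		free(w.wr_completion);
		free(w.scrub);
	}
	return 0;
}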
check this now */ - if (ow_inode < sctx->send_progress) + /* + * We know that it is or will be overwritten. Check this now. + * The current inode being processed might have been the one that caused + * inode 'ino' to be orphanized, therefore ow_inode can actually be the + * same as sctx->send_progress. + */ + if (ow_inode <= sctx->send_progress) ret = 1; else ret = 0; @@ -2223,6 +2245,8 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, fs_path_reset(dest); while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) { + struct waiting_dir_move *wdm; + fs_path_reset(name); if (is_waiting_for_rm(sctx, ino)) { @@ -2233,7 +2257,11 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, break; } - if (is_waiting_for_move(sctx, ino)) { + wdm = get_waiting_dir_move(sctx, ino); + if (wdm && wdm->orphanized) { + ret = gen_unique_name(sctx, ino, gen, name); + stop = 1; + } else if (wdm) { ret = get_first_ref(sctx->parent_root, ino, &parent_inode, &parent_gen, name); } else { @@ -2328,8 +2356,12 @@ static int send_subvol_begin(struct send_ctx *sctx) TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID, le64_to_cpu(sctx->send_root->root_item.ctransid)); if (parent_root) { - TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, - sctx->parent_root->root_item.uuid); + if (!btrfs_is_empty_uuid(parent_root->root_item.received_uuid)) + TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, + parent_root->root_item.received_uuid); + else + TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, + parent_root->root_item.uuid); TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, le64_to_cpu(sctx->parent_root->root_item.ctransid)); } @@ -2923,7 +2955,7 @@ static int is_waiting_for_move(struct send_ctx *sctx, u64 ino) return entry != NULL; } -static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino) +static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino, bool orphanized) { struct rb_node **p = &sctx->waiting_dir_moves.rb_node; struct rb_node *parent = NULL; @@ -2934,6 +2966,7 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino) return -ENOMEM; dm->ino = ino; dm->rmdir_ino = 0; + dm->orphanized = orphanized; while (*p) { parent = *p; @@ -3030,7 +3063,7 @@ static int add_pending_dir_move(struct send_ctx *sctx, goto out; } - ret = add_waiting_dir_move(sctx, pm->ino); + ret = add_waiting_dir_move(sctx, pm->ino, is_orphan); if (ret) goto out; @@ -3067,48 +3100,6 @@ static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx, return NULL; } -static int path_loop(struct send_ctx *sctx, struct fs_path *name, - u64 ino, u64 gen, u64 *ancestor_ino) -{ - int ret = 0; - u64 parent_inode = 0; - u64 parent_gen = 0; - u64 start_ino = ino; - - *ancestor_ino = 0; - while (ino != BTRFS_FIRST_FREE_OBJECTID) { - fs_path_reset(name); - - if (is_waiting_for_rm(sctx, ino)) - break; - if (is_waiting_for_move(sctx, ino)) { - if (*ancestor_ino == 0) - *ancestor_ino = ino; - ret = get_first_ref(sctx->parent_root, ino, - &parent_inode, &parent_gen, name); - } else { - ret = __get_cur_name_and_parent(sctx, ino, gen, - &parent_inode, - &parent_gen, name); - if (ret > 0) { - ret = 0; - break; - } - } - if (ret < 0) - break; - if (parent_inode == start_ino) { - ret = 1; - if (*ancestor_ino == 0) - *ancestor_ino = ino; - break; - } - ino = parent_inode; - gen = parent_gen; - } - return ret; -} - static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) { struct fs_path *from_path = NULL; @@ -3120,7 +3111,6 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) struct waiting_dir_move 
*dm = NULL; u64 rmdir_ino = 0; int ret; - u64 ancestor = 0; name = fs_path_alloc(); from_path = fs_path_alloc(); @@ -3152,22 +3142,6 @@ static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm) goto out; sctx->send_progress = sctx->cur_ino + 1; - ret = path_loop(sctx, name, pm->ino, pm->gen, &ancestor); - if (ret) { - LIST_HEAD(deleted_refs); - ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID); - ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor, - &pm->update_refs, &deleted_refs, - pm->is_orphan); - if (ret < 0) - goto out; - if (rmdir_ino) { - dm = get_waiting_dir_move(sctx, pm->ino); - ASSERT(dm); - dm->rmdir_ino = rmdir_ino; - } - goto out; - } fs_path_reset(name); to_path = name; name = NULL; @@ -3412,8 +3386,40 @@ out: return ret; } +/* + * Check if inode ino1 is an ancestor of inode ino2 in the given root. + * Return 1 if true, 0 if false and < 0 on error. + */ +static int is_ancestor(struct btrfs_root *root, + const u64 ino1, + const u64 ino1_gen, + const u64 ino2, + struct fs_path *fs_path) +{ + u64 ino = ino2; + + while (ino > BTRFS_FIRST_FREE_OBJECTID) { + int ret; + u64 parent; + u64 parent_gen; + + fs_path_reset(fs_path); + ret = get_first_ref(root, ino, &parent, &parent_gen, fs_path); + if (ret < 0) { + if (ret == -ENOENT && ino == ino2) + ret = 0; + return ret; + } + if (parent == ino1) + return parent_gen == ino1_gen ? 1 : 0; + ino = parent; + } + return 0; +} + static int wait_for_parent_move(struct send_ctx *sctx, - struct recorded_ref *parent_ref) + struct recorded_ref *parent_ref, + const bool is_orphan) { int ret = 0; u64 ino = parent_ref->dir; @@ -3433,11 +3439,24 @@ static int wait_for_parent_move(struct send_ctx *sctx, * Our current directory inode may not yet be renamed/moved because some * ancestor (immediate or not) has to be renamed/moved first. So find if * such ancestor exists and make sure our own rename/move happens after - * that ancestor is processed. + * that ancestor is processed to avoid path build infinite loops (done + * at get_cur_path()). */ while (ino > BTRFS_FIRST_FREE_OBJECTID) { if (is_waiting_for_move(sctx, ino)) { + /* + * If the current inode is an ancestor of ino in the + * parent root, we need to delay the rename of the + * current inode, otherwise don't delay the rename + * because we can end up with a circular dependency + * of renames, resulting in some directories never + * getting the respective rename operations issued in + * the send stream or getting into infinite path build + * loops. + */ + ret = is_ancestor(sctx->parent_root, + sctx->cur_ino, sctx->cur_inode_gen, + ino, path_before); break; } @@ -3479,7 +3498,7 @@ out: ino, &sctx->new_refs, &sctx->deleted_refs, - false); + is_orphan); if (!ret) ret = 1; } @@ -3610,10 +3629,27 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); if (ret < 0) goto out; if (ret) { + struct name_cache_entry *nce; + ret = orphanize_inode(sctx, ow_inode, ow_gen, cur->full_path); if (ret < 0) goto out; + /* + * Make sure we clear our orphanized inode's + * name from the name cache. This is because the + * inode ow_inode might be an ancestor of some + * other inode that will be orphanized as well + * later and has an inode number greater than + * sctx->send_progress. We need to prevent + * future name lookups from using the old name + * and instead get the orphan name. 
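The new is_ancestor() helper above decides whether a rename must be delayed by walking parent references upward from ino2. A compact userspace model of that walk follows, using a plain parent table and omitting the generation checks and error handling of the real helper:

#include <stdio.h>

#define FIRST_FREE_OBJECTID 256UL	/* matches BTRFS_FIRST_FREE_OBJECTID */

/* parent[ino] holds the parent directory of ino in this toy namespace. */
static int is_ancestor(const unsigned long *parent, unsigned long ino1,
		       unsigned long ino2)
{
	unsigned long ino = ino2;

	while (ino > FIRST_FREE_OBJECTID) {
		unsigned long p = parent[ino];

		if (p == ino1)
			return 1;	/* found ino1 on ino2's path to the root */
		ino = p;
	}
	return 0;
}

int main(void)
{
	unsigned long parent[512] = { 0 };

	parent[300] = 260;	/* dir 260 contains inode 300 */
	parent[260] = 257;	/* dir 257 contains dir 260 */
	parent[257] = 256;	/* 257 sits in the subvolume root */

	printf("%d\n", is_ancestor(parent, 257, 300));	/* prints 1 */
	printf("%d\n", is_ancestor(parent, 258, 300));	/* prints 0 */
	return 0;
}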
+ */ + nce = name_cache_search(sctx, ow_inode, ow_gen); + if (nce) { + name_cache_delete(sctx, nce); + kfree(nce); + } } else { ret = send_unlink(sctx, cur->full_path); if (ret < 0) @@ -3631,6 +3667,17 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); } } + if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root && + can_rename) { + ret = wait_for_parent_move(sctx, cur, is_orphan); + if (ret < 0) + goto out; + if (ret == 1) { + can_rename = false; + *pending_move = 1; + } + } + /* * link/move the ref to the new place. If we have an orphan * inode, move it and update valid_path. If not, link or move @@ -3651,18 +3698,11 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); * dirs, we always have one new and one deleted * ref. The deleted ref is ignored later. */ - ret = wait_for_parent_move(sctx, cur); - if (ret < 0) - goto out; - if (ret) { - *pending_move = 1; - } else { - ret = send_rename(sctx, valid_path, - cur->full_path); - if (!ret) - ret = fs_path_copy(valid_path, - cur->full_path); - } + ret = send_rename(sctx, valid_path, + cur->full_path); + if (!ret) + ret = fs_path_copy(valid_path, + cur->full_path); if (ret < 0) goto out; } else { @@ -4550,8 +4590,21 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, " if (ret < 0) goto out; - TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, - clone_root->root->root_item.uuid); + /* + * If the parent we're using has a received_uuid set then use that as + * our clone source as that is what we will look for when doing a + * receive. + * + * This covers the case that we create a snapshot off of a received + * subvolume and then use that as the parent and try to receive on a + * different host. + */ + if (!btrfs_is_empty_uuid(clone_root->root->root_item.received_uuid)) + TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, + clone_root->root->root_item.received_uuid); + else + TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, + clone_root->root->root_item.uuid); TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, le64_to_cpu(clone_root->root->root_item.ctransid)); TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p); @@ -5852,19 +5905,20 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) ret = PTR_ERR(clone_root); goto out; } - clone_sources_to_rollback = i + 1; spin_lock(&clone_root->root_item_lock); - clone_root->send_in_progress++; - if (!btrfs_root_readonly(clone_root)) { + if (!btrfs_root_readonly(clone_root) || + btrfs_root_dead(clone_root)) { spin_unlock(&clone_root->root_item_lock); srcu_read_unlock(&fs_info->subvol_srcu, index); ret = -EPERM; goto out; } + clone_root->send_in_progress++; spin_unlock(&clone_root->root_item_lock); srcu_read_unlock(&fs_info->subvol_srcu, index); sctx->clone_roots[i].root = clone_root; + clone_sources_to_rollback = i + 1; } vfree(clone_sources_tmp); clone_sources_tmp = NULL; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 05fef198ff94..cd7ef34d2dce 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -135,6 +135,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) * __btrfs_std_error decodes expected errors from the caller and * invokes the approciate error response. */ +__cold void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...) { @@ -247,18 +248,11 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, * We'll complete the cleanup in btrfs_end_transaction and * btrfs_commit_transaction. 
*/ +__cold void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *function, unsigned int line, int errno) { - /* - * Report first abort since mount - */ - if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, - &root->fs_info->fs_state)) { - WARN(1, KERN_DEBUG "BTRFS: Transaction aborted (error %d)\n", - errno); - } trans->aborted = errno; /* Nothing used. The other threads that have joined this * transaction may be able to continue. */ @@ -281,6 +275,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, * __btrfs_panic decodes unexpected, fatal errors from the caller, * issues an alert, and either panics or BUGs, depending on mount options. */ +__cold void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...) { @@ -841,33 +836,153 @@ out: return error; } -static struct dentry *get_default_root(struct super_block *sb, - u64 subvol_objectid) +static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, + u64 subvol_objectid) { - struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *root = fs_info->tree_root; - struct btrfs_root *new_root; - struct btrfs_dir_item *di; - struct btrfs_path *path; - struct btrfs_key location; - struct inode *inode; - u64 dir_id; - int new = 0; + struct btrfs_root *fs_root; + struct btrfs_root_ref *root_ref; + struct btrfs_inode_ref *inode_ref; + struct btrfs_key key; + struct btrfs_path *path = NULL; + char *name = NULL, *ptr; + u64 dirid; + int len; + int ret; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto err; + } + path->leave_spinning = 1; + + name = kmalloc(PATH_MAX, GFP_NOFS); + if (!name) { + ret = -ENOMEM; + goto err; + } + ptr = name + PATH_MAX - 1; + ptr[0] = '\0'; /* - * We have a specific subvol we want to mount, just setup location and - * go look up the root. + * Walk up the subvolume trees in the tree of tree roots by root + * backrefs until we hit the top-level subvolume. */ - if (subvol_objectid) { - location.objectid = subvol_objectid; - location.type = BTRFS_ROOT_ITEM_KEY; - location.offset = (u64)-1; - goto find_root; + while (subvol_objectid != BTRFS_FS_TREE_OBJECTID) { + key.objectid = subvol_objectid; + key.type = BTRFS_ROOT_BACKREF_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + goto err; + } else if (ret > 0) { + ret = btrfs_previous_item(root, path, subvol_objectid, + BTRFS_ROOT_BACKREF_KEY); + if (ret < 0) { + goto err; + } else if (ret > 0) { + ret = -ENOENT; + goto err; + } + } + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + subvol_objectid = key.offset; + + root_ref = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_root_ref); + len = btrfs_root_ref_name_len(path->nodes[0], root_ref); + ptr -= len + 1; + if (ptr < name) { + ret = -ENAMETOOLONG; + goto err; + } + read_extent_buffer(path->nodes[0], ptr + 1, + (unsigned long)(root_ref + 1), len); + ptr[0] = '/'; + dirid = btrfs_root_ref_dirid(path->nodes[0], root_ref); + btrfs_release_path(path); + + key.objectid = subvol_objectid; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + fs_root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(fs_root)) { + ret = PTR_ERR(fs_root); + goto err; + } + + /* + * Walk up the filesystem tree by inode refs until we hit the + * root directory. 
+ */ + while (dirid != BTRFS_FIRST_FREE_OBJECTID) { + key.objectid = dirid; + key.type = BTRFS_INODE_REF_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0); + if (ret < 0) { + goto err; + } else if (ret > 0) { + ret = btrfs_previous_item(fs_root, path, dirid, + BTRFS_INODE_REF_KEY); + if (ret < 0) { + goto err; + } else if (ret > 0) { + ret = -ENOENT; + goto err; + } + } + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + dirid = key.offset; + + inode_ref = btrfs_item_ptr(path->nodes[0], + path->slots[0], + struct btrfs_inode_ref); + len = btrfs_inode_ref_name_len(path->nodes[0], + inode_ref); + ptr -= len + 1; + if (ptr < name) { + ret = -ENAMETOOLONG; + goto err; + } + read_extent_buffer(path->nodes[0], ptr + 1, + (unsigned long)(inode_ref + 1), len); + ptr[0] = '/'; + btrfs_release_path(path); + } + } + + btrfs_free_path(path); + if (ptr == name + PATH_MAX - 1) { + name[0] = '/'; + name[1] = '\0'; + } else { + memmove(name, ptr, name + PATH_MAX - ptr); } + return name; + +err: + btrfs_free_path(path); + kfree(name); + return ERR_PTR(ret); +} + +static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objectid) +{ + struct btrfs_root *root = fs_info->tree_root; + struct btrfs_dir_item *di; + struct btrfs_path *path; + struct btrfs_key location; + u64 dir_id; path = btrfs_alloc_path(); if (!path) - return ERR_PTR(-ENOMEM); + return -ENOMEM; path->leave_spinning = 1; /* @@ -879,49 +994,23 @@ static struct dentry *get_default_root(struct super_block *sb, di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); if (IS_ERR(di)) { btrfs_free_path(path); - return ERR_CAST(di); + return PTR_ERR(di); } if (!di) { /* * Ok the default dir item isn't there. This is weird since * it's always been there, but don't freak out, just try and - * mount to root most subvolume. + * mount the top-level subvolume. */ btrfs_free_path(path); - dir_id = BTRFS_FIRST_FREE_OBJECTID; - new_root = fs_info->fs_root; - goto setup_root; + *objectid = BTRFS_FS_TREE_OBJECTID; + return 0; } btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); btrfs_free_path(path); - -find_root: - new_root = btrfs_read_fs_root_no_name(fs_info, &location); - if (IS_ERR(new_root)) - return ERR_CAST(new_root); - - dir_id = btrfs_root_dirid(&new_root->root_item); -setup_root: - location.objectid = dir_id; - location.type = BTRFS_INODE_ITEM_KEY; - location.offset = 0; - - inode = btrfs_iget(sb, &location, new_root, &new); - if (IS_ERR(inode)) - return ERR_CAST(inode); - - /* - * If we're just mounting the root most subvol put the inode and return - * a reference to the dentry. We will have already gotten a reference - * to the inode in btrfs_fill_super so we're good to go. 
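get_subvol_name_from_objectid() above builds the subvolume path right to left: it starts a pointer at the end of a PATH_MAX buffer and prepends each component as the backref walk climbs toward the top-level subvolume, then slides the finished string to the front once. A self-contained sketch of that technique with a hypothetical component list:

#include <stdio.h>
#include <string.h>

#define NAME_BUF 64	/* stands in for PATH_MAX */

int main(void)
{
	/* Components in the order the backref walk yields them: the leaf
	 * first, then each parent, ending at the top-level subvolume. */
	const char *comp[] = { "snap", "subvol", "data" };
	char name[NAME_BUF];
	char *ptr = name + NAME_BUF - 1;

	ptr[0] = '\0';
	for (int i = 0; i < 3; i++) {
		size_t len = strlen(comp[i]);

		ptr -= len + 1;
		if (ptr < name)
			return 1;	/* -ENAMETOOLONG in the kernel code */
		memcpy(ptr + 1, comp[i], len);
		ptr[0] = '/';	/* each component gets a leading slash */
	}
	/* One final memmove to the front, exactly as the patch does. */
	memmove(name, ptr, name + NAME_BUF - ptr);
	printf("%s\n", name);	/* prints /data/subvol/snap */
	return 0;
}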
- */ - if (!new && sb->s_root->d_inode == inode) { - iput(inode); - return dget(sb->s_root); - } - - return d_obtain_root(inode); + *objectid = location.objectid; + return 0; } static int btrfs_fill_super(struct super_block *sb, @@ -1099,6 +1188,10 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",fatal_errors=panic"); if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL) seq_printf(seq, ",commit=%d", info->commit_interval); + seq_printf(seq, ",subvolid=%llu", + BTRFS_I(d_inode(dentry))->root->root_key.objectid); + seq_puts(seq, ",subvol="); + seq_dentry(seq, dentry, " \t\n\\"); return 0; } @@ -1129,107 +1222,139 @@ static inline int is_subvolume_inode(struct inode *inode) } /* - * This will strip out the subvol=%s argument for an argument string and add - * subvolid=0 to make sure we get the actual tree root for path walking to the - * subvol we want. + * This will add subvolid=0 to the argument string while removing any subvol= + * and subvolid= arguments to make sure we get the top-level root for path + * walking to the subvol we want. */ static char *setup_root_args(char *args) { - unsigned len = strlen(args) + 2 + 1; - char *src, *dst, *buf; + char *buf, *dst, *sep; - /* - * We need the same args as before, but with this substitution: - * s!subvol=[^,]+!subvolid=0! - * - * Since the replacement string is up to 2 bytes longer than the - * original, allocate strlen(args) + 2 + 1 bytes. - */ - - src = strstr(args, "subvol="); - /* This shouldn't happen, but just in case.. */ - if (!src) - return NULL; + if (!args) + return kstrdup("subvolid=0", GFP_NOFS); - buf = dst = kmalloc(len, GFP_NOFS); + /* The worst case is that we add ",subvolid=0" to the end. */ + buf = dst = kmalloc(strlen(args) + strlen(",subvolid=0") + 1, GFP_NOFS); if (!buf) return NULL; - /* - * If the subvol= arg is not at the start of the string, - * copy whatever precedes it into buf. - */ - if (src != args) { - *src++ = '\0'; - strcpy(buf, args); - dst += strlen(args); + while (1) { + sep = strchrnul(args, ','); + if (!strstarts(args, "subvol=") && + !strstarts(args, "subvolid=")) { + memcpy(dst, args, sep - args); + dst += sep - args; + *dst++ = ','; + } + if (*sep) + args = sep + 1; + else + break; } - strcpy(dst, "subvolid=0"); - dst += strlen("subvolid=0"); - - /* - * If there is a "," after the original subvol=... string, - * copy that suffix into our buffer. Otherwise, we're done. 
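The rewritten setup_root_args() above no longer assumes a single subvol= option: it copies every option except subvol= and subvolid=, then appends subvolid=0 so the path walk starts from the top-level root. A userspace sketch of the same loop, with portable stand-ins for the kernel's strstarts() and strchrnul():

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int starts_with(const char *s, const char *prefix)
{
	return strncmp(s, prefix, strlen(prefix)) == 0;
}

/* Like glibc's strchrnul(): returns the match or the terminating NUL. */
static char *strchrnul_(char *s, int c)
{
	while (*s && *s != c)
		s++;
	return s;
}

/* Copy every option except subvol=/subvolid=, then force subvolid=0. */
static char *setup_root_args(char *args)
{
	char *buf, *dst, *sep;

	if (!args) {
		buf = malloc(sizeof("subvolid=0"));
		return buf ? strcpy(buf, "subvolid=0") : NULL;
	}

	/* Worst case we append ",subvolid=0" to the unchanged string. */
	buf = dst = malloc(strlen(args) + strlen(",subvolid=0") + 1);
	if (!buf)
		return NULL;

	while (1) {
		sep = strchrnul_(args, ',');
		if (!starts_with(args, "subvol=") &&
		    !starts_with(args, "subvolid=")) {
			memcpy(dst, args, sep - args);
			dst += sep - args;
			*dst++ = ',';
		}
		if (*sep)
			args = sep + 1;
		else
			break;
	}
	strcpy(dst, "subvolid=0");
	return buf;
}

int main(void)
{
	char in[] = "noatime,subvol=/snaps/a,compress=lzo";
	char *out = setup_root_args(in);

	printf("%s\n", out);	/* noatime,compress=lzo,subvolid=0 */
	free(out);
	return 0;
}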
- */ - src = strchr(src, ','); - if (src) - strcpy(dst, src); return buf; } -static struct dentry *mount_subvol(const char *subvol_name, int flags, - const char *device_name, char *data) +static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid, + int flags, const char *device_name, + char *data) { struct dentry *root; - struct vfsmount *mnt; + struct vfsmount *mnt = NULL; char *newargs; + int ret; newargs = setup_root_args(data); - if (!newargs) - return ERR_PTR(-ENOMEM); - mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, - newargs); + if (!newargs) { + root = ERR_PTR(-ENOMEM); + goto out; + } - if (PTR_RET(mnt) == -EBUSY) { + mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, newargs); + if (PTR_ERR_OR_ZERO(mnt) == -EBUSY) { if (flags & MS_RDONLY) { - mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY, device_name, - newargs); + mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY, + device_name, newargs); } else { - int r; - mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, device_name, - newargs); + mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, + device_name, newargs); if (IS_ERR(mnt)) { - kfree(newargs); - return ERR_CAST(mnt); + root = ERR_CAST(mnt); + mnt = NULL; + goto out; } - r = btrfs_remount(mnt->mnt_sb, &flags, NULL); - if (r < 0) { - /* FIXME: release vfsmount mnt ??*/ - kfree(newargs); - return ERR_PTR(r); + down_write(&mnt->mnt_sb->s_umount); + ret = btrfs_remount(mnt->mnt_sb, &flags, NULL); + up_write(&mnt->mnt_sb->s_umount); + if (ret < 0) { + root = ERR_PTR(ret); + goto out; } } } + if (IS_ERR(mnt)) { + root = ERR_CAST(mnt); + mnt = NULL; + goto out; + } - kfree(newargs); + if (!subvol_name) { + if (!subvol_objectid) { + ret = get_default_subvol_objectid(btrfs_sb(mnt->mnt_sb), + &subvol_objectid); + if (ret) { + root = ERR_PTR(ret); + goto out; + } + } + subvol_name = get_subvol_name_from_objectid(btrfs_sb(mnt->mnt_sb), + subvol_objectid); + if (IS_ERR(subvol_name)) { + root = ERR_CAST(subvol_name); + subvol_name = NULL; + goto out; + } - if (IS_ERR(mnt)) - return ERR_CAST(mnt); + } root = mount_subtree(mnt, subvol_name); + /* mount_subtree() drops our reference on the vfsmount. */ + mnt = NULL; - if (!IS_ERR(root) && !is_subvolume_inode(root->d_inode)) { + if (!IS_ERR(root)) { struct super_block *s = root->d_sb; - dput(root); - root = ERR_PTR(-EINVAL); - deactivate_locked_super(s); - printk(KERN_ERR "BTRFS: '%s' is not a valid subvolume\n", - subvol_name); + struct inode *root_inode = d_inode(root); + u64 root_objectid = BTRFS_I(root_inode)->root->root_key.objectid; + + ret = 0; + if (!is_subvolume_inode(root_inode)) { + pr_err("BTRFS: '%s' is not a valid subvolume\n", + subvol_name); + ret = -EINVAL; + } + if (subvol_objectid && root_objectid != subvol_objectid) { + /* + * This will also catch a race condition where a + * subvolume which was passed by ID is renamed and + * another subvolume is renamed over the old location. 
+ */ + pr_err("BTRFS: subvol '%s' does not match subvolid %llu\n", + subvol_name, subvol_objectid); + ret = -EINVAL; + } + if (ret) { + dput(root); + root = ERR_PTR(ret); + deactivate_locked_super(s); + } } +out: + mntput(mnt); + kfree(newargs); + kfree(subvol_name); return root; } @@ -1294,7 +1419,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, { struct block_device *bdev = NULL; struct super_block *s; - struct dentry *root; struct btrfs_fs_devices *fs_devices = NULL; struct btrfs_fs_info *fs_info = NULL; struct security_mnt_opts new_sec_opts; @@ -1314,10 +1438,10 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, return ERR_PTR(error); } - if (subvol_name) { - root = mount_subvol(subvol_name, flags, device_name, data); - kfree(subvol_name); - return root; + if (subvol_name || subvol_objectid != BTRFS_FS_TREE_OBJECTID) { + /* mount_subvol() will free subvol_name. */ + return mount_subvol(subvol_name, subvol_objectid, flags, + device_name, data); } security_init_mnt_opts(&new_sec_opts); @@ -1383,23 +1507,19 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, error = btrfs_fill_super(s, fs_devices, data, flags & MS_SILENT ? 1 : 0); } - - root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error); - if (IS_ERR(root)) { + if (error) { deactivate_locked_super(s); - error = PTR_ERR(root); goto error_sec_opts; } fs_info = btrfs_sb(s); error = setup_security_options(fs_info, s, &new_sec_opts); if (error) { - dput(root); deactivate_locked_super(s); goto error_sec_opts; } - return root; + return dget(s->s_root); error_close_devices: btrfs_close_devices(fs_devices); @@ -1714,7 +1834,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) avail_space = device->total_bytes - device->bytes_used; /* align with stripe_len */ - do_div(avail_space, BTRFS_STRIPE_LEN); + avail_space = div_u64(avail_space, BTRFS_STRIPE_LEN); avail_space *= BTRFS_STRIPE_LEN; /* @@ -1886,8 +2006,8 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]); buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]); /* Mask in the root object ID too, to disambiguate subvols */ - buf->f_fsid.val[0] ^= BTRFS_I(dentry->d_inode)->root->objectid >> 32; - buf->f_fsid.val[1] ^= BTRFS_I(dentry->d_inode)->root->objectid; + buf->f_fsid.val[0] ^= BTRFS_I(d_inode(dentry))->root->objectid >> 32; + buf->f_fsid.val[1] ^= BTRFS_I(d_inode(dentry))->root->objectid; return 0; } @@ -1908,6 +2028,17 @@ static struct file_system_type btrfs_fs_type = { }; MODULE_ALIAS_FS("btrfs"); +static int btrfs_control_open(struct inode *inode, struct file *file) +{ + /* + * The control file's private_data is used to hold the + * transaction when it is started and is used to keep + * track of whether a transaction is already in progress. 
+ */ + file->private_data = NULL; + return 0; +} + /* * used by btrfsctl to scan devices when no FS is mounted */ @@ -2009,6 +2140,7 @@ static const struct super_operations btrfs_super_ops = { }; static const struct file_operations btrfs_ctl_fops = { + .open = btrfs_control_open, .unlocked_ioctl = btrfs_control_ioctl, .compat_ioctl = btrfs_control_ioctl, .owner = THIS_MODULE, diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 94edb0a2a026..603b0cc2b9bb 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -33,6 +33,7 @@ #include "volumes.h" static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj); +static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj); static u64 get_features(struct btrfs_fs_info *fs_info, enum btrfs_feature_set set) @@ -428,7 +429,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, BTRFS_ATTR(clone_alignment, btrfs_clone_alignment_show); -static struct attribute *btrfs_attrs[] = { +static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(label), BTRFS_ATTR_PTR(nodesize), BTRFS_ATTR_PTR(sectorsize), @@ -438,28 +439,36 @@ static struct attribute *btrfs_attrs[] = { static void btrfs_release_super_kobj(struct kobject *kobj) { - struct btrfs_fs_info *fs_info = to_fs_info(kobj); - complete(&fs_info->kobj_unregister); + struct btrfs_fs_devices *fs_devs = to_fs_devs(kobj); + + memset(&fs_devs->super_kobj, 0, sizeof(struct kobject)); + complete(&fs_devs->kobj_unregister); } static struct kobj_type btrfs_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = btrfs_release_super_kobj, - .default_attrs = btrfs_attrs, }; +static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj) +{ + if (kobj->ktype != &btrfs_ktype) + return NULL; + return container_of(kobj, struct btrfs_fs_devices, super_kobj); +} + static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj) { if (kobj->ktype != &btrfs_ktype) return NULL; - return container_of(kobj, struct btrfs_fs_info, super_kobj); + return to_fs_devs(kobj)->fs_info; } #define NUM_FEATURE_BITS 64 static char btrfs_unknown_feature_names[3][NUM_FEATURE_BITS][13]; static struct btrfs_feature_attr btrfs_feature_attrs[3][NUM_FEATURE_BITS]; -static u64 supported_feature_masks[3] = { +static const u64 supported_feature_masks[3] = { [FEAT_COMPAT] = BTRFS_FEATURE_COMPAT_SUPP, [FEAT_COMPAT_RO] = BTRFS_FEATURE_COMPAT_RO_SUPP, [FEAT_INCOMPAT] = BTRFS_FEATURE_INCOMPAT_SUPP, @@ -493,12 +502,12 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add) attrs[0] = &fa->kobj_attr.attr; if (add) { int ret; - ret = sysfs_merge_group(&fs_info->super_kobj, + ret = sysfs_merge_group(&fs_info->fs_devices->super_kobj, &agroup); if (ret) return ret; } else - sysfs_unmerge_group(&fs_info->super_kobj, + sysfs_unmerge_group(&fs_info->fs_devices->super_kobj, &agroup); } @@ -506,25 +515,49 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add) return 0; } -static void __btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info) +static void __btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) +{ + if (fs_devs->device_dir_kobj) { + kobject_del(fs_devs->device_dir_kobj); + kobject_put(fs_devs->device_dir_kobj); + fs_devs->device_dir_kobj = NULL; + } + + if (fs_devs->super_kobj.state_initialized) { + kobject_del(&fs_devs->super_kobj); + kobject_put(&fs_devs->super_kobj); + wait_for_completion(&fs_devs->kobj_unregister); + } +} + +/* when fs_devs is NULL it will remove all fsid kobject */ +void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) 
{ - kobject_del(&fs_info->super_kobj); - kobject_put(&fs_info->super_kobj); - wait_for_completion(&fs_info->kobj_unregister); + struct list_head *fs_uuids = btrfs_get_fs_uuids(); + + if (fs_devs) { + __btrfs_sysfs_remove_fsid(fs_devs); + return; + } + + list_for_each_entry(fs_devs, fs_uuids, list) { + __btrfs_sysfs_remove_fsid(fs_devs); + } } void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info) { + btrfs_reset_fs_info_ptr(fs_info); + if (fs_info->space_info_kobj) { sysfs_remove_files(fs_info->space_info_kobj, allocation_attrs); kobject_del(fs_info->space_info_kobj); kobject_put(fs_info->space_info_kobj); } - kobject_del(fs_info->device_dir_kobj); - kobject_put(fs_info->device_dir_kobj); addrm_unknown_feature_attrs(fs_info, false); - sysfs_remove_group(&fs_info->super_kobj, &btrfs_feature_attr_group); - __btrfs_sysfs_remove_one(fs_info); + sysfs_remove_group(&fs_info->fs_devices->super_kobj, &btrfs_feature_attr_group); + sysfs_remove_files(&fs_info->fs_devices->super_kobj, btrfs_attrs); + btrfs_kobj_rm_device(fs_info->fs_devices, NULL); } const char * const btrfs_feature_set_names[3] = { @@ -602,40 +635,60 @@ static void init_feature_attrs(void) } } -int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info, +/* when one_device is NULL, it removes all device links */ + +int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device) { struct hd_struct *disk; struct kobject *disk_kobj; - if (!fs_info->device_dir_kobj) + if (!fs_devices->device_dir_kobj) return -EINVAL; if (one_device && one_device->bdev) { disk = one_device->bdev->bd_part; disk_kobj = &part_to_dev(disk)->kobj; - sysfs_remove_link(fs_info->device_dir_kobj, + sysfs_remove_link(fs_devices->device_dir_kobj, + disk_kobj->name); + } + + if (one_device) + return 0; + + list_for_each_entry(one_device, + &fs_devices->devices, dev_list) { + if (!one_device->bdev) + continue; + disk = one_device->bdev->bd_part; + disk_kobj = &part_to_dev(disk)->kobj; + + sysfs_remove_link(fs_devices->device_dir_kobj, disk_kobj->name); } return 0; } -int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info, - struct btrfs_device *one_device) +int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs) { - int error = 0; - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - struct btrfs_device *dev; - - if (!fs_info->device_dir_kobj) - fs_info->device_dir_kobj = kobject_create_and_add("devices", - &fs_info->super_kobj); + if (!fs_devs->device_dir_kobj) + fs_devs->device_dir_kobj = kobject_create_and_add("devices", + &fs_devs->super_kobj); - if (!fs_info->device_dir_kobj) + if (!fs_devs->device_dir_kobj) return -ENOMEM; + return 0; +} + +int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices, + struct btrfs_device *one_device) +{ + int error = 0; + struct btrfs_device *dev; + list_for_each_entry(dev, &fs_devices->devices, dev_list) { struct hd_struct *disk; struct kobject *disk_kobj; @@ -649,7 +702,7 @@ int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info, disk = dev->bdev->bd_part; disk_kobj = &part_to_dev(disk)->kobj; - error = sysfs_create_link(fs_info->device_dir_kobj, + error = sysfs_create_link(fs_devices->device_dir_kobj, disk_kobj, disk_kobj->name); if (error) break; @@ -667,34 +720,51 @@ static struct dentry *btrfs_debugfs_root_dentry; /* Debugging tunables and exported data */ u64 btrfs_debugfs_test; +/* + * Can be called by the device discovery thread. 
+ * And parent can be specified for seed device + */ +int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs, + struct kobject *parent) +{ + int error; + + init_completion(&fs_devs->kobj_unregister); + fs_devs->super_kobj.kset = btrfs_kset; + error = kobject_init_and_add(&fs_devs->super_kobj, + &btrfs_ktype, parent, "%pU", fs_devs->fsid); + return error; +} + int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) { int error; + struct btrfs_fs_devices *fs_devs = fs_info->fs_devices; + struct kobject *super_kobj = &fs_devs->super_kobj; + + btrfs_set_fs_info_ptr(fs_info); - init_completion(&fs_info->kobj_unregister); - fs_info->super_kobj.kset = btrfs_kset; - error = kobject_init_and_add(&fs_info->super_kobj, &btrfs_ktype, NULL, - "%pU", fs_info->fsid); + error = btrfs_kobj_add_device(fs_devs, NULL); if (error) return error; - error = sysfs_create_group(&fs_info->super_kobj, - &btrfs_feature_attr_group); + error = sysfs_create_files(super_kobj, btrfs_attrs); if (error) { - __btrfs_sysfs_remove_one(fs_info); + btrfs_kobj_rm_device(fs_devs, NULL); return error; } - error = addrm_unknown_feature_attrs(fs_info, true); + error = sysfs_create_group(super_kobj, + &btrfs_feature_attr_group); if (error) goto failure; - error = btrfs_kobj_add_device(fs_info, NULL); + error = addrm_unknown_feature_attrs(fs_info, true); if (error) goto failure; fs_info->space_info_kobj = kobject_create_and_add("allocation", - &fs_info->super_kobj); + super_kobj); if (!fs_info->space_info_kobj) { error = -ENOMEM; goto failure; diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index f7dd298b3cf6..6392527bcc15 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -61,17 +61,33 @@ static struct btrfs_feature_attr btrfs_attr_##_name = { \ BTRFS_FEAT_ATTR(name, FEAT_INCOMPAT, BTRFS_FEATURE_INCOMPAT, feature) /* convert from attribute */ -#define to_btrfs_feature_attr(a) \ - container_of(a, struct btrfs_feature_attr, kobj_attr) -#define attr_to_btrfs_attr(a) container_of(a, struct kobj_attribute, attr) -#define attr_to_btrfs_feature_attr(a) \ - to_btrfs_feature_attr(attr_to_btrfs_attr(a)) +static inline struct btrfs_feature_attr * +to_btrfs_feature_attr(struct kobj_attribute *a) +{ + return container_of(a, struct btrfs_feature_attr, kobj_attr); +} + +static inline struct kobj_attribute *attr_to_btrfs_attr(struct attribute *attr) +{ + return container_of(attr, struct kobj_attribute, attr); +} + +static inline struct btrfs_feature_attr * +attr_to_btrfs_feature_attr(struct attribute *attr) +{ + return to_btrfs_feature_attr(attr_to_btrfs_attr(attr)); +} + char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); extern const char * const btrfs_feature_set_names[3]; extern struct kobj_type space_info_ktype; extern struct kobj_type btrfs_raid_ktype; -int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info, +int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device); -int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info, +int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device); +int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs, + struct kobject *parent); +int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs); +void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs); #endif /* _BTRFS_SYSFS_H_ */ diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 73f299ebdabb..846d277b1901 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -21,6 +21,7 @@ #include 
"../transaction.h" #include "../disk-io.h" #include "../qgroup.h" +#include "../backref.h" static void init_dummy_trans(struct btrfs_trans_handle *trans) { @@ -227,21 +228,28 @@ static int test_no_shared_qgroup(struct btrfs_root *root) { struct btrfs_trans_handle trans; struct btrfs_fs_info *fs_info = root->fs_info; + struct ulist *old_roots = NULL; + struct ulist *new_roots = NULL; int ret; init_dummy_trans(&trans); test_msg("Qgroup basic add\n"); - ret = btrfs_create_qgroup(NULL, fs_info, 5, NULL); + ret = btrfs_create_qgroup(NULL, fs_info, 5); if (ret) { test_msg("Couldn't create a qgroup %d\n", ret); return ret; } - ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096, - BTRFS_QGROUP_OPER_ADD_EXCL, 0); + /* + * Since the test trans doesn't havee the complicated delayed refs, + * we can only call btrfs_qgroup_account_extent() directly to test + * quota. + */ + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); if (ret) { - test_msg("Couldn't add space to a qgroup %d\n", ret); + ulist_free(old_roots); + test_msg("Couldn't find old roots: %d\n", ret); return ret; } @@ -249,9 +257,18 @@ static int test_no_shared_qgroup(struct btrfs_root *root) if (ret) return ret; - ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); + if (ret) { + ulist_free(old_roots); + ulist_free(new_roots); + test_msg("Couldn't find old roots: %d\n", ret); + return ret; + } + + ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, + old_roots, new_roots); if (ret) { - test_msg("Delayed qgroup accounting failed %d\n", ret); + test_msg("Couldn't account space for a qgroup %d\n", ret); return ret; } @@ -259,21 +276,32 @@ static int test_no_shared_qgroup(struct btrfs_root *root) test_msg("Qgroup counts didn't match expected values\n"); return -EINVAL; } + old_roots = NULL; + new_roots = NULL; + + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); + if (ret) { + ulist_free(old_roots); + test_msg("Couldn't find old roots: %d\n", ret); + return ret; + } ret = remove_extent_item(root, 4096, 4096); if (ret) return -EINVAL; - ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096, - BTRFS_QGROUP_OPER_SUB_EXCL, 0); + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); if (ret) { - test_msg("Couldn't remove space from the qgroup %d\n", ret); - return -EINVAL; + ulist_free(old_roots); + ulist_free(new_roots); + test_msg("Couldn't find old roots: %d\n", ret); + return ret; } - ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, + old_roots, new_roots); if (ret) { - test_msg("Qgroup accounting failed %d\n", ret); + test_msg("Couldn't account space for a qgroup %d\n", ret); return -EINVAL; } @@ -294,6 +322,8 @@ static int test_multiple_refs(struct btrfs_root *root) { struct btrfs_trans_handle trans; struct btrfs_fs_info *fs_info = root->fs_info; + struct ulist *old_roots = NULL; + struct ulist *new_roots = NULL; int ret; init_dummy_trans(&trans); @@ -301,26 +331,35 @@ static int test_multiple_refs(struct btrfs_root *root) test_msg("Qgroup multiple refs test\n"); /* We have 5 created already from the previous test */ - ret = btrfs_create_qgroup(NULL, fs_info, 256, NULL); + ret = btrfs_create_qgroup(NULL, fs_info, 256); if (ret) { test_msg("Couldn't create a qgroup %d\n", ret); return ret; } + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); + if (ret) { + ulist_free(old_roots); + test_msg("Couldn't find old roots: 
%d\n", ret); + return ret; + } + ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5); if (ret) return ret; - ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096, - BTRFS_QGROUP_OPER_ADD_EXCL, 0); + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); if (ret) { - test_msg("Couldn't add space to a qgroup %d\n", ret); + ulist_free(old_roots); + ulist_free(new_roots); + test_msg("Couldn't find old roots: %d\n", ret); return ret; } - ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, + old_roots, new_roots); if (ret) { - test_msg("Delayed qgroup accounting failed %d\n", ret); + test_msg("Couldn't account space for a qgroup %d\n", ret); return ret; } @@ -329,20 +368,29 @@ static int test_multiple_refs(struct btrfs_root *root) return -EINVAL; } + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); + if (ret) { + ulist_free(old_roots); + test_msg("Couldn't find old roots: %d\n", ret); + return ret; + } + ret = add_tree_ref(root, 4096, 4096, 0, 256); if (ret) return ret; - ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096, - BTRFS_QGROUP_OPER_ADD_SHARED, 0); + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); if (ret) { - test_msg("Qgroup record ref failed %d\n", ret); + ulist_free(old_roots); + ulist_free(new_roots); + test_msg("Couldn't find old roots: %d\n", ret); return ret; } - ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, + old_roots, new_roots); if (ret) { - test_msg("Qgroup accounting failed %d\n", ret); + test_msg("Couldn't account space for a qgroup %d\n", ret); return ret; } @@ -356,20 +404,29 @@ static int test_multiple_refs(struct btrfs_root *root) return -EINVAL; } + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); + if (ret) { + ulist_free(old_roots); + test_msg("Couldn't find old roots: %d\n", ret); + return ret; + } + ret = remove_extent_ref(root, 4096, 4096, 0, 256); if (ret) return ret; - ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096, - BTRFS_QGROUP_OPER_SUB_SHARED, 0); + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); if (ret) { - test_msg("Qgroup record ref failed %d\n", ret); + ulist_free(old_roots); + ulist_free(new_roots); + test_msg("Couldn't find old roots: %d\n", ret); return ret; } - ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, + old_roots, new_roots); if (ret) { - test_msg("Qgroup accounting failed %d\n", ret); + test_msg("Couldn't account space for a qgroup %d\n", ret); return ret; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 8be4278e25e8..f5021fcb154e 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -35,7 +35,7 @@ #define BTRFS_ROOT_TRANS_TAG 0 -static unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = { +static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = { [TRANS_STATE_RUNNING] = 0U, [TRANS_STATE_BLOCKED] = (__TRANS_USERSPACE | __TRANS_START), @@ -64,6 +64,9 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) if (atomic_dec_and_test(&transaction->use_count)) { BUG_ON(!list_empty(&transaction->list)); WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.href_root)); + if (transaction->delayed_refs.pending_csums) + printk(KERN_ERR "pending csums is %llu\n", + transaction->delayed_refs.pending_csums); while (!list_empty(&transaction->pending_chunks)) { 
struct extent_map *em; @@ -93,11 +96,8 @@ static void clear_btree_io_tree(struct extent_io_tree *tree) */ ASSERT(!waitqueue_active(&state->wq)); free_extent_state(state); - if (need_resched()) { - spin_unlock(&tree->lock); - cond_resched(); - spin_lock(&tree->lock); - } + + cond_resched_lock(&tree->lock); } spin_unlock(&tree->lock); } @@ -222,13 +222,17 @@ loop: atomic_set(&cur_trans->use_count, 2); cur_trans->have_free_bgs = 0; cur_trans->start_time = get_seconds(); + cur_trans->dirty_bg_run = 0; cur_trans->delayed_refs.href_root = RB_ROOT; + cur_trans->delayed_refs.dirty_extent_root = RB_ROOT; atomic_set(&cur_trans->delayed_refs.num_entries, 0); cur_trans->delayed_refs.num_heads_ready = 0; + cur_trans->delayed_refs.pending_csums = 0; cur_trans->delayed_refs.num_heads = 0; cur_trans->delayed_refs.flushing = 0; cur_trans->delayed_refs.run_delayed_start = 0; + cur_trans->delayed_refs.qgroup_to_skip = 0; /* * although the tree mod log is per file system and not per transaction, @@ -250,6 +254,9 @@ loop: INIT_LIST_HEAD(&cur_trans->switch_commits); INIT_LIST_HEAD(&cur_trans->pending_ordered); INIT_LIST_HEAD(&cur_trans->dirty_bgs); + INIT_LIST_HEAD(&cur_trans->io_bgs); + mutex_init(&cur_trans->cache_write_mutex); + cur_trans->num_dirty_bgs = 0; spin_lock_init(&cur_trans->dirty_bgs_lock); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(&cur_trans->dirty_pages, @@ -504,6 +511,7 @@ again: h->transaction = cur_trans; h->blocks_used = 0; h->bytes_reserved = 0; + h->chunk_bytes_reserved = 0; h->root = root; h->delayed_ref_updates = 0; h->use_count = 1; @@ -721,7 +729,7 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, updates = trans->delayed_ref_updates; trans->delayed_ref_updates = 0; if (updates) { - err = btrfs_run_delayed_refs(trans, root, updates); + err = btrfs_run_delayed_refs(trans, root, updates * 2); if (err) /* Error code will also eval true */ return err; } @@ -753,7 +761,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (!list_empty(&trans->ordered)) { spin_lock(&info->trans_lock); - list_splice(&trans->ordered, &cur_trans->pending_ordered); + list_splice_init(&trans->ordered, &cur_trans->pending_ordered); spin_unlock(&info->trans_lock); } @@ -787,6 +795,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (!list_empty(&trans->new_bgs)) btrfs_create_pending_block_groups(trans, root); + btrfs_trans_release_chunk_metadata(trans); + if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && should_end_transaction(trans, root) && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) { @@ -1057,6 +1067,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = root->fs_info; struct list_head *dirty_bgs = &trans->transaction->dirty_bgs; + struct list_head *io_bgs = &trans->transaction->io_bgs; struct list_head *next; struct extent_buffer *eb; int ret; @@ -1110,7 +1121,7 @@ again: return ret; } - while (!list_empty(dirty_bgs)) { + while (!list_empty(dirty_bgs) || !list_empty(io_bgs)) { ret = btrfs_write_dirty_block_groups(trans, root); if (ret) return ret; @@ -1284,6 +1295,12 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, if (pending->error) goto no_free_objectid; + /* + * Make qgroup to skip current new snapshot's qgroupid, as it is + * accounted by later btrfs_qgroup_inherit(). 
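The clear_btree_io_tree() hunk above folds the unlock, cond_resched(), relock sequence into a single cond_resched_lock() call. Below is a rough pthreads analogue of the pattern; note that the real kernel helper only drops the lock when a reschedule is actually pending, while this sketch always yields:

#include <pthread.h>
#include <sched.h>
#include <stdio.h>

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;

/* Userspace stand-in for cond_resched_lock(): briefly drop the lock so
 * waiters (and the scheduler) get a chance during a long loop. */
static void cond_resched_lock(pthread_mutex_t *lock)
{
	pthread_mutex_unlock(lock);
	sched_yield();
	pthread_mutex_lock(lock);
}

static void clear_states(int *state, int n)
{
	pthread_mutex_lock(&tree_lock);
	for (int i = 0; i < n; i++) {
		state[i] = 0;			/* free_extent_state() goes here */
		cond_resched_lock(&tree_lock);	/* don't hog the lock */
	}
	pthread_mutex_unlock(&tree_lock);
}

int main(void)
{
	int state[1024];

	clear_states(state, 1024);
	printf("done\n");
	return 0;
}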
+ btrfs_set_skip_qgroup(trans, objectid); + btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); if (to_reserve > 0) { @@ -1292,7 +1309,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, to_reserve, BTRFS_RESERVE_NO_FLUSH); if (pending->error) - goto no_free_objectid; + goto clear_skip_qgroup; } key.objectid = objectid; @@ -1390,25 +1407,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, root, ret); goto fail; } - - /* - * We need to flush delayed refs in order to make sure all of our quota - * operations have been done before we call btrfs_qgroup_inherit. - */ - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto fail; - } - - ret = btrfs_qgroup_inherit(trans, fs_info, - root->root_key.objectid, - objectid, pending->inherit); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto fail; - } - /* see comments in should_cow_block() */ set_bit(BTRFS_ROOT_FORCE_COW, &root->state); smp_wmb(); @@ -1491,11 +1489,37 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto fail; } } + + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + + /* + * account qgroup counters before qgroup_inherit() + */ + ret = btrfs_qgroup_prepare_account_extents(trans, fs_info); + if (ret) + goto fail; + ret = btrfs_qgroup_account_extents(trans, fs_info); + if (ret) + goto fail; + ret = btrfs_qgroup_inherit(trans, fs_info, + root->root_key.objectid, + objectid, pending->inherit); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + fail: pending->error = ret; dir_item_existed: trans->block_rsv = rsv; trans->bytes_reserved = 0; +clear_skip_qgroup: + btrfs_clear_skip_qgroup(trans); no_free_objectid: kfree(new_root_item); root_item_alloc_fail: @@ -1810,8 +1834,39 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, return ret; } + if (!cur_trans->dirty_bg_run) { + int run_it = 0; + + /* this mutex is also taken before trying to set + * block groups readonly. We need to make sure + * that nobody has set a block group readonly + * after extents from that block group have been + * allocated for cache files. btrfs_set_block_group_ro + * will wait for the transaction to commit if it + * finds dirty_bg_run = 1 + * + * The dirty_bg_run flag is also used to make sure only + * one process starts all the block group IO. It wouldn't + * hurt to have more than one go through, but there's no + * real advantage to it either. + */
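The snapshot path above pairs btrfs_set_skip_qgroup() with the new clear_skip_qgroup unwind label so the skip state cannot leak out of create_pending_snapshot() on any exit path. The shape of that pairing, sketched (do_snapshot_work() is a hypothetical stand-in for the function body):

        btrfs_set_skip_qgroup(trans, objectid);
        ret = do_snapshot_work(trans, pending);   /* hypothetical stand-in */
        if (ret)
                goto clear_skip_qgroup;
        /* ... */
clear_skip_qgroup:
        btrfs_clear_skip_qgroup(trans);   /* runs on success and error alike */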
+ mutex_lock(&root->fs_info->ro_block_group_mutex); + if (!cur_trans->dirty_bg_run) { + run_it = 1; + cur_trans->dirty_bg_run = 1; + } + mutex_unlock(&root->fs_info->ro_block_group_mutex); + + if (run_it) + ret = btrfs_start_dirty_block_groups(trans, root); + } + if (ret) { + btrfs_end_transaction(trans, root); + return ret; + } + spin_lock(&root->fs_info->trans_lock); - list_splice(&trans->ordered, &cur_trans->pending_ordered); + list_splice_init(&trans->ordered, &cur_trans->pending_ordered); if (cur_trans->state >= TRANS_STATE_COMMIT_START) { spin_unlock(&root->fs_info->trans_lock); atomic_inc(&cur_trans->use_count); @@ -1926,6 +1981,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, goto scrub_continue; } + /* Record old roots for later qgroup accounting */ + ret = btrfs_qgroup_prepare_account_extents(trans, root->fs_info); + if (ret) { + mutex_unlock(&root->fs_info->reloc_mutex); + goto scrub_continue; + } + /* * make sure none of the code above managed to slip in a * delayed item @@ -1967,6 +2029,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, */ btrfs_free_log_root_tree(trans, root->fs_info); + /* + * Since fs roots are all committed, we can get quite accurate + * new_roots. So let's do quota accounting. + */ + ret = btrfs_qgroup_account_extents(trans, root->fs_info); + if (ret < 0) { + mutex_unlock(&root->fs_info->tree_log_mutex); + mutex_unlock(&root->fs_info->reloc_mutex); + goto scrub_continue; + } + ret = commit_cowonly_roots(trans, root); if (ret) { mutex_unlock(&root->fs_info->tree_log_mutex); @@ -2003,6 +2076,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, assert_qgroups_uptodate(trans); ASSERT(list_empty(&cur_trans->dirty_bgs)); + ASSERT(list_empty(&cur_trans->io_bgs)); update_super_roots(root); btrfs_set_super_log_root(root->fs_info->super_copy, 0); @@ -2016,6 +2090,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, clear_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags); clear_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags); + btrfs_trans_release_chunk_metadata(trans); + spin_lock(&root->fs_info->trans_lock); cur_trans->state = TRANS_STATE_UNBLOCKED; root->fs_info->running_transaction = NULL; @@ -2076,7 +2152,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, kmem_cache_free(btrfs_trans_handle_cachep, trans); - if (current != root->fs_info->transaction_kthread) + if (current != root->fs_info->transaction_kthread && + current != root->fs_info->cleaner_kthread) btrfs_run_delayed_iputs(root); return ret; @@ -2085,6 +2162,7 @@ scrub_continue: btrfs_scrub_continue(root); cleanup_transaction: btrfs_trans_release_metadata(trans, root); + btrfs_trans_release_chunk_metadata(trans); trans->block_rsv = NULL; if (trans->qgroup_reserved) { btrfs_qgroup_free(root, trans->qgroup_reserved); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 937050a2b68e..eb09c2067fa8 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -64,9 +64,19 @@ struct btrfs_transaction { struct list_head pending_ordered; struct list_head switch_commits; struct list_head dirty_bgs; + struct list_head io_bgs; + u64 num_dirty_bgs; + + /* + * we need to make sure block group deletion doesn't race with + * free space cache writeout.
This mutex keeps them from stomping + * on each other + */ + struct mutex cache_write_mutex; spinlock_t dirty_bgs_lock; struct btrfs_delayed_ref_root delayed_refs; int aborted; + int dirty_bg_run; }; #define __TRANS_FREEZABLE (1U << 0) @@ -92,6 +102,7 @@ struct btrfs_transaction { struct btrfs_trans_handle { u64 transid; u64 bytes_reserved; + u64 chunk_bytes_reserved; u64 qgroup_reserved; unsigned long use_count; unsigned long blocks_reserved; @@ -136,9 +147,34 @@ struct btrfs_pending_snapshot { static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, struct inode *inode) { + spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->last_trans = trans->transaction->transid; BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit; + spin_unlock(&BTRFS_I(inode)->lock); +} + +/* + * Make the qgroup code skip the given qgroupid, meaning the old/new_roots + * for the qgroup won't contain it. + */ +static inline void btrfs_set_skip_qgroup(struct btrfs_trans_handle *trans, + u64 qgroupid) +{ + struct btrfs_delayed_ref_root *delayed_refs; + + delayed_refs = &trans->transaction->delayed_refs; + WARN_ON(delayed_refs->qgroup_to_skip); + delayed_refs->qgroup_to_skip = qgroupid; +} + +static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans) +{ + struct btrfs_delayed_ref_root *delayed_refs; + + delayed_refs = &trans->transaction->delayed_refs; + WARN_ON(!delayed_refs->qgroup_to_skip); + delayed_refs->qgroup_to_skip = 0; } int btrfs_end_transaction(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index a63719cc9578..a4b9c8b2d35a 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -52,9 +52,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) goto out; - if (btrfs_test_opt(root, SSD)) - goto out; - path = btrfs_alloc_path(); if (!path) return -ENOMEM; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c5b8ba37f88e..9c45431e69ab 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -492,11 +492,19 @@ insert: if (btrfs_inode_generation(eb, src_item) == 0) { struct extent_buffer *dst_eb = path->nodes[0]; + const u64 ino_size = btrfs_inode_size(eb, src_item); + /* + * For regular files an ino_size == 0 is used only when + * logging that an inode exists, as part of a directory + * fsync, and the inode wasn't fsynced before. In this + * case don't set the size of the inode in the fs/subvol + * tree, otherwise we would be throwing valid data away.
+ */ if (S_ISREG(btrfs_inode_mode(eb, src_item)) && - S_ISREG(btrfs_inode_mode(dst_eb, dst_item))) { + S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) && + ino_size != 0) { struct btrfs_map_token token; - u64 ino_size = btrfs_inode_size(eb, src_item); btrfs_init_map_token(&token); btrfs_set_token_inode_size(dst_eb, dst_item, @@ -1951,6 +1959,104 @@ out: return ret; } +static int replay_xattr_deletes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *log, + struct btrfs_path *path, + const u64 ino) +{ + struct btrfs_key search_key; + struct btrfs_path *log_path; + int i; + int nritems; + int ret; + + log_path = btrfs_alloc_path(); + if (!log_path) + return -ENOMEM; + + search_key.objectid = ino; + search_key.type = BTRFS_XATTR_ITEM_KEY; + search_key.offset = 0; +again: + ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + if (ret < 0) + goto out; +process_leaf: + nritems = btrfs_header_nritems(path->nodes[0]); + for (i = path->slots[0]; i < nritems; i++) { + struct btrfs_key key; + struct btrfs_dir_item *di; + struct btrfs_dir_item *log_di; + u32 total_size; + u32 cur; + + btrfs_item_key_to_cpu(path->nodes[0], &key, i); + if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) { + ret = 0; + goto out; + } + + di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item); + total_size = btrfs_item_size_nr(path->nodes[0], i); + cur = 0; + while (cur < total_size) { + u16 name_len = btrfs_dir_name_len(path->nodes[0], di); + u16 data_len = btrfs_dir_data_len(path->nodes[0], di); + u32 this_len = sizeof(*di) + name_len + data_len; + char *name; + + name = kmalloc(name_len, GFP_NOFS); + if (!name) { + ret = -ENOMEM; + goto out; + } + read_extent_buffer(path->nodes[0], name, + (unsigned long)(di + 1), name_len); + + log_di = btrfs_lookup_xattr(NULL, log, log_path, ino, + name, name_len, 0); + btrfs_release_path(log_path); + if (!log_di) { + /* Doesn't exist in log tree, so delete it. */ + btrfs_release_path(path); + di = btrfs_lookup_xattr(trans, root, path, ino, + name, name_len, -1); + kfree(name); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + ASSERT(di); + ret = btrfs_delete_one_dir_name(trans, root, + path, di); + if (ret) + goto out; + btrfs_release_path(path); + search_key = key; + goto again; + } + kfree(name); + if (IS_ERR(log_di)) { + ret = PTR_ERR(log_di); + goto out; + } + cur += this_len; + di = (struct btrfs_dir_item *)((char *)di + this_len); + } + } + ret = btrfs_next_leaf(root, path); + if (ret > 0) + ret = 0; + else if (ret == 0) + goto process_leaf; +out: + btrfs_free_path(log_path); + btrfs_release_path(path); + return ret; +} + + /* * deletion replay happens before we copy any new directory items * out of the log or out of backreferences from inodes. 
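The decision rule at the heart of replay_xattr_deletes() above: an xattr name present in the fs/subvol tree but absent from the log must have been deleted before the last fsync, so the deletion is replayed. Reduced to its essence (error handling and the packed dir_item iteration trimmed):

        log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
                                    name, name_len, 0);
        if (!log_di) {
                /* in the fs tree, not in the log: replay the deletion */
                di = btrfs_lookup_xattr(trans, root, path, ino,
                                        name, name_len, -1);
                if (!IS_ERR(di))
                        ret = btrfs_delete_one_dir_name(trans, root, path, di);
        }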
It @@ -2104,6 +2210,10 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, inode_item = btrfs_item_ptr(eb, i, struct btrfs_inode_item); + ret = replay_xattr_deletes(wc->trans, root, log, + path, key.objectid); + if (ret) + break; mode = btrfs_inode_mode(eb, inode_item); if (S_ISDIR(mode)) { ret = replay_dir_deletes(wc->trans, @@ -2230,7 +2340,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, if (trans) { btrfs_tree_lock(next); btrfs_set_lock_blocking(next); - clean_tree_block(trans, root, next); + clean_tree_block(trans, root->fs_info, + next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); } @@ -2308,7 +2419,8 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, if (trans) { btrfs_tree_lock(next); btrfs_set_lock_blocking(next); - clean_tree_block(trans, root, next); + clean_tree_block(trans, root->fs_info, + next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); } @@ -2384,7 +2496,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, if (trans) { btrfs_tree_lock(next); btrfs_set_lock_blocking(next); - clean_tree_block(trans, log, next); + clean_tree_block(trans, log->fs_info, next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); } @@ -3020,6 +3132,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, struct btrfs_path *path, struct btrfs_path *dst_path, int key_type, + struct btrfs_log_ctx *ctx, u64 min_offset, u64 *last_offset_ret) { struct btrfs_key min_key; @@ -3104,6 +3217,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, src = path->nodes[0]; nritems = btrfs_header_nritems(src); for (i = path->slots[0]; i < nritems; i++) { + struct btrfs_dir_item *di; + btrfs_item_key_to_cpu(src, &min_key, i); if (min_key.objectid != ino || min_key.type != key_type) @@ -3114,6 +3229,37 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, err = ret; goto done; } + + /* + * We must make sure that when we log a directory entry, + * the corresponding inode, after log replay, has a + * matching link count. For example: + * + * touch foo + * mkdir mydir + * sync + * ln foo mydir/bar + * xfs_io -c "fsync" mydir + * <crash> + * <mount fs and log replay> + * + * This would result in an fsync log that, when replayed, + * leaves our file inode with a link count of 1 even + * though two directory entries point to the same inode. + * After removing one of the names, it would not be + * possible to remove the other name, which always + * resulted in stale file handle errors, and it would not + * be possible to rmdir the parent directory, since + * its i_size could never decrement to the value + * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
+ */ + di = btrfs_item_ptr(src, i, struct btrfs_dir_item); + btrfs_dir_item_key_to_cpu(src, di, &tmp); + if (ctx && + (btrfs_dir_transid(src, di) == trans->transid || + btrfs_dir_type(src, di) == BTRFS_FT_DIR) && + tmp.type != BTRFS_ROOT_ITEM_KEY) + ctx->log_new_dentries = true; } path->slots[0] = nritems; @@ -3175,7 +3321,8 @@ done: static noinline int log_directory_changes(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, struct btrfs_path *path, - struct btrfs_path *dst_path) + struct btrfs_path *dst_path, + struct btrfs_log_ctx *ctx) { u64 min_key; u64 max_key; @@ -3187,7 +3334,7 @@ again: max_key = 0; while (1) { ret = log_dir_items(trans, root, inode, path, - dst_path, key_type, min_key, + dst_path, key_type, ctx, min_key, &max_key); if (ret) return ret; @@ -3734,12 +3881,6 @@ static int wait_ordered_extents(struct btrfs_trans_handle *trans, &ordered->flags)) continue; - if (ordered->csum_bytes_left) { - btrfs_start_ordered_extent(inode, ordered, 0); - wait_event(ordered->wait, - ordered->csum_bytes_left == 0); - } - list_for_each_entry(sum, &ordered->list, list) { ret = btrfs_csum_file_blocks(trans, log, sum); if (ret) @@ -3963,7 +4104,7 @@ static int logged_inode_size(struct btrfs_root *log, struct inode *inode, if (ret < 0) { return ret; } else if (ret > 0) { - *size_ret = i_size_read(inode); + *size_ret = 0; } else { struct btrfs_inode_item *item; @@ -3976,6 +4117,187 @@ static int logged_inode_size(struct btrfs_root *log, struct inode *inode, return 0; } +/* + * At the moment we always log all xattrs. This is to figure out at log replay + * time which xattrs must have their deletion replayed. If an xattr is missing + * in the log tree and exists in the fs/subvol tree, we delete it. This is + * because if an xattr is deleted, the inode is fsynced and then a power + * failure happens, the log is replayed the next time the fs is mounted, and + * we want the xattr to no longer exist (the same behaviour as other + * filesystems with a journal: ext3/4, xfs, f2fs, etc). + */
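A concrete sequence the log-all-xattrs policy is meant to get right, in the style of the reproducers quoted elsewhere in this file (tool names assumed):

        setfattr -n user.foo -v bar file
        xfs_io -c fsync file
        setfattr -x user.foo file
        xfs_io -c fsync file
        <power failure>
        <mount fs and log replay>

Because the second fsync logged the complete xattr set, now without user.foo, replay finds user.foo in the fs tree, misses it in the log, and deletes it.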
+static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, + struct btrfs_path *path, + struct btrfs_path *dst_path) +{ + int ret; + struct btrfs_key key; + const u64 ino = btrfs_ino(inode); + int ins_nr = 0; + int start_slot = 0; + + key.objectid = ino; + key.type = BTRFS_XATTR_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ret; + + while (true) { + int slot = path->slots[0]; + struct extent_buffer *leaf = path->nodes[0]; + int nritems = btrfs_header_nritems(leaf); + + if (slot >= nritems) { + if (ins_nr > 0) { + u64 last_extent = 0; + + ret = copy_items(trans, inode, dst_path, path, + &last_extent, start_slot, + ins_nr, 1, 0); + /* can't be 1, extent items aren't processed */ + ASSERT(ret <= 0); + if (ret < 0) + return ret; + ins_nr = 0; + } + ret = btrfs_next_leaf(root, path); + if (ret < 0) + return ret; + else if (ret > 0) + break; + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) + break; + + if (ins_nr == 0) + start_slot = slot; + ins_nr++; + path->slots[0]++; + cond_resched(); + } + if (ins_nr > 0) { + u64 last_extent = 0; + + ret = copy_items(trans, inode, dst_path, path, + &last_extent, start_slot, + ins_nr, 1, 0); + /* can't be 1, extent items aren't processed */ + ASSERT(ret <= 0); + if (ret < 0) + return ret; + } + + return 0; +} + +/* + * If the no holes feature is enabled we need to make sure any hole between the + * last extent and the i_size of our inode is explicitly marked in the log. This + * is to make sure that doing something like: + * + * 1) create file with 128Kb of data + * 2) truncate file to 64Kb + * 3) truncate file to 256Kb + * 4) fsync file + * 5) <crash/power failure> + * 6) mount fs and trigger log replay + * + * will give us a file with a size of 256Kb: the first 64Kb of data match what + * the file had in its first 64Kb at step 1, and the last 192Kb of the file + * correspond to a hole. The presence of explicit holes in a log tree is what + * guarantees that log replay will remove/adjust file extent items in the + * fs/subvol tree. + * + * Here we do not need to care about holes between extents, as that is already + * done by copy_items(). We also only need to do this in the full sync path, + * where we look up extents from the fs/subvol tree only. In the fast path case, + * we look at the list of modified extent maps and if any represents a hole, we + * insert a corresponding extent representing a hole in the log tree.
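Working through the comment's own numbers, assuming a 4K sectorsize: after steps 2 and 3 the last extent covers [0, 64K) while i_size is 256K, so the function below would log a single hole item via the call shown in the hunk:

        hole_start = 64 * 1024;                 /* key.offset + extent length */
        hole_size = 256 * 1024 - hole_start;    /* 192K, already 4K aligned */
        ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0,
                                       hole_size, 0, hole_size, 0, 0, 0);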
+ */ +static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, + struct btrfs_path *path) +{ + int ret; + struct btrfs_key key; + u64 hole_start; + u64 hole_size; + struct extent_buffer *leaf; + struct btrfs_root *log = root->log_root; + const u64 ino = btrfs_ino(inode); + const u64 i_size = i_size_read(inode); + + if (!btrfs_fs_incompat(root->fs_info, NO_HOLES)) + return 0; + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + ASSERT(ret != 0); + if (ret < 0) + return ret; + + ASSERT(path->slots[0] > 0); + path->slots[0]--; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) { + /* inode does not have any extents */ + hole_start = 0; + hole_size = i_size; + } else { + struct btrfs_file_extent_item *extent; + u64 len; + + /* + * If there's an extent beyond i_size, an explicit hole was + * already inserted by copy_items(). + */ + if (key.offset >= i_size) + return 0; + + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + if (btrfs_file_extent_type(leaf, extent) == + BTRFS_FILE_EXTENT_INLINE) { + len = btrfs_file_extent_inline_len(leaf, + path->slots[0], + extent); + ASSERT(len == i_size); + return 0; + } + + len = btrfs_file_extent_num_bytes(leaf, extent); + /* Last extent goes beyond i_size, no need to log a hole. */ + if (key.offset + len > i_size) + return 0; + hole_start = key.offset + len; + hole_size = i_size - hole_start; + } + btrfs_release_path(path); + + /* Last extent ends at i_size. */ + if (hole_size == 0) + return 0; + + hole_size = ALIGN(hole_size, root->sectorsize); + ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0, + hole_size, 0, hole_size, 0, 0, 0); + return ret; +} + /* log a single inode in the tree log. * At least one parent directory for this inode must exist in the tree * or be logged already. 
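The btrfs_log_inode() rework in the hunks below reorders the per-inode steps and pivots on a new need_log_inode_item flag; a minimal sketch of the idea:

        bool need_log_inode_item = true;

        /* while copying items from the fs/subvol tree: */
        if (min_key.type == BTRFS_INODE_ITEM_KEY)
                need_log_inode_item = false;    /* inode item already copied */

        /* only after xattrs and any trailing hole have been logged: */
        if (need_log_inode_item)
                err = log_inode_item(trans, log, dst_path, inode);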
@@ -4014,6 +4336,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, u64 ino = btrfs_ino(inode); struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; u64 logged_isize = 0; + bool need_log_inode_item = true; path = btrfs_alloc_path(); if (!path) @@ -4070,10 +4393,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, if (S_ISDIR(inode->i_mode)) { int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; - if (inode_only == LOG_INODE_EXISTS) { - max_key_type = BTRFS_INODE_EXTREF_KEY; - max_key.type = max_key_type; - } + if (inode_only == LOG_INODE_EXISTS) + max_key_type = BTRFS_XATTR_ITEM_KEY; ret = drop_objectid_items(trans, log, path, ino, max_key_type); } else { if (inode_only == LOG_INODE_EXISTS) { @@ -4098,7 +4419,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags)) { if (inode_only == LOG_INODE_EXISTS) { - max_key.type = BTRFS_INODE_EXTREF_KEY; + max_key.type = BTRFS_XATTR_ITEM_KEY; ret = drop_objectid_items(trans, log, path, ino, max_key.type); } else { @@ -4106,30 +4427,24 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, &BTRFS_I(inode)->runtime_flags); clear_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); - ret = btrfs_truncate_inode_items(trans, log, - inode, 0, 0); + while(1) { + ret = btrfs_truncate_inode_items(trans, + log, inode, 0, 0); + if (ret != -EAGAIN) + break; + } } - } else if (test_bit(BTRFS_INODE_COPY_EVERYTHING, - &BTRFS_I(inode)->runtime_flags) || + } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, + &BTRFS_I(inode)->runtime_flags) || inode_only == LOG_INODE_EXISTS) { - if (inode_only == LOG_INODE_ALL) { - clear_bit(BTRFS_INODE_COPY_EVERYTHING, - &BTRFS_I(inode)->runtime_flags); + if (inode_only == LOG_INODE_ALL) fast_search = true; - max_key.type = BTRFS_XATTR_ITEM_KEY; - } else { - max_key.type = BTRFS_INODE_EXTREF_KEY; - } + max_key.type = BTRFS_XATTR_ITEM_KEY; ret = drop_objectid_items(trans, log, path, ino, max_key.type); } else { if (inode_only == LOG_INODE_ALL) fast_search = true; - ret = log_inode_item(trans, log, dst_path, inode); - if (ret) { - err = ret; - goto out_unlock; - } goto log_extents; } @@ -4152,6 +4467,28 @@ again: if (min_key.type > max_key.type) break; + if (min_key.type == BTRFS_INODE_ITEM_KEY) + need_log_inode_item = false; + + /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ + if (min_key.type == BTRFS_XATTR_ITEM_KEY) { + if (ins_nr == 0) + goto next_slot; + ret = copy_items(trans, inode, dst_path, path, + &last_extent, ins_start_slot, + ins_nr, inode_only, logged_isize); + if (ret < 0) { + err = ret; + goto out_unlock; + } + ins_nr = 0; + if (ret) { + btrfs_release_path(path); + continue; + } + goto next_slot; + } + src = path->nodes[0]; if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { ins_nr++; @@ -4219,9 +4556,26 @@ next_slot: ins_nr = 0; } + btrfs_release_path(path); + btrfs_release_path(dst_path); + err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path); + if (err) + goto out_unlock; + if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { + btrfs_release_path(path); + btrfs_release_path(dst_path); + err = btrfs_log_trailing_hole(trans, root, inode, path); + if (err) + goto out_unlock; + } log_extents: btrfs_release_path(path); btrfs_release_path(dst_path); + if (need_log_inode_item) { + err = log_inode_item(trans, log, dst_path, inode); + if (err) + goto out_unlock; + } if (fast_search) { /* * Some ordered extents started by fsync might have 
completed @@ -4277,15 +4631,18 @@ log_extents: } if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { - ret = log_directory_changes(trans, root, inode, path, dst_path); + ret = log_directory_changes(trans, root, inode, path, dst_path, + ctx); if (ret) { err = ret; goto out_unlock; } } + spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->logged_trans = trans->transid; BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans; + spin_unlock(&BTRFS_I(inode)->lock); out_unlock: if (unlikely(err)) btrfs_put_logged_extents(&logged_list); @@ -4327,9 +4684,9 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, goto out; if (!S_ISDIR(inode->i_mode)) { - if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) + if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb) goto out; - inode = parent->d_inode; + inode = d_inode(parent); } while (1) { @@ -4355,7 +4712,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, break; } - if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) + if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb) break; if (IS_ROOT(parent)) @@ -4364,7 +4721,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans, parent = dget_parent(parent); dput(old_parent); old_parent = parent; - inode = parent->d_inode; + inode = d_inode(parent); } dput(old_parent); @@ -4372,6 +4729,181 @@ out: return ret; } +struct btrfs_dir_list { + u64 ino; + struct list_head list; +}; + +/* + * Log the inodes of the new dentries of a directory. See log_dir_items() for + * details about why it is needed. + * This is a recursive operation - if an existing dentry corresponds to a + * directory, that directory's new entries are logged too (same behaviour as + * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes + * the dentries point to we do not lock their i_mutex, otherwise lockdep + * complains about the following circular lock dependency / possible deadlock: + * + * CPU0 CPU1 + * ---- ---- + * lock(&type->i_mutex_dir_key#3/2); + * lock(sb_internal#2); + * lock(&type->i_mutex_dir_key#3/2); + * lock(&sb->s_type->i_mutex_key#14); + * + * Where sb_internal is the lock (a counter that works as a lock) acquired by + * sb_start_intwrite() in btrfs_start_transaction(). + * Not locking i_mutex of the inodes is still safe because: + * + * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible + * that while logging the inode new references (names) are added or removed + * from the inode, leaving the logged inode item with a link count that does + * not match the number of logged inode reference items. This is fine because + * at log replay time we compute the real number of links and correct the + * link count in the inode item (see replay_one_buffer() and + * link_to_fixup_dir()); + * + * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that + * while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and + * BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item + * has a size that doesn't match the sum of the lengths of all the logged + * names. This does not result in a problem because if a dir_item key is + * logged but its matching dir_index key is not logged, at log replay time we + * don't use it to replay the respective name (see replay_one_name()).
On the + * other hand if only the dir_index key ends up being logged, the respective + * name is added to the fs/subvol tree with both the dir_item and dir_index + * keys created (see replay_one_name()). + * The directory's inode item with a wrong i_size is not a problem as well, + * since we don't use it at log replay time to set the i_size in the inode + * item of the fs/subvol tree (see overwrite_item()). + */ +static int log_new_dir_dentries(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *start_inode, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_root *log = root->log_root; + struct btrfs_path *path; + LIST_HEAD(dir_list); + struct btrfs_dir_list *dir_elem; + int ret = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS); + if (!dir_elem) { + btrfs_free_path(path); + return -ENOMEM; + } + dir_elem->ino = btrfs_ino(start_inode); + list_add_tail(&dir_elem->list, &dir_list); + + while (!list_empty(&dir_list)) { + struct extent_buffer *leaf; + struct btrfs_key min_key; + int nritems; + int i; + + dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list, + list); + if (ret) + goto next_dir_inode; + + min_key.objectid = dir_elem->ino; + min_key.type = BTRFS_DIR_ITEM_KEY; + min_key.offset = 0; +again: + btrfs_release_path(path); + ret = btrfs_search_forward(log, &min_key, path, trans->transid); + if (ret < 0) { + goto next_dir_inode; + } else if (ret > 0) { + ret = 0; + goto next_dir_inode; + } + +process_leaf: + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + for (i = path->slots[0]; i < nritems; i++) { + struct btrfs_dir_item *di; + struct btrfs_key di_key; + struct inode *di_inode; + struct btrfs_dir_list *new_dir_elem; + int log_mode = LOG_INODE_EXISTS; + int type; + + btrfs_item_key_to_cpu(leaf, &min_key, i); + if (min_key.objectid != dir_elem->ino || + min_key.type != BTRFS_DIR_ITEM_KEY) + goto next_dir_inode; + + di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); + type = btrfs_dir_type(leaf, di); + if (btrfs_dir_transid(leaf, di) < trans->transid && + type != BTRFS_FT_DIR) + continue; + btrfs_dir_item_key_to_cpu(leaf, di, &di_key); + if (di_key.type == BTRFS_ROOT_ITEM_KEY) + continue; + + di_inode = btrfs_iget(root->fs_info->sb, &di_key, + root, NULL); + if (IS_ERR(di_inode)) { + ret = PTR_ERR(di_inode); + goto next_dir_inode; + } + + if (btrfs_inode_in_log(di_inode, trans->transid)) { + iput(di_inode); + continue; + } + + ctx->log_new_dentries = false; + if (type == BTRFS_FT_DIR) + log_mode = LOG_INODE_ALL; + btrfs_release_path(path); + ret = btrfs_log_inode(trans, root, di_inode, + log_mode, 0, LLONG_MAX, ctx); + iput(di_inode); + if (ret) + goto next_dir_inode; + if (ctx->log_new_dentries) { + new_dir_elem = kmalloc(sizeof(*new_dir_elem), + GFP_NOFS); + if (!new_dir_elem) { + ret = -ENOMEM; + goto next_dir_inode; + } + new_dir_elem->ino = di_key.objectid; + list_add_tail(&new_dir_elem->list, &dir_list); + } + break; + } + if (i == nritems) { + ret = btrfs_next_leaf(log, path); + if (ret < 0) { + goto next_dir_inode; + } else if (ret > 0) { + ret = 0; + goto next_dir_inode; + } + goto process_leaf; + } + if (min_key.offset < (u64)-1) { + min_key.offset++; + goto again; + } +next_dir_inode: + list_del(&dir_elem->list); + kfree(dir_elem); + } + + btrfs_free_path(path); + return ret; +} + /* * helper function around btrfs_log_inode to make sure newly created * parent directories also end up in the log. 
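log_new_dir_dentries() above avoids recursion by draining a FIFO of directory inode numbers; any child directory discovered along the way is queued rather than descended into. The queue mechanics, trimmed to a skeleton:

        LIST_HEAD(dir_list);
        struct btrfs_dir_list *dir_elem;

        dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
        if (!dir_elem)
                return -ENOMEM;
        dir_elem->ino = btrfs_ino(start_inode);
        list_add_tail(&dir_elem->list, &dir_list);

        while (!list_empty(&dir_list)) {
                dir_elem = list_first_entry(&dir_list,
                                            struct btrfs_dir_list, list);
                /* log each new dentry of dir_elem->ino; a child directory
                 * gets its own entry queued on dir_list for a later pass */
                list_del(&dir_elem->list);
                kfree(dir_elem);
        }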
A minimal inode and backref @@ -4394,6 +4926,8 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, const struct dentry * const first_parent = parent; const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans > last_committed); + bool log_dentries = false; + struct inode *orig_inode = inode; sb = inode->i_sb; @@ -4449,11 +4983,14 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_trans; } + if (S_ISDIR(inode->i_mode) && ctx && ctx->log_new_dentries) + log_dentries = true; + while (1) { - if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) + if (!parent || d_really_is_negative(parent) || sb != d_inode(parent)->i_sb) break; - inode = parent->d_inode; + inode = d_inode(parent); if (root != BTRFS_I(inode)->root) break; @@ -4485,7 +5022,10 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, dput(old_parent); old_parent = parent; } - ret = 0; + if (log_dentries) + ret = log_new_dir_dentries(trans, root, orig_inode, ctx); + else + ret = 0; end_trans: dput(old_parent); if (ret < 0) { @@ -4515,7 +5055,7 @@ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct dentry *parent = dget_parent(dentry); int ret; - ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, + ret = btrfs_log_inode_parent(trans, root, d_inode(dentry), parent, start, end, 0, ctx); dput(parent); diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 154990c26dcb..6916a781ea02 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -29,6 +29,7 @@ struct btrfs_log_ctx { int log_ret; int log_transid; int io_err; + bool log_new_dentries; struct list_head list; }; @@ -37,6 +38,7 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx) ctx->log_ret = 0; ctx->log_transid = 0; ctx->io_err = 0; + ctx->log_new_dentries = false; INIT_LIST_HEAD(&ctx->list); } diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index 840a38b2778a..91feb2bdefee 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -132,6 +132,15 @@ static struct ulist_node *ulist_rbtree_search(struct ulist *ulist, u64 val) return NULL; } +static void ulist_rbtree_erase(struct ulist *ulist, struct ulist_node *node) +{ + rb_erase(&node->rb_node, &ulist->root); + list_del(&node->list); + kfree(node); + BUG_ON(ulist->nnodes == 0); + ulist->nnodes--; +} + static int ulist_rbtree_insert(struct ulist *ulist, struct ulist_node *ins) { struct rb_node **p = &ulist->root.rb_node; @@ -197,9 +206,6 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, node->val = val; node->aux = aux; -#ifdef CONFIG_BTRFS_DEBUG - node->seqnum = ulist->nnodes; -#endif ret = ulist_rbtree_insert(ulist, node); ASSERT(!ret); @@ -209,6 +215,33 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, return 1; } +/* + * ulist_del - delete one node from ulist + * @ulist: ulist to remove node from + * @val: value to delete + * @aux: aux to delete + * + * The deletion will only be done when *BOTH* val and aux match. + * Return 0 for successful delete. + * Return > 0 for not found. + */
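A usage sketch for the new ulist_del(), assuming a standalone list; note that deletion requires both val and aux to match:

        struct ulist *ul = ulist_alloc(GFP_KERNEL);

        ulist_add(ul, 256, 1, GFP_KERNEL);
        ret = ulist_del(ul, 256, 0);    /* returns 1: aux mismatch, kept */
        ret = ulist_del(ul, 256, 1);    /* returns 0: found and removed */
        ulist_free(ul);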
+int ulist_del(struct ulist *ulist, u64 val, u64 aux) +{ + struct ulist_node *node; + + node = ulist_rbtree_search(ulist, val); + /* Not found */ + if (!node) + return 1; + + if (node->aux != aux) + return 1; + + /* Found and delete */ + ulist_rbtree_erase(ulist, node); + return 0; +} + /** * ulist_next - iterate ulist * @ulist: ulist to iterate @@ -237,15 +270,7 @@ struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter) uiter->cur_list = uiter->cur_list->next; } else { uiter->cur_list = ulist->nodes.next; -#ifdef CONFIG_BTRFS_DEBUG - uiter->i = 0; -#endif } node = list_entry(uiter->cur_list, struct ulist_node, list); -#ifdef CONFIG_BTRFS_DEBUG - ASSERT(node->seqnum == uiter->i); - ASSERT(uiter->i >= 0 && uiter->i < ulist->nnodes); - uiter->i++; -#endif return node; } diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h index 4c29db604bbe..a01a2c45825f 100644 --- a/fs/btrfs/ulist.h +++ b/fs/btrfs/ulist.h @@ -57,6 +57,7 @@ void ulist_free(struct ulist *ulist); int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask); int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, u64 *old_aux, gfp_t gfp_mask); +int ulist_del(struct ulist *ulist, u64 val, u64 aux); /* just like ulist_add_merge() but take a pointer for the aux data */ static inline int ulist_add_merge_ptr(struct ulist *ulist, u64 val, void *aux, diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 8222f6f74147..fbe7c104531c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -52,6 +52,10 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); DEFINE_MUTEX(uuid_mutex); static LIST_HEAD(fs_uuids); +struct list_head *btrfs_get_fs_uuids(void) +{ + return &fs_uuids; +} static struct btrfs_fs_devices *__alloc_fs_devices(void) { @@ -345,7 +349,7 @@ loop_lock: waitqueue_active(&fs_info->async_submit_wait)) wake_up(&fs_info->async_submit_wait); - BUG_ON(atomic_read(&cur->bi_cnt) == 0); + BUG_ON(atomic_read(&cur->__bi_cnt) == 0); /* * if we're doing the sync list, record that our @@ -366,8 +370,8 @@ loop_lock: btrfsic_submit_bio(cur->bi_rw, cur); num_run++; batch_run++; - if (need_resched()) - cond_resched(); + + cond_resched(); /* * we made progress, there is more work to do and the bdi @@ -400,8 +404,7 @@ loop_lock: * against it before looping */ last_waited = ioc->last_waited; - if (need_resched()) - cond_resched(); + cond_resched(); continue; } spin_lock(&device->io_lock); @@ -442,6 +445,61 @@ static void pending_bios_fn(struct btrfs_work *work) run_scheduled_bios(device); } + +void btrfs_free_stale_device(struct btrfs_device *cur_dev) +{ + struct btrfs_fs_devices *fs_devs; + struct btrfs_device *dev; + + if (!cur_dev->name) + return; + + list_for_each_entry(fs_devs, &fs_uuids, list) { + int del = 1; + + if (fs_devs->opened) + continue; + if (fs_devs->seeding) + continue; + + list_for_each_entry(dev, &fs_devs->devices, dev_list) { + + if (dev == cur_dev) + continue; + if (!dev->name) + continue; + + /* + * TODO: This won't be enough. What if the same device + * comes back (with a new uuid) via its mapper path? + * But for now this does help, as an admin will mostly + * use either the mapper or the non-mapper path throughout.
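The stale-entry scenario btrfs_free_stale_device() targets, with a hypothetical device: mkfs.btrfs /dev/sdb registers the device under one fsid; a second mkfs.btrfs /dev/sdb registers it again under a new fsid, leaving the first, never-opened fs_devices entry pointing at the same node. device_list_add() now prunes that leftover entry, subject to the mapper-path caveat in the TODO above.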
+ */ + rcu_read_lock(); + del = strcmp(rcu_str_deref(dev->name), + rcu_str_deref(cur_dev->name)); + rcu_read_unlock(); + if (!del) + break; + } + + if (!del) { + /* delete the stale device */ + if (fs_devs->num_devices == 1) { + btrfs_sysfs_remove_fsid(fs_devs); + list_del(&fs_devs->list); + free_fs_devices(fs_devs); + } else { + fs_devs->num_devices--; + list_del(&dev->dev_list); + rcu_string_free(dev->name); + kfree(dev); + } + break; + } + } +} + /* * Add new device to list of registered devices * @@ -557,6 +615,12 @@ static noinline int device_list_add(const char *path, if (!fs_devices->opened) device->generation = found_transid; + /* + * if there is new btrfs on an already registered device, + * then remove the stale device entry. + */ + btrfs_free_stale_device(device); + *fs_devices_ret = fs_devices; return ret; @@ -609,8 +673,7 @@ error: return ERR_PTR(-ENOMEM); } -void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info, - struct btrfs_fs_devices *fs_devices, int step) +void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step) { struct btrfs_device *device, *next; struct btrfs_device *latest_dev = NULL; @@ -695,13 +758,13 @@ static void free_device(struct rcu_head *head) static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) { - struct btrfs_device *device; + struct btrfs_device *device, *tmp; if (--fs_devices->opened > 0) return 0; mutex_lock(&fs_devices->device_list_mutex); - list_for_each_entry(device, &fs_devices->devices, dev_list) { + list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) { struct btrfs_device *new_device; struct rcu_string *name; @@ -1060,6 +1123,7 @@ static int contains_pending_extent(struct btrfs_trans_handle *trans, struct extent_map *em; struct list_head *search_list = &trans->transaction->pending_chunks; int ret = 0; + u64 physical_start = *start; again: list_for_each_entry(em, search_list, list) { @@ -1068,15 +1132,31 @@ again: map = (struct map_lookup *)em->bdev; for (i = 0; i < map->num_stripes; i++) { + u64 end; + if (map->stripes[i].dev != device) continue; - if (map->stripes[i].physical >= *start + len || + if (map->stripes[i].physical >= physical_start + len || map->stripes[i].physical + em->orig_block_len <= - *start) + physical_start) continue; - *start = map->stripes[i].physical + - em->orig_block_len; - ret = 1; + /* + * Make sure that while processing the pinned list we do + * not override our *start with a lower value, because + * we can have pinned chunks that fall within this + * device hole and that have lower physical addresses + * than the pending chunks we processed before. If we + * do not take this special care we can end up getting + * 2 pending chunks that start at the same physical + * device offsets because the end offset of a pinned + * chunk can be equal to the start offset of some + * pending chunk. 
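Concrete (hypothetical) numbers for the guard described above: the hole search has advanced *start to 32M, a pinned chunk occupies [0, 16M) and a pending chunk [16M, 32M). Processing the pinned entry yields end = 16M; without the end > *start check in the lines that follow, *start would be rewound to 16M and two chunks could later be assigned the same device offset.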
+ */ + end = map->stripes[i].physical + em->orig_block_len; + if (end > *start) { + *start = end; + ret = 1; + } } } if (search_list == &trans->transaction->pending_chunks) { @@ -1136,11 +1216,11 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) return -ENOMEM; -again: + max_hole_start = search_start; max_hole_size = 0; - hole_size = 0; +again: if (search_start >= search_end || device->is_tgtdev_for_dev_replace) { ret = -ENOSPC; goto out; @@ -1195,8 +1275,14 @@ again: */ if (contains_pending_extent(trans, device, &search_start, - hole_size)) - hole_size = 0; + hole_size)) { + if (key.offset >= search_start) { + hole_size = key.offset - search_start; + } else { + WARN_ON_ONCE(1); + hole_size = 0; + } + } if (hole_size > max_hole_size) { max_hole_start = search_start; @@ -1233,21 +1319,23 @@ next: * allocated dev extents, and when shrinking the device, * search_end may be smaller than search_start. */ - if (search_end > search_start) + if (search_end > search_start) { hole_size = search_end - search_start; - if (hole_size > max_hole_size) { - max_hole_start = search_start; - max_hole_size = hole_size; - } + if (contains_pending_extent(trans, device, &search_start, + hole_size)) { + btrfs_release_path(path); + goto again; + } - if (contains_pending_extent(trans, device, &search_start, hole_size)) { - btrfs_release_path(path); - goto again; + if (hole_size > max_hole_size) { + max_hole_start = search_start; + max_hole_size = hole_size; + } } /* See above. */ - if (hole_size < num_bytes) + if (max_hole_size < num_bytes) ret = -ENOSPC; else ret = 0; @@ -1699,7 +1787,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) if (device->bdev) { device->fs_devices->open_devices--; /* remove sysfs entry */ - btrfs_kobj_rm_device(root->fs_info, device); + btrfs_kobj_rm_device(root->fs_info->fs_devices, device); } call_rcu(&device->rcu, free_device); @@ -1868,6 +1956,9 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, mutex_lock(&uuid_mutex); WARN_ON(!tgtdev); mutex_lock(&fs_info->fs_devices->device_list_mutex); + + btrfs_kobj_rm_device(fs_info->fs_devices, tgtdev); + if (tgtdev->bdev) { btrfs_scratch_superblock(tgtdev); fs_info->fs_devices->open_devices--; @@ -2204,7 +2295,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) tmp + 1); /* add sysfs device entry */ - btrfs_kobj_add_device(root->fs_info, device); + btrfs_kobj_add_device(root->fs_info->fs_devices, device); /* * we've got more storage, clear any full flags on the space @@ -2245,8 +2336,9 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) */ snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", root->fs_info->fsid); - if (kobject_rename(&root->fs_info->super_kobj, fsid_buf)) - goto error_trans; + if (kobject_rename(&root->fs_info->fs_devices->super_kobj, + fsid_buf)) + pr_warn("BTRFS: sysfs: failed to create fsid for sprout\n"); } root->fs_info->num_tolerated_disk_barrier_failures = @@ -2282,7 +2374,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) error_trans: btrfs_end_transaction(trans, root); rcu_string_free(device->name); - btrfs_kobj_rm_device(root->fs_info, device); + btrfs_kobj_rm_device(root->fs_info->fs_devices, device); kfree(device); error: blkdev_put(bdev, FMODE_EXCL); @@ -2487,8 +2579,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, } static int btrfs_free_chunk(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 chunk_tree, u64 chunk_objectid, + 
struct btrfs_root *root, u64 chunk_objectid, u64 chunk_offset) { int ret; @@ -2580,7 +2671,6 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, struct map_lookup *map; u64 dev_extent_len = 0; u64 chunk_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; - u64 chunk_tree = root->fs_info->chunk_root->objectid; int i, ret = 0; /* Just in case */ @@ -2604,6 +2694,9 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, return -EINVAL; } map = (struct map_lookup *)em->bdev; + lock_chunks(root->fs_info->chunk_root); + check_system_chunk(trans, extent_root, map->type); + unlock_chunks(root->fs_info->chunk_root); for (i = 0; i < map->num_stripes; i++) { struct btrfs_device *device = map->stripes[i].dev; @@ -2634,8 +2727,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, } } } - ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid, - chunk_offset); + ret = btrfs_free_chunk(trans, root, chunk_objectid, chunk_offset); if (ret) { btrfs_abort_transaction(trans, root, ret); goto out; @@ -2664,8 +2756,8 @@ out: } static int btrfs_relocate_chunk(struct btrfs_root *root, - u64 chunk_tree, u64 chunk_objectid, - u64 chunk_offset) + u64 chunk_objectid, + u64 chunk_offset) { struct btrfs_root *extent_root; struct btrfs_trans_handle *trans; @@ -2674,6 +2766,20 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, root = root->fs_info->chunk_root; extent_root = root->fs_info->extent_root; + /* + * Prevent races with automatic removal of unused block groups. + * After we relocate and before we remove the chunk with offset + * chunk_offset, automatic removal of the block group can kick in, + * resulting in a failure when calling btrfs_remove_chunk() below. + * + * Make sure to acquire this mutex before doing a tree search (dev + * or chunk trees) to find chunks. Otherwise the cleaner kthread might + * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after + * we release the path used to search the chunk/dev tree and before + * the current task acquires this mutex and calls us. 
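The locking rule stated above, reduced to its order of operations; the balance, shrink and relocate-sys-chunks loops changed below all follow this sketch (fs_info taken from the chunk root):

        mutex_lock(&fs_info->delete_unused_bgs_mutex);
        ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
        /* ... pick a chunk from the search result ... */
        btrfs_release_path(path);
        ret = btrfs_relocate_chunk(chunk_root, chunk_objectid, chunk_offset);
        mutex_unlock(&fs_info->delete_unused_bgs_mutex);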
+ */ + ASSERT(mutex_is_locked(&root->fs_info->delete_unused_bgs_mutex)); + ret = btrfs_can_relocate(extent_root, chunk_offset); if (ret) return -ENOSPC; @@ -2707,7 +2813,6 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root) struct btrfs_chunk *chunk; struct btrfs_key key; struct btrfs_key found_key; - u64 chunk_tree = chunk_root->root_key.objectid; u64 chunk_type; bool retried = false; int failed = 0; @@ -2723,13 +2828,18 @@ again: key.type = BTRFS_CHUNK_ITEM_KEY; while (1) { + mutex_lock(&root->fs_info->delete_unused_bgs_mutex); ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); - if (ret < 0) + if (ret < 0) { + mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); goto error; + } BUG_ON(ret == 0); /* Corruption */ ret = btrfs_previous_item(chunk_root, path, key.objectid, key.type); + if (ret) + mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); if (ret < 0) goto error; if (ret > 0) @@ -2744,7 +2854,7 @@ again: btrfs_release_path(path); if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { - ret = btrfs_relocate_chunk(chunk_root, chunk_tree, + ret = btrfs_relocate_chunk(chunk_root, found_key.objectid, found_key.offset); if (ret == -ENOSPC) @@ -2752,6 +2862,7 @@ again: else BUG_ON(ret); } + mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); if (found_key.offset == 0) break; @@ -3022,7 +3133,7 @@ static int chunk_drange_filter(struct extent_buffer *leaf, stripe_offset = btrfs_stripe_offset(leaf, stripe); stripe_length = btrfs_chunk_length(leaf, chunk); - do_div(stripe_length, factor); + stripe_length = div_u64(stripe_length, factor); if (stripe_offset < bargs->pend && stripe_offset + stripe_length > bargs->pstart) @@ -3208,9 +3319,12 @@ again: goto error; } + mutex_lock(&fs_info->delete_unused_bgs_mutex); ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); - if (ret < 0) + if (ret < 0) { + mutex_unlock(&fs_info->delete_unused_bgs_mutex); goto error; + } /* * this shouldn't happen, it means the last relocate @@ -3222,6 +3336,7 @@ again: ret = btrfs_previous_item(chunk_root, path, 0, BTRFS_CHUNK_ITEM_KEY); if (ret) { + mutex_unlock(&fs_info->delete_unused_bgs_mutex); ret = 0; break; } @@ -3230,8 +3345,10 @@ again: slot = path->slots[0]; btrfs_item_key_to_cpu(leaf, &found_key, slot); - if (found_key.objectid != key.objectid) + if (found_key.objectid != key.objectid) { + mutex_unlock(&fs_info->delete_unused_bgs_mutex); break; + } chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); @@ -3244,10 +3361,13 @@ again: ret = should_balance_chunk(chunk_root, leaf, chunk, found_key.offset); btrfs_release_path(path); - if (!ret) + if (!ret) { + mutex_unlock(&fs_info->delete_unused_bgs_mutex); goto loop; + } if (counting) { + mutex_unlock(&fs_info->delete_unused_bgs_mutex); spin_lock(&fs_info->balance_lock); bctl->stat.expected++; spin_unlock(&fs_info->balance_lock); @@ -3255,9 +3375,9 @@ again: } ret = btrfs_relocate_chunk(chunk_root, - chunk_root->root_key.objectid, found_key.objectid, found_key.offset); + mutex_unlock(&fs_info->delete_unused_bgs_mutex); if (ret && ret != -ENOSPC) goto error; if (ret == -ENOSPC) { @@ -3906,9 +4026,9 @@ int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) uuid_root = btrfs_create_tree(trans, fs_info, BTRFS_UUID_TREE_OBJECTID); if (IS_ERR(uuid_root)) { - btrfs_abort_transaction(trans, tree_root, - PTR_ERR(uuid_root)); - return PTR_ERR(uuid_root); + ret = PTR_ERR(uuid_root); + btrfs_abort_transaction(trans, tree_root, ret); + return ret; } fs_info->uuid_root = uuid_root; @@ -3957,13 +4077,13 @@ int btrfs_shrink_device(struct 
btrfs_device *device, u64 new_size) struct btrfs_dev_extent *dev_extent = NULL; struct btrfs_path *path; u64 length; - u64 chunk_tree; u64 chunk_objectid; u64 chunk_offset; int ret; int slot; int failed = 0; bool retried = false; + bool checked_pending_chunks = false; struct extent_buffer *l; struct btrfs_key key; struct btrfs_super_block *super_copy = root->fs_info->super_copy; @@ -3997,11 +4117,16 @@ again: key.type = BTRFS_DEV_EXTENT_KEY; do { + mutex_lock(&root->fs_info->delete_unused_bgs_mutex); ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) + if (ret < 0) { + mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); goto done; + } ret = btrfs_previous_item(root, path, 0, key.type); + if (ret) + mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); if (ret < 0) goto done; if (ret) { @@ -4015,6 +4140,7 @@ again: btrfs_item_key_to_cpu(l, &key, path->slots[0]); if (key.objectid != device->devid) { + mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); btrfs_release_path(path); break; } @@ -4023,17 +4149,17 @@ again: length = btrfs_dev_extent_length(l, dev_extent); if (key.offset + length <= new_size) { + mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); btrfs_release_path(path); break; } - chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); btrfs_release_path(path); - ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid, - chunk_offset); + ret = btrfs_relocate_chunk(root, chunk_objectid, chunk_offset); + mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); if (ret && ret != -ENOSPC) goto done; if (ret == -ENOSPC) @@ -4046,15 +4172,6 @@ again: goto again; } else if (failed && retried) { ret = -ENOSPC; - lock_chunks(root); - - btrfs_device_set_total_bytes(device, old_size); - if (device->writeable) - device->fs_devices->total_rw_bytes += diff; - spin_lock(&root->fs_info->free_chunk_lock); - root->fs_info->free_chunk_space += diff; - spin_unlock(&root->fs_info->free_chunk_lock); - unlock_chunks(root); goto done; } @@ -4066,6 +4183,35 @@ again: } lock_chunks(root); + + /* + * We checked in the above loop all device extents that were already in + * the device tree. However, before we have updated the device's + * total_bytes to the new size, we might have had chunk allocations that + * have not completed yet (new block groups attached to transaction + * handles), and therefore their device extents were not yet in the + * device tree and we missed them in the loop above. So if we have any + * pending chunk using a device extent that overlaps the device range + * that we can not use anymore, commit the current transaction and + * repeat the search on the device tree - this way we guarantee we will + * not have chunks using device extents that end beyond 'new_size'.
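A hypothetical shrink that needs the rescan: the device shrinks from 100G to 50G while a block group created earlier in the same transaction sits at [60G, 61G). Its device extent is not yet in the device tree, so the first pass misses it; committing the transaction materializes the extent, and the goto again pass can then relocate it. The numbers are invented; the mechanism is the one in the hunk below.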
+ */ + if (!checked_pending_chunks) { + u64 start = new_size; + u64 len = old_size - new_size; + + if (contains_pending_extent(trans, device, &start, len)) { + unlock_chunks(root); + checked_pending_chunks = true; + failed = 0; + retried = false; + ret = btrfs_commit_transaction(trans, root); + if (ret) + goto done; + goto again; + } + } + btrfs_device_set_disk_total_bytes(device, new_size); if (list_empty(&device->resized_list)) list_add_tail(&device->resized_list, @@ -4080,6 +4226,16 @@ again: btrfs_end_transaction(trans, root); done: btrfs_free_path(path); + if (ret) { + lock_chunks(root); + btrfs_device_set_total_bytes(device, old_size); + if (device->writeable) + device->fs_devices->total_rw_bytes += diff; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space += diff; + spin_unlock(&root->fs_info->free_chunk_lock); + unlock_chunks(root); + } return ret; } @@ -4131,7 +4287,7 @@ static int btrfs_cmp_device_info(const void *a, const void *b) return 0; } -static struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { +static const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID10] = { .sub_stripes = 2, .dev_stripes = 1, @@ -4289,7 +4445,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), max_chunk_size); - devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices, + devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), GFP_NOFS); if (!devices_info) return -ENOMEM; @@ -4400,8 +4556,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, */ if (stripe_size * data_stripes > max_chunk_size) { u64 mask = (1ULL << 24) - 1; - stripe_size = max_chunk_size; - do_div(stripe_size, data_stripes); + + stripe_size = div_u64(max_chunk_size, data_stripes); /* bump the answer up to a 16MB boundary */ stripe_size = (stripe_size + mask) & ~mask; @@ -4413,10 +4569,10 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, stripe_size = devices_info[ndevs-1].max_avail; } - do_div(stripe_size, dev_stripes); + stripe_size = div_u64(stripe_size, dev_stripes); /* align to BTRFS_STRIPE_LEN */ - do_div(stripe_size, raid_stripe_len); + stripe_size = div_u64(stripe_size, raid_stripe_len); stripe_size *= raid_stripe_len; map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); @@ -4626,6 +4782,7 @@ int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, { u64 chunk_offset; + ASSERT(mutex_is_locked(&extent_root->fs_info->chunk_mutex)); chunk_offset = find_next_chunk(extent_root->fs_info); return __btrfs_alloc_chunk(trans, extent_root, chunk_offset, type); } @@ -4954,7 +5111,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, u64 stripe_nr_orig; u64 stripe_nr_end; u64 stripe_len; - int stripe_index; + u32 stripe_index; int i; int ret = 0; int num_stripes; @@ -4995,7 +5152,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, * stripe_nr counts the total number of stripes we have to stride * to get to this block */ - do_div(stripe_nr, stripe_len); + stripe_nr = div64_u64(stripe_nr, stripe_len); stripe_offset = stripe_nr * stripe_len; BUG_ON(offset < stripe_offset); @@ -5011,7 +5168,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, /* allow a write of a full stripe, but make sure we don't * allow straddling of stripes */ - do_div(raid56_full_stripe_start, full_stripe_len); + raid56_full_stripe_start = div64_u64(raid56_full_stripe_start, + full_stripe_len); 
raid56_full_stripe_start *= full_stripe_len; } @@ -5136,7 +5294,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, stripe_index = 0; stripe_nr_orig = stripe_nr; stripe_nr_end = ALIGN(offset + *length, map->stripe_len); - do_div(stripe_nr_end, map->stripe_len); + stripe_nr_end = div_u64(stripe_nr_end, map->stripe_len); stripe_end_offset = stripe_nr_end * map->stripe_len - (offset + *length); @@ -5144,7 +5302,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, if (rw & REQ_DISCARD) num_stripes = min_t(u64, map->num_stripes, stripe_nr_end - stripe_nr_orig); - stripe_index = do_div(stripe_nr, map->num_stripes); + stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, + &stripe_index); if (!(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS))) mirror_num = 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { @@ -5170,9 +5329,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { - int factor = map->num_stripes / map->sub_stripes; + u32 factor = map->num_stripes / map->sub_stripes; - stripe_index = do_div(stripe_nr, factor); + stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); stripe_index *= map->sub_stripes; if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) @@ -5198,8 +5357,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) || mirror_num > 1)) { /* push stripe_nr back to the start of the full stripe */ - stripe_nr = raid56_full_stripe_start; - do_div(stripe_nr, stripe_len * nr_data_stripes(map)); + stripe_nr = div_u64(raid56_full_stripe_start, + stripe_len * nr_data_stripes(map)); /* RAID[56] write or recovery. Return all stripes */ num_stripes = map->num_stripes; @@ -5209,32 +5368,32 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, stripe_index = 0; stripe_offset = 0; } else { - u64 tmp; - /* * Mirror #0 or #1 means the original data block. * Mirror #2 is RAID5 parity block. * Mirror #3 is RAID6 Q block. 
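The do_div() conversions running through these hunks all follow one recipe, worth spelling out because the two APIs differ in shape: do_div(n, base) divides n in place and returns the remainder, while div_u64_rem() returns the quotient and stores the remainder through a pointer (div64_u64() is the variant for 64-bit divisors). The equivalent before/after forms, as a sketch (alternatives, not meant to run back to back):

        /* before: quotient left in stripe_nr, remainder returned */
        stripe_index = do_div(stripe_nr, map->num_stripes);

        /* after: quotient returned, remainder via out-pointer */
        stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &stripe_index);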
*/ - stripe_index = do_div(stripe_nr, nr_data_stripes(map)); + stripe_nr = div_u64_rem(stripe_nr, + nr_data_stripes(map), &stripe_index); if (mirror_num > 1) stripe_index = nr_data_stripes(map) + mirror_num - 2; /* We distribute the parity blocks across stripes */ - tmp = stripe_nr + stripe_index; - stripe_index = do_div(tmp, map->num_stripes); + div_u64_rem(stripe_nr + stripe_index, map->num_stripes, + &stripe_index); if (!(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) && mirror_num <= 1) mirror_num = 1; } } else { /* - * after this do_div call, stripe_nr is the number of stripes - * on this device we have to walk to find the data, and - * stripe_index is the number of our device in the stripe array + * after this, stripe_nr is the number of stripes on this + * device we have to walk to find the data, and stripe_index is + * the number of our device in the stripe array */ - stripe_index = do_div(stripe_nr, map->num_stripes); + stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, + &stripe_index); mirror_num = stripe_index + 1; } BUG_ON(stripe_index >= map->num_stripes); @@ -5261,7 +5420,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, need_raid_map && ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) || mirror_num > 1)) { u64 tmp; - int i, rot; + unsigned rot; bbio->raid_map = (u64 *)((void *)bbio->stripes + sizeof(struct btrfs_bio_stripe) * @@ -5269,8 +5428,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, sizeof(int) * tgtdev_indexes); /* Work out the disk rotation on this stripe-set */ - tmp = stripe_nr; - rot = do_div(tmp, num_stripes); + div_u64_rem(stripe_nr, num_stripes, &rot); /* Fill in the logical address of each stripe */ tmp = stripe_nr * nr_data_stripes(map); @@ -5285,8 +5443,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } if (rw & REQ_DISCARD) { - int factor = 0; - int sub_stripes = 0; + u32 factor = 0; + u32 sub_stripes = 0; u64 stripes_per_dev = 0; u32 remaining_stripes = 0; u32 last_stripe = 0; @@ -5437,9 +5595,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } } if (found) { - u64 length = map->stripe_len; - - if (physical_of_found + length <= + if (physical_of_found + map->stripe_len <= dev_replace->cursor_left) { struct btrfs_bio_stripe *tgtdev_stripe = bbio->stripes + num_stripes; @@ -5535,15 +5691,15 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, rmap_len = map->stripe_len; if (map->type & BTRFS_BLOCK_GROUP_RAID10) - do_div(length, map->num_stripes / map->sub_stripes); + length = div_u64(length, map->num_stripes / map->sub_stripes); else if (map->type & BTRFS_BLOCK_GROUP_RAID0) - do_div(length, map->num_stripes); + length = div_u64(length, map->num_stripes); else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - do_div(length, nr_data_stripes(map)); + length = div_u64(length, nr_data_stripes(map)); rmap_len = map->stripe_len * nr_data_stripes(map); } - buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); + buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS); BUG_ON(!buf); /* -ENOMEM */ for (i = 0; i < map->num_stripes; i++) { @@ -5554,11 +5710,11 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, continue; stripe_nr = physical - map->stripes[i].physical; - do_div(stripe_nr, map->stripe_len); + stripe_nr = div_u64(stripe_nr, map->stripe_len); if (map->type & BTRFS_BLOCK_GROUP_RAID10) { stripe_nr = stripe_nr * map->num_stripes + i; - do_div(stripe_nr, map->sub_stripes); + stripe_nr = div_u64(stripe_nr, map->sub_stripes); } else if 
(map->type & BTRFS_BLOCK_GROUP_RAID0) { stripe_nr = stripe_nr * map->num_stripes + i; } /* else if RAID[56], multiply by nr_data_stripes(). @@ -5587,17 +5743,16 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int err) { - if (likely(bbio->flags & BTRFS_BIO_ORIG_BIO_SUBMITTED)) - bio_endio_nodec(bio, err); - else - bio_endio(bio, err); + bio->bi_private = bbio->private; + bio->bi_end_io = bbio->end_io; + bio_endio(bio, err); + btrfs_put_bbio(bbio); } static void btrfs_end_bio(struct bio *bio, int err) { struct btrfs_bio *bbio = bio->bi_private; - struct btrfs_device *dev = bbio->stripes[0].dev; int is_orig_bio = 0; if (err) { @@ -5605,6 +5760,7 @@ static void btrfs_end_bio(struct bio *bio, int err) if (err == -EIO || err == -EREMOTEIO) { unsigned int stripe_index = btrfs_io_bio(bio)->stripe_index; + struct btrfs_device *dev; BUG_ON(stripe_index >= bbio->num_stripes); dev = bbio->stripes[stripe_index].dev; @@ -5634,8 +5790,6 @@ static void btrfs_end_bio(struct bio *bio, int err) bio = bbio->orig_bio; } - bio->bi_private = bbio->private; - bio->bi_end_io = bbio->end_io; btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; /* only send an error to the higher layers if it is * beyond the tolerance of the btrfs bio @@ -5817,8 +5971,6 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) /* Should be the original bio. */ WARN_ON(bio != bbio->orig_bio); - bio->bi_private = bbio->private; - bio->bi_end_io = bbio->end_io; btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; bio->bi_iter.bi_sector = logical >> 9; @@ -5835,8 +5987,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, u64 length = 0; u64 map_length; int ret; - int dev_nr = 0; - int total_devs = 1; + int dev_nr; + int total_devs; struct btrfs_bio *bbio = NULL; length = bio->bi_iter.bi_size; @@ -5877,11 +6029,10 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, BUG(); } - while (dev_nr < total_devs) { + for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { dev = bbio->stripes[dev_nr].dev; if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) { bbio_error(bbio, first_bio, logical); - dev_nr++; continue; } @@ -5894,22 +6045,18 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, ret = breakup_stripe_bio(root, bbio, first_bio, dev, dev_nr, rw, async_submit); BUG_ON(ret); - dev_nr++; continue; } if (dev_nr < total_devs - 1) { bio = btrfs_bio_clone(first_bio, GFP_NOFS); BUG_ON(!bio); /* -ENOMEM */ - } else { + } else bio = first_bio; - bbio->flags |= BTRFS_BIO_ORIG_BIO_SUBMITTED; - } submit_stripe_bio(root, bbio, bio, bbio->stripes[dev_nr].physical, dev_nr, rw, async_submit); - dev_nr++; } btrfs_bio_counter_dec(root->fs_info); return 0; @@ -6082,6 +6229,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, free_extent_map(em); return -EIO; } + btrfs_warn(root->fs_info, "devid %llu uuid %pU is missing", + devid, uuid); } map->stripes[i].dev->in_fs_metadata = 1; } @@ -6201,10 +6350,11 @@ static int read_one_dev(struct btrfs_root *root, if (!btrfs_test_opt(root, DEGRADED)) return -EIO; - btrfs_warn(root->fs_info, "devid %llu missing", devid); device = add_missing_dev(root, fs_devices, devid, dev_uuid); if (!device) return -ENOMEM; + btrfs_warn(root->fs_info, "devid %llu uuid %pU missing", + devid, dev_uuid); } else { if (!device->bdev && !btrfs_test_opt(root, DEGRADED)) return -EIO; @@ -6732,3 +6882,21 @@ void btrfs_update_commit_device_bytes_used(struct
btrfs_root *root, } unlock_chunks(root); } + +void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + while (fs_devices) { + fs_devices->fs_info = fs_info; + fs_devices = fs_devices->seed; + } +} + +void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + while (fs_devices) { + fs_devices->fs_info = NULL; + fs_devices = fs_devices->seed; + } +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 83069dec6898..95842a909e7f 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -253,6 +253,12 @@ struct btrfs_fs_devices { * nonrot flag set */ int rotating; + + struct btrfs_fs_info *fs_info; + /* sysfs kobjects */ + struct kobject super_kobj; + struct kobject *device_dir_kobj; + struct completion kobj_unregister; }; #define BTRFS_BIO_INLINE_CSUM_SIZE 64 @@ -292,8 +298,6 @@ struct btrfs_bio_stripe { struct btrfs_bio; typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err); -#define BTRFS_BIO_ORIG_BIO_SUBMITTED (1 << 0) - struct btrfs_bio { atomic_t refs; atomic_t stripes_pending; @@ -421,8 +425,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, struct btrfs_fs_devices **fs_devices_ret); int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); -void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info, - struct btrfs_fs_devices *fs_devices, int step); +void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step); int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, char *device_path, struct btrfs_device **device); @@ -538,5 +541,8 @@ static inline void unlock_chunks(struct btrfs_root *root) mutex_unlock(&root->fs_info->chunk_mutex); } +struct list_head *btrfs_get_fs_uuids(void); +void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info); +void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info); #endif diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 883b93623bc5..6f518c90e1c1 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -261,7 +261,7 @@ out: ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) { struct btrfs_key key, found_key; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_path *path; struct extent_buffer *leaf; @@ -364,22 +364,42 @@ const struct xattr_handler *btrfs_xattr_handlers[] = { /* * Check if the attribute is in a supported namespace. * - * This applied after the check for the synthetic attributes in the system + * This is applied after the check for the synthetic attributes in the system * namespace. 
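/*
 * Aside: a hedged sketch of the walk the new btrfs_set_fs_info_ptr()
 * and btrfs_reset_fs_info_ptr() helpers above perform.  Each
 * fs_devices may reference a "seed" fs_devices, so maintaining the
 * fs_info back-pointer means following that singly linked chain to
 * the end.  The miniature types below are illustrative stand-ins for
 * the kernel structs.
 */
#include <stddef.h>

struct fs_info { int id; };

struct fs_devices {
	struct fs_info *fs_info;	/* back-pointer being maintained */
	struct fs_devices *seed;	/* next hop in the seed chain */
};

/* pass a valid fs_info to set the whole chain, NULL to reset it */
static void chain_set_fs_info(struct fs_devices *fs_devices,
			      struct fs_info *fs_info)
{
	for (; fs_devices; fs_devices = fs_devices->seed)
		fs_devices->fs_info = fs_info;
}

int main(void)
{
	struct fs_info info = { 1 };
	struct fs_devices seed = { NULL, NULL };
	struct fs_devices top = { NULL, &seed };

	chain_set_fs_info(&top, &info);	/* both links now point at info */
	chain_set_fs_info(&top, NULL);	/* ...and are reset again */
	return 0;
}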
*/ -static bool btrfs_is_valid_xattr(const char *name) +static int btrfs_is_valid_xattr(const char *name) { - return !strncmp(name, XATTR_SECURITY_PREFIX, - XATTR_SECURITY_PREFIX_LEN) || - !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) || - !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || - !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) || - !strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN); + int len = strlen(name); + int prefixlen = 0; + + if (!strncmp(name, XATTR_SECURITY_PREFIX, + XATTR_SECURITY_PREFIX_LEN)) + prefixlen = XATTR_SECURITY_PREFIX_LEN; + else if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + prefixlen = XATTR_SYSTEM_PREFIX_LEN; + else if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) + prefixlen = XATTR_TRUSTED_PREFIX_LEN; + else if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) + prefixlen = XATTR_USER_PREFIX_LEN; + else if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN)) + prefixlen = XATTR_BTRFS_PREFIX_LEN; + else + return -EOPNOTSUPP; + + /* + * The name cannot consist of just prefix + */ + if (len <= prefixlen) + return -EINVAL; + + return 0; } ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size) { + int ret; + /* * If this is a request for a synthetic attribute in the system.* * namespace use the generic infrastructure to resolve a handler @@ -388,15 +408,17 @@ ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) return generic_getxattr(dentry, name, buffer, size); - if (!btrfs_is_valid_xattr(name)) - return -EOPNOTSUPP; - return __btrfs_getxattr(dentry->d_inode, name, buffer, size); + ret = btrfs_is_valid_xattr(name); + if (ret) + return ret; + return __btrfs_getxattr(d_inode(dentry), name, buffer, size); } int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root; + struct btrfs_root *root = BTRFS_I(d_inode(dentry))->root; + int ret; /* * The permission on security.* and system.* is not checked @@ -413,23 +435,25 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) return generic_setxattr(dentry, name, value, size, flags); - if (!btrfs_is_valid_xattr(name)) - return -EOPNOTSUPP; + ret = btrfs_is_valid_xattr(name); + if (ret) + return ret; if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN)) - return btrfs_set_prop(dentry->d_inode, name, + return btrfs_set_prop(d_inode(dentry), name, value, size, flags); if (size == 0) value = ""; /* empty EA, do not remove */ - return __btrfs_setxattr(NULL, dentry->d_inode, name, value, size, + return __btrfs_setxattr(NULL, d_inode(dentry), name, value, size, flags); } int btrfs_removexattr(struct dentry *dentry, const char *name) { - struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root; + struct btrfs_root *root = BTRFS_I(d_inode(dentry))->root; + int ret; /* * The permission on security.* and system.* is not checked @@ -446,14 +470,15 @@ int btrfs_removexattr(struct dentry *dentry, const char *name) if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) return generic_removexattr(dentry, name); - if (!btrfs_is_valid_xattr(name)) - return -EOPNOTSUPP; + ret = btrfs_is_valid_xattr(name); + if (ret) + return ret; if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN)) - return 
btrfs_set_prop(dentry->d_inode, name, + return btrfs_set_prop(d_inode(dentry), name, NULL, 0, XATTR_REPLACE); - return __btrfs_setxattr(NULL, dentry->d_inode, name, NULL, 0, + return __btrfs_setxattr(NULL, d_inode(dentry), name, NULL, 0, XATTR_REPLACE); } diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index fb22fd8d8fb8..82990b8f872b 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -403,7 +403,7 @@ next: return ret; } -struct btrfs_compress_op btrfs_zlib_compress = { +const struct btrfs_compress_op btrfs_zlib_compress = { .alloc_workspace = zlib_alloc_workspace, .free_workspace = zlib_free_workspace, .compress_pages = zlib_compress_pages, diff --git a/fs/buffer.c b/fs/buffer.c index 20805db2c987..1cf7a53a0277 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -30,6 +30,7 @@ #include <linux/quotaops.h> #include <linux/highmem.h> #include <linux/export.h> +#include <linux/backing-dev.h> #include <linux/writeback.h> #include <linux/hash.h> #include <linux/suspend.h> @@ -44,6 +45,9 @@ #include <trace/events/block.h> static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); +static int submit_bh_wbc(int rw, struct buffer_head *bh, + unsigned long bio_flags, + struct writeback_control *wbc); #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) @@ -623,21 +627,22 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode); * * If warn is true, then emit a warning if the page is not uptodate and has * not been truncated. + * + * The caller must hold mem_cgroup_begin_page_stat() lock. */ -static void __set_page_dirty(struct page *page, - struct address_space *mapping, int warn) +static void __set_page_dirty(struct page *page, struct address_space *mapping, + struct mem_cgroup *memcg, int warn) { unsigned long flags; spin_lock_irqsave(&mapping->tree_lock, flags); if (page->mapping) { /* Race with truncate? */ WARN_ON_ONCE(warn && !PageUptodate(page)); - account_page_dirtied(page, mapping); + account_page_dirtied(page, mapping, memcg); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } spin_unlock_irqrestore(&mapping->tree_lock, flags); - __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } /* @@ -668,6 +673,7 @@ static void __set_page_dirty(struct page *page, int __set_page_dirty_buffers(struct page *page) { int newly_dirty; + struct mem_cgroup *memcg; struct address_space *mapping = page_mapping(page); if (unlikely(!mapping)) @@ -683,11 +689,22 @@ int __set_page_dirty_buffers(struct page *page) bh = bh->b_this_page; } while (bh != head); } + /* + * Use mem_cgroup_begin_page_stat() to keep PageDirty synchronized with + * per-memcg dirty page counters.
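/*
 * Aside: a compilable userspace sketch of the ordering the memcg
 * conversion in __set_page_dirty_buffers() establishes here: the
 * page-stat bracket must cover the dirty test-and-set and the
 * accounting, while __mark_inode_dirty() is deliberately moved outside
 * the bracket.  The stubs below stand in for the kernel API and are
 * illustrative only.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct page { atomic_flag dirty; };
struct mem_cgroup { int unused; };

static struct mem_cgroup *begin_page_stat(struct page *page)
{
	static struct mem_cgroup memcg;
	(void)page;
	return &memcg;	/* the kernel version may take a lock here */
}

static void end_page_stat(struct mem_cgroup *memcg) { (void)memcg; }

static bool set_page_dirty_pattern(struct page *page)
{
	struct mem_cgroup *memcg = begin_page_stat(page);
	bool newly_dirty = !atomic_flag_test_and_set(&page->dirty);

	if (newly_dirty)
		puts("account_page_dirtied(...)");	/* inside the bracket */
	end_page_stat(memcg);

	if (newly_dirty)
		puts("__mark_inode_dirty(...)");	/* outside the bracket */
	return newly_dirty;
}

int main(void)
{
	struct page page = { ATOMIC_FLAG_INIT };

	set_page_dirty_pattern(&page);	/* newly dirty: both calls fire */
	set_page_dirty_pattern(&page);	/* already dirty: neither fires */
	return 0;
}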
+ */ + memcg = mem_cgroup_begin_page_stat(page); newly_dirty = !TestSetPageDirty(page); spin_unlock(&mapping->private_lock); if (newly_dirty) - __set_page_dirty(page, mapping, 1); + __set_page_dirty(page, mapping, memcg, 1); + + mem_cgroup_end_page_stat(memcg); + + if (newly_dirty) + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + return newly_dirty; } EXPORT_SYMBOL(__set_page_dirty_buffers); @@ -1158,11 +1175,18 @@ void mark_buffer_dirty(struct buffer_head *bh) if (!test_set_buffer_dirty(bh)) { struct page *page = bh->b_page; + struct address_space *mapping = NULL; + struct mem_cgroup *memcg; + + memcg = mem_cgroup_begin_page_stat(page); if (!TestSetPageDirty(page)) { - struct address_space *mapping = page_mapping(page); + mapping = page_mapping(page); if (mapping) - __set_page_dirty(page, mapping, 0); + __set_page_dirty(page, mapping, memcg, 0); } + mem_cgroup_end_page_stat(memcg); + if (mapping) + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } } EXPORT_SYMBOL(mark_buffer_dirty); @@ -1684,8 +1708,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page, struct buffer_head *bh, *head; unsigned int blocksize, bbits; int nr_underway = 0; - int write_op = (wbc->sync_mode == WB_SYNC_ALL ? - WRITE_SYNC : WRITE); + int write_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); head = create_page_buffers(page, inode, (1 << BH_Dirty)|(1 << BH_Uptodate)); @@ -1774,7 +1797,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page, do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { - submit_bh(write_op, bh); + submit_bh_wbc(write_op, bh, 0, wbc); nr_underway++; } bh = next; @@ -1828,7 +1851,7 @@ recover: struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { clear_buffer_dirty(bh); - submit_bh(write_op, bh); + submit_bh_wbc(write_op, bh, 0, wbc); nr_underway++; } bh = next; @@ -2938,10 +2961,6 @@ static void end_bio_bh_io_sync(struct bio *bio, int err) { struct buffer_head *bh = bio->bi_private; - if (err == -EOPNOTSUPP) { - set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); - } - if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags))) set_bit(BH_Quiet, &bh->b_state); @@ -2997,10 +3016,10 @@ void guard_bio_eod(int rw, struct bio *bio) } } -int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags) +static int submit_bh_wbc(int rw, struct buffer_head *bh, + unsigned long bio_flags, struct writeback_control *wbc) { struct bio *bio; - int ret = 0; BUG_ON(!buffer_locked(bh)); BUG_ON(!buffer_mapped(bh)); @@ -3020,6 +3039,11 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags) */ bio = bio_alloc(GFP_NOIO, 1); + if (wbc) { + wbc_init_bio(wbc, bio); + wbc_account_io(wbc, bh->b_page, bh->b_size); + } + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; bio->bi_io_vec[0].bv_page = bh->b_page; @@ -3041,20 +3065,19 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags) if (buffer_prio(bh)) rw |= REQ_PRIO; - bio_get(bio); submit_bio(rw, bio); + return 0; +} - if (bio_flagged(bio, BIO_EOPNOTSUPP)) - ret = -EOPNOTSUPP; - - bio_put(bio); - return ret; +int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags) +{ + return submit_bh_wbc(rw, bh, bio_flags, NULL); } EXPORT_SYMBOL_GPL(_submit_bh); int submit_bh(int rw, struct buffer_head *bh) { - return _submit_bh(rw, bh, 0); + return submit_bh_wbc(rw, bh, 0, NULL); } EXPORT_SYMBOL(submit_bh); @@ -3244,7 +3267,7 @@ int try_to_free_buffers(struct page *page) * dirty bit from 
being lost. */ if (ret) - cancel_dirty_page(page, PAGE_CACHE_SIZE); + cancel_dirty_page(page); spin_unlock(&mapping->private_lock); out: if (buffers_to_free) { diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c index fbb08e97438d..6af790fc3df8 100644 --- a/fs/cachefiles/bind.c +++ b/fs/cachefiles/bind.c @@ -123,11 +123,11 @@ static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache) /* check parameters */ ret = -EOPNOTSUPP; - if (!root->d_inode || - !root->d_inode->i_op->lookup || - !root->d_inode->i_op->mkdir || - !root->d_inode->i_op->setxattr || - !root->d_inode->i_op->getxattr || + if (d_is_negative(root) || + !d_backing_inode(root)->i_op->lookup || + !d_backing_inode(root)->i_op->mkdir || + !d_backing_inode(root)->i_op->setxattr || + !d_backing_inode(root)->i_op->getxattr || !root->d_sb->s_op->statfs || !root->d_sb->s_op->sync_fs) goto error_unsupported; diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 232426214fdd..afa023dded5b 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -441,12 +441,12 @@ static int cachefiles_attr_changed(struct fscache_object *_object) fscache_set_store_limit(&object->fscache, ni_size); - oi_size = i_size_read(object->backer->d_inode); + oi_size = i_size_read(d_backing_inode(object->backer)); if (oi_size == ni_size) return 0; cachefiles_begin_secure(cache, &saved_cred); - mutex_lock(&object->backer->d_inode->i_mutex); + mutex_lock(&d_inode(object->backer)->i_mutex); /* if there's an extension to a partial page at the end of the backing * file, we need to discard the partial page so that we pick up new @@ -465,7 +465,7 @@ static int cachefiles_attr_changed(struct fscache_object *_object) ret = notify_change(object->backer, &newattrs, NULL); truncate_failed: - mutex_unlock(&object->backer->d_inode->i_mutex); + mutex_unlock(&d_inode(object->backer)->i_mutex); cachefiles_end_secure(cache, saved_cred); if (ret == -EIO) { diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 8c52472d2efa..aecd0859eacb 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -43,7 +43,6 @@ struct cachefiles_object { loff_t i_size; /* object size */ unsigned long flags; #define CACHEFILES_OBJECT_ACTIVE 0 /* T if marked active */ -#define CACHEFILES_OBJECT_BURIED 1 /* T if preemptively buried */ atomic_t usage; /* object usage count */ uint8_t type; /* object type */ uint8_t new; /* T if object new */ diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 1e51714eb33e..fc1056f5c96a 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -97,7 +97,8 @@ static noinline void cachefiles_printk_object(struct cachefiles_object *object, * call vfs_unlink(), vfs_rmdir() or vfs_rename() */ static void cachefiles_mark_object_buried(struct cachefiles_cache *cache, - struct dentry *dentry) + struct dentry *dentry, + enum fscache_why_object_killed why) { struct cachefiles_object *object; struct rb_node *p; @@ -132,8 +133,9 @@ found_dentry: pr_err("\n"); pr_err("Error: Can't preemptively bury live object\n"); cachefiles_printk_object(object, NULL); - } else if (test_and_set_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) { - pr_err("Error: Object already preemptively buried\n"); + } else { + if (why != FSCACHE_OBJECT_IS_STALE) + fscache_object_mark_killed(&object->fscache, why); } write_unlock(&cache->active_lock); @@ -265,7 +267,8 @@ requeue: static int cachefiles_bury_object(struct cachefiles_cache *cache, struct dentry *dir, struct dentry *rep, - bool preemptive) + bool 
preemptive, + enum fscache_why_object_killed why) { struct dentry *grave, *trap; struct path path, path_to_graveyard; @@ -286,13 +289,13 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache, if (ret < 0) { cachefiles_io_error(cache, "Unlink security error"); } else { - ret = vfs_unlink(dir->d_inode, rep, NULL); + ret = vfs_unlink(d_inode(dir), rep, NULL); if (preemptive) - cachefiles_mark_object_buried(cache, rep); + cachefiles_mark_object_buried(cache, rep, why); } - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); if (ret == -EIO) cachefiles_io_error(cache, "Unlink failed"); @@ -303,7 +306,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache, /* directories have to be moved to the graveyard */ _debug("move stale object to graveyard"); - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); try_again: /* first step is to make up a grave dentry in the graveyard */ @@ -355,7 +358,7 @@ try_again: return -EIO; } - if (grave->d_inode) { + if (d_is_positive(grave)) { unlock_rename(cache->graveyard, dir); dput(grave); grave = NULL; @@ -387,14 +390,14 @@ try_again: if (ret < 0) { cachefiles_io_error(cache, "Rename security error %d", ret); } else { - ret = vfs_rename(dir->d_inode, rep, - cache->graveyard->d_inode, grave, NULL, 0); + ret = vfs_rename(d_inode(dir), rep, + d_inode(cache->graveyard), grave, NULL, 0); if (ret != 0 && ret != -ENOMEM) cachefiles_io_error(cache, "Rename failed with error %d", ret); if (preemptive) - cachefiles_mark_object_buried(cache, rep); + cachefiles_mark_object_buried(cache, rep, why); } unlock_rename(cache->graveyard, dir); @@ -415,30 +418,31 @@ int cachefiles_delete_object(struct cachefiles_cache *cache, _enter(",OBJ%x{%p}", object->fscache.debug_id, object->dentry); ASSERT(object->dentry); - ASSERT(object->dentry->d_inode); + ASSERT(d_backing_inode(object->dentry)); ASSERT(object->dentry->d_parent); dir = dget_parent(object->dentry); - mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); - if (test_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) { + if (test_bit(FSCACHE_OBJECT_KILLED_BY_CACHE, &object->fscache.flags)) { /* object allocation for the same key preemptively deleted this * object's file so that it could create its own file */ _debug("object preemptively buried"); - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); ret = 0; } else { /* we need to check that our parent is _still_ our parent - it * may have been renamed */ if (dir == object->dentry->d_parent) { ret = cachefiles_bury_object(cache, dir, - object->dentry, false); + object->dentry, false, + FSCACHE_OBJECT_WAS_RETIRED); } else { /* it got moved, presumably by cachefilesd culling it, * so it's no longer in the key path and we can ignore * it */ - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); ret = 0; } } @@ -473,7 +477,7 @@ int cachefiles_walk_to_object(struct cachefiles_object *parent, path.mnt = cache->mnt; ASSERT(parent->dentry); - ASSERT(parent->dentry->d_inode); + ASSERT(d_backing_inode(parent->dentry)); if (!(d_is_dir(parent->dentry))) { // TODO: convert file to dir @@ -497,7 +501,7 @@ lookup_again: /* search the current directory for the element name */ _debug("lookup '%s'", name); - mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); start = jiffies; next = lookup_one_len(name, dir, nlen); @@ -505,74 +509,74 @@ 
lookup_again: if (IS_ERR(next)) goto lookup_error; - _debug("next -> %p %s", next, next->d_inode ? "positive" : "negative"); + _debug("next -> %p %s", next, d_backing_inode(next) ? "positive" : "negative"); if (!key) - object->new = !next->d_inode; + object->new = !d_backing_inode(next); /* if this element of the path doesn't exist, then the lookup phase * failed, and we can release any readers in the certain knowledge that * there's nothing for them to actually read */ - if (!next->d_inode) + if (d_is_negative(next)) fscache_object_lookup_negative(&object->fscache); /* we need to create the object if it's negative */ if (key || object->type == FSCACHE_COOKIE_TYPE_INDEX) { /* index objects and intervening tree levels must be subdirs */ - if (!next->d_inode) { + if (d_is_negative(next)) { ret = cachefiles_has_space(cache, 1, 0); if (ret < 0) - goto create_error; + goto no_space_error; path.dentry = dir; ret = security_path_mkdir(&path, next, 0); if (ret < 0) goto create_error; start = jiffies; - ret = vfs_mkdir(dir->d_inode, next, 0); + ret = vfs_mkdir(d_inode(dir), next, 0); cachefiles_hist(cachefiles_mkdir_histogram, start); if (ret < 0) goto create_error; - ASSERT(next->d_inode); + ASSERT(d_backing_inode(next)); _debug("mkdir -> %p{%p{ino=%lu}}", - next, next->d_inode, next->d_inode->i_ino); + next, d_backing_inode(next), d_backing_inode(next)->i_ino); } else if (!d_can_lookup(next)) { pr_err("inode %lu is not a directory\n", - next->d_inode->i_ino); + d_backing_inode(next)->i_ino); ret = -ENOBUFS; goto error; } } else { /* non-index objects start out life as files */ - if (!next->d_inode) { + if (d_is_negative(next)) { ret = cachefiles_has_space(cache, 1, 0); if (ret < 0) - goto create_error; + goto no_space_error; path.dentry = dir; ret = security_path_mknod(&path, next, S_IFREG, 0); if (ret < 0) goto create_error; start = jiffies; - ret = vfs_create(dir->d_inode, next, S_IFREG, true); + ret = vfs_create(d_inode(dir), next, S_IFREG, true); cachefiles_hist(cachefiles_create_histogram, start); if (ret < 0) goto create_error; - ASSERT(next->d_inode); + ASSERT(d_backing_inode(next)); _debug("create -> %p{%p{ino=%lu}}", - next, next->d_inode, next->d_inode->i_ino); + next, d_backing_inode(next), d_backing_inode(next)->i_ino); } else if (!d_can_lookup(next) && !d_is_reg(next) ) { pr_err("inode %lu is not a file or directory\n", - next->d_inode->i_ino); + d_backing_inode(next)->i_ino); ret = -ENOBUFS; goto error; } @@ -581,7 +585,7 @@ lookup_again: /* process the next component */ if (key) { _debug("advance"); - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); dput(dir); dir = next; next = NULL; @@ -602,7 +606,8 @@ lookup_again: * mutex) */ object->dentry = NULL; - ret = cachefiles_bury_object(cache, dir, next, true); + ret = cachefiles_bury_object(cache, dir, next, true, + FSCACHE_OBJECT_IS_STALE); dput(next); next = NULL; @@ -610,6 +615,7 @@ lookup_again: goto delete_error; _debug("redo lookup"); + fscache_object_retrying_stale(&object->fscache); goto lookup_again; } } @@ -617,7 +623,7 @@ lookup_again: /* note that we're now using this object */ ret = cachefiles_mark_object_active(cache, object); - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); dput(dir); dir = NULL; @@ -646,7 +652,7 @@ lookup_again: const struct address_space_operations *aops; ret = -EPERM; - aops = object->dentry->d_inode->i_mapping->a_ops; + aops = d_backing_inode(object->dentry)->i_mapping->a_ops; if (!aops->bmap) goto check_error; @@ -659,9 +665,11 @@ 
lookup_again: object->new = 0; fscache_obtained_object(&object->fscache); - _leave(" = 0 [%lu]", object->dentry->d_inode->i_ino); + _leave(" = 0 [%lu]", d_backing_inode(object->dentry)->i_ino); return 0; +no_space_error: + fscache_object_mark_killed(&object->fscache, FSCACHE_OBJECT_NO_SPACE); create_error: _debug("create error %d", ret); if (ret == -EIO) @@ -695,7 +703,7 @@ lookup_error: cachefiles_io_error(cache, "Lookup failed"); next = NULL; error: - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); dput(next); error_out2: dput(dir); @@ -719,7 +727,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, _enter(",,%s", dirname); /* search the current directory for the element name */ - mutex_lock(&dir->d_inode->i_mutex); + mutex_lock(&d_inode(dir)->i_mutex); start = jiffies; subdir = lookup_one_len(dirname, dir, strlen(dirname)); @@ -731,10 +739,10 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, } _debug("subdir -> %p %s", - subdir, subdir->d_inode ? "positive" : "negative"); + subdir, d_backing_inode(subdir) ? "positive" : "negative"); /* we need to create the subdir if it doesn't exist yet */ - if (!subdir->d_inode) { + if (d_is_negative(subdir)) { ret = cachefiles_has_space(cache, 1, 0); if (ret < 0) goto mkdir_error; @@ -746,22 +754,22 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, ret = security_path_mkdir(&path, subdir, 0700); if (ret < 0) goto mkdir_error; - ret = vfs_mkdir(dir->d_inode, subdir, 0700); + ret = vfs_mkdir(d_inode(dir), subdir, 0700); if (ret < 0) goto mkdir_error; - ASSERT(subdir->d_inode); + ASSERT(d_backing_inode(subdir)); _debug("mkdir -> %p{%p{ino=%lu}}", subdir, - subdir->d_inode, - subdir->d_inode->i_ino); + d_backing_inode(subdir), + d_backing_inode(subdir)->i_ino); } - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); /* we need to make sure the subdir is a directory */ - ASSERT(subdir->d_inode); + ASSERT(d_backing_inode(subdir)); if (!d_can_lookup(subdir)) { pr_err("%s is not a directory\n", dirname); @@ -770,18 +778,18 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, } ret = -EPERM; - if (!subdir->d_inode->i_op->setxattr || - !subdir->d_inode->i_op->getxattr || - !subdir->d_inode->i_op->lookup || - !subdir->d_inode->i_op->mkdir || - !subdir->d_inode->i_op->create || - (!subdir->d_inode->i_op->rename && - !subdir->d_inode->i_op->rename2) || - !subdir->d_inode->i_op->rmdir || - !subdir->d_inode->i_op->unlink) + if (!d_backing_inode(subdir)->i_op->setxattr || + !d_backing_inode(subdir)->i_op->getxattr || + !d_backing_inode(subdir)->i_op->lookup || + !d_backing_inode(subdir)->i_op->mkdir || + !d_backing_inode(subdir)->i_op->create || + (!d_backing_inode(subdir)->i_op->rename && + !d_backing_inode(subdir)->i_op->rename2) || + !d_backing_inode(subdir)->i_op->rmdir || + !d_backing_inode(subdir)->i_op->unlink) goto check_error; - _leave(" = [%lu]", subdir->d_inode->i_ino); + _leave(" = [%lu]", d_backing_inode(subdir)->i_ino); return subdir; check_error: @@ -790,19 +798,19 @@ check_error: return ERR_PTR(ret); mkdir_error: - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); dput(subdir); pr_err("mkdir %s failed with error %d\n", dirname, ret); return ERR_PTR(ret); lookup_error: - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); ret = PTR_ERR(subdir); pr_err("Lookup %s failed with error %d\n", dirname, ret); return ERR_PTR(ret); nomem_d_alloc: - 
mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); _leave(" = -ENOMEM"); return ERR_PTR(-ENOMEM); } @@ -827,7 +835,7 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache, // dir, filename); /* look up the victim */ - mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); start = jiffies; victim = lookup_one_len(filename, dir, strlen(filename)); @@ -836,13 +844,13 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache, goto lookup_error; //_debug("victim -> %p %s", - // victim, victim->d_inode ? "positive" : "negative"); + // victim, d_backing_inode(victim) ? "positive" : "negative"); /* if the object is no longer there then we probably retired the object * at the netfs's request whilst the cull was in progress */ - if (!victim->d_inode) { - mutex_unlock(&dir->d_inode->i_mutex); + if (d_is_negative(victim)) { + mutex_unlock(&d_inode(dir)->i_mutex); dput(victim); _leave(" = -ENOENT [absent]"); return ERR_PTR(-ENOENT); @@ -871,13 +879,13 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache, object_in_use: read_unlock(&cache->active_lock); - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); dput(victim); //_leave(" = -EBUSY [in use]"); return ERR_PTR(-EBUSY); lookup_error: - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); ret = PTR_ERR(victim); if (ret == -ENOENT) { /* file or dir now absent - probably retired by netfs */ @@ -913,7 +921,7 @@ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, return PTR_ERR(victim); _debug("victim -> %p %s", - victim, victim->d_inode ? "positive" : "negative"); + victim, d_backing_inode(victim) ? "positive" : "negative"); /* okay... 
the victim is not being used so we can cull it * - start by marking it as stale @@ -927,7 +935,8 @@ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, /* actually remove the victim (drops the dir mutex) */ _debug("bury"); - ret = cachefiles_bury_object(cache, dir, victim, false); + ret = cachefiles_bury_object(cache, dir, victim, false, + FSCACHE_OBJECT_WAS_CULLED); if (ret < 0) goto error; @@ -936,7 +945,7 @@ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, return 0; error_unlock: - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); error: dput(victim); if (ret == -ENOENT) { @@ -971,7 +980,7 @@ int cachefiles_check_in_use(struct cachefiles_cache *cache, struct dentry *dir, if (IS_ERR(victim)) return PTR_ERR(victim); - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); dput(victim); //_leave(" = 0"); return 0; diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index c6cd8d7a4eef..3cbb0e834694 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -74,12 +74,12 @@ static int cachefiles_read_waiter(wait_queue_t *wait, unsigned mode, static int cachefiles_read_reissue(struct cachefiles_object *object, struct cachefiles_one_read *monitor) { - struct address_space *bmapping = object->backer->d_inode->i_mapping; + struct address_space *bmapping = d_backing_inode(object->backer)->i_mapping; struct page *backpage = monitor->back_page, *backpage2; int ret; _enter("{ino=%lx},{%lx,%lx}", - object->backer->d_inode->i_ino, + d_backing_inode(object->backer)->i_ino, backpage->index, backpage->flags); /* skip if the page was truncated away completely */ @@ -157,7 +157,7 @@ static void cachefiles_read_copier(struct fscache_operation *_op) object = container_of(op->op.object, struct cachefiles_object, fscache); - _enter("{ino=%lu}", object->backer->d_inode->i_ino); + _enter("{ino=%lu}", d_backing_inode(object->backer)->i_ino); max = 8; spin_lock_irq(&object->work_lock); @@ -247,7 +247,7 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object, init_waitqueue_func_entry(&monitor->monitor, cachefiles_read_waiter); /* attempt to get hold of the backing page */ - bmapping = object->backer->d_inode->i_mapping; + bmapping = d_backing_inode(object->backer)->i_mapping; newpage = NULL; for (;;) { @@ -408,7 +408,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op, if (!object->backer) goto enobufs; - inode = object->backer->d_inode; + inode = d_backing_inode(object->backer); ASSERT(S_ISREG(inode->i_mode)); ASSERT(inode->i_mapping->a_ops->bmap); ASSERT(inode->i_mapping->a_ops->readpages); @@ -468,7 +468,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, struct list_head *list) { struct cachefiles_one_read *monitor = NULL; - struct address_space *bmapping = object->backer->d_inode->i_mapping; + struct address_space *bmapping = d_backing_inode(object->backer)->i_mapping; struct page *newpage = NULL, *netpage, *_n, *backpage = NULL; int ret = 0; @@ -705,7 +705,7 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op, if (cachefiles_has_space(cache, 0, *nr_pages) < 0) space = 0; - inode = object->backer->d_inode; + inode = d_backing_inode(object->backer); ASSERT(S_ISREG(inode->i_mode)); ASSERT(inode->i_mapping->a_ops->bmap); ASSERT(inode->i_mapping->a_ops->readpages); diff --git a/fs/cachefiles/security.c b/fs/cachefiles/security.c index 396c18ea2764..31bbc0528b11 100644 --- a/fs/cachefiles/security.c +++ b/fs/cachefiles/security.c 
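/*
 * Aside: a hedged sketch of why these cachefiles hunks replace raw
 * dentry->d_inode access with helpers: d_inode() says "the inode this
 * dentry names", while d_backing_inode() says "the inode to perform
 * I/O on", leaving room for the two to diverge on union/overlay-style
 * dentries.  The miniature types below are illustrative, not the VFS
 * definitions.
 */
#include <stddef.h>
#include <stdio.h>

struct inode { unsigned long i_ino; };

struct dentry {
	struct inode *inode;	/* what the dentry names */
	struct inode *backing;	/* what I/O should hit, if different */
};

static struct inode *d_inode_ex(const struct dentry *d)
{
	return d->inode;
}

static struct inode *d_backing_inode_ex(const struct dentry *d)
{
	return d->backing ? d->backing : d->inode;
}

static int d_is_negative_ex(const struct dentry *d)
{
	return d->inode == NULL;
}

int main(void)
{
	struct inode upper = { .i_ino = 42 };
	struct dentry d = { .inode = &upper, .backing = NULL };

	if (!d_is_negative_ex(&d))
		printf("name ino %lu, I/O ino %lu\n",
		       d_inode_ex(&d)->i_ino, d_backing_inode_ex(&d)->i_ino);
	return 0;
}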
@@ -55,14 +55,14 @@ static int cachefiles_check_cache_dir(struct cachefiles_cache *cache, { int ret; - ret = security_inode_mkdir(root->d_inode, root, 0); + ret = security_inode_mkdir(d_backing_inode(root), root, 0); if (ret < 0) { pr_err("Security denies permission to make dirs: error %d", ret); return ret; } - ret = security_inode_create(root->d_inode, root, 0); + ret = security_inode_create(d_backing_inode(root), root, 0); if (ret < 0) pr_err("Security denies permission to create files: error %d", ret); @@ -95,7 +95,7 @@ int cachefiles_determine_cache_security(struct cachefiles_cache *cache, /* use the cache root dir's security context as the basis with * which create files */ - ret = set_create_files_as(new, root->d_inode); + ret = set_create_files_as(new, d_backing_inode(root)); if (ret < 0) { abort_creds(new); cachefiles_begin_secure(cache, _saved_cred); diff --git a/fs/cachefiles/xattr.c b/fs/cachefiles/xattr.c index a8a68745e11d..d31c1a72d8a5 100644 --- a/fs/cachefiles/xattr.c +++ b/fs/cachefiles/xattr.c @@ -33,7 +33,7 @@ int cachefiles_check_object_type(struct cachefiles_object *object) int ret; ASSERT(dentry); - ASSERT(dentry->d_inode); + ASSERT(d_backing_inode(dentry)); if (!object->fscache.cookie) strcpy(type, "C3"); @@ -52,7 +52,7 @@ int cachefiles_check_object_type(struct cachefiles_object *object) if (ret != -EEXIST) { pr_err("Can't set xattr on %pd [%lu] (err %d)\n", - dentry, dentry->d_inode->i_ino, + dentry, d_backing_inode(dentry)->i_ino, -ret); goto error; } @@ -64,7 +64,7 @@ int cachefiles_check_object_type(struct cachefiles_object *object) goto bad_type_length; pr_err("Can't read xattr on %pd [%lu] (err %d)\n", - dentry, dentry->d_inode->i_ino, + dentry, d_backing_inode(dentry)->i_ino, -ret); goto error; } @@ -84,14 +84,14 @@ error: bad_type_length: pr_err("Cache object %lu type xattr length incorrect\n", - dentry->d_inode->i_ino); + d_backing_inode(dentry)->i_ino); ret = -EIO; goto error; bad_type: xtype[2] = 0; pr_err("Cache object %pd [%lu] type %s not %s\n", - dentry, dentry->d_inode->i_ino, + dentry, d_backing_inode(dentry)->i_ino, xtype, type); ret = -EIO; goto error; @@ -165,7 +165,7 @@ int cachefiles_check_auxdata(struct cachefiles_object *object) int ret; ASSERT(dentry); - ASSERT(dentry->d_inode); + ASSERT(d_backing_inode(dentry)); ASSERT(object->fscache.cookie->def->check_aux); auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, GFP_KERNEL); @@ -204,7 +204,7 @@ int cachefiles_check_object_xattr(struct cachefiles_object *object, _enter("%p,#%d", object, auxdata->len); ASSERT(dentry); - ASSERT(dentry->d_inode); + ASSERT(d_backing_inode(dentry)); auxbuf = kmalloc(sizeof(struct cachefiles_xattr) + 512, cachefiles_gfp); if (!auxbuf) { @@ -225,7 +225,7 @@ int cachefiles_check_object_xattr(struct cachefiles_object *object, cachefiles_io_error_obj(object, "Can't read xattr on %lu (err %d)", - dentry->d_inode->i_ino, -ret); + d_backing_inode(dentry)->i_ino, -ret); goto error; } @@ -276,7 +276,7 @@ int cachefiles_check_object_xattr(struct cachefiles_object *object, cachefiles_io_error_obj(object, "Can't update xattr on %lu" " (error %d)", - dentry->d_inode->i_ino, -ret); + d_backing_inode(dentry)->i_ino, -ret); goto error; } } @@ -291,7 +291,7 @@ error: bad_type_length: pr_err("Cache object %lu xattr length incorrect\n", - dentry->d_inode->i_ino); + d_backing_inode(dentry)->i_ino); ret = -EIO; goto error; @@ -316,7 +316,7 @@ int cachefiles_remove_object_xattr(struct cachefiles_cache *cache, cachefiles_io_error(cache, "Can't remove xattr from %lu" " (error %d)", - 
dentry->d_inode->i_ino, -ret); + d_backing_inode(dentry)->i_ino, -ret); } _leave(" = %d", ret); diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index 64fa248343f6..8f84646f10e9 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -187,10 +187,10 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode, val_size2 = posix_acl_xattr_size(default_acl->a_count); err = -ENOMEM; - tmp_buf = kmalloc(max(val_size1, val_size2), GFP_NOFS); + tmp_buf = kmalloc(max(val_size1, val_size2), GFP_KERNEL); if (!tmp_buf) goto out_err; - pagelist = kmalloc(sizeof(struct ceph_pagelist), GFP_NOFS); + pagelist = kmalloc(sizeof(struct ceph_pagelist), GFP_KERNEL); if (!pagelist) goto out_err; ceph_pagelist_init(pagelist); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index fd5599d32362..890c50971a69 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -87,17 +87,21 @@ static int ceph_set_page_dirty(struct page *page) inode = mapping->host; ci = ceph_inode(inode); - /* - * Note that we're grabbing a snapc ref here without holding - * any locks! - */ - snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context); - /* dirty the head */ spin_lock(&ci->i_ceph_lock); - if (ci->i_head_snapc == NULL) - ci->i_head_snapc = ceph_get_snap_context(snapc); - ++ci->i_wrbuffer_ref_head; + BUG_ON(ci->i_wr_ref == 0); // caller should hold Fw reference + if (__ceph_have_pending_cap_snap(ci)) { + struct ceph_cap_snap *capsnap = + list_last_entry(&ci->i_cap_snaps, + struct ceph_cap_snap, + ci_item); + snapc = ceph_get_snap_context(capsnap->context); + capsnap->dirty_pages++; + } else { + BUG_ON(!ci->i_head_snapc); + snapc = ceph_get_snap_context(ci->i_head_snapc); + ++ci->i_wrbuffer_ref_head; + } if (ci->i_wrbuffer_ref == 0) ihold(inode); ++ci->i_wrbuffer_ref; @@ -346,7 +350,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) /* build page vector */ nr_pages = calc_pages_for(0, len); - pages = kmalloc(sizeof(*pages) * nr_pages, GFP_NOFS); + pages = kmalloc(sizeof(*pages) * nr_pages, GFP_KERNEL); ret = -ENOMEM; if (!pages) goto out; @@ -358,7 +362,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) dout("start_read %p adding %p idx %lu\n", inode, page, page->index); if (add_to_page_cache_lru(page, &inode->i_data, page->index, - GFP_NOFS)) { + GFP_KERNEL)) { ceph_fscache_uncache_page(inode, page); page_cache_release(page); dout("start_read %p add_to_page_cache failed %p\n", @@ -436,7 +440,7 @@ out: * only snap context we are allowed to write back. 
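/*
 * Aside: the ceph snap_size change below switches the "no size"
 * sentinel from 0 to -1.  A snapshot may legitimately be zero bytes
 * long, so 0 cannot mean "unset", and a signed loff_t makes -1
 * representable.  Illustrative userspace code, not the ceph
 * implementation.
 */
#include <stdio.h>

typedef long long loff_t_ex;	/* stand-in for the kernel's loff_t */

static loff_t_ex effective_eof(loff_t_ex snap_size, loff_t_ex i_size)
{
	/* -1: no snapshot supplied a size; fall back to the inode size */
	return snap_size == -1 ? i_size : snap_size;
}

int main(void)
{
	printf("%lld\n", effective_eof(-1, 4096));	/* 4096: unset */
	printf("%lld\n", effective_eof(0, 4096));	/* 0: a real size */
	return 0;
}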
*/ static struct ceph_snap_context *get_oldest_context(struct inode *inode, - u64 *snap_size) + loff_t *snap_size) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_snap_context *snapc = NULL; @@ -476,8 +480,9 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) struct ceph_osd_client *osdc; struct ceph_snap_context *snapc, *oldest; loff_t page_off = page_offset(page); + loff_t snap_size = -1; long writeback_stat; - u64 truncate_size, snap_size = 0; + u64 truncate_size; u32 truncate_seq; int err = 0, len = PAGE_CACHE_SIZE; @@ -512,7 +517,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) spin_lock(&ci->i_ceph_lock); truncate_seq = ci->i_truncate_seq; truncate_size = ci->i_truncate_size; - if (!snap_size) + if (snap_size == -1) snap_size = i_size_read(inode); spin_unlock(&ci->i_ceph_lock); @@ -695,7 +700,8 @@ static int ceph_writepages_start(struct address_space *mapping, unsigned wsize = 1 << inode->i_blkbits; struct ceph_osd_request *req = NULL; int do_sync = 0; - u64 truncate_size, snap_size; + loff_t snap_size, i_size; + u64 truncate_size; u32 truncate_seq; /* @@ -741,7 +747,7 @@ static int ceph_writepages_start(struct address_space *mapping, retry: /* find oldest snap context with dirty data */ ceph_put_snap_context(snapc); - snap_size = 0; + snap_size = -1; snapc = get_oldest_context(inode, &snap_size); if (!snapc) { /* hmm, why does writepages get called when there @@ -749,16 +755,13 @@ retry: dout(" no snap context with dirty data?\n"); goto out; } - if (snap_size == 0) - snap_size = i_size_read(inode); dout(" oldest snapc is %p seq %lld (%d snaps)\n", snapc, snapc->seq, snapc->num_snaps); spin_lock(&ci->i_ceph_lock); truncate_seq = ci->i_truncate_seq; truncate_size = ci->i_truncate_size; - if (!snap_size) - snap_size = i_size_read(inode); + i_size = i_size_read(inode); spin_unlock(&ci->i_ceph_lock); if (last_snapc && snapc != last_snapc) { @@ -828,8 +831,10 @@ get_more_pages: dout("waiting on writeback %p\n", page); wait_on_page_writeback(page); } - if (page_offset(page) >= snap_size) { - dout("%p page eof %llu\n", page, snap_size); + if (page_offset(page) >= + (snap_size == -1 ? i_size : snap_size)) { + dout("%p page eof %llu\n", page, + (snap_size == -1 ? 
i_size : snap_size)); done = 1; unlock_page(page); break; @@ -884,7 +889,8 @@ get_more_pages: } if (do_sync) - osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC); + osd_req_op_init(req, 1, + CEPH_OSD_OP_STARTSYNC, 0); req->r_callback = writepages_finish; req->r_inode = inode; @@ -944,10 +950,18 @@ get_more_pages: } /* Format the osd request message and submit the write */ - offset = page_offset(pages[0]); - len = min(snap_size - offset, - (u64)locked_pages << PAGE_CACHE_SHIFT); + len = (u64)locked_pages << PAGE_CACHE_SHIFT; + if (snap_size == -1) { + len = min(len, (u64)i_size_read(inode) - offset); + /* writepages_finish() clears writeback pages + * according to the data length, so make sure + * data length covers all locked pages */ + len = max(len, 1 + + ((u64)(locked_pages - 1) << PAGE_CACHE_SHIFT)); + } else { + len = min(len, snap_size - offset); + } dout("writepages got %d pages at %llu~%llu\n", locked_pages, offset, len); @@ -1032,7 +1046,6 @@ static int ceph_update_writeable_page(struct file *file, { struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; loff_t page_off = pos & PAGE_CACHE_MASK; int pos_in_page = pos & ~PAGE_CACHE_MASK; int end_in_page = pos_in_page + len; @@ -1044,10 +1057,6 @@ retry_locked: /* writepages currently holds page lock, but if we change that later, */ wait_on_page_writeback(page); - /* check snap context */ - BUG_ON(!ci->i_snap_realm); - down_read(&mdsc->snap_rwsem); - BUG_ON(!ci->i_snap_realm->cached_context); snapc = page_snap_context(page); if (snapc && snapc != ci->i_head_snapc) { /* @@ -1055,7 +1064,6 @@ retry_locked: * context! is it writeable now? */ oldest = get_oldest_context(inode, NULL); - up_read(&mdsc->snap_rwsem); if (snapc->seq > oldest->seq) { ceph_put_snap_context(oldest); @@ -1112,7 +1120,6 @@ retry_locked: } /* we need to read it. */ - up_read(&mdsc->snap_rwsem); r = readpage_nounlock(file, page); if (r < 0) goto fail_nosnap; @@ -1146,6 +1153,10 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, inode, page, (int)pos, (int)len); r = ceph_update_writeable_page(file, pos, len, page); + if (r < 0) + page_cache_release(page); + else + *pagep = page; } while (r == -EAGAIN); return r; @@ -1153,16 +1164,13 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, /* * we don't do anything in here that simple_write_end doesn't do - * except adjust dirty page accounting and drop read lock on - * mdsc->snap_rwsem. + * except adjust dirty page accounting */ static int ceph_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { struct inode *inode = file_inode(file); - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - struct ceph_mds_client *mdsc = fsc->mdsc; unsigned from = pos & (PAGE_CACHE_SIZE - 1); int check_cap = 0; @@ -1184,7 +1192,6 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, set_page_dirty(page); unlock_page(page); - up_read(&mdsc->snap_rwsem); page_cache_release(page); if (check_cap) @@ -1198,8 +1205,7 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, * intercept O_DIRECT reads and writes early, this function should * never get called. 
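/*
 * Aside: a hedged sketch of the preallocation pattern ceph_page_mkwrite()
 * adopts below: allocate the cap-flush record up front, where failure can
 * still be reported cleanly, then either hand it off at a point where
 * allocation would not be allowed to fail, or free it on the way out.
 * Names and the ownership callback are illustrative.
 */
#include <stdlib.h>

struct cap_flush_ex { int unused; };

/* do_write() may consume *pcf (and set it NULL) while holding a lock */
static int mkwrite_pattern(int (*do_write)(struct cap_flush_ex **pcf))
{
	struct cap_flush_ex *prealloc_cf = calloc(1, sizeof(*prealloc_cf));
	int ret;

	if (!prealloc_cf)
		return -1;	/* VM_FAULT_SIGBUS in the kernel version */

	ret = do_write(&prealloc_cf);

	free(prealloc_cf);	/* no-op if do_write() took ownership */
	return ret;
}

static int demo_write(struct cap_flush_ex **pcf)
{
	free(*pcf);	/* pretend the locked section consumed it */
	*pcf = NULL;
	return 0;
}

int main(void)
{
	return mkwrite_pattern(demo_write);
}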
*/ -static ssize_t ceph_direct_io(int rw, struct kiocb *iocb, - struct iov_iter *iter, +static ssize_t ceph_direct_io(struct kiocb *iocb, struct iov_iter *iter, loff_t pos) { WARN_ON(1); @@ -1311,13 +1317,17 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) struct inode *inode = file_inode(vma->vm_file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_file_info *fi = vma->vm_file->private_data; - struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct ceph_cap_flush *prealloc_cf; struct page *page = vmf->page; loff_t off = page_offset(page); loff_t size = i_size_read(inode); size_t len; int want, got, ret; + prealloc_cf = ceph_alloc_cap_flush(); + if (!prealloc_cf) + return VM_FAULT_SIGBUS; + if (ci->i_inline_version != CEPH_INLINE_NONE) { struct page *locked_page = NULL; if (off == 0) { @@ -1327,8 +1337,10 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ret = ceph_uninline_data(vma->vm_file, locked_page); if (locked_page) unlock_page(locked_page); - if (ret < 0) - return VM_FAULT_SIGBUS; + if (ret < 0) { + ret = VM_FAULT_SIGBUS; + goto out_free; + } } if (off + PAGE_CACHE_SIZE <= size) @@ -1350,7 +1362,8 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) break; if (ret != -ERESTARTSYS) { WARN_ON(1); - return VM_FAULT_SIGBUS; + ret = VM_FAULT_SIGBUS; + goto out_free; } } dout("page_mkwrite %p %llu~%zd got cap refs on %s\n", @@ -1370,7 +1383,6 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) if (ret == 0) { /* success. we'll keep the page locked. */ set_page_dirty(page); - up_read(&mdsc->snap_rwsem); ret = VM_FAULT_LOCKED; } else { if (ret == -ENOMEM) @@ -1386,7 +1398,8 @@ out: int dirty; spin_lock(&ci->i_ceph_lock); ci->i_inline_version = CEPH_INLINE_NONE; - dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, + &prealloc_cf); spin_unlock(&ci->i_ceph_lock); if (dirty) __mark_inode_dirty(inode, dirty); @@ -1395,6 +1408,8 @@ out: dout("page_mkwrite %p %llu~%zd dropping cap refs on %s ret %d\n", inode, off, len, ceph_cap_string(got), ret); ceph_put_cap_refs(ci, got); +out_free: + ceph_free_cap_flush(prealloc_cf); return ret; } @@ -1506,8 +1521,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) ceph_vino(inode), 0, &len, 0, 1, CEPH_OSD_OP_CREATE, CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, - ci->i_snap_realm->cached_context, - 0, 0, false); + ceph_empty_snapc, 0, 0, false); if (IS_ERR(req)) { err = PTR_ERR(req); goto out; @@ -1525,7 +1539,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) ceph_vino(inode), 0, &len, 1, 3, CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, - ci->i_snap_realm->cached_context, + ceph_empty_snapc, ci->i_truncate_seq, ci->i_truncate_size, false); if (IS_ERR(req)) { @@ -1535,19 +1549,27 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) osd_req_op_extent_osd_data_pages(req, 1, &page, len, 0, false, false); - err = osd_req_op_xattr_init(req, 0, CEPH_OSD_OP_CMPXATTR, - "inline_version", &inline_version, - sizeof(inline_version), - CEPH_OSD_CMPXATTR_OP_GT, - CEPH_OSD_CMPXATTR_MODE_U64); - if (err) - goto out_put; + { + __le64 xattr_buf = cpu_to_le64(inline_version); + err = osd_req_op_xattr_init(req, 0, CEPH_OSD_OP_CMPXATTR, + "inline_version", &xattr_buf, + sizeof(xattr_buf), + CEPH_OSD_CMPXATTR_OP_GT, + CEPH_OSD_CMPXATTR_MODE_U64); + if (err) + goto out_put; + } - err = 
osd_req_op_xattr_init(req, 2, CEPH_OSD_OP_SETXATTR, - "inline_version", &inline_version, - sizeof(inline_version), 0, 0); - if (err) - goto out_put; + { + char xattr_buf[32]; + int xattr_len = snprintf(xattr_buf, sizeof(xattr_buf), + "%llu", inline_version); + err = osd_req_op_xattr_init(req, 2, CEPH_OSD_OP_SETXATTR, + "inline_version", + xattr_buf, xattr_len, 0, 0); + if (err) + goto out_put; + } ceph_osdc_build_request(req, 0, NULL, CEPH_NOSNAP, &inode->i_mtime); err = ceph_osdc_start_request(&fsc->client->osdc, req, false); @@ -1586,3 +1608,206 @@ int ceph_mmap(struct file *file, struct vm_area_struct *vma) vma->vm_ops = &ceph_vmops; return 0; } + +enum { + POOL_READ = 1, + POOL_WRITE = 2, +}; + +static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) +{ + struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_osd_request *rd_req = NULL, *wr_req = NULL; + struct rb_node **p, *parent; + struct ceph_pool_perm *perm; + struct page **pages; + int err = 0, err2 = 0, have = 0; + + down_read(&mdsc->pool_perm_rwsem); + p = &mdsc->pool_perm_tree.rb_node; + while (*p) { + perm = rb_entry(*p, struct ceph_pool_perm, node); + if (pool < perm->pool) + p = &(*p)->rb_left; + else if (pool > perm->pool) + p = &(*p)->rb_right; + else { + have = perm->perm; + break; + } + } + up_read(&mdsc->pool_perm_rwsem); + if (*p) + goto out; + + dout("__ceph_pool_perm_get pool %u no perm cached\n", pool); + + down_write(&mdsc->pool_perm_rwsem); + parent = NULL; + while (*p) { + parent = *p; + perm = rb_entry(parent, struct ceph_pool_perm, node); + if (pool < perm->pool) + p = &(*p)->rb_left; + else if (pool > perm->pool) + p = &(*p)->rb_right; + else { + have = perm->perm; + break; + } + } + if (*p) { + up_write(&mdsc->pool_perm_rwsem); + goto out; + } + + rd_req = ceph_osdc_alloc_request(&fsc->client->osdc, + ceph_empty_snapc, + 1, false, GFP_NOFS); + if (!rd_req) { + err = -ENOMEM; + goto out_unlock; + } + + rd_req->r_flags = CEPH_OSD_FLAG_READ; + osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0); + rd_req->r_base_oloc.pool = pool; + snprintf(rd_req->r_base_oid.name, sizeof(rd_req->r_base_oid.name), + "%llx.00000000", ci->i_vino.ino); + rd_req->r_base_oid.name_len = strlen(rd_req->r_base_oid.name); + + wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, + ceph_empty_snapc, + 1, false, GFP_NOFS); + if (!wr_req) { + err = -ENOMEM; + goto out_unlock; + } + + wr_req->r_flags = CEPH_OSD_FLAG_WRITE | + CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK; + osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL); + wr_req->r_base_oloc.pool = pool; + wr_req->r_base_oid = rd_req->r_base_oid; + + /* one page should be large enough for STAT data */ + pages = ceph_alloc_page_vector(1, GFP_KERNEL); + if (IS_ERR(pages)) { + err = PTR_ERR(pages); + goto out_unlock; + } + + osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE, + 0, false, true); + ceph_osdc_build_request(rd_req, 0, NULL, CEPH_NOSNAP, + &ci->vfs_inode.i_mtime); + err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false); + + ceph_osdc_build_request(wr_req, 0, NULL, CEPH_NOSNAP, + &ci->vfs_inode.i_mtime); + err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false); + + if (!err) + err = ceph_osdc_wait_request(&fsc->client->osdc, rd_req); + if (!err2) + err2 = ceph_osdc_wait_request(&fsc->client->osdc, wr_req); + + if (err >= 0 || err == -ENOENT) + have |= POOL_READ; + else if (err != -EPERM) + goto out_unlock; + + if (err2 == 0 || err2 == -EEXIST) + have |= 
POOL_WRITE; + else if (err2 != -EPERM) { + err = err2; + goto out_unlock; + } + + perm = kmalloc(sizeof(*perm), GFP_NOFS); + if (!perm) { + err = -ENOMEM; + goto out_unlock; + } + + perm->pool = pool; + perm->perm = have; + rb_link_node(&perm->node, parent, p); + rb_insert_color(&perm->node, &mdsc->pool_perm_tree); + err = 0; +out_unlock: + up_write(&mdsc->pool_perm_rwsem); + + if (rd_req) + ceph_osdc_put_request(rd_req); + if (wr_req) + ceph_osdc_put_request(wr_req); +out: + if (!err) + err = have; + dout("__ceph_pool_perm_get pool %u result = %d\n", pool, err); + return err; +} + +int ceph_pool_perm_check(struct ceph_inode_info *ci, int need) +{ + u32 pool; + int ret, flags; + + if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode), + NOPOOLPERM)) + return 0; + + spin_lock(&ci->i_ceph_lock); + flags = ci->i_ceph_flags; + pool = ceph_file_layout_pg_pool(ci->i_layout); + spin_unlock(&ci->i_ceph_lock); +check: + if (flags & CEPH_I_POOL_PERM) { + if ((need & CEPH_CAP_FILE_RD) && !(flags & CEPH_I_POOL_RD)) { + dout("ceph_pool_perm_check pool %u no read perm\n", + pool); + return -EPERM; + } + if ((need & CEPH_CAP_FILE_WR) && !(flags & CEPH_I_POOL_WR)) { + dout("ceph_pool_perm_check pool %u no write perm\n", + pool); + return -EPERM; + } + return 0; + } + + ret = __ceph_pool_perm_get(ci, pool); + if (ret < 0) + return ret; + + flags = CEPH_I_POOL_PERM; + if (ret & POOL_READ) + flags |= CEPH_I_POOL_RD; + if (ret & POOL_WRITE) + flags |= CEPH_I_POOL_WR; + + spin_lock(&ci->i_ceph_lock); + if (pool == ceph_file_layout_pg_pool(ci->i_layout)) { + ci->i_ceph_flags = flags; + } else { + pool = ceph_file_layout_pg_pool(ci->i_layout); + flags = ci->i_ceph_flags; + } + spin_unlock(&ci->i_ceph_lock); + goto check; +} + +void ceph_pool_perm_destroy(struct ceph_mds_client *mdsc) +{ + struct ceph_pool_perm *perm; + struct rb_node *n; + + while (!RB_EMPTY_ROOT(&mdsc->pool_perm_tree)) { + n = rb_first(&mdsc->pool_perm_tree); + perm = rb_entry(n, struct ceph_pool_perm, node); + rb_erase(n, &mdsc->pool_perm_tree); + kfree(perm); + } +} diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 8172775428a0..ddd5e9471290 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -833,7 +833,9 @@ int __ceph_caps_used(struct ceph_inode_info *ci) used |= CEPH_CAP_PIN; if (ci->i_rd_ref) used |= CEPH_CAP_FILE_RD; - if (ci->i_rdcache_ref || ci->vfs_inode.i_data.nrpages) + if (ci->i_rdcache_ref || + (!S_ISDIR(ci->vfs_inode.i_mode) && /* ignore readdir cache */ + ci->vfs_inode.i_data.nrpages)) used |= CEPH_CAP_FILE_CACHE; if (ci->i_wr_ref) used |= CEPH_CAP_FILE_WR; @@ -896,6 +898,18 @@ int ceph_is_any_caps(struct inode *inode) return ret; } +static void drop_inode_snap_realm(struct ceph_inode_info *ci) +{ + struct ceph_snap_realm *realm = ci->i_snap_realm; + spin_lock(&realm->inodes_with_caps_lock); + list_del_init(&ci->i_snap_realm_item); + ci->i_snap_realm_counter++; + ci->i_snap_realm = NULL; + spin_unlock(&realm->inodes_with_caps_lock); + ceph_put_snap_realm(ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc, + realm); +} + /* * Remove a cap. Take steps to deal with a racing iterate_session_caps. * @@ -914,16 +928,6 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) /* remove from session list */ spin_lock(&session->s_cap_lock); - /* - * s_cap_reconnect is protected by s_cap_lock. no one changes - * s_cap_gen while session is in the reconnect state. 
- */ - if (queue_release && - (!session->s_cap_reconnect || - cap->cap_gen == session->s_cap_gen)) - __queue_cap_release(session, ci->i_vino.ino, cap->cap_id, - cap->mseq, cap->issue_seq); - if (session->s_cap_iterator == cap) { /* not yet, we are iterating over this very cap */ dout("__ceph_remove_cap delaying %p removal from session %p\n", @@ -936,6 +940,25 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) } /* protect backpointer with s_cap_lock: see iterate_session_caps */ cap->ci = NULL; + + /* + * s_cap_reconnect is protected by s_cap_lock. no one changes + * s_cap_gen while session is in the reconnect state. + */ + if (queue_release && + (!session->s_cap_reconnect || cap->cap_gen == session->s_cap_gen)) { + cap->queue_release = 1; + if (removed) { + list_add_tail(&cap->session_caps, + &session->s_cap_releases); + session->s_num_cap_releases++; + removed = 0; + } + } else { + cap->queue_release = 0; + } + cap->cap_ino = ci->i_vino.ino; + spin_unlock(&session->s_cap_lock); /* remove from inode list */ @@ -946,15 +969,13 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) if (removed) ceph_put_cap(mdsc, cap); - if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) { - struct ceph_snap_realm *realm = ci->i_snap_realm; - spin_lock(&realm->inodes_with_caps_lock); - list_del_init(&ci->i_snap_realm_item); - ci->i_snap_realm_counter++; - ci->i_snap_realm = NULL; - spin_unlock(&realm->inodes_with_caps_lock); - ceph_put_snap_realm(mdsc, realm); - } + /* when reconnect denied, we remove session caps forcibly, + * i_wr_ref can be non-zero. If there are ongoing write, + * keep i_snap_realm. + */ + if (!__ceph_is_any_caps(ci) && ci->i_wr_ref == 0 && ci->i_snap_realm) + drop_inode_snap_realm(ci); + if (!__ceph_is_any_real_caps(ci)) __cap_delay_cancel(mdsc, ci); } @@ -967,8 +988,8 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) static int send_cap_msg(struct ceph_mds_session *session, u64 ino, u64 cid, int op, int caps, int wanted, int dirty, - u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq, - u64 size, u64 max_size, + u32 seq, u64 flush_tid, u64 oldest_flush_tid, + u32 issue_seq, u32 mseq, u64 size, u64 max_size, struct timespec *mtime, struct timespec *atime, u64 time_warp_seq, kuid_t uid, kgid_t gid, umode_t mode, @@ -982,20 +1003,23 @@ static int send_cap_msg(struct ceph_mds_session *session, size_t extra_len; dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s" - " seq %u/%u mseq %u follows %lld size %llu/%llu" + " seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu" " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op), cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted), ceph_cap_string(dirty), - seq, issue_seq, mseq, follows, size, max_size, + seq, issue_seq, flush_tid, oldest_flush_tid, + mseq, follows, size, max_size, xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0); - /* flock buffer size + inline version + inline data size */ - extra_len = 4 + 8 + 4; + /* flock buffer size + inline version + inline data size + + * osd_epoch_barrier + oldest_flush_tid */ + extra_len = 4 + 8 + 4 + 4 + 8; msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len, GFP_NOFS, false); if (!msg) return -ENOMEM; + msg->hdr.version = cpu_to_le16(6); msg->hdr.tid = cpu_to_le64(flush_tid); fc = msg->front.iov_base; @@ -1031,6 +1055,10 @@ static int send_cap_msg(struct ceph_mds_session *session, ceph_encode_64(&p, inline_data ? 
0 : CEPH_INLINE_NONE); /* inline data size */ ceph_encode_32(&p, 0); + /* osd_epoch_barrier */ + ceph_encode_32(&p, 0); + /* oldest_flush_tid */ + ceph_encode_64(&p, oldest_flush_tid); fc->xattr_version = cpu_to_le64(xattr_version); if (xattrs_buf) { @@ -1043,44 +1071,6 @@ static int send_cap_msg(struct ceph_mds_session *session, return 0; } -void __queue_cap_release(struct ceph_mds_session *session, - u64 ino, u64 cap_id, u32 migrate_seq, - u32 issue_seq) -{ - struct ceph_msg *msg; - struct ceph_mds_cap_release *head; - struct ceph_mds_cap_item *item; - - BUG_ON(!session->s_num_cap_releases); - msg = list_first_entry(&session->s_cap_releases, - struct ceph_msg, list_head); - - dout(" adding %llx release to mds%d msg %p (%d left)\n", - ino, session->s_mds, msg, session->s_num_cap_releases); - - BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE); - head = msg->front.iov_base; - le32_add_cpu(&head->num, 1); - item = msg->front.iov_base + msg->front.iov_len; - item->ino = cpu_to_le64(ino); - item->cap_id = cpu_to_le64(cap_id); - item->migrate_seq = cpu_to_le32(migrate_seq); - item->seq = cpu_to_le32(issue_seq); - - session->s_num_cap_releases--; - - msg->front.iov_len += sizeof(*item); - if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) { - dout(" release msg %p full\n", msg); - list_move_tail(&msg->list_head, &session->s_cap_releases_done); - } else { - dout(" release msg %p at %d/%d (%d)\n", msg, - (int)le32_to_cpu(head->num), - (int)CEPH_CAPS_PER_RELEASE, - (int)msg->front.iov_len); - } -} - /* * Queue cap releases when an inode is dropped from our cache. Since * inode is about to be destroyed, there is no need for i_ceph_lock. @@ -1117,7 +1107,7 @@ void ceph_queue_caps_release(struct inode *inode) */ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, int op, int used, int want, int retain, int flushing, - unsigned *pflush_tid) + u64 flush_tid, u64 oldest_flush_tid) __releases(cap->ci->i_ceph_lock) { struct ceph_inode_info *ci = cap->ci; @@ -1135,8 +1125,6 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, u64 xattr_version = 0; struct ceph_buffer *xattr_blob = NULL; int delayed = 0; - u64 flush_tid = 0; - int i; int ret; bool inline_data; @@ -1180,26 +1168,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, cap->implemented &= cap->issued | used; cap->mds_wanted = want; - if (flushing) { - /* - * assign a tid for flush operations so we can avoid - * flush1 -> dirty1 -> flush2 -> flushack1 -> mark - * clean type races. track latest tid for every bit - * so we can handle flush AxFw, flush Fw, and have the - * first ack clean Ax. - */ - flush_tid = ++ci->i_cap_flush_last_tid; - if (pflush_tid) - *pflush_tid = flush_tid; - dout(" cap_flush_tid %d\n", (int)flush_tid); - for (i = 0; i < CEPH_CAP_BITS; i++) - if (flushing & (1 << i)) - ci->i_cap_flush_tid[i] = flush_tid; - - follows = ci->i_head_snapc->seq; - } else { - follows = 0; - } + follows = flushing ? 
ci->i_head_snapc->seq : 0; keep = cap->implemented; seq = cap->seq; @@ -1227,7 +1196,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, spin_unlock(&ci->i_ceph_lock); ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id, - op, keep, want, flushing, seq, flush_tid, issue_seq, mseq, + op, keep, want, flushing, seq, + flush_tid, oldest_flush_tid, issue_seq, mseq, size, max_size, &mtime, &atime, time_warp_seq, uid, gid, mode, xattr_version, xattr_blob, follows, inline_data); @@ -1249,14 +1219,14 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, * asynchronously back to the MDS once sync writes complete and dirty * data is written out. * - * Unless @again is true, skip cap_snaps that were already sent to + * Unless @kick is true, skip cap_snaps that were already sent to * the MDS (i.e., during this session). * * Called under i_ceph_lock. Takes s_mutex as needed. */ void __ceph_flush_snaps(struct ceph_inode_info *ci, struct ceph_mds_session **psession, - int again) + int kick) __releases(ci->i_ceph_lock) __acquires(ci->i_ceph_lock) { @@ -1287,11 +1257,8 @@ retry: if (capsnap->dirty_pages || capsnap->writing) break; - /* - * if cap writeback already occurred, we should have dropped - * the capsnap in ceph_put_wrbuffer_cap_refs. - */ - BUG_ON(capsnap->dirty == 0); + /* should be removed by ceph_try_drop_cap_snap() */ + BUG_ON(!capsnap->need_flush); /* pick mds, take s_mutex */ if (ci->i_auth_cap == NULL) { @@ -1300,7 +1267,7 @@ retry: } /* only flush each capsnap once */ - if (!again && !list_empty(&capsnap->flushing_item)) { + if (!kick && !list_empty(&capsnap->flushing_item)) { dout("already flushed %p, skipping\n", capsnap); continue; } @@ -1310,6 +1277,9 @@ retry: if (session && session->s_mds != mds) { dout("oops, wrong session %p mutex\n", session); + if (kick) + goto out; + mutex_unlock(&session->s_mutex); ceph_put_mds_session(session); session = NULL; @@ -1333,20 +1303,22 @@ retry: goto retry; } - capsnap->flush_tid = ++ci->i_cap_flush_last_tid; + spin_lock(&mdsc->cap_dirty_lock); + capsnap->flush_tid = ++mdsc->last_cap_flush_tid; + spin_unlock(&mdsc->cap_dirty_lock); + atomic_inc(&capsnap->nref); - if (!list_empty(&capsnap->flushing_item)) - list_del_init(&capsnap->flushing_item); - list_add_tail(&capsnap->flushing_item, - &session->s_cap_snaps_flushing); + if (list_empty(&capsnap->flushing_item)) + list_add_tail(&capsnap->flushing_item, + &session->s_cap_snaps_flushing); spin_unlock(&ci->i_ceph_lock); dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n", inode, capsnap, capsnap->follows, capsnap->flush_tid); send_cap_msg(session, ceph_vino(inode).ino, 0, CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0, - capsnap->dirty, 0, capsnap->flush_tid, 0, mseq, - capsnap->size, 0, + capsnap->dirty, 0, capsnap->flush_tid, 0, + 0, mseq, capsnap->size, 0, &capsnap->mtime, &capsnap->atime, capsnap->time_warp_seq, capsnap->uid, capsnap->gid, capsnap->mode, @@ -1386,7 +1358,8 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci) * Caller is then responsible for calling __mark_inode_dirty with the * returned flags value. 
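The capsnap->flush_tid assignment above is part of a larger change: flush TIDs now come from a single mdsc->last_cap_flush_tid counter taken under cap_dirty_lock, rather than a per-inode counter, so TIDs are totally ordered across all inodes of the client. A minimal sketch of that allocation idiom, with illustrative names (not from the patch):

    struct tid_allocator {
            spinlock_t lock;
            u64 last_tid;
    };

    /* TIDs are handed out in strictly increasing order, never reused. */
    static u64 alloc_flush_tid(struct tid_allocator *a)
    {
            u64 tid;

            spin_lock(&a->lock);
            tid = ++a->last_tid;
            spin_unlock(&a->lock);
            return tid;
    }

Because every TID comes from one counter, "everything up to TID t has been flushed" is meaningful client-wide; that is what the new oldest_flush_tid field reported to the MDS relies on.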
*/ -int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) +int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, + struct ceph_cap_flush **pcf) { struct ceph_mds_client *mdsc = ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; @@ -1394,17 +1367,28 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) int was = ci->i_dirty_caps; int dirty = 0; + if (!ci->i_auth_cap) { + pr_warn("__mark_dirty_caps %p %llx mask %s, " + "but no auth cap (session was closed?)\n", + inode, ceph_ino(inode), ceph_cap_string(mask)); + return 0; + } + dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode, ceph_cap_string(mask), ceph_cap_string(was), ceph_cap_string(was | mask)); ci->i_dirty_caps |= mask; if (was == 0) { - if (!ci->i_head_snapc) + WARN_ON_ONCE(ci->i_prealloc_cap_flush); + swap(ci->i_prealloc_cap_flush, *pcf); + + if (!ci->i_head_snapc) { + WARN_ON_ONCE(!rwsem_is_locked(&mdsc->snap_rwsem)); ci->i_head_snapc = ceph_get_snap_context( ci->i_snap_realm->cached_context); + } dout(" inode %p now dirty snapc %p auth cap %p\n", &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap); - WARN_ON(!ci->i_auth_cap); BUG_ON(!list_empty(&ci->i_dirty_item)); spin_lock(&mdsc->cap_dirty_lock); list_add(&ci->i_dirty_item, &mdsc->cap_dirty); @@ -1413,6 +1397,8 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) ihold(inode); dirty |= I_DIRTY_SYNC; } + } else { + WARN_ON_ONCE(!ci->i_prealloc_cap_flush); } BUG_ON(list_empty(&ci->i_dirty_item)); if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) && @@ -1422,6 +1408,74 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) return dirty; } +static void __add_cap_flushing_to_inode(struct ceph_inode_info *ci, + struct ceph_cap_flush *cf) +{ + struct rb_node **p = &ci->i_cap_flush_tree.rb_node; + struct rb_node *parent = NULL; + struct ceph_cap_flush *other = NULL; + + while (*p) { + parent = *p; + other = rb_entry(parent, struct ceph_cap_flush, i_node); + + if (cf->tid < other->tid) + p = &(*p)->rb_left; + else if (cf->tid > other->tid) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&cf->i_node, parent, p); + rb_insert_color(&cf->i_node, &ci->i_cap_flush_tree); +} + +static void __add_cap_flushing_to_mdsc(struct ceph_mds_client *mdsc, + struct ceph_cap_flush *cf) +{ + struct rb_node **p = &mdsc->cap_flush_tree.rb_node; + struct rb_node *parent = NULL; + struct ceph_cap_flush *other = NULL; + + while (*p) { + parent = *p; + other = rb_entry(parent, struct ceph_cap_flush, g_node); + + if (cf->tid < other->tid) + p = &(*p)->rb_left; + else if (cf->tid > other->tid) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&cf->g_node, parent, p); + rb_insert_color(&cf->g_node, &mdsc->cap_flush_tree); +} + +struct ceph_cap_flush *ceph_alloc_cap_flush(void) +{ + return kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL); +} + +void ceph_free_cap_flush(struct ceph_cap_flush *cf) +{ + if (cf) + kmem_cache_free(ceph_cap_flush_cachep, cf); +} + +static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc) +{ + struct rb_node *n = rb_first(&mdsc->cap_flush_tree); + if (n) { + struct ceph_cap_flush *cf = + rb_entry(n, struct ceph_cap_flush, g_node); + return cf->tid; + } + return 0; +} + /* * Add dirty inode to the flushing list. Assigned a seq number so we * can wait for caps to flush without starving. @@ -1429,14 +1483,17 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) * Called under i_ceph_lock. 
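__add_cap_flushing_to_inode() and __add_cap_flushing_to_mdsc() above are the same ordered rbtree insert instantiated for the per-inode and per-client trees, and __get_oldest_flush_tid() is the matching leftmost lookup. A condensed sketch of that shape, assuming kernel context (<linux/rbtree.h>, <linux/list.h>); the flush_entry type is illustrative and is reused by later sketches in this section:

    struct flush_entry {
            struct rb_node node;
            struct list_head list;
            u64 tid;
            int caps;
    };

    static void flush_tree_insert(struct rb_root *root, struct flush_entry *e)
    {
            struct rb_node **p = &root->rb_node, *parent = NULL;

            while (*p) {
                    struct flush_entry *o =
                            rb_entry(*p, struct flush_entry, node);

                    parent = *p;
                    if (e->tid < o->tid)
                            p = &(*p)->rb_left;
                    else
                            p = &(*p)->rb_right;    /* TIDs are unique */
            }
            rb_link_node(&e->node, parent, p);
            rb_insert_color(&e->node, root);
    }

    /* oldest in-flight flush: the leftmost node, 0 when the tree is empty */
    static u64 flush_tree_oldest(struct rb_root *root)
    {
            struct rb_node *n = rb_first(root);

            return n ? rb_entry(n, struct flush_entry, node)->tid : 0;
    }

The patch BUG()s on an equal TID, which the single global counter guarantees cannot happen.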
*/ static int __mark_caps_flushing(struct inode *inode, - struct ceph_mds_session *session) + struct ceph_mds_session *session, + u64 *flush_tid, u64 *oldest_flush_tid) { struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_cap_flush *cf = NULL; int flushing; BUG_ON(ci->i_dirty_caps == 0); BUG_ON(list_empty(&ci->i_dirty_item)); + BUG_ON(!ci->i_prealloc_cap_flush); flushing = ci->i_dirty_caps; dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n", @@ -1447,22 +1504,30 @@ static int __mark_caps_flushing(struct inode *inode, ci->i_dirty_caps = 0; dout(" inode %p now !dirty\n", inode); + swap(cf, ci->i_prealloc_cap_flush); + cf->caps = flushing; + spin_lock(&mdsc->cap_dirty_lock); list_del_init(&ci->i_dirty_item); + cf->tid = ++mdsc->last_cap_flush_tid; + __add_cap_flushing_to_mdsc(mdsc, cf); + *oldest_flush_tid = __get_oldest_flush_tid(mdsc); + if (list_empty(&ci->i_flushing_item)) { - ci->i_cap_flush_seq = ++mdsc->cap_flush_seq; list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing); mdsc->num_cap_flushing++; - dout(" inode %p now flushing seq %lld\n", inode, - ci->i_cap_flush_seq); + dout(" inode %p now flushing tid %llu\n", inode, cf->tid); } else { list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing); - dout(" inode %p now flushing (more) seq %lld\n", inode, - ci->i_cap_flush_seq); + dout(" inode %p now flushing (more) tid %llu\n", + inode, cf->tid); } spin_unlock(&mdsc->cap_dirty_lock); + __add_cap_flushing_to_inode(ci, cf); + + *flush_tid = cf->tid; return flushing; } @@ -1508,6 +1573,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, struct ceph_mds_client *mdsc = fsc->mdsc; struct inode *inode = &ci->vfs_inode; struct ceph_cap *cap; + u64 flush_tid, oldest_flush_tid; int file_wanted, used, cap_used; int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */ int issued, implemented, want, retain, revoking, flushing = 0; @@ -1537,15 +1603,27 @@ retry: retry_locked: file_wanted = __ceph_caps_file_wanted(ci); used = __ceph_caps_used(ci); - want = file_wanted | used; issued = __ceph_caps_issued(ci, &implemented); revoking = implemented & ~issued; - retain = want | CEPH_CAP_PIN; + want = file_wanted; + retain = file_wanted | used | CEPH_CAP_PIN; if (!mdsc->stopping && inode->i_nlink > 0) { - if (want) { + if (file_wanted) { retain |= CEPH_CAP_ANY; /* be greedy */ + } else if (S_ISDIR(inode->i_mode) && + (issued & CEPH_CAP_FILE_SHARED) && + __ceph_dir_is_complete(ci)) { + /* + * If a directory is complete, we want to keep + * the exclusive cap. So that MDS does not end up + * revoking the shared cap on every create/unlink + * operation. + */ + want = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL; + retain |= want; } else { + retain |= CEPH_CAP_ANY_SHARED; /* * keep RD only if we didn't have the file open RW, @@ -1574,9 +1652,10 @@ retry_locked: * If we fail, it's because pages are locked.... try again later. */ if ((!is_delayed || mdsc->stopping) && - ci->i_wrbuffer_ref == 0 && /* no dirty pages... */ - inode->i_data.nrpages && /* have cached pages */ - (file_wanted == 0 || /* no open files */ + !S_ISDIR(inode->i_mode) && /* ignore readdir cache */ + ci->i_wrbuffer_ref == 0 && /* no dirty pages... 
*/ + inode->i_data.nrpages && /* have cached pages */ + (file_wanted == 0 || /* no open files */ (revoking & (CEPH_CAP_FILE_CACHE| CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */ !tried_invalidate) { @@ -1714,17 +1793,25 @@ ack: took_snap_rwsem = 1; } - if (cap == ci->i_auth_cap && ci->i_dirty_caps) - flushing = __mark_caps_flushing(inode, session); - else + if (cap == ci->i_auth_cap && ci->i_dirty_caps) { + flushing = __mark_caps_flushing(inode, session, + &flush_tid, + &oldest_flush_tid); + } else { flushing = 0; + flush_tid = 0; + spin_lock(&mdsc->cap_dirty_lock); + oldest_flush_tid = __get_oldest_flush_tid(mdsc); + spin_unlock(&mdsc->cap_dirty_lock); + } mds = cap->mds; /* remember mds, so we don't repeat */ sent++; /* __send_cap drops i_ceph_lock */ delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, cap_used, - want, retain, flushing, NULL); + want, retain, flushing, + flush_tid, oldest_flush_tid); goto retry; /* retake i_ceph_lock and restart our cap scan. */ } @@ -1753,12 +1840,13 @@ ack: /* * Try to flush dirty caps back to the auth mds. */ -static int try_flush_caps(struct inode *inode, unsigned *flush_tid) +static int try_flush_caps(struct inode *inode, u64 *ptid) { struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); - int flushing = 0; struct ceph_mds_session *session = NULL; + int flushing = 0; + u64 flush_tid = 0, oldest_flush_tid = 0; retry: spin_lock(&ci->i_ceph_lock); @@ -1783,42 +1871,54 @@ retry: if (cap->session->s_state < CEPH_MDS_SESSION_OPEN) goto out; - flushing = __mark_caps_flushing(inode, session); + flushing = __mark_caps_flushing(inode, session, &flush_tid, + &oldest_flush_tid); /* __send_cap drops i_ceph_lock */ delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want, - cap->issued | cap->implemented, flushing, - flush_tid); - if (!delayed) - goto out_unlocked; + (cap->issued | cap->implemented), + flushing, flush_tid, oldest_flush_tid); - spin_lock(&ci->i_ceph_lock); - __cap_delay_requeue(mdsc, ci); + if (delayed) { + spin_lock(&ci->i_ceph_lock); + __cap_delay_requeue(mdsc, ci); + spin_unlock(&ci->i_ceph_lock); + } + } else { + struct rb_node *n = rb_last(&ci->i_cap_flush_tree); + if (n) { + struct ceph_cap_flush *cf = + rb_entry(n, struct ceph_cap_flush, i_node); + flush_tid = cf->tid; + } + flushing = ci->i_flushing_caps; + spin_unlock(&ci->i_ceph_lock); } out: - spin_unlock(&ci->i_ceph_lock); -out_unlocked: if (session) mutex_unlock(&session->s_mutex); + + *ptid = flush_tid; return flushing; } /* * Return true if we've flushed caps through the given flush_tid. 
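Note the fallback in try_flush_caps() above: if the caps are already being flushed, no second flush is queued; rb_last() yields the newest queued TID so the caller can still wait for everything currently in flight. As a sketch, reusing the illustrative flush_entry type from earlier:

    static u64 newest_queued_tid(struct rb_root *tree)
    {
            struct rb_node *n = rb_last(tree);

            return n ? rb_entry(n, struct flush_entry, node)->tid : 0;
    }

caps_are_flushed() just below is the matching wait condition: TID t is flushed once no entry with tid <= t remains in the per-inode tree.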
*/ -static int caps_are_flushed(struct inode *inode, unsigned tid) +static int caps_are_flushed(struct inode *inode, u64 flush_tid) { struct ceph_inode_info *ci = ceph_inode(inode); - int i, ret = 1; + struct ceph_cap_flush *cf; + struct rb_node *n; + int ret = 1; spin_lock(&ci->i_ceph_lock); - for (i = 0; i < CEPH_CAP_BITS; i++) - if ((ci->i_flushing_caps & (1 << i)) && - ci->i_cap_flush_tid[i] <= tid) { - /* still flushing this bit */ + n = rb_first(&ci->i_cap_flush_tree); + if (n) { + cf = rb_entry(n, struct ceph_cap_flush, i_node); + if (cf->tid <= flush_tid) ret = 0; - break; - } + } spin_unlock(&ci->i_ceph_lock); return ret; } @@ -1836,13 +1936,16 @@ static void sync_write_wait(struct inode *inode) struct ceph_osd_request *req; u64 last_tid; + if (!S_ISREG(inode->i_mode)) + return; + spin_lock(&ci->i_unsafe_lock); if (list_empty(head)) goto out; /* set upper bound as _last_ entry in chain */ - req = list_entry(head->prev, struct ceph_osd_request, - r_unsafe_item); + req = list_last_entry(head, struct ceph_osd_request, + r_unsafe_item); last_tid = req->r_tid; do { @@ -1860,18 +1963,64 @@ static void sync_write_wait(struct inode *inode) */ if (list_empty(head)) break; - req = list_entry(head->next, struct ceph_osd_request, - r_unsafe_item); + req = list_first_entry(head, struct ceph_osd_request, + r_unsafe_item); + } while (req->r_tid < last_tid); +out: + spin_unlock(&ci->i_unsafe_lock); +} + +/* + * wait for any uncommitted directory operations to commit. + */ +static int unsafe_dirop_wait(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct list_head *head = &ci->i_unsafe_dirops; + struct ceph_mds_request *req; + u64 last_tid; + int ret = 0; + + if (!S_ISDIR(inode->i_mode)) + return 0; + + spin_lock(&ci->i_unsafe_lock); + if (list_empty(head)) + goto out; + + req = list_last_entry(head, struct ceph_mds_request, + r_unsafe_dir_item); + last_tid = req->r_tid; + + do { + ceph_mdsc_get_request(req); + spin_unlock(&ci->i_unsafe_lock); + + dout("unsafe_dirop_wait %p wait on tid %llu (until %llu)\n", + inode, req->r_tid, last_tid); + ret = !wait_for_completion_timeout(&req->r_safe_completion, + ceph_timeout_jiffies(req->r_timeout)); + if (ret) + ret = -EIO; /* timed out */ + + ceph_mdsc_put_request(req); + + spin_lock(&ci->i_unsafe_lock); + if (ret || list_empty(head)) + break; + req = list_first_entry(head, struct ceph_mds_request, + r_unsafe_dir_item); } while (req->r_tid < last_tid); out: spin_unlock(&ci->i_unsafe_lock); + return ret; } int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); - unsigned flush_tid; + u64 flush_tid; int ret; int dirty; @@ -1880,25 +2029,30 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret < 0) - return ret; + goto out; + + if (datasync) + goto out; + mutex_lock(&inode->i_mutex); dirty = try_flush_caps(inode, &flush_tid); dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); + ret = unsafe_dirop_wait(inode); + /* * only wait on non-file metadata writeback (the mds * can recover size and mtime, so we don't need to * wait for that) */ - if (!datasync && (dirty & ~CEPH_CAP_ANY_FILE_WR)) { - dout("fsync waiting for flush_tid %u\n", flush_tid); + if (!ret && (dirty & ~CEPH_CAP_ANY_FILE_WR)) { ret = wait_event_interruptible(ci->i_cap_wq, - caps_are_flushed(inode, flush_tid)); + caps_are_flushed(inode, flush_tid)); } - - 
dout("fsync %p%s done\n", inode, datasync ? " datasync" : ""); mutex_unlock(&inode->i_mutex); +out: + dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret); return ret; } @@ -1911,7 +2065,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) { struct ceph_inode_info *ci = ceph_inode(inode); - unsigned flush_tid; + u64 flush_tid; int err = 0; int dirty; int wait = wbc->sync_mode == WB_SYNC_ALL; @@ -1966,6 +2120,93 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc, } } +static int __kick_flushing_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct ceph_inode_info *ci) +{ + struct inode *inode = &ci->vfs_inode; + struct ceph_cap *cap; + struct ceph_cap_flush *cf; + struct rb_node *n; + int delayed = 0; + u64 first_tid = 0; + u64 oldest_flush_tid; + + spin_lock(&mdsc->cap_dirty_lock); + oldest_flush_tid = __get_oldest_flush_tid(mdsc); + spin_unlock(&mdsc->cap_dirty_lock); + + while (true) { + spin_lock(&ci->i_ceph_lock); + cap = ci->i_auth_cap; + if (!(cap && cap->session == session)) { + pr_err("%p auth cap %p not mds%d ???\n", inode, + cap, session->s_mds); + spin_unlock(&ci->i_ceph_lock); + break; + } + + for (n = rb_first(&ci->i_cap_flush_tree); n; n = rb_next(n)) { + cf = rb_entry(n, struct ceph_cap_flush, i_node); + if (cf->tid >= first_tid) + break; + } + if (!n) { + spin_unlock(&ci->i_ceph_lock); + break; + } + + cf = rb_entry(n, struct ceph_cap_flush, i_node); + + first_tid = cf->tid + 1; + + dout("kick_flushing_caps %p cap %p tid %llu %s\n", inode, + cap, cf->tid, ceph_cap_string(cf->caps)); + delayed |= __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, + __ceph_caps_used(ci), + __ceph_caps_wanted(ci), + cap->issued | cap->implemented, + cf->caps, cf->tid, oldest_flush_tid); + } + return delayed; +} + +void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_inode_info *ci; + struct ceph_cap *cap; + + dout("early_kick_flushing_caps mds%d\n", session->s_mds); + list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) { + spin_lock(&ci->i_ceph_lock); + cap = ci->i_auth_cap; + if (!(cap && cap->session == session)) { + pr_err("%p auth cap %p not mds%d ???\n", + &ci->vfs_inode, cap, session->s_mds); + spin_unlock(&ci->i_ceph_lock); + continue; + } + + + /* + * if flushing caps were revoked, we re-send the cap flush + * in client reconnect stage. This guarantees MDS * processes + * the cap flush message before issuing the flushing caps to + * other client. 
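__kick_flushing_caps() above has to drop i_ceph_lock for every message it sends, so it cannot keep an rbtree iterator alive across sends. It keeps a cursor TID instead and, after retaking the lock, resumes at the smallest entry with tid >= cursor. The walk in isolation, as a sketch (locking left to the caller, names illustrative):

    static struct flush_entry *first_from(struct rb_root *tree, u64 cursor)
    {
            struct rb_node *n;

            for (n = rb_first(tree); n; n = rb_next(n)) {
                    struct flush_entry *e =
                            rb_entry(n, struct flush_entry, node);

                    if (e->tid >= cursor)
                            return e;   /* smallest tid >= cursor */
            }
            return NULL;
    }

The caller copies what it needs, sets cursor = e->tid + 1, drops the lock, sends, and loops; entries inserted or erased while the lock was dropped are picked up or skipped naturally.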
+ */ + if ((cap->issued & ci->i_flushing_caps) != + ci->i_flushing_caps) { + spin_unlock(&ci->i_ceph_lock); + if (!__kick_flushing_caps(mdsc, session, ci)) + continue; + spin_lock(&ci->i_ceph_lock); + } + + spin_unlock(&ci->i_ceph_lock); + } +} + void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { @@ -1975,28 +2216,10 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, dout("kick_flushing_caps mds%d\n", session->s_mds); list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) { - struct inode *inode = &ci->vfs_inode; - struct ceph_cap *cap; - int delayed = 0; - - spin_lock(&ci->i_ceph_lock); - cap = ci->i_auth_cap; - if (cap && cap->session == session) { - dout("kick_flushing_caps %p cap %p %s\n", inode, - cap, ceph_cap_string(ci->i_flushing_caps)); - delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, - __ceph_caps_used(ci), - __ceph_caps_wanted(ci), - cap->issued | cap->implemented, - ci->i_flushing_caps, NULL); - if (delayed) { - spin_lock(&ci->i_ceph_lock); - __cap_delay_requeue(mdsc, ci); - spin_unlock(&ci->i_ceph_lock); - } - } else { - pr_err("%p auth cap %p not mds%d ???\n", inode, - cap, session->s_mds); + int delayed = __kick_flushing_caps(mdsc, session, ci); + if (delayed) { + spin_lock(&ci->i_ceph_lock); + __cap_delay_requeue(mdsc, ci); spin_unlock(&ci->i_ceph_lock); } } @@ -2008,26 +2231,25 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_cap *cap; - int delayed = 0; spin_lock(&ci->i_ceph_lock); cap = ci->i_auth_cap; - dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode, - ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq); + dout("kick_flushing_inode_caps %p flushing %s\n", inode, + ceph_cap_string(ci->i_flushing_caps)); __ceph_flush_snaps(ci, &session, 1); if (ci->i_flushing_caps) { + int delayed; + spin_lock(&mdsc->cap_dirty_lock); list_move_tail(&ci->i_flushing_item, &cap->session->s_cap_flushing); spin_unlock(&mdsc->cap_dirty_lock); - delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, - __ceph_caps_used(ci), - __ceph_caps_wanted(ci), - cap->issued | cap->implemented, - ci->i_flushing_caps, NULL); + spin_unlock(&ci->i_ceph_lock); + + delayed = __kick_flushing_caps(mdsc, session, ci); if (delayed) { spin_lock(&ci->i_ceph_lock); __cap_delay_requeue(mdsc, ci); @@ -2045,7 +2267,8 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, * * Protected by i_ceph_lock. */ -static void __take_cap_refs(struct ceph_inode_info *ci, int got) +static void __take_cap_refs(struct ceph_inode_info *ci, int got, + bool snap_rwsem_locked) { if (got & CEPH_CAP_PIN) ci->i_pin_ref++; @@ -2053,8 +2276,14 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got) ci->i_rd_ref++; if (got & CEPH_CAP_FILE_CACHE) ci->i_rdcache_ref++; - if (got & CEPH_CAP_FILE_WR) + if (got & CEPH_CAP_FILE_WR) { + if (ci->i_wr_ref == 0 && !ci->i_head_snapc) { + BUG_ON(!snap_rwsem_locked); + ci->i_head_snapc = ceph_get_snap_context( + ci->i_snap_realm->cached_context); + } ci->i_wr_ref++; + } if (got & CEPH_CAP_FILE_BUFFER) { if (ci->i_wb_ref == 0) ihold(&ci->vfs_inode); @@ -2072,16 +2301,19 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got) * requested from the MDS. 
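In __take_cap_refs() above, the first CEPH_CAP_FILE_WR reference now pins the head snap context, and the new BUG_ON records the invariant that snap_rwsem must be held so i_snap_realm->cached_context cannot change underneath it. Reduced to a sketch (both types and the getter are stand-ins, not ceph names):

    struct snap_ctx {
            atomic_t refs;
    };

    static struct snap_ctx *snap_ctx_get(struct snap_ctx *c)
    {
            atomic_inc(&c->refs);
            return c;
    }

    struct wr_state {
            int wr_ref;
            struct snap_ctx *head;
    };

    static void take_wr_ref(struct wr_state *s, struct snap_ctx *cached,
                            bool snap_rwsem_locked)
    {
            if (s->wr_ref == 0 && !s->head) {
                    BUG_ON(!snap_rwsem_locked); /* keeps 'cached' stable */
                    s->head = snap_ctx_get(cached);
            }
            s->wr_ref++;
    }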
*/ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, - loff_t endoff, int *got, int *check_max, int *err) + loff_t endoff, bool nonblock, int *got, int *err) { struct inode *inode = &ci->vfs_inode; + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; int ret = 0; int have, implemented; int file_wanted; + bool snap_rwsem_locked = false; dout("get_cap_refs %p need %s want %s\n", inode, ceph_cap_string(need), ceph_cap_string(want)); +again: spin_lock(&ci->i_ceph_lock); /* make sure file is actually open */ @@ -2097,6 +2329,10 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, /* finish pending truncate */ while (ci->i_truncate_pending) { spin_unlock(&ci->i_ceph_lock); + if (snap_rwsem_locked) { + up_read(&mdsc->snap_rwsem); + snap_rwsem_locked = false; + } __ceph_do_pending_vmtruncate(inode); spin_lock(&ci->i_ceph_lock); } @@ -2108,7 +2344,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, dout("get_cap_refs %p endoff %llu > maxsize %llu\n", inode, endoff, ci->i_max_size); if (endoff > ci->i_requested_max_size) { - *check_max = 1; + *err = -EAGAIN; ret = 1; } goto out_unlock; @@ -2136,8 +2372,29 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, inode, ceph_cap_string(have), ceph_cap_string(not), ceph_cap_string(revoking)); if ((revoking & not) == 0) { + if (!snap_rwsem_locked && + !ci->i_head_snapc && + (need & CEPH_CAP_FILE_WR)) { + if (!down_read_trylock(&mdsc->snap_rwsem)) { + /* + * we can not call down_read() when + * task isn't in TASK_RUNNING state + */ + if (nonblock) { + *err = -EAGAIN; + ret = 1; + goto out_unlock; + } + + spin_unlock(&ci->i_ceph_lock); + down_read(&mdsc->snap_rwsem); + snap_rwsem_locked = true; + goto again; + } + snap_rwsem_locked = true; + } *got = need | (have & want); - __take_cap_refs(ci, *got); + __take_cap_refs(ci, *got, true); ret = 1; } } else { @@ -2161,6 +2418,8 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, } out_unlock: spin_unlock(&ci->i_ceph_lock); + if (snap_rwsem_locked) + up_read(&mdsc->snap_rwsem); dout("get_cap_refs %p ret %d got %s\n", inode, ret, ceph_cap_string(*got)); @@ -2203,50 +2462,70 @@ static void check_max_size(struct inode *inode, loff_t endoff) int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, loff_t endoff, int *got, struct page **pinned_page) { - int _got, check_max, ret, err = 0; + int _got, ret, err = 0; -retry: - if (endoff > 0) - check_max_size(&ci->vfs_inode, endoff); - _got = 0; - check_max = 0; - ret = wait_event_interruptible(ci->i_cap_wq, - try_get_cap_refs(ci, need, want, endoff, - &_got, &check_max, &err)); - if (err) - ret = err; + ret = ceph_pool_perm_check(ci, need); if (ret < 0) return ret; - if (check_max) - goto retry; + while (true) { + if (endoff > 0) + check_max_size(&ci->vfs_inode, endoff); - if (ci->i_inline_version != CEPH_INLINE_NONE && - (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && - i_size_read(&ci->vfs_inode) > 0) { - struct page *page = find_get_page(ci->vfs_inode.i_mapping, 0); - if (page) { - if (PageUptodate(page)) { - *pinned_page = page; - goto out; - } - page_cache_release(page); - } - /* - * drop cap refs first because getattr while holding - * caps refs can cause deadlock. 
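try_get_cap_refs() above runs both as a direct call and as the condition of wait_event_interruptible(). In the second case the task is no longer in TASK_RUNNING, so the function may only down_read_trylock(&mdsc->snap_rwsem); when that fails it reports -EAGAIN through *err and lets the caller retry where blocking is legal. The split in isolation, as a sketch with illustrative names:

    /* Returns nonzero when the wait should end. *err == -EAGAIN asks the
     * caller to retry from a context where blocking on 'sem' is allowed. */
    static int try_take(struct rw_semaphore *sem, int *refs,
                        bool nonblock, int *err)
    {
            if (!down_read_trylock(sem)) {
                    if (nonblock) {
                            *err = -EAGAIN;
                            return 1;
                    }
                    down_read(sem);     /* process context, may sleep */
            }
            (*refs)++;                  /* the work done under the lock */
            up_read(sem);
            *err = 0;
            return 1;
    }

This mirrors the patch's calling convention: ceph_get_caps() first tries with nonblock false, and only the wait_event_interruptible() retry path passes nonblock true and loops on -EAGAIN.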
- */ - ceph_put_cap_refs(ci, _got); + err = 0; _got = 0; + ret = try_get_cap_refs(ci, need, want, endoff, + false, &_got, &err); + if (ret) { + if (err == -EAGAIN) + continue; + if (err < 0) + return err; + } else { + ret = wait_event_interruptible(ci->i_cap_wq, + try_get_cap_refs(ci, need, want, endoff, + true, &_got, &err)); + if (err == -EAGAIN) + continue; + if (err < 0) + ret = err; + if (ret < 0) + return ret; + } - /* getattr request will bring inline data into page cache */ - ret = __ceph_do_getattr(&ci->vfs_inode, NULL, - CEPH_STAT_CAP_INLINE_DATA, true); - if (ret < 0) - return ret; - goto retry; + if (ci->i_inline_version != CEPH_INLINE_NONE && + (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && + i_size_read(&ci->vfs_inode) > 0) { + struct page *page = + find_get_page(ci->vfs_inode.i_mapping, 0); + if (page) { + if (PageUptodate(page)) { + *pinned_page = page; + break; + } + page_cache_release(page); + } + /* + * drop cap refs first because getattr while + * holding * caps refs can cause deadlock. + */ + ceph_put_cap_refs(ci, _got); + _got = 0; + + /* + * getattr request will bring inline data into + * page cache + */ + ret = __ceph_do_getattr(&ci->vfs_inode, NULL, + CEPH_STAT_CAP_INLINE_DATA, + true); + if (ret < 0) + return ret; + continue; + } + break; } -out: + *got = _got; return 0; } @@ -2258,10 +2537,31 @@ out: void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps) { spin_lock(&ci->i_ceph_lock); - __take_cap_refs(ci, caps); + __take_cap_refs(ci, caps, false); spin_unlock(&ci->i_ceph_lock); } + +/* + * drop cap_snap that is not associated with any snapshot. + * we don't need to send FLUSHSNAP message for it. + */ +static int ceph_try_drop_cap_snap(struct ceph_cap_snap *capsnap) +{ + if (!capsnap->need_flush && + !capsnap->writing && !capsnap->dirty_pages) { + + dout("dropping cap_snap %p follows %llu\n", + capsnap, capsnap->follows); + ceph_put_snap_context(capsnap->context); + list_del(&capsnap->ci_item); + list_del(&capsnap->flushing_item); + ceph_put_cap_snap(capsnap); + return 1; + } + return 0; +} + /* * Release cap refs. 
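The inline-data path above pins page 0 only if it is already uptodate; otherwise it drops its cap references first, because issuing a getattr while holding them can deadlock, and fetches the inline data via getattr instead. The pinning step alone, as a sketch (page_cache_release() spelling as used elsewhere in this diff):

    static struct page *try_pin_first_page(struct address_space *mapping)
    {
            struct page *page = find_get_page(mapping, 0);

            if (page && !PageUptodate(page)) {
                    page_cache_release(page);   /* not usable yet */
                    page = NULL;
            }
            return page;        /* caller releases the pin when done */
    }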
* @@ -2275,7 +2575,6 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) { struct inode *inode = &ci->vfs_inode; int last = 0, put = 0, flushsnaps = 0, wake = 0; - struct ceph_cap_snap *capsnap; spin_lock(&ci->i_ceph_lock); if (had & CEPH_CAP_PIN) @@ -2297,18 +2596,28 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) if (had & CEPH_CAP_FILE_WR) if (--ci->i_wr_ref == 0) { last++; - if (!list_empty(&ci->i_cap_snaps)) { - capsnap = list_first_entry(&ci->i_cap_snaps, - struct ceph_cap_snap, - ci_item); - if (capsnap->writing) { - capsnap->writing = 0; - flushsnaps = - __ceph_finish_cap_snap(ci, - capsnap); - wake = 1; - } + if (__ceph_have_pending_cap_snap(ci)) { + struct ceph_cap_snap *capsnap = + list_last_entry(&ci->i_cap_snaps, + struct ceph_cap_snap, + ci_item); + capsnap->writing = 0; + if (ceph_try_drop_cap_snap(capsnap)) + put++; + else if (__ceph_finish_cap_snap(ci, capsnap)) + flushsnaps = 1; + wake = 1; } + if (ci->i_wrbuffer_ref_head == 0 && + ci->i_dirty_caps == 0 && + ci->i_flushing_caps == 0) { + BUG_ON(!ci->i_head_snapc); + ceph_put_snap_context(ci->i_head_snapc); + ci->i_head_snapc = NULL; + } + /* see comment in __ceph_remove_cap() */ + if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) + drop_inode_snap_realm(ci); } spin_unlock(&ci->i_ceph_lock); @@ -2321,7 +2630,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) ceph_flush_snaps(ci); if (wake) wake_up_all(&ci->i_cap_wq); - if (put) + while (put-- > 0) iput(inode); } @@ -2349,7 +2658,9 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, if (ci->i_head_snapc == snapc) { ci->i_wrbuffer_ref_head -= nr; if (ci->i_wrbuffer_ref_head == 0 && - ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) { + ci->i_wr_ref == 0 && + ci->i_dirty_caps == 0 && + ci->i_flushing_caps == 0) { BUG_ON(!ci->i_head_snapc); ceph_put_snap_context(ci->i_head_snapc); ci->i_head_snapc = NULL; @@ -2370,25 +2681,15 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, capsnap->dirty_pages -= nr; if (capsnap->dirty_pages == 0) { complete_capsnap = 1; - if (capsnap->dirty == 0) - /* cap writeback completed before we created - * the cap_snap; no FLUSHSNAP is needed */ - drop_capsnap = 1; + drop_capsnap = ceph_try_drop_cap_snap(capsnap); } dout("put_wrbuffer_cap_refs on %p cap_snap %p " - " snap %lld %d/%d -> %d/%d %s%s%s\n", + " snap %lld %d/%d -> %d/%d %s%s\n", inode, capsnap, capsnap->context->seq, ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr, ci->i_wrbuffer_ref, capsnap->dirty_pages, last ? " (wrbuffer last)" : "", - complete_capsnap ? " (complete capsnap)" : "", - drop_capsnap ? " (drop capsnap)" : ""); - if (drop_capsnap) { - ceph_put_snap_context(capsnap->context); - list_del(&capsnap->ci_item); - list_del(&capsnap->flushing_item); - ceph_put_cap_snap(capsnap); - } + complete_capsnap ? " (complete capsnap)" : ""); } spin_unlock(&ci->i_ceph_lock); @@ -2495,7 +2796,8 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, * try to invalidate (once). (If there are dirty buffers, we * will invalidate _after_ writeback.) 
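ceph_put_cap_refs() above now tallies, under i_ceph_lock, how many inode references it owes and issues the sleepable iput() calls only after unlocking; the change from if (put) to a loop matters because dropping the last write reference can now release both a capsnap reference and the writeback reference in one call. The shape, as a sketch:

    static void drop_refs(struct inode *inode, spinlock_t *lock,
                          int *wb_ref, int *snap_ref)
    {
            int put = 0;

            spin_lock(lock);
            if (--(*wb_ref) == 0)
                    put++;
            if (*snap_ref && --(*snap_ref) == 0)
                    put++;              /* may owe more than one iput */
            spin_unlock(lock);

            while (put-- > 0)
                    iput(inode);        /* iput can sleep: never call it locked */
    }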
*/ - if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && + if (!S_ISDIR(inode->i_mode) && /* don't invalidate readdir cache */ + ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && !ci->i_wrbuffer_ref) { if (try_nonblocking_invalidate(inode)) { @@ -2701,16 +3003,29 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_cap_flush *cf; + struct rb_node *n; + LIST_HEAD(to_remove); unsigned seq = le32_to_cpu(m->seq); int dirty = le32_to_cpu(m->dirty); int cleaned = 0; int drop = 0; - int i; - for (i = 0; i < CEPH_CAP_BITS; i++) - if ((dirty & (1 << i)) && - (u16)flush_tid == ci->i_cap_flush_tid[i]) - cleaned |= 1 << i; + n = rb_first(&ci->i_cap_flush_tree); + while (n) { + cf = rb_entry(n, struct ceph_cap_flush, i_node); + n = rb_next(&cf->i_node); + if (cf->tid == flush_tid) + cleaned = cf->caps; + if (cf->tid <= flush_tid) { + rb_erase(&cf->i_node, &ci->i_cap_flush_tree); + list_add_tail(&cf->list, &to_remove); + } else { + cleaned &= ~cf->caps; + if (!cleaned) + break; + } + } dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s," " flushing %s -> %s\n", @@ -2718,12 +3033,23 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps), ceph_cap_string(ci->i_flushing_caps & ~cleaned)); - if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned)) + if (list_empty(&to_remove) && !cleaned) goto out; ci->i_flushing_caps &= ~cleaned; spin_lock(&mdsc->cap_dirty_lock); + + if (!list_empty(&to_remove)) { + list_for_each_entry(cf, &to_remove, list) + rb_erase(&cf->g_node, &mdsc->cap_flush_tree); + + n = rb_first(&mdsc->cap_flush_tree); + cf = n ? 
rb_entry(n, struct ceph_cap_flush, g_node) : NULL; + if (!cf || cf->tid > flush_tid) + wake_up_all(&mdsc->cap_flushing_wq); + } + if (ci->i_flushing_caps == 0) { list_del_init(&ci->i_flushing_item); if (!list_empty(&session->s_cap_flushing)) @@ -2733,14 +3059,14 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, struct ceph_inode_info, i_flushing_item)->vfs_inode); mdsc->num_cap_flushing--; - wake_up_all(&mdsc->cap_flushing_wq); dout(" inode %p now !flushing\n", inode); if (ci->i_dirty_caps == 0) { dout(" inode %p now clean\n", inode); BUG_ON(!list_empty(&ci->i_dirty_item)); drop = 1; - if (ci->i_wrbuffer_ref_head == 0) { + if (ci->i_wr_ref == 0 && + ci->i_wrbuffer_ref_head == 0) { BUG_ON(!ci->i_head_snapc); ceph_put_snap_context(ci->i_head_snapc); ci->i_head_snapc = NULL; @@ -2754,6 +3080,13 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, out: spin_unlock(&ci->i_ceph_lock); + + while (!list_empty(&to_remove)) { + cf = list_first_entry(&to_remove, + struct ceph_cap_flush, list); + list_del(&cf->list); + ceph_free_cap_flush(cf); + } if (drop) iput(inode); } @@ -2769,6 +3102,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, struct ceph_mds_session *session) { struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; u64 follows = le64_to_cpu(m->snap_follows); struct ceph_cap_snap *capsnap; int drop = 0; @@ -2792,6 +3126,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, list_del(&capsnap->ci_item); list_del(&capsnap->flushing_item); ceph_put_cap_snap(capsnap); + wake_up_all(&mdsc->cap_flushing_wq); drop = 1; break; } else { @@ -2940,7 +3275,6 @@ retry: mutex_lock_nested(&session->s_mutex, SINGLE_DEPTH_NESTING); } - ceph_add_cap_releases(mdsc, tsession); new_cap = ceph_get_cap(mdsc, NULL); } else { WARN_ON(1); @@ -3136,16 +3470,20 @@ void ceph_handle_caps(struct ceph_mds_session *session, dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, (unsigned)seq); - if (op == CEPH_CAP_OP_IMPORT) - ceph_add_cap_releases(mdsc, session); - if (!inode) { dout(" i don't have ino %llx\n", vino.ino); if (op == CEPH_CAP_OP_IMPORT) { + cap = ceph_get_cap(mdsc, NULL); + cap->cap_ino = vino.ino; + cap->queue_release = 1; + cap->cap_id = cap_id; + cap->mseq = mseq; + cap->seq = seq; spin_lock(&session->s_cap_lock); - __queue_cap_release(session, vino.ino, cap_id, - mseq, seq); + list_add_tail(&cap->session_caps, + &session->s_cap_releases); + session->s_num_cap_releases++; spin_unlock(&session->s_cap_lock); } goto flush_cap_releases; @@ -3221,11 +3559,10 @@ void ceph_handle_caps(struct ceph_mds_session *session, flush_cap_releases: /* - * send any full release message to try to move things + * send any cap release message to try to move things * along for the mds (who clearly thinks we still have this * cap). 
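handle_cap_flush_ack() above never frees a ceph_cap_flush under a spinlock: acked entries are unlinked from both trees onto a private to_remove list while locked, and freed after the locks are dropped. The same pattern reduced to one tree, reusing the illustrative flush_entry type (kfree() standing in for the slab free):

    static void ack_through(struct rb_root *tree, spinlock_t *lock,
                            u64 flush_tid)
    {
            struct flush_entry *e;
            struct rb_node *n;
            LIST_HEAD(to_remove);

            spin_lock(lock);
            n = rb_first(tree);
            while (n) {
                    e = rb_entry(n, struct flush_entry, node);
                    n = rb_next(n);             /* advance before erasing */
                    if (e->tid > flush_tid)
                            break;
                    rb_erase(&e->node, tree);
                    list_add_tail(&e->list, &to_remove);
            }
            spin_unlock(lock);

            while (!list_empty(&to_remove)) {
                    e = list_first_entry(&to_remove, struct flush_entry, list);
                    list_del(&e->list);
                    kfree(e);
            }
    }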
*/ - ceph_add_cap_releases(mdsc, session); ceph_send_cap_releases(mdsc, session); done: @@ -3391,7 +3728,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode, int ceph_encode_dentry_release(void **p, struct dentry *dentry, int mds, int drop, int unless) { - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); struct ceph_mds_request_release *rel = *p; struct ceph_dentry_info *di = ceph_dentry(dentry); int force = 0; diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 1b2355109b9f..31f831471ed2 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -84,7 +84,7 @@ static int mdsc_show(struct seq_file *s, void *p) path = NULL; spin_lock(&req->r_dentry->d_lock); seq_printf(s, " #%llx/%pd (%s)", - ceph_ino(req->r_dentry->d_parent->d_inode), + ceph_ino(d_inode(req->r_dentry->d_parent)), req->r_dentry, path ? path : ""); spin_unlock(&req->r_dentry->d_lock); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 83e9976f7189..9314b4ea2375 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -38,7 +38,7 @@ int ceph_init_dentry(struct dentry *dentry) if (dentry->d_fsdata) return 0; - di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO); + di = kmem_cache_alloc(ceph_dentry_cachep, GFP_KERNEL | __GFP_ZERO); if (!di) return -ENOMEM; /* oh well */ @@ -49,9 +49,9 @@ int ceph_init_dentry(struct dentry *dentry) goto out_unlock; } - if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) + if (ceph_snap(d_inode(dentry->d_parent)) == CEPH_NOSNAP) d_set_d_op(dentry, &ceph_dentry_ops); - else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR) + else if (ceph_snap(d_inode(dentry->d_parent)) == CEPH_SNAPDIR) d_set_d_op(dentry, &ceph_snapdir_dentry_ops); else d_set_d_op(dentry, &ceph_snap_dentry_ops); @@ -77,7 +77,7 @@ struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry) spin_lock(&dentry->d_lock); if (!IS_ROOT(dentry)) { - inode = dentry->d_parent->d_inode; + inode = d_inode(dentry->d_parent); ihold(inode); } spin_unlock(&dentry->d_lock); @@ -107,6 +107,27 @@ static int fpos_cmp(loff_t l, loff_t r) } /* + * make note of the last dentry we read, so we can + * continue at the same lexicographical point, + * regardless of what dir changes take place on the + * server. + */ +static int note_last_dentry(struct ceph_file_info *fi, const char *name, + int len, unsigned next_offset) +{ + char *buf = kmalloc(len+1, GFP_KERNEL); + if (!buf) + return -ENOMEM; + kfree(fi->last_name); + fi->last_name = buf; + memcpy(fi->last_name, name, len); + fi->last_name[len] = 0; + fi->next_offset = next_offset; + dout("note_last_dentry '%s'\n", fi->last_name); + return 0; +} + +/* * When possible, we try to satisfy a readdir by peeking at the * dcache. 
We make this work by carefully ordering dentries on * d_child when we initially get results back from the MDS, and @@ -122,124 +143,114 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, { struct ceph_file_info *fi = file->private_data; struct dentry *parent = file->f_path.dentry; - struct inode *dir = parent->d_inode; - struct list_head *p; - struct dentry *dentry, *last; + struct inode *dir = d_inode(parent); + struct dentry *dentry, *last = NULL; struct ceph_dentry_info *di; + unsigned nsize = PAGE_CACHE_SIZE / sizeof(struct dentry *); int err = 0; + loff_t ptr_pos = 0; + struct ceph_readdir_cache_control cache_ctl = {}; - /* claim ref on last dentry we returned */ - last = fi->dentry; - fi->dentry = NULL; - - dout("__dcache_readdir %p v%u at %llu (last %p)\n", - dir, shared_gen, ctx->pos, last); + dout("__dcache_readdir %p v%u at %llu\n", dir, shared_gen, ctx->pos); - spin_lock(&parent->d_lock); - - /* start at beginning? */ - if (ctx->pos == 2 || last == NULL || - fpos_cmp(ctx->pos, ceph_dentry(last)->offset) < 0) { - if (list_empty(&parent->d_subdirs)) - goto out_unlock; - p = parent->d_subdirs.prev; - dout(" initial p %p/%p\n", p->prev, p->next); - } else { - p = last->d_child.prev; + /* we can calculate cache index for the first dirfrag */ + if (ceph_frag_is_leftmost(fpos_frag(ctx->pos))) { + cache_ctl.index = fpos_off(ctx->pos) - 2; + BUG_ON(cache_ctl.index < 0); + ptr_pos = cache_ctl.index * sizeof(struct dentry *); } -more: - dentry = list_entry(p, struct dentry, d_child); - di = ceph_dentry(dentry); - while (1) { - dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next, - d_unhashed(dentry) ? "!hashed" : "hashed", - parent->d_subdirs.prev, parent->d_subdirs.next); - if (p == &parent->d_subdirs) { + while (true) { + pgoff_t pgoff; + bool emit_dentry; + + if (ptr_pos >= i_size_read(dir)) { fi->flags |= CEPH_F_ATEND; - goto out_unlock; - } - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - if (di->lease_shared_gen == shared_gen && - !d_unhashed(dentry) && dentry->d_inode && - ceph_snap(dentry->d_inode) != CEPH_SNAPDIR && - ceph_ino(dentry->d_inode) != CEPH_INO_CEPH && - fpos_cmp(ctx->pos, di->offset) <= 0) + err = 0; break; - dout(" skipping %p %pd at %llu (%llu)%s%s\n", dentry, - dentry, di->offset, - ctx->pos, d_unhashed(dentry) ? " unhashed" : "", - !dentry->d_inode ? 
" null" : ""); - spin_unlock(&dentry->d_lock); - p = p->prev; - dentry = list_entry(p, struct dentry, d_child); - di = ceph_dentry(dentry); - } - - dget_dlock(dentry); - spin_unlock(&dentry->d_lock); - spin_unlock(&parent->d_lock); + } - /* make sure a dentry wasn't dropped while we didn't have parent lock */ - if (!ceph_dir_is_complete_ordered(dir)) { - dout(" lost dir complete on %p; falling back to mds\n", dir); - dput(dentry); err = -EAGAIN; - goto out; - } - - dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos, - dentry, dentry, dentry->d_inode); - if (!dir_emit(ctx, dentry->d_name.name, - dentry->d_name.len, - ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino), - dentry->d_inode->i_mode >> 12)) { - if (last) { - /* remember our position */ - fi->dentry = last; - fi->next_offset = fpos_off(di->offset); + pgoff = ptr_pos >> PAGE_CACHE_SHIFT; + if (!cache_ctl.page || pgoff != page_index(cache_ctl.page)) { + ceph_readdir_cache_release(&cache_ctl); + cache_ctl.page = find_lock_page(&dir->i_data, pgoff); + if (!cache_ctl.page) { + dout(" page %lu not found\n", pgoff); + break; + } + /* reading/filling the cache are serialized by + * i_mutex, no need to use page lock */ + unlock_page(cache_ctl.page); + cache_ctl.dentries = kmap(cache_ctl.page); } - dput(dentry); - return 0; - } - ctx->pos = di->offset + 1; + rcu_read_lock(); + spin_lock(&parent->d_lock); + /* check i_size again here, because empty directory can be + * marked as complete while not holding the i_mutex. */ + if (ceph_dir_is_complete_ordered(dir) && + ptr_pos < i_size_read(dir)) + dentry = cache_ctl.dentries[cache_ctl.index % nsize]; + else + dentry = NULL; + spin_unlock(&parent->d_lock); + if (dentry && !lockref_get_not_dead(&dentry->d_lockref)) + dentry = NULL; + rcu_read_unlock(); + if (!dentry) + break; + + emit_dentry = false; + di = ceph_dentry(dentry); + spin_lock(&dentry->d_lock); + if (di->lease_shared_gen == shared_gen && + d_really_is_positive(dentry) && + ceph_snap(d_inode(dentry)) != CEPH_SNAPDIR && + ceph_ino(d_inode(dentry)) != CEPH_INO_CEPH && + fpos_cmp(ctx->pos, di->offset) <= 0) { + emit_dentry = true; + } + spin_unlock(&dentry->d_lock); - if (last) - dput(last); - last = dentry; + if (emit_dentry) { + dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos, + dentry, dentry, d_inode(dentry)); + ctx->pos = di->offset; + if (!dir_emit(ctx, dentry->d_name.name, + dentry->d_name.len, + ceph_translate_ino(dentry->d_sb, + d_inode(dentry)->i_ino), + d_inode(dentry)->i_mode >> 12)) { + dput(dentry); + err = 0; + break; + } + ctx->pos++; - spin_lock(&parent->d_lock); - p = p->prev; /* advance to next dentry */ - goto more; + if (last) + dput(last); + last = dentry; + } else { + dput(dentry); + } -out_unlock: - spin_unlock(&parent->d_lock); -out: - if (last) + cache_ctl.index++; + ptr_pos += sizeof(struct dentry *); + } + ceph_readdir_cache_release(&cache_ctl); + if (last) { + int ret; + di = ceph_dentry(last); + ret = note_last_dentry(fi, last->d_name.name, last->d_name.len, + fpos_off(di->offset) + 1); + if (ret < 0) + err = ret; dput(last); + } return err; } -/* - * make note of the last dentry we read, so we can - * continue at the same lexicographical point, - * regardless of what dir changes take place on the - * server. 
- */ -static int note_last_dentry(struct ceph_file_info *fi, const char *name, - int len) -{ - kfree(fi->last_name); - fi->last_name = kmalloc(len+1, GFP_NOFS); - if (!fi->last_name) - return -ENOMEM; - memcpy(fi->last_name, name, len); - fi->last_name[len] = 0; - dout("note_last_dentry '%s'\n", fi->last_name); - return 0; -} - static int ceph_readdir(struct file *file, struct dir_context *ctx) { struct ceph_file_info *fi = file->private_data; @@ -280,7 +291,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) /* can we use the dcache? */ spin_lock(&ci->i_ceph_lock); - if ((ctx->pos == 2 || fi->dentry) && + if (ceph_test_mount_opt(fsc, DCACHE) && !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && ceph_snap(inode) != CEPH_SNAPDIR && __ceph_dir_is_complete_ordered(ci) && @@ -295,24 +306,8 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) } else { spin_unlock(&ci->i_ceph_lock); } - if (fi->dentry) { - err = note_last_dentry(fi, fi->dentry->d_name.name, - fi->dentry->d_name.len); - if (err) - return err; - dput(fi->dentry); - fi->dentry = NULL; - } /* proceed with a normal readdir */ - - if (ctx->pos == 2) { - /* note dir version at start of readdir so we can tell - * if any dentries get dropped */ - fi->dir_release_count = atomic_read(&ci->i_release_count); - fi->dir_ordered_count = ci->i_ordered_count; - } - more: /* do we have the correct frag content buffered? */ if (fi->frag != frag || fi->last_readdir == NULL) { @@ -336,16 +331,26 @@ more: ceph_mdsc_put_request(req); return err; } - req->r_inode = inode; - ihold(inode); - req->r_dentry = dget(file->f_path.dentry); /* hints to request -> mds selection code */ req->r_direct_mode = USE_AUTH_MDS; req->r_direct_hash = ceph_frag_value(frag); req->r_direct_is_hash = true; - req->r_path2 = kstrdup(fi->last_name, GFP_NOFS); + if (fi->last_name) { + req->r_path2 = kstrdup(fi->last_name, GFP_KERNEL); + if (!req->r_path2) { + ceph_mdsc_put_request(req); + return -ENOMEM; + } + } + req->r_dir_release_cnt = fi->dir_release_count; + req->r_dir_ordered_cnt = fi->dir_ordered_count; + req->r_readdir_cache_idx = fi->readdir_cache_idx; req->r_readdir_offset = fi->next_offset; req->r_args.readdir.frag = cpu_to_le32(frag); + + req->r_inode = inode; + ihold(inode); + req->r_dentry = dget(file->f_path.dentry); err = ceph_mdsc_do_request(mdsc, NULL, req); if (err < 0) { ceph_mdsc_put_request(req); @@ -356,26 +361,38 @@ more: (int)req->r_reply_info.dir_end, (int)req->r_reply_info.dir_complete); - if (!req->r_did_prepopulate) { - dout("readdir !did_prepopulate"); - /* preclude from marking dir complete */ - fi->dir_release_count--; - } /* note next offset and last dentry name */ rinfo = &req->r_reply_info; if (le32_to_cpu(rinfo->dir_dir->frag) != frag) { frag = le32_to_cpu(rinfo->dir_dir->frag); - if (ceph_frag_is_leftmost(frag)) - fi->next_offset = 2; - else - fi->next_offset = 0; - off = fi->next_offset; + off = req->r_readdir_offset; + fi->next_offset = off; } + fi->frag = frag; fi->offset = fi->next_offset; fi->last_readdir = req; + if (req->r_did_prepopulate) { + fi->readdir_cache_idx = req->r_readdir_cache_idx; + if (fi->readdir_cache_idx < 0) { + /* preclude from marking dir ordered */ + fi->dir_ordered_count = 0; + } else if (ceph_frag_is_leftmost(frag) && off == 2) { + /* note dir version at start of readdir so + * we can tell if any dentries get dropped */ + fi->dir_release_count = req->r_dir_release_cnt; + fi->dir_ordered_count = req->r_dir_ordered_cnt; + } + } else { + dout("readdir !did_prepopulate"); + /* disable 
readdir cache */ + fi->readdir_cache_idx = -1; + /* preclude from marking dir complete */ + fi->dir_release_count = 0; + } + if (req->r_reply_info.dir_end) { kfree(fi->last_name); fi->last_name = NULL; @@ -386,10 +403,10 @@ more: } else { err = note_last_dentry(fi, rinfo->dir_dname[rinfo->dir_nr-1], - rinfo->dir_dname_len[rinfo->dir_nr-1]); + rinfo->dir_dname_len[rinfo->dir_nr-1], + fi->next_offset + rinfo->dir_nr); if (err) return err; - fi->next_offset += rinfo->dir_nr; } } @@ -445,16 +462,22 @@ more: * were released during the whole readdir, and we should have * the complete dir contents in our cache. */ - spin_lock(&ci->i_ceph_lock); - if (atomic_read(&ci->i_release_count) == fi->dir_release_count) { - if (ci->i_ordered_count == fi->dir_ordered_count) + if (atomic64_read(&ci->i_release_count) == fi->dir_release_count) { + spin_lock(&ci->i_ceph_lock); + if (fi->dir_ordered_count == atomic64_read(&ci->i_ordered_count)) { dout(" marking %p complete and ordered\n", inode); - else + /* use i_size to track number of entries in + * readdir cache */ + BUG_ON(fi->readdir_cache_idx < 0); + i_size_write(inode, fi->readdir_cache_idx * + sizeof(struct dentry*)); + } else { dout(" marking %p complete\n", inode); + } __ceph_dir_set_complete(ci, fi->dir_release_count, fi->dir_ordered_count); + spin_unlock(&ci->i_ceph_lock); } - spin_unlock(&ci->i_ceph_lock); dout("readdir %p file %p done.\n", inode, file); return 0; @@ -468,14 +491,12 @@ static void reset_readdir(struct ceph_file_info *fi, unsigned frag) } kfree(fi->last_name); fi->last_name = NULL; + fi->dir_release_count = 0; + fi->readdir_cache_idx = -1; if (ceph_frag_is_leftmost(frag)) fi->next_offset = 2; /* compensate for . and .. */ else fi->next_offset = 0; - if (fi->dentry) { - dput(fi->dentry); - fi->dentry = NULL; - } fi->flags &= ~CEPH_F_ATEND; } @@ -489,13 +510,12 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) mutex_lock(&inode->i_mutex); retval = -EINVAL; switch (whence) { - case SEEK_END: - offset += inode->i_size + 2; /* FIXME */ - break; case SEEK_CUR: offset += file->f_pos; case SEEK_SET: break; + case SEEK_END: + retval = -EOPNOTSUPP; default: goto out; } @@ -508,20 +528,18 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) } retval = offset; - /* - * discard buffered readdir content on seekdir(0), or - * seek to new frag, or seek prior to current chunk. - */ if (offset == 0 || fpos_frag(offset) != fi->frag || fpos_off(offset) < fi->offset) { + /* discard buffered readdir content on seekdir(0), or + * seek to new frag, or seek prior to current chunk */ dout("dir_llseek dropping %p content\n", file); reset_readdir(fi, fpos_frag(offset)); + } else if (fpos_cmp(offset, old_offset) > 0) { + /* reset dir_release_count if we did a forward seek */ + fi->dir_release_count = 0; + fi->readdir_cache_idx = -1; } - - /* bump dir_release_count if we did a forward seek */ - if (fpos_cmp(offset, old_offset) > 0) - fi->dir_release_count--; } out: mutex_unlock(&inode->i_mutex); @@ -535,7 +553,7 @@ int ceph_handle_snapdir(struct ceph_mds_request *req, struct dentry *dentry, int err) { struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); - struct inode *parent = dentry->d_parent->d_inode; /* we hold i_mutex */ + struct inode *parent = d_inode(dentry->d_parent); /* we hold i_mutex */ /* .snap dir? 
*/ if (err == -ENOENT && @@ -571,8 +589,8 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, err = 0; if (!req->r_reply_info.head->is_dentry) { dout("ENOENT and no trace, dentry %p inode %p\n", - dentry, dentry->d_inode); - if (dentry->d_inode) { + dentry, d_inode(dentry)); + if (d_really_is_positive(dentry)) { d_drop(dentry); err = -ENOENT; } else { @@ -619,7 +637,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, return ERR_PTR(err); /* can we conclude ENOENT locally? */ - if (dentry->d_inode == NULL) { + if (d_really_is_negative(dentry)) { struct ceph_inode_info *ci = ceph_inode(dir); struct ceph_dentry_info *di = ceph_dentry(dentry); @@ -629,6 +647,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, fsc->mount_options->snapdir_name, dentry->d_name.len) && !is_root_ceph_dentry(dir, dentry) && + ceph_test_mount_opt(fsc, DCACHE) && __ceph_dir_is_complete(ci) && (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) { spin_unlock(&ci->i_ceph_lock); @@ -725,7 +744,7 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry, ceph_mdsc_put_request(req); out: if (!err) - ceph_init_inode_acls(dentry->d_inode, &acls); + ceph_init_inode_acls(d_inode(dentry), &acls); else d_drop(dentry); ceph_release_acls_info(&acls); @@ -755,10 +774,15 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry, err = PTR_ERR(req); goto out; } + req->r_path2 = kstrdup(dest, GFP_KERNEL); + if (!req->r_path2) { + err = -ENOMEM; + ceph_mdsc_put_request(req); + goto out; + } + req->r_locked_dir = dir; req->r_dentry = dget(dentry); req->r_num_caps = 2; - req->r_path2 = kstrdup(dest, GFP_NOFS); - req->r_locked_dir = dir; req->r_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; err = ceph_mdsc_do_request(mdsc, dir, req); @@ -821,7 +845,7 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) ceph_mdsc_put_request(req); out: if (!err) - ceph_init_inode_acls(dentry->d_inode, &acls); + ceph_init_inode_acls(d_inode(dentry), &acls); else d_drop(dentry); ceph_release_acls_info(&acls); @@ -858,8 +882,8 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir, if (err) { d_drop(dentry); } else if (!req->r_reply_info.head->is_dentry) { - ihold(old_dentry->d_inode); - d_instantiate(dentry, old_dentry->d_inode); + ihold(d_inode(old_dentry)); + d_instantiate(dentry, d_inode(old_dentry)); } ceph_mdsc_put_request(req); return err; @@ -892,7 +916,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry) { struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); struct ceph_mds_client *mdsc = fsc->mdsc; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ceph_mds_request *req; int err = -EROFS; int op; @@ -933,16 +957,20 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb); struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; + int op = CEPH_MDS_OP_RENAME; int err; if (ceph_snap(old_dir) != ceph_snap(new_dir)) return -EXDEV; - if (ceph_snap(old_dir) != CEPH_NOSNAP || - ceph_snap(new_dir) != CEPH_NOSNAP) - return -EROFS; + if (ceph_snap(old_dir) != CEPH_NOSNAP) { + if (old_dir == new_dir && ceph_snap(old_dir) == CEPH_SNAPDIR) + op = CEPH_MDS_OP_RENAMESNAP; + else + return -EROFS; + } dout("rename dir %p dentry %p to dir %p dentry %p\n", old_dir, old_dentry, new_dir, new_dentry); - req = ceph_mdsc_create_request(mdsc, 
CEPH_MDS_OP_RENAME, USE_AUTH_MDS); + req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); ihold(old_dir); @@ -957,8 +985,8 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, req->r_dentry_unless = CEPH_CAP_FILE_EXCL; /* release LINK_RDCACHE on source inode (mds will lock it) */ req->r_old_inode_drop = CEPH_CAP_LINK_SHARED; - if (new_dentry->d_inode) - req->r_inode_drop = drop_caps_for_unlink(new_dentry->d_inode); + if (d_really_is_positive(new_dentry)) + req->r_inode_drop = drop_caps_for_unlink(d_inode(new_dentry)); err = ceph_mdsc_do_request(mdsc, old_dir, req); if (!err && !req->r_reply_info.head->is_dentry) { /* @@ -967,16 +995,15 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, * to do it here. */ + /* d_move screws up sibling dentries' offsets */ + ceph_dir_clear_complete(old_dir); + ceph_dir_clear_complete(new_dir); + d_move(old_dentry, new_dentry); /* ensure target dentry is invalidated, despite rehashing bug in vfs_rename_dir */ ceph_invalidate_dentry_lease(new_dentry); - - /* d_move screws up sibling dentries' offsets */ - ceph_dir_clear_complete(old_dir); - ceph_dir_clear_complete(new_dir); - } ceph_mdsc_put_request(req); return err; @@ -1024,7 +1051,7 @@ static int dentry_lease_is_valid(struct dentry *dentry) if (di->lease_renew_after && time_after(jiffies, di->lease_renew_after)) { /* we should renew */ - dir = dentry->d_parent->d_inode; + dir = d_inode(dentry->d_parent); session = ceph_get_mds_session(s); seq = di->lease_seq; di->lease_renew_after = 0; @@ -1074,22 +1101,22 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) return -ECHILD; dout("d_revalidate %p '%pd' inode %p offset %lld\n", dentry, - dentry, dentry->d_inode, ceph_dentry(dentry)->offset); + dentry, d_inode(dentry), ceph_dentry(dentry)->offset); dir = ceph_get_dentry_parent_inode(dentry); /* always trust cached snapped dentries, snapdir dentry */ if (ceph_snap(dir) != CEPH_NOSNAP) { dout("d_revalidate %p '%pd' inode %p is SNAPPED\n", dentry, - dentry, dentry->d_inode); + dentry, d_inode(dentry)); valid = 1; - } else if (dentry->d_inode && - ceph_snap(dentry->d_inode) == CEPH_SNAPDIR) { + } else if (d_really_is_positive(dentry) && + ceph_snap(d_inode(dentry)) == CEPH_SNAPDIR) { valid = 1; } else if (dentry_lease_is_valid(dentry) || dir_lease_is_valid(dir, dentry)) { - if (dentry->d_inode) - valid = ceph_is_any_caps(dentry->d_inode); + if (d_really_is_positive(dentry)) + valid = ceph_is_any_caps(d_inode(dentry)); else valid = 1; } @@ -1151,7 +1178,7 @@ static void ceph_d_prune(struct dentry *dentry) * we hold d_lock, so d_parent is stable, and d_fsdata is never * cleared until d_release */ - ceph_dir_clear_complete(dentry->d_parent->d_inode); + ceph_dir_clear_complete(d_inode(dentry->d_parent)); } /* @@ -1171,7 +1198,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, return -EISDIR; if (!cf->dir_info) { - cf->dir_info = kmalloc(bufsize, GFP_NOFS); + cf->dir_info = kmalloc(bufsize, GFP_KERNEL); if (!cf->dir_info) return -ENOMEM; cf->dir_info_len = @@ -1206,65 +1233,6 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, } /* - * an fsync() on a dir will wait for any uncommitted directory - * operations to commit. 
- */ -static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end, - int datasync) -{ - struct inode *inode = file_inode(file); - struct ceph_inode_info *ci = ceph_inode(inode); - struct list_head *head = &ci->i_unsafe_dirops; - struct ceph_mds_request *req; - u64 last_tid; - int ret = 0; - - dout("dir_fsync %p\n", inode); - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); - if (ret) - return ret; - mutex_lock(&inode->i_mutex); - - spin_lock(&ci->i_unsafe_lock); - if (list_empty(head)) - goto out; - - req = list_entry(head->prev, - struct ceph_mds_request, r_unsafe_dir_item); - last_tid = req->r_tid; - - do { - ceph_mdsc_get_request(req); - spin_unlock(&ci->i_unsafe_lock); - - dout("dir_fsync %p wait on tid %llu (until %llu)\n", - inode, req->r_tid, last_tid); - if (req->r_timeout) { - ret = wait_for_completion_timeout( - &req->r_safe_completion, req->r_timeout); - if (ret > 0) - ret = 0; - else if (ret == 0) - ret = -EIO; /* timed out */ - } else { - wait_for_completion(&req->r_safe_completion); - } - ceph_mdsc_put_request(req); - - spin_lock(&ci->i_unsafe_lock); - if (ret || list_empty(head)) - break; - req = list_entry(head->next, - struct ceph_mds_request, r_unsafe_dir_item); - } while (req->r_tid < last_tid); -out: - spin_unlock(&ci->i_unsafe_lock); - mutex_unlock(&inode->i_mutex); - - return ret; -} - -/* * We maintain a private dentry LRU. * * FIXME: this needs to be changed to a per-mds lru to be useful. @@ -1334,7 +1302,7 @@ const struct file_operations ceph_dir_fops = { .open = ceph_open, .release = ceph_release, .unlocked_ioctl = ceph_ioctl, - .fsync = ceph_dir_fsync, + .fsync = ceph_fsync, }; const struct file_operations ceph_snapdir_fops = { @@ -1372,6 +1340,7 @@ const struct inode_operations ceph_snapdir_iops = { .getattr = ceph_getattr, .mkdir = ceph_mkdir, .rmdir = ceph_unlink, + .rename = ceph_rename, }; const struct dentry_operations ceph_dentry_ops = { diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 8d7d782f4382..fe02ae7f056a 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -136,8 +136,8 @@ static struct dentry *__get_parent(struct super_block *sb, return ERR_CAST(req); if (child) { - req->r_inode = child->d_inode; - ihold(child->d_inode); + req->r_inode = d_inode(child); + ihold(d_inode(child)); } else { req->r_ino1 = (struct ceph_vino) { .ino = ino, @@ -164,7 +164,7 @@ static struct dentry *__get_parent(struct super_block *sb, return ERR_PTR(err); } dout("__get_parent ino %llx parent %p ino %llx.%llx\n", - child ? ceph_ino(child->d_inode) : ino, + child ? 
ceph_ino(d_inode(child)) : ino, dentry, ceph_vinop(inode)); return dentry; } @@ -172,11 +172,11 @@ static struct dentry *__get_parent(struct super_block *sb, static struct dentry *ceph_get_parent(struct dentry *child) { /* don't re-export snaps */ - if (ceph_snap(child->d_inode) != CEPH_NOSNAP) + if (ceph_snap(d_inode(child)) != CEPH_NOSNAP) return ERR_PTR(-EINVAL); dout("get_parent %p ino %llx.%llx\n", - child, ceph_vinop(child->d_inode)); + child, ceph_vinop(d_inode(child))); return __get_parent(child->d_sb, child, 0); } @@ -209,32 +209,32 @@ static int ceph_get_name(struct dentry *parent, char *name, struct ceph_mds_request *req; int err; - mdsc = ceph_inode_to_client(child->d_inode)->mdsc; + mdsc = ceph_inode_to_client(d_inode(child))->mdsc; req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPNAME, USE_ANY_MDS); if (IS_ERR(req)) return PTR_ERR(req); - mutex_lock(&parent->d_inode->i_mutex); + mutex_lock(&d_inode(parent)->i_mutex); - req->r_inode = child->d_inode; - ihold(child->d_inode); - req->r_ino2 = ceph_vino(parent->d_inode); - req->r_locked_dir = parent->d_inode; + req->r_inode = d_inode(child); + ihold(d_inode(child)); + req->r_ino2 = ceph_vino(d_inode(parent)); + req->r_locked_dir = d_inode(parent); req->r_num_caps = 2; err = ceph_mdsc_do_request(mdsc, NULL, req); - mutex_unlock(&parent->d_inode->i_mutex); + mutex_unlock(&d_inode(parent)->i_mutex); if (!err) { struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; memcpy(name, rinfo->dname, rinfo->dname_len); name[rinfo->dname_len] = 0; dout("get_name %p ino %llx.%llx name %s\n", - child, ceph_vinop(child->d_inode), name); + child, ceph_vinop(d_inode(child)), name); } else { dout("get_name %p ino %llx.%llx err %d\n", - child, ceph_vinop(child->d_inode), err); + child, ceph_vinop(d_inode(child)), err); } ceph_mdsc_put_request(req); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index d533075a823d..8b79d87eaf46 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -7,7 +7,6 @@ #include <linux/mount.h> #include <linux/namei.h> #include <linux/writeback.h> -#include <linux/aio.h> #include <linux/falloc.h> #include "super.h" @@ -90,13 +89,14 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) case S_IFDIR: dout("init_file %p %p 0%o (regular)\n", inode, file, inode->i_mode); - cf = kmem_cache_alloc(ceph_file_cachep, GFP_NOFS | __GFP_ZERO); + cf = kmem_cache_alloc(ceph_file_cachep, GFP_KERNEL | __GFP_ZERO); if (cf == NULL) { ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ return -ENOMEM; } cf->fmode = fmode; cf->next_offset = 2; + cf->readdir_cache_idx = -1; file->private_data = cf; BUG_ON(inode->i_fop->release != ceph_release); break; @@ -292,14 +292,14 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, } if (err) goto out_req; - if (dn || dentry->d_inode == NULL || d_is_symlink(dentry)) { + if (dn || d_really_is_negative(dentry) || d_is_symlink(dentry)) { /* make vfs retry on splice, ENOENT, or symlink */ dout("atomic_open finish_no_open on dn %p\n", dn); err = finish_no_open(file, dn); } else { dout("atomic_open finish_open on dn %p\n", dn); if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) { - ceph_init_inode_acls(dentry->d_inode, &acls); + ceph_init_inode_acls(d_inode(dentry), &acls); *opened |= FILE_CREATED; } err = finish_open(file, dentry, ceph_open, opened); @@ -325,7 +325,6 @@ int ceph_release(struct inode *inode, struct file *file) ceph_mdsc_put_request(cf->last_readdir); kfree(cf->last_name); kfree(cf->dir_info); - dput(cf->dentry); 
kmem_cache_free(ceph_file_cachep, cf); /* wake up anyone waiting for caps on this inode */ @@ -458,7 +457,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i, if (ret < 0) return ret; - if (file->f_flags & O_DIRECT) { + if (iocb->ki_flags & IOCB_DIRECT) { while (iov_iter_count(i)) { size_t start; ssize_t n; @@ -484,7 +483,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i, } } else { num_pages = calc_pages_for(off, len); - pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); + pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); if (IS_ERR(pages)) return PTR_ERR(pages); ret = striped_read(inode, off, len, pages, @@ -558,13 +557,13 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe) * objects, rollback on failure, etc.) */ static ssize_t -ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) +ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, + struct ceph_snap_context *snapc) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - struct ceph_snap_context *snapc; struct ceph_vino vino; struct ceph_osd_request *req; struct page **pages; @@ -601,7 +600,6 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) size_t start; ssize_t n; - snapc = ci->i_snap_realm->cached_context; vino = ceph_vino(inode); req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, pos, &len, 0, @@ -615,7 +613,7 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) break; } - osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC); + osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); n = iov_iter_get_pages_alloc(from, &pages, len, &start); if (unlikely(n < 0)) { @@ -675,13 +673,13 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) * objects, rollback on failure, etc.) 
*/ static ssize_t -ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) +ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, + struct ceph_snap_context *snapc) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - struct ceph_snap_context *snapc; struct ceph_vino vino; struct ceph_osd_request *req; struct page **pages; @@ -718,7 +716,6 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) size_t left; int n; - snapc = ci->i_snap_realm->cached_context; vino = ceph_vino(inode); req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, pos, &len, 0, 1, @@ -737,7 +734,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) */ num_pages = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); + pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); if (IS_ERR(pages)) { ret = PTR_ERR(pages); goto out; @@ -808,7 +805,7 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *filp = iocb->ki_filp; struct ceph_file_info *fi = filp->private_data; - size_t len = iocb->ki_nbytes; + size_t len = iov_iter_count(to); struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); struct page *pinned_page = NULL; @@ -829,7 +826,7 @@ again: return ret; if ((got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0 || - (iocb->ki_filp->f_flags & O_DIRECT) || + (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC)) { dout("aio_sync_read %p %llx.%llx %llu~%u got cap refs on %s\n", @@ -861,7 +858,7 @@ again: struct page *page = NULL; loff_t i_size; if (retry_op == READ_INLINE) { - page = __page_cache_alloc(GFP_NOFS); + page = __page_cache_alloc(GFP_KERNEL); if (!page) return -ENOMEM; } @@ -942,27 +939,30 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->client->osdc; - ssize_t count = iov_iter_count(from), written = 0; + struct ceph_cap_flush *prealloc_cf; + ssize_t count, written = 0; int err, want, got; - loff_t pos = iocb->ki_pos; + loff_t pos; if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; + prealloc_cf = ceph_alloc_cap_flush(); + if (!prealloc_cf) + return -ENOMEM; + mutex_lock(&inode->i_mutex); /* We can write back this queue in page reclaim */ current->backing_dev_info = inode_to_bdi(inode); - err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); - if (err) - goto out; - - if (count == 0) + err = generic_write_checks(iocb, from); + if (err <= 0) goto out; - iov_iter_truncate(from, count); - err = file_remove_suid(file); + pos = iocb->ki_pos; + count = iov_iter_count(from); + err = file_remove_privs(file); if (err) goto out; @@ -998,15 +998,31 @@ retry_snap: inode, ceph_vinop(inode), pos, count, ceph_cap_string(got)); if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || - (file->f_flags & O_DIRECT) || (fi->flags & CEPH_F_SYNC)) { + (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC)) { + struct ceph_snap_context *snapc; struct iov_iter data; mutex_unlock(&inode->i_mutex); + + spin_lock(&ci->i_ceph_lock); + if (__ceph_have_pending_cap_snap(ci)) { + struct ceph_cap_snap *capsnap = + list_last_entry(&ci->i_cap_snaps, + struct ceph_cap_snap, + ci_item); + snapc = ceph_get_snap_context(capsnap->context); + } 
else { + BUG_ON(!ci->i_head_snapc); + snapc = ceph_get_snap_context(ci->i_head_snapc); + } + spin_unlock(&ci->i_ceph_lock); + /* we might need to revert back to that point */ data = *from; - if (file->f_flags & O_DIRECT) - written = ceph_sync_direct_write(iocb, &data, pos); + if (iocb->ki_flags & IOCB_DIRECT) + written = ceph_sync_direct_write(iocb, &data, pos, + snapc); else - written = ceph_sync_write(iocb, &data, pos); + written = ceph_sync_write(iocb, &data, pos, snapc); if (written == -EOLDSNAPC) { dout("aio_write %p %llx.%llx %llu~%u" "got EOLDSNAPC, retrying\n", @@ -1017,6 +1033,7 @@ retry_snap: } if (written > 0) iov_iter_advance(from, written); + ceph_put_snap_context(snapc); } else { loff_t old_size = inode->i_size; /* @@ -1038,7 +1055,8 @@ retry_snap: int dirty; spin_lock(&ci->i_ceph_lock); ci->i_inline_version = CEPH_INLINE_NONE; - dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, + &prealloc_cf); spin_unlock(&ci->i_ceph_lock); if (dirty) __mark_inode_dirty(inode, dirty); @@ -1062,6 +1080,7 @@ retry_snap: out: mutex_unlock(&inode->i_mutex); out_unlocked: + ceph_free_cap_flush(prealloc_cf); current->backing_dev_info = NULL; return written ? written : err; } @@ -1258,6 +1277,7 @@ static long ceph_fallocate(struct file *file, int mode, struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->client->osdc; + struct ceph_cap_flush *prealloc_cf; int want, got = 0; int dirty; int ret = 0; @@ -1270,6 +1290,10 @@ static long ceph_fallocate(struct file *file, int mode, if (!S_ISREG(inode->i_mode)) return -EOPNOTSUPP; + prealloc_cf = ceph_alloc_cap_flush(); + if (!prealloc_cf) + return -ENOMEM; + mutex_lock(&inode->i_mutex); if (ceph_snap(inode) != CEPH_NOSNAP) { @@ -1316,7 +1340,8 @@ static long ceph_fallocate(struct file *file, int mode, if (!ret) { spin_lock(&ci->i_ceph_lock); ci->i_inline_version = CEPH_INLINE_NONE; - dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, + &prealloc_cf); spin_unlock(&ci->i_ceph_lock); if (dirty) __mark_inode_dirty(inode, dirty); @@ -1325,6 +1350,7 @@ static long ceph_fallocate(struct file *file, int mode, ceph_put_cap_refs(ci, got); unlock: mutex_unlock(&inode->i_mutex); + ceph_free_cap_flush(prealloc_cf); return ret; } @@ -1332,8 +1358,6 @@ const struct file_operations ceph_file_fops = { .open = ceph_open, .release = ceph_release, .llseek = ceph_llseek, - .read = new_sync_read, - .write = new_sync_write, .read_iter = ceph_read_iter, .write_iter = ceph_write_iter, .mmap = ceph_mmap, diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 119c43c80638..96d2bd829902 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -6,7 +6,6 @@ #include <linux/string.h> #include <linux/uaccess.h> #include <linux/kernel.h> -#include <linux/namei.h> #include <linux/writeback.h> #include <linux/vmalloc.h> #include <linux/posix_acl.h> @@ -390,9 +389,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_inline_version = 0; ci->i_time_warp_seq = 0; ci->i_ceph_flags = 0; - ci->i_ordered_count = 0; - atomic_set(&ci->i_release_count, 1); - atomic_set(&ci->i_complete_count, 0); + atomic64_set(&ci->i_ordered_count, 1); + atomic64_set(&ci->i_release_count, 1); + atomic64_set(&ci->i_complete_seq[0], 0); + atomic64_set(&ci->i_complete_seq[1], 0); ci->i_symlink = NULL; memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout)); @@ -416,9 +416,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb) 
ci->i_flushing_caps = 0; INIT_LIST_HEAD(&ci->i_dirty_item); INIT_LIST_HEAD(&ci->i_flushing_item); - ci->i_cap_flush_seq = 0; - ci->i_cap_flush_last_tid = 0; - memset(&ci->i_cap_flush_tid, 0, sizeof(ci->i_cap_flush_tid)); + ci->i_prealloc_cap_flush = NULL; + ci->i_cap_flush_tree = RB_ROOT; init_waitqueue_head(&ci->i_cap_wq); ci->i_hold_caps_min = 0; ci->i_hold_caps_max = 0; @@ -753,7 +752,10 @@ static int fill_inode(struct inode *inode, struct page *locked_page, if (new_version || (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) { + if (ci->i_layout.fl_pg_pool != info->layout.fl_pg_pool) + ci->i_ceph_flags &= ~CEPH_I_POOL_PERM; ci->i_layout = info->layout; + queue_trunc = ceph_fill_file_size(inode, issued, le32_to_cpu(info->truncate_seq), le64_to_cpu(info->truncate_size), @@ -819,6 +821,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page, else kfree(sym); /* lost a race */ } + inode->i_link = ci->i_symlink; break; case S_IFDIR: inode->i_op = &ceph_dir_iops; @@ -858,9 +861,10 @@ static int fill_inode(struct inode *inode, struct page *locked_page, (issued & CEPH_CAP_FILE_EXCL) == 0 && !__ceph_dir_is_complete(ci)) { dout(" marking %p complete (empty)\n", inode); + i_size_write(inode, 0); __ceph_dir_set_complete(ci, - atomic_read(&ci->i_release_count), - ci->i_ordered_count); + atomic64_read(&ci->i_release_count), + atomic64_read(&ci->i_ordered_count)); } wake = true; @@ -940,7 +944,7 @@ static void update_dentry_lease(struct dentry *dentry, dentry, duration, ttl); /* make lease_rdcache_gen match directory */ - dir = dentry->d_parent->d_inode; + dir = d_inode(dentry->d_parent); di->lease_shared_gen = ceph_inode(dir)->i_shared_gen; if (duration == 0) @@ -980,7 +984,7 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, { struct dentry *realdn; - BUG_ON(dn->d_inode); + BUG_ON(d_inode(dn)); /* dn must be unhashed */ if (!d_unhashed(dn)) @@ -998,13 +1002,13 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in, "inode %p ino %llx.%llx\n", dn, d_count(dn), realdn, d_count(realdn), - realdn->d_inode, ceph_vinop(realdn->d_inode)); + d_inode(realdn), ceph_vinop(d_inode(realdn))); dput(dn); dn = realdn; } else { BUG_ON(!ceph_dentry(dn)); dout("dn %p attached to %p ino %llx.%llx\n", - dn, dn->d_inode, ceph_vinop(dn->d_inode)); + dn, d_inode(dn), ceph_vinop(d_inode(dn))); } if ((!prehash || *prehash) && d_unhashed(dn)) d_rehash(dn); @@ -1125,11 +1129,11 @@ retry_lookup: dput(parent); goto done; } - } else if (dn->d_inode && - (ceph_ino(dn->d_inode) != vino.ino || - ceph_snap(dn->d_inode) != vino.snap)) { + } else if (d_really_is_positive(dn) && + (ceph_ino(d_inode(dn)) != vino.ino || + ceph_snap(d_inode(dn)) != vino.snap)) { dout(" dn %p points to wrong inode %p\n", - dn, dn->d_inode); + dn, d_inode(dn)); d_delete(dn); dput(dn); goto retry_lookup; @@ -1183,7 +1187,7 @@ retry_lookup: BUG_ON(!dn); BUG_ON(!dir); - BUG_ON(dn->d_parent->d_inode != dir); + BUG_ON(d_inode(dn->d_parent) != dir); BUG_ON(ceph_ino(dir) != le64_to_cpu(rinfo->diri.in->ino)); BUG_ON(ceph_snap(dir) != @@ -1212,6 +1216,10 @@ retry_lookup: dout("fill_trace doing d_move %p -> %p\n", req->r_old_dentry, dn); + /* d_move screws up sibling dentries' offsets */ + ceph_dir_clear_ordered(dir); + ceph_dir_clear_ordered(olddir); + d_move(req->r_old_dentry, dn); dout(" src %p '%pd' dst %p '%pd'\n", req->r_old_dentry, @@ -1222,10 +1230,6 @@ retry_lookup: rehashing bug in vfs_rename_dir */ ceph_invalidate_dentry_lease(dn); - /* d_move screws up sibling dentries' offsets */ - 
ceph_dir_clear_ordered(dir); - ceph_dir_clear_ordered(olddir); - dout("dn %p gets new offset %lld\n", req->r_old_dentry, ceph_dentry(req->r_old_dentry)->offset); @@ -1235,7 +1239,7 @@ retry_lookup: /* null dentry? */ if (!rinfo->head->is_target) { dout("fill_trace null dentry\n"); - if (dn->d_inode) { + if (d_really_is_positive(dn)) { ceph_dir_clear_ordered(dir); dout("d_delete %p\n", dn); d_delete(dn); @@ -1252,7 +1256,7 @@ retry_lookup: } /* attach proper inode */ - if (!dn->d_inode) { + if (d_really_is_negative(dn)) { ceph_dir_clear_ordered(dir); ihold(in); dn = splice_dentry(dn, in, &have_lease); @@ -1261,9 +1265,9 @@ retry_lookup: goto done; } req->r_dentry = dn; /* may have spliced */ - } else if (dn->d_inode && dn->d_inode != in) { + } else if (d_really_is_positive(dn) && d_inode(dn) != in) { dout(" %p links to %p %llx.%llx, not %llx.%llx\n", - dn, dn->d_inode, ceph_vinop(dn->d_inode), + dn, d_inode(dn), ceph_vinop(d_inode(dn)), ceph_vinop(in)); have_lease = false; } @@ -1333,6 +1337,49 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, return err; } +void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl) +{ + if (ctl->page) { + kunmap(ctl->page); + page_cache_release(ctl->page); + ctl->page = NULL; + } +} + +static int fill_readdir_cache(struct inode *dir, struct dentry *dn, + struct ceph_readdir_cache_control *ctl, + struct ceph_mds_request *req) +{ + struct ceph_inode_info *ci = ceph_inode(dir); + unsigned nsize = PAGE_CACHE_SIZE / sizeof(struct dentry*); + unsigned idx = ctl->index % nsize; + pgoff_t pgoff = ctl->index / nsize; + + if (!ctl->page || pgoff != page_index(ctl->page)) { + ceph_readdir_cache_release(ctl); + ctl->page = grab_cache_page(&dir->i_data, pgoff); + if (!ctl->page) { + ctl->index = -1; + return -ENOMEM; + } + /* reading/filling the cache are serialized by + * i_mutex, no need to use page lock */ + unlock_page(ctl->page); + ctl->dentries = kmap(ctl->page); + } + + if (req->r_dir_release_cnt == atomic64_read(&ci->i_release_count) && + req->r_dir_ordered_cnt == atomic64_read(&ci->i_ordered_count)) { + dout("readdir cache dn %p idx %d\n", dn, ctl->index); + ctl->dentries[idx] = dn; + ctl->index++; + } else { + dout("disable readdir cache\n"); + ctl->index = -1; + } + return 0; +} + int ceph_readdir_prepopulate(struct ceph_mds_request *req, struct ceph_mds_session *session) { @@ -1345,8 +1392,11 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, struct inode *snapdir = NULL; struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; struct ceph_dentry_info *di; - u64 r_readdir_offset = req->r_readdir_offset; u32 frag = le32_to_cpu(rhead->args.readdir.frag); + struct ceph_readdir_cache_control cache_ctl = {}; + + if (req->r_aborted) + return readdir_prepopulate_inodes_only(req, session); if (rinfo->dir_dir && le32_to_cpu(rinfo->dir_dir->frag) != frag) { @@ -1354,16 +1404,13 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, frag, le32_to_cpu(rinfo->dir_dir->frag)); frag = le32_to_cpu(rinfo->dir_dir->frag); if (ceph_frag_is_leftmost(frag)) - r_readdir_offset = 2; + req->r_readdir_offset = 2; else - r_readdir_offset = 0; + req->r_readdir_offset = 0; } - if (req->r_aborted) - return readdir_prepopulate_inodes_only(req, session); - if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) { - snapdir = ceph_get_snapdir(parent->d_inode); + snapdir = ceph_get_snapdir(d_inode(parent)); parent = d_find_alias(snapdir); dout("readdir_prepopulate %d items under SNAPDIR dn %p\n", rinfo->dir_nr, parent); 
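
The readdir cache added above stores raw dentry pointers in the directory's page cache, so an entry index maps to a page and an in-page slot by plain division, and i_size is then reused to record how many slots are valid. Below is a minimal sketch of that arithmetic, not part of the patch itself: cache_slot() and RDC_NSIZE are hypothetical names; only PAGE_CACHE_SIZE, dir->i_data, and the dentry-pointer slot layout come from the hunks above.

/* One page of dir->i_data holds PAGE_CACHE_SIZE / sizeof(struct dentry *)
 * cache slots; fill_readdir_cache() and the dcache readdir path both
 * derive (page, slot) from the running index this way. */
#define RDC_NSIZE  (PAGE_CACHE_SIZE / sizeof(struct dentry *))

static void cache_slot(unsigned long index, pgoff_t *pgoff, unsigned *slot)
{
	*pgoff = index / RDC_NSIZE;  /* which cache page to look up in dir->i_data */
	*slot  = index % RDC_NSIZE;  /* offset into the kmap()ed pointer array */
}

With this layout, marking a directory complete only needs i_size_write(inode, nslots * sizeof(struct dentry *)), and the dcache readdir path can stop as soon as ptr_pos reaches i_size_read(dir), which is exactly what the ceph_readdir() and __dcache_readdir() hunks above do.
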
@@ -1371,9 +1418,20 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, dout("readdir_prepopulate %d items under dn %p\n", rinfo->dir_nr, parent); if (rinfo->dir_dir) - ceph_fill_dirfrag(parent->d_inode, rinfo->dir_dir); + ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir); + } + + if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) { + /* note dir version at start of readdir so we can tell + * if any dentries get dropped */ + struct ceph_inode_info *ci = ceph_inode(d_inode(parent)); + req->r_dir_release_cnt = atomic64_read(&ci->i_release_count); + req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count); + req->r_readdir_cache_idx = 0; } + cache_ctl.index = req->r_readdir_cache_idx; + /* FIXME: release caps/leases if error occurs */ for (i = 0; i < rinfo->dir_nr; i++) { struct ceph_vino vino; @@ -1405,26 +1463,19 @@ retry_lookup: err = ret; goto out; } - } else if (dn->d_inode && - (ceph_ino(dn->d_inode) != vino.ino || - ceph_snap(dn->d_inode) != vino.snap)) { + } else if (d_really_is_positive(dn) && + (ceph_ino(d_inode(dn)) != vino.ino || + ceph_snap(d_inode(dn)) != vino.snap)) { dout(" dn %p points to wrong inode %p\n", - dn, dn->d_inode); + dn, d_inode(dn)); d_delete(dn); dput(dn); goto retry_lookup; - } else { - /* reorder parent's d_subdirs */ - spin_lock(&parent->d_lock); - spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED); - list_move(&dn->d_child, &parent->d_subdirs); - spin_unlock(&dn->d_lock); - spin_unlock(&parent->d_lock); } /* inode */ - if (dn->d_inode) { - in = dn->d_inode; + if (d_really_is_positive(dn)) { + in = d_inode(dn); } else { in = ceph_get_inode(parent->d_sb, vino); if (IS_ERR(in)) { @@ -1436,17 +1487,19 @@ retry_lookup: } } - if (fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, - req->r_request_started, -1, - &req->r_caps_reservation) < 0) { + ret = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, + req->r_request_started, -1, + &req->r_caps_reservation); + if (ret < 0) { pr_err("fill_inode badness on %p\n", in); - if (!dn->d_inode) + if (d_really_is_negative(dn)) iput(in); d_drop(dn); + err = ret; goto next_item; } - if (!dn->d_inode) { + if (d_really_is_negative(dn)) { struct dentry *realdn = splice_dentry(dn, in, NULL); if (IS_ERR(realdn)) { err = PTR_ERR(realdn); @@ -1458,19 +1511,28 @@ retry_lookup: } di = dn->d_fsdata; - di->offset = ceph_make_fpos(frag, i + r_readdir_offset); + di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset); update_dentry_lease(dn, rinfo->dir_dlease[i], req->r_session, req->r_request_started); + + if (err == 0 && cache_ctl.index >= 0) { + ret = fill_readdir_cache(d_inode(parent), dn, + &cache_ctl, req); + if (ret < 0) + err = ret; + } next_item: if (dn) dput(dn); } - if (err == 0) - req->r_did_prepopulate = true; - out: + if (err == 0) { + req->r_did_prepopulate = true; + req->r_readdir_cache_idx = cache_ctl.index; + } + ceph_readdir_cache_release(&cache_ctl); if (snapdir) { iput(snapdir); dput(parent); @@ -1691,16 +1753,9 @@ retry: /* * symlinks */ -static void *ceph_sym_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - struct ceph_inode_info *ci = ceph_inode(dentry->d_inode); - nd_set_link(nd, ci->i_symlink); - return NULL; -} - static const struct inode_operations ceph_symlink_iops = { .readlink = generic_readlink, - .follow_link = ceph_sym_follow_link, + .follow_link = simple_follow_link, .setattr = ceph_setattr, .getattr = ceph_getattr, .setxattr = ceph_setxattr, @@ -1714,16 +1769,18 @@ static const struct inode_operations ceph_symlink_iops = { */ int ceph_setattr(struct 
dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ceph_inode_info *ci = ceph_inode(inode); const unsigned int ia_valid = attr->ia_valid; struct ceph_mds_request *req; struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc; + struct ceph_cap_flush *prealloc_cf; int issued; int release = 0, dirtied = 0; int mask = 0; int err = 0; int inode_dirty_flags = 0; + bool lock_snap_rwsem = false; if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; @@ -1732,13 +1789,31 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) if (err != 0) return err; + prealloc_cf = ceph_alloc_cap_flush(); + if (!prealloc_cf) + return -ENOMEM; + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR, USE_AUTH_MDS); - if (IS_ERR(req)) + if (IS_ERR(req)) { + ceph_free_cap_flush(prealloc_cf); return PTR_ERR(req); + } spin_lock(&ci->i_ceph_lock); issued = __ceph_caps_issued(ci, NULL); + + if (!ci->i_head_snapc && + (issued & (CEPH_CAP_ANY_EXCL | CEPH_CAP_FILE_WR))) { + lock_snap_rwsem = true; + if (!down_read_trylock(&mdsc->snap_rwsem)) { + spin_unlock(&ci->i_ceph_lock); + down_read(&mdsc->snap_rwsem); + spin_lock(&ci->i_ceph_lock); + issued = __ceph_caps_issued(ci, NULL); + } + } + dout("setattr %p issued %s\n", inode, ceph_cap_string(issued)); if (ia_valid & ATTR_UID) { @@ -1881,12 +1956,15 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) dout("setattr %p ATTR_FILE ... hrm!\n", inode); if (dirtied) { - inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied); + inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied, + &prealloc_cf); inode->i_ctime = CURRENT_TIME; } release &= issued; spin_unlock(&ci->i_ceph_lock); + if (lock_snap_rwsem) + up_read(&mdsc->snap_rwsem); if (inode_dirty_flags) __mark_inode_dirty(inode, inode_dirty_flags); @@ -1911,9 +1989,11 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) ceph_mdsc_put_request(req); if (mask & CEPH_SETATTR_SIZE) __ceph_do_pending_vmtruncate(inode); + ceph_free_cap_flush(prealloc_cf); return err; out_put: ceph_mdsc_put_request(req); + ceph_free_cap_flush(prealloc_cf); return err; } @@ -1990,7 +2070,7 @@ int ceph_permission(struct inode *inode, int mask) int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ceph_inode_info *ci = ceph_inode(inode); int err; diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 4347039ecc18..6706bde9ad1b 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -287,7 +287,7 @@ int ceph_encode_locks_to_buffer(struct inode *inode, return 0; spin_lock(&ctx->flc_lock); - list_for_each_entry(lock, &ctx->flc_flock, fl_list) { + list_for_each_entry(lock, &ctx->flc_posix, fl_list) { ++seen_fcntl; if (seen_fcntl > num_fcntl_locks) { err = -ENOSPC; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 71c073f38e54..6aa07af67603 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -8,6 +8,7 @@ #include <linux/debugfs.h> #include <linux/seq_file.h> #include <linux/utsname.h> +#include <linux/ratelimit.h> #include "super.h" #include "mds_client.h" @@ -458,7 +459,6 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, s->s_cap_reconnect = 0; s->s_cap_iterator = NULL; INIT_LIST_HEAD(&s->s_cap_releases); - INIT_LIST_HEAD(&s->s_cap_releases_done); INIT_LIST_HEAD(&s->s_cap_flushing); INIT_LIST_HEAD(&s->s_cap_snaps_flushing); @@ -629,6 +629,9 @@ static void 
__register_request(struct ceph_mds_client *mdsc, req->r_uid = current_fsuid(); req->r_gid = current_fsgid(); + if (mdsc->oldest_tid == 0 && req->r_op != CEPH_MDS_OP_SETFILELOCK) + mdsc->oldest_tid = req->r_tid; + if (dir) { struct ceph_inode_info *ci = ceph_inode(dir); @@ -644,6 +647,21 @@ static void __unregister_request(struct ceph_mds_client *mdsc, struct ceph_mds_request *req) { dout("__unregister_request %p tid %lld\n", req, req->r_tid); + + if (req->r_tid == mdsc->oldest_tid) { + struct rb_node *p = rb_next(&req->r_node); + mdsc->oldest_tid = 0; + while (p) { + struct ceph_mds_request *next_req = + rb_entry(p, struct ceph_mds_request, r_node); + if (next_req->r_op != CEPH_MDS_OP_SETFILELOCK) { + mdsc->oldest_tid = next_req->r_tid; + break; + } + p = rb_next(p); + } + } + rb_erase(&req->r_node, &mdsc->request_tree); RB_CLEAR_NODE(&req->r_node); @@ -679,7 +697,7 @@ static struct dentry *get_nonsnap_parent(struct dentry *dentry) * except to resplice to another snapdir, and either the old or new * result is a valid result. */ - while (!IS_ROOT(dentry) && ceph_snap(dentry->d_inode) != CEPH_NOSNAP) + while (!IS_ROOT(dentry) && ceph_snap(d_inode(dentry)) != CEPH_NOSNAP) dentry = dentry->d_parent; return dentry; } @@ -716,20 +734,20 @@ static int __choose_mds(struct ceph_mds_client *mdsc, } else if (req->r_dentry) { /* ignore race with rename; old or new d_parent is okay */ struct dentry *parent = req->r_dentry->d_parent; - struct inode *dir = parent->d_inode; + struct inode *dir = d_inode(parent); if (dir->i_sb != mdsc->fsc->sb) { /* not this fs! */ - inode = req->r_dentry->d_inode; + inode = d_inode(req->r_dentry); } else if (ceph_snap(dir) != CEPH_NOSNAP) { /* direct snapped/virtual snapdir requests * based on parent dir inode */ struct dentry *dn = get_nonsnap_parent(parent); - inode = dn->d_inode; + inode = d_inode(dn); dout("__choose_mds using nonsnap parent %p\n", inode); } else { /* dentry target */ - inode = req->r_dentry->d_inode; + inode = d_inode(req->r_dentry); if (!inode || mode == USE_AUTH_MDS) { /* dir + name */ inode = dir; @@ -998,27 +1016,53 @@ void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, * session caps */ -/* - * Free preallocated cap messages assigned to this session - */ -static void cleanup_cap_releases(struct ceph_mds_session *session) +/* caller holds s_cap_lock, we drop it */ +static void cleanup_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) + __releases(session->s_cap_lock) { - struct ceph_msg *msg; + LIST_HEAD(tmp_list); + list_splice_init(&session->s_cap_releases, &tmp_list); + session->s_num_cap_releases = 0; + spin_unlock(&session->s_cap_lock); - spin_lock(&session->s_cap_lock); - while (!list_empty(&session->s_cap_releases)) { - msg = list_first_entry(&session->s_cap_releases, - struct ceph_msg, list_head); - list_del_init(&msg->list_head); - ceph_msg_put(msg); - } - while (!list_empty(&session->s_cap_releases_done)) { - msg = list_first_entry(&session->s_cap_releases_done, - struct ceph_msg, list_head); - list_del_init(&msg->list_head); - ceph_msg_put(msg); + dout("cleanup_cap_releases mds%d\n", session->s_mds); + while (!list_empty(&tmp_list)) { + struct ceph_cap *cap; + /* zero out the in-progress message */ + cap = list_first_entry(&tmp_list, + struct ceph_cap, session_caps); + list_del(&cap->session_caps); + ceph_put_cap(mdsc, cap); } - spin_unlock(&session->s_cap_lock); +} + +static void cleanup_session_requests(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct 
ceph_mds_request *req; + struct rb_node *p; + + dout("cleanup_session_requests mds%d\n", session->s_mds); + mutex_lock(&mdsc->mutex); + while (!list_empty(&session->s_unsafe)) { + req = list_first_entry(&session->s_unsafe, + struct ceph_mds_request, r_unsafe_item); + list_del_init(&req->r_unsafe_item); + pr_warn_ratelimited(" dropping unsafe request %llu\n", + req->r_tid); + __unregister_request(mdsc, req); + } + /* zero r_attempts, so kick_requests() will re-send requests */ + p = rb_first(&mdsc->request_tree); + while (p) { + req = rb_entry(p, struct ceph_mds_request, r_node); + p = rb_next(p); + if (req->r_session && + req->r_session->s_mds == session->s_mds) + req->r_attempts = 0; + } + mutex_unlock(&mdsc->mutex); } /* @@ -1068,10 +1112,16 @@ static int iterate_session_caps(struct ceph_mds_session *session, dout("iterate_session_caps finishing cap %p removal\n", cap); BUG_ON(cap->session != session); + cap->session = NULL; list_del_init(&cap->session_caps); session->s_nr_caps--; - cap->session = NULL; - old_cap = cap; /* put_cap it w/o locks held */ + if (cap->queue_release) { + list_add_tail(&cap->session_caps, + &session->s_cap_releases); + session->s_num_cap_releases++; + } else { + old_cap = cap; /* put_cap it w/o locks held */ + } } if (ret < 0) goto out; @@ -1092,19 +1142,35 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) { struct ceph_inode_info *ci = ceph_inode(inode); + LIST_HEAD(to_remove); int drop = 0; dout("removing cap %p, ci is %p, inode is %p\n", cap, ci, &ci->vfs_inode); spin_lock(&ci->i_ceph_lock); __ceph_remove_cap(cap, false); - if (!__ceph_is_any_real_caps(ci)) { + if (!ci->i_auth_cap) { + struct ceph_cap_flush *cf; struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + while (true) { + struct rb_node *n = rb_first(&ci->i_cap_flush_tree); + if (!n) + break; + cf = rb_entry(n, struct ceph_cap_flush, i_node); + rb_erase(&cf->i_node, &ci->i_cap_flush_tree); + list_add(&cf->list, &to_remove); + } + spin_lock(&mdsc->cap_dirty_lock); + + list_for_each_entry(cf, &to_remove, list) + rb_erase(&cf->g_node, &mdsc->cap_flush_tree); + if (!list_empty(&ci->i_dirty_item)) { - pr_info(" dropping dirty %s state for %p %lld\n", + pr_warn_ratelimited( + " dropping dirty %s state for %p %lld\n", ceph_cap_string(ci->i_dirty_caps), inode, ceph_ino(inode)); ci->i_dirty_caps = 0; @@ -1112,7 +1178,8 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, drop = 1; } if (!list_empty(&ci->i_flushing_item)) { - pr_info(" dropping dirty+flushing %s state for %p %lld\n", + pr_warn_ratelimited( + " dropping dirty+flushing %s state for %p %lld\n", ceph_cap_string(ci->i_flushing_caps), inode, ceph_ino(inode)); ci->i_flushing_caps = 0; @@ -1120,16 +1187,21 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, mdsc->num_cap_flushing--; drop = 1; } - if (drop && ci->i_wrbuffer_ref) { - pr_info(" dropping dirty data for %p %lld\n", - inode, ceph_ino(inode)); - ci->i_wrbuffer_ref = 0; - ci->i_wrbuffer_ref_head = 0; - drop++; - } spin_unlock(&mdsc->cap_dirty_lock); + + if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) { + list_add(&ci->i_prealloc_cap_flush->list, &to_remove); + ci->i_prealloc_cap_flush = NULL; + } } spin_unlock(&ci->i_ceph_lock); + while (!list_empty(&to_remove)) { + struct ceph_cap_flush *cf; + cf = list_first_entry(&to_remove, + struct ceph_cap_flush, list); + list_del(&cf->list); + ceph_free_cap_flush(cf); + } while (drop--) iput(inode); return 0; @@ -1171,11 +1243,12 @@ 
static void remove_session_caps(struct ceph_mds_session *session) spin_lock(&session->s_cap_lock); } } - spin_unlock(&session->s_cap_lock); + + // drop cap expires and unlock s_cap_lock + cleanup_cap_releases(session->s_mdsc, session); BUG_ON(session->s_nr_caps > 0); BUG_ON(!list_empty(&session->s_cap_flushing)); - cleanup_cap_releases(session); } /* @@ -1351,7 +1424,8 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued), ceph_cap_string(used), ceph_cap_string(wanted)); if (cap == ci->i_auth_cap) { - if (ci->i_dirty_caps | ci->i_flushing_caps) + if (ci->i_dirty_caps || ci->i_flushing_caps || + !list_empty(&ci->i_cap_snaps)) goto out; if ((used | wanted) & CEPH_CAP_ANY_WR) goto out; @@ -1397,121 +1471,80 @@ static int trim_caps(struct ceph_mds_client *mdsc, session->s_trim_caps = 0; } - ceph_add_cap_releases(mdsc, session); ceph_send_cap_releases(mdsc, session); return 0; } -/* - * Allocate cap_release messages. If there is a partially full message - * in the queue, try to allocate enough to cover it's remainder, so that - * we can send it immediately. - * - * Called under s_mutex. - */ -int ceph_add_cap_releases(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session) +static int check_capsnap_flush(struct ceph_inode_info *ci, + u64 want_snap_seq) { - struct ceph_msg *msg, *partial = NULL; - struct ceph_mds_cap_release *head; - int err = -ENOMEM; - int extra = mdsc->fsc->mount_options->cap_release_safety; - int num; - - dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds, - extra); - - spin_lock(&session->s_cap_lock); - - if (!list_empty(&session->s_cap_releases)) { - msg = list_first_entry(&session->s_cap_releases, - struct ceph_msg, - list_head); - head = msg->front.iov_base; - num = le32_to_cpu(head->num); - if (num) { - dout(" partial %p with (%d/%d)\n", msg, num, - (int)CEPH_CAPS_PER_RELEASE); - extra += CEPH_CAPS_PER_RELEASE - num; - partial = msg; - } - } - while (session->s_num_cap_releases < session->s_nr_caps + extra) { - spin_unlock(&session->s_cap_lock); - msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE, - GFP_NOFS, false); - if (!msg) - goto out_unlocked; - dout("add_cap_releases %p msg %p now %d\n", session, msg, - (int)msg->front.iov_len); - head = msg->front.iov_base; - head->num = cpu_to_le32(0); - msg->front.iov_len = sizeof(*head); - spin_lock(&session->s_cap_lock); - list_add(&msg->list_head, &session->s_cap_releases); - session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE; - } - - if (partial) { - head = partial->front.iov_base; - num = le32_to_cpu(head->num); - dout(" queueing partial %p with %d/%d\n", partial, num, - (int)CEPH_CAPS_PER_RELEASE); - list_move_tail(&partial->list_head, - &session->s_cap_releases_done); - session->s_num_cap_releases -= CEPH_CAPS_PER_RELEASE - num; + int ret = 1; + spin_lock(&ci->i_ceph_lock); + if (want_snap_seq > 0 && !list_empty(&ci->i_cap_snaps)) { + struct ceph_cap_snap *capsnap = + list_first_entry(&ci->i_cap_snaps, + struct ceph_cap_snap, ci_item); + ret = capsnap->follows >= want_snap_seq; } - err = 0; - spin_unlock(&session->s_cap_lock); -out_unlocked: - return err; + spin_unlock(&ci->i_ceph_lock); + return ret; } -static int check_cap_flush(struct inode *inode, u64 want_flush_seq) +static int check_caps_flush(struct ceph_mds_client *mdsc, + u64 want_flush_tid) { - struct ceph_inode_info *ci = ceph_inode(inode); - int ret; - spin_lock(&ci->i_ceph_lock); - if (ci->i_flushing_caps) - ret = 
ci->i_cap_flush_seq >= want_flush_seq; - else - ret = 1; - spin_unlock(&ci->i_ceph_lock); + struct rb_node *n; + struct ceph_cap_flush *cf; + int ret = 1; + + spin_lock(&mdsc->cap_dirty_lock); + n = rb_first(&mdsc->cap_flush_tree); + cf = n ? rb_entry(n, struct ceph_cap_flush, g_node) : NULL; + if (cf && cf->tid <= want_flush_tid) { + dout("check_caps_flush still flushing tid %llu <= %llu\n", + cf->tid, want_flush_tid); + ret = 0; + } + spin_unlock(&mdsc->cap_dirty_lock); return ret; } /* * flush all dirty inode data to disk. * - * returns true if we've flushed through want_flush_seq + * returns true if we've flushed through want_flush_tid */ -static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) +static void wait_caps_flush(struct ceph_mds_client *mdsc, + u64 want_flush_tid, u64 want_snap_seq) { int mds; - dout("check_cap_flush want %lld\n", want_flush_seq); + dout("check_caps_flush want %llu snap want %llu\n", + want_flush_tid, want_snap_seq); mutex_lock(&mdsc->mutex); - for (mds = 0; mds < mdsc->max_sessions; mds++) { + for (mds = 0; mds < mdsc->max_sessions; ) { struct ceph_mds_session *session = mdsc->sessions[mds]; struct inode *inode = NULL; - if (!session) + if (!session) { + mds++; continue; + } get_session(session); mutex_unlock(&mdsc->mutex); mutex_lock(&session->s_mutex); - if (!list_empty(&session->s_cap_flushing)) { - struct ceph_inode_info *ci = - list_entry(session->s_cap_flushing.next, - struct ceph_inode_info, - i_flushing_item); - - if (!check_cap_flush(&ci->vfs_inode, want_flush_seq)) { - dout("check_cap_flush still flushing %p " - "seq %lld <= %lld to mds%d\n", - &ci->vfs_inode, ci->i_cap_flush_seq, - want_flush_seq, session->s_mds); + if (!list_empty(&session->s_cap_snaps_flushing)) { + struct ceph_cap_snap *capsnap = + list_first_entry(&session->s_cap_snaps_flushing, + struct ceph_cap_snap, + flushing_item); + struct ceph_inode_info *ci = capsnap->ci; + if (!check_capsnap_flush(ci, want_snap_seq)) { + dout("check_cap_flush still flushing snap %p " + "follows %lld <= %lld to mds%d\n", + &ci->vfs_inode, capsnap->follows, + want_snap_seq, mds); inode = igrab(&ci->vfs_inode); } } @@ -1520,15 +1553,21 @@ static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) if (inode) { wait_event(mdsc->cap_flushing_wq, - check_cap_flush(inode, want_flush_seq)); + check_capsnap_flush(ceph_inode(inode), + want_snap_seq)); iput(inode); + } else { + mds++; } mutex_lock(&mdsc->mutex); } - mutex_unlock(&mdsc->mutex); - dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq); + + wait_event(mdsc->cap_flushing_wq, + check_caps_flush(mdsc, want_flush_tid)); + + dout("check_caps_flush ok, flushed thru %llu\n", want_flush_tid); } /* @@ -1537,60 +1576,74 @@ static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) void ceph_send_cap_releases(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { - struct ceph_msg *msg; + struct ceph_msg *msg = NULL; + struct ceph_mds_cap_release *head; + struct ceph_mds_cap_item *item; + struct ceph_cap *cap; + LIST_HEAD(tmp_list); + int num_cap_releases; - dout("send_cap_releases mds%d\n", session->s_mds); spin_lock(&session->s_cap_lock); - while (!list_empty(&session->s_cap_releases_done)) { - msg = list_first_entry(&session->s_cap_releases_done, - struct ceph_msg, list_head); - list_del_init(&msg->list_head); - spin_unlock(&session->s_cap_lock); - msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); - dout("send_cap_releases mds%d %p\n", session->s_mds, msg); - 
ceph_con_send(&session->s_con, msg); - spin_lock(&session->s_cap_lock); - } +again: + list_splice_init(&session->s_cap_releases, &tmp_list); + num_cap_releases = session->s_num_cap_releases; + session->s_num_cap_releases = 0; spin_unlock(&session->s_cap_lock); -} - -static void discard_cap_releases(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session) -{ - struct ceph_msg *msg; - struct ceph_mds_cap_release *head; - unsigned num; - dout("discard_cap_releases mds%d\n", session->s_mds); + while (!list_empty(&tmp_list)) { + if (!msg) { + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, + PAGE_CACHE_SIZE, GFP_NOFS, false); + if (!msg) + goto out_err; + head = msg->front.iov_base; + head->num = cpu_to_le32(0); + msg->front.iov_len = sizeof(*head); + } + cap = list_first_entry(&tmp_list, struct ceph_cap, + session_caps); + list_del(&cap->session_caps); + num_cap_releases--; - if (!list_empty(&session->s_cap_releases)) { - /* zero out the in-progress message */ - msg = list_first_entry(&session->s_cap_releases, - struct ceph_msg, list_head); head = msg->front.iov_base; - num = le32_to_cpu(head->num); - dout("discard_cap_releases mds%d %p %u\n", - session->s_mds, msg, num); - head->num = cpu_to_le32(0); - msg->front.iov_len = sizeof(*head); - session->s_num_cap_releases += num; + le32_add_cpu(&head->num, 1); + item = msg->front.iov_base + msg->front.iov_len; + item->ino = cpu_to_le64(cap->cap_ino); + item->cap_id = cpu_to_le64(cap->cap_id); + item->migrate_seq = cpu_to_le32(cap->mseq); + item->seq = cpu_to_le32(cap->issue_seq); + msg->front.iov_len += sizeof(*item); + + ceph_put_cap(mdsc, cap); + + if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) { + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); + dout("send_cap_releases mds%d %p\n", session->s_mds, msg); + ceph_con_send(&session->s_con, msg); + msg = NULL; + } } - /* requeue completed messages */ - while (!list_empty(&session->s_cap_releases_done)) { - msg = list_first_entry(&session->s_cap_releases_done, - struct ceph_msg, list_head); - list_del_init(&msg->list_head); + BUG_ON(num_cap_releases != 0); - head = msg->front.iov_base; - num = le32_to_cpu(head->num); - dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, - num); - session->s_num_cap_releases += num; - head->num = cpu_to_le32(0); - msg->front.iov_len = sizeof(*head); - list_add(&msg->list_head, &session->s_cap_releases); + spin_lock(&session->s_cap_lock); + if (!list_empty(&session->s_cap_releases)) + goto again; + spin_unlock(&session->s_cap_lock); + + if (msg) { + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); + dout("send_cap_releases mds%d %p\n", session->s_mds, msg); + ceph_con_send(&session->s_con, msg); } + return; +out_err: + pr_err("send_cap_releases mds%d, failed to allocate message\n", + session->s_mds); + spin_lock(&session->s_cap_lock); + list_splice(&tmp_list, &session->s_cap_releases); + session->s_num_cap_releases += num_cap_releases; + spin_unlock(&session->s_cap_lock); } /* @@ -1615,7 +1668,8 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, order = get_order(size * num_entries); while (order >= 0) { - rinfo->dir_in = (void*)__get_free_pages(GFP_NOFS | __GFP_NOWARN, + rinfo->dir_in = (void*)__get_free_pages(GFP_KERNEL | + __GFP_NOWARN, order); if (rinfo->dir_in) break; @@ -1677,13 +1731,9 @@ static struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc) struct ceph_mds_request, r_node); } -static u64 __get_oldest_tid(struct ceph_mds_client *mdsc) +static inline u64 __get_oldest_tid(struct 
ceph_mds_client *mdsc) { - struct ceph_mds_request *req = __get_oldest_req(mdsc); - - if (req) - return req->r_tid; - return 0; + return mdsc->oldest_tid; } /* @@ -1712,7 +1762,7 @@ retry: seq = read_seqbegin(&rename_lock); rcu_read_lock(); for (temp = dentry; !IS_ROOT(temp);) { - struct inode *inode = temp->d_inode; + struct inode *inode = d_inode(temp); if (inode && ceph_snap(inode) == CEPH_SNAPDIR) len++; /* slash only */ else if (stop_on_nosnap && inode && @@ -1736,7 +1786,7 @@ retry: struct inode *inode; spin_lock(&temp->d_lock); - inode = temp->d_inode; + inode = d_inode(temp); if (inode && ceph_snap(inode) == CEPH_SNAPDIR) { dout("build_path path+%d: %p SNAPDIR\n", pos, temp); @@ -1770,7 +1820,7 @@ retry: goto retry; } - *base = ceph_ino(temp->d_inode); + *base = ceph_ino(d_inode(temp)); *plen = len; dout("build_path on %p %d built %llx '%.*s'\n", dentry, d_count(dentry), *base, len, path); @@ -1783,8 +1833,8 @@ static int build_dentry_path(struct dentry *dentry, { char *path; - if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) { - *pino = ceph_ino(dentry->d_parent->d_inode); + if (ceph_snap(d_inode(dentry->d_parent)) == CEPH_NOSNAP) { + *pino = ceph_ino(d_inode(dentry->d_parent)); *ppath = dentry->d_name.name; *ppathlen = dentry->d_name.len; return 0; @@ -1853,7 +1903,7 @@ static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry, */ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, struct ceph_mds_request *req, - int mds) + int mds, bool drop_cap_releases) { struct ceph_msg *msg; struct ceph_mds_request_head *head; @@ -1925,7 +1975,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, releases = 0; if (req->r_inode_drop) releases += ceph_encode_inode_release(&p, - req->r_inode ? req->r_inode : req->r_dentry->d_inode, + req->r_inode ? 
req->r_inode : d_inode(req->r_dentry), mds, req->r_inode_drop, req->r_inode_unless, 0); if (req->r_dentry_drop) releases += ceph_encode_dentry_release(&p, req->r_dentry, @@ -1935,8 +1985,14 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, mds, req->r_old_dentry_drop, req->r_old_dentry_unless); if (req->r_old_inode_drop) releases += ceph_encode_inode_release(&p, - req->r_old_dentry->d_inode, + d_inode(req->r_old_dentry), mds, req->r_old_inode_drop, req->r_old_inode_unless, 0); + + if (drop_cap_releases) { + releases = 0; + p = msg->front.iov_base + req->r_request_release_offset; + } + head->num_releases = cpu_to_le16(releases); /* time stamp */ @@ -1989,7 +2045,7 @@ static void complete_request(struct ceph_mds_client *mdsc, */ static int __prepare_send_request(struct ceph_mds_client *mdsc, struct ceph_mds_request *req, - int mds) + int mds, bool drop_cap_releases) { struct ceph_mds_request_head *rhead; struct ceph_msg *msg; @@ -2048,7 +2104,7 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, ceph_msg_put(req->r_request); req->r_request = NULL; } - msg = create_request_message(mdsc, req, mds); + msg = create_request_message(mdsc, req, mds, drop_cap_releases); if (IS_ERR(msg)) { req->r_err = PTR_ERR(msg); complete_request(mdsc, req); @@ -2132,7 +2188,7 @@ static int __do_request(struct ceph_mds_client *mdsc, if (req->r_request_started == 0) /* note request start time */ req->r_request_started = jiffies; - err = __prepare_send_request(mdsc, req, mds); + err = __prepare_send_request(mdsc, req, mds, false); if (!err) { ceph_msg_get(req->r_request); ceph_con_send(&session->s_con, req->r_request); @@ -2241,15 +2297,18 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, /* wait */ mutex_unlock(&mdsc->mutex); dout("do_request waiting\n"); - if (req->r_timeout) { - err = (long)wait_for_completion_killable_timeout( - &req->r_completion, req->r_timeout); - if (err == 0) - err = -EIO; - } else if (req->r_wait_for_completion) { + if (!req->r_timeout && req->r_wait_for_completion) { err = req->r_wait_for_completion(mdsc, req); } else { - err = wait_for_completion_killable(&req->r_completion); + long timeleft = wait_for_completion_killable_timeout( + &req->r_completion, + ceph_timeout_jiffies(req->r_timeout)); + if (timeleft > 0) + err = 0; + else if (!timeleft) + err = -EIO; /* timed out */ + else + err = timeleft; /* killed */ } dout("do_request waited, got %d\n", err); mutex_lock(&mdsc->mutex); @@ -2470,7 +2529,6 @@ out_err: } mutex_unlock(&mdsc->mutex); - ceph_add_cap_releases(mdsc, req->r_session); mutex_unlock(&session->s_mutex); /* kick calling process */ @@ -2590,6 +2648,7 @@ static void handle_session(struct ceph_mds_session *session, case CEPH_SESSION_CLOSE: if (session->s_state == CEPH_MDS_SESSION_RECONNECTING) pr_info("mds%d reconnect denied\n", session->s_mds); + cleanup_session_requests(mdsc, session); remove_session_caps(session); wake = 2; /* for good measure */ wake_up_all(&mdsc->session_close_wq); @@ -2658,7 +2717,7 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc, mutex_lock(&mdsc->mutex); list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) { - err = __prepare_send_request(mdsc, req, session->s_mds); + err = __prepare_send_request(mdsc, req, session->s_mds, true); if (!err) { ceph_msg_get(req->r_request); ceph_con_send(&session->s_con, req->r_request); @@ -2679,7 +2738,8 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc, continue; /* only old requests */ if (req->r_session && 
req->r_session->s_mds == session->s_mds) { - err = __prepare_send_request(mdsc, req, session->s_mds); + err = __prepare_send_request(mdsc, req, + session->s_mds, true); if (!err) { ceph_msg_get(req->r_request); ceph_con_send(&session->s_con, req->r_request); @@ -2860,11 +2920,11 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, */ session->s_cap_reconnect = 1; /* drop old cap expires; we're about to reestablish that state */ - discard_cap_releases(mdsc, session); - spin_unlock(&session->s_cap_lock); + cleanup_cap_releases(mdsc, session); /* trim unused caps to reduce MDS's cache rejoin time */ - shrink_dcache_parent(mdsc->fsc->sb->s_root); + if (mdsc->fsc->sb->s_root) + shrink_dcache_parent(mdsc->fsc->sb->s_root); ceph_con_close(&session->s_con); ceph_con_open(&session->s_con, @@ -2927,6 +2987,9 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, reply->hdr.data_len = cpu_to_le32(pagelist->length); ceph_msg_data_add_pagelist(reply, pagelist); + + ceph_early_kick_flushing_caps(mdsc, session); + ceph_con_send(&session->s_con, reply); mutex_unlock(&session->s_mutex); @@ -3133,7 +3196,7 @@ static void handle_lease(struct ceph_mds_client *mdsc, di->lease_renew_from && di->lease_renew_after == 0) { unsigned long duration = - le32_to_cpu(h->duration_ms) * HZ / 1000; + msecs_to_jiffies(le32_to_cpu(h->duration_ms)); di->lease_seq = seq; dentry->d_time = di->lease_renew_from + duration; @@ -3323,7 +3386,6 @@ static void delayed_work(struct work_struct *work) send_renew_caps(mdsc, s); else ceph_con_keepalive(&s->s_con); - ceph_add_cap_releases(mdsc, s); if (s->s_state == CEPH_MDS_SESSION_OPEN || s->s_state == CEPH_MDS_SESSION_HUNG) ceph_send_cap_releases(mdsc, s); @@ -3361,11 +3423,13 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) atomic_set(&mdsc->num_sessions, 0); mdsc->max_sessions = 0; mdsc->stopping = 0; + mdsc->last_snap_seq = 0; init_rwsem(&mdsc->snap_rwsem); mdsc->snap_realms = RB_ROOT; INIT_LIST_HEAD(&mdsc->snap_empty); spin_lock_init(&mdsc->snap_empty_lock); mdsc->last_tid = 0; + mdsc->oldest_tid = 0; mdsc->request_tree = RB_ROOT; INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work); mdsc->last_renew_caps = jiffies; @@ -3373,7 +3437,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) spin_lock_init(&mdsc->cap_delay_lock); INIT_LIST_HEAD(&mdsc->snap_flush_list); spin_lock_init(&mdsc->snap_flush_lock); - mdsc->cap_flush_seq = 0; + mdsc->last_cap_flush_tid = 1; + mdsc->cap_flush_tree = RB_ROOT; INIT_LIST_HEAD(&mdsc->cap_dirty); INIT_LIST_HEAD(&mdsc->cap_dirty_migrating); mdsc->num_cap_flushing = 0; @@ -3385,6 +3450,9 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) ceph_caps_init(mdsc); ceph_adjust_min_caps(mdsc, fsc->min_caps); + init_rwsem(&mdsc->pool_perm_rwsem); + mdsc->pool_perm_tree = RB_ROOT; + return 0; } @@ -3394,8 +3462,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) */ static void wait_requests(struct ceph_mds_client *mdsc) { + struct ceph_options *opts = mdsc->fsc->client->options; struct ceph_mds_request *req; - struct ceph_fs_client *fsc = mdsc->fsc; mutex_lock(&mdsc->mutex); if (__get_oldest_req(mdsc)) { @@ -3403,7 +3471,7 @@ static void wait_requests(struct ceph_mds_client *mdsc) dout("wait_requests waiting for requests\n"); wait_for_completion_timeout(&mdsc->safe_umount_waiters, - fsc->client->options->mount_timeout * HZ); + ceph_timeout_jiffies(opts->mount_timeout)); /* tear down remaining requests */ mutex_lock(&mdsc->mutex); @@ -3456,7 +3524,8 @@ restart: nextreq = rb_entry(n, struct ceph_mds_request, r_node); else nextreq = NULL; - if 
((req->r_op & CEPH_MDS_OP_WRITE)) { + if (req->r_op != CEPH_MDS_OP_SETFILELOCK && + (req->r_op & CEPH_MDS_OP_WRITE)) { /* write op */ ceph_mdsc_get_request(req); if (nextreq) @@ -3484,7 +3553,7 @@ restart: void ceph_mdsc_sync(struct ceph_mds_client *mdsc) { - u64 want_tid, want_flush; + u64 want_tid, want_flush, want_snap; if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN) return; @@ -3496,13 +3565,18 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) ceph_flush_dirty_caps(mdsc); spin_lock(&mdsc->cap_dirty_lock); - want_flush = mdsc->cap_flush_seq; + want_flush = mdsc->last_cap_flush_tid; spin_unlock(&mdsc->cap_dirty_lock); - dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush); + down_read(&mdsc->snap_rwsem); + want_snap = mdsc->last_snap_seq; + up_read(&mdsc->snap_rwsem); + + dout("sync want tid %lld flush_seq %lld snap_seq %lld\n", + want_tid, want_flush, want_snap); wait_unsafe_requests(mdsc, want_tid); - wait_caps_flush(mdsc, want_flush); + wait_caps_flush(mdsc, want_flush, want_snap); } /* @@ -3520,10 +3594,9 @@ static bool done_closing_sessions(struct ceph_mds_client *mdsc) */ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) { + struct ceph_options *opts = mdsc->fsc->client->options; struct ceph_mds_session *session; int i; - struct ceph_fs_client *fsc = mdsc->fsc; - unsigned long timeout = fsc->client->options->mount_timeout * HZ; dout("close_sessions\n"); @@ -3544,7 +3617,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) dout("waiting for sessions to close\n"); wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc), - timeout); + ceph_timeout_jiffies(opts->mount_timeout)); /* tear down remaining sessions */ mutex_lock(&mdsc->mutex); @@ -3578,6 +3651,7 @@ static void ceph_mdsc_stop(struct ceph_mds_client *mdsc) ceph_mdsmap_destroy(mdsc->mdsmap); kfree(mdsc->sessions); ceph_caps_finalize(mdsc); + ceph_pool_perm_destroy(mdsc); } void ceph_mdsc_destroy(struct ceph_fs_client *fsc) diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 1875b5d985c6..762757e6cebf 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -139,7 +139,6 @@ struct ceph_mds_session { int s_cap_reconnect; int s_readonly; struct list_head s_cap_releases; /* waiting cap_release messages */ - struct list_head s_cap_releases_done; /* ready to send */ struct ceph_cap *s_cap_iterator; /* protected by mutex */ @@ -228,7 +227,7 @@ struct ceph_mds_request { int r_err; bool r_aborted; - unsigned long r_timeout; /* optional. jiffies */ + unsigned long r_timeout; /* optional. jiffies, 0 is "wait forever" */ unsigned long r_started; /* start time to measure timeout against */ unsigned long r_request_started; /* start time for mds request only, used to measure lease durations */ @@ -254,12 +253,21 @@ struct ceph_mds_request { bool r_got_unsafe, r_got_safe, r_got_result; bool r_did_prepopulate; + long long r_dir_release_cnt; + long long r_dir_ordered_cnt; + int r_readdir_cache_idx; u32 r_readdir_offset; struct ceph_cap_reservation r_caps_reservation; int r_num_caps; }; +struct ceph_pool_perm { + struct rb_node node; + u32 pool; + int perm; +}; + /* * mds client state */ @@ -284,12 +292,15 @@ struct ceph_mds_client { * references (implying they contain no inodes with caps) that * should be destroyed. 
*/ + u64 last_snap_seq; struct rw_semaphore snap_rwsem; struct rb_root snap_realms; struct list_head snap_empty; spinlock_t snap_empty_lock; /* protect snap_empty */ u64 last_tid; /* most recent mds request */ + u64 oldest_tid; /* oldest incomplete mds request, + excluding setfilelock requests */ struct rb_root request_tree; /* pending mds requests */ struct delayed_work delayed_work; /* delayed work */ unsigned long last_renew_caps; /* last time we renewed our caps */ @@ -298,7 +309,8 @@ struct ceph_mds_client { struct list_head snap_flush_list; /* cap_snaps ready to flush */ spinlock_t snap_flush_lock; - u64 cap_flush_seq; + u64 last_cap_flush_tid; + struct rb_root cap_flush_tree; struct list_head cap_dirty; /* inodes with dirty caps */ struct list_head cap_dirty_migrating; /* ...that are migration... */ int num_cap_flushing; /* # caps we are flushing */ @@ -328,6 +340,9 @@ struct ceph_mds_client { spinlock_t dentry_lru_lock; struct list_head dentry_lru; int num_dentry; + + struct rw_semaphore pool_perm_rwsem; + struct rb_root pool_perm_tree; }; extern const char *ceph_mds_op_name(int op); @@ -379,8 +394,6 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req) kref_put(&req->r_kref, ceph_mdsc_release_request); } -extern int ceph_add_cap_releases(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session); extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index a97e39f09ba6..233d906aec02 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -296,7 +296,7 @@ static int cmpu64_rev(const void *a, const void *b) } -static struct ceph_snap_context *empty_snapc; +struct ceph_snap_context *ceph_empty_snapc; /* * build the snap context for a given realm. @@ -338,9 +338,9 @@ static int build_snap_context(struct ceph_snap_realm *realm) return 0; } - if (num == 0 && realm->seq == empty_snapc->seq) { - ceph_get_snap_context(empty_snapc); - snapc = empty_snapc; + if (num == 0 && realm->seq == ceph_empty_snapc->seq) { + ceph_get_snap_context(ceph_empty_snapc); + snapc = ceph_empty_snapc; goto done; } @@ -436,6 +436,14 @@ static int dup_array(u64 **dst, __le64 *src, u32 num) return 0; } +static bool has_new_snaps(struct ceph_snap_context *o, + struct ceph_snap_context *n) +{ + if (n->num_snaps == 0) + return false; + /* snaps are in descending order */ + return n->snaps[0] > o->seq; +} /* * When a snapshot is applied, the size/mtime inode metadata is queued @@ -455,6 +463,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) { struct inode *inode = &ci->vfs_inode; struct ceph_cap_snap *capsnap; + struct ceph_snap_context *old_snapc, *new_snapc; int used, dirty; capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS); @@ -467,6 +476,9 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) used = __ceph_caps_used(ci); dirty = __ceph_caps_dirty(ci); + old_snapc = ci->i_head_snapc; + new_snapc = ci->i_snap_realm->cached_context; + /* * If there is a write in progress, treat that as a dirty Fw, * even though it hasn't completed yet; by the time we finish @@ -481,76 +493,95 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) writes in progress now were started before the previous cap_snap. lucky us. 
*/ dout("queue_cap_snap %p already pending\n", inode); - kfree(capsnap); - } else if (ci->i_snap_realm->cached_context == empty_snapc) { - dout("queue_cap_snap %p empty snapc\n", inode); - kfree(capsnap); - } else if (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL| - CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)) { - struct ceph_snap_context *snapc = ci->i_head_snapc; - - /* - * if we are a sync write, we may need to go to the snaprealm - * to get the current snapc. - */ - if (!snapc) - snapc = ci->i_snap_realm->cached_context; + goto update_snapc; + } + if (ci->i_wrbuffer_ref_head == 0 && + !(dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR))) { + dout("queue_cap_snap %p nothing dirty|writing\n", inode); + goto update_snapc; + } - dout("queue_cap_snap %p cap_snap %p queuing under %p %s\n", - inode, capsnap, snapc, ceph_cap_string(dirty)); - ihold(inode); + BUG_ON(!old_snapc); - atomic_set(&capsnap->nref, 1); - capsnap->ci = ci; - INIT_LIST_HEAD(&capsnap->ci_item); - INIT_LIST_HEAD(&capsnap->flushing_item); - - capsnap->follows = snapc->seq; - capsnap->issued = __ceph_caps_issued(ci, NULL); - capsnap->dirty = dirty; - - capsnap->mode = inode->i_mode; - capsnap->uid = inode->i_uid; - capsnap->gid = inode->i_gid; - - if (dirty & CEPH_CAP_XATTR_EXCL) { - __ceph_build_xattrs_blob(ci); - capsnap->xattr_blob = - ceph_buffer_get(ci->i_xattrs.blob); - capsnap->xattr_version = ci->i_xattrs.version; - } else { - capsnap->xattr_blob = NULL; - capsnap->xattr_version = 0; + /* + * There is no need to send FLUSHSNAP message to MDS if there is + * no new snapshot. But when there is dirty pages or on-going + * writes, we still need to create cap_snap. cap_snap is needed + * by the write path and page writeback path. + * + * also see ceph_try_drop_cap_snap() + */ + if (has_new_snaps(old_snapc, new_snapc)) { + if (dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR)) + capsnap->need_flush = true; + } else { + if (!(used & CEPH_CAP_FILE_WR) && + ci->i_wrbuffer_ref_head == 0) { + dout("queue_cap_snap %p " + "no new_snap|dirty_page|writing\n", inode); + goto update_snapc; } + } - capsnap->inline_data = ci->i_inline_version != CEPH_INLINE_NONE; - - /* dirty page count moved from _head to this cap_snap; - all subsequent writes page dirties occur _after_ this - snapshot. */ - capsnap->dirty_pages = ci->i_wrbuffer_ref_head; - ci->i_wrbuffer_ref_head = 0; - capsnap->context = snapc; - ci->i_head_snapc = - ceph_get_snap_context(ci->i_snap_realm->cached_context); - dout(" new snapc is %p\n", ci->i_head_snapc); - list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); - - if (used & CEPH_CAP_FILE_WR) { - dout("queue_cap_snap %p cap_snap %p snapc %p" - " seq %llu used WR, now pending\n", inode, - capsnap, snapc, snapc->seq); - capsnap->writing = 1; - } else { - /* note mtime, size NOW. */ - __ceph_finish_cap_snap(ci, capsnap); - } + dout("queue_cap_snap %p cap_snap %p queuing under %p %s %s\n", + inode, capsnap, old_snapc, ceph_cap_string(dirty), + capsnap->need_flush ? 
"" : "no_flush"); + ihold(inode); + + atomic_set(&capsnap->nref, 1); + capsnap->ci = ci; + INIT_LIST_HEAD(&capsnap->ci_item); + INIT_LIST_HEAD(&capsnap->flushing_item); + + capsnap->follows = old_snapc->seq; + capsnap->issued = __ceph_caps_issued(ci, NULL); + capsnap->dirty = dirty; + + capsnap->mode = inode->i_mode; + capsnap->uid = inode->i_uid; + capsnap->gid = inode->i_gid; + + if (dirty & CEPH_CAP_XATTR_EXCL) { + __ceph_build_xattrs_blob(ci); + capsnap->xattr_blob = + ceph_buffer_get(ci->i_xattrs.blob); + capsnap->xattr_version = ci->i_xattrs.version; } else { - dout("queue_cap_snap %p nothing dirty|writing\n", inode); - kfree(capsnap); + capsnap->xattr_blob = NULL; + capsnap->xattr_version = 0; } + capsnap->inline_data = ci->i_inline_version != CEPH_INLINE_NONE; + + /* dirty page count moved from _head to this cap_snap; + all subsequent writes page dirties occur _after_ this + snapshot. */ + capsnap->dirty_pages = ci->i_wrbuffer_ref_head; + ci->i_wrbuffer_ref_head = 0; + capsnap->context = old_snapc; + list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); + old_snapc = NULL; + + if (used & CEPH_CAP_FILE_WR) { + dout("queue_cap_snap %p cap_snap %p snapc %p" + " seq %llu used WR, now pending\n", inode, + capsnap, old_snapc, old_snapc->seq); + capsnap->writing = 1; + } else { + /* note mtime, size NOW. */ + __ceph_finish_cap_snap(ci, capsnap); + } + capsnap = NULL; + +update_snapc: + if (ci->i_head_snapc) { + ci->i_head_snapc = ceph_get_snap_context(new_snapc); + dout(" new snapc is %p\n", new_snapc); + } spin_unlock(&ci->i_ceph_lock); + + kfree(capsnap); + ceph_put_snap_context(old_snapc); } /* @@ -699,6 +730,8 @@ more: /* queue realm for cap_snap creation */ list_add(&realm->dirty_item, &dirty_realms); + if (realm->seq > mdsc->last_snap_seq) + mdsc->last_snap_seq = realm->seq; invalidate = 1; } else if (!realm->cached_context) { @@ -964,14 +997,14 @@ out: int __init ceph_snap_init(void) { - empty_snapc = ceph_create_snap_context(0, GFP_NOFS); - if (!empty_snapc) + ceph_empty_snapc = ceph_create_snap_context(0, GFP_NOFS); + if (!ceph_empty_snapc) return -ENOMEM; - empty_snapc->seq = 1; + ceph_empty_snapc->seq = 1; return 0; } void ceph_snap_exit(void) { - ceph_put_snap_context(empty_snapc); + ceph_put_snap_context(ceph_empty_snapc); } diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c index 51cc23e48111..89e6bc321df3 100644 --- a/fs/ceph/strings.c +++ b/fs/ceph/strings.c @@ -75,6 +75,7 @@ const char *ceph_mds_op_name(int op) case CEPH_MDS_OP_LSSNAP: return "lssnap"; case CEPH_MDS_OP_MKSNAP: return "mksnap"; case CEPH_MDS_OP_RMSNAP: return "rmsnap"; + case CEPH_MDS_OP_RENAMESNAP: return "renamesnap"; case CEPH_MDS_OP_SETFILELOCK: return "setfilelock"; case CEPH_MDS_OP_GETFILELOCK: return "getfilelock"; } diff --git a/fs/ceph/super.c b/fs/ceph/super.c index a63997b8bcff..d1c833c321b9 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -44,7 +44,7 @@ static void ceph_put_super(struct super_block *s) static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct ceph_fs_client *fsc = ceph_inode_to_client(dentry->d_inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(d_inode(dentry)); struct ceph_monmap *monmap = fsc->client->monc.monmap; struct ceph_statfs st; u64 fsid; @@ -134,10 +134,12 @@ enum { Opt_noino32, Opt_fscache, Opt_nofscache, + Opt_poolperm, + Opt_nopoolperm, #ifdef CONFIG_CEPH_FS_POSIX_ACL Opt_acl, #endif - Opt_noacl + Opt_noacl, }; static match_table_t fsopt_tokens = { @@ -165,6 +167,8 @@ static match_table_t fsopt_tokens = { {Opt_noino32, "noino32"}, 
{Opt_fscache, "fsc"}, {Opt_nofscache, "nofsc"}, + {Opt_poolperm, "poolperm"}, + {Opt_nopoolperm, "nopoolperm"}, #ifdef CONFIG_CEPH_FS_POSIX_ACL {Opt_acl, "acl"}, #endif @@ -268,6 +272,13 @@ static int parse_fsopt_token(char *c, void *private) case Opt_nofscache: fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE; break; + case Opt_poolperm: + fsopt->flags &= ~CEPH_MOUNT_OPT_NOPOOLPERM; + printk ("pool perm"); + break; + case Opt_nopoolperm: + fsopt->flags |= CEPH_MOUNT_OPT_NOPOOLPERM; + break; #ifdef CONFIG_CEPH_FS_POSIX_ACL case Opt_acl: fsopt->sb_flags |= MS_POSIXACL; @@ -345,6 +356,11 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt, fsopt->rsize = CEPH_RSIZE_DEFAULT; fsopt->rasize = CEPH_RASIZE_DEFAULT; fsopt->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL); + if (!fsopt->snapdir_name) { + err = -ENOMEM; + goto out; + } + fsopt->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT; fsopt->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT; fsopt->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT; @@ -406,31 +422,20 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) { struct ceph_fs_client *fsc = ceph_sb_to_client(root->d_sb); struct ceph_mount_options *fsopt = fsc->mount_options; - struct ceph_options *opt = fsc->client->options; - - if (opt->flags & CEPH_OPT_FSID) - seq_printf(m, ",fsid=%pU", &opt->fsid); - if (opt->flags & CEPH_OPT_NOSHARE) - seq_puts(m, ",noshare"); - if (opt->flags & CEPH_OPT_NOCRC) - seq_puts(m, ",nocrc"); - if (opt->flags & CEPH_OPT_NOMSGAUTH) - seq_puts(m, ",nocephx_require_signatures"); - if ((opt->flags & CEPH_OPT_TCP_NODELAY) == 0) - seq_puts(m, ",notcp_nodelay"); - - if (opt->name) - seq_printf(m, ",name=%s", opt->name); - if (opt->key) - seq_puts(m, ",secret=<hidden>"); - - if (opt->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT) - seq_printf(m, ",mount_timeout=%d", opt->mount_timeout); - if (opt->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT) - seq_printf(m, ",osd_idle_ttl=%d", opt->osd_idle_ttl); - if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT) - seq_printf(m, ",osdkeepalivetimeout=%d", - opt->osd_keepalive_timeout); + size_t pos; + int ret; + + /* a comma between MNT/MS and client options */ + seq_putc(m, ','); + pos = m->count; + + ret = ceph_print_client_options(m, fsc->client); + if (ret) + return ret; + + /* retract our comma if no client options */ + if (m->count == pos) + m->count--; if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT) seq_puts(m, ",dirstat"); @@ -438,14 +443,12 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",norbytes"); if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR) seq_puts(m, ",noasyncreaddir"); - if (fsopt->flags & CEPH_MOUNT_OPT_DCACHE) - seq_puts(m, ",dcache"); - else + if ((fsopt->flags & CEPH_MOUNT_OPT_DCACHE) == 0) seq_puts(m, ",nodcache"); if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) seq_puts(m, ",fsc"); - else - seq_puts(m, ",nofsc"); + if (fsopt->flags & CEPH_MOUNT_OPT_NOPOOLPERM) + seq_puts(m, ",nopoolperm"); #ifdef CONFIG_CEPH_FS_POSIX_ACL if (fsopt->sb_flags & MS_POSIXACL) @@ -477,6 +480,7 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_printf(m, ",readdir_max_bytes=%d", fsopt->max_readdir_bytes); if (strcmp(fsopt->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT)) seq_printf(m, ",snapdirname=%s", fsopt->snapdir_name); + return 0; } @@ -618,6 +622,7 @@ static void destroy_fs_client(struct ceph_fs_client *fsc) */ struct kmem_cache *ceph_inode_cachep; struct kmem_cache *ceph_cap_cachep; +struct 
kmem_cache *ceph_cap_flush_cachep; struct kmem_cache *ceph_dentry_cachep; struct kmem_cache *ceph_file_cachep; @@ -643,6 +648,10 @@ static int __init init_caches(void) SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); if (ceph_cap_cachep == NULL) goto bad_cap; + ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); + if (ceph_cap_flush_cachep == NULL) + goto bad_cap_flush; ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); @@ -661,6 +670,8 @@ static int __init init_caches(void) bad_file: kmem_cache_destroy(ceph_dentry_cachep); bad_dentry: + kmem_cache_destroy(ceph_cap_flush_cachep); +bad_cap_flush: kmem_cache_destroy(ceph_cap_cachep); bad_cap: kmem_cache_destroy(ceph_inode_cachep); @@ -677,6 +688,7 @@ static void destroy_caches(void) kmem_cache_destroy(ceph_inode_cachep); kmem_cache_destroy(ceph_cap_cachep); + kmem_cache_destroy(ceph_cap_flush_cachep); kmem_cache_destroy(ceph_dentry_cachep); kmem_cache_destroy(ceph_file_cachep); @@ -730,10 +742,15 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc, if (IS_ERR(req)) return ERR_CAST(req); req->r_path1 = kstrdup(path, GFP_NOFS); + if (!req->r_path1) { + root = ERR_PTR(-ENOMEM); + goto out; + } + req->r_ino1.ino = CEPH_INO_ROOT; req->r_ino1.snap = CEPH_NOSNAP; req->r_started = started; - req->r_timeout = fsc->client->options->mount_timeout * HZ; + req->r_timeout = fsc->client->options->mount_timeout; req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); req->r_num_caps = 2; err = ceph_mdsc_do_request(mdsc, NULL, req); @@ -976,7 +993,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, if (IS_ERR(res)) goto out_splat; dout("root %p inode %p ino %llx.%llx\n", res, - res->d_inode, ceph_vinop(res->d_inode)); + d_inode(res), ceph_vinop(d_inode(res))); return res; out_splat: diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 04c8124ed30e..2f2460d23a06 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -35,8 +35,10 @@ #define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */ #define CEPH_MOUNT_OPT_DCACHE (1<<9) /* use dcache for readdir etc */ #define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */ +#define CEPH_MOUNT_OPT_NOPOOLPERM (1<<11) /* no pool permission check */ -#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES) +#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES | \ + CEPH_MOUNT_OPT_DCACHE) #define ceph_set_mount_opt(fsc, opt) \ (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt; @@ -120,11 +122,21 @@ struct ceph_cap { struct rb_node ci_node; /* per-ci cap tree */ struct ceph_mds_session *session; struct list_head session_caps; /* per-session caplist */ - int mds; u64 cap_id; /* unique cap id (mds provided) */ - int issued; /* latest, from the mds */ - int implemented; /* implemented superset of issued (for revocation) */ - int mds_wanted; + union { + /* in-use caps */ + struct { + int issued; /* latest, from the mds */ + int implemented; /* implemented superset of + issued (for revocation) */ + int mds, mds_wanted; + }; + /* caps to release */ + struct { + u64 cap_ino; + int queue_release; + }; + }; u32 seq, issue_seq, mseq; u32 cap_gen; /* active/stale cycle */ unsigned long last_used; @@ -162,6 +174,7 @@ struct ceph_cap_snap { int writing; /* a sync write is still in progress */ int dirty_pages; /* dirty pages awaiting writeback */ bool inline_data; + bool need_flush; }; static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap) @@ -173,6 +186,16 @@ static inline void ceph_put_cap_snap(struct ceph_cap_snap 
*capsnap) } } +struct ceph_cap_flush { + u64 tid; + int caps; + struct rb_node g_node; // global + union { + struct rb_node i_node; // inode + struct list_head list; + }; +}; + /* * The frag tree describes how a directory is fragmented, potentially across * multiple metadata servers. It is also used to indicate points where @@ -258,9 +281,9 @@ struct ceph_inode_info { u32 i_time_warp_seq; unsigned i_ceph_flags; - int i_ordered_count; - atomic_t i_release_count; - atomic_t i_complete_count; + atomic64_t i_release_count; + atomic64_t i_ordered_count; + atomic64_t i_complete_seq[2]; struct ceph_dir_layout i_dir_layout; struct ceph_file_layout i_layout; @@ -282,11 +305,11 @@ struct ceph_inode_info { struct ceph_cap *i_auth_cap; /* authoritative cap, if any */ unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */ struct list_head i_dirty_item, i_flushing_item; - u64 i_cap_flush_seq; /* we need to track cap writeback on a per-cap-bit basis, to allow * overlapping, pipelined cap flushes to the mds. we can probably * reduce the tid to 8 bits if we're concerned about inode size. */ - u16 i_cap_flush_last_tid, i_cap_flush_tid[CEPH_CAP_BITS]; + struct ceph_cap_flush *i_prealloc_cap_flush; + struct rb_root i_cap_flush_tree; wait_queue_head_t i_cap_wq; /* threads waiting on a capability */ unsigned long i_hold_caps_min; /* jiffies */ unsigned long i_hold_caps_max; /* jiffies */ @@ -437,36 +460,46 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, /* * Ceph inode. */ -#define CEPH_I_DIR_ORDERED 1 /* dentries in dir are ordered */ -#define CEPH_I_NODELAY 4 /* do not delay cap release */ -#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */ -#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */ +#define CEPH_I_DIR_ORDERED (1 << 0) /* dentries in dir are ordered */ +#define CEPH_I_NODELAY (1 << 1) /* do not delay cap release */ +#define CEPH_I_FLUSH (1 << 2) /* do not delay flush of dirty metadata */ +#define CEPH_I_NOFLUSH (1 << 3) /* do not flush dirty caps */ +#define CEPH_I_POOL_PERM (1 << 4) /* pool rd/wr bits are valid */ +#define CEPH_I_POOL_RD (1 << 5) /* can read from pool */ +#define CEPH_I_POOL_WR (1 << 6) /* can write to pool */ + static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci, - int release_count, int ordered_count) + long long release_count, + long long ordered_count) { - atomic_set(&ci->i_complete_count, release_count); - if (ci->i_ordered_count == ordered_count) - ci->i_ceph_flags |= CEPH_I_DIR_ORDERED; - else - ci->i_ceph_flags &= ~CEPH_I_DIR_ORDERED; + smp_mb__before_atomic(); + atomic64_set(&ci->i_complete_seq[0], release_count); + atomic64_set(&ci->i_complete_seq[1], ordered_count); } static inline void __ceph_dir_clear_complete(struct ceph_inode_info *ci) { - atomic_inc(&ci->i_release_count); + atomic64_inc(&ci->i_release_count); +} + +static inline void __ceph_dir_clear_ordered(struct ceph_inode_info *ci) +{ + atomic64_inc(&ci->i_ordered_count); } static inline bool __ceph_dir_is_complete(struct ceph_inode_info *ci) { - return atomic_read(&ci->i_complete_count) == - atomic_read(&ci->i_release_count); + return atomic64_read(&ci->i_complete_seq[0]) == + atomic64_read(&ci->i_release_count); } static inline bool __ceph_dir_is_complete_ordered(struct ceph_inode_info *ci) { - return __ceph_dir_is_complete(ci) && - (ci->i_ceph_flags & CEPH_I_DIR_ORDERED); + return atomic64_read(&ci->i_complete_seq[0]) == + atomic64_read(&ci->i_release_count) && + atomic64_read(&ci->i_complete_seq[1]) == + 
atomic64_read(&ci->i_ordered_count); } static inline void ceph_dir_clear_complete(struct inode *inode) @@ -476,20 +509,13 @@ static inline void ceph_dir_clear_complete(struct inode *inode) static inline void ceph_dir_clear_ordered(struct inode *inode) { - struct ceph_inode_info *ci = ceph_inode(inode); - spin_lock(&ci->i_ceph_lock); - ci->i_ordered_count++; - ci->i_ceph_flags &= ~CEPH_I_DIR_ORDERED; - spin_unlock(&ci->i_ceph_lock); + __ceph_dir_clear_ordered(ceph_inode(inode)); } static inline bool ceph_dir_is_complete_ordered(struct inode *inode) { - struct ceph_inode_info *ci = ceph_inode(inode); - bool ret; - spin_lock(&ci->i_ceph_lock); - ret = __ceph_dir_is_complete_ordered(ci); - spin_unlock(&ci->i_ceph_lock); + bool ret = __ceph_dir_is_complete_ordered(ceph_inode(inode)); + smp_rmb(); return ret; } @@ -551,7 +577,10 @@ static inline int __ceph_caps_dirty(struct ceph_inode_info *ci) { return ci->i_dirty_caps | ci->i_flushing_caps; } -extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask); +extern struct ceph_cap_flush *ceph_alloc_cap_flush(void); +extern void ceph_free_cap_flush(struct ceph_cap_flush *cf); +extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, + struct ceph_cap_flush **pcf); extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci, struct ceph_cap *ocap, int mask); @@ -605,16 +634,20 @@ struct ceph_file_info { unsigned offset; /* offset of last chunk, adjusted for . and .. */ unsigned next_offset; /* offset of next chunk (last_name's + 1) */ char *last_name; /* last entry in previous chunk */ - struct dentry *dentry; /* next dentry (for dcache readdir) */ - int dir_release_count; - int dir_ordered_count; + long long dir_release_count; + long long dir_ordered_count; + int readdir_cache_idx; /* used for -o dirstat read() on directory thing */ char *dir_info; int dir_info_len; }; - +struct ceph_readdir_cache_control { + struct page *page; + struct dentry **dentries; + int index; +}; /* * A "snap realm" describes a subset of the file hierarchy sharing @@ -686,6 +719,7 @@ static inline int default_congestion_kb(void) /* snap.c */ +extern struct ceph_snap_context *ceph_empty_snapc; struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, u64 ino); extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc, @@ -712,8 +746,8 @@ extern void ceph_snap_exit(void); static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci) { return !list_empty(&ci->i_cap_snaps) && - list_entry(ci->i_cap_snaps.prev, struct ceph_cap_snap, - ci_item)->writing; + list_last_entry(&ci->i_cap_snaps, struct ceph_cap_snap, + ci_item)->writing; } /* inode.c */ @@ -837,12 +871,12 @@ extern void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap); extern int ceph_is_any_caps(struct inode *inode); -extern void __queue_cap_release(struct ceph_mds_session *session, u64 ino, - u64 cap_id, u32 migrate_seq, u32 issue_seq); extern void ceph_queue_caps_release(struct inode *inode); extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc); extern int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync); +extern void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session); extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, @@ -878,10 +912,12 @@ extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode); /* addr.c */ 
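/*
 * Sketch of the pool-permission cache this patch introduces
 * (pool_perm_tree in ceph_mds_client, consulted by the
 * ceph_pool_perm_check() declared just below): the first access to an
 * OSD pool probes the OSD for read/write capability and caches the
 * result, so later I/O only does a cheap lookup.  Userspace model with
 * a linear table in place of the rbtree; the demo_* names and the
 * DEMO_POOL_* bit encoding are illustrative assumptions, not the
 * kernel API.
 */
#include <stdio.h>
#include <stdint.h>

#define DEMO_POOL_RD (1 << 0)   /* reads allowed, cf. CEPH_I_POOL_RD */
#define DEMO_POOL_WR (1 << 1)   /* writes allowed, cf. CEPH_I_POOL_WR */

struct demo_pool_perm {
        uint32_t pool;          /* OSD pool id */
        int perm;               /* cached DEMO_POOL_* bits */
};

/* returns cached bits, or -1 when the pool has not been probed yet */
static int demo_perm_lookup(const struct demo_pool_perm *tbl, int n,
                            uint32_t pool)
{
        for (int i = 0; i < n; i++)
                if (tbl[i].pool == pool)
                        return tbl[i].perm;
        return -1;
}

int main(void)
{
        struct demo_pool_perm cache[] = {
                { 1, DEMO_POOL_RD | DEMO_POOL_WR },
                { 2, DEMO_POOL_RD },
        };
        int perm = demo_perm_lookup(cache, 2, 2);

        if (perm < 0)
                printf("pool 2: not cached, probe the OSD first\n");
        else
                printf("pool 2: read %s, write %s\n",
                       (perm & DEMO_POOL_RD) ? "ok" : "denied",
                       (perm & DEMO_POOL_WR) ? "ok" : "denied");
        return 0;
}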
extern const struct address_space_operations ceph_aops; extern int ceph_mmap(struct file *file, struct vm_area_struct *vma); +extern int ceph_uninline_data(struct file *filp, struct page *locked_page); +extern int ceph_pool_perm_check(struct ceph_inode_info *ci, int need); +extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc); /* file.c */ extern const struct file_operations ceph_file_fops; -extern const struct address_space_operations ceph_aops; extern int ceph_open(struct inode *inode, struct file *file); extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, @@ -890,7 +926,6 @@ extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, extern int ceph_release(struct inode *inode, struct file *filp); extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, char *data, size_t len); -int ceph_uninline_data(struct file *filp, struct page *locked_page); /* dir.c */ extern const struct file_operations ceph_dir_fops; extern const struct file_operations ceph_snapdir_fops; @@ -911,6 +946,7 @@ extern void ceph_dentry_lru_del(struct dentry *dn); extern void ceph_invalidate_dentry_lease(struct dentry *dentry); extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn); extern struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry); +extern void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl); /* * our d_ops vary depending on whether the inode is live, diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 5a492caf34cb..819163d8313b 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -776,12 +776,12 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) return generic_getxattr(dentry, name, value, size); - return __ceph_getxattr(dentry->d_inode, name, value, size); + return __ceph_getxattr(d_inode(dentry), name, value, size); } ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_vxattr *vxattrs = ceph_inode_vxattrs(inode); u32 vir_namelen = 0; @@ -847,7 +847,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, const char *value, size_t size, int flags) { struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_request *req; struct ceph_mds_client *mdsc = fsc->mdsc; @@ -877,16 +877,23 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, err = PTR_ERR(req); goto out; } - req->r_inode = inode; - ihold(inode); - req->r_inode_drop = CEPH_CAP_XATTR_SHARED; - req->r_num_caps = 1; + req->r_args.setxattr.flags = cpu_to_le32(flags); req->r_path2 = kstrdup(name, GFP_NOFS); + if (!req->r_path2) { + ceph_mdsc_put_request(req); + err = -ENOMEM; + goto out; + } req->r_pagelist = pagelist; pagelist = NULL; + req->r_inode = inode; + ihold(inode); + req->r_num_caps = 1; + req->r_inode_drop = CEPH_CAP_XATTR_SHARED; + dout("xattr.ver (before): %lld\n", ci->i_xattrs.version); err = ceph_mdsc_do_request(mdsc, NULL, req); ceph_mdsc_put_request(req); @@ -901,9 +908,11 @@ out: int __ceph_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ceph_vxattr 
*vxattr; struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc; + struct ceph_cap_flush *prealloc_cf = NULL; int issued; int err; int dirty = 0; @@ -913,6 +922,7 @@ int __ceph_setxattr(struct dentry *dentry, const char *name, char *newval = NULL; struct ceph_inode_xattr *xattr = NULL; int required_blob_size; + bool lock_snap_rwsem = false; if (!ceph_is_valid_xattr(name)) return -EOPNOTSUPP; @@ -941,12 +951,27 @@ int __ceph_setxattr(struct dentry *dentry, const char *name, if (!xattr) goto out; + prealloc_cf = ceph_alloc_cap_flush(); + if (!prealloc_cf) + goto out; + spin_lock(&ci->i_ceph_lock); retry: issued = __ceph_caps_issued(ci, NULL); - dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued)); if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) goto do_sync; + + if (!lock_snap_rwsem && !ci->i_head_snapc) { + lock_snap_rwsem = true; + if (!down_read_trylock(&mdsc->snap_rwsem)) { + spin_unlock(&ci->i_ceph_lock); + down_read(&mdsc->snap_rwsem); + spin_lock(&ci->i_ceph_lock); + goto retry; + } + } + + dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued)); __build_xattrs(inode); required_blob_size = __get_required_blob_size(ci, name_len, val_len); @@ -959,7 +984,7 @@ retry: dout(" preaallocating new blob size=%d\n", required_blob_size); blob = ceph_buffer_new(required_blob_size, GFP_NOFS); if (!blob) - goto out; + goto do_sync_unlocked; spin_lock(&ci->i_ceph_lock); if (ci->i_xattrs.prealloc_blob) ceph_buffer_put(ci->i_xattrs.prealloc_blob); @@ -971,21 +996,28 @@ retry: flags, value ? 1 : -1, &xattr); if (!err) { - dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL, + &prealloc_cf); ci->i_xattrs.dirty = true; inode->i_ctime = CURRENT_TIME; } spin_unlock(&ci->i_ceph_lock); + if (lock_snap_rwsem) + up_read(&mdsc->snap_rwsem); if (dirty) __mark_inode_dirty(inode, dirty); + ceph_free_cap_flush(prealloc_cf); return err; do_sync: spin_unlock(&ci->i_ceph_lock); do_sync_unlocked: + if (lock_snap_rwsem) + up_read(&mdsc->snap_rwsem); err = ceph_sync_setxattr(dentry, name, value, size, flags); out: + ceph_free_cap_flush(prealloc_cf); kfree(newname); kfree(newval); kfree(xattr); @@ -995,7 +1027,7 @@ out: int ceph_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - if (ceph_snap(dentry->d_inode) != CEPH_NOSNAP) + if (ceph_snap(d_inode(dentry)) != CEPH_NOSNAP) return -EROFS; if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) @@ -1011,7 +1043,7 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name) { struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); struct ceph_mds_client *mdsc = fsc->mdsc; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ceph_mds_request *req; int err; @@ -1019,12 +1051,14 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name) USE_AUTH_MDS); if (IS_ERR(req)) return PTR_ERR(req); + req->r_path2 = kstrdup(name, GFP_NOFS); + if (!req->r_path2) + return -ENOMEM; + req->r_inode = inode; ihold(inode); - req->r_inode_drop = CEPH_CAP_XATTR_SHARED; req->r_num_caps = 1; - req->r_path2 = kstrdup(name, GFP_NOFS); - + req->r_inode_drop = CEPH_CAP_XATTR_SHARED; err = ceph_mdsc_do_request(mdsc, NULL, req); ceph_mdsc_put_request(req); return err; @@ -1032,13 +1066,16 @@ static int ceph_send_removexattr(struct dentry *dentry, const char *name) int __ceph_removexattr(struct dentry *dentry, const 
char *name) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ceph_vxattr *vxattr; struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc; + struct ceph_cap_flush *prealloc_cf = NULL; int issued; int err; int required_blob_size; int dirty; + bool lock_snap_rwsem = false; if (!ceph_is_valid_xattr(name)) return -EOPNOTSUPP; @@ -1051,14 +1088,29 @@ int __ceph_removexattr(struct dentry *dentry, const char *name) if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN)) goto do_sync_unlocked; + prealloc_cf = ceph_alloc_cap_flush(); + if (!prealloc_cf) + return -ENOMEM; + err = -ENOMEM; spin_lock(&ci->i_ceph_lock); retry: issued = __ceph_caps_issued(ci, NULL); - dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued)); - if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) goto do_sync; + + if (!lock_snap_rwsem && !ci->i_head_snapc) { + lock_snap_rwsem = true; + if (!down_read_trylock(&mdsc->snap_rwsem)) { + spin_unlock(&ci->i_ceph_lock); + down_read(&mdsc->snap_rwsem); + spin_lock(&ci->i_ceph_lock); + goto retry; + } + } + + dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued)); + __build_xattrs(inode); required_blob_size = __get_required_blob_size(ci, 0, 0); @@ -1071,7 +1123,7 @@ retry: dout(" preaallocating new blob size=%d\n", required_blob_size); blob = ceph_buffer_new(required_blob_size, GFP_NOFS); if (!blob) - goto out; + goto do_sync_unlocked; spin_lock(&ci->i_ceph_lock); if (ci->i_xattrs.prealloc_blob) ceph_buffer_put(ci->i_xattrs.prealloc_blob); @@ -1081,24 +1133,30 @@ retry: err = __remove_xattr_by_name(ceph_inode(inode), name); - dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL, + &prealloc_cf); ci->i_xattrs.dirty = true; inode->i_ctime = CURRENT_TIME; spin_unlock(&ci->i_ceph_lock); + if (lock_snap_rwsem) + up_read(&mdsc->snap_rwsem); if (dirty) __mark_inode_dirty(inode, dirty); + ceph_free_cap_flush(prealloc_cf); return err; do_sync: spin_unlock(&ci->i_ceph_lock); do_sync_unlocked: + if (lock_snap_rwsem) + up_read(&mdsc->snap_rwsem); + ceph_free_cap_flush(prealloc_cf); err = ceph_send_removexattr(dentry, name); -out: return err; } int ceph_removexattr(struct dentry *dentry, const char *name) { - if (ceph_snap(dentry->d_inode) != CEPH_NOSNAP) + if (ceph_snap(d_inode(dentry)) != CEPH_NOSNAP) return -EROFS; if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index a2172f3f69e3..e7b478b49985 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -192,6 +192,15 @@ config CIFS_SMB2 options are also slightly simpler (compared to CIFS) due to protocol improvements. +config CIFS_SMB311 + bool "SMB3.1.1 network file system support (Experimental)" + depends on CIFS_SMB2 && INET + + help + This enables experimental support for the newest, SMB3.1.1, dialect. + This dialect includes improved security negotiation features. 
+ If unsure, say N + config CIFS_FSCACHE bool "Provide CIFS client caching support" depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index b8602f199815..7dc886c9a78f 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -24,6 +24,7 @@ #include "cifsfs.h" #include "dns_resolve.h" #include "cifs_debug.h" +#include "cifs_unicode.h" static LIST_HEAD(cifs_dfs_automount_list); @@ -301,7 +302,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) if (full_path == NULL) goto cdda_exit; - cifs_sb = CIFS_SB(mntpt->d_inode->i_sb); + cifs_sb = CIFS_SB(d_inode(mntpt)->i_sb); tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) { mnt = ERR_CAST(tlink); @@ -312,7 +313,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) xid = get_xid(); rc = get_dfs_path(xid, ses, full_path + 1, cifs_sb->local_nls, &num_referrals, &referrals, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_remap(cifs_sb)); free_xid(xid); cifs_put_tlink(tlink); diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c index 0303c6793d90..5a53ac6b1e02 100644 --- a/fs/cifs/cifs_unicode.c +++ b/fs/cifs/cifs_unicode.c @@ -27,41 +27,6 @@ #include "cifsglob.h" #include "cifs_debug.h" -/* - * cifs_utf16_bytes - how long will a string be after conversion? - * @utf16 - pointer to input string - * @maxbytes - don't go past this many bytes of input string - * @codepage - destination codepage - * - * Walk a utf16le string and return the number of bytes that the string will - * be after being converted to the given charset, not including any null - * termination required. Don't walk past maxbytes in the source buffer. - */ -int -cifs_utf16_bytes(const __le16 *from, int maxbytes, - const struct nls_table *codepage) -{ - int i; - int charlen, outlen = 0; - int maxwords = maxbytes / 2; - char tmp[NLS_MAX_CHARSET_SIZE]; - __u16 ftmp; - - for (i = 0; i < maxwords; i++) { - ftmp = get_unaligned_le16(&from[i]); - if (ftmp == 0) - break; - - charlen = codepage->uni2char(ftmp, tmp, NLS_MAX_CHARSET_SIZE); - if (charlen > 0) - outlen += charlen; - else - outlen++; - } - - return outlen; -} - int cifs_remap(struct cifs_sb_info *cifs_sb) { int map_type; @@ -155,10 +120,13 @@ convert_sfm_char(const __u16 src_char, char *target) * enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE). 
*/ static int -cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp, +cifs_mapchar(char *target, const __u16 *from, const struct nls_table *cp, int maptype) { int len = 1; + __u16 src_char; + + src_char = *from; if ((maptype == SFM_MAP_UNI_RSVD) && convert_sfm_char(src_char, target)) return len; @@ -168,10 +136,23 @@ cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp, /* if character not one of seven in special remap set */ len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE); - if (len <= 0) { - *target = '?'; - len = 1; - } + if (len <= 0) + goto surrogate_pair; + + return len; + +surrogate_pair: + /* convert SURROGATE_PAIR and IVS */ + if (strcmp(cp->charset, "utf8")) + goto unknown; + len = utf16s_to_utf8s(from, 3, UTF16_LITTLE_ENDIAN, target, 6); + if (len <= 0) + goto unknown; + return len; + +unknown: + *target = '?'; + len = 1; return len; } @@ -206,7 +187,7 @@ cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen, int nullsize = nls_nullsize(codepage); int fromwords = fromlen / 2; char tmp[NLS_MAX_CHARSET_SIZE]; - __u16 ftmp; + __u16 ftmp[3]; /* ftmp[3] = 3array x 2bytes = 6bytes UTF-16 */ /* * because the chars can be of varying widths, we need to take care @@ -217,9 +198,17 @@ cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen, safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize); for (i = 0; i < fromwords; i++) { - ftmp = get_unaligned_le16(&from[i]); - if (ftmp == 0) + ftmp[0] = get_unaligned_le16(&from[i]); + if (ftmp[0] == 0) break; + if (i + 1 < fromwords) + ftmp[1] = get_unaligned_le16(&from[i + 1]); + else + ftmp[1] = 0; + if (i + 2 < fromwords) + ftmp[2] = get_unaligned_le16(&from[i + 2]); + else + ftmp[2] = 0; /* * check to see if converting this character might make the @@ -234,6 +223,17 @@ cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen, /* put converted char into 'to' buffer */ charlen = cifs_mapchar(&to[outlen], ftmp, codepage, map_type); outlen += charlen; + + /* charlen (=bytes of UTF-8 for 1 character) + * 4bytes UTF-8(surrogate pair) is charlen=4 + * (4bytes UTF-16 code) + * 7-8bytes UTF-8(IVS) is charlen=3+4 or 4+4 + * (2 UTF-8 pairs divided to 2 UTF-16 pairs) */ + if (charlen == 4) + i++; + else if (charlen >= 5) + /* 5-6bytes UTF-8 */ + i += 2; } /* properly null-terminate string */ @@ -296,6 +296,46 @@ success: } /* + * cifs_utf16_bytes - how long will a string be after conversion? + * @utf16 - pointer to input string + * @maxbytes - don't go past this many bytes of input string + * @codepage - destination codepage + * + * Walk a utf16le string and return the number of bytes that the string will + * be after being converted to the given charset, not including any null + * termination required. Don't walk past maxbytes in the source buffer. 
+ */ +int +cifs_utf16_bytes(const __le16 *from, int maxbytes, + const struct nls_table *codepage) +{ + int i; + int charlen, outlen = 0; + int maxwords = maxbytes / 2; + char tmp[NLS_MAX_CHARSET_SIZE]; + __u16 ftmp[3]; + + for (i = 0; i < maxwords; i++) { + ftmp[0] = get_unaligned_le16(&from[i]); + if (ftmp[0] == 0) + break; + if (i + 1 < maxwords) + ftmp[1] = get_unaligned_le16(&from[i + 1]); + else + ftmp[1] = 0; + if (i + 2 < maxwords) + ftmp[2] = get_unaligned_le16(&from[i + 2]); + else + ftmp[2] = 0; + + charlen = cifs_mapchar(tmp, ftmp, codepage, NO_MAP_UNI_RSVD); + outlen += charlen; + } + + return outlen; +} + +/* * cifs_strndup_from_utf16 - copy a string from wire format to the local * codepage * @src - source string @@ -409,10 +449,15 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen, char src_char; __le16 dst_char; wchar_t tmp; + wchar_t *wchar_to; /* UTF-16 */ + int ret; + unicode_t u; if (map_chars == NO_MAP_UNI_RSVD) return cifs_strtoUTF16(target, source, PATH_MAX, cp); + wchar_to = kzalloc(6, GFP_KERNEL); + for (i = 0; i < srclen; j++) { src_char = source[i]; charlen = 1; @@ -441,11 +486,55 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen, * if no match, use question mark, which at least in * some cases serves as wild card */ - if (charlen < 1) { - dst_char = cpu_to_le16(0x003f); - charlen = 1; + if (charlen > 0) + goto ctoUTF16; + + /* convert SURROGATE_PAIR */ + if (strcmp(cp->charset, "utf8") || !wchar_to) + goto unknown; + if (*(source + i) & 0x80) { + charlen = utf8_to_utf32(source + i, 6, &u); + if (charlen < 0) + goto unknown; + } else + goto unknown; + ret = utf8s_to_utf16s(source + i, charlen, + UTF16_LITTLE_ENDIAN, + wchar_to, 6); + if (ret < 0) + goto unknown; + + i += charlen; + dst_char = cpu_to_le16(*wchar_to); + if (charlen <= 3) + /* 1-3bytes UTF-8 to 2bytes UTF-16 */ + put_unaligned(dst_char, &target[j]); + else if (charlen == 4) { + /* 4bytes UTF-8(surrogate pair) to 4bytes UTF-16 + * 7-8bytes UTF-8(IVS) divided to 2 UTF-16 + * (charlen=3+4 or 4+4) */ + put_unaligned(dst_char, &target[j]); + dst_char = cpu_to_le16(*(wchar_to + 1)); + j++; + put_unaligned(dst_char, &target[j]); + } else if (charlen >= 5) { + /* 5-6bytes UTF-8 to 6bytes UTF-16 */ + put_unaligned(dst_char, &target[j]); + dst_char = cpu_to_le16(*(wchar_to + 1)); + j++; + put_unaligned(dst_char, &target[j]); + dst_char = cpu_to_le16(*(wchar_to + 2)); + j++; + put_unaligned(dst_char, &target[j]); } + continue; + +unknown: + dst_char = cpu_to_le16(0x003f); + charlen = 1; } + +ctoUTF16: /* * character may take more than one byte in the source string, * but will take exactly two bytes in the target string @@ -456,6 +545,7 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen, ctoUTF16_out: put_unaligned(0, &target[j]); /* Null terminate target unicode string */ + kfree(wchar_to); return j; } diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index d72fe37f5420..0a9fb6b53126 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -469,6 +469,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_puts(s, ",nouser_xattr"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR) seq_puts(s, ",mapchars"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SFM_CHR) + seq_puts(s, ",mapposix"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) seq_puts(s, ",sfu"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) @@ -607,7 +609,7 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) p = s = full_path; do { - struct inode *dir = 
dentry->d_inode; + struct inode *dir = d_inode(dentry); struct dentry *child; if (!dir) { @@ -906,8 +908,6 @@ const struct inode_operations cifs_symlink_inode_ops = { }; const struct file_operations cifs_file_ops = { - .read = new_sync_read, - .write = new_sync_write, .read_iter = cifs_loose_read_iter, .write_iter = cifs_file_write_iter, .open = cifs_open, @@ -926,8 +926,6 @@ const struct file_operations cifs_file_ops = { }; const struct file_operations cifs_file_strict_ops = { - .read = new_sync_read, - .write = new_sync_write, .read_iter = cifs_strict_readv, .write_iter = cifs_strict_writev, .open = cifs_open, @@ -947,8 +945,6 @@ const struct file_operations cifs_file_strict_ops = { const struct file_operations cifs_file_direct_ops = { /* BB reevaluate whether they can be done with directio, no cache */ - .read = new_sync_read, - .write = new_sync_write, .read_iter = cifs_user_readv, .write_iter = cifs_user_writev, .open = cifs_open, @@ -967,8 +963,6 @@ const struct file_operations cifs_file_direct_ops = { }; const struct file_operations cifs_file_nobrl_ops = { - .read = new_sync_read, - .write = new_sync_write, .read_iter = cifs_loose_read_iter, .write_iter = cifs_file_write_iter, .open = cifs_open, @@ -986,8 +980,6 @@ const struct file_operations cifs_file_nobrl_ops = { }; const struct file_operations cifs_file_strict_nobrl_ops = { - .read = new_sync_read, - .write = new_sync_write, .read_iter = cifs_strict_readv, .write_iter = cifs_strict_writev, .open = cifs_open, @@ -1006,8 +998,6 @@ const struct file_operations cifs_file_strict_nobrl_ops = { const struct file_operations cifs_file_direct_nobrl_ops = { /* BB reevaluate whether they can be done with directio, no cache */ - .read = new_sync_read, - .write = new_sync_write, .read_iter = cifs_user_readv, .write_iter = cifs_user_writev, .open = cifs_open, diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 252f5c15806b..a782b22904e4 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -120,7 +120,7 @@ extern struct vfsmount *cifs_dfs_d_automount(struct path *path); #endif /* Functions related to symlinks */ -extern void *cifs_follow_link(struct dentry *direntry, struct nameidata *nd); +extern const char *cifs_follow_link(struct dentry *direntry, void **cookie); extern int cifs_readlink(struct dentry *direntry, char __user *buffer, int buflen); extern int cifs_symlink(struct inode *inode, struct dentry *direntry, diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 22b289a3b1c4..b406a32deb1f 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -171,6 +171,10 @@ enum smb_version { Smb_21, Smb_30, Smb_302, +#ifdef CONFIG_CIFS_SMB311 + Smb_311, +#endif /* SMB311 */ + Smb_version_err }; struct mid_q_entry; @@ -368,6 +372,8 @@ struct smb_version_operations { void (*new_lease_key)(struct cifs_fid *); int (*generate_signingkey)(struct cifs_ses *); int (*calc_signature)(struct smb_rqst *, struct TCP_Server_Info *); + int (*set_integrity)(const unsigned int, struct cifs_tcon *tcon, + struct cifsFileInfo *src_file); int (*query_mf_symlink)(unsigned int, struct cifs_tcon *, struct cifs_sb_info *, const unsigned char *, char *, unsigned int *); @@ -386,6 +392,9 @@ struct smb_version_operations { int (*clone_range)(const unsigned int, struct cifsFileInfo *src_file, struct cifsFileInfo *target_file, u64 src_off, u64 len, u64 dest_off); + int (*duplicate_extents)(const unsigned int, struct cifsFileInfo *src, + struct cifsFileInfo *target_file, u64 src_off, u64 len, + u64 dest_off); int (*validate_negotiate)(const unsigned int, struct 
cifs_tcon *); ssize_t (*query_all_EAs)(const unsigned int, struct cifs_tcon *, const unsigned char *, const unsigned char *, char *, @@ -1617,4 +1626,8 @@ extern struct smb_version_values smb30_values; #define SMB302_VERSION_STRING "3.02" /*extern struct smb_version_operations smb302_operations;*/ /* not needed yet */ extern struct smb_version_values smb302_values; +#define SMB311_VERSION_STRING "3.1.1" +#define ALT_SMB311_VERSION_STRING "3.11" +extern struct smb_version_operations smb311_operations; +extern struct smb_version_values smb311_values; #endif /* _CIFS_GLOB_H */ diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index 5f9822ac0245..47b030da0781 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -2255,6 +2255,8 @@ typedef struct { /* List of FileSystemAttributes - see 2.5.1 of MS-FSCC */ +#define FILE_SUPPORTS_SPARSE_VDL 0x10000000 /* faster nonsparse extend */ +#define FILE_SUPPORTS_BLOCK_REFCOUNTING 0x08000000 /* allow ioctl dup extents */ #define FILE_SUPPORT_INTEGRITY_STREAMS 0x04000000 #define FILE_SUPPORTS_USN_JOURNAL 0x02000000 #define FILE_SUPPORTS_OPEN_BY_FILE_ID 0x01000000 @@ -2310,6 +2312,16 @@ typedef struct { /* data block encoding of response to level 263 QPathInfo */ char FileName[1]; } __attribute__((packed)) FILE_ALL_INFO; /* level 0x107 QPathInfo */ +typedef struct { + __le64 AllocationSize; + __le64 EndOfFile; /* size ie offset to first free byte in file */ + __le32 NumberOfLinks; /* hard links */ + __u8 DeletePending; + __u8 Directory; + __u16 Pad; +} __attribute__((packed)) FILE_STANDARD_INFO; /* level 0x102 QPathInfo */ + + /* defines for enumerating possible values of the Unix type field below */ #define UNIX_FILE 0 #define UNIX_DIR 1 diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index c31ce98c1704..c63fd1dde25b 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -361,11 +361,11 @@ extern int CIFSUnixCreateHardLink(const unsigned int xid, extern int CIFSUnixCreateSymLink(const unsigned int xid, struct cifs_tcon *tcon, const char *fromName, const char *toName, - const struct nls_table *nls_codepage); + const struct nls_table *nls_codepage, int remap); extern int CIFSSMBUnixQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon, const unsigned char *searchName, char **syminfo, - const struct nls_table *nls_codepage); + const struct nls_table *nls_codepage, int remap); extern int CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid, char **symlinkinfo, const struct nls_table *nls_codepage); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index fa13d5e79f64..672ef35c9f73 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -625,9 +625,8 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses) server->negflavor = CIFS_NEGFLAVOR_UNENCAP; memcpy(ses->server->cryptkey, pSMBr->u.EncryptionKey, CIFS_CRYPTO_KEY_SIZE); - } else if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC || - server->capabilities & CAP_EXTENDED_SECURITY) && - (pSMBr->EncryptionKeyLength == 0)) { + } else if (pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC || + server->capabilities & CAP_EXTENDED_SECURITY) { server->negflavor = CIFS_NEGFLAVOR_EXTENDED; rc = decode_ext_sec_blob(ses, pSMBr); } else if (server->sec_mode & SECMODE_PW_ENCRYPT) { @@ -1898,7 +1897,7 @@ static void cifs_writev_requeue(struct cifs_writedata *wdata) { int i, rc = 0; - struct inode *inode = wdata->cfile->dentry->d_inode; + struct inode *inode = d_inode(wdata->cfile->dentry); struct TCP_Server_Info *server; unsigned int rest_len; @@ -1981,7 +1980,7 @@ 
cifs_writev_complete(struct work_struct *work) { struct cifs_writedata *wdata = container_of(work, struct cifs_writedata, work); - struct inode *inode = wdata->cfile->dentry->d_inode; + struct inode *inode = d_inode(wdata->cfile->dentry); int i = 0; if (wdata->result == 0) { @@ -2784,7 +2783,7 @@ copyRetry: int CIFSUnixCreateSymLink(const unsigned int xid, struct cifs_tcon *tcon, const char *fromName, const char *toName, - const struct nls_table *nls_codepage) + const struct nls_table *nls_codepage, int remap) { TRANSACTION2_SPI_REQ *pSMB = NULL; TRANSACTION2_SPI_RSP *pSMBr = NULL; @@ -2804,9 +2803,9 @@ createSymLinkRetry: if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { name_len = - cifs_strtoUTF16((__le16 *) pSMB->FileName, fromName, - /* find define for this maxpathcomponent */ - PATH_MAX, nls_codepage); + cifsConvertToUTF16((__le16 *) pSMB->FileName, fromName, + /* find define for this maxpathcomponent */ + PATH_MAX, nls_codepage, remap); name_len++; /* trailing null */ name_len *= 2; @@ -2828,9 +2827,9 @@ createSymLinkRetry: data_offset = (char *) (&pSMB->hdr.Protocol) + offset; if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { name_len_target = - cifs_strtoUTF16((__le16 *) data_offset, toName, PATH_MAX - /* find define for this maxpathcomponent */ - , nls_codepage); + cifsConvertToUTF16((__le16 *) data_offset, toName, + /* find define for this maxpathcomponent */ + PATH_MAX, nls_codepage, remap); name_len_target++; /* trailing null */ name_len_target *= 2; } else { /* BB improve the check for buffer overruns BB */ @@ -3034,7 +3033,7 @@ winCreateHardLinkRetry: int CIFSSMBUnixQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon, const unsigned char *searchName, char **symlinkinfo, - const struct nls_table *nls_codepage) + const struct nls_table *nls_codepage, int remap) { /* SMB_QUERY_FILE_UNIX_LINK */ TRANSACTION2_QPI_REQ *pSMB = NULL; @@ -3055,8 +3054,9 @@ querySymLinkRetry: if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { name_len = - cifs_strtoUTF16((__le16 *) pSMB->FileName, searchName, - PATH_MAX, nls_codepage); + cifsConvertToUTF16((__le16 *) pSMB->FileName, + searchName, PATH_MAX, nls_codepage, + remap); name_len++; /* trailing null */ name_len *= 2; } else { /* BB improve the check for buffer overruns BB */ @@ -4917,7 +4917,7 @@ getDFSRetry: strncpy(pSMB->RequestFileName, search_name, name_len); } - if (ses->server && ses->server->sign) + if (ses->server->sign) pSMB->hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE; pSMB->hdr.Uid = ses->Suid; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 480cf9c81d50..773f4dc77630 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -280,6 +280,11 @@ static const match_table_t cifs_smb_version_tokens = { { Smb_21, SMB21_VERSION_STRING }, { Smb_30, SMB30_VERSION_STRING }, { Smb_302, SMB302_VERSION_STRING }, +#ifdef CONFIG_CIFS_SMB311 + { Smb_311, SMB311_VERSION_STRING }, + { Smb_311, ALT_SMB311_VERSION_STRING }, +#endif /* SMB311 */ + { Smb_version_err, NULL } }; static int ip_connect(struct TCP_Server_Info *server); @@ -386,6 +391,7 @@ cifs_reconnect(struct TCP_Server_Info *server) rc = generic_ip_connect(server); if (rc) { cifs_dbg(FYI, "reconnect error %d\n", rc); + mutex_unlock(&server->srv_mutex); msleep(3000); } else { atomic_inc(&tcpSesReconnectCount); @@ -393,8 +399,8 @@ cifs_reconnect(struct TCP_Server_Info *server) if (server->tcpStatus != CifsExiting) server->tcpStatus = CifsNeedNegotiate; spin_unlock(&GlobalMid_Lock); + mutex_unlock(&server->srv_mutex); } - mutex_unlock(&server->srv_mutex); } while (server->tcpStatus == CifsNeedReconnect); 
return rc; @@ -773,8 +779,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server) length = atomic_dec_return(&tcpSesAllocCount); if (length > 0) - mempool_resize(cifs_req_poolp, length + cifs_min_rcv, - GFP_KERNEL); + mempool_resize(cifs_req_poolp, length + cifs_min_rcv); } static int @@ -848,8 +853,7 @@ cifs_demultiplex_thread(void *p) length = atomic_inc_return(&tcpSesAllocCount); if (length > 1) - mempool_resize(cifs_req_poolp, length + cifs_min_rcv, - GFP_KERNEL); + mempool_resize(cifs_req_poolp, length + cifs_min_rcv); set_freezable(); while (server->tcpStatus != CifsExiting) { @@ -1134,6 +1138,12 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol) vol->ops = &smb30_operations; /* currently identical with 3.0 */ vol->vals = &smb302_values; break; +#ifdef CONFIG_CIFS_SMB311 + case Smb_311: + vol->ops = &smb311_operations; + vol->vals = &smb311_values; + break; +#endif /* SMB311 */ #endif default: cifs_dbg(VFS, "Unknown vers= option specified: %s\n", value); @@ -3462,6 +3472,8 @@ try_mount_again: else if (ses) cifs_put_smb_ses(ses); + cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_POSIX_PATHS; + free_xid(xid); } #endif diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index b72bc29cba23..c3eb998a99bd 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -620,8 +620,7 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, } rc = CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_remap(cifs_sb)); if (rc) goto mknod_out; @@ -745,13 +744,13 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, goto lookup_out; } - if (direntry->d_inode != NULL) { + if (d_really_is_positive(direntry)) { cifs_dbg(FYI, "non-NULL inode in lookup\n"); } else { cifs_dbg(FYI, "NULL inode in lookup\n"); } cifs_dbg(FYI, "Full path: %s inode = 0x%p\n", - full_path, direntry->d_inode); + full_path, d_inode(direntry)); if (pTcon->unix_ext) { rc = cifs_get_inode_info_unix(&newInode, full_path, @@ -792,7 +791,7 @@ cifs_d_revalidate(struct dentry *direntry, unsigned int flags) if (flags & LOOKUP_RCU) return -ECHILD; - if (direntry->d_inode) { + if (d_really_is_positive(direntry)) { if (cifs_revalidate_dentry(direntry)) return 0; else { @@ -803,7 +802,7 @@ cifs_d_revalidate(struct dentry *direntry, unsigned int flags) * attributes will have been updated by * cifs_revalidate_dentry(). 
*/ - if (IS_AUTOMOUNT(direntry->d_inode) && + if (IS_AUTOMOUNT(d_inode(direntry)) && !(direntry->d_flags & DCACHE_NEED_AUTOMOUNT)) { spin_lock(&direntry->d_lock); direntry->d_flags |= DCACHE_NEED_AUTOMOUNT; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index ca30c391a894..3f50cee79df9 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -140,8 +140,7 @@ int cifs_posix_open(char *full_path, struct inode **pinode, posix_flags = cifs_posix_convert_flags(f_flags); rc = CIFSPOSIXCreate(xid, tcon, posix_flags, mode, pnetfid, presp_data, poplock, full_path, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_remap(cifs_sb)); cifs_put_tlink(tlink); if (rc) @@ -273,7 +272,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, struct tcon_link *tlink, __u32 oplock) { struct dentry *dentry = file->f_path.dentry; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct cifsInodeInfo *cinode = CIFS_I(inode); struct cifsFileInfo *cfile; struct cifs_fid_locks *fdlocks; @@ -357,7 +356,7 @@ cifsFileInfo_get(struct cifsFileInfo *cifs_file) */ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) { - struct inode *inode = cifs_file->dentry->d_inode; + struct inode *inode = d_inode(cifs_file->dentry); struct cifs_tcon *tcon = tlink_tcon(cifs_file->tlink); struct TCP_Server_Info *server = tcon->ses->server; struct cifsInodeInfo *cifsi = CIFS_I(inode); @@ -386,7 +385,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) if (list_empty(&cifsi->openFileList)) { cifs_dbg(FYI, "closing last open instance for inode %p\n", - cifs_file->dentry->d_inode); + d_inode(cifs_file->dentry)); /* * In strict cache mode we need invalidate mapping on the last * close because it may cause a error when we open this file @@ -572,7 +571,7 @@ static int cifs_relock_file(struct cifsFileInfo *cfile) { struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb); - struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); int rc = 0; @@ -620,7 +619,7 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush) return rc; } - inode = cfile->dentry->d_inode; + inode = d_inode(cfile->dentry); cifs_sb = CIFS_SB(inode->i_sb); tcon = tlink_tcon(cfile->tlink); server = tcon->ses->server; @@ -874,7 +873,7 @@ cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, __u64 length, { bool rc = false; struct cifs_fid_locks *cur; - struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); list_for_each_entry(cur, &cinode->llist, llist) { rc = cifs_find_fid_lock_conflict(cur, offset, length, type, @@ -899,7 +898,7 @@ cifs_lock_test(struct cifsFileInfo *cfile, __u64 offset, __u64 length, { int rc = 0; struct cifsLockInfo *conf_lock; - struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server; bool exist; @@ -927,7 +926,7 @@ cifs_lock_test(struct cifsFileInfo *cfile, __u64 offset, __u64 length, static void cifs_lock_add(struct cifsFileInfo *cfile, struct cifsLockInfo *lock) { - struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); down_write(&cinode->lock_sem); list_add_tail(&lock->llist, &cfile->llist->locks); up_write(&cinode->lock_sem); @@ -944,7 +943,7 @@ 
cifs_lock_add_if(struct cifsFileInfo *cfile, struct cifsLockInfo *lock, bool wait) { struct cifsLockInfo *conf_lock; - struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); bool exist; int rc = 0; @@ -1125,7 +1124,7 @@ struct lock_to_push { static int cifs_push_posix_locks(struct cifsFileInfo *cfile) { - struct inode *inode = cfile->dentry->d_inode; + struct inode *inode = d_inode(cfile->dentry); struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); struct file_lock *flock; struct file_lock_context *flctx = inode->i_flctx; @@ -1214,7 +1213,7 @@ static int cifs_push_locks(struct cifsFileInfo *cfile) { struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb); - struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); int rc = 0; @@ -1382,7 +1381,7 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, unsigned int max_num, num, max_buf; LOCKING_ANDX_RANGE *buf, *cur; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); - struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); struct cifsLockInfo *li, *tmp; __u64 length = 1 + flock->fl_end - flock->fl_start; struct list_head tmp_llist; @@ -1488,7 +1487,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); struct TCP_Server_Info *server = tcon->ses->server; - struct inode *inode = cfile->dentry->d_inode; + struct inode *inode = d_inode(cfile->dentry); if (posix_lck) { int posix_lock_type; @@ -1553,8 +1552,8 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, rc = server->ops->mand_unlock_range(cfile, flock, xid); out: - if (flock->fl_flags & FL_POSIX) - posix_lock_file_wait(file, flock); + if (flock->fl_flags & FL_POSIX && !rc) + rc = posix_lock_file_wait(file, flock); return rc; } @@ -1643,7 +1642,7 @@ cifs_write(struct cifsFileInfo *open_file, __u32 pid, const char *write_data, struct TCP_Server_Info *server; unsigned int xid; struct dentry *dentry = open_file->dentry; - struct cifsInodeInfo *cifsi = CIFS_I(dentry->d_inode); + struct cifsInodeInfo *cifsi = CIFS_I(d_inode(dentry)); struct cifs_io_parms io_parms; cifs_sb = CIFS_SB(dentry->d_sb); @@ -1676,7 +1675,7 @@ cifs_write(struct cifsFileInfo *open_file, __u32 pid, const char *write_data, break; } - len = min(server->ops->wp_retry_size(dentry->d_inode), + len = min(server->ops->wp_retry_size(d_inode(dentry)), (unsigned int)write_size - total_written); /* iov[0] is reserved for smb header */ iov[1].iov_base = (char *)write_data + total_written; @@ -1696,9 +1695,9 @@ cifs_write(struct cifsFileInfo *open_file, __u32 pid, const char *write_data, return rc; } } else { - spin_lock(&dentry->d_inode->i_lock); + spin_lock(&d_inode(dentry)->i_lock); cifs_update_eof(cifsi, *offset, bytes_written); - spin_unlock(&dentry->d_inode->i_lock); + spin_unlock(&d_inode(dentry)->i_lock); *offset += bytes_written; } } @@ -1706,12 +1705,12 @@ cifs_write(struct cifsFileInfo *open_file, __u32 pid, const char *write_data, cifs_stats_bytes_written(tcon, total_written); if (total_written > 0) { - spin_lock(&dentry->d_inode->i_lock); - if (*offset > dentry->d_inode->i_size) - i_size_write(dentry->d_inode, *offset); - spin_unlock(&dentry->d_inode->i_lock); + 
spin_lock(&d_inode(dentry)->i_lock); + if (*offset > d_inode(dentry)->i_size) + i_size_write(d_inode(dentry), *offset); + spin_unlock(&d_inode(dentry)->i_lock); } - mark_inode_dirty_sync(dentry->d_inode); + mark_inode_dirty_sync(d_inode(dentry)); free_xid(xid); return total_written; } @@ -2406,7 +2405,7 @@ cifs_uncached_writev_complete(struct work_struct *work) { struct cifs_writedata *wdata = container_of(work, struct cifs_writedata, work); - struct inode *inode = wdata->cfile->dentry->d_inode; + struct inode *inode = d_inode(wdata->cfile->dentry); struct cifsInodeInfo *cifsi = CIFS_I(inode); spin_lock(&inode->i_lock); @@ -2560,10 +2559,9 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, return rc; } -static ssize_t -cifs_iovec_write(struct file *file, struct iov_iter *from, loff_t *poffset) +ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from) { - size_t len; + struct file *file = iocb->ki_filp; ssize_t total_written = 0; struct cifsFileInfo *open_file; struct cifs_tcon *tcon; @@ -2573,15 +2571,15 @@ cifs_iovec_write(struct file *file, struct iov_iter *from, loff_t *poffset) struct iov_iter saved_from; int rc; - len = iov_iter_count(from); - rc = generic_write_checks(file, poffset, &len, 0); - if (rc) - return rc; - - if (!len) - return 0; + /* + * BB - optimize the way when signing is disabled. We can drop this + * extra memory-to-memory copying and use iovec buffers for constructing + * write request. + */ - iov_iter_truncate(from, len); + rc = generic_write_checks(iocb, from); + if (rc <= 0) + return rc; INIT_LIST_HEAD(&wdata_list); cifs_sb = CIFS_FILE_SB(file); @@ -2593,8 +2591,8 @@ cifs_iovec_write(struct file *file, struct iov_iter *from, loff_t *poffset) memcpy(&saved_from, from, sizeof(struct iov_iter)); - rc = cifs_write_from_iter(*poffset, len, from, open_file, cifs_sb, - &wdata_list); + rc = cifs_write_from_iter(iocb->ki_pos, iov_iter_count(from), from, + open_file, cifs_sb, &wdata_list); /* * If at least one write was successfully sent, then discard any rc @@ -2633,7 +2631,7 @@ restart_loop: memcpy(&tmp_from, &saved_from, sizeof(struct iov_iter)); iov_iter_advance(&tmp_from, - wdata->offset - *poffset); + wdata->offset - iocb->ki_pos); rc = cifs_write_from_iter(wdata->offset, wdata->bytes, &tmp_from, @@ -2650,34 +2648,13 @@ restart_loop: kref_put(&wdata->refcount, cifs_uncached_writedata_release); } - if (total_written > 0) - *poffset += total_written; + if (unlikely(!total_written)) + return rc; + iocb->ki_pos += total_written; + set_bit(CIFS_INO_INVALID_MAPPING, &CIFS_I(file_inode(file))->flags); cifs_stats_bytes_written(tcon, total_written); - return total_written ? total_written : (ssize_t)rc; -} - -ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from) -{ - ssize_t written; - struct inode *inode; - loff_t pos = iocb->ki_pos; - - inode = file_inode(iocb->ki_filp); - - /* - * BB - optimize the way when signing is disabled. We can drop this - * extra memory-to-memory copying and use iovec buffers for constructing - * write request. 
- */ - - written = cifs_iovec_write(iocb->ki_filp, from, &pos); - if (written > 0) { - set_bit(CIFS_INO_INVALID_MAPPING, &CIFS_I(inode)->flags); - iocb->ki_pos = pos; - } - - return written; + return total_written; } static ssize_t @@ -2688,8 +2665,7 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = file->f_mapping->host; struct cifsInodeInfo *cinode = CIFS_I(inode); struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server; - ssize_t rc = -EACCES; - loff_t lock_pos = iocb->ki_pos; + ssize_t rc; /* * We need to hold the sem to be sure nobody modifies lock list @@ -2697,23 +2673,24 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from) */ down_read(&cinode->lock_sem); mutex_lock(&inode->i_mutex); - if (file->f_flags & O_APPEND) - lock_pos = i_size_read(inode); - if (!cifs_find_lock_conflict(cfile, lock_pos, iov_iter_count(from), + + rc = generic_write_checks(iocb, from); + if (rc <= 0) + goto out; + + if (!cifs_find_lock_conflict(cfile, iocb->ki_pos, iov_iter_count(from), server->vals->exclusive_lock_type, NULL, - CIFS_WRITE_OP)) { + CIFS_WRITE_OP)) rc = __generic_file_write_iter(iocb, from); - mutex_unlock(&inode->i_mutex); - - if (rc > 0) { - ssize_t err; + else + rc = -EACCES; +out: + mutex_unlock(&inode->i_mutex); - err = generic_write_sync(file, iocb->ki_pos - rc, rc); - if (err < 0) - rc = err; - } - } else { - mutex_unlock(&inode->i_mutex); + if (rc > 0) { + ssize_t err = generic_write_sync(file, iocb->ki_pos - rc, rc); + if (err < 0) + rc = err; } up_read(&cinode->lock_sem); return rc; @@ -3816,7 +3793,7 @@ void cifs_oplock_break(struct work_struct *work) { struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo, oplock_break); - struct inode *inode = cfile->dentry->d_inode; + struct inode *inode = d_inode(cfile->dentry); struct cifsInodeInfo *cinode = CIFS_I(inode); struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); struct TCP_Server_Info *server = tcon->ses->server; @@ -3877,8 +3854,7 @@ void cifs_oplock_break(struct work_struct *work) * Direct IO is not yet supported in the cached mode. 
*/ static ssize_t -cifs_direct_io(int rw, struct kiocb *iocb, struct iov_iter *iter, - loff_t pos) +cifs_direct_io(struct kiocb *iocb, struct iov_iter *iter, loff_t pos) { /* * FIXME diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 3e126d7bb2ea..f621b44cb800 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -373,8 +373,7 @@ int cifs_get_inode_info_unix(struct inode **pinode, /* could have done a find first instead but this returns more info */ rc = CIFSSMBUnixQPathInfo(xid, tcon, full_path, &find_data, - cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_sb->local_nls, cifs_remap(cifs_sb)); cifs_put_tlink(tlink); if (!rc) { @@ -402,9 +401,25 @@ int cifs_get_inode_info_unix(struct inode **pinode, rc = -ENOMEM; } else { /* we already have inode, update it */ + + /* if uniqueid is different, return error */ + if (unlikely(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM && + CIFS_I(*pinode)->uniqueid != fattr.cf_uniqueid)) { + rc = -ESTALE; + goto cgiiu_exit; + } + + /* if filetype is different, return error */ + if (unlikely(((*pinode)->i_mode & S_IFMT) != + (fattr.cf_mode & S_IFMT))) { + rc = -ESTALE; + goto cgiiu_exit; + } + cifs_fattr_to_inode(*pinode, &fattr); } +cgiiu_exit: return rc; } @@ -839,6 +854,15 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, if (!*inode) rc = -ENOMEM; } else { + /* we already have inode, update it */ + + /* if filetype is different, return error */ + if (unlikely(((*inode)->i_mode & S_IFMT) != + (fattr.cf_mode & S_IFMT))) { + rc = -ESTALE; + goto cgii_exit; + } + cifs_fattr_to_inode(*inode, &fattr); } @@ -1067,7 +1091,7 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, int rc; struct cifs_fid fid; struct cifs_open_parms oparms; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct cifsInodeInfo *cifsInode = CIFS_I(inode); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct tcon_link *tlink; @@ -1196,7 +1220,7 @@ cifs_drop_nlink(struct inode *inode) } /* - * If dentry->d_inode is null (usually meaning the cached dentry + * If d_inode(dentry) is null (usually meaning the cached dentry * is a negative dentry) then we would attempt a standard SMB delete, but * if that fails we cannot attempt the fallback mechanisms on EACCES * but will return the EACCES to the caller. 
Note that the VFS does not call @@ -1207,7 +1231,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry) int rc = 0; unsigned int xid; char *full_path = NULL; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct cifsInodeInfo *cifs_inode; struct super_block *sb = dir->i_sb; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); @@ -1551,13 +1575,13 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry) cifs_put_tlink(tlink); if (!rc) { - spin_lock(&direntry->d_inode->i_lock); - i_size_write(direntry->d_inode, 0); - clear_nlink(direntry->d_inode); - spin_unlock(&direntry->d_inode->i_lock); + spin_lock(&d_inode(direntry)->i_lock); + i_size_write(d_inode(direntry), 0); + clear_nlink(d_inode(direntry)); + spin_unlock(&d_inode(direntry)->i_lock); } - cifsInode = CIFS_I(direntry->d_inode); + cifsInode = CIFS_I(d_inode(direntry)); /* force revalidate to go get info when needed */ cifsInode->time = 0; @@ -1568,7 +1592,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry) */ cifsInode->time = 0; - direntry->d_inode->i_ctime = inode->i_ctime = inode->i_mtime = + d_inode(direntry)->i_ctime = inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb); rmdir_exit: @@ -1727,7 +1751,7 @@ cifs_rename2(struct inode *source_dir, struct dentry *source_dentry, unlink_target: /* Try unlinking the target dentry if it's not negative */ - if (target_dentry->d_inode && (rc == -EACCES || rc == -EEXIST)) { + if (d_really_is_positive(target_dentry) && (rc == -EACCES || rc == -EEXIST)) { if (d_is_dir(target_dentry)) tmprc = cifs_rmdir(target_dir, target_dentry); else @@ -1867,7 +1891,7 @@ int cifs_revalidate_dentry_attr(struct dentry *dentry) { unsigned int xid; int rc = 0; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct super_block *sb = dentry->d_sb; char *full_path = NULL; @@ -1919,7 +1943,7 @@ int cifs_revalidate_file(struct file *filp) int cifs_revalidate_dentry(struct dentry *dentry) { int rc; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); rc = cifs_revalidate_dentry_attr(dentry); if (rc) @@ -1933,7 +1957,7 @@ int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry, { struct cifs_sb_info *cifs_sb = CIFS_SB(dentry->d_sb); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int rc; /* @@ -2110,7 +2134,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) int rc; unsigned int xid; char *full_path = NULL; - struct inode *inode = direntry->d_inode; + struct inode *inode = d_inode(direntry); struct cifsInodeInfo *cifsInode = CIFS_I(inode); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct tcon_link *tlink; @@ -2215,8 +2239,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) pTcon = tlink_tcon(tlink); rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, args, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_remap(cifs_sb)); cifs_put_tlink(tlink); } @@ -2251,7 +2274,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) unsigned int xid; kuid_t uid = INVALID_UID; kgid_t gid = INVALID_GID; - struct inode *inode = direntry->d_inode; + struct inode *inode = d_inode(direntry); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifsInodeInfo *cifsInode = CIFS_I(inode); char *full_path = NULL; @@ -2409,7 +2432,7 @@ cifs_setattr_exit: int cifs_setattr(struct dentry *direntry, struct iattr *attrs) { - struct 
inode *inode = direntry->d_inode; + struct inode *inode = d_inode(direntry); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifs_tcon *pTcon = cifs_sb_master_tcon(cifs_sb); diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 8b7898b7670f..49b8b6e41a18 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -31,12 +31,15 @@ #include "cifsproto.h" #include "cifs_debug.h" #include "cifsfs.h" +#include <linux/btrfs.h> #define CIFS_IOCTL_MAGIC 0xCF #define CIFS_IOC_COPYCHUNK_FILE _IOW(CIFS_IOCTL_MAGIC, 3, int) +#define CIFS_IOC_SET_INTEGRITY _IO(CIFS_IOCTL_MAGIC, 4) static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file, - unsigned long srcfd, u64 off, u64 len, u64 destoff) + unsigned long srcfd, u64 off, u64 len, u64 destoff, + bool dup_extents) { int rc; struct cifsFileInfo *smb_file_target = dst_file->private_data; @@ -109,9 +112,14 @@ static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file, truncate_inode_pages_range(&target_inode->i_data, destoff, PAGE_CACHE_ALIGN(destoff + len)-1); - if (target_tcon->ses->server->ops->clone_range) + if (dup_extents && target_tcon->ses->server->ops->duplicate_extents) + rc = target_tcon->ses->server->ops->duplicate_extents(xid, + smb_file_src, smb_file_target, off, len, destoff); + else if (!dup_extents && target_tcon->ses->server->ops->clone_range) rc = target_tcon->ses->server->ops->clone_range(xid, smb_file_src, smb_file_target, off, len, destoff); + else + rc = -EOPNOTSUPP; /* force revalidate of size and timestamps of target file now that target is updated on the server */ @@ -205,7 +213,20 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) } break; case CIFS_IOC_COPYCHUNK_FILE: - rc = cifs_ioctl_clone(xid, filep, arg, 0, 0, 0); + rc = cifs_ioctl_clone(xid, filep, arg, 0, 0, 0, false); + break; + case BTRFS_IOC_CLONE: + rc = cifs_ioctl_clone(xid, filep, arg, 0, 0, 0, true); + break; + case CIFS_IOC_SET_INTEGRITY: + if (pSMBFile == NULL) + break; + tcon = tlink_tcon(pSMBFile->tlink); + if (tcon->ses->server->ops->set_integrity) + rc = tcon->ses->server->ops->set_integrity(xid, + tcon, pSMBFile); + else + rc = -EOPNOTSUPP; break; default: cifs_dbg(FYI, "unsupported ioctl\n"); diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 2ec6037f61c7..e3548f73bdea 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -586,12 +586,12 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode, * if source file is cached (oplocked) revalidate will not go to server * until the file is closed or oplock broken so update nlinks locally */ - if (old_file->d_inode) { - cifsInode = CIFS_I(old_file->d_inode); + if (d_really_is_positive(old_file)) { + cifsInode = CIFS_I(d_inode(old_file)); if (rc == 0) { - spin_lock(&old_file->d_inode->i_lock); - inc_nlink(old_file->d_inode); - spin_unlock(&old_file->d_inode->i_lock); + spin_lock(&d_inode(old_file)->i_lock); + inc_nlink(d_inode(old_file)); + spin_unlock(&d_inode(old_file)->i_lock); /* * parent dir timestamps will update from srv within a @@ -626,10 +626,10 @@ cifs_hl_exit: return rc; } -void * -cifs_follow_link(struct dentry *direntry, struct nameidata *nd) +const char * +cifs_follow_link(struct dentry *direntry, void **cookie) { - struct inode *inode = direntry->d_inode; + struct inode *inode = d_inode(direntry); int rc = -ENOMEM; unsigned int xid; char *full_path = NULL; @@ -643,16 +643,18 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd) tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) { - rc = PTR_ERR(tlink); - tlink = NULL; - goto out; 
+ free_xid(xid); + return ERR_CAST(tlink); } tcon = tlink_tcon(tlink); server = tcon->ses->server; full_path = build_path_from_dentry(direntry); - if (!full_path) - goto out; + if (!full_path) { + free_xid(xid); + cifs_put_tlink(tlink); + return ERR_PTR(-ENOMEM); + } cifs_dbg(FYI, "Full path: %s inode = 0x%p\n", full_path, inode); @@ -670,17 +672,13 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd) &target_path, cifs_sb); kfree(full_path); -out: + free_xid(xid); + cifs_put_tlink(tlink); if (rc != 0) { kfree(target_path); - target_path = ERR_PTR(rc); + return ERR_PTR(rc); } - - free_xid(xid); - if (tlink) - cifs_put_tlink(tlink); - nd_set_link(nd, target_path); - return NULL; + return *cookie = target_path; } int @@ -717,7 +715,8 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname) rc = create_mf_symlink(xid, pTcon, cifs_sb, full_path, symname); else if (pTcon->unix_ext) rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname, - cifs_sb->local_nls); + cifs_sb->local_nls, + cifs_remap(cifs_sb)); /* else rc = CIFSCreateReparseSymLink(xid, pTcon, fromName, toName, cifs_sb_target->local_nls); */ diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 337946355b29..8442b8b8e0be 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -473,7 +473,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) continue; cifs_dbg(FYI, "file id match, oplock break\n"); - pCifsInode = CIFS_I(netfile->dentry->d_inode); + pCifsInode = CIFS_I(d_inode(netfile->dentry)); set_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &pCifsInode->flags); diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index c295338e0a98..b1eede3678a9 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -78,7 +78,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name, { struct dentry *dentry, *alias; struct inode *inode; - struct super_block *sb = parent->d_inode->i_sb; + struct super_block *sb = d_inode(parent)->i_sb; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); cifs_dbg(FYI, "%s: for %s\n", __func__, name->name); @@ -88,8 +88,10 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name, return; if (dentry) { - inode = dentry->d_inode; + inode = d_inode(dentry); if (inode) { + if (d_mountpoint(dentry)) + goto out; /* * If we're generating inode numbers, then we don't * want to clobber the existing one with the one that diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index d2979036a4c7..fc537c29044e 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -722,7 +722,7 @@ cifs_open_file(const unsigned int xid, struct cifs_open_parms *oparms, static void cifs_set_fid(struct cifsFileInfo *cfile, struct cifs_fid *fid, __u32 oplock) { - struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); cfile->fid.netfid = fid->netfid; cifs_set_oplock_level(cinode, oplock); cinode->can_cache_brlcks = CIFS_CACHE_WRITE(cinode); @@ -960,7 +960,8 @@ cifs_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, /* Check for unix extensions */ if (cap_unix(tcon->ses)) { rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, target_path, - cifs_sb->local_nls); + cifs_sb->local_nls, + cifs_remap(cifs_sb)); if (rc == -EREMOTE) rc = cifs_unix_dfs_readlink(xid, tcon, full_path, target_path, diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index 7198eac5dddd..2ab297dae5a7 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -95,7 +95,7 @@ smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, unsigned int 
max_num, num = 0, max_buf; struct smb2_lock_element *buf, *cur; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); - struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); struct cifsLockInfo *li, *tmp; __u64 length = 1 + flock->fl_end - flock->fl_start; struct list_head tmp_llist; @@ -231,7 +231,7 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile) unsigned int xid; unsigned int max_num, max_buf; struct smb2_lock_element *buf; - struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); struct cifs_fid_locks *fdlocks; xid = get_xid(); diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 22dfdf17d065..1c5907019045 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -453,7 +453,7 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp, list_for_each(tmp, &tcon->openFileList) { cfile = list_entry(tmp, struct cifsFileInfo, tlist); - cinode = CIFS_I(cfile->dentry->d_inode); + cinode = CIFS_I(d_inode(cfile->dentry)); if (memcmp(cinode->lease_key, rsp->LeaseKey, SMB2_LEASE_KEY_SIZE)) @@ -590,7 +590,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) continue; cifs_dbg(FYI, "file id match, oplock break\n"); - cinode = CIFS_I(cfile->dentry->d_inode); + cinode = CIFS_I(d_inode(cfile->dentry)); if (!CIFS_CACHE_WRITE(cinode) && rsp->OplockLevel == SMB2_OPLOCK_LEVEL_NONE) diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index eab05e1aa587..df91bcf56d67 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -524,7 +524,7 @@ smb2_print_stats(struct seq_file *m, struct cifs_tcon *tcon) static void smb2_set_fid(struct cifsFileInfo *cfile, struct cifs_fid *fid, __u32 oplock) { - struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); + struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry)); struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server; cfile->fid.persistent_fid = fid->persistent_fid; @@ -793,7 +793,7 @@ smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon, * If extending file more than one page make sparse. 
Many Linux fs * make files sparse by default when extending via ftruncate */ - inode = cfile->dentry->d_inode; + inode = d_inode(cfile->dentry); if (!set_alloc && (size > inode->i_size + 8192)) { __u8 set_sparse = 1; @@ -806,6 +806,53 @@ smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon, cfile->fid.volatile_fid, cfile->pid, &eof, false); } +#ifdef CONFIG_CIFS_SMB311 +static int +smb2_duplicate_extents(const unsigned int xid, + struct cifsFileInfo *srcfile, + struct cifsFileInfo *trgtfile, u64 src_off, + u64 len, u64 dest_off) +{ + int rc; + unsigned int ret_data_len; + char *retbuf = NULL; + struct duplicate_extents_to_file dup_ext_buf; + struct cifs_tcon *tcon = tlink_tcon(trgtfile->tlink); + + /* server filesystems advertise duplicate extent support with this flag */ + if ((le32_to_cpu(tcon->fsAttrInfo.Attributes) & + FILE_SUPPORTS_BLOCK_REFCOUNTING) == 0) + return -EOPNOTSUPP; + + dup_ext_buf.VolatileFileHandle = srcfile->fid.volatile_fid; + dup_ext_buf.PersistentFileHandle = srcfile->fid.persistent_fid; + dup_ext_buf.SourceFileOffset = cpu_to_le64(src_off); + dup_ext_buf.TargetFileOffset = cpu_to_le64(dest_off); + dup_ext_buf.ByteCount = cpu_to_le64(len); + cifs_dbg(FYI, "duplicate extents: src off %lld dst off %lld len %lld\n", + src_off, dest_off, len); + + rc = smb2_set_file_size(xid, tcon, trgtfile, dest_off + len, false); + if (rc) + goto duplicate_extents_out; + + rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid, + trgtfile->fid.volatile_fid, + FSCTL_DUPLICATE_EXTENTS_TO_FILE, + true /* is_fsctl */, (char *)&dup_ext_buf, + sizeof(struct duplicate_extents_to_file), + (char **)&retbuf, + &ret_data_len); + + if (ret_data_len > 0) + cifs_dbg(FYI, "non-zero response length in duplicate extents\n"); + +duplicate_extents_out: + return rc; +} +#endif /* CONFIG_CIFS_SMB311 */ + + static int smb2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, struct cifsFileInfo *cfile) @@ -815,6 +862,28 @@ smb2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, } static int +smb3_set_integrity(const unsigned int xid, struct cifs_tcon *tcon, + struct cifsFileInfo *cfile) +{ + struct fsctl_set_integrity_information_req integr_info; + char *retbuf = NULL; + unsigned int ret_data_len; + + integr_info.ChecksumAlgorithm = cpu_to_le16(CHECKSUM_TYPE_UNCHANGED); + integr_info.Flags = 0; + integr_info.Reserved = 0; + + return SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, + FSCTL_SET_INTEGRITY_INFORMATION, + true /* is_fsctl */, (char *)&integr_info, + sizeof(struct fsctl_set_integrity_information_req), + (char **)&retbuf, + &ret_data_len); + +} + +static int smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, const char *path, struct cifs_sb_info *cifs_sb, struct cifs_fid *fid, __u16 search_flags, @@ -1032,7 +1101,7 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, xid = get_xid(); - inode = cfile->dentry->d_inode; + inode = d_inode(cfile->dentry); cifsi = CIFS_I(inode); /* if file not oplocked can't be sure whether asking to extend size */ @@ -1083,7 +1152,7 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, xid = get_xid(); - inode = cfile->dentry->d_inode; + inode = d_inode(cfile->dentry); cifsi = CIFS_I(inode); /* Need to make file sparse, if not already, before freeing range. 
*/ @@ -1115,7 +1184,7 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon, xid = get_xid(); - inode = cfile->dentry->d_inode; + inode = d_inode(cfile->dentry); cifsi = CIFS_I(inode); /* if file not oplocked can't be sure whether asking to extend size */ @@ -1624,6 +1693,7 @@ struct smb_version_operations smb30_operations = { .new_lease_key = smb2_new_lease_key, .generate_signingkey = generate_smb3signingkey, .calc_signature = smb3_calc_signature, + .set_integrity = smb3_set_integrity, .is_read_op = smb21_is_read_op, .set_oplock_level = smb3_set_oplock_level, .create_lease_buf = smb3_create_lease_buf, @@ -1635,6 +1705,94 @@ struct smb_version_operations smb30_operations = { .fallocate = smb3_fallocate, }; +#ifdef CONFIG_CIFS_SMB311 +struct smb_version_operations smb311_operations = { + .compare_fids = smb2_compare_fids, + .setup_request = smb2_setup_request, + .setup_async_request = smb2_setup_async_request, + .check_receive = smb2_check_receive, + .add_credits = smb2_add_credits, + .set_credits = smb2_set_credits, + .get_credits_field = smb2_get_credits_field, + .get_credits = smb2_get_credits, + .wait_mtu_credits = smb2_wait_mtu_credits, + .get_next_mid = smb2_get_next_mid, + .read_data_offset = smb2_read_data_offset, + .read_data_length = smb2_read_data_length, + .map_error = map_smb2_to_linux_error, + .find_mid = smb2_find_mid, + .check_message = smb2_check_message, + .dump_detail = smb2_dump_detail, + .clear_stats = smb2_clear_stats, + .print_stats = smb2_print_stats, + .dump_share_caps = smb2_dump_share_caps, + .is_oplock_break = smb2_is_valid_oplock_break, + .downgrade_oplock = smb2_downgrade_oplock, + .need_neg = smb2_need_neg, + .negotiate = smb2_negotiate, + .negotiate_wsize = smb2_negotiate_wsize, + .negotiate_rsize = smb2_negotiate_rsize, + .sess_setup = SMB2_sess_setup, + .logoff = SMB2_logoff, + .tree_connect = SMB2_tcon, + .tree_disconnect = SMB2_tdis, + .qfs_tcon = smb3_qfs_tcon, + .is_path_accessible = smb2_is_path_accessible, + .can_echo = smb2_can_echo, + .echo = SMB2_echo, + .query_path_info = smb2_query_path_info, + .get_srv_inum = smb2_get_srv_inum, + .query_file_info = smb2_query_file_info, + .set_path_size = smb2_set_path_size, + .set_file_size = smb2_set_file_size, + .set_file_info = smb2_set_file_info, + .set_compression = smb2_set_compression, + .mkdir = smb2_mkdir, + .mkdir_setinfo = smb2_mkdir_setinfo, + .rmdir = smb2_rmdir, + .unlink = smb2_unlink, + .rename = smb2_rename_path, + .create_hardlink = smb2_create_hardlink, + .query_symlink = smb2_query_symlink, + .query_mf_symlink = smb3_query_mf_symlink, + .create_mf_symlink = smb3_create_mf_symlink, + .open = smb2_open_file, + .set_fid = smb2_set_fid, + .close = smb2_close_file, + .flush = smb2_flush_file, + .async_readv = smb2_async_readv, + .async_writev = smb2_async_writev, + .sync_read = smb2_sync_read, + .sync_write = smb2_sync_write, + .query_dir_first = smb2_query_dir_first, + .query_dir_next = smb2_query_dir_next, + .close_dir = smb2_close_dir, + .calc_smb_size = smb2_calc_size, + .is_status_pending = smb2_is_status_pending, + .oplock_response = smb2_oplock_response, + .queryfs = smb2_queryfs, + .mand_lock = smb2_mand_lock, + .mand_unlock_range = smb2_unlock_range, + .push_mand_locks = smb2_push_mandatory_locks, + .get_lease_key = smb2_get_lease_key, + .set_lease_key = smb2_set_lease_key, + .new_lease_key = smb2_new_lease_key, + .generate_signingkey = generate_smb3signingkey, + .calc_signature = smb3_calc_signature, + .set_integrity = smb3_set_integrity, + .is_read_op = 
smb21_is_read_op, + .set_oplock_level = smb3_set_oplock_level, + .create_lease_buf = smb3_create_lease_buf, + .parse_lease_buf = smb3_parse_lease_buf, + .clone_range = smb2_clone_range, + .duplicate_extents = smb2_duplicate_extents, +/* .validate_negotiate = smb3_validate_negotiate, */ /* not used in 3.11 */ + .wp_retry_size = smb2_wp_retry_size, + .dir_needs_close = smb2_dir_needs_close, + .fallocate = smb3_fallocate, +}; +#endif /* CIFS_SMB311 */ + struct smb_version_values smb20_values = { .version_string = SMB20_VERSION_STRING, .protocol_id = SMB20_PROT_ID, @@ -1714,3 +1872,25 @@ struct smb_version_values smb302_values = { .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED, .create_lease_size = sizeof(struct create_lease_v2), }; + +#ifdef CONFIG_CIFS_SMB311 +struct smb_version_values smb311_values = { + .version_string = SMB311_VERSION_STRING, + .protocol_id = SMB311_PROT_ID, + .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU, + .large_lock_type = 0, + .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, + .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, + .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, + .header_size = sizeof(struct smb2_hdr), + .max_header_size = MAX_SMB2_HDR_SIZE, + .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .lock_cmd = SMB2_LOCK, + .cap_unix = 0, + .cap_nt_find = SMB2_NT_FIND, + .cap_large_files = SMB2_LARGE_FILES, + .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED, + .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED, + .create_lease_size = sizeof(struct create_lease_v2), +}; +#endif /* SMB311 */ diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 65cd7a84c8bc..b8b4f08ee094 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -110,7 +110,7 @@ smb2_hdr_assemble(struct smb2_hdr *hdr, __le16 smb2_cmd /* command */ , /* GLOBAL_CAP_LARGE_MTU will only be set if dialect > SMB2.02 */ /* See sections 2.2.4 and 3.2.4.1.5 of MS-SMB2 */ - if ((tcon->ses) && + if ((tcon->ses) && (tcon->ses->server) && (tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) hdr->CreditCharge = cpu_to_le16(1); /* else CreditCharge MBZ */ @@ -304,6 +304,59 @@ small_smb2_init(__le16 smb2_command, struct cifs_tcon *tcon, return rc; } +#ifdef CONFIG_CIFS_SMB311 +/* offset is sizeof smb2_negotiate_req - 4 but rounded up to 8 bytes */ +#define OFFSET_OF_NEG_CONTEXT 0x68 /* sizeof(struct smb2_negotiate_req) - 4 */ + + +#define SMB2_PREAUTH_INTEGRITY_CAPABILITIES cpu_to_le16(1) +#define SMB2_ENCRYPTION_CAPABILITIES cpu_to_le16(2) + +static void +build_preauth_ctxt(struct smb2_preauth_neg_context *pneg_ctxt) +{ + pneg_ctxt->ContextType = SMB2_PREAUTH_INTEGRITY_CAPABILITIES; + pneg_ctxt->DataLength = cpu_to_le16(38); + pneg_ctxt->HashAlgorithmCount = cpu_to_le16(1); + pneg_ctxt->SaltLength = cpu_to_le16(SMB311_SALT_SIZE); + get_random_bytes(pneg_ctxt->Salt, SMB311_SALT_SIZE); + pneg_ctxt->HashAlgorithms = SMB2_PREAUTH_INTEGRITY_SHA512; +} + +static void +build_encrypt_ctxt(struct smb2_encryption_neg_context *pneg_ctxt) +{ + pneg_ctxt->ContextType = SMB2_ENCRYPTION_CAPABILITIES; + pneg_ctxt->DataLength = cpu_to_le16(6); + pneg_ctxt->CipherCount = cpu_to_le16(2); + pneg_ctxt->Ciphers[0] = SMB2_ENCRYPTION_AES128_GCM; + pneg_ctxt->Ciphers[1] = SMB2_ENCRYPTION_AES128_CCM; +} + +static void +assemble_neg_contexts(struct smb2_negotiate_req *req) +{ + + /* +4 is to account for the RFC1001 len field */ + char *pneg_ctxt = (char *)req + OFFSET_OF_NEG_CONTEXT + 4; + + build_preauth_ctxt((struct 
smb2_preauth_neg_context *)pneg_ctxt); + /* Add 2 to size to round to 8 byte boundary */ + pneg_ctxt += 2 + sizeof(struct smb2_preauth_neg_context); + build_encrypt_ctxt((struct smb2_encryption_neg_context *)pneg_ctxt); + req->NegotiateContextOffset = cpu_to_le32(OFFSET_OF_NEG_CONTEXT); + req->NegotiateContextCount = cpu_to_le16(2); + inc_rfc1001_len(req, 4 + sizeof(struct smb2_preauth_neg_context) + 2 + + sizeof(struct smb2_encryption_neg_context)); /* calculate hash */ +} +#else +static void assemble_neg_contexts(struct smb2_negotiate_req *req) +{ + return; +} +#endif /* SMB311 */ + + /* * * SMB2 Worker functions follow: @@ -363,10 +416,12 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) /* ClientGUID must be zero for SMB2.02 dialect */ if (ses->server->vals->protocol_id == SMB20_PROT_ID) memset(req->ClientGUID, 0, SMB2_CLIENT_GUID_SIZE); - else + else { memcpy(req->ClientGUID, server->client_guid, SMB2_CLIENT_GUID_SIZE); - + if (ses->server->vals->protocol_id == SMB311_PROT_ID) + assemble_neg_contexts(req); + } iov[0].iov_base = (char *)req; /* 4 for rfc1002 length field */ iov[0].iov_len = get_rfc1002_length(req) + 4; @@ -393,8 +448,12 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) cifs_dbg(FYI, "negotiated smb3.0 dialect\n"); else if (rsp->DialectRevision == cpu_to_le16(SMB302_PROT_ID)) cifs_dbg(FYI, "negotiated smb3.02 dialect\n"); +#ifdef CONFIG_CIFS_SMB311 + else if (rsp->DialectRevision == cpu_to_le16(SMB311_PROT_ID)) + cifs_dbg(FYI, "negotiated smb3.1.1 dialect\n"); +#endif /* SMB311 */ else { - cifs_dbg(VFS, "Illegal dialect returned by server %d\n", + cifs_dbg(VFS, "Illegal dialect returned by server 0x%x\n", le16_to_cpu(rsp->DialectRevision)); rc = -EIO; goto neg_exit; @@ -572,7 +631,7 @@ ssetup_ntlmssp_authenticate: return rc; req->hdr.SessionId = 0; /* First session, not a reauthenticate */ - req->VcNumber = 0; /* MBZ */ + req->Flags = 0; /* MBZ */ /* to enable echos and oplocks */ req->hdr.CreditRequest = cpu_to_le16(3); diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 70867d54fb8b..451108284a2f 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -136,9 +136,6 @@ struct smb2_transform_hdr { __u64 SessionId; } __packed; -/* Encryption Algorithms */ -#define SMB2_ENCRYPTION_AES128_CCM cpu_to_le16(0x0001) - /* * SMB2 flag definitions */ @@ -191,7 +188,10 @@ struct smb2_negotiate_req { __le16 Reserved; /* MBZ */ __le32 Capabilities; __u8 ClientGUID[SMB2_CLIENT_GUID_SIZE]; - __le64 ClientStartTime; /* MBZ */ + /* In SMB3.02 and earlier next three were MBZ le64 ClientStartTime */ + __le32 NegotiateContextOffset; /* SMB3.1.1 only. MBZ earlier */ + __le16 NegotiateContextCount; /* SMB3.1.1 only. 
MBZ earlier */ + __le16 Reserved2; __le16 Dialects[1]; /* One dialect (vers=) at a time for now */ } __packed; @@ -200,6 +200,7 @@ struct smb2_negotiate_req { #define SMB21_PROT_ID 0x0210 #define SMB30_PROT_ID 0x0300 #define SMB302_PROT_ID 0x0302 +#define SMB311_PROT_ID 0x0311 #define BAD_PROT_ID 0xFFFF /* SecurityMode flags */ @@ -217,12 +218,38 @@ struct smb2_negotiate_req { #define SMB2_NT_FIND 0x00100000 #define SMB2_LARGE_FILES 0x00200000 +#define SMB311_SALT_SIZE 32 +/* Hash Algorithm Types */ +#define SMB2_PREAUTH_INTEGRITY_SHA512 cpu_to_le16(0x0001) + +struct smb2_preauth_neg_context { + __le16 ContextType; /* 1 */ + __le16 DataLength; + __le32 Reserved; + __le16 HashAlgorithmCount; /* 1 */ + __le16 SaltLength; + __le16 HashAlgorithms; /* HashAlgorithms[0] since only one defined */ + __u8 Salt[SMB311_SALT_SIZE]; +} __packed; + +/* Encryption Algorithms Ciphers */ +#define SMB2_ENCRYPTION_AES128_CCM cpu_to_le16(0x0001) +#define SMB2_ENCRYPTION_AES128_GCM cpu_to_le16(0x0002) + +struct smb2_encryption_neg_context { + __le16 ContextType; /* 2 */ + __le16 DataLength; + __le32 Reserved; + __le16 CipherCount; /* AES-128-GCM and AES-128-CCM */ + __le16 Ciphers[2]; /* Ciphers[0] since only one used now */ +} __packed; + struct smb2_negotiate_rsp { struct smb2_hdr hdr; __le16 StructureSize; /* Must be 65 */ __le16 SecurityMode; __le16 DialectRevision; - __le16 Reserved; /* MBZ */ + __le16 NegotiateContextCount; /* Prior to SMB3.1.1 was Reserved & MBZ */ __u8 ServerGUID[16]; __le32 Capabilities; __le32 MaxTransactSize; @@ -232,14 +259,18 @@ struct smb2_negotiate_rsp { __le64 ServerStartTime; __le16 SecurityBufferOffset; __le16 SecurityBufferLength; - __le32 Reserved2; /* may be any value, ignore */ + __le32 NegotiateContextOffset; /* Pre:SMB3.1.1 was reserved/ignored */ __u8 Buffer[1]; /* variable length GSS security buffer */ } __packed; +/* Flags */ +#define SMB2_SESSION_REQ_FLAG_BINDING 0x01 +#define SMB2_SESSION_REQ_FLAG_ENCRYPT_DATA 0x04 + struct smb2_sess_setup_req { struct smb2_hdr hdr; __le16 StructureSize; /* Must be 25 */ - __u8 VcNumber; + __u8 Flags; __u8 SecurityMode; __le32 Capabilities; __le32 Channel; @@ -274,10 +305,13 @@ struct smb2_logoff_rsp { __le16 Reserved; } __packed; +/* Flags/Reserved for SMB3.1.1 */ +#define SMB2_SHAREFLAG_CLUSTER_RECONNECT 0x0001 + struct smb2_tree_connect_req { struct smb2_hdr hdr; __le16 StructureSize; /* Must be 9 */ - __le16 Reserved; + __le16 Reserved; /* Flags in SMB3.1.1 */ __le16 PathOffset; __le16 PathLength; __u8 Buffer[1]; /* variable length */ @@ -587,6 +621,29 @@ struct copychunk_ioctl_rsp { __le32 TotalBytesWritten; } __packed; +struct fsctl_set_integrity_information_req { + __le16 ChecksumAlgorithm; + __le16 Reserved; + __le32 Flags; +} __packed; + +struct fsctl_get_integrity_information_rsp { + __le16 ChecksumAlgorithm; + __le16 Reserved; + __le32 Flags; + __le32 ChecksumChunkSizeInBytes; + __le32 ClusterSizeInBytes; +} __packed; + +/* Integrity ChecksumAlgorithm choices for above */ +#define CHECKSUM_TYPE_NONE 0x0000 +#define CHECKSUM_TYPE_CRC64 0x0002 +#define CHECKSUM_TYPE_UNCHANGED 0xFFFF /* set only */ + +/* Integrity flags for above */ +#define FSCTL_INTEGRITY_FLAG_CHECKSUM_ENFORCEMENT_OFF 0x00000001 + + struct validate_negotiate_info_req { __le32 Capabilities; __u8 Guid[SMB2_CLIENT_GUID_SIZE]; @@ -620,6 +677,14 @@ struct compress_ioctl { __le16 CompressionState; /* See cifspdu.h for possible flag values */ } __packed; +struct duplicate_extents_to_file { + __u64 PersistentFileHandle; /* source file handle, opaque endianness */ 
+ __u64 VolatileFileHandle; + __le64 SourceFileOffset; + __le64 TargetFileOffset; + __le64 ByteCount; /* Bytes to be copied */ +} __packed; + struct smb2_ioctl_req { struct smb2_hdr hdr; __le16 StructureSize; /* Must be 57 */ diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h index 83efa59535be..a639d0dab453 100644 --- a/fs/cifs/smbfsctl.h +++ b/fs/cifs/smbfsctl.h @@ -75,10 +75,13 @@ #define FSCTL_QUERY_SPARING_INFO 0x00090138 /* BB add struct */ #define FSCTL_SET_ZERO_ON_DEALLOC 0x00090194 /* BB add struct */ #define FSCTL_SET_SHORT_NAME_BEHAVIOR 0x000901B4 /* BB add struct */ +#define FSCTL_GET_INTEGRITY_INFORMATION 0x0009027C #define FSCTL_QUERY_ALLOCATED_RANGES 0x000940CF /* BB add struct */ #define FSCTL_SET_DEFECT_MANAGEMENT 0x00098134 /* BB add struct */ #define FSCTL_FILE_LEVEL_TRIM 0x00098208 /* BB add struct */ +#define FSCTL_DUPLICATE_EXTENTS_TO_FILE 0x00098344 #define FSCTL_SIS_LINK_FILES 0x0009C104 +#define FSCTL_SET_INTEGRITY_INFORMATION 0x0009C280 #define FSCTL_PIPE_PEEK 0x0011400C /* BB add struct */ #define FSCTL_PIPE_TRANSCEIVE 0x0011C017 /* BB add struct */ /* strange that the number for this op is not sequential with previous op */ diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 72a4d10653d6..ff9e1f8b16a4 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -50,9 +50,9 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name) if (direntry == NULL) return -EIO; - if (direntry->d_inode == NULL) + if (d_really_is_negative(direntry)) return -EIO; - sb = direntry->d_inode->i_sb; + sb = d_inode(direntry)->i_sb; if (sb == NULL) return -EIO; @@ -111,9 +111,9 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, if (direntry == NULL) return -EIO; - if (direntry->d_inode == NULL) + if (d_really_is_negative(direntry)) return -EIO; - sb = direntry->d_inode->i_sb; + sb = d_inode(direntry)->i_sb; if (sb == NULL) return -EIO; @@ -177,12 +177,12 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, memcpy(pacl, ea_value, value_size); if (pTcon->ses->server->ops->set_acl) rc = pTcon->ses->server->ops->set_acl(pacl, - value_size, direntry->d_inode, + value_size, d_inode(direntry), full_path, CIFS_ACL_DACL); else rc = -EOPNOTSUPP; if (rc == 0) /* force revalidate of the inode */ - CIFS_I(direntry->d_inode)->time = 0; + CIFS_I(d_inode(direntry))->time = 0; kfree(pacl); } #else @@ -246,9 +246,9 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, if (direntry == NULL) return -EIO; - if (direntry->d_inode == NULL) + if (d_really_is_negative(direntry)) return -EIO; - sb = direntry->d_inode->i_sb; + sb = d_inode(direntry)->i_sb; if (sb == NULL) return -EIO; @@ -324,7 +324,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, goto get_ea_exit; /* rc already EOPNOTSUPP */ pacl = pTcon->ses->server->ops->get_acl(cifs_sb, - direntry->d_inode, full_path, &acllen); + d_inode(direntry), full_path, &acllen); if (IS_ERR(pacl)) { rc = PTR_ERR(pacl); cifs_dbg(VFS, "%s: error %zd getting sec desc\n", @@ -382,9 +382,9 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size) if (direntry == NULL) return -EIO; - if (direntry->d_inode == NULL) + if (d_really_is_negative(direntry)) return -EIO; - sb = direntry->d_inode->i_sb; + sb = d_inode(direntry)->i_sb; if (sb == NULL) return -EIO; diff --git a/fs/coda/cache.c b/fs/coda/cache.c index 46ee6f238985..5bb630a769e0 100644 --- a/fs/coda/cache.c +++ b/fs/coda/cache.c @@ -94,8 +94,8 @@ static void coda_flag_children(struct dentry *parent, int flag) 
spin_lock(&parent->d_lock); list_for_each_entry(de, &parent->d_subdirs, d_child) { /* don't know what to do with negative dentries */ - if (de->d_inode ) - coda_flag_inode(de->d_inode, flag); + if (d_inode(de) ) + coda_flag_inode(d_inode(de), flag); } spin_unlock(&parent->d_lock); return; diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h index d6f7a76a1f5b..f829fe963f5b 100644 --- a/fs/coda/coda_linux.h +++ b/fs/coda/coda_linux.h @@ -79,7 +79,7 @@ void coda_sysctl_clean(void); static inline struct coda_inode_info *ITOC(struct inode *inode) { - return list_entry(inode, struct coda_inode_info, vfs_inode); + return container_of(inode, struct coda_inode_info, vfs_inode); } static __inline__ struct CodaFid *coda_i2f(struct inode *inode) diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 60cb88c1dd2b..fda9f4311212 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -201,7 +201,7 @@ err_out: static int coda_link(struct dentry *source_de, struct inode *dir_inode, struct dentry *de) { - struct inode *inode = source_de->d_inode; + struct inode *inode = d_inode(source_de); const char * name = de->d_name.name; int len = de->d_name.len; int error; @@ -266,7 +266,7 @@ static int coda_unlink(struct inode *dir, struct dentry *de) return error; coda_dir_update_mtime(dir); - drop_nlink(de->d_inode); + drop_nlink(d_inode(de)); return 0; } @@ -279,8 +279,8 @@ static int coda_rmdir(struct inode *dir, struct dentry *de) error = venus_rmdir(dir->i_sb, coda_i2f(dir), name, len); if (!error) { /* VFS may delete the child */ - if (de->d_inode) - clear_nlink(de->d_inode); + if (d_really_is_positive(de)) + clear_nlink(d_inode(de)); /* fix the link count of the parent */ coda_dir_drop_nlink(dir); @@ -303,14 +303,14 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry, coda_i2f(new_dir), old_length, new_length, (const char *) old_name, (const char *)new_name); if (!error) { - if (new_dentry->d_inode) { + if (d_really_is_positive(new_dentry)) { if (d_is_dir(new_dentry)) { coda_dir_drop_nlink(old_dir); coda_dir_inc_nlink(new_dir); } coda_dir_update_mtime(old_dir); coda_dir_update_mtime(new_dir); - coda_flag_inode(new_dentry->d_inode, C_VATTR); + coda_flag_inode(d_inode(new_dentry), C_VATTR); } else { coda_flag_inode(old_dir, C_VATTR); coda_flag_inode(new_dir, C_VATTR); @@ -449,13 +449,13 @@ static int coda_dentry_revalidate(struct dentry *de, unsigned int flags) if (flags & LOOKUP_RCU) return -ECHILD; - inode = de->d_inode; + inode = d_inode(de); if (!inode || is_root_inode(inode)) goto out; if (is_bad_inode(inode)) goto bad; - cii = ITOC(de->d_inode); + cii = ITOC(d_inode(de)); if (!(cii->c_flags & (C_PURGE | C_FLUSH))) goto out; @@ -487,11 +487,11 @@ static int coda_dentry_delete(const struct dentry * dentry) { int flags; - if (!dentry->d_inode) + if (d_really_is_negative(dentry)) return 0; - flags = (ITOC(dentry->d_inode)->c_flags) & C_PURGE; - if (is_bad_inode(dentry->d_inode) || flags) { + flags = (ITOC(d_inode(dentry))->c_flags) & C_PURGE; + if (is_bad_inode(d_inode(dentry)) || flags) { return 1; } return 0; diff --git a/fs/coda/file.c b/fs/coda/file.c index d244d743a232..1da3805f3ddc 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -27,19 +27,14 @@ #include "coda_int.h" static ssize_t -coda_file_read(struct file *coda_file, char __user *buf, size_t count, loff_t *ppos) +coda_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { - struct coda_file_info *cfi; - struct file *host_file; + struct file *coda_file = iocb->ki_filp; + struct coda_file_info *cfi = CODA_FTOC(coda_file); - cfi 
= CODA_FTOC(coda_file); BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); - host_file = cfi->cfi_container; - if (!host_file->f_op->read) - return -EINVAL; - - return host_file->f_op->read(host_file, buf, count, ppos); + return vfs_iter_read(cfi->cfi_container, to, &iocb->ki_pos); } static ssize_t @@ -64,32 +59,25 @@ coda_file_splice_read(struct file *coda_file, loff_t *ppos, } static ssize_t -coda_file_write(struct file *coda_file, const char __user *buf, size_t count, loff_t *ppos) +coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to) { - struct inode *host_inode, *coda_inode = file_inode(coda_file); - struct coda_file_info *cfi; + struct file *coda_file = iocb->ki_filp; + struct inode *coda_inode = file_inode(coda_file); + struct coda_file_info *cfi = CODA_FTOC(coda_file); struct file *host_file; ssize_t ret; - cfi = CODA_FTOC(coda_file); BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); - host_file = cfi->cfi_container; - - if (!host_file->f_op->write) - return -EINVAL; - host_inode = file_inode(host_file); + host_file = cfi->cfi_container; file_start_write(host_file); mutex_lock(&coda_inode->i_mutex); - - ret = host_file->f_op->write(host_file, buf, count, ppos); - - coda_inode->i_size = host_inode->i_size; + ret = vfs_iter_write(cfi->cfi_container, to, &iocb->ki_pos); + coda_inode->i_size = file_inode(host_file)->i_size; coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9; coda_inode->i_mtime = coda_inode->i_ctime = CURRENT_TIME_SEC; mutex_unlock(&coda_inode->i_mutex); file_end_write(host_file); - return ret; } @@ -231,8 +219,8 @@ int coda_fsync(struct file *coda_file, loff_t start, loff_t end, int datasync) const struct file_operations coda_file_operations = { .llseek = generic_file_llseek, - .read = coda_file_read, - .write = coda_file_write, + .read_iter = coda_file_read_iter, + .write_iter = coda_file_write_iter, .mmap = coda_file_mmap, .open = coda_open, .release = coda_release, diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 82ec68b59208..cac1390b87a3 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -257,15 +257,15 @@ static void coda_evict_inode(struct inode *inode) int coda_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - int err = coda_revalidate_inode(dentry->d_inode); + int err = coda_revalidate_inode(d_inode(dentry)); if (!err) - generic_fillattr(dentry->d_inode, stat); + generic_fillattr(d_inode(dentry), stat); return err; } int coda_setattr(struct dentry *de, struct iattr *iattr) { - struct inode *inode = de->d_inode; + struct inode *inode = d_inode(de); struct coda_vattr vattr; int error; diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c index 4326d172fc27..f36a4040afb8 100644 --- a/fs/coda/pioctl.c +++ b/fs/coda/pioctl.c @@ -72,7 +72,7 @@ static long coda_pioctl(struct file *filp, unsigned int cmd, if (error) return error; - target_inode = path.dentry->d_inode; + target_inode = d_inode(path.dentry); /* return if it is not a Coda inode */ if (target_inode->i_sb != inode->i_sb) { diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index 5bb6e27298a4..9b1ffaa0572e 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c @@ -820,8 +820,8 @@ int coda_downcall(struct venus_comm *vcp, int opcode, union outputArgs *out) case CODA_FLUSH: coda_cache_clear_all(sb); shrink_dcache_sb(sb); - if (sb->s_root->d_inode) - coda_flag_inode(sb->s_root->d_inode, C_FLUSH); + if (d_really_is_positive(sb->s_root)) + coda_flag_inode(d_inode(sb->s_root), C_FLUSH); break; case CODA_PURGEUSER: diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 
afec6450450f..48851f6ea6ec 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -570,6 +570,7 @@ static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, void __user *argp) #define BNEPCONNDEL _IOW('B', 201, int) #define BNEPGETCONNLIST _IOR('B', 210, int) #define BNEPGETCONNINFO _IOR('B', 211, int) +#define BNEPGETSUPPFEAT _IOR('B', 212, int) #define CMTPCONNADD _IOW('C', 200, int) #define CMTPCONNDEL _IOW('C', 201, int) @@ -895,6 +896,7 @@ COMPATIBLE_IOCTL(FIGETBSZ) /* 'X' - originally XFS but some now in the VFS */ COMPATIBLE_IOCTL(FIFREEZE) COMPATIBLE_IOCTL(FITHAW) +COMPATIBLE_IOCTL(FITRIM) COMPATIBLE_IOCTL(KDGETKEYCODE) COMPATIBLE_IOCTL(KDSETKEYCODE) COMPATIBLE_IOCTL(KDGKBTYPE) @@ -1247,6 +1249,7 @@ COMPATIBLE_IOCTL(BNEPCONNADD) COMPATIBLE_IOCTL(BNEPCONNDEL) COMPATIBLE_IOCTL(BNEPGETCONNLIST) COMPATIBLE_IOCTL(BNEPGETCONNINFO) +COMPATIBLE_IOCTL(BNEPGETSUPPFEAT) COMPATIBLE_IOCTL(CMTPCONNADD) COMPATIBLE_IOCTL(CMTPCONNDEL) COMPATIBLE_IOCTL(CMTPGETCONNLIST) diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index cf0db005d2f5..c81ce7f200a6 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -289,7 +289,7 @@ static int configfs_create_dir(struct config_item *item, struct dentry *dentry) configfs_set_dir_dirent_depth(p->d_fsdata, dentry->d_fsdata); error = configfs_create(dentry, mode, init_dir); if (!error) { - inc_nlink(p->d_inode); + inc_nlink(d_inode(p)); item->ci_dentry = dentry; } else { struct configfs_dirent *sd = dentry->d_fsdata; @@ -375,8 +375,8 @@ static void remove_dir(struct dentry * d) list_del_init(&sd->s_sibling); spin_unlock(&configfs_dirent_lock); configfs_put(sd); - if (d->d_inode) - simple_rmdir(parent->d_inode,d); + if (d_really_is_positive(d)) + simple_rmdir(d_inode(parent),d); pr_debug(" o %pd removing done (%d)\n", d, d_count(d)); @@ -513,7 +513,7 @@ static int configfs_detach_prep(struct dentry *dentry, struct mutex **wait_mutex /* Abort if racing with mkdir() */ if (sd->s_type & CONFIGFS_USET_IN_MKDIR) { if (wait_mutex) - *wait_mutex = &sd->s_dentry->d_inode->i_mutex; + *wait_mutex = &d_inode(sd->s_dentry)->i_mutex; return -EAGAIN; } @@ -624,13 +624,13 @@ static void detach_groups(struct config_group *group) child = sd->s_dentry; - mutex_lock(&child->d_inode->i_mutex); + mutex_lock(&d_inode(child)->i_mutex); configfs_detach_group(sd->s_element); - child->d_inode->i_flags |= S_DEAD; + d_inode(child)->i_flags |= S_DEAD; dont_mount(child); - mutex_unlock(&child->d_inode->i_mutex); + mutex_unlock(&d_inode(child)->i_mutex); d_delete(child); dput(child); @@ -672,7 +672,7 @@ static int create_default_group(struct config_group *parent_group, sd = child->d_fsdata; sd->s_type |= CONFIGFS_USET_DEFAULT; } else { - BUG_ON(child->d_inode); + BUG_ON(d_inode(child)); d_drop(child); dput(child); } @@ -818,11 +818,11 @@ static int configfs_attach_item(struct config_item *parent_item, * the VFS may already have hit and used them. Thus, * we must lock them as rmdir() would. */ - mutex_lock(&dentry->d_inode->i_mutex); + mutex_lock(&d_inode(dentry)->i_mutex); configfs_remove_dir(item); - dentry->d_inode->i_flags |= S_DEAD; + d_inode(dentry)->i_flags |= S_DEAD; dont_mount(dentry); - mutex_unlock(&dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(dentry)->i_mutex); d_delete(dentry); } } @@ -858,16 +858,16 @@ static int configfs_attach_group(struct config_item *parent_item, * We must also lock the inode to remove it safely in case of * error, as rmdir() would. 
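
Most of the churn in this series is the mechanical dentry->d_inode to d_inode()/d_really_is_positive()/d_really_is_negative() conversion seen throughout the cifs, coda and configfs hunks. The accessors in include/linux/dcache.h are thin wrappers, roughly:

static inline struct inode *d_inode(const struct dentry *dentry)
{
        return dentry->d_inode;
}

static inline bool d_really_is_negative(const struct dentry *dentry)
{
        return dentry->d_inode == NULL;
}

static inline bool d_really_is_positive(const struct dentry *dentry)
{
        return dentry->d_inode != NULL;
}

Funneling every access through one helper gives a single place to later add annotations such as READ_ONCE() without touching hundreds of call sites.
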
*/ - mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); + mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD); configfs_adjust_dir_dirent_depth_before_populate(sd); ret = populate_groups(to_config_group(item)); if (ret) { configfs_detach_item(item); - dentry->d_inode->i_flags |= S_DEAD; + d_inode(dentry)->i_flags |= S_DEAD; dont_mount(dentry); } configfs_adjust_dir_dirent_depth_after_populate(sd); - mutex_unlock(&dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(dentry)->i_mutex); if (ret) d_delete(dentry); } @@ -1075,7 +1075,7 @@ int configfs_depend_item(struct configfs_subsystem *subsys, * subsystem is really registered, and so we need to lock out * configfs_[un]register_subsystem(). */ - mutex_lock(&root->d_inode->i_mutex); + mutex_lock(&d_inode(root)->i_mutex); root_sd = root->d_fsdata; @@ -1111,7 +1111,7 @@ int configfs_depend_item(struct configfs_subsystem *subsys, out_unlock_dirent_lock: spin_unlock(&configfs_dirent_lock); out_unlock_fs: - mutex_unlock(&root->d_inode->i_mutex); + mutex_unlock(&d_inode(root)->i_mutex); /* * If we succeeded, the fs is pinned via other methods. If not, @@ -1453,11 +1453,11 @@ int configfs_rename_dir(struct config_item * item, const char *new_name) down_write(&configfs_rename_sem); parent = item->parent->dentry; - mutex_lock(&parent->d_inode->i_mutex); + mutex_lock(&d_inode(parent)->i_mutex); new_dentry = lookup_one_len(new_name, parent, strlen(new_name)); if (!IS_ERR(new_dentry)) { - if (!new_dentry->d_inode) { + if (d_really_is_negative(new_dentry)) { error = config_item_set_name(item, "%s", new_name); if (!error) { d_add(new_dentry, NULL); @@ -1469,7 +1469,7 @@ int configfs_rename_dir(struct config_item * item, const char *new_name) error = -EEXIST; dput(new_dentry); } - mutex_unlock(&parent->d_inode->i_mutex); + mutex_unlock(&d_inode(parent)->i_mutex); up_write(&configfs_rename_sem); return error; @@ -1482,7 +1482,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file) struct configfs_dirent * parent_sd = dentry->d_fsdata; int err; - mutex_lock(&dentry->d_inode->i_mutex); + mutex_lock(&d_inode(dentry)->i_mutex); /* * Fake invisibility if dir belongs to a group/default groups hierarchy * being attached @@ -1495,7 +1495,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file) else err = 0; } - mutex_unlock(&dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(dentry)->i_mutex); return err; } @@ -1505,11 +1505,11 @@ static int configfs_dir_close(struct inode *inode, struct file *file) struct dentry * dentry = file->f_path.dentry; struct configfs_dirent * cursor = file->private_data; - mutex_lock(&dentry->d_inode->i_mutex); + mutex_lock(&d_inode(dentry)->i_mutex); spin_lock(&configfs_dirent_lock); list_del_init(&cursor->s_sibling); spin_unlock(&configfs_dirent_lock); - mutex_unlock(&dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(dentry)->i_mutex); release_configfs_dirent(cursor); @@ -1567,7 +1567,7 @@ static int configfs_readdir(struct file *file, struct dir_context *ctx) spin_lock(&configfs_dirent_lock); dentry = next->s_dentry; if (dentry) - inode = dentry->d_inode; + inode = d_inode(dentry); if (inode) ino = inode->i_ino; spin_unlock(&configfs_dirent_lock); @@ -1590,7 +1590,7 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence) { struct dentry * dentry = file->f_path.dentry; - mutex_lock(&dentry->d_inode->i_mutex); + mutex_lock(&d_inode(dentry)->i_mutex); switch (whence) { case 1: offset += file->f_pos; @@ -1598,7 +1598,7 @@ static loff_t configfs_dir_lseek(struct 
file *file, loff_t offset, int whence) if (offset >= 0) break; default: - mutex_unlock(&file_inode(file)->i_mutex); + mutex_unlock(&d_inode(dentry)->i_mutex); return -EINVAL; } if (offset != file->f_pos) { @@ -1624,7 +1624,7 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence) spin_unlock(&configfs_dirent_lock); } } - mutex_unlock(&dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(dentry)->i_mutex); return offset; } @@ -1654,7 +1654,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys) sd = root->d_fsdata; link_group(to_config_group(sd->s_element), group); - mutex_lock_nested(&root->d_inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&d_inode(root)->i_mutex, I_MUTEX_PARENT); err = -ENOMEM; dentry = d_alloc_name(root, group->cg_item.ci_name); @@ -1664,7 +1664,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys) err = configfs_attach_group(sd->s_element, &group->cg_item, dentry); if (err) { - BUG_ON(dentry->d_inode); + BUG_ON(d_inode(dentry)); d_drop(dentry); dput(dentry); } else { @@ -1674,7 +1674,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys) } } - mutex_unlock(&root->d_inode->i_mutex); + mutex_unlock(&d_inode(root)->i_mutex); if (err) { unlink_group(group); @@ -1695,9 +1695,9 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys) return; } - mutex_lock_nested(&root->d_inode->i_mutex, + mutex_lock_nested(&d_inode(root)->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); + mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD); mutex_lock(&configfs_symlink_mutex); spin_lock(&configfs_dirent_lock); if (configfs_detach_prep(dentry, NULL)) { @@ -1706,13 +1706,13 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys) spin_unlock(&configfs_dirent_lock); mutex_unlock(&configfs_symlink_mutex); configfs_detach_group(&group->cg_item); - dentry->d_inode->i_flags |= S_DEAD; + d_inode(dentry)->i_flags |= S_DEAD; dont_mount(dentry); - mutex_unlock(&dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(dentry)->i_mutex); d_delete(dentry); - mutex_unlock(&root->d_inode->i_mutex); + mutex_unlock(&d_inode(root)->i_mutex); dput(dentry); diff --git a/fs/configfs/file.c b/fs/configfs/file.c index 56d2cdc9ae0a..403269ffcdf3 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -326,10 +326,10 @@ int configfs_create_file(struct config_item * item, const struct configfs_attrib umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG; int error = 0; - mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_NORMAL); + mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_NORMAL); error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode, CONFIGFS_ITEM_ATTR); - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); return error; } diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 5423a6a6ecc8..eae87575e681 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -56,7 +56,7 @@ static const struct inode_operations configfs_inode_operations ={ int configfs_setattr(struct dentry * dentry, struct iattr * iattr) { - struct inode * inode = dentry->d_inode; + struct inode * inode = d_inode(dentry); struct configfs_dirent * sd = dentry->d_fsdata; struct iattr * sd_iattr; unsigned int ia_valid = iattr->ia_valid; @@ -186,7 +186,7 @@ int configfs_create(struct dentry * dentry, umode_t mode, void (*init)(struct in if (!dentry) return -ENOENT; - if (dentry->d_inode) + if (d_really_is_positive(dentry)) return -EEXIST; sd = 
dentry->d_fsdata; @@ -194,7 +194,7 @@ int configfs_create(struct dentry * dentry, umode_t mode, void (*init)(struct in if (!inode) return -ENOMEM; - p_inode = dentry->d_parent->d_inode; + p_inode = d_inode(dentry->d_parent); p_inode->i_mtime = p_inode->i_ctime = CURRENT_TIME; configfs_set_inode_lock_class(sd, inode); @@ -236,11 +236,11 @@ void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent) if (dentry) { spin_lock(&dentry->d_lock); - if (!d_unhashed(dentry) && dentry->d_inode) { + if (simple_positive(dentry)) { dget_dlock(dentry); __d_drop(dentry); spin_unlock(&dentry->d_lock); - simple_unlink(parent->d_inode, dentry); + simple_unlink(d_inode(parent), dentry); } else spin_unlock(&dentry->d_lock); } @@ -251,11 +251,11 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name) struct configfs_dirent * sd; struct configfs_dirent * parent_sd = dir->d_fsdata; - if (dir->d_inode == NULL) + if (d_really_is_negative(dir)) /* no inode means this hasn't been made visible yet */ return; - mutex_lock(&dir->d_inode->i_mutex); + mutex_lock(&d_inode(dir)->i_mutex); list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { if (!sd->s_element) continue; @@ -268,5 +268,5 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name) break; } } - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); } diff --git a/fs/configfs/item.c b/fs/configfs/item.c index e65f9ffbb999..b863a09cd2f1 100644 --- a/fs/configfs/item.c +++ b/fs/configfs/item.c @@ -47,12 +47,11 @@ static void config_item_release(struct kref *kref); * config_item_init - initialize item. * @item: item in question. */ -void config_item_init(struct config_item *item) +static void config_item_init(struct config_item *item) { kref_init(&item->ci_kref); INIT_LIST_HEAD(&item->ci_entry); } -EXPORT_SYMBOL(config_item_init); /** * config_item_set_name - Set the name of an item @@ -116,7 +115,7 @@ void config_item_init_type_name(struct config_item *item, const char *name, struct config_item_type *type) { - config_item_set_name(item, name); + config_item_set_name(item, "%s", name); item->ci_type = type; config_item_init(item); } @@ -125,7 +124,7 @@ EXPORT_SYMBOL(config_item_init_type_name); void config_group_init_type_name(struct config_group *group, const char *name, struct config_item_type *type) { - config_item_set_name(&group->cg_item, name); + config_item_set_name(&group->cg_item, "%s", name); group->cg_item.ci_type = type; config_group_init(group); } diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index da94e41bdbf6..a8f3b589a2df 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -129,8 +129,6 @@ void configfs_release_fs(void) } -static struct kobject *config_kobj; - static int __init configfs_init(void) { int err = -ENOMEM; @@ -141,8 +139,8 @@ static int __init configfs_init(void) if (!configfs_dir_cachep) goto out; - config_kobj = kobject_create_and_add("config", kernel_kobj); - if (!config_kobj) + err = sysfs_create_mount_point(kernel_kobj, "config"); + if (err) goto out2; err = register_filesystem(&configfs_fs_type); @@ -152,7 +150,7 @@ static int __init configfs_init(void) return 0; out3: pr_err("Unable to register filesystem!\n"); - kobject_put(config_kobj); + sysfs_remove_mount_point(kernel_kobj, "config"); out2: kmem_cache_destroy(configfs_dir_cachep); configfs_dir_cachep = NULL; @@ -163,7 +161,7 @@ out: static void __exit configfs_exit(void) { unregister_filesystem(&configfs_fs_type); - kobject_put(config_kobj); + 
sysfs_remove_mount_point(kernel_kobj, "config"); kmem_cache_destroy(configfs_dir_cachep); configfs_dir_cachep = NULL; } @@ -173,5 +171,5 @@ MODULE_LICENSE("GPL"); MODULE_VERSION("0.0.2"); MODULE_DESCRIPTION("Simple RAM filesystem for user driven kernel subsystem configuration."); -module_init(configfs_init); +core_initcall(configfs_init); module_exit(configfs_exit); diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index cc9f2546ea4a..ec5c8325b503 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -279,36 +279,27 @@ static int configfs_getlink(struct dentry *dentry, char * path) } -static void *configfs_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *configfs_follow_link(struct dentry *dentry, void **cookie) { - int error = -ENOMEM; unsigned long page = get_zeroed_page(GFP_KERNEL); + int error; - if (page) { - error = configfs_getlink(dentry, (char *)page); - if (!error) { - nd_set_link(nd, (char *)page); - return (void *)page; - } - } - - nd_set_link(nd, ERR_PTR(error)); - return NULL; -} + if (!page) + return ERR_PTR(-ENOMEM); -static void configfs_put_link(struct dentry *dentry, struct nameidata *nd, - void *cookie) -{ - if (cookie) { - unsigned long page = (unsigned long)cookie; - free_page(page); + error = configfs_getlink(dentry, (char *)page); + if (!error) { + return *cookie = (void *)page; } + + free_page(page); + return ERR_PTR(error); } const struct inode_operations configfs_symlink_inode_operations = { .follow_link = configfs_follow_link, .readlink = generic_readlink, - .put_link = configfs_put_link, + .put_link = free_page_put_link, .setattr = configfs_setattr, }; diff --git a/fs/coredump.c b/fs/coredump.c index f319926ddf8c..c5ecde6f3eed 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -70,7 +70,8 @@ static int expand_corename(struct core_name *cn, int size) return 0; } -static int cn_vprintf(struct core_name *cn, const char *fmt, va_list arg) +static __printf(2, 0) int cn_vprintf(struct core_name *cn, const char *fmt, + va_list arg) { int free, need; va_list arg_copy; @@ -93,7 +94,7 @@ again: return -ENOMEM; } -static int cn_printf(struct core_name *cn, const char *fmt, ...) +static __printf(2, 3) int cn_printf(struct core_name *cn, const char *fmt, ...) { va_list arg; int ret; @@ -105,7 +106,8 @@ static int cn_printf(struct core_name *cn, const char *fmt, ...) return ret; } -static int cn_esc_printf(struct core_name *cn, const char *fmt, ...) +static __printf(2, 3) +int cn_esc_printf(struct core_name *cn, const char *fmt, ...) 
{ int cur = cn->used; va_list arg; @@ -138,7 +140,7 @@ static int cn_print_exe_file(struct core_name *cn) goto put_exe_file; } - path = d_path(&exe_file->f_path, pathbuf, PATH_MAX); + path = file_path(exe_file, pathbuf, PATH_MAX); if (IS_ERR(path)) { ret = PTR_ERR(path); goto free_buf; @@ -209,11 +211,15 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm) break; /* uid */ case 'u': - err = cn_printf(cn, "%d", cred->uid); + err = cn_printf(cn, "%u", + from_kuid(&init_user_ns, + cred->uid)); break; /* gid */ case 'g': - err = cn_printf(cn, "%d", cred->gid); + err = cn_printf(cn, "%u", + from_kgid(&init_user_ns, + cred->gid)); break; case 'd': err = cn_printf(cn, "%d", @@ -221,7 +227,8 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm) break; /* signal that caused the coredump */ case 's': - err = cn_printf(cn, "%ld", cprm->siginfo->si_signo); + err = cn_printf(cn, "%d", + cprm->siginfo->si_signo); break; /* UNIX time of coredump */ case 't': { @@ -657,7 +664,7 @@ void do_coredump(const siginfo_t *siginfo) */ if (!uid_eq(inode->i_uid, current_fsuid())) goto close_fail; - if (!cprm.file->f_op->write) + if (!(cprm.file->f_mode & FMODE_CAN_WRITE)) goto close_fail; if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file)) goto close_fail; @@ -98,9 +98,9 @@ static bool buffer_size_valid(struct buffer_head *bh) return bh->b_state != 0; } -static ssize_t dax_io(int rw, struct inode *inode, struct iov_iter *iter, - loff_t start, loff_t end, get_block_t get_block, - struct buffer_head *bh) +static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, + loff_t start, loff_t end, get_block_t get_block, + struct buffer_head *bh) { ssize_t retval = 0; loff_t pos = start; @@ -109,7 +109,7 @@ static ssize_t dax_io(int rw, struct inode *inode, struct iov_iter *iter, void *addr; bool hole = false; - if (rw != WRITE) + if (iov_iter_rw(iter) != WRITE) end = min(end, i_size_read(inode)); while (pos < end) { @@ -124,7 +124,7 @@ static ssize_t dax_io(int rw, struct inode *inode, struct iov_iter *iter, bh->b_size = PAGE_ALIGN(end - pos); bh->b_state = 0; retval = get_block(inode, block, bh, - rw == WRITE); + iov_iter_rw(iter) == WRITE); if (retval) break; if (!buffer_size_valid(bh)) @@ -137,7 +137,7 @@ static ssize_t dax_io(int rw, struct inode *inode, struct iov_iter *iter, bh->b_size -= done; } - hole = (rw != WRITE) && !buffer_written(bh); + hole = iov_iter_rw(iter) != WRITE && !buffer_written(bh); if (hole) { addr = NULL; size = bh->b_size - first; @@ -154,8 +154,8 @@ static ssize_t dax_io(int rw, struct inode *inode, struct iov_iter *iter, max = min(pos + size, end); } - if (rw == WRITE) - len = copy_from_iter(addr, max - pos, iter); + if (iov_iter_rw(iter) == WRITE) + len = copy_from_iter_nocache(addr, max - pos, iter); else if (!hole) len = copy_to_iter(addr, max - pos, iter); else @@ -173,7 +173,6 @@ static ssize_t dax_io(int rw, struct inode *inode, struct iov_iter *iter, /** * dax_do_io - Perform I/O to a DAX file - * @rw: READ to read or WRITE to write * @iocb: The control block for this I/O * @inode: The file which the I/O is directed at * @iter: The addresses to do I/O from or to @@ -189,9 +188,9 @@ static ssize_t dax_io(int rw, struct inode *inode, struct iov_iter *iter, * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O * is in progress. 
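
The dax_io() rework here follows the same pattern as the fs/direct-io.c changes later in this section: the explicit rw argument disappears because the iov_iter already carries the transfer direction, queried via iov_iter_rw(). Conceptually the helper just masks the direction out of the iterator's type field; a sketch follows (the in-tree version is a macro with an added compile-time type check on its argument):

/* Conceptual reduction of iov_iter_rw(); READ is 0 and WRITE is 1. */
static inline int iter_direction(const struct iov_iter *i)
{
        return i->type & (READ | WRITE);
}
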
*/ -ssize_t dax_do_io(int rw, struct kiocb *iocb, struct inode *inode, - struct iov_iter *iter, loff_t pos, - get_block_t get_block, dio_iodone_t end_io, int flags) +ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, + struct iov_iter *iter, loff_t pos, get_block_t get_block, + dio_iodone_t end_io, int flags) { struct buffer_head bh; ssize_t retval = -EINVAL; @@ -199,7 +198,7 @@ ssize_t dax_do_io(int rw, struct kiocb *iocb, struct inode *inode, memset(&bh, 0, sizeof(bh)); - if ((flags & DIO_LOCKING) && (rw == READ)) { + if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) { struct address_space *mapping = inode->i_mapping; mutex_lock(&inode->i_mutex); retval = filemap_write_and_wait_range(mapping, pos, end - 1); @@ -210,17 +209,19 @@ ssize_t dax_do_io(int rw, struct kiocb *iocb, struct inode *inode, } /* Protects against truncate */ - atomic_inc(&inode->i_dio_count); + if (!(flags & DIO_SKIP_DIO_COUNT)) + inode_dio_begin(inode); - retval = dax_io(rw, inode, iter, pos, end, get_block, &bh); + retval = dax_io(inode, iter, pos, end, get_block, &bh); - if ((flags & DIO_LOCKING) && (rw == READ)) + if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) mutex_unlock(&inode->i_mutex); if ((retval > 0) && end_io) end_io(iocb, pos, retval, bh.b_private); - inode_dio_done(inode); + if (!(flags & DIO_SKIP_DIO_COUNT)) + inode_dio_end(inode); out: return retval; } @@ -310,14 +311,27 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, out: i_mmap_unlock_read(mapping); - if (bh->b_end_io) - bh->b_end_io(bh, 1); - return error; } -static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - get_block_t get_block) +/** + * __dax_fault - handle a page fault on a DAX file + * @vma: The virtual memory area where the fault occurred + * @vmf: The description of the fault + * @get_block: The filesystem method used to translate file offsets to blocks + * @complete_unwritten: The filesystem method used to convert unwritten blocks + * to written so the data written to them is exposed. This is required + * by write faults for filesystems that will return unwritten + * extent mappings from @get_block, but it is optional for reads as + * dax_insert_mapping() will always zero unwritten blocks. If the fs does + * not support unwritten extents, then it should pass NULL. + * + * When a page fault occurs, filesystems may call this helper in their + * fault handler for DAX files. __dax_fault() assumes the caller has done all + * the necessary locking for the page fault to proceed successfully. + */ +int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, + get_block_t get_block, dax_iodone_t complete_unwritten) { struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; @@ -418,7 +432,23 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, page_cache_release(page); } + /* + * If we successfully insert the new mapping over an unwritten extent, + * we need to ensure we convert the unwritten extent. If there is an + * error inserting the mapping, the filesystem needs to leave it as + * unwritten to prevent exposure of the stale underlying data to + * userspace, but we still need to call the completion function so + * the private resources on the mapping buffer can be released. We + * indicate what the callback should do via the uptodate variable, same + * as for normal BH based IO completions.
+ */ error = dax_insert_mapping(inode, &bh, vma, vmf); + if (buffer_unwritten(&bh)) { + if (complete_unwritten) + complete_unwritten(&bh, !error); + else + WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE)); + } out: if (error == -ENOMEM) @@ -435,6 +465,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, } goto out; } +EXPORT_SYMBOL(__dax_fault); /** * dax_fault - handle a page fault on a DAX file @@ -446,7 +477,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, * fault handler for DAX files. */ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - get_block_t get_block) + get_block_t get_block, dax_iodone_t complete_unwritten) { int result; struct super_block *sb = file_inode(vma->vm_file)->i_sb; @@ -455,7 +486,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, sb_start_pagefault(sb); file_update_time(vma->vm_file); } - result = do_dax_fault(vma, vmf, get_block); + result = __dax_fault(vma, vmf, get_block, complete_unwritten); if (vmf->flags & FAULT_FLAG_WRITE) sb_end_pagefault(sb); @@ -464,6 +495,23 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, EXPORT_SYMBOL_GPL(dax_fault); /** + * dax_pfn_mkwrite - handle first write to DAX page + * @vma: The virtual memory area where the fault occurred + * @vmf: The description of the fault + * + */ +int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct super_block *sb = file_inode(vma->vm_file)->i_sb; + + sb_start_pagefault(sb); + file_update_time(vma->vm_file); + sb_end_pagefault(sb); + return VM_FAULT_NOPAGE; +} +EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); + +/** * dax_zero_page_range - zero a range within a page of a DAX file * @inode: The file being truncated * @from: The file offset that is being truncated to diff --git a/fs/dcache.c b/fs/dcache.c index c71e3732e53b..9b5fe503f6cb 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -269,6 +269,41 @@ static inline int dname_external(const struct dentry *dentry) return dentry->d_name.name != dentry->d_iname; } +/* + * Make sure other CPUs see the inode attached before the type is set. + */ +static inline void __d_set_inode_and_type(struct dentry *dentry, + struct inode *inode, + unsigned type_flags) +{ + unsigned flags; + + dentry->d_inode = inode; + smp_wmb(); + flags = READ_ONCE(dentry->d_flags); + flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU); + flags |= type_flags; + WRITE_ONCE(dentry->d_flags, flags); +} + +/* + * Ideally, we want to make sure that other CPUs see the flags cleared before + * the inode is detached, but this is really a violation of RCU principles + * since the ordering suggests we should always set inode before flags. + * + * We should instead replace or discard the entire dentry - but that sucks + * performancewise on mass deletion/rename. + */ +static inline void __d_clear_type_and_inode(struct dentry *dentry) +{ + unsigned flags = READ_ONCE(dentry->d_flags); + + flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU); + WRITE_ONCE(dentry->d_flags, flags); + smp_wmb(); + dentry->d_inode = NULL; +} + static void dentry_free(struct dentry *dentry) { WARN_ON(!hlist_unhashed(&dentry->d_u.d_alias)); @@ -287,17 +322,17 @@ static void dentry_free(struct dentry *dentry) } /** - * dentry_rcuwalk_barrier - invalidate in-progress rcu-walk lookups + * dentry_rcuwalk_invalidate - invalidate in-progress rcu-walk lookups * @dentry: the target dentry * After this call, in-progress rcu-walk path lookup will fail. 
This * should be called after unhashing, and after changing d_inode (if * the dentry has not already been unhashed). */ -static inline void dentry_rcuwalk_barrier(struct dentry *dentry) +static inline void dentry_rcuwalk_invalidate(struct dentry *dentry) { - assert_spin_locked(&dentry->d_lock); - /* Go through a barrier */ - write_seqcount_barrier(&dentry->d_seq); + lockdep_assert_held(&dentry->d_lock); + /* Go through an invalidation barrier */ + write_seqcount_invalidate(&dentry->d_seq); } /* @@ -311,7 +346,7 @@ static void dentry_iput(struct dentry * dentry) { struct inode *inode = dentry->d_inode; if (inode) { - dentry->d_inode = NULL; + __d_clear_type_and_inode(dentry); hlist_del_init(&dentry->d_u.d_alias); spin_unlock(&dentry->d_lock); spin_unlock(&inode->i_lock); @@ -335,10 +370,9 @@ static void dentry_unlink_inode(struct dentry * dentry) __releases(dentry->d_inode->i_lock) { struct inode *inode = dentry->d_inode; - __d_clear_type(dentry); - dentry->d_inode = NULL; + __d_clear_type_and_inode(dentry); hlist_del_init(&dentry->d_u.d_alias); - dentry_rcuwalk_barrier(dentry); + dentry_rcuwalk_invalidate(dentry); spin_unlock(&dentry->d_lock); spin_unlock(&inode->i_lock); if (!inode->i_nlink) @@ -460,7 +494,7 @@ void __d_drop(struct dentry *dentry) __hlist_bl_del(&dentry->d_hash); dentry->d_hash.pprev = NULL; hlist_bl_unlock(b); - dentry_rcuwalk_barrier(dentry); + dentry_rcuwalk_invalidate(dentry); } } EXPORT_SYMBOL(__d_drop); @@ -608,7 +642,7 @@ static inline bool fast_dput(struct dentry *dentry) /* * If we have a d_op->d_delete() operation, we should not - * let the dentry count go to zero, so use "put__or_lock". + * let the dentry count go to zero, so use "put_or_lock". */ if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) return lockref_put_or_lock(&dentry->d_lockref); @@ -663,7 +697,7 @@ static inline bool fast_dput(struct dentry *dentry) */ smp_rmb(); d_flags = ACCESS_ONCE(dentry->d_flags); - d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST; + d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST | DCACHE_DISCONNECTED; /* Nothing to do? Dropping the reference was all we needed? */ if (d_flags == (DCACHE_REFERENCED | DCACHE_LRU_LIST) && !d_unhashed(dentry)) @@ -742,6 +776,9 @@ repeat: if (unlikely(d_unhashed(dentry))) goto kill_it; + if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED)) + goto kill_it; + if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) { if (dentry->d_op->d_delete(dentry)) goto kill_it; @@ -1205,13 +1242,13 @@ ascend: /* might go back up the wrong parent if we have had a rename.
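
The smp_wmb() in __d_set_inode_and_type() above is only half of the contract: lockless readers must load d_flags first and dereference d_inode only once the type bits say an inode is attached. A hedged sketch of the consuming side (illustrative, not a verbatim call site):

/* Pairs with the smp_wmb() in __d_set_inode_and_type(): the flags load is
 * ordered before the d_inode load, so a non-miss type guarantees the inode
 * pointer is already visible. */
static struct inode *peek_attached_inode(struct dentry *dentry)
{
        unsigned flags = READ_ONCE(dentry->d_flags);

        if ((flags & DCACHE_ENTRY_TYPE) == DCACHE_MISS_TYPE)
                return NULL;            /* still negative */
        smp_rmb();                      /* pairs with the setter's smp_wmb() */
        return dentry->d_inode;
}
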
*/ if (need_seqretry(&rename_lock, seq)) goto rename_retry; - next = child->d_child.next; - while (unlikely(child->d_flags & DCACHE_DENTRY_KILLED)) { + /* go into the first sibling still alive */ + do { + next = child->d_child.next; if (next == &this_parent->d_subdirs) goto ascend; child = list_entry(next, struct dentry, d_child); - next = next->next; - } + } while (unlikely(child->d_flags & DCACHE_DENTRY_KILLED)); rcu_read_unlock(); goto resume; } @@ -1639,7 +1676,8 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) DCACHE_OP_COMPARE | DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE | - DCACHE_OP_DELETE )); + DCACHE_OP_DELETE | + DCACHE_OP_SELECT_INODE)); dentry->d_op = op; if (!op) return; @@ -1655,6 +1693,8 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) dentry->d_flags |= DCACHE_OP_DELETE; if (op->d_prune) dentry->d_flags |= DCACHE_OP_PRUNE; + if (op->d_select_inode) + dentry->d_flags |= DCACHE_OP_SELECT_INODE; } EXPORT_SYMBOL(d_set_d_op); @@ -1715,12 +1755,10 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) unsigned add_flags = d_flags_for_inode(inode); spin_lock(&dentry->d_lock); - dentry->d_flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU); - dentry->d_flags |= add_flags; if (inode) hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); - dentry->d_inode = inode; - dentry_rcuwalk_barrier(dentry); + __d_set_inode_and_type(dentry, inode, add_flags); + dentry_rcuwalk_invalidate(dentry); spin_unlock(&dentry->d_lock); fsnotify_d_instantiate(dentry, inode); } @@ -1937,8 +1975,7 @@ static struct dentry *__d_obtain_alias(struct inode *inode, int disconnected) add_flags |= DCACHE_DISCONNECTED; spin_lock(&tmp->d_lock); - tmp->d_inode = inode; - tmp->d_flags |= add_flags; + __d_set_inode_and_type(tmp, inode, add_flags); hlist_add_head(&tmp->d_u.d_alias, &inode->i_dentry); hlist_bl_lock(&tmp->d_sb->s_anon); hlist_bl_add_head(&tmp->d_hash, &tmp->d_sb->s_anon); @@ -2690,7 +2727,7 @@ static int __d_unalias(struct inode *inode, struct dentry *dentry, struct dentry *alias) { struct mutex *m1 = NULL, *m2 = NULL; - int ret = -EBUSY; + int ret = -ESTALE; /* If alias and dentry share a parent, then no extra locks required */ if (alias->d_parent == dentry->d_parent) @@ -2896,17 +2933,6 @@ restart: vfsmnt = &mnt->mnt; continue; } - /* - * Filesystems needing to implement special "root names" - * should do so with ->d_dname() - */ - if (IS_ROOT(dentry) && - (dentry->d_name.len != 1 || - dentry->d_name.name[0] != '/')) { - WARN(1, "Root dentry has weird name <%.*s>\n", - (int) dentry->d_name.len, - dentry->d_name.name); - } if (!error) error = is_mounted(vfsmnt) ? 
1 : 2; break; @@ -3416,22 +3442,15 @@ void __init vfs_caches_init_early(void) inode_init_early(); } -void __init vfs_caches_init(unsigned long mempages) +void __init vfs_caches_init(void) { - unsigned long reserve; - - /* Base hash sizes on available memory, with a reserve equal to - 150% of current kernel size */ - - reserve = min((mempages - nr_free_pages()) * 3/2, mempages - 1); - mempages -= reserve; - names_cachep = kmem_cache_create("names_cache", PATH_MAX, 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); dcache_init(); inode_init(); - files_init(mempages); + files_init(); + files_maxfiles_init(); mnt_init(); bdev_cache_init(); chrdev_init(); diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 517e64938438..284f9aa0028b 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -17,7 +17,6 @@ #include <linux/fs.h> #include <linux/seq_file.h> #include <linux/pagemap.h> -#include <linux/namei.h> #include <linux/debugfs.h> #include <linux/io.h> #include <linux/slab.h> @@ -43,17 +42,6 @@ const struct file_operations debugfs_file_operations = { .llseek = noop_llseek, }; -static void *debugfs_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - nd_set_link(nd, dentry->d_inode->i_private); - return NULL; -} - -const struct inode_operations debugfs_link_operations = { - .readlink = generic_readlink, - .follow_link = debugfs_follow_link, -}; - static int debugfs_u8_set(void *data, u64 val) { *(u8 *)data = val; diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 96400ab42d13..c711be8d6a3c 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -44,11 +44,6 @@ static struct inode *debugfs_get_inode(struct super_block *sb) return inode; } -static inline int debugfs_positive(struct dentry *dentry) -{ - return dentry->d_inode && !d_unhashed(dentry); -} - struct debugfs_mount_opts { kuid_t uid; kgid_t gid; @@ -124,7 +119,7 @@ static int debugfs_parse_options(char *data, struct debugfs_mount_opts *opts) static int debugfs_apply_options(struct super_block *sb) { struct debugfs_fs_info *fsi = sb->s_fs_info; - struct inode *inode = sb->s_root->d_inode; + struct inode *inode = d_inode(sb->s_root); struct debugfs_mount_opts *opts = &fsi->mount_opts; inode->i_mode &= ~S_IALLUGO; @@ -174,7 +169,7 @@ static void debugfs_evict_inode(struct inode *inode) truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (S_ISLNK(inode->i_mode)) - kfree(inode->i_private); + kfree(inode->i_link); } static const struct super_operations debugfs_super_operations = { @@ -188,7 +183,7 @@ static struct vfsmount *debugfs_automount(struct path *path) { struct vfsmount *(*f)(void *); f = (struct vfsmount *(*)(void *))path->dentry->d_fsdata; - return f(path->dentry->d_inode->i_private); + return f(d_inode(path->dentry)->i_private); } static const struct dentry_operations debugfs_dops = { @@ -254,6 +249,9 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) pr_debug("debugfs: creating file '%s'\n",name); + if (IS_ERR(parent)) + return parent; + error = simple_pin_fs(&debug_fs_type, &debugfs_mount, &debugfs_mount_count); if (error) @@ -267,20 +265,20 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) if (!parent) parent = debugfs_mount->mnt_root; - mutex_lock(&parent->d_inode->i_mutex); + mutex_lock(&d_inode(parent)->i_mutex); dentry = lookup_one_len(name, parent, strlen(name)); - if (!IS_ERR(dentry) && dentry->d_inode) { + if (!IS_ERR(dentry) && d_really_is_positive(dentry)) { dput(dentry); dentry = ERR_PTR(-EEXIST); } if (IS_ERR(dentry)) - 
mutex_unlock(&parent->d_inode->i_mutex); + mutex_unlock(&d_inode(parent)->i_mutex); return dentry; } static struct dentry *failed_creating(struct dentry *dentry) { - mutex_unlock(&dentry->d_parent->d_inode->i_mutex); + mutex_unlock(&d_inode(dentry->d_parent)->i_mutex); dput(dentry); simple_release_fs(&debugfs_mount, &debugfs_mount_count); return NULL; @@ -288,7 +286,7 @@ static struct dentry *failed_creating(struct dentry *dentry) static struct dentry *end_creating(struct dentry *dentry) { - mutex_unlock(&dentry->d_parent->d_inode->i_mutex); + mutex_unlock(&d_inode(dentry->d_parent)->i_mutex); return dentry; } @@ -341,7 +339,7 @@ struct dentry *debugfs_create_file(const char *name, umode_t mode, inode->i_fop = fops ? fops : &debugfs_file_operations; inode->i_private = data; d_instantiate(dentry, inode); - fsnotify_create(dentry->d_parent->d_inode, dentry); + fsnotify_create(d_inode(dentry->d_parent), dentry); return end_creating(dentry); } EXPORT_SYMBOL_GPL(debugfs_create_file); @@ -381,7 +379,7 @@ struct dentry *debugfs_create_file_size(const char *name, umode_t mode, struct dentry *de = debugfs_create_file(name, mode, parent, data, fops); if (de) - de->d_inode->i_size = file_size; + d_inode(de)->i_size = file_size; return de; } EXPORT_SYMBOL_GPL(debugfs_create_file_size); @@ -423,8 +421,8 @@ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent) /* directory inodes start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); d_instantiate(dentry, inode); - inc_nlink(dentry->d_parent->d_inode); - fsnotify_mkdir(dentry->d_parent->d_inode, dentry); + inc_nlink(d_inode(dentry->d_parent)); + fsnotify_mkdir(d_inode(dentry->d_parent), dentry); return end_creating(dentry); } EXPORT_SYMBOL_GPL(debugfs_create_dir); @@ -508,8 +506,8 @@ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent, return failed_creating(dentry); } inode->i_mode = S_IFLNK | S_IRWXUGO; - inode->i_op = &debugfs_link_operations; - inode->i_private = link; + inode->i_op = &simple_symlink_inode_operations; + inode->i_link = link; d_instantiate(dentry, inode); return end_creating(dentry); } @@ -519,12 +517,12 @@ static int __debugfs_remove(struct dentry *dentry, struct dentry *parent) { int ret = 0; - if (debugfs_positive(dentry)) { + if (simple_positive(dentry)) { dget(dentry); - if (S_ISDIR(dentry->d_inode->i_mode)) - ret = simple_rmdir(parent->d_inode, dentry); + if (d_is_dir(dentry)) + ret = simple_rmdir(d_inode(parent), dentry); else - simple_unlink(parent->d_inode, dentry); + simple_unlink(d_inode(parent), dentry); if (!ret) d_delete(dentry); dput(dentry); @@ -554,12 +552,12 @@ void debugfs_remove(struct dentry *dentry) return; parent = dentry->d_parent; - if (!parent || !parent->d_inode) + if (!parent || d_really_is_negative(parent)) return; - mutex_lock(&parent->d_inode->i_mutex); + mutex_lock(&d_inode(parent)->i_mutex); ret = __debugfs_remove(dentry, parent); - mutex_unlock(&parent->d_inode->i_mutex); + mutex_unlock(&d_inode(parent)->i_mutex); if (!ret) simple_release_fs(&debugfs_mount, &debugfs_mount_count); } @@ -585,12 +583,12 @@ void debugfs_remove_recursive(struct dentry *dentry) return; parent = dentry->d_parent; - if (!parent || !parent->d_inode) + if (!parent || d_really_is_negative(parent)) return; parent = dentry; down: - mutex_lock(&parent->d_inode->i_mutex); + mutex_lock(&d_inode(parent)->i_mutex); loop: /* * The parent->d_subdirs is protected by the d_lock. 
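
The debugfs symlink conversion above replaces a filesystem-private follow_link, which fished the target out of i_private, with the generic simple_symlink_inode_operations walking inode->i_link. Creating such a symlink then reduces to roughly the following sketch ('target' is a hypothetical parameter):

/* Sketch: an in-core symlink whose target string is owned by the inode
 * and freed in ->evict_inode(), as debugfs does after this change. */
static struct inode *make_core_symlink(struct super_block *sb, const char *target)
{
        struct inode *inode = new_inode(sb);

        if (!inode)
                return ERR_PTR(-ENOMEM);
        inode->i_mode = S_IFLNK | S_IRWXUGO;
        inode->i_op = &simple_symlink_inode_operations;
        inode->i_link = kstrdup(target, GFP_KERNEL);
        if (!inode->i_link) {
                iput(inode);
                return ERR_PTR(-ENOMEM);
        }
        return inode;
}
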
Outside that @@ -599,13 +597,13 @@ void debugfs_remove_recursive(struct dentry *dentry) */ spin_lock(&parent->d_lock); list_for_each_entry(child, &parent->d_subdirs, d_child) { - if (!debugfs_positive(child)) + if (!simple_positive(child)) continue; /* perhaps simple_empty(child) makes more sense */ if (!list_empty(&child->d_subdirs)) { spin_unlock(&parent->d_lock); - mutex_unlock(&parent->d_inode->i_mutex); + mutex_unlock(&d_inode(parent)->i_mutex); parent = child; goto down; } @@ -620,16 +618,16 @@ void debugfs_remove_recursive(struct dentry *dentry) * from d_subdirs. When releasing the parent->d_lock we can * no longer trust that the next pointer is valid. * Restart the loop. We'll skip this one with the - * debugfs_positive() check. + * simple_positive() check. */ goto loop; } spin_unlock(&parent->d_lock); - mutex_unlock(&parent->d_inode->i_mutex); + mutex_unlock(&d_inode(parent)->i_mutex); child = parent; parent = parent->d_parent; - mutex_lock(&parent->d_inode->i_mutex); + mutex_lock(&d_inode(parent)->i_mutex); if (child != dentry) /* go up */ @@ -637,7 +635,7 @@ void debugfs_remove_recursive(struct dentry *dentry) if (!__debugfs_remove(child, parent)) simple_release_fs(&debugfs_mount, &debugfs_mount_count); - mutex_unlock(&parent->d_inode->i_mutex); + mutex_unlock(&d_inode(parent)->i_mutex); } EXPORT_SYMBOL_GPL(debugfs_remove_recursive); @@ -669,27 +667,27 @@ struct dentry *debugfs_rename(struct dentry *old_dir, struct dentry *old_dentry, trap = lock_rename(new_dir, old_dir); /* Source or destination directories don't exist? */ - if (!old_dir->d_inode || !new_dir->d_inode) + if (d_really_is_negative(old_dir) || d_really_is_negative(new_dir)) goto exit; /* Source does not exist, cyclic rename, or mountpoint? */ - if (!old_dentry->d_inode || old_dentry == trap || + if (d_really_is_negative(old_dentry) || old_dentry == trap || d_mountpoint(old_dentry)) goto exit; dentry = lookup_one_len(new_name, new_dir, strlen(new_name)); /* Lookup failed, cyclic rename or target exists? 
*/ - if (IS_ERR(dentry) || dentry == trap || dentry->d_inode) + if (IS_ERR(dentry) || dentry == trap || d_really_is_positive(dentry)) goto exit; old_name = fsnotify_oldname_init(old_dentry->d_name.name); - error = simple_rename(old_dir->d_inode, old_dentry, new_dir->d_inode, + error = simple_rename(d_inode(old_dir), old_dentry, d_inode(new_dir), dentry); if (error) { fsnotify_oldname_free(old_name); goto exit; } d_move(old_dentry, dentry); - fsnotify_move(old_dir->d_inode, new_dir->d_inode, old_name, + fsnotify_move(d_inode(old_dir), d_inode(new_dir), old_name, d_is_dir(old_dentry), NULL, old_dentry); fsnotify_oldname_free(old_name); @@ -713,20 +711,17 @@ bool debugfs_initialized(void) } EXPORT_SYMBOL_GPL(debugfs_initialized); - -static struct kobject *debug_kobj; - static int __init debugfs_init(void) { int retval; - debug_kobj = kobject_create_and_add("debug", kernel_kobj); - if (!debug_kobj) - return -EINVAL; + retval = sysfs_create_mount_point(kernel_kobj, "debug"); + if (retval) + return retval; retval = register_filesystem(&debug_fs_type); if (retval) - kobject_put(debug_kobj); + sysfs_remove_mount_point(kernel_kobj, "debug"); else debugfs_registered = true; diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index cfe8466f7fef..c35ffdc12bba 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -142,6 +142,8 @@ static inline struct super_block *pts_sb_from_inode(struct inode *inode) if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC) return inode->i_sb; #endif + if (!devpts_mnt) + return NULL; return devpts_mnt->mnt_sb; } @@ -253,7 +255,7 @@ static int mknod_ptmx(struct super_block *sb) if (!uid_valid(root_uid) || !gid_valid(root_gid)) return -EINVAL; - mutex_lock(&root->d_inode->i_mutex); + mutex_lock(&d_inode(root)->i_mutex); /* If we have already created ptmx node, return */ if (fsi->ptmx_dentry) { @@ -290,7 +292,7 @@ static int mknod_ptmx(struct super_block *sb) fsi->ptmx_dentry = dentry; rc = 0; out: - mutex_unlock(&root->d_inode->i_mutex); + mutex_unlock(&d_inode(root)->i_mutex); return rc; } @@ -298,7 +300,7 @@ static void update_ptmx_mode(struct pts_fs_info *fsi) { struct inode *inode; if (fsi->ptmx_dentry) { - inode = fsi->ptmx_dentry->d_inode; + inode = d_inode(fsi->ptmx_dentry); inode->i_mode = S_IFCHR|fsi->mount_opts.ptmxmode; } } @@ -525,10 +527,14 @@ static struct file_system_type devpts_fs_type = { int devpts_new_index(struct inode *ptmx_inode) { struct super_block *sb = pts_sb_from_inode(ptmx_inode); - struct pts_fs_info *fsi = DEVPTS_SB(sb); + struct pts_fs_info *fsi; int index; int ida_ret; + if (!sb) + return -ENODEV; + + fsi = DEVPTS_SB(sb); retry: if (!ida_pre_get(&fsi->allocated_ptys, GFP_KERNEL)) return -ENOMEM; @@ -584,11 +590,18 @@ struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index, struct dentry *dentry; struct super_block *sb = pts_sb_from_inode(ptmx_inode); struct inode *inode; - struct dentry *root = sb->s_root; - struct pts_fs_info *fsi = DEVPTS_SB(sb); - struct pts_mount_opts *opts = &fsi->mount_opts; + struct dentry *root; + struct pts_fs_info *fsi; + struct pts_mount_opts *opts; char s[12]; + if (!sb) + return ERR_PTR(-ENODEV); + + root = sb->s_root; + fsi = DEVPTS_SB(sb); + opts = &fsi->mount_opts; + inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); @@ -602,18 +615,18 @@ struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index, sprintf(s, "%d", index); - mutex_lock(&root->d_inode->i_mutex); + mutex_lock(&d_inode(root)->i_mutex); dentry = d_alloc_name(root, s); if (dentry) { d_add(dentry, 
inode); - fsnotify_create(root->d_inode, dentry); + fsnotify_create(d_inode(root), dentry); } else { iput(inode); inode = ERR_PTR(-ENOMEM); } - mutex_unlock(&root->d_inode->i_mutex); + mutex_unlock(&d_inode(root)->i_mutex); return inode; } @@ -658,7 +671,7 @@ void devpts_pty_kill(struct inode *inode) BUG_ON(inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR)); - mutex_lock(&root->d_inode->i_mutex); + mutex_lock(&d_inode(root)->i_mutex); dentry = d_find_alias(inode); @@ -667,7 +680,7 @@ void devpts_pty_kill(struct inode *inode) dput(dentry); /* d_alloc_name() in devpts_pty_new() */ dput(dentry); /* d_find_alias above */ - mutex_unlock(&root->d_inode->i_mutex); + mutex_unlock(&d_inode(root)->i_mutex); } static int __init init_devpts_fs(void) @@ -676,12 +689,16 @@ static int __init init_devpts_fs(void) struct ctl_table_header *table; if (!err) { + struct vfsmount *mnt; + table = register_sysctl_table(pty_root_table); - devpts_mnt = kern_mount(&devpts_fs_type); - if (IS_ERR(devpts_mnt)) { - err = PTR_ERR(devpts_mnt); + mnt = kern_mount(&devpts_fs_type); + if (IS_ERR(mnt)) { + err = PTR_ERR(mnt); unregister_filesystem(&devpts_fs_type); unregister_sysctl_table(table); + } else { + devpts_mnt = mnt; } } return err; diff --git a/fs/direct-io.c b/fs/direct-io.c index e181b6b2e297..745d2342651a 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -37,7 +37,6 @@ #include <linux/uio.h> #include <linux/atomic.h> #include <linux/prefetch.h> -#include <linux/aio.h> /* * How many user pages to map in one call to get_user_pages(). This determines @@ -254,7 +253,9 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, if (dio->end_io && dio->result) dio->end_io(dio->iocb, offset, transferred, dio->private); - inode_dio_done(dio->inode); + if (!(dio->flags & DIO_SKIP_DIO_COUNT)) + inode_dio_end(dio->inode); + if (is_async) { if (dio->rw & WRITE) { int err; @@ -265,7 +266,7 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, ret = err; } - aio_complete(dio->iocb, ret, 0); + dio->iocb->ki_complete(dio->iocb, ret, 0); } kmem_cache_free(dio_cache, dio); @@ -1056,7 +1057,7 @@ static inline int drop_refcount(struct dio *dio) * operation. AIO can if it was a broken operation described above or * in fact if all the bios race to complete before we get here. In * that case dio_complete() translates the EIOCBQUEUED into the proper - * return code that the caller will hand to aio_complete(). + * return code that the caller will hand to ->complete(). * * This is managed by the bio_lock instead of being an atomic_t so that * completion paths can drop their ref and use the remaining count to @@ -1094,10 +1095,10 @@ static inline int drop_refcount(struct dio *dio) * for the whole file. 
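
With aio_complete() gone from fs/direct-io.c above, completion is indirected through the kiocb itself: the aio core fills in iocb->ki_complete for asynchronous requests, and is_sync_kiocb() is now simply a test for that pointer being NULL. The end-of-submission convention, sketched:

/* Sketch: a synchronous kiocb has no ki_complete and the submitter consumes
 * the result inline; an async one gets the callback and the submission path
 * reports -EIOCBQUEUED. */
static ssize_t finish_io(struct kiocb *iocb, ssize_t ret)
{
        if (is_sync_kiocb(iocb))
                return ret;
        iocb->ki_complete(iocb, ret, 0);        /* res2 is 0 for block I/O */
        return -EIOCBQUEUED;
}
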
*/ static inline ssize_t -do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, - struct block_device *bdev, struct iov_iter *iter, loff_t offset, - get_block_t get_block, dio_iodone_t end_io, - dio_submit_t submit_io, int flags) +do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, + struct block_device *bdev, struct iov_iter *iter, + loff_t offset, get_block_t get_block, dio_iodone_t end_io, + dio_submit_t submit_io, int flags) { unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits); unsigned blkbits = i_blkbits; @@ -1111,9 +1112,6 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, struct blk_plug plug; unsigned long align = offset | iov_iter_alignment(iter); - if (rw & WRITE) - rw = WRITE_ODIRECT; - /* * Avoid references to bdev if not absolutely needed to give * the early prefetch in the caller enough time. @@ -1128,7 +1126,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, } /* watch out for a 0 len io from a tricksy fs */ - if (rw == READ && !iov_iter_count(iter)) + if (iov_iter_rw(iter) == READ && !iov_iter_count(iter)) return 0; dio = kmem_cache_alloc(dio_cache, GFP_KERNEL); @@ -1144,7 +1142,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, dio->flags = flags; if (dio->flags & DIO_LOCKING) { - if (rw == READ) { + if (iov_iter_rw(iter) == READ) { struct address_space *mapping = iocb->ki_filp->f_mapping; @@ -1170,19 +1168,19 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, if (is_sync_kiocb(iocb)) dio->is_async = false; else if (!(dio->flags & DIO_ASYNC_EXTEND) && - (rw & WRITE) && end > i_size_read(inode)) + iov_iter_rw(iter) == WRITE && end > i_size_read(inode)) dio->is_async = false; else dio->is_async = true; dio->inode = inode; - dio->rw = rw; + dio->rw = iov_iter_rw(iter) == WRITE ? WRITE_ODIRECT : READ; /* * For AIO O_(D)SYNC writes we need to defer completions to a workqueue * so that we can call ->fsync. */ - if (dio->is_async && (rw & WRITE) && + if (dio->is_async && iov_iter_rw(iter) == WRITE && ((iocb->ki_filp->f_flags & O_DSYNC) || IS_SYNC(iocb->ki_filp->f_mapping->host))) { retval = dio_set_defer_completion(dio); @@ -1199,7 +1197,8 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, /* * Will be decremented at I/O completion time. */ - atomic_inc(&inode->i_dio_count); + if (!(dio->flags & DIO_SKIP_DIO_COUNT)) + inode_dio_begin(inode); retval = 0; sdio.blkbits = blkbits; @@ -1275,7 +1274,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, * we can let i_mutex go now that it's achieved its purpose * of protecting us from looking up uninitialized blocks.
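
The inode_dio_begin()/inode_dio_end() pair replacing the open-coded atomic_inc()/inode_dio_done() above is what lets truncate synchronize with in-flight O_DIRECT: inode_dio_wait() sleeps until i_dio_count drains to zero, and DIO_SKIP_DIO_COUNT exists for callers that already hold such a reference. A sketch of the two sides of the handshake:

/* Submission side, unless DIO_SKIP_DIO_COUNT says the caller did this. */
static void dio_side(struct inode *inode)
{
        inode_dio_begin(inode);         /* atomic_inc(&inode->i_dio_count) */
        /* ... build, submit and complete the I/O ... */
        inode_dio_end(inode);           /* dec; wakes inode_dio_wait() at zero */
}

/* Truncate side. */
static void truncate_side(struct inode *inode)
{
        mutex_lock(&inode->i_mutex);
        inode_dio_wait(inode);          /* drain in-flight O_DIRECT first */
        /* ... now safe to shrink i_size ... */
        mutex_unlock(&inode->i_mutex);
}
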
*/ - if (rw == READ && (dio->flags & DIO_LOCKING)) + if (iov_iter_rw(iter) == READ && (dio->flags & DIO_LOCKING)) mutex_unlock(&dio->inode->i_mutex); /* @@ -1287,7 +1286,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, */ BUG_ON(retval == -EIOCBQUEUED); if (dio->is_async && retval == 0 && dio->result && - (rw == READ || dio->result == count)) + (iov_iter_rw(iter) == READ || dio->result == count)) retval = -EIOCBQUEUED; else dio_await_completion(dio); @@ -1301,11 +1300,11 @@ out: return retval; } -ssize_t -__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, - struct block_device *bdev, struct iov_iter *iter, loff_t offset, - get_block_t get_block, dio_iodone_t end_io, - dio_submit_t submit_io, int flags) +ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, + struct block_device *bdev, struct iov_iter *iter, + loff_t offset, get_block_t get_block, + dio_iodone_t end_io, dio_submit_t submit_io, + int flags) { /* * The block device state is needed in the end to finally @@ -1319,8 +1318,8 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, prefetch(bdev->bd_queue); prefetch((char *)bdev->bd_queue + SMP_CACHE_BYTES); - return do_blockdev_direct_IO(rw, iocb, inode, bdev, iter, offset, - get_block, end_io, submit_io, flags); + return do_blockdev_direct_IO(iocb, inode, bdev, iter, offset, get_block, + end_io, submit_io, flags); } EXPORT_SYMBOL(__blockdev_direct_IO); diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index d08e079ea5d3..754fd6c0b747 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -921,8 +921,8 @@ static int tcp_accept_from_sock(struct connection *con) mutex_unlock(&connections_lock); memset(&peeraddr, 0, sizeof(peeraddr)); - result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM, - IPPROTO_TCP, &newsock); + result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family, + SOCK_STREAM, IPPROTO_TCP, &newsock); if (result < 0) return -ENOMEM; @@ -1173,8 +1173,8 @@ static void tcp_connect_to_sock(struct connection *con) goto out; /* Create a socket to communicate with */ - result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM, - IPPROTO_TCP, &sock); + result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family, + SOCK_STREAM, IPPROTO_TCP, &sock); if (result < 0) goto out_err; @@ -1258,8 +1258,8 @@ static struct socket *tcp_create_listen_sock(struct connection *con, addr_len = sizeof(struct sockaddr_in6); /* Create a socket to communicate with */ - result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM, - IPPROTO_TCP, &sock); + result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family, + SOCK_STREAM, IPPROTO_TCP, &sock); if (result < 0) { log_print("Can't create listening comms socket"); goto create_out; @@ -1365,8 +1365,8 @@ static int sctp_listen_for_all(void) log_print("Using SCTP for communications"); - result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_SEQPACKET, - IPPROTO_SCTP, &sock); + result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family, + SOCK_SEQPACKET, IPPROTO_SCTP, &sock); if (result < 0) { log_print("Can't create comms socket, check SCTP is loaded"); goto out; diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 719e1ce1c609..97315f2f6816 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1326,7 +1326,7 @@ static int ecryptfs_read_headers_virt(char *page_virt, if (rc) goto out; if (!(crypt_stat->flags & ECRYPTFS_I_SIZE_INITIALIZED)) - ecryptfs_i_size_init(page_virt, 
ecryptfs_dentry->d_inode); + ecryptfs_i_size_init(page_virt, d_inode(ecryptfs_dentry)); offset += MAGIC_ECRYPTFS_MARKER_SIZE_BYTES; rc = ecryptfs_process_flags(crypt_stat, (page_virt + offset), &bytes_read); @@ -1425,7 +1425,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry) { int rc; char *page_virt; - struct inode *ecryptfs_inode = ecryptfs_dentry->d_inode; + struct inode *ecryptfs_inode = d_inode(ecryptfs_dentry); struct ecryptfs_crypt_stat *crypt_stat = &ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat; struct ecryptfs_mount_crypt_stat *mount_crypt_stat = diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c index 4000f6b3a750..8db0b464483f 100644 --- a/fs/ecryptfs/dentry.c +++ b/fs/ecryptfs/dentry.c @@ -54,11 +54,11 @@ static int ecryptfs_d_revalidate(struct dentry *dentry, unsigned int flags) return -ECHILD; rc = lower_dentry->d_op->d_revalidate(lower_dentry, flags); - if (dentry->d_inode) { + if (d_really_is_positive(dentry)) { struct inode *lower_inode = - ecryptfs_inode_to_lower(dentry->d_inode); + ecryptfs_inode_to_lower(d_inode(dentry)); - fsstack_copy_attr_all(dentry->d_inode, lower_inode); + fsstack_copy_attr_all(d_inode(dentry), lower_inode); } return rc; } diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index fd39bad6f1bd..feef8a9c4de7 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -31,7 +31,6 @@ #include <linux/security.h> #include <linux/compat.h> #include <linux/fs_stack.h> -#include <linux/aio.h> #include "ecryptfs_kernel.h" /** @@ -52,12 +51,6 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb, struct file *file = iocb->ki_filp; rc = generic_file_read_iter(iocb, to); - /* - * Even though this is a async interface, we need to wait - * for IO to finish to update atime - */ - if (-EIOCBQUEUED == rc) - rc = wait_on_sync_kiocb(iocb); if (rc >= 0) { path = ecryptfs_dentry_to_lower_path(file->f_path.dentry); touch_atime(path); @@ -137,7 +130,7 @@ struct kmem_cache *ecryptfs_file_info_cache; static int read_or_initialize_metadata(struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ecryptfs_mount_crypt_stat *mount_crypt_stat; struct ecryptfs_crypt_stat *crypt_stat; int rc; @@ -332,7 +325,6 @@ ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return rc; switch (cmd) { - case FITRIM: case FS_IOC32_GETFLAGS: case FS_IOC32_SETFLAGS: case FS_IOC32_GETVERSION: @@ -365,9 +357,7 @@ const struct file_operations ecryptfs_dir_fops = { const struct file_operations ecryptfs_main_fops = { .llseek = generic_file_llseek, - .read = new_sync_read, .read_iter = ecryptfs_read_update_atime, - .write = new_sync_write, .write_iter = generic_file_write_iter, .iterate = ecryptfs_readdir, .unlocked_ioctl = ecryptfs_unlocked_ioctl, diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index b08b5187f662..3c4db1172d22 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -41,13 +41,13 @@ static struct dentry *lock_parent(struct dentry *dentry) struct dentry *dir; dir = dget_parent(dentry); - mutex_lock_nested(&(dir->d_inode->i_mutex), I_MUTEX_PARENT); + mutex_lock_nested(&(d_inode(dir)->i_mutex), I_MUTEX_PARENT); return dir; } static void unlock_dir(struct dentry *dir) { - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); dput(dir); } @@ -131,7 +131,7 @@ struct inode *ecryptfs_get_inode(struct inode *lower_inode, static int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry, struct super_block *sb) { - struct 
inode *inode = ecryptfs_get_inode(lower_dentry->d_inode, sb); + struct inode *inode = ecryptfs_get_inode(d_inode(lower_dentry), sb); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -170,7 +170,6 @@ out_unlock: * @directory_inode: inode of the new file's dentry's parent in ecryptfs * @ecryptfs_dentry: New file's dentry in ecryptfs * @mode: The mode of the new file - * @nd: nameidata of ecryptfs' parent's dentry & vfsmount * * Creates the underlying file and the eCryptfs inode which will link to * it. It will also update the eCryptfs directory inode to mimic the @@ -189,21 +188,21 @@ ecryptfs_do_create(struct inode *directory_inode, lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); lower_dir_dentry = lock_parent(lower_dentry); - rc = vfs_create(lower_dir_dentry->d_inode, lower_dentry, mode, true); + rc = vfs_create(d_inode(lower_dir_dentry), lower_dentry, mode, true); if (rc) { printk(KERN_ERR "%s: Failure to create dentry in lower fs; " "rc = [%d]\n", __func__, rc); inode = ERR_PTR(rc); goto out_lock; } - inode = __ecryptfs_get_inode(lower_dentry->d_inode, + inode = __ecryptfs_get_inode(d_inode(lower_dentry), directory_inode->i_sb); if (IS_ERR(inode)) { - vfs_unlink(lower_dir_dentry->d_inode, lower_dentry, NULL); + vfs_unlink(d_inode(lower_dir_dentry), lower_dentry, NULL); goto out_lock; } - fsstack_copy_attr_times(directory_inode, lower_dir_dentry->d_inode); - fsstack_copy_inode_size(directory_inode, lower_dir_dentry->d_inode); + fsstack_copy_attr_times(directory_inode, d_inode(lower_dir_dentry)); + fsstack_copy_inode_size(directory_inode, d_inode(lower_dir_dentry)); out_lock: unlock_dir(lower_dir_dentry); return inode; @@ -332,7 +331,7 @@ static int ecryptfs_lookup_interpose(struct dentry *dentry, struct dentry *lower_dentry, struct inode *dir_inode) { - struct inode *inode, *lower_inode = lower_dentry->d_inode; + struct inode *inode, *lower_inode = d_inode(lower_dentry); struct ecryptfs_dentry_info *dentry_info; struct vfsmount *lower_mnt; int rc = 0; @@ -347,14 +346,14 @@ static int ecryptfs_lookup_interpose(struct dentry *dentry, } lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent)); - fsstack_copy_attr_atime(dir_inode, lower_dentry->d_parent->d_inode); + fsstack_copy_attr_atime(dir_inode, d_inode(lower_dentry->d_parent)); BUG_ON(!d_count(lower_dentry)); ecryptfs_set_dentry_private(dentry, dentry_info); dentry_info->lower_path.mnt = lower_mnt; dentry_info->lower_path.dentry = lower_dentry; - if (!lower_dentry->d_inode) { + if (d_really_is_negative(lower_dentry)) { /* We want to add because we couldn't find in lower */ d_add(dentry, NULL); return 0; @@ -384,7 +383,7 @@ static int ecryptfs_lookup_interpose(struct dentry *dentry, * ecryptfs_lookup * @ecryptfs_dir_inode: The eCryptfs directory inode * @ecryptfs_dentry: The eCryptfs dentry that we are looking up - * @ecryptfs_nd: nameidata; may be NULL + * @flags: lookup flags * * Find a file on disk. If the file does not exist, then we'll add it to the * dentry cache and continue on to read it from the disk. 
@@ -400,11 +399,11 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, int rc = 0; lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent); - mutex_lock(&lower_dir_dentry->d_inode->i_mutex); + mutex_lock(&d_inode(lower_dir_dentry)->i_mutex); lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name, lower_dir_dentry, ecryptfs_dentry->d_name.len); - mutex_unlock(&lower_dir_dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(lower_dir_dentry)->i_mutex); if (IS_ERR(lower_dentry)) { rc = PTR_ERR(lower_dentry); ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " @@ -412,7 +411,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, ecryptfs_dentry); goto out; } - if (lower_dentry->d_inode) + if (d_really_is_positive(lower_dentry)) goto interpose; mount_crypt_stat = &ecryptfs_superblock_to_private( ecryptfs_dentry->d_sb)->mount_crypt_stat; @@ -429,11 +428,11 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, "filename; rc = [%d]\n", __func__, rc); goto out; } - mutex_lock(&lower_dir_dentry->d_inode->i_mutex); + mutex_lock(&d_inode(lower_dir_dentry)->i_mutex); lower_dentry = lookup_one_len(encrypted_and_encoded_name, lower_dir_dentry, encrypted_and_encoded_name_size); - mutex_unlock(&lower_dir_dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(lower_dir_dentry)->i_mutex); if (IS_ERR(lower_dentry)) { rc = PTR_ERR(lower_dentry); ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " @@ -458,24 +457,24 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir, u64 file_size_save; int rc; - file_size_save = i_size_read(old_dentry->d_inode); + file_size_save = i_size_read(d_inode(old_dentry)); lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry); lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry); dget(lower_old_dentry); dget(lower_new_dentry); lower_dir_dentry = lock_parent(lower_new_dentry); - rc = vfs_link(lower_old_dentry, lower_dir_dentry->d_inode, + rc = vfs_link(lower_old_dentry, d_inode(lower_dir_dentry), lower_new_dentry, NULL); - if (rc || !lower_new_dentry->d_inode) + if (rc || d_really_is_negative(lower_new_dentry)) goto out_lock; rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb); if (rc) goto out_lock; - fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); - fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode); - set_nlink(old_dentry->d_inode, - ecryptfs_inode_to_lower(old_dentry->d_inode)->i_nlink); - i_size_write(new_dentry->d_inode, file_size_save); + fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry)); + fsstack_copy_inode_size(dir, d_inode(lower_dir_dentry)); + set_nlink(d_inode(old_dentry), + ecryptfs_inode_to_lower(d_inode(old_dentry))->i_nlink); + i_size_write(d_inode(new_dentry), file_size_save); out_lock: unlock_dir(lower_dir_dentry); dput(lower_new_dentry); @@ -485,7 +484,7 @@ out_lock: static int ecryptfs_unlink(struct inode *dir, struct dentry *dentry) { - return ecryptfs_do_unlink(dir, dentry, dentry->d_inode); + return ecryptfs_do_unlink(dir, dentry, d_inode(dentry)); } static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry, @@ -510,20 +509,20 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry, strlen(symname)); if (rc) goto out_lock; - rc = vfs_symlink(lower_dir_dentry->d_inode, lower_dentry, + rc = vfs_symlink(d_inode(lower_dir_dentry), lower_dentry, encoded_symname); kfree(encoded_symname); - if (rc || !lower_dentry->d_inode) + if (rc || d_really_is_negative(lower_dentry)) goto out_lock; rc = 
ecryptfs_interpose(lower_dentry, dentry, dir->i_sb); if (rc) goto out_lock; - fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); - fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode); + fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry)); + fsstack_copy_inode_size(dir, d_inode(lower_dir_dentry)); out_lock: unlock_dir(lower_dir_dentry); dput(lower_dentry); - if (!dentry->d_inode) + if (d_really_is_negative(dentry)) d_drop(dentry); return rc; } @@ -536,18 +535,18 @@ static int ecryptfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode lower_dentry = ecryptfs_dentry_to_lower(dentry); lower_dir_dentry = lock_parent(lower_dentry); - rc = vfs_mkdir(lower_dir_dentry->d_inode, lower_dentry, mode); - if (rc || !lower_dentry->d_inode) + rc = vfs_mkdir(d_inode(lower_dir_dentry), lower_dentry, mode); + if (rc || d_really_is_negative(lower_dentry)) goto out; rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb); if (rc) goto out; - fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); - fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode); - set_nlink(dir, lower_dir_dentry->d_inode->i_nlink); + fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry)); + fsstack_copy_inode_size(dir, d_inode(lower_dir_dentry)); + set_nlink(dir, d_inode(lower_dir_dentry)->i_nlink); out: unlock_dir(lower_dir_dentry); - if (!dentry->d_inode) + if (d_really_is_negative(dentry)) d_drop(dentry); return rc; } @@ -562,12 +561,12 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry) dget(dentry); lower_dir_dentry = lock_parent(lower_dentry); dget(lower_dentry); - rc = vfs_rmdir(lower_dir_dentry->d_inode, lower_dentry); + rc = vfs_rmdir(d_inode(lower_dir_dentry), lower_dentry); dput(lower_dentry); - if (!rc && dentry->d_inode) - clear_nlink(dentry->d_inode); - fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); - set_nlink(dir, lower_dir_dentry->d_inode->i_nlink); + if (!rc && d_really_is_positive(dentry)) + clear_nlink(d_inode(dentry)); + fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry)); + set_nlink(dir, d_inode(lower_dir_dentry)->i_nlink); unlock_dir(lower_dir_dentry); if (!rc) d_drop(dentry); @@ -584,17 +583,17 @@ ecryptfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev lower_dentry = ecryptfs_dentry_to_lower(dentry); lower_dir_dentry = lock_parent(lower_dentry); - rc = vfs_mknod(lower_dir_dentry->d_inode, lower_dentry, mode, dev); - if (rc || !lower_dentry->d_inode) + rc = vfs_mknod(d_inode(lower_dir_dentry), lower_dentry, mode, dev); + if (rc || d_really_is_negative(lower_dentry)) goto out; rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb); if (rc) goto out; - fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); - fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode); + fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry)); + fsstack_copy_inode_size(dir, d_inode(lower_dir_dentry)); out: unlock_dir(lower_dir_dentry); - if (!dentry->d_inode) + if (d_really_is_negative(dentry)) d_drop(dentry); return rc; } @@ -617,7 +616,7 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry, dget(lower_new_dentry); lower_old_dir_dentry = dget_parent(lower_old_dentry); lower_new_dir_dentry = dget_parent(lower_new_dentry); - target_inode = new_dentry->d_inode; + target_inode = d_inode(new_dentry); trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry); /* source should not be ancestor of target */ if (trap == lower_old_dentry) { @@ -629,17 +628,17 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry, rc = 
-ENOTEMPTY; goto out_lock; } - rc = vfs_rename(lower_old_dir_dentry->d_inode, lower_old_dentry, - lower_new_dir_dentry->d_inode, lower_new_dentry, + rc = vfs_rename(d_inode(lower_old_dir_dentry), lower_old_dentry, + d_inode(lower_new_dir_dentry), lower_new_dentry, NULL, 0); if (rc) goto out_lock; if (target_inode) fsstack_copy_attr_all(target_inode, ecryptfs_inode_to_lower(target_inode)); - fsstack_copy_attr_all(new_dir, lower_new_dir_dentry->d_inode); + fsstack_copy_attr_all(new_dir, d_inode(lower_new_dir_dentry)); if (new_dir != old_dir) - fsstack_copy_attr_all(old_dir, lower_old_dir_dentry->d_inode); + fsstack_copy_attr_all(old_dir, d_inode(lower_old_dir_dentry)); out_lock: unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry); dput(lower_new_dir_dentry); @@ -662,7 +661,7 @@ static char *ecryptfs_readlink_lower(struct dentry *dentry, size_t *bufsiz) return ERR_PTR(-ENOMEM); old_fs = get_fs(); set_fs(get_ds()); - rc = lower_dentry->d_inode->i_op->readlink(lower_dentry, + rc = d_inode(lower_dentry)->i_op->readlink(lower_dentry, (char __user *)lower_buf, PATH_MAX); set_fs(old_fs); @@ -675,18 +674,16 @@ out: return rc ? ERR_PTR(rc) : buf; } -static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *ecryptfs_follow_link(struct dentry *dentry, void **cookie) { size_t len; char *buf = ecryptfs_readlink_lower(dentry, &len); if (IS_ERR(buf)) - goto out; - fsstack_copy_attr_atime(dentry->d_inode, - ecryptfs_dentry_to_lower(dentry)->d_inode); + return buf; + fsstack_copy_attr_atime(d_inode(dentry), + d_inode(ecryptfs_dentry_to_lower(dentry))); buf[len] = '\0'; -out: - nd_set_link(nd, buf); - return NULL; + return *cookie = buf; } /** @@ -738,7 +735,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia, struct iattr *lower_ia) { int rc = 0; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ecryptfs_crypt_stat *crypt_stat; loff_t i_size = i_size_read(inode); loff_t lower_size_before_truncate; @@ -751,7 +748,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia, rc = ecryptfs_get_lower_file(dentry, inode); if (rc) return rc; - crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; + crypt_stat = &ecryptfs_inode_to_private(d_inode(dentry))->crypt_stat; /* Switch on growing or shrinking file */ if (ia->ia_size > i_size) { char zero[] = { 0x00 }; @@ -858,7 +855,7 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) struct iattr lower_ia = { .ia_valid = 0 }; int rc; - rc = ecryptfs_inode_newsize_ok(dentry->d_inode, new_length); + rc = ecryptfs_inode_newsize_ok(d_inode(dentry), new_length); if (rc) return rc; @@ -866,9 +863,9 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) if (!rc && lower_ia.ia_valid & ATTR_SIZE) { struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); - mutex_lock(&lower_dentry->d_inode->i_mutex); + mutex_lock(&d_inode(lower_dentry)->i_mutex); rc = notify_change(lower_dentry, &lower_ia, NULL); - mutex_unlock(&lower_dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(lower_dentry)->i_mutex); } return rc; } @@ -900,10 +897,10 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) struct inode *lower_inode; struct ecryptfs_crypt_stat *crypt_stat; - crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; + crypt_stat = &ecryptfs_inode_to_private(d_inode(dentry))->crypt_stat; if (!(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED)) ecryptfs_init_crypt_stat(crypt_stat); - inode = 
dentry->d_inode; + inode = d_inode(dentry); lower_inode = ecryptfs_inode_to_lower(inode); lower_dentry = ecryptfs_dentry_to_lower(dentry); mutex_lock(&crypt_stat->cs_mutex); @@ -967,9 +964,9 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) if (lower_ia.ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) lower_ia.ia_valid &= ~ATTR_MODE; - mutex_lock(&lower_dentry->d_inode->i_mutex); + mutex_lock(&d_inode(lower_dentry)->i_mutex); rc = notify_change(lower_dentry, &lower_ia, NULL); - mutex_unlock(&lower_dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(lower_dentry)->i_mutex); out: fsstack_copy_attr_all(inode, lower_inode); return rc; @@ -983,7 +980,7 @@ static int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry, mount_crypt_stat = &ecryptfs_superblock_to_private( dentry->d_sb)->mount_crypt_stat; - generic_fillattr(dentry->d_inode, stat); + generic_fillattr(d_inode(dentry), stat); if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) { char *target; size_t targetsiz; @@ -1007,9 +1004,9 @@ static int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry, rc = vfs_getattr(ecryptfs_dentry_to_lower_path(dentry), &lower_stat); if (!rc) { - fsstack_copy_attr_all(dentry->d_inode, - ecryptfs_inode_to_lower(dentry->d_inode)); - generic_fillattr(dentry->d_inode, stat); + fsstack_copy_attr_all(d_inode(dentry), + ecryptfs_inode_to_lower(d_inode(dentry))); + generic_fillattr(d_inode(dentry), stat); stat->blocks = lower_stat.blocks; } return rc; @@ -1023,14 +1020,14 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value, struct dentry *lower_dentry; lower_dentry = ecryptfs_dentry_to_lower(dentry); - if (!lower_dentry->d_inode->i_op->setxattr) { + if (!d_inode(lower_dentry)->i_op->setxattr) { rc = -EOPNOTSUPP; goto out; } rc = vfs_setxattr(lower_dentry, name, value, size, flags); - if (!rc && dentry->d_inode) - fsstack_copy_attr_all(dentry->d_inode, lower_dentry->d_inode); + if (!rc && d_really_is_positive(dentry)) + fsstack_copy_attr_all(d_inode(dentry), d_inode(lower_dentry)); out: return rc; } @@ -1041,14 +1038,14 @@ ecryptfs_getxattr_lower(struct dentry *lower_dentry, const char *name, { int rc = 0; - if (!lower_dentry->d_inode->i_op->getxattr) { + if (!d_inode(lower_dentry)->i_op->getxattr) { rc = -EOPNOTSUPP; goto out; } - mutex_lock(&lower_dentry->d_inode->i_mutex); - rc = lower_dentry->d_inode->i_op->getxattr(lower_dentry, name, value, + mutex_lock(&d_inode(lower_dentry)->i_mutex); + rc = d_inode(lower_dentry)->i_op->getxattr(lower_dentry, name, value, size); - mutex_unlock(&lower_dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(lower_dentry)->i_mutex); out: return rc; } @@ -1068,13 +1065,13 @@ ecryptfs_listxattr(struct dentry *dentry, char *list, size_t size) struct dentry *lower_dentry; lower_dentry = ecryptfs_dentry_to_lower(dentry); - if (!lower_dentry->d_inode->i_op->listxattr) { + if (!d_inode(lower_dentry)->i_op->listxattr) { rc = -EOPNOTSUPP; goto out; } - mutex_lock(&lower_dentry->d_inode->i_mutex); - rc = lower_dentry->d_inode->i_op->listxattr(lower_dentry, list, size); - mutex_unlock(&lower_dentry->d_inode->i_mutex); + mutex_lock(&d_inode(lower_dentry)->i_mutex); + rc = d_inode(lower_dentry)->i_op->listxattr(lower_dentry, list, size); + mutex_unlock(&d_inode(lower_dentry)->i_mutex); out: return rc; } @@ -1085,13 +1082,13 @@ static int ecryptfs_removexattr(struct dentry *dentry, const char *name) struct dentry *lower_dentry; lower_dentry = ecryptfs_dentry_to_lower(dentry); - if 
(!lower_dentry->d_inode->i_op->removexattr) { + if (!d_inode(lower_dentry)->i_op->removexattr) { rc = -EOPNOTSUPP; goto out; } - mutex_lock(&lower_dentry->d_inode->i_mutex); - rc = lower_dentry->d_inode->i_op->removexattr(lower_dentry, name); - mutex_unlock(&lower_dentry->d_inode->i_mutex); + mutex_lock(&d_inode(lower_dentry)->i_mutex); + rc = d_inode(lower_dentry)->i_op->removexattr(lower_dentry, name); + mutex_unlock(&d_inode(lower_dentry)->i_mutex); out: return rc; } diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c index f1ea610362c6..866bb18efefe 100644 --- a/fs/ecryptfs/kthread.c +++ b/fs/ecryptfs/kthread.c @@ -144,7 +144,7 @@ int ecryptfs_privileged_open(struct file **lower_file, /* Corresponding dput() and mntput() are done when the * lower file is fput() when all eCryptfs files for the inode are * released. */ - flags |= IS_RDONLY(lower_dentry->d_inode) ? O_RDONLY : O_RDWR; + flags |= IS_RDONLY(d_inode(lower_dentry)) ? O_RDONLY : O_RDWR; (*lower_file) = dentry_open(&req.path, flags, cred); if (!IS_ERR(*lower_file)) goto out; diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index c095d3264259..4f4d0474bee9 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -546,11 +546,11 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags goto out_free; } - if (check_ruid && !uid_eq(path.dentry->d_inode->i_uid, current_uid())) { + if (check_ruid && !uid_eq(d_inode(path.dentry)->i_uid, current_uid())) { rc = -EPERM; printk(KERN_ERR "Mount of device (uid: %d) not owned by " "requested user (uid: %d)\n", - i_uid_read(path.dentry->d_inode), + i_uid_read(d_inode(path.dentry)), from_kuid(&init_user_ns, current_uid())); goto out_free; } @@ -584,7 +584,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags goto out_free; } - inode = ecryptfs_get_inode(path.dentry->d_inode, s); + inode = ecryptfs_get_inode(d_inode(path.dentry), s); rc = PTR_ERR(inode); if (IS_ERR(inode)) goto out_free; diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 4626976794e7..cf208522998e 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -420,7 +420,7 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode) void *xattr_virt; struct dentry *lower_dentry = ecryptfs_inode_to_private(ecryptfs_inode)->lower_file->f_path.dentry; - struct inode *lower_inode = lower_dentry->d_inode; + struct inode *lower_inode = d_inode(lower_dentry); int rc; if (!lower_inode->i_op->getxattr || !lower_inode->i_op->setxattr) { diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c index 07ab49745e31..3381b9da9ee6 100644 --- a/fs/efivarfs/inode.c +++ b/fs/efivarfs/inode.c @@ -145,12 +145,12 @@ out: static int efivarfs_unlink(struct inode *dir, struct dentry *dentry) { - struct efivar_entry *var = dentry->d_inode->i_private; + struct efivar_entry *var = d_inode(dentry)->i_private; if (efivar_entry_delete(var)) return -EINVAL; - drop_nlink(dentry->d_inode); + drop_nlink(d_inode(dentry)); dput(dentry); return 0; }; diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index ddbce42548c9..86a2121828c3 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -121,7 +121,7 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, int len, i; int err = -ENOMEM; - entry = kmalloc(sizeof(*entry), GFP_KERNEL); + entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return err; @@ -144,7 +144,7 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, name[len + EFI_VARIABLE_GUID_LEN+1] = '\0'; - inode = 
efivarfs_get_inode(sb, root->d_inode, S_IFREG | 0644, 0); + inode = efivarfs_get_inode(sb, d_inode(root), S_IFREG | 0644, 0); if (!inode) goto fail_name; diff --git a/fs/efs/namei.c b/fs/efs/namei.c index bbee8f063dfa..40ba9cc41bf7 100644 --- a/fs/efs/namei.c +++ b/fs/efs/namei.c @@ -111,9 +111,9 @@ struct dentry *efs_get_parent(struct dentry *child) struct dentry *parent = ERR_PTR(-ENOENT); efs_ino_t ino; - ino = efs_find_entry(child->d_inode, "..", 2); + ino = efs_find_entry(d_inode(child), "..", 2); if (ino) - parent = d_obtain_alias(efs_iget(child->d_inode->i_sb, ino)); + parent = d_obtain_alias(efs_iget(d_inode(child)->i_sb, ino)); return parent; } diff --git a/fs/efs/super.c b/fs/efs/super.c index 7fca462ea4e3..c8411a30f7da 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -67,7 +67,7 @@ static struct kmem_cache * efs_inode_cachep; static struct inode *efs_alloc_inode(struct super_block *sb) { struct efs_inode_info *ei; - ei = (struct efs_inode_info *)kmem_cache_alloc(efs_inode_cachep, GFP_KERNEL); + ei = kmem_cache_alloc(efs_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; } diff --git a/fs/exec.c b/fs/exec.c index c7f9b733406d..1977c2a553ac 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -659,6 +659,9 @@ int setup_arg_pages(struct linux_binprm *bprm, if (stack_base > STACK_SIZE_MAX) stack_base = STACK_SIZE_MAX; + /* Add space for stack randomization. */ + stack_base += (STACK_RND_MASK << PAGE_SHIFT); + /* Make sure we didn't let the argument array grow too large. */ if (vma->vm_end - vma->vm_start > stack_base) return -ENOMEM; @@ -926,10 +929,14 @@ static int de_thread(struct task_struct *tsk) if (!thread_group_leader(tsk)) { struct task_struct *leader = tsk->group_leader; - sig->notify_count = -1; /* for exit_notify() */ for (;;) { threadgroup_change_begin(tsk); write_lock_irq(&tasklist_lock); + /* + * Do this under tasklist_lock to ensure that + * exit_notify() can't miss ->group_exit_task + */ + sig->notify_count = -1; if (likely(leader->exit_state)) break; __set_current_state(TASK_KILLABLE); @@ -1078,7 +1085,13 @@ int flush_old_exec(struct linux_binprm * bprm) if (retval) goto out; + /* + * Must be called _before_ exec_mmap() as bprm->mm is + * not visible until then. This also enables the update + * to be lockless.
+ */ set_mm_exe_file(bprm->mm, bprm->file); + /* * Release all of the old mmap stuff */ @@ -1265,6 +1278,53 @@ static void check_unsafe_exec(struct linux_binprm *bprm) spin_unlock(&p->fs->lock); } +static void bprm_fill_uid(struct linux_binprm *bprm) +{ + struct inode *inode; + unsigned int mode; + kuid_t uid; + kgid_t gid; + + /* clear any previous set[ug]id data from a previous binary */ + bprm->cred->euid = current_euid(); + bprm->cred->egid = current_egid(); + + if (bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) + return; + + if (task_no_new_privs(current)) + return; + + inode = file_inode(bprm->file); + mode = READ_ONCE(inode->i_mode); + if (!(mode & (S_ISUID|S_ISGID))) + return; + + /* Be careful if suid/sgid is set */ + mutex_lock(&inode->i_mutex); + + /* reload atomically mode/uid/gid now that lock held */ + mode = inode->i_mode; + uid = inode->i_uid; + gid = inode->i_gid; + mutex_unlock(&inode->i_mutex); + + /* We ignore suid/sgid if there are no mappings for them in the ns */ + if (!kuid_has_mapping(bprm->cred->user_ns, uid) || + !kgid_has_mapping(bprm->cred->user_ns, gid)) + return; + + if (mode & S_ISUID) { + bprm->per_clear |= PER_CLEAR_ON_SETID; + bprm->cred->euid = uid; + } + + if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { + bprm->per_clear |= PER_CLEAR_ON_SETID; + bprm->cred->egid = gid; + } +} + /* * Fill the binprm structure from the inode. * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes @@ -1273,36 +1333,9 @@ static void check_unsafe_exec(struct linux_binprm *bprm) */ int prepare_binprm(struct linux_binprm *bprm) { - struct inode *inode = file_inode(bprm->file); - umode_t mode = inode->i_mode; int retval; - - /* clear any previous set[ug]id data from a previous binary */ - bprm->cred->euid = current_euid(); - bprm->cred->egid = current_egid(); - - if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID) && - !task_no_new_privs(current) && - kuid_has_mapping(bprm->cred->user_ns, inode->i_uid) && - kgid_has_mapping(bprm->cred->user_ns, inode->i_gid)) { - /* Set-uid? */ - if (mode & S_ISUID) { - bprm->per_clear |= PER_CLEAR_ON_SETID; - bprm->cred->euid = inode->i_uid; - } - - /* Set-gid? */ - /* - * If setgid is set but no group execute bit then this - * is a candidate for mandatory locking, not a setgid - * executable. 
- */ - if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { - bprm->per_clear |= PER_CLEAR_ON_SETID; - bprm->cred->egid = inode->i_gid; - } - } + bprm_fill_uid(bprm); /* fill in binprm security blob */ retval = security_bprm_set_creds(bprm); diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild index b47c7b8dc275..a364fd0965ec 100644 --- a/fs/exofs/Kbuild +++ b/fs/exofs/Kbuild @@ -16,5 +16,5 @@ libore-y := ore.o ore_raid.o obj-$(CONFIG_ORE) += libore.o -exofs-y := inode.o file.o symlink.o namei.o dir.o super.o sys.o +exofs-y := inode.o file.o namei.o dir.o super.o sys.o obj-$(CONFIG_EXOFS_FS) += exofs.o diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c index d7defd557601..e5bb2abf77f9 100644 --- a/fs/exofs/dir.c +++ b/fs/exofs/dir.c @@ -44,12 +44,6 @@ static inline void exofs_put_page(struct page *page) page_cache_release(page); } -/* Accesses dir's inode->i_size must be called under inode lock */ -static inline unsigned long dir_pages(struct inode *inode) -{ - return (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; -} - static unsigned exofs_last_byte(struct inode *inode, unsigned long page_nr) { loff_t last_byte = inode->i_size; @@ -379,7 +373,7 @@ ino_t exofs_parent_ino(struct dentry *child) struct exofs_dir_entry *de; ino_t ino; - de = exofs_dotdot(child->d_inode, &page); + de = exofs_dotdot(d_inode(child), &page); if (!de) return 0; @@ -429,7 +423,7 @@ int exofs_set_link(struct inode *dir, struct exofs_dir_entry *de, int exofs_add_link(struct dentry *dentry, struct inode *inode) { - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); const unsigned char *name = dentry->d_name.name; int namelen = dentry->d_name.len; unsigned chunk_size = exofs_chunk_size(dir); diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index ad9cac670a47..2e86086bc940 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -207,10 +207,6 @@ extern const struct address_space_operations exofs_aops; extern const struct inode_operations exofs_dir_inode_operations; extern const struct inode_operations exofs_special_inode_operations; -/* symlink.c */ -extern const struct inode_operations exofs_symlink_inode_operations; -extern const struct inode_operations exofs_fast_symlink_inode_operations; - /* exofs_init_comps will initialize an ore_components device array * pointing to a single ore_comp struct, and a round-robin view * of the device table. 
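The exofs/Kbuild and exofs.h hunks above strip out the filesystem's private symlink machinery; the exofs inode.c and namei.c hunks that follow (and the matching ext2/ext3 hunks further down) switch fast symlinks to the generic VFS support, where the target string is published through inode->i_link and ->follow_link needs no per-filesystem callback. A minimal sketch of the resulting pattern, with exofs_example_set_fast_symlink as a hypothetical helper name (the real diff does these assignments inline in exofs_iget() and exofs_symlink()):

/* Sketch only: with the target stored in the inode body (oi->i_data),
 * publishing it via i_link lets simple_symlink_inode_operations serve
 * ->follow_link generically, so exofs/symlink.c can be deleted. */
static void exofs_example_set_fast_symlink(struct inode *inode,
					   struct exofs_i_info *oi)
{
	inode->i_link = (char *)oi->i_data;	/* symlink target */
	inode->i_op = &simple_symlink_inode_operations;
}

Slow (page-backed) symlinks take the other branch and keep page_symlink_inode_operations with the regular address_space operations, as the exofs_symlink() hunk below shows.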
diff --git a/fs/exofs/file.c b/fs/exofs/file.c index 1a376b42d305..906de66e8e7e 100644 --- a/fs/exofs/file.c +++ b/fs/exofs/file.c @@ -67,8 +67,6 @@ static int exofs_flush(struct file *file, fl_owner_t id) const struct file_operations exofs_file_operations = { .llseek = generic_file_llseek, - .read = new_sync_read, - .write = new_sync_write, .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index a198e94813fe..73c64daa0f55 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -963,8 +963,8 @@ static void exofs_invalidatepage(struct page *page, unsigned int offset, /* TODO: Should be easy enough to do properly */ -static ssize_t exofs_direct_IO(int rw, struct kiocb *iocb, - struct iov_iter *iter, loff_t offset) +static ssize_t exofs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, + loff_t offset) { return 0; } @@ -1028,7 +1028,7 @@ static int _do_truncate(struct inode *inode, loff_t newsize) */ int exofs_setattr(struct dentry *dentry, struct iattr *iattr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int error; /* if we are about to modify an object, and it hasn't been @@ -1222,10 +1222,11 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) inode->i_fop = &exofs_dir_operations; inode->i_mapping->a_ops = &exofs_aops; } else if (S_ISLNK(inode->i_mode)) { - if (exofs_inode_is_fast_symlink(inode)) - inode->i_op = &exofs_fast_symlink_inode_operations; - else { - inode->i_op = &exofs_symlink_inode_operations; + if (exofs_inode_is_fast_symlink(inode)) { + inode->i_op = &simple_symlink_inode_operations; + inode->i_link = (char *)oi->i_data; + } else { + inode->i_op = &page_symlink_inode_operations; inode->i_mapping->a_ops = &exofs_aops; } } else { diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c index 28907460e8fa..09a6bb1ad63c 100644 --- a/fs/exofs/namei.c +++ b/fs/exofs/namei.c @@ -113,7 +113,7 @@ static int exofs_symlink(struct inode *dir, struct dentry *dentry, oi = exofs_i(inode); if (l > sizeof(oi->i_data)) { /* slow symlink */ - inode->i_op = &exofs_symlink_inode_operations; + inode->i_op = &page_symlink_inode_operations; inode->i_mapping->a_ops = &exofs_aops; memset(oi->i_data, 0, sizeof(oi->i_data)); @@ -122,7 +122,8 @@ static int exofs_symlink(struct inode *dir, struct dentry *dentry, goto out_fail; } else { /* fast symlink */ - inode->i_op = &exofs_fast_symlink_inode_operations; + inode->i_op = &simple_symlink_inode_operations; + inode->i_link = (char *)oi->i_data; memcpy(oi->i_data, symname, l); inode->i_size = l-1; } @@ -141,7 +142,7 @@ out_fail: static int exofs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); inode->i_ctime = CURRENT_TIME; inode_inc_link_count(inode); @@ -191,7 +192,7 @@ out_dir: static int exofs_unlink(struct inode *dir, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct exofs_dir_entry *de; struct page *page; int err = -ENOENT; @@ -213,7 +214,7 @@ out: static int exofs_rmdir(struct inode *dir, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int err = -ENOTEMPTY; if (exofs_empty_dir(inode)) { @@ -230,8 +231,8 @@ static int exofs_rmdir(struct inode *dir, struct dentry *dentry) static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode 
*new_dir, struct dentry *new_dentry) { - struct inode *old_inode = old_dentry->d_inode; - struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); struct page *dir_page = NULL; struct exofs_dir_entry *dir_de = NULL; struct page *old_page; diff --git a/fs/exofs/super.c b/fs/exofs/super.c index fcc2e565f540..b795c567b5e1 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -958,7 +958,7 @@ static struct dentry *exofs_get_parent(struct dentry *child) if (!ino) return ERR_PTR(-ESTALE); - return d_obtain_alias(exofs_iget(child->d_inode->i_sb, ino)); + return d_obtain_alias(exofs_iget(d_inode(child)->i_sb, ino)); } static struct inode *exofs_nfs_get_inode(struct super_block *sb, diff --git a/fs/exofs/symlink.c b/fs/exofs/symlink.c deleted file mode 100644 index 832e2624b80b..000000000000 --- a/fs/exofs/symlink.c +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (C) 2005, 2006 - * Avishay Traeger (avishay@gmail.com) - * Copyright (C) 2008, 2009 - * Boaz Harrosh <ooo@electrozaur.com> - * - * Copyrights for code taken from ext2: - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * from - * linux/fs/minix/inode.c - * Copyright (C) 1991, 1992 Linus Torvalds - * - * This file is part of exofs. - * - * exofs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation. Since it is based on ext2, and the only - * valid version of GPL for the Linux kernel is version 2, the only valid - * version of GPL for exofs is version 2. - * - * exofs is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with exofs; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <linux/namei.h> - -#include "exofs.h" - -static void *exofs_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - struct exofs_i_info *oi = exofs_i(dentry->d_inode); - - nd_set_link(nd, (char *)oi->i_data); - return NULL; -} - -const struct inode_operations exofs_symlink_inode_operations = { - .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, -}; - -const struct inode_operations exofs_fast_symlink_inode_operations = { - .readlink = generic_readlink, - .follow_link = exofs_follow_link, -}; diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 6e1d4ab09d72..0c6638b40f21 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -70,11 +70,6 @@ static inline void ext2_put_page(struct page *page) page_cache_release(page); } -static inline unsigned long dir_pages(struct inode *inode) -{ - return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; -} - /* * Return the offset into page `page_nr' of the last valid * byte in that page, plus one. 
@@ -486,7 +481,7 @@ void ext2_set_link(struct inode *dir, struct ext2_dir_entry_2 *de, */ int ext2_add_link (struct dentry *dentry, struct inode *inode) { - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); const char *name = dentry->d_name.name; int namelen = dentry->d_name.len; unsigned chunk_size = ext2_chunk_size(dir); diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 678f9ab08c48..8d15febd0aa3 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -793,7 +793,6 @@ extern int ext2_fsync(struct file *file, loff_t start, loff_t end, int datasync); extern const struct inode_operations ext2_file_inode_operations; extern const struct file_operations ext2_file_operations; -extern const struct file_operations ext2_dax_file_operations; /* inode.c */ extern const struct address_space_operations ext2_aops; diff --git a/fs/ext2/file.c b/fs/ext2/file.c index e31701713516..3b57c9f83c9b 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -28,17 +28,18 @@ #ifdef CONFIG_FS_DAX static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { - return dax_fault(vma, vmf, ext2_get_block); + return dax_fault(vma, vmf, ext2_get_block, NULL); } static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { - return dax_mkwrite(vma, vmf, ext2_get_block); + return dax_mkwrite(vma, vmf, ext2_get_block, NULL); } static const struct vm_operations_struct ext2_dax_vm_ops = { .fault = ext2_dax_fault, .page_mkwrite = ext2_dax_mkwrite, + .pfn_mkwrite = dax_pfn_mkwrite, }; static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma) @@ -92,8 +93,6 @@ int ext2_fsync(struct file *file, loff_t start, loff_t end, int datasync) */ const struct file_operations ext2_file_operations = { .llseek = generic_file_llseek, - .read = new_sync_read, - .write = new_sync_write, .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, .unlocked_ioctl = ext2_ioctl, @@ -108,24 +107,6 @@ const struct file_operations ext2_file_operations = { .splice_write = iter_file_splice_write, }; -#ifdef CONFIG_FS_DAX -const struct file_operations ext2_dax_file_operations = { - .llseek = generic_file_llseek, - .read = new_sync_read, - .write = new_sync_write, - .read_iter = generic_file_read_iter, - .write_iter = generic_file_write_iter, - .unlocked_ioctl = ext2_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = ext2_compat_ioctl, -#endif - .mmap = ext2_file_mmap, - .open = dquot_file_open, - .release = ext2_release_file, - .fsync = ext2_fsync, -}; -#endif - const struct inode_operations ext2_file_inode_operations = { #ifdef CONFIG_EXT2_FS_XATTR .setxattr = generic_setxattr, diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 6c14bb8322fa..5c04a0ddea80 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -278,7 +278,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent) avefreeb = free_blocks / ngroups; ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); - if ((parent == sb->s_root->d_inode) || + if ((parent == d_inode(sb->s_root)) || (EXT2_I(parent)->i_flags & EXT2_TOPDIR_FL)) { struct ext2_group_desc *best_desc = NULL; int best_ndir = inodes_per_group; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 6434bc000125..5c09776d347f 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -31,7 +31,7 @@ #include <linux/mpage.h> #include <linux/fiemap.h> #include <linux/namei.h> -#include <linux/aio.h> +#include <linux/uio.h> #include "ext2.h" #include "acl.h" #include "xattr.h" @@ -851,8 +851,7 @@ static sector_t 
ext2_bmap(struct address_space *mapping, sector_t block) } static ssize_t -ext2_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, - loff_t offset) +ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -861,12 +860,12 @@ ext2_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, ssize_t ret; if (IS_DAX(inode)) - ret = dax_do_io(rw, iocb, inode, iter, offset, ext2_get_block, - NULL, DIO_LOCKING); + ret = dax_do_io(iocb, inode, iter, offset, ext2_get_block, NULL, + DIO_LOCKING); else - ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, + ret = blockdev_direct_IO(iocb, inode, iter, offset, ext2_get_block); - if (ret < 0 && (rw & WRITE)) + if (ret < 0 && iov_iter_rw(iter) == WRITE) ext2_write_failed(mapping, offset + count); return ret; } @@ -1388,10 +1387,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) if (S_ISREG(inode->i_mode)) { inode->i_op = &ext2_file_inode_operations; - if (test_opt(inode->i_sb, DAX)) { - inode->i_mapping->a_ops = &ext2_aops; - inode->i_fop = &ext2_dax_file_operations; - } else if (test_opt(inode->i_sb, NOBH)) { + if (test_opt(inode->i_sb, NOBH)) { inode->i_mapping->a_ops = &ext2_nobh_aops; inode->i_fop = &ext2_file_operations; } else { @@ -1407,6 +1403,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) inode->i_mapping->a_ops = &ext2_aops; } else if (S_ISLNK(inode->i_mode)) { if (ext2_inode_is_fast_symlink(inode)) { + inode->i_link = (char *)ei->i_data; inode->i_op = &ext2_fast_symlink_inode_operations; nd_terminate_link(ei->i_data, inode->i_size, sizeof(ei->i_data) - 1); @@ -1548,7 +1545,7 @@ int ext2_write_inode(struct inode *inode, struct writeback_control *wbc) int ext2_setattr(struct dentry *dentry, struct iattr *iattr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int error; error = inode_change_ok(inode, iattr); diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 148f6e3789ea..13ec54a99c96 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -79,10 +79,10 @@ static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, uns struct dentry *ext2_get_parent(struct dentry *child) { struct qstr dotdot = QSTR_INIT("..", 2); - unsigned long ino = ext2_inode_by_name(child->d_inode, &dotdot); + unsigned long ino = ext2_inode_by_name(d_inode(child), &dotdot); if (!ino) return ERR_PTR(-ENOENT); - return d_obtain_alias(ext2_iget(child->d_inode->i_sb, ino)); + return d_obtain_alias(ext2_iget(d_inode(child)->i_sb, ino)); } /* @@ -104,10 +104,7 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode return PTR_ERR(inode); inode->i_op = &ext2_file_inode_operations; - if (test_opt(inode->i_sb, DAX)) { - inode->i_mapping->a_ops = &ext2_aops; - inode->i_fop = &ext2_dax_file_operations; - } else if (test_opt(inode->i_sb, NOBH)) { + if (test_opt(inode->i_sb, NOBH)) { inode->i_mapping->a_ops = &ext2_nobh_aops; inode->i_fop = &ext2_file_operations; } else { @@ -125,10 +122,7 @@ static int ext2_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) return PTR_ERR(inode); inode->i_op = &ext2_file_inode_operations; - if (test_opt(inode->i_sb, DAX)) { - inode->i_mapping->a_ops = &ext2_aops; - inode->i_fop = &ext2_dax_file_operations; - } else if (test_opt(inode->i_sb, NOBH)) { + if (test_opt(inode->i_sb, NOBH)) { inode->i_mapping->a_ops = &ext2_nobh_aops; inode->i_fop = &ext2_file_operations; } else { @@ -195,7 +189,8 @@ 
static int ext2_symlink (struct inode * dir, struct dentry * dentry, } else { /* fast symlink */ inode->i_op = &ext2_fast_symlink_inode_operations; - memcpy((char*)(EXT2_I(inode)->i_data),symname,l); + inode->i_link = (char*)EXT2_I(inode)->i_data; + memcpy(inode->i_link, symname, l); inode->i_size = l-1; } mark_inode_dirty(inode); @@ -214,7 +209,7 @@ out_fail: static int ext2_link (struct dentry * old_dentry, struct inode * dir, struct dentry *dentry) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); int err; dquot_initialize(dir); @@ -281,7 +276,7 @@ out_dir: static int ext2_unlink(struct inode * dir, struct dentry *dentry) { - struct inode * inode = dentry->d_inode; + struct inode * inode = d_inode(dentry); struct ext2_dir_entry_2 * de; struct page * page; int err = -ENOENT; @@ -305,7 +300,7 @@ out: static int ext2_rmdir (struct inode * dir, struct dentry *dentry) { - struct inode * inode = dentry->d_inode; + struct inode * inode = d_inode(dentry); int err = -ENOTEMPTY; if (ext2_empty_dir(inode)) { @@ -322,8 +317,8 @@ static int ext2_rmdir (struct inode * dir, struct dentry *dentry) static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry, struct inode * new_dir, struct dentry * new_dentry ) { - struct inode * old_inode = old_dentry->d_inode; - struct inode * new_inode = new_dentry->d_inode; + struct inode * old_inode = d_inode(old_dentry); + struct inode * new_inode = d_inode(new_dentry); struct page * dir_page = NULL; struct ext2_dir_entry_2 * dir_de = NULL; struct page * old_page; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index d0e746e96511..900e19cf9ef6 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -882,6 +882,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | ((EXT2_SB(sb)->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? 
MS_POSIXACL : 0); + sb->s_iflags |= SB_I_CGROUPWB; if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV && (EXT2_HAS_COMPAT_FEATURE(sb, ~0U) || diff --git a/fs/ext2/symlink.c b/fs/ext2/symlink.c index 565cf817bbf1..ae17179f3810 100644 --- a/fs/ext2/symlink.c +++ b/fs/ext2/symlink.c @@ -19,14 +19,6 @@ #include "ext2.h" #include "xattr.h" -#include <linux/namei.h> - -static void *ext2_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - struct ext2_inode_info *ei = EXT2_I(dentry->d_inode); - nd_set_link(nd, (char *)ei->i_data); - return NULL; -} const struct inode_operations ext2_symlink_inode_operations = { .readlink = generic_readlink, @@ -43,7 +35,7 @@ const struct inode_operations ext2_symlink_inode_operations = { const struct inode_operations ext2_fast_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = ext2_follow_link, + .follow_link = simple_follow_link, .setattr = ext2_setattr, #ifdef CONFIG_EXT2_FS_XATTR .setxattr = generic_setxattr, diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index 91426141c33a..0b6bfd3a398b 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -243,7 +243,7 @@ cleanup: static int ext2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct buffer_head *bh = NULL; struct ext2_xattr_entry *entry; char *end; @@ -319,7 +319,7 @@ cleanup: /* * Inode operation listxattr() * - * dentry->d_inode->i_mutex: don't care + * d_inode(dentry)->i_mutex: don't care */ ssize_t ext2_listxattr(struct dentry *dentry, char *buffer, size_t size) diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c index c0ebc4db8849..702fc6840246 100644 --- a/fs/ext2/xattr_security.c +++ b/fs/ext2/xattr_security.c @@ -28,7 +28,7 @@ ext2_xattr_security_get(struct dentry *dentry, const char *name, { if (strcmp(name, "") == 0) return -EINVAL; - return ext2_xattr_get(dentry->d_inode, EXT2_XATTR_INDEX_SECURITY, name, + return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_SECURITY, name, buffer, size); } @@ -38,7 +38,7 @@ ext2_xattr_security_set(struct dentry *dentry, const char *name, { if (strcmp(name, "") == 0) return -EINVAL; - return ext2_xattr_set(dentry->d_inode, EXT2_XATTR_INDEX_SECURITY, name, + return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_SECURITY, name, value, size, flags); } diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c index 7e192574c001..42b6e9874bcc 100644 --- a/fs/ext2/xattr_trusted.c +++ b/fs/ext2/xattr_trusted.c @@ -32,7 +32,7 @@ ext2_xattr_trusted_get(struct dentry *dentry, const char *name, { if (strcmp(name, "") == 0) return -EINVAL; - return ext2_xattr_get(dentry->d_inode, EXT2_XATTR_INDEX_TRUSTED, name, + return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_TRUSTED, name, buffer, size); } @@ -42,7 +42,7 @@ ext2_xattr_trusted_set(struct dentry *dentry, const char *name, { if (strcmp(name, "") == 0) return -EINVAL; - return ext2_xattr_set(dentry->d_inode, EXT2_XATTR_INDEX_TRUSTED, name, + return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_TRUSTED, name, value, size, flags); } diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c index f470e44c4b8d..ecdc4605192c 100644 --- a/fs/ext2/xattr_user.c +++ b/fs/ext2/xattr_user.c @@ -36,7 +36,7 @@ ext2_xattr_user_get(struct dentry *dentry, const char *name, return -EINVAL; if (!test_opt(dentry->d_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext2_xattr_get(dentry->d_inode, EXT2_XATTR_INDEX_USER, + return ext2_xattr_get(d_inode(dentry), 
EXT2_XATTR_INDEX_USER, name, buffer, size); } @@ -49,7 +49,7 @@ ext2_xattr_user_set(struct dentry *dentry, const char *name, if (!test_opt(dentry->d_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext2_xattr_set(dentry->d_inode, EXT2_XATTR_INDEX_USER, + return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_USER, name, value, size, flags); } diff --git a/fs/ext3/file.c b/fs/ext3/file.c index a062fa1e1b11..3b8f650de22c 100644 --- a/fs/ext3/file.c +++ b/fs/ext3/file.c @@ -50,8 +50,6 @@ static int ext3_release_file (struct inode * inode, struct file * filp) const struct file_operations ext3_file_operations = { .llseek = generic_file_llseek, - .read = new_sync_read, - .write = new_sync_write, .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, .unlocked_ioctl = ext3_ioctl, diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index a1b810230cc5..3ad242e5840e 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -210,7 +210,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent) avefreeb = freeb / ngroups; ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); - if ((parent == sb->s_root->d_inode) || + if ((parent == d_inode(sb->s_root)) || (EXT3_I(parent)->i_flags & EXT3_TOPDIR_FL)) { int best_ndir = inodes_per_group; int best_group = -1; diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 2c6ccc49ba27..6c7e5468a2f8 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -27,7 +27,7 @@ #include <linux/writeback.h> #include <linux/mpage.h> #include <linux/namei.h> -#include <linux/aio.h> +#include <linux/uio.h> #include "ext3.h" #include "xattr.h" #include "acl.h" @@ -1820,8 +1820,8 @@ static int ext3_releasepage(struct page *page, gfp_t wait) * crashes then stale disk data _may_ be exposed inside the file. But current * VFS code falls back into buffered path in that case so we are safe. */ -static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb, - struct iov_iter *iter, loff_t offset) +static ssize_t ext3_direct_IO(struct kiocb *iocb, struct iov_iter *iter, + loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; @@ -1832,9 +1832,9 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb, size_t count = iov_iter_count(iter); int retries = 0; - trace_ext3_direct_IO_enter(inode, offset, count, rw); + trace_ext3_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); - if (rw == WRITE) { + if (iov_iter_rw(iter) == WRITE) { loff_t final_size = offset + count; if (final_size > inode->i_size) { @@ -1856,12 +1856,12 @@ static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb, } retry: - ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, ext3_get_block); + ret = blockdev_direct_IO(iocb, inode, iter, offset, ext3_get_block); /* * In case of error extending write may have instantiated a few * blocks outside i_size. Trim these off again. 
*/ - if (unlikely((rw & WRITE) && ret < 0)) { + if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) { loff_t isize = i_size_read(inode); loff_t end = offset + count; @@ -1908,7 +1908,7 @@ retry: ret = err; } out: - trace_ext3_direct_IO_exit(inode, offset, count, rw, ret); + trace_ext3_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret); return ret; } @@ -2999,6 +2999,7 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino) inode->i_op = &ext3_fast_symlink_inode_operations; nd_terminate_link(ei->i_data, inode->i_size, sizeof(ei->i_data) - 1); + inode->i_link = (char *)ei->i_data; } else { inode->i_op = &ext3_symlink_inode_operations; ext3_set_aops(inode); @@ -3240,7 +3241,7 @@ int ext3_write_inode(struct inode *inode, struct writeback_control *wbc) */ int ext3_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int error, rc = 0; const unsigned int ia_valid = attr->ia_valid; diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index f197736dccfa..c9e767cd4b67 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1049,19 +1049,19 @@ struct dentry *ext3_get_parent(struct dentry *child) struct ext3_dir_entry_2 * de; struct buffer_head *bh; - bh = ext3_find_entry(child->d_inode, &dotdot, &de); + bh = ext3_find_entry(d_inode(child), &dotdot, &de); if (!bh) return ERR_PTR(-ENOENT); ino = le32_to_cpu(de->inode); brelse(bh); - if (!ext3_valid_inum(child->d_inode->i_sb, ino)) { - ext3_error(child->d_inode->i_sb, "ext3_get_parent", + if (!ext3_valid_inum(d_inode(child)->i_sb, ino)) { + ext3_error(d_inode(child)->i_sb, "ext3_get_parent", "bad inode number: %lu", ino); return ERR_PTR(-EIO); } - return d_obtain_alias(ext3_iget(child->d_inode->i_sb, ino)); + return d_obtain_alias(ext3_iget(d_inode(child)->i_sb, ino)); } #define S_SHIFT 12 @@ -1243,7 +1243,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, struct inode *inode, struct ext3_dir_entry_2 *de, struct buffer_head * bh) { - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); const char *name = dentry->d_name.name; int namelen = dentry->d_name.len; unsigned long offset = 0; @@ -1330,7 +1330,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, static int make_indexed_dir(handle_t *handle, struct dentry *dentry, struct inode *inode, struct buffer_head *bh) { - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); const char *name = dentry->d_name.name; int namelen = dentry->d_name.len; struct buffer_head *bh2; @@ -1435,7 +1435,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, static int ext3_add_entry (handle_t *handle, struct dentry *dentry, struct inode *inode) { - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); struct buffer_head * bh; struct ext3_dir_entry_2 *de; struct super_block * sb; @@ -1489,7 +1489,7 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, struct dx_entry *entries, *at; struct dx_hash_info hinfo; struct buffer_head * bh; - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); struct super_block * sb = dir->i_sb; struct ext3_dir_entry_2 *de; int err; @@ -2111,7 +2111,7 @@ static int ext3_rmdir (struct inode * dir, struct dentry *dentry) /* Initialize quotas before so that eventual writes go in * separate transaction */ dquot_initialize(dir); - 
dquot_initialize(dentry->d_inode); + dquot_initialize(d_inode(dentry)); handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) @@ -2125,7 +2125,7 @@ static int ext3_rmdir (struct inode * dir, struct dentry *dentry) if (IS_DIRSYNC(dir)) handle->h_sync = 1; - inode = dentry->d_inode; + inode = d_inode(dentry); retval = -EIO; if (le32_to_cpu(de->inode) != inode->i_ino) @@ -2173,7 +2173,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry) /* Initialize quotas before so that eventual writes go * in separate transaction */ dquot_initialize(dir); - dquot_initialize(dentry->d_inode); + dquot_initialize(d_inode(dentry)); handle = ext3_journal_start(dir, EXT3_DELETE_TRANS_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) @@ -2187,7 +2187,7 @@ static int ext3_unlink(struct inode * dir, struct dentry *dentry) if (!bh) goto end_unlink; - inode = dentry->d_inode; + inode = d_inode(dentry); retval = -EIO; if (le32_to_cpu(de->inode) != inode->i_ino) @@ -2308,7 +2308,8 @@ retry: } } else { inode->i_op = &ext3_fast_symlink_inode_operations; - memcpy((char*)&EXT3_I(inode)->i_data,symname,l); + inode->i_link = (char*)&EXT3_I(inode)->i_data; + memcpy(inode->i_link, symname, l); inode->i_size = l-1; } EXT3_I(inode)->i_disksize = inode->i_size; @@ -2328,7 +2329,7 @@ static int ext3_link (struct dentry * old_dentry, struct inode * dir, struct dentry *dentry) { handle_t *handle; - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); int err, retries = 0; if (inode->i_nlink >= EXT3_LINK_MAX) @@ -2391,8 +2392,8 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry, /* Initialize quotas before so that eventual writes go * in separate transaction */ - if (new_dentry->d_inode) - dquot_initialize(new_dentry->d_inode); + if (d_really_is_positive(new_dentry)) + dquot_initialize(d_inode(new_dentry)); handle = ext3_journal_start(old_dir, 2 * EXT3_DATA_TRANS_BLOCKS(old_dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 2); @@ -2409,12 +2410,12 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry, * and merrily kill the link to whatever was created under the * same name. 
Goodbye sticky bit ;-< */ - old_inode = old_dentry->d_inode; + old_inode = d_inode(old_dentry); retval = -ENOENT; if (!old_bh || le32_to_cpu(old_de->inode) != old_inode->i_ino) goto end_rename; - new_inode = new_dentry->d_inode; + new_inode = d_inode(new_dentry); new_bh = ext3_find_entry(new_dir, &new_dentry->d_name, &new_de); if (new_bh) { if (!new_inode) { diff --git a/fs/ext3/super.c b/fs/ext3/super.c index d4dbf3c259b3..5ed0044fbb37 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -789,7 +789,7 @@ static const struct quotactl_ops ext3_qctl_operations = { .quota_on = ext3_quota_on, .quota_off = dquot_quota_off, .quota_sync = dquot_quota_sync, - .get_info = dquot_get_dqinfo, + .get_state = dquot_get_state, .set_info = dquot_set_dqinfo, .get_dqblk = dquot_get_dqblk, .set_dqblk = dquot_set_dqblk @@ -1170,7 +1170,7 @@ static int parse_options (char *options, struct super_block *sb, return 0; } - journal_inode = path.dentry->d_inode; + journal_inode = d_inode(path.dentry); if (!S_ISBLK(journal_inode->i_mode)) { ext3_msg(sb, KERN_ERR, "error: journal path %s " "is not a block device", journal_path); @@ -1908,7 +1908,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) sbi->s_mount_state = le16_to_cpu(es->s_state); sbi->s_addr_per_block_bits = ilog2(EXT3_ADDR_PER_BLOCK(sb)); sbi->s_desc_per_block_bits = ilog2(EXT3_DESC_PER_BLOCK(sb)); - for (i=0; i < 4; i++) + for (i = 0; i < 4; i++) sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); sbi->s_def_hash_version = es->s_def_hash_version; i = le32_to_cpu(es->s_flags); @@ -2947,7 +2947,7 @@ static int ext3_write_info(struct super_block *sb, int type) handle_t *handle; /* Data block + inode block */ - handle = ext3_journal_start(sb->s_root->d_inode, 2); + handle = ext3_journal_start(d_inode(sb->s_root), 2); if (IS_ERR(handle)) return PTR_ERR(handle); ret = dquot_commit_info(sb, type); @@ -2994,7 +2994,7 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id, * When we journal data on quota file, we have to flush journal to see * all updates to the file when we bypass pagecache... */ - if (ext3_should_journal_data(path->dentry->d_inode)) { + if (ext3_should_journal_data(d_inode(path->dentry))) { /* * We don't need to lock updates but journal_flush() could * otherwise be livelocked... 
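The ext2 and ext3 hunks above, like much of this series, consist of one mechanical substitution: open-coded dentry->d_inode dereferences become calls to the VFS accessor helpers. As a reading aid, here is a minimal sketch of the two helpers used in these hunks, as defined in include/linux/dcache.h in this era; the bodies are reproduced for illustration and are not part of this diff:

static inline struct inode *d_inode(const struct dentry *dentry)
{
	/* Same load as the open-coded form; the accessor gives the VFS
	 * a single place to add checks or change representation later. */
	return dentry->d_inode;
}

static inline bool d_really_is_positive(const struct dentry *dentry)
{
	/* True when the dentry currently points at an inode, which is
	 * why the ext3_rename() hunk above uses it for the NULL test. */
	return dentry->d_inode != NULL;
}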
diff --git a/fs/ext3/symlink.c b/fs/ext3/symlink.c index 6b01c3eab1f3..c08c59094ae6 100644 --- a/fs/ext3/symlink.c +++ b/fs/ext3/symlink.c @@ -17,17 +17,9 @@ * ext3 symlink handling code */ -#include <linux/namei.h> #include "ext3.h" #include "xattr.h" -static void * ext3_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - struct ext3_inode_info *ei = EXT3_I(dentry->d_inode); - nd_set_link(nd, (char*)ei->i_data); - return NULL; -} - const struct inode_operations ext3_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = page_follow_link_light, @@ -43,7 +35,7 @@ const struct inode_operations ext3_symlink_inode_operations = { const struct inode_operations ext3_fast_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = ext3_follow_link, + .follow_link = simple_follow_link, .setattr = ext3_setattr, #ifdef CONFIG_EXT3_FS_XATTR .setxattr = generic_setxattr, diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c index c6874be6d58b..7cf36501ccf4 100644 --- a/fs/ext3/xattr.c +++ b/fs/ext3/xattr.c @@ -137,7 +137,7 @@ ext3_xattr_handler(int name_index) /* * Inode operation listxattr() * - * dentry->d_inode->i_mutex: don't care + * d_inode(dentry)->i_mutex: don't care */ ssize_t ext3_listxattr(struct dentry *dentry, char *buffer, size_t size) @@ -355,7 +355,7 @@ ext3_xattr_list_entries(struct dentry *dentry, struct ext3_xattr_entry *entry, static int ext3_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct buffer_head *bh = NULL; int error; @@ -391,7 +391,7 @@ cleanup: static int ext3_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ext3_xattr_ibody_header *header; struct ext3_inode *raw_inode; struct ext3_iloc iloc; @@ -432,7 +432,7 @@ ext3_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) { int i_error, b_error; - down_read(&EXT3_I(dentry->d_inode)->xattr_sem); + down_read(&EXT3_I(d_inode(dentry))->xattr_sem); i_error = ext3_xattr_ibody_list(dentry, buffer, buffer_size); if (i_error < 0) { b_error = 0; @@ -445,7 +445,7 @@ ext3_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) if (b_error < 0) i_error = 0; } - up_read(&EXT3_I(dentry->d_inode)->xattr_sem); + up_read(&EXT3_I(d_inode(dentry))->xattr_sem); return i_error + b_error; } @@ -546,8 +546,7 @@ ext3_xattr_set_entry(struct ext3_xattr_info *i, struct ext3_xattr_search *s) free += EXT3_XATTR_LEN(name_len); } if (i->value) { - if (free < EXT3_XATTR_SIZE(i->value_len) || - free < EXT3_XATTR_LEN(name_len) + + if (free < EXT3_XATTR_LEN(name_len) + EXT3_XATTR_SIZE(i->value_len)) return -ENOSPC; } diff --git a/fs/ext3/xattr_security.c b/fs/ext3/xattr_security.c index 722c2bf9645d..c9506d5e3b13 100644 --- a/fs/ext3/xattr_security.c +++ b/fs/ext3/xattr_security.c @@ -29,7 +29,7 @@ ext3_xattr_security_get(struct dentry *dentry, const char *name, { if (strcmp(name, "") == 0) return -EINVAL; - return ext3_xattr_get(dentry->d_inode, EXT3_XATTR_INDEX_SECURITY, + return ext3_xattr_get(d_inode(dentry), EXT3_XATTR_INDEX_SECURITY, name, buffer, size); } @@ -39,7 +39,7 @@ ext3_xattr_security_set(struct dentry *dentry, const char *name, { if (strcmp(name, "") == 0) return -EINVAL; - return ext3_xattr_set(dentry->d_inode, EXT3_XATTR_INDEX_SECURITY, + return ext3_xattr_set(d_inode(dentry), EXT3_XATTR_INDEX_SECURITY, name, value, size, flags); } diff --git 
a/fs/ext3/xattr_trusted.c b/fs/ext3/xattr_trusted.c index d75727cc67fa..206cc66dc285 100644 --- a/fs/ext3/xattr_trusted.c +++ b/fs/ext3/xattr_trusted.c @@ -32,7 +32,7 @@ ext3_xattr_trusted_get(struct dentry *dentry, const char *name, { if (strcmp(name, "") == 0) return -EINVAL; - return ext3_xattr_get(dentry->d_inode, EXT3_XATTR_INDEX_TRUSTED, + return ext3_xattr_get(d_inode(dentry), EXT3_XATTR_INDEX_TRUSTED, name, buffer, size); } @@ -42,7 +42,7 @@ ext3_xattr_trusted_set(struct dentry *dentry, const char *name, { if (strcmp(name, "") == 0) return -EINVAL; - return ext3_xattr_set(dentry->d_inode, EXT3_XATTR_INDEX_TRUSTED, name, + return ext3_xattr_set(d_inode(dentry), EXT3_XATTR_INDEX_TRUSTED, name, value, size, flags); } diff --git a/fs/ext3/xattr_user.c b/fs/ext3/xattr_user.c index 5612af3567e0..021508ad1616 100644 --- a/fs/ext3/xattr_user.c +++ b/fs/ext3/xattr_user.c @@ -34,7 +34,7 @@ ext3_xattr_user_get(struct dentry *dentry, const char *name, void *buffer, return -EINVAL; if (!test_opt(dentry->d_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext3_xattr_get(dentry->d_inode, EXT3_XATTR_INDEX_USER, + return ext3_xattr_get(d_inode(dentry), EXT3_XATTR_INDEX_USER, name, buffer, size); } @@ -46,7 +46,7 @@ ext3_xattr_user_set(struct dentry *dentry, const char *name, return -EINVAL; if (!test_opt(dentry->d_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext3_xattr_set(dentry->d_inode, EXT3_XATTR_INDEX_USER, + return ext3_xattr_set(d_inode(dentry), EXT3_XATTR_INDEX_USER, name, value, size, flags); } diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index efea5d5c44ce..bf8bc8aba471 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -64,6 +64,29 @@ config EXT4_FS_SECURITY If you are not using a security module that requires using extended attributes for file security labels, say N. +config EXT4_ENCRYPTION + tristate "Ext4 Encryption" + depends on EXT4_FS + select CRYPTO_AES + select CRYPTO_CBC + select CRYPTO_ECB + select CRYPTO_XTS + select CRYPTO_CTS + select CRYPTO_CTR + select CRYPTO_SHA256 + select KEYS + select ENCRYPTED_KEYS + help + Enable encryption of ext4 files and directories. This + feature is similar to ecryptfs, but it is more memory + efficient since it avoids caching the encrypted and + decrypted pages in the page cache. 
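The tristate above is what users select; the C code, however, tests the hidden bool EXT4_FS_ENCRYPTION added immediately below, which defaults to y whenever EXT4_ENCRYPTION is enabled. A plausible reason for the indirection is that a tristate set to m defines CONFIG_EXT4_ENCRYPTION_MODULE rather than CONFIG_EXT4_ENCRYPTION, so a single always-defined bool is simpler to test from #ifdef. A sketch of the guard pattern, using the call this series adds to fs/ext4/dir.c (surrounding context elided):

#ifdef CONFIG_EXT4_FS_ENCRYPTION
	/* Only compiled when the crypto objects are built into ext4. */
	ext4_fname_crypto_free_buffer(&fname_crypto_str);
#endif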
+ +config EXT4_FS_ENCRYPTION + bool + default y + depends on EXT4_ENCRYPTION + config EXT4_DEBUG bool "EXT4 debugging support" depends on EXT4_FS diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index 0310fec2ee3d..75285ea9aa05 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -8,7 +8,9 @@ ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ mmp.o indirect.o extents_status.o xattr.o xattr_user.o \ - xattr_trusted.o inline.o + xattr_trusted.o inline.o readpage.o ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o +ext4-$(CONFIG_EXT4_FS_ENCRYPTION) += crypto_policy.o crypto.o \ + crypto_key.o crypto_fname.o diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index d40c8dbbb0d6..69b1e73026a5 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -4,11 +4,6 @@ * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> */ -#include <linux/init.h> -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/capability.h> -#include <linux/fs.h> #include "ext4_jbd2.h" #include "ext4.h" #include "xattr.h" diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 83a6f497c4e0..cd6ea29be645 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -14,7 +14,6 @@ #include <linux/time.h> #include <linux/capability.h> #include <linux/fs.h> -#include <linux/jbd2.h> #include <linux/quotaops.h> #include <linux/buffer_head.h> #include "ext4.h" @@ -370,7 +369,7 @@ static void ext4_validate_block_bitmap(struct super_block *sb, struct ext4_group_info *grp = ext4_get_group_info(sb, block_group); struct ext4_sb_info *sbi = EXT4_SB(sb); - if (buffer_verified(bh)) + if (buffer_verified(bh) || EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) return; ext4_lock_group(sb, block_group); @@ -447,7 +446,7 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) unlock_buffer(bh); if (err) ext4_error(sb, "Checksum bad for grp %u", block_group); - return bh; + goto verify; } ext4_unlock_group(sb, block_group); if (buffer_uptodate(bh)) { @@ -641,8 +640,6 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, * fail EDQUOT for metadata, but we do account for it. */ if (!(*errp) && (flags & EXT4_MB_DELALLOC_RESERVED)) { - spin_lock(&EXT4_I(inode)->i_block_reservation_lock); - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); dquot_alloc_block_nofail(inode, EXT4_C2B(EXT4_SB(inode->i_sb), ar.len)); } diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c index b610779a958c..4a606afb171f 100644 --- a/fs/ext4/bitmap.c +++ b/fs/ext4/bitmap.c @@ -8,7 +8,6 @@ */ #include <linux/buffer_head.h> -#include <linux/jbd2.h> #include "ext4.h" unsigned int ext4_count_free(char *bitmap, unsigned int numchars) diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 41eb9dcfac7e..3522340c7a99 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -16,7 +16,6 @@ #include <linux/swap.h> #include <linux/pagemap.h> #include <linux/blkdev.h> -#include <linux/mutex.h> #include <linux/slab.h> #include "ext4.h" diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c new file mode 100644 index 000000000000..45731558138c --- /dev/null +++ b/fs/ext4/crypto.c @@ -0,0 +1,475 @@ +/* + * linux/fs/ext4/crypto.c + * + * Copyright (C) 2015, Google, Inc. + * + * This contains encryption functions for ext4 + * + * Written by Michael Halcrow, 2014.
+ * + * Filename encryption additions + * Uday Savagaonkar, 2014 + * Encryption policy handling additions + * Ildar Muslukhov, 2014 + * + * This has not yet undergone a rigorous security audit. + * + * The usage of AES-XTS should conform to recommendations in NIST + * Special Publication 800-38E and IEEE P1619/D16. + */ + +#include <crypto/hash.h> +#include <crypto/sha.h> +#include <keys/user-type.h> +#include <keys/encrypted-type.h> +#include <linux/crypto.h> +#include <linux/ecryptfs.h> +#include <linux/gfp.h> +#include <linux/kernel.h> +#include <linux/key.h> +#include <linux/list.h> +#include <linux/mempool.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/random.h> +#include <linux/scatterlist.h> +#include <linux/spinlock_types.h> + +#include "ext4_extents.h" +#include "xattr.h" + +/* Encryption added and removed here! (L: */ + +static unsigned int num_prealloc_crypto_pages = 32; +static unsigned int num_prealloc_crypto_ctxs = 128; + +module_param(num_prealloc_crypto_pages, uint, 0444); +MODULE_PARM_DESC(num_prealloc_crypto_pages, + "Number of crypto pages to preallocate"); +module_param(num_prealloc_crypto_ctxs, uint, 0444); +MODULE_PARM_DESC(num_prealloc_crypto_ctxs, + "Number of crypto contexts to preallocate"); + +static mempool_t *ext4_bounce_page_pool; + +static LIST_HEAD(ext4_free_crypto_ctxs); +static DEFINE_SPINLOCK(ext4_crypto_ctx_lock); + +static struct kmem_cache *ext4_crypto_ctx_cachep; +struct kmem_cache *ext4_crypt_info_cachep; + +/** + * ext4_release_crypto_ctx() - Releases an encryption context + * @ctx: The encryption context to release. + * + * If the encryption context was allocated from the pre-allocated pool, returns + * it to that pool. Else, frees it. + * + * If there's a bounce page in the context, this frees that. + */ +void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx) +{ + unsigned long flags; + + if (ctx->flags & EXT4_WRITE_PATH_FL && ctx->w.bounce_page) + mempool_free(ctx->w.bounce_page, ext4_bounce_page_pool); + ctx->w.bounce_page = NULL; + ctx->w.control_page = NULL; + if (ctx->flags & EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL) { + kmem_cache_free(ext4_crypto_ctx_cachep, ctx); + } else { + spin_lock_irqsave(&ext4_crypto_ctx_lock, flags); + list_add(&ctx->free_list, &ext4_free_crypto_ctxs); + spin_unlock_irqrestore(&ext4_crypto_ctx_lock, flags); + } +} + +/** + * ext4_get_crypto_ctx() - Gets an encryption context + * @inode: The inode for which we are doing the crypto + * + * Allocates and initializes an encryption context. + * + * Return: An allocated and initialized encryption context on success; error + * value or NULL otherwise. + */ +struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode) +{ + struct ext4_crypto_ctx *ctx = NULL; + int res = 0; + unsigned long flags; + struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; + + if (ci == NULL) + return ERR_PTR(-ENOKEY); + + /* + * We first try getting the ctx from a free list because in + * the common case the ctx will have an allocated and + * initialized crypto tfm, so it's probably a worthwhile + * optimization. For the bounce page, we first try getting it + * from the kernel allocator because that's just about as fast + * as getting it from a list and because a cache of free pages + * should generally be a "last resort" option for a filesystem + * to be able to do its job. 
+ */ + spin_lock_irqsave(&ext4_crypto_ctx_lock, flags); + ctx = list_first_entry_or_null(&ext4_free_crypto_ctxs, + struct ext4_crypto_ctx, free_list); + if (ctx) + list_del(&ctx->free_list); + spin_unlock_irqrestore(&ext4_crypto_ctx_lock, flags); + if (!ctx) { + ctx = kmem_cache_zalloc(ext4_crypto_ctx_cachep, GFP_NOFS); + if (!ctx) { + res = -ENOMEM; + goto out; + } + ctx->flags |= EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL; + } else { + ctx->flags &= ~EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL; + } + ctx->flags &= ~EXT4_WRITE_PATH_FL; + +out: + if (res) { + if (!IS_ERR_OR_NULL(ctx)) + ext4_release_crypto_ctx(ctx); + ctx = ERR_PTR(res); + } + return ctx; +} + +struct workqueue_struct *ext4_read_workqueue; +static DEFINE_MUTEX(crypto_init); + +/** + * ext4_exit_crypto() - Shutdown the ext4 encryption system + */ +void ext4_exit_crypto(void) +{ + struct ext4_crypto_ctx *pos, *n; + + list_for_each_entry_safe(pos, n, &ext4_free_crypto_ctxs, free_list) + kmem_cache_free(ext4_crypto_ctx_cachep, pos); + INIT_LIST_HEAD(&ext4_free_crypto_ctxs); + if (ext4_bounce_page_pool) + mempool_destroy(ext4_bounce_page_pool); + ext4_bounce_page_pool = NULL; + if (ext4_read_workqueue) + destroy_workqueue(ext4_read_workqueue); + ext4_read_workqueue = NULL; + if (ext4_crypto_ctx_cachep) + kmem_cache_destroy(ext4_crypto_ctx_cachep); + ext4_crypto_ctx_cachep = NULL; + if (ext4_crypt_info_cachep) + kmem_cache_destroy(ext4_crypt_info_cachep); + ext4_crypt_info_cachep = NULL; +} + +/** + * ext4_init_crypto() - Set up for ext4 encryption. + * + * We only call this when we start accessing encrypted files, since it + * results in memory getting allocated that wouldn't otherwise be used. + * + * Return: Zero on success, non-zero otherwise. + */ +int ext4_init_crypto(void) +{ + int i, res = -ENOMEM; + + mutex_lock(&crypto_init); + if (ext4_read_workqueue) + goto already_initialized; + ext4_read_workqueue = alloc_workqueue("ext4_crypto", WQ_HIGHPRI, 0); + if (!ext4_read_workqueue) + goto fail; + + ext4_crypto_ctx_cachep = KMEM_CACHE(ext4_crypto_ctx, + SLAB_RECLAIM_ACCOUNT); + if (!ext4_crypto_ctx_cachep) + goto fail; + + ext4_crypt_info_cachep = KMEM_CACHE(ext4_crypt_info, + SLAB_RECLAIM_ACCOUNT); + if (!ext4_crypt_info_cachep) + goto fail; + + for (i = 0; i < num_prealloc_crypto_ctxs; i++) { + struct ext4_crypto_ctx *ctx; + + ctx = kmem_cache_zalloc(ext4_crypto_ctx_cachep, GFP_NOFS); + if (!ctx) { + res = -ENOMEM; + goto fail; + } + list_add(&ctx->free_list, &ext4_free_crypto_ctxs); + } + + ext4_bounce_page_pool = + mempool_create_page_pool(num_prealloc_crypto_pages, 0); + if (!ext4_bounce_page_pool) { + res = -ENOMEM; + goto fail; + } +already_initialized: + mutex_unlock(&crypto_init); + return 0; +fail: + ext4_exit_crypto(); + mutex_unlock(&crypto_init); + return res; +} + +void ext4_restore_control_page(struct page *data_page) +{ + struct ext4_crypto_ctx *ctx = + (struct ext4_crypto_ctx *)page_private(data_page); + + set_page_private(data_page, (unsigned long)NULL); + ClearPagePrivate(data_page); + unlock_page(data_page); + ext4_release_crypto_ctx(ctx); +} + +/** + * ext4_crypt_complete() - The completion callback for page encryption + * @req: The asynchronous encryption request context + * @res: The result of the encryption operation + */ +static void ext4_crypt_complete(struct crypto_async_request *req, int res) +{ + struct ext4_completion_result *ecr = req->data; + + if (res == -EINPROGRESS) + return; + ecr->res = res; + complete(&ecr->completion); +} + +typedef enum { + EXT4_DECRYPT = 0, + EXT4_ENCRYPT, +} ext4_direction_t; + +static 
int ext4_page_crypto(struct ext4_crypto_ctx *ctx, + struct inode *inode, + ext4_direction_t rw, + pgoff_t index, + struct page *src_page, + struct page *dest_page) + +{ + u8 xts_tweak[EXT4_XTS_TWEAK_SIZE]; + struct ablkcipher_request *req = NULL; + DECLARE_EXT4_COMPLETION_RESULT(ecr); + struct scatterlist dst, src; + struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; + struct crypto_ablkcipher *tfm = ci->ci_ctfm; + int res = 0; + + req = ablkcipher_request_alloc(tfm, GFP_NOFS); + if (!req) { + printk_ratelimited(KERN_ERR + "%s: crypto_request_alloc() failed\n", + __func__); + return -ENOMEM; + } + ablkcipher_request_set_callback( + req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + ext4_crypt_complete, &ecr); + + BUILD_BUG_ON(EXT4_XTS_TWEAK_SIZE < sizeof(index)); + memcpy(xts_tweak, &index, sizeof(index)); + memset(&xts_tweak[sizeof(index)], 0, + EXT4_XTS_TWEAK_SIZE - sizeof(index)); + + sg_init_table(&dst, 1); + sg_set_page(&dst, dest_page, PAGE_CACHE_SIZE, 0); + sg_init_table(&src, 1); + sg_set_page(&src, src_page, PAGE_CACHE_SIZE, 0); + ablkcipher_request_set_crypt(req, &src, &dst, PAGE_CACHE_SIZE, + xts_tweak); + if (rw == EXT4_DECRYPT) + res = crypto_ablkcipher_decrypt(req); + else + res = crypto_ablkcipher_encrypt(req); + if (res == -EINPROGRESS || res == -EBUSY) { + BUG_ON(req->base.data != &ecr); + wait_for_completion(&ecr.completion); + res = ecr.res; + } + ablkcipher_request_free(req); + if (res) { + printk_ratelimited( + KERN_ERR + "%s: crypto_ablkcipher_encrypt() returned %d\n", + __func__, res); + return res; + } + return 0; +} + +static struct page *alloc_bounce_page(struct ext4_crypto_ctx *ctx) +{ + ctx->w.bounce_page = mempool_alloc(ext4_bounce_page_pool, GFP_NOWAIT); + if (ctx->w.bounce_page == NULL) + return ERR_PTR(-ENOMEM); + ctx->flags |= EXT4_WRITE_PATH_FL; + return ctx->w.bounce_page; +} + +/** + * ext4_encrypt() - Encrypts a page + * @inode: The inode for which the encryption should take place + * @plaintext_page: The page to encrypt. Must be locked. + * + * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx + * encryption context. + * + * Called on the page write path. The caller must call + * ext4_restore_control_page() on the returned ciphertext page to + * release the bounce buffer and the encryption context. + * + * Return: An allocated page with the encrypted content on success. Else, an + * error value or NULL. + */ +struct page *ext4_encrypt(struct inode *inode, + struct page *plaintext_page) +{ + struct ext4_crypto_ctx *ctx; + struct page *ciphertext_page = NULL; + int err; + + BUG_ON(!PageLocked(plaintext_page)); + + ctx = ext4_get_crypto_ctx(inode); + if (IS_ERR(ctx)) + return (struct page *) ctx; + + /* The encryption operation will require a bounce page. */ + ciphertext_page = alloc_bounce_page(ctx); + if (IS_ERR(ciphertext_page)) + goto errout; + ctx->w.control_page = plaintext_page; + err = ext4_page_crypto(ctx, inode, EXT4_ENCRYPT, plaintext_page->index, + plaintext_page, ciphertext_page); + if (err) { + ciphertext_page = ERR_PTR(err); + errout: + ext4_release_crypto_ctx(ctx); + return ciphertext_page; + } + SetPagePrivate(ciphertext_page); + set_page_private(ciphertext_page, (unsigned long)ctx); + lock_page(ciphertext_page); + return ciphertext_page; +} + +/** + * ext4_decrypt() - Decrypts a page in-place + * @ctx: The encryption context. + * @page: The page to decrypt. Must be locked. + * + * Decrypts page in-place using the ctx encryption context. + * + * Called from the read completion callback. 
+ * + * Return: Zero on success, non-zero otherwise. + */ +int ext4_decrypt(struct ext4_crypto_ctx *ctx, struct page *page) +{ + BUG_ON(!PageLocked(page)); + + return ext4_page_crypto(ctx, page->mapping->host, + EXT4_DECRYPT, page->index, page, page); +} + +/* + * Convenience function which takes care of allocating and + * deallocating the encryption context + */ +int ext4_decrypt_one(struct inode *inode, struct page *page) +{ + int ret; + + struct ext4_crypto_ctx *ctx = ext4_get_crypto_ctx(inode); + + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + ret = ext4_decrypt(ctx, page); + ext4_release_crypto_ctx(ctx); + return ret; +} + +int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex) +{ + struct ext4_crypto_ctx *ctx; + struct page *ciphertext_page = NULL; + struct bio *bio; + ext4_lblk_t lblk = le32_to_cpu(ex->ee_block); /* ee_block is on-disk LE */ + ext4_fsblk_t pblk = ext4_ext_pblock(ex); + unsigned int len = ext4_ext_get_actual_len(ex); + int err = 0; + + BUG_ON(inode->i_sb->s_blocksize != PAGE_CACHE_SIZE); + + ctx = ext4_get_crypto_ctx(inode); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ciphertext_page = alloc_bounce_page(ctx); + if (IS_ERR(ciphertext_page)) { + err = PTR_ERR(ciphertext_page); + goto errout; + } + + while (len--) { + err = ext4_page_crypto(ctx, inode, EXT4_ENCRYPT, lblk, + ZERO_PAGE(0), ciphertext_page); + if (err) + goto errout; + + bio = bio_alloc(GFP_KERNEL, 1); + if (!bio) { + err = -ENOMEM; + goto errout; + } + bio->bi_bdev = inode->i_sb->s_bdev; + /* bi_sector is in 512-byte units, not filesystem blocks */ + bio->bi_iter.bi_sector = + pblk << (inode->i_sb->s_blocksize_bits - 9); + /* bio_add_page() returns the number of bytes added, not an errno */ + if (bio_add_page(bio, ciphertext_page, + inode->i_sb->s_blocksize, 0) != + inode->i_sb->s_blocksize) { + bio_put(bio); + err = -EIO; + goto errout; + } + err = submit_bio_wait(WRITE, bio); + bio_put(bio); + if (err) + goto errout; + lblk++; + pblk++; + } + err = 0; +errout: + ext4_release_crypto_ctx(ctx); + return err; +} + +bool ext4_valid_contents_enc_mode(uint32_t mode) +{ + return (mode == EXT4_ENCRYPTION_MODE_AES_256_XTS); +} + +/** + * ext4_validate_encryption_key_size() - Validate the encryption key size + * @mode: The key mode. + * @size: The key size to validate. + * + * Return: The validated key size for @mode. Zero if invalid. + */ +uint32_t ext4_validate_encryption_key_size(uint32_t mode, uint32_t size) +{ + if (size == ext4_encryption_key_size(mode)) + return size; + return 0; +} diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c new file mode 100644 index 000000000000..7dc4eb55913c --- /dev/null +++ b/fs/ext4/crypto_fname.c @@ -0,0 +1,469 @@ +/* + * linux/fs/ext4/crypto_fname.c + * + * Copyright (C) 2015, Google, Inc. + * + * This contains functions for filename crypto management in ext4 + * + * Written by Uday Savagaonkar, 2014. + * + * This has not yet undergone a rigorous security audit.
+ * + */ + +#include <crypto/hash.h> +#include <crypto/sha.h> +#include <keys/encrypted-type.h> +#include <keys/user-type.h> +#include <linux/crypto.h> +#include <linux/gfp.h> +#include <linux/kernel.h> +#include <linux/key.h> +#include <linux/list.h> +#include <linux/mempool.h> +#include <linux/random.h> +#include <linux/scatterlist.h> +#include <linux/spinlock_types.h> + +#include "ext4.h" +#include "ext4_crypto.h" +#include "xattr.h" + +/** + * ext4_dir_crypt_complete() - Completion callback for filename crypto requests + */ +static void ext4_dir_crypt_complete(struct crypto_async_request *req, int res) +{ + struct ext4_completion_result *ecr = req->data; + + if (res == -EINPROGRESS) + return; + ecr->res = res; + complete(&ecr->completion); +} + +bool ext4_valid_filenames_enc_mode(uint32_t mode) +{ + return (mode == EXT4_ENCRYPTION_MODE_AES_256_CTS); +} + +static unsigned max_name_len(struct inode *inode) +{ + return S_ISLNK(inode->i_mode) ? inode->i_sb->s_blocksize : + EXT4_NAME_LEN; +} + +/** + * ext4_fname_encrypt() - Encrypt a filename + * + * This function encrypts the input filename, and returns the length of the + * ciphertext. Errors are returned as negative numbers. We trust the caller to + * allocate sufficient memory for the oname string. + */ +static int ext4_fname_encrypt(struct inode *inode, + const struct qstr *iname, + struct ext4_str *oname) +{ + u32 ciphertext_len; + struct ablkcipher_request *req = NULL; + DECLARE_EXT4_COMPLETION_RESULT(ecr); + struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; + struct crypto_ablkcipher *tfm = ci->ci_ctfm; + int res = 0; + char iv[EXT4_CRYPTO_BLOCK_SIZE]; + struct scatterlist src_sg, dst_sg; + int padding = 4 << (ci->ci_flags & EXT4_POLICY_FLAGS_PAD_MASK); + char *workbuf, buf[32], *alloc_buf = NULL; + unsigned lim = max_name_len(inode); + + if (iname->len <= 0 || iname->len > lim) + return -EIO; + + ciphertext_len = (iname->len < EXT4_CRYPTO_BLOCK_SIZE) ? + EXT4_CRYPTO_BLOCK_SIZE : iname->len; + ciphertext_len = ext4_fname_crypto_round_up(ciphertext_len, padding); + ciphertext_len = (ciphertext_len > lim) + ? 
lim : ciphertext_len; + + if (ciphertext_len <= sizeof(buf)) { + workbuf = buf; + } else { + alloc_buf = kmalloc(ciphertext_len, GFP_NOFS); + if (!alloc_buf) + return -ENOMEM; + workbuf = alloc_buf; + } + + /* Allocate request */ + req = ablkcipher_request_alloc(tfm, GFP_NOFS); + if (!req) { + printk_ratelimited( + KERN_ERR "%s: crypto_request_alloc() failed\n", __func__); + kfree(alloc_buf); + return -ENOMEM; + } + ablkcipher_request_set_callback(req, + CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + ext4_dir_crypt_complete, &ecr); + + /* Copy the input */ + memcpy(workbuf, iname->name, iname->len); + if (iname->len < ciphertext_len) + memset(workbuf + iname->len, 0, ciphertext_len - iname->len); + + /* Initialize IV */ + memset(iv, 0, EXT4_CRYPTO_BLOCK_SIZE); + + /* Create encryption request */ + sg_init_one(&src_sg, workbuf, ciphertext_len); + sg_init_one(&dst_sg, oname->name, ciphertext_len); + ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv); + res = crypto_ablkcipher_encrypt(req); + if (res == -EINPROGRESS || res == -EBUSY) { + BUG_ON(req->base.data != &ecr); + wait_for_completion(&ecr.completion); + res = ecr.res; + } + kfree(alloc_buf); + ablkcipher_request_free(req); + if (res < 0) { + printk_ratelimited( + KERN_ERR "%s: Error (error code %d)\n", __func__, res); + } + oname->len = ciphertext_len; + return res; +} + +/* + * ext4_fname_decrypt() + * This function decrypts the input filename, and returns + * the length of the plaintext. + * Errors are returned as negative numbers. + * We trust the caller to allocate sufficient memory for the oname string. + */ +static int ext4_fname_decrypt(struct inode *inode, + const struct ext4_str *iname, + struct ext4_str *oname) +{ + struct ext4_str tmp_in[2], tmp_out[1]; + struct ablkcipher_request *req = NULL; + DECLARE_EXT4_COMPLETION_RESULT(ecr); + struct scatterlist src_sg, dst_sg; + struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; + struct crypto_ablkcipher *tfm = ci->ci_ctfm; + int res = 0; + char iv[EXT4_CRYPTO_BLOCK_SIZE]; + unsigned lim = max_name_len(inode); + + if (iname->len <= 0 || iname->len > lim) + return -EIO; + + tmp_in[0].name = iname->name; + tmp_in[0].len = iname->len; + tmp_out[0].name = oname->name; + + /* Allocate request */ + req = ablkcipher_request_alloc(tfm, GFP_NOFS); + if (!req) { + printk_ratelimited( + KERN_ERR "%s: crypto_request_alloc() failed\n", __func__); + return -ENOMEM; + } + ablkcipher_request_set_callback(req, + CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + ext4_dir_crypt_complete, &ecr); + + /* Initialize IV */ + memset(iv, 0, EXT4_CRYPTO_BLOCK_SIZE); + + /* Create decryption request */ + sg_init_one(&src_sg, iname->name, iname->len); + sg_init_one(&dst_sg, oname->name, oname->len); + ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv); + res = crypto_ablkcipher_decrypt(req); + if (res == -EINPROGRESS || res == -EBUSY) { + BUG_ON(req->base.data != &ecr); + wait_for_completion(&ecr.completion); + res = ecr.res; + } + ablkcipher_request_free(req); + if (res < 0) { + printk_ratelimited( + KERN_ERR "%s: Error in ext4_fname_decrypt (error code %d)\n", + __func__, res); + return res; + } + + oname->len = strnlen(oname->name, iname->len); + return oname->len; +} + +static const char *lookup_table = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,"; + +/** + * digest_encode() - + * + * Encodes the input digest using characters from the set [A-Za-z0-9+,].
+ * The encoded string is roughly 4/3 times the size of the input string. + */ +static int digest_encode(const char *src, int len, char *dst) +{ + int i = 0, bits = 0, ac = 0; + char *cp = dst; + + while (i < len) { + ac += (((unsigned char) src[i]) << bits); + bits += 8; + do { + *cp++ = lookup_table[ac & 0x3f]; + ac >>= 6; + bits -= 6; + } while (bits >= 6); + i++; + } + if (bits) + *cp++ = lookup_table[ac & 0x3f]; + return cp - dst; +} + +static int digest_decode(const char *src, int len, char *dst) +{ + int i = 0, bits = 0, ac = 0; + const char *p; + char *cp = dst; + + while (i < len) { + p = strchr(lookup_table, src[i]); + if (p == NULL || src[i] == 0) + return -2; + ac += (p - lookup_table) << bits; + bits += 6; + if (bits >= 8) { + *cp++ = ac & 0xff; + ac >>= 8; + bits -= 8; + } + i++; + } + if (ac) + return -1; + return cp - dst; +} + +/** + * ext4_fname_crypto_round_up() - + * + * Return: The next multiple of block size + */ +u32 ext4_fname_crypto_round_up(u32 size, u32 blksize) +{ + return ((size+blksize-1)/blksize)*blksize; +} + +unsigned ext4_fname_encrypted_size(struct inode *inode, u32 ilen) +{ + struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; + int padding = 32; + + if (ci) + padding = 4 << (ci->ci_flags & EXT4_POLICY_FLAGS_PAD_MASK); + if (ilen < EXT4_CRYPTO_BLOCK_SIZE) + ilen = EXT4_CRYPTO_BLOCK_SIZE; + return ext4_fname_crypto_round_up(ilen, padding); +} + +/* + * ext4_fname_crypto_alloc_buffer() - + * + * Allocates an output buffer that is sufficient for the crypto operation + * specified by the context and the direction. + */ +int ext4_fname_crypto_alloc_buffer(struct inode *inode, + u32 ilen, struct ext4_str *crypto_str) +{ + unsigned int olen = ext4_fname_encrypted_size(inode, ilen); + + crypto_str->len = olen; + if (olen < EXT4_FNAME_CRYPTO_DIGEST_SIZE*2) + olen = EXT4_FNAME_CRYPTO_DIGEST_SIZE*2; + /* Allocated buffer can hold one more character to null-terminate the + * string */ + crypto_str->name = kmalloc(olen+1, GFP_NOFS); + if (!(crypto_str->name)) + return -ENOMEM; + return 0; +} + +/** + * ext4_fname_crypto_free_buffer() - + * + * Frees the buffer allocated for crypto operation. + */ +void ext4_fname_crypto_free_buffer(struct ext4_str *crypto_str) +{ + if (!crypto_str) + return; + kfree(crypto_str->name); + crypto_str->name = NULL; +} + +/** + * ext4_fname_disk_to_usr() - converts a filename from disk space to user space + */ +int _ext4_fname_disk_to_usr(struct inode *inode, + struct dx_hash_info *hinfo, + const struct ext4_str *iname, + struct ext4_str *oname) +{ + char buf[24]; + int ret; + + if (iname->len < 3) { + /*Check for . and .. */ + if (iname->name[0] == '.' 
&& iname->name[iname->len-1] == '.') { + oname->name[0] = '.'; + oname->name[iname->len-1] = '.'; + oname->len = iname->len; + return oname->len; + } + } + if (EXT4_I(inode)->i_crypt_info) + return ext4_fname_decrypt(inode, iname, oname); + + if (iname->len <= EXT4_FNAME_CRYPTO_DIGEST_SIZE) { + ret = digest_encode(iname->name, iname->len, oname->name); + oname->len = ret; + return ret; + } + if (hinfo) { + memcpy(buf, &hinfo->hash, 4); + memcpy(buf+4, &hinfo->minor_hash, 4); + } else + memset(buf, 0, 8); + memcpy(buf + 8, iname->name + iname->len - 16, 16); + oname->name[0] = '_'; + ret = digest_encode(buf, 24, oname->name+1); + oname->len = ret + 1; + return ret + 1; +} + +int ext4_fname_disk_to_usr(struct inode *inode, + struct dx_hash_info *hinfo, + const struct ext4_dir_entry_2 *de, + struct ext4_str *oname) +{ + struct ext4_str iname = {.name = (unsigned char *) de->name, + .len = de->name_len }; + + return _ext4_fname_disk_to_usr(inode, hinfo, &iname, oname); +} + + +/** + * ext4_fname_usr_to_disk() - converts a filename from user space to disk space + */ +int ext4_fname_usr_to_disk(struct inode *inode, + const struct qstr *iname, + struct ext4_str *oname) +{ + int res; + struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; + + if (iname->len < 3) { + /*Check for . and .. */ + if (iname->name[0] == '.' && + iname->name[iname->len-1] == '.') { + oname->name[0] = '.'; + oname->name[iname->len-1] = '.'; + oname->len = iname->len; + return oname->len; + } + } + if (ci) { + res = ext4_fname_encrypt(inode, iname, oname); + return res; + } + /* Without a proper key, a user is not allowed to modify the filenames + * in a directory. Consequently, a user space name cannot be mapped to + * a disk-space name */ + return -EACCES; +} + +int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, + int lookup, struct ext4_filename *fname) +{ + struct ext4_crypt_info *ci; + int ret = 0, bigname = 0; + + memset(fname, 0, sizeof(struct ext4_filename)); + fname->usr_fname = iname; + + if (!ext4_encrypted_inode(dir) || + ((iname->name[0] == '.') && + ((iname->len == 1) || + ((iname->name[1] == '.') && (iname->len == 2))))) { + fname->disk_name.name = (unsigned char *) iname->name; + fname->disk_name.len = iname->len; + return 0; + } + ret = ext4_get_encryption_info(dir); + if (ret) + return ret; + ci = EXT4_I(dir)->i_crypt_info; + if (ci) { + ret = ext4_fname_crypto_alloc_buffer(dir, iname->len, + &fname->crypto_buf); + if (ret < 0) + return ret; + ret = ext4_fname_encrypt(dir, iname, &fname->crypto_buf); + if (ret < 0) + goto errout; + fname->disk_name.name = fname->crypto_buf.name; + fname->disk_name.len = fname->crypto_buf.len; + return 0; + } + if (!lookup) + return -EACCES; + + /* We don't have the key and we are doing a lookup; decode the + * user-supplied name + */ + if (iname->name[0] == '_') + bigname = 1; + if ((bigname && (iname->len != 33)) || + (!bigname && (iname->len > 43))) + return -ENOENT; + + fname->crypto_buf.name = kmalloc(32, GFP_KERNEL); + if (fname->crypto_buf.name == NULL) + return -ENOMEM; + ret = digest_decode(iname->name + bigname, iname->len - bigname, + fname->crypto_buf.name); + if (ret < 0) { + ret = -ENOENT; + goto errout; + } + fname->crypto_buf.len = ret; + if (bigname) { + memcpy(&fname->hinfo.hash, fname->crypto_buf.name, 4); + memcpy(&fname->hinfo.minor_hash, fname->crypto_buf.name + 4, 4); + } else { + fname->disk_name.name = fname->crypto_buf.name; + fname->disk_name.len = fname->crypto_buf.len; + } + return 0; +errout: + 
kfree(fname->crypto_buf.name); + fname->crypto_buf.name = NULL; + return ret; +} + +void ext4_fname_free_filename(struct ext4_filename *fname) +{ + kfree(fname->crypto_buf.name); + fname->crypto_buf.name = NULL; + fname->usr_fname = NULL; + fname->disk_name.name = NULL; +} diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c new file mode 100644 index 000000000000..442d24e8efc0 --- /dev/null +++ b/fs/ext4/crypto_key.c @@ -0,0 +1,260 @@ +/* + * linux/fs/ext4/crypto_key.c + * + * Copyright (C) 2015, Google, Inc. + * + * This contains encryption key functions for ext4 + * + * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015. + */ + +#include <keys/encrypted-type.h> +#include <keys/user-type.h> +#include <linux/random.h> +#include <linux/scatterlist.h> +#include <uapi/linux/keyctl.h> + +#include "ext4.h" +#include "xattr.h" + +static void derive_crypt_complete(struct crypto_async_request *req, int rc) +{ + struct ext4_completion_result *ecr = req->data; + + if (rc == -EINPROGRESS) + return; + + ecr->res = rc; + complete(&ecr->completion); +} + +/** + * ext4_derive_key_aes() - Derive a key using AES-128-ECB + * @deriving_key: Encryption key used for derivation. + * @source_key: Source key to which to apply derivation. + * @derived_key: Derived key. + * + * Return: Zero on success; non-zero otherwise. + */ +static int ext4_derive_key_aes(char deriving_key[EXT4_AES_128_ECB_KEY_SIZE], + char source_key[EXT4_AES_256_XTS_KEY_SIZE], + char derived_key[EXT4_AES_256_XTS_KEY_SIZE]) +{ + int res = 0; + struct ablkcipher_request *req = NULL; + DECLARE_EXT4_COMPLETION_RESULT(ecr); + struct scatterlist src_sg, dst_sg; + struct crypto_ablkcipher *tfm = crypto_alloc_ablkcipher("ecb(aes)", 0, + 0); + + if (IS_ERR(tfm)) { + res = PTR_ERR(tfm); + tfm = NULL; + goto out; + } + crypto_ablkcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY); + req = ablkcipher_request_alloc(tfm, GFP_NOFS); + if (!req) { + res = -ENOMEM; + goto out; + } + ablkcipher_request_set_callback(req, + CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + derive_crypt_complete, &ecr); + res = crypto_ablkcipher_setkey(tfm, deriving_key, + EXT4_AES_128_ECB_KEY_SIZE); + if (res < 0) + goto out; + sg_init_one(&src_sg, source_key, EXT4_AES_256_XTS_KEY_SIZE); + sg_init_one(&dst_sg, derived_key, EXT4_AES_256_XTS_KEY_SIZE); + ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, + EXT4_AES_256_XTS_KEY_SIZE, NULL); + res = crypto_ablkcipher_encrypt(req); + if (res == -EINPROGRESS || res == -EBUSY) { + BUG_ON(req->base.data != &ecr); + wait_for_completion(&ecr.completion); + res = ecr.res; + } + +out: + if (req) + ablkcipher_request_free(req); + if (tfm) + crypto_free_ablkcipher(tfm); + return res; +} + +void ext4_free_crypt_info(struct ext4_crypt_info *ci) +{ + if (!ci) + return; + + if (ci->ci_keyring_key) + key_put(ci->ci_keyring_key); + crypto_free_ablkcipher(ci->ci_ctfm); + kmem_cache_free(ext4_crypt_info_cachep, ci); +} + +void ext4_free_encryption_info(struct inode *inode, + struct ext4_crypt_info *ci) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_crypt_info *prev; + + if (ci == NULL) + ci = ACCESS_ONCE(ei->i_crypt_info); + if (ci == NULL) + return; + prev = cmpxchg(&ei->i_crypt_info, ci, NULL); + if (prev != ci) + return; + + ext4_free_crypt_info(ci); +} + +int _ext4_get_encryption_info(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_crypt_info *crypt_info; + char full_key_descriptor[EXT4_KEY_DESC_PREFIX_SIZE + + (EXT4_KEY_DESCRIPTOR_SIZE * 2) + 1]; + struct key 
*keyring_key = NULL; + struct ext4_encryption_key *master_key; + struct ext4_encryption_context ctx; + struct user_key_payload *ukp; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + struct crypto_ablkcipher *ctfm; + const char *cipher_str; + char raw_key[EXT4_MAX_KEY_SIZE]; + char mode; + int res; + + if (!ext4_read_workqueue) { + res = ext4_init_crypto(); + if (res) + return res; + } + +retry: + crypt_info = ACCESS_ONCE(ei->i_crypt_info); + if (crypt_info) { + if (!crypt_info->ci_keyring_key || + key_validate(crypt_info->ci_keyring_key) == 0) + return 0; + ext4_free_encryption_info(inode, crypt_info); + goto retry; + } + + res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, + &ctx, sizeof(ctx)); + if (res < 0) { + if (!DUMMY_ENCRYPTION_ENABLED(sbi)) + return res; + ctx.contents_encryption_mode = EXT4_ENCRYPTION_MODE_AES_256_XTS; + ctx.filenames_encryption_mode = + EXT4_ENCRYPTION_MODE_AES_256_CTS; + ctx.flags = 0; + } else if (res != sizeof(ctx)) + return -EINVAL; + res = 0; + + crypt_info = kmem_cache_alloc(ext4_crypt_info_cachep, GFP_KERNEL); + if (!crypt_info) + return -ENOMEM; + + crypt_info->ci_flags = ctx.flags; + crypt_info->ci_data_mode = ctx.contents_encryption_mode; + crypt_info->ci_filename_mode = ctx.filenames_encryption_mode; + crypt_info->ci_ctfm = NULL; + crypt_info->ci_keyring_key = NULL; + memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, + sizeof(crypt_info->ci_master_key)); + if (S_ISREG(inode->i_mode)) + mode = crypt_info->ci_data_mode; + else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + mode = crypt_info->ci_filename_mode; + else + BUG(); + switch (mode) { + case EXT4_ENCRYPTION_MODE_AES_256_XTS: + cipher_str = "xts(aes)"; + break; + case EXT4_ENCRYPTION_MODE_AES_256_CTS: + cipher_str = "cts(cbc(aes))"; + break; + default: + printk_once(KERN_WARNING + "ext4: unsupported key mode %d (ino %u)\n", + mode, (unsigned) inode->i_ino); + res = -ENOKEY; + goto out; + } + if (DUMMY_ENCRYPTION_ENABLED(sbi)) { + memset(raw_key, 0x42, EXT4_AES_256_XTS_KEY_SIZE); + goto got_key; + } + memcpy(full_key_descriptor, EXT4_KEY_DESC_PREFIX, + EXT4_KEY_DESC_PREFIX_SIZE); + sprintf(full_key_descriptor + EXT4_KEY_DESC_PREFIX_SIZE, + "%*phN", EXT4_KEY_DESCRIPTOR_SIZE, + ctx.master_key_descriptor); + full_key_descriptor[EXT4_KEY_DESC_PREFIX_SIZE + + (2 * EXT4_KEY_DESCRIPTOR_SIZE)] = '\0'; + keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL); + if (IS_ERR(keyring_key)) { + res = PTR_ERR(keyring_key); + keyring_key = NULL; + goto out; + } + crypt_info->ci_keyring_key = keyring_key; + BUG_ON(keyring_key->type != &key_type_logon); + ukp = ((struct user_key_payload *)keyring_key->payload.data); + if (ukp->datalen != sizeof(struct ext4_encryption_key)) { + res = -EINVAL; + goto out; + } + master_key = (struct ext4_encryption_key *)ukp->data; + BUILD_BUG_ON(EXT4_AES_128_ECB_KEY_SIZE != + EXT4_KEY_DERIVATION_NONCE_SIZE); + BUG_ON(master_key->size != EXT4_AES_256_XTS_KEY_SIZE); + res = ext4_derive_key_aes(ctx.nonce, master_key->raw, + raw_key); +got_key: + ctfm = crypto_alloc_ablkcipher(cipher_str, 0, 0); + if (!ctfm || IS_ERR(ctfm)) { + res = ctfm ? 
PTR_ERR(ctfm) : -ENOMEM; + printk(KERN_DEBUG + "%s: error %d (inode %u) allocating crypto tfm\n", + __func__, res, (unsigned) inode->i_ino); + goto out; + } + crypt_info->ci_ctfm = ctfm; + crypto_ablkcipher_clear_flags(ctfm, ~0); + crypto_tfm_set_flags(crypto_ablkcipher_tfm(ctfm), + CRYPTO_TFM_REQ_WEAK_KEY); + res = crypto_ablkcipher_setkey(ctfm, raw_key, + ext4_encryption_key_size(mode)); + if (res) + goto out; + memzero_explicit(raw_key, sizeof(raw_key)); + if (cmpxchg(&ei->i_crypt_info, NULL, crypt_info) != NULL) { + ext4_free_crypt_info(crypt_info); + goto retry; + } + return 0; + +out: + if (res == -ENOKEY) + res = 0; + ext4_free_crypt_info(crypt_info); + memzero_explicit(raw_key, sizeof(raw_key)); + return res; +} + +int ext4_has_encryption_key(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + + return (ei->i_crypt_info != NULL); +} diff --git a/fs/ext4/crypto_policy.c b/fs/ext4/crypto_policy.c new file mode 100644 index 000000000000..02c4e5df7afb --- /dev/null +++ b/fs/ext4/crypto_policy.c @@ -0,0 +1,215 @@ +/* + * linux/fs/ext4/crypto_policy.c + * + * Copyright (C) 2015, Google, Inc. + * + * This contains encryption policy functions for ext4 + * + * Written by Michael Halcrow, 2015. + */ + +#include <linux/random.h> +#include <linux/string.h> +#include <linux/types.h> + +#include "ext4.h" +#include "xattr.h" + +static int ext4_inode_has_encryption_context(struct inode *inode) +{ + int res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, NULL, 0); + return (res > 0); +} + +/* + * check whether the policy is consistent with the encryption context + * for the inode + */ +static int ext4_is_encryption_context_consistent_with_policy( + struct inode *inode, const struct ext4_encryption_policy *policy) +{ + struct ext4_encryption_context ctx; + int res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, + sizeof(ctx)); + if (res != sizeof(ctx)) + return 0; + return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor, + EXT4_KEY_DESCRIPTOR_SIZE) == 0 && + (ctx.flags == + policy->flags) && + (ctx.contents_encryption_mode == + policy->contents_encryption_mode) && + (ctx.filenames_encryption_mode == + policy->filenames_encryption_mode)); +} + +static int ext4_create_encryption_context_from_policy( + struct inode *inode, const struct ext4_encryption_policy *policy) +{ + struct ext4_encryption_context ctx; + int res = 0; + + res = ext4_convert_inline_data(inode); + if (res) + return res; + + ctx.format = EXT4_ENCRYPTION_CONTEXT_FORMAT_V1; + memcpy(ctx.master_key_descriptor, policy->master_key_descriptor, + EXT4_KEY_DESCRIPTOR_SIZE); + if (!ext4_valid_contents_enc_mode(policy->contents_encryption_mode)) { + printk(KERN_WARNING + "%s: Invalid contents encryption mode %d\n", __func__, + policy->contents_encryption_mode); + return -EINVAL; + } + if (!ext4_valid_filenames_enc_mode(policy->filenames_encryption_mode)) { + printk(KERN_WARNING + "%s: Invalid filenames encryption mode %d\n", __func__, + policy->filenames_encryption_mode); + return -EINVAL; + } + if (policy->flags & ~EXT4_POLICY_FLAGS_VALID) + return -EINVAL; + ctx.contents_encryption_mode = policy->contents_encryption_mode; + ctx.filenames_encryption_mode = policy->filenames_encryption_mode; + ctx.flags = policy->flags; + BUILD_BUG_ON(sizeof(ctx.nonce) != EXT4_KEY_DERIVATION_NONCE_SIZE); + get_random_bytes(ctx.nonce, EXT4_KEY_DERIVATION_NONCE_SIZE); + + res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION, + 
EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, + sizeof(ctx), 0); + if (!res) + ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); + return res; +} + +int ext4_process_policy(const struct ext4_encryption_policy *policy, + struct inode *inode) +{ + if (policy->version != 0) + return -EINVAL; + + if (!ext4_inode_has_encryption_context(inode)) { + if (!S_ISDIR(inode->i_mode)) + return -EINVAL; + if (!ext4_empty_dir(inode)) + return -ENOTEMPTY; + return ext4_create_encryption_context_from_policy(inode, + policy); + } + + if (ext4_is_encryption_context_consistent_with_policy(inode, policy)) + return 0; + + printk(KERN_WARNING "%s: Policy inconsistent with encryption context\n", + __func__); + return -EINVAL; +} + +int ext4_get_policy(struct inode *inode, struct ext4_encryption_policy *policy) +{ + struct ext4_encryption_context ctx; + + int res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, + &ctx, sizeof(ctx)); + if (res != sizeof(ctx)) + return -ENOENT; + if (ctx.format != EXT4_ENCRYPTION_CONTEXT_FORMAT_V1) + return -EINVAL; + policy->version = 0; + policy->contents_encryption_mode = ctx.contents_encryption_mode; + policy->filenames_encryption_mode = ctx.filenames_encryption_mode; + policy->flags = ctx.flags; + memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor, + EXT4_KEY_DESCRIPTOR_SIZE); + return 0; +} + +int ext4_is_child_context_consistent_with_parent(struct inode *parent, + struct inode *child) +{ + struct ext4_crypt_info *parent_ci, *child_ci; + int res; + + if ((parent == NULL) || (child == NULL)) { + pr_err("parent %p child %p\n", parent, child); + BUG_ON(1); + } + /* no restrictions if the parent directory is not encrypted */ + if (!ext4_encrypted_inode(parent)) + return 1; + /* if the child directory is not encrypted, this is always a problem */ + if (!ext4_encrypted_inode(child)) + return 0; + res = ext4_get_encryption_info(parent); + if (res) + return 0; + res = ext4_get_encryption_info(child); + if (res) + return 0; + parent_ci = EXT4_I(parent)->i_crypt_info; + child_ci = EXT4_I(child)->i_crypt_info; + if (!parent_ci && !child_ci) + return 1; + if (!parent_ci || !child_ci) + return 0; + + return (memcmp(parent_ci->ci_master_key, + child_ci->ci_master_key, + EXT4_KEY_DESCRIPTOR_SIZE) == 0 && + (parent_ci->ci_data_mode == child_ci->ci_data_mode) && + (parent_ci->ci_filename_mode == child_ci->ci_filename_mode) && + (parent_ci->ci_flags == child_ci->ci_flags)); +} + +/** + * ext4_inherit_context() - Sets a child context from its parent + * @parent: Parent inode from which the context is inherited. + * @child: Child inode that inherits the context from @parent. 
+ * + * Return: Zero on success, non-zero otherwise + */ +int ext4_inherit_context(struct inode *parent, struct inode *child) +{ + struct ext4_encryption_context ctx; + struct ext4_crypt_info *ci; + int res; + + res = ext4_get_encryption_info(parent); + if (res < 0) + return res; + ci = EXT4_I(parent)->i_crypt_info; + if (ci == NULL) + return -ENOKEY; + + ctx.format = EXT4_ENCRYPTION_CONTEXT_FORMAT_V1; + if (DUMMY_ENCRYPTION_ENABLED(EXT4_SB(parent->i_sb))) { + ctx.contents_encryption_mode = EXT4_ENCRYPTION_MODE_AES_256_XTS; + ctx.filenames_encryption_mode = + EXT4_ENCRYPTION_MODE_AES_256_CTS; + ctx.flags = 0; + memset(ctx.master_key_descriptor, 0x42, + EXT4_KEY_DESCRIPTOR_SIZE); + res = 0; + } else { + ctx.contents_encryption_mode = ci->ci_data_mode; + ctx.filenames_encryption_mode = ci->ci_filename_mode; + ctx.flags = ci->ci_flags; + memcpy(ctx.master_key_descriptor, ci->ci_master_key, + EXT4_KEY_DESCRIPTOR_SIZE); + } + get_random_bytes(ctx.nonce, EXT4_KEY_DERIVATION_NONCE_SIZE); + res = ext4_xattr_set(child, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, + sizeof(ctx), 0); + if (!res) { + ext4_set_inode_flag(child, EXT4_INODE_ENCRYPT); + ext4_clear_inode_state(child, EXT4_STATE_MAY_INLINE_DATA); + res = ext4_get_encryption_info(child); + } + return res; +} diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index c24143ea9c08..f9e14911918c 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -22,10 +22,8 @@ */ #include <linux/fs.h> -#include <linux/jbd2.h> #include <linux/buffer_head.h> #include <linux/slab.h> -#include <linux/rbtree.h> #include "ext4.h" #include "xattr.h" @@ -110,7 +108,9 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) int err; struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; + struct buffer_head *bh = NULL; int dir_has_error = 0; + struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}; if (is_dx_dir(inode)) { err = ext4_dx_readdir(file, ctx); @@ -127,17 +127,23 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) if (ext4_has_inline_data(inode)) { int has_inline_data = 1; - int ret = ext4_read_inline_dir(file, ctx, + err = ext4_read_inline_dir(file, ctx, &has_inline_data); if (has_inline_data) - return ret; + return err; + } + + if (ext4_encrypted_inode(inode)) { + err = ext4_fname_crypto_alloc_buffer(inode, EXT4_NAME_LEN, + &fname_crypto_str); + if (err < 0) + return err; } offset = ctx->pos & (sb->s_blocksize - 1); while (ctx->pos < inode->i_size) { struct ext4_map_blocks map; - struct buffer_head *bh = NULL; map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb); map.m_len = 1; @@ -180,6 +186,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) (unsigned long long)ctx->pos); ctx->pos += sb->s_blocksize - offset; brelse(bh); + bh = NULL; continue; } set_buffer_verified(bh); @@ -226,25 +233,45 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); if (le32_to_cpu(de->inode)) { - if (!dir_emit(ctx, de->name, - de->name_len, - le32_to_cpu(de->inode), - get_dtype(sb, de->file_type))) { - brelse(bh); - return 0; + if (!ext4_encrypted_inode(inode)) { + if (!dir_emit(ctx, de->name, + de->name_len, + le32_to_cpu(de->inode), + get_dtype(sb, de->file_type))) + goto done; + } else { + int save_len = fname_crypto_str.len; + + /* Directory is encrypted */ + err = ext4_fname_disk_to_usr(inode, + NULL, de, &fname_crypto_str); + fname_crypto_str.len = save_len; + if (err < 0) + goto errout; + if 
(!dir_emit(ctx, + fname_crypto_str.name, err, + le32_to_cpu(de->inode), + get_dtype(sb, de->file_type))) + goto done; } } ctx->pos += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); } - offset = 0; + if ((ctx->pos < inode->i_size) && !dir_relax(inode)) + goto done; brelse(bh); - if (ctx->pos < inode->i_size) { - if (!dir_relax(inode)) - return 0; - } + bh = NULL; + offset = 0; } - return 0; +done: + err = 0; +errout: +#ifdef CONFIG_EXT4_FS_ENCRYPTION + ext4_fname_crypto_free_buffer(&fname_crypto_str); +#endif + brelse(bh); + return err; } static inline int is_32bit_api(void) @@ -384,10 +411,15 @@ void ext4_htree_free_dir_info(struct dir_private_info *p) /* * Given a directory entry, enter it into the fname rb tree. + * + * When filename encryption is enabled, the dirent will hold the + * encrypted filename, while the htree will hold the decrypted filename. + * The decrypted filename is passed in via the ent_name parameter. */ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, __u32 minor_hash, - struct ext4_dir_entry_2 *dirent) + struct ext4_dir_entry_2 *dirent, + struct ext4_str *ent_name) { struct rb_node **p, *parent = NULL; struct fname *fname, *new_fn; @@ -398,17 +430,17 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, p = &info->root.rb_node; /* Create and allocate the fname structure */ - len = sizeof(struct fname) + dirent->name_len + 1; + len = sizeof(struct fname) + ent_name->len + 1; new_fn = kzalloc(len, GFP_KERNEL); if (!new_fn) return -ENOMEM; new_fn->hash = hash; new_fn->minor_hash = minor_hash; new_fn->inode = le32_to_cpu(dirent->inode); - new_fn->name_len = dirent->name_len; + new_fn->name_len = ent_name->len; new_fn->file_type = dirent->file_type; - memcpy(new_fn->name, dirent->name, dirent->name_len); - new_fn->name[dirent->name_len] = 0; + memcpy(new_fn->name, ent_name->name, ent_name->len); + new_fn->name[ent_name->len] = 0; while (*p) { parent = *p; @@ -561,6 +593,13 @@ finished: return 0; } +static int ext4_dir_open(struct inode * inode, struct file * filp) +{ + if (ext4_encrypted_inode(inode)) + return ext4_get_encryption_info(inode) ? -EACCES : 0; + return 0; +} + static int ext4_release_dir(struct inode *inode, struct file *filp) { if (filp->private_data) @@ -603,5 +642,6 @@ const struct file_operations ext4_dir_operations = { .compat_ioctl = ext4_compat_ioctl, #endif .fsync = ext4_sync_file, + .open = ext4_dir_open, .release = ext4_release_dir, }; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index f63c3d5805c4..f5e9f04220c1 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -69,15 +69,6 @@ #define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif -#define EXT4_ERROR_INODE(inode, fmt, a...) \ - ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a) - -#define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) \ - ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a) - -#define EXT4_ERROR_FILE(file, block, fmt, a...) \ - ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a) - /* data type for block offset of block group */ typedef int ext4_grpblk_t; @@ -90,6 +81,11 @@ typedef __u32 ext4_lblk_t; /* data type for block group number */ typedef unsigned int ext4_group_t; +enum SHIFT_DIRECTION { + SHIFT_LEFT = 0, + SHIFT_RIGHT, +}; + /* * Flags used in mballoc's allocation_context flags field.
* @@ -422,7 +418,7 @@ enum { EXT4_INODE_DIRTY = 8, EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */ EXT4_INODE_NOCOMPR = 10, /* Don't compress */ - EXT4_INODE_ENCRYPT = 11, /* Compression error */ + EXT4_INODE_ENCRYPT = 11, /* Encrypted file */ /* End compression flags --- maybe not all used */ EXT4_INODE_INDEX = 12, /* hash-indexed directory */ EXT4_INODE_IMAGIC = 13, /* AFS directory */ @@ -582,6 +578,15 @@ enum { #define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 +/* Encryption algorithms */ +#define EXT4_ENCRYPTION_MODE_INVALID 0 +#define EXT4_ENCRYPTION_MODE_AES_256_XTS 1 +#define EXT4_ENCRYPTION_MODE_AES_256_GCM 2 +#define EXT4_ENCRYPTION_MODE_AES_256_CBC 3 +#define EXT4_ENCRYPTION_MODE_AES_256_CTS 4 + +#include "ext4_crypto.h" + /* * ioctl commands */ @@ -603,6 +608,9 @@ enum { #define EXT4_IOC_RESIZE_FS _IOW('f', 16, __u64) #define EXT4_IOC_SWAP_BOOT _IO('f', 17) #define EXT4_IOC_PRECACHE_EXTENTS _IO('f', 18) +#define EXT4_IOC_SET_ENCRYPTION_POLICY _IOR('f', 19, struct ext4_encryption_policy) +#define EXT4_IOC_GET_ENCRYPTION_PWSALT _IOW('f', 20, __u8[16]) +#define EXT4_IOC_GET_ENCRYPTION_POLICY _IOW('f', 21, struct ext4_encryption_policy) #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* @@ -939,6 +947,11 @@ struct ext4_inode_info { /* Precomputed uuid+inum+igen checksum for seeding inode checksums */ __u32 i_csum_seed; + +#ifdef CONFIG_EXT4_FS_ENCRYPTION + /* Encryption params */ + struct ext4_crypt_info *i_crypt_info; +#endif }; /* @@ -1049,12 +1062,6 @@ extern void ext4_set_bits(void *bm, int cur, int len); /* Metadata checksum algorithm codes */ #define EXT4_CRC32C_CHKSUM 1 -/* Encryption algorithms */ -#define EXT4_ENCRYPTION_MODE_INVALID 0 -#define EXT4_ENCRYPTION_MODE_AES_256_XTS 1 -#define EXT4_ENCRYPTION_MODE_AES_256_GCM 2 -#define EXT4_ENCRYPTION_MODE_AES_256_CBC 3 - /* * Structure of the super block */ @@ -1142,7 +1149,8 @@ struct ext4_super_block { __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ __u8 s_log_groups_per_flex; /* FLEX_BG group size */ __u8 s_checksum_type; /* metadata checksum algorithm used */ - __le16 s_reserved_pad; + __u8 s_encryption_level; /* versioning level for encryption */ + __u8 s_reserved_pad; /* Padding to next 32bits */ __le64 s_kbytes_written; /* nr of lifetime kilobytes written */ __le32 s_snapshot_inum; /* Inode number of active snapshot */ __le32 s_snapshot_id; /* sequential ID of active snapshot */ @@ -1169,7 +1177,9 @@ struct ext4_super_block { __le32 s_overhead_clusters; /* overhead blocks/clusters in fs */ __le32 s_backup_bgs[2]; /* groups with sparse_super2 SBs */ __u8 s_encrypt_algos[4]; /* Encryption algorithms in use */ - __le32 s_reserved[105]; /* Padding to the end of the block */ + __u8 s_encrypt_pw_salt[16]; /* Salt used for string2key algorithm */ + __le32 s_lpf_ino; /* Location of the lost+found inode */ + __le32 s_reserved[100]; /* Padding to the end of the block */ __le32 s_checksum; /* crc32c(superblock) */ }; @@ -1180,8 +1190,16 @@ struct ext4_super_block { /* * run-time mount flags */ -#define EXT4_MF_MNTDIR_SAMPLED 0x0001 -#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */ +#define EXT4_MF_MNTDIR_SAMPLED 0x0001 +#define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */ +#define EXT4_MF_TEST_DUMMY_ENCRYPTION 0x0004 + +#ifdef CONFIG_EXT4_FS_ENCRYPTION +#define DUMMY_ENCRYPTION_ENABLED(sbi) (unlikely((sbi)->s_mount_flags & \ + EXT4_MF_TEST_DUMMY_ENCRYPTION)) +#else +#define DUMMY_ENCRYPTION_ENABLED(sbi) (0) +#endif /* 
Number of quota types we support */ #define EXT4_MAXQUOTAS 2 @@ -1466,6 +1484,18 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_SB(sb) (sb) #endif +/* + * Returns true if the inode is encrypted + */ +static inline int ext4_encrypted_inode(struct inode *inode) +{ +#ifdef CONFIG_EXT4_FS_ENCRYPTION + return ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT); +#else + return 0; +#endif +} + #define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime /* @@ -1575,8 +1605,9 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) EXT4_FEATURE_INCOMPAT_EXTENTS| \ EXT4_FEATURE_INCOMPAT_64BIT| \ EXT4_FEATURE_INCOMPAT_FLEX_BG| \ - EXT4_FEATURE_INCOMPAT_MMP | \ - EXT4_FEATURE_INCOMPAT_INLINE_DATA) + EXT4_FEATURE_INCOMPAT_MMP | \ + EXT4_FEATURE_INCOMPAT_INLINE_DATA | \ + EXT4_FEATURE_INCOMPAT_ENCRYPT) #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ @@ -1796,6 +1827,17 @@ struct dx_hash_info */ #define HASH_NB_ALWAYS 1 +struct ext4_filename { + const struct qstr *usr_fname; + struct ext4_str disk_name; + struct dx_hash_info hinfo; +#ifdef CONFIG_EXT4_FS_ENCRYPTION + struct ext4_str crypto_buf; +#endif +}; + +#define fname_name(p) ((p)->disk_name.name) +#define fname_len(p) ((p)->disk_name.len) /* * Describe an inode's exact location on disk and in memory @@ -2001,6 +2043,130 @@ extern unsigned ext4_free_clusters_after_init(struct super_block *sb, struct ext4_group_desc *gdp); ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); +/* crypto_policy.c */ +int ext4_is_child_context_consistent_with_parent(struct inode *parent, + struct inode *child); +int ext4_inherit_context(struct inode *parent, struct inode *child); +void ext4_to_hex(char *dst, char *src, size_t src_size); +int ext4_process_policy(const struct ext4_encryption_policy *policy, + struct inode *inode); +int ext4_get_policy(struct inode *inode, + struct ext4_encryption_policy *policy); + +/* crypto.c */ +extern struct kmem_cache *ext4_crypt_info_cachep; +bool ext4_valid_contents_enc_mode(uint32_t mode); +uint32_t ext4_validate_encryption_key_size(uint32_t mode, uint32_t size); +extern struct workqueue_struct *ext4_read_workqueue; +struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode); +void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx); +void ext4_restore_control_page(struct page *data_page); +struct page *ext4_encrypt(struct inode *inode, + struct page *plaintext_page); +int ext4_decrypt(struct ext4_crypto_ctx *ctx, struct page *page); +int ext4_decrypt_one(struct inode *inode, struct page *page); +int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex); + +#ifdef CONFIG_EXT4_FS_ENCRYPTION +int ext4_init_crypto(void); +void ext4_exit_crypto(void); +static inline int ext4_sb_has_crypto(struct super_block *sb) +{ + return EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT); +} +#else +static inline int ext4_init_crypto(void) { return 0; } +static inline void ext4_exit_crypto(void) { } +static inline int ext4_sb_has_crypto(struct super_block *sb) +{ + return 0; +} +#endif + +/* crypto_fname.c */ +bool ext4_valid_filenames_enc_mode(uint32_t mode); +u32 ext4_fname_crypto_round_up(u32 size, u32 blksize); +unsigned ext4_fname_encrypted_size(struct inode *inode, u32 ilen); +int ext4_fname_crypto_alloc_buffer(struct inode *inode, + u32 ilen, struct ext4_str *crypto_str); +int _ext4_fname_disk_to_usr(struct inode *inode, + struct dx_hash_info *hinfo, + const
struct ext4_str *iname, + struct ext4_str *oname); +int ext4_fname_disk_to_usr(struct inode *inode, + struct dx_hash_info *hinfo, + const struct ext4_dir_entry_2 *de, + struct ext4_str *oname); +int ext4_fname_usr_to_disk(struct inode *inode, + const struct qstr *iname, + struct ext4_str *oname); +#ifdef CONFIG_EXT4_FS_ENCRYPTION +void ext4_fname_crypto_free_buffer(struct ext4_str *crypto_str); +int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, + int lookup, struct ext4_filename *fname); +void ext4_fname_free_filename(struct ext4_filename *fname); +#else +static inline +int ext4_setup_fname_crypto(struct inode *inode) +{ + return 0; +} +static inline void ext4_fname_crypto_free_buffer(struct ext4_str *p) { } +static inline int ext4_fname_setup_filename(struct inode *dir, + const struct qstr *iname, + int lookup, struct ext4_filename *fname) +{ + fname->usr_fname = iname; + fname->disk_name.name = (unsigned char *) iname->name; + fname->disk_name.len = iname->len; + return 0; +} +static inline void ext4_fname_free_filename(struct ext4_filename *fname) { } +#endif + + +/* crypto_key.c */ +void ext4_free_crypt_info(struct ext4_crypt_info *ci); +void ext4_free_encryption_info(struct inode *inode, struct ext4_crypt_info *ci); +int _ext4_get_encryption_info(struct inode *inode); + +#ifdef CONFIG_EXT4_FS_ENCRYPTION +int ext4_has_encryption_key(struct inode *inode); + +static inline int ext4_get_encryption_info(struct inode *inode) +{ + struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; + + if (!ci || + (ci->ci_keyring_key && + (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | + (1 << KEY_FLAG_REVOKED) | + (1 << KEY_FLAG_DEAD))))) + return _ext4_get_encryption_info(inode); + return 0; +} + +static inline struct ext4_crypt_info *ext4_encryption_info(struct inode *inode) +{ + return EXT4_I(inode)->i_crypt_info; +} + +#else +static inline int ext4_has_encryption_key(struct inode *inode) +{ + return 0; +} +static inline int ext4_get_encryption_info(struct inode *inode) +{ + return 0; +} +static inline struct ext4_crypt_info *ext4_encryption_info(struct inode *inode) +{ + return NULL; +} +#endif + + /* dir.c */ extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, struct file *, @@ -2011,18 +2177,20 @@ extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \ (de), (bh), (buf), (size), (offset))) extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, - __u32 minor_hash, - struct ext4_dir_entry_2 *dirent); + __u32 minor_hash, + struct ext4_dir_entry_2 *dirent, + struct ext4_str *ent_name); extern void ext4_htree_free_dir_info(struct dir_private_info *p); extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, struct buffer_head *bh, void *buf, int buf_size, - const char *name, int namelen, + struct ext4_filename *fname, struct ext4_dir_entry_2 **dest_de); -void ext4_insert_dentry(struct inode *inode, - struct ext4_dir_entry_2 *de, - int buf_size, - const char *name, int namelen); +int ext4_insert_dentry(struct inode *dir, + struct inode *inode, + struct ext4_dir_entry_2 *de, + int buf_size, + struct ext4_filename *fname); static inline void ext4_update_dx_flag(struct inode *inode) { if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb, @@ -2099,6 +2267,7 @@ extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); /* inode.c */ +int 
ext4_inode_is_fast_symlink(struct inode *inode); struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); int ext4_get_block_write(struct inode *inode, sector_t iblock, @@ -2152,8 +2321,8 @@ extern void ext4_da_update_reserve_space(struct inode *inode, /* indirect.c */ extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); -extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, - struct iov_iter *iter, loff_t offset); +extern ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter, + loff_t offset); extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks); extern void ext4_ind_truncate(handle_t *, struct inode *inode); @@ -2175,13 +2344,14 @@ extern int ext4_orphan_add(handle_t *, struct inode *); extern int ext4_orphan_del(handle_t *, struct inode *); extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, __u32 start_minor_hash, __u32 *next_hash); -extern int search_dir(struct buffer_head *bh, - char *search_buf, - int buf_size, - struct inode *dir, - const struct qstr *d_name, - unsigned int offset, - struct ext4_dir_entry_2 **res_dir); +extern int ext4_search_dir(struct buffer_head *bh, + char *search_buf, + int buf_size, + struct inode *dir, + struct ext4_filename *fname, + const struct qstr *d_name, + unsigned int offset, + struct ext4_dir_entry_2 **res_dir); extern int ext4_generic_delete_entry(handle_t *handle, struct inode *dir, struct ext4_dir_entry_2 *de_del, @@ -2189,6 +2359,7 @@ extern int ext4_generic_delete_entry(handle_t *handle, void *entry_buf, int buf_size, int csum_size); +extern int ext4_empty_dir(struct inode *inode); /* resize.c */ extern int ext4_group_add(struct super_block *sb, @@ -2225,6 +2396,9 @@ void __ext4_abort(struct super_block *, const char *, unsigned int, extern __printf(4, 5) void __ext4_warning(struct super_block *, const char *, unsigned int, const char *, ...); +extern __printf(4, 5) +void __ext4_warning_inode(const struct inode *inode, const char *function, + unsigned int line, const char *fmt, ...); extern __printf(3, 4) void __ext4_msg(struct super_block *, const char *, const char *, ...); extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, @@ -2235,6 +2409,15 @@ void __ext4_grp_locked_error(const char *, unsigned int, unsigned long, ext4_fsblk_t, const char *, ...); +#define EXT4_ERROR_INODE(inode, fmt, a...) \ + ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a) + +#define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) \ + ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a) + +#define EXT4_ERROR_FILE(file, block, fmt, a...) \ + ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a) + #ifdef CONFIG_PRINTK #define ext4_error_inode(inode, func, line, block, fmt, ...) \ @@ -2247,6 +2430,8 @@ void __ext4_grp_locked_error(const char *, unsigned int, __ext4_abort(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) #define ext4_warning(sb, fmt, ...) \ __ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) +#define ext4_warning_inode(inode, fmt, ...) \ + __ext4_warning_inode(inode, __func__, __LINE__, fmt, ##__VA_ARGS__) #define ext4_msg(sb, level, fmt, ...) 
\ __ext4_msg(sb, level, fmt, ##__VA_ARGS__) #define dump_mmp_msg(sb, mmp, msg) \ @@ -2282,6 +2467,11 @@ do { \ no_printk(fmt, ##__VA_ARGS__); \ __ext4_warning(sb, "", 0, " "); \ } while (0) +#define ext4_warning_inode(inode, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_warning_inode(inode, "", 0, " "); \ +} while (0) #define ext4_msg(sb, level, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ @@ -2593,7 +2783,6 @@ extern const struct file_operations ext4_dir_operations; /* file.c */ extern const struct inode_operations ext4_file_inode_operations; extern const struct file_operations ext4_file_operations; -extern const struct file_operations ext4_dax_file_operations; extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); /* inline.c */ @@ -2626,7 +2815,9 @@ extern int ext4_da_write_inline_data_begin(struct address_space *mapping, extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, struct page *page); -extern int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry, +extern int ext4_try_add_inline_entry(handle_t *handle, + struct ext4_filename *fname, + struct dentry *dentry, struct inode *inode); extern int ext4_try_create_inline_dir(handle_t *handle, struct inode *parent, @@ -2640,6 +2831,7 @@ extern int htree_inlinedir_to_tree(struct file *dir_file, __u32 start_hash, __u32 start_minor_hash, int *has_inline_data); extern struct buffer_head *ext4_find_inline_entry(struct inode *dir, + struct ext4_filename *fname, const struct qstr *d_name, struct ext4_dir_entry_2 **res_dir, int *has_inline_data); @@ -2699,8 +2891,13 @@ static inline void ext4_set_de_type(struct super_block *sb, de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; } +/* readpages.c */ +extern int ext4_mpage_readpages(struct address_space *mapping, + struct list_head *pages, struct page *page, + unsigned nr_pages); /* symlink.c */ +extern const struct inode_operations ext4_encrypted_symlink_inode_operations; extern const struct inode_operations ext4_symlink_inode_operations; extern const struct inode_operations ext4_fast_symlink_inode_operations; @@ -2743,7 +2940,6 @@ extern int ext4_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); extern int ext4_ext_calc_metadata_amount(struct inode *inode, ext4_lblk_t lblocks); -extern int ext4_extent_tree_init(handle_t *, struct inode *); extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int num, struct ext4_ext_path *path); @@ -2767,6 +2963,7 @@ extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); extern int ext4_ext_precache(struct inode *inode); extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len); +extern int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len); extern int ext4_swap_extents(handle_t *handle, struct inode *inode1, struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2, ext4_lblk_t count, diff --git a/fs/ext4/ext4_crypto.h b/fs/ext4/ext4_crypto.h new file mode 100644 index 000000000000..ac7d4e813796 --- /dev/null +++ b/fs/ext4/ext4_crypto.h @@ -0,0 +1,159 @@ +/* + * linux/fs/ext4/ext4_crypto.h + * + * Copyright (C) 2015, Google, Inc. + * + * This contains encryption header content for ext4 + * + * Written by Michael Halcrow, 2015. 
+ */ + +#ifndef _EXT4_CRYPTO_H +#define _EXT4_CRYPTO_H + +#include <linux/fs.h> + +#define EXT4_KEY_DESCRIPTOR_SIZE 8 + +/* Policy provided via an ioctl on the topmost directory */ +struct ext4_encryption_policy { + char version; + char contents_encryption_mode; + char filenames_encryption_mode; + char flags; + char master_key_descriptor[EXT4_KEY_DESCRIPTOR_SIZE]; +} __attribute__((__packed__)); + +#define EXT4_ENCRYPTION_CONTEXT_FORMAT_V1 1 +#define EXT4_KEY_DERIVATION_NONCE_SIZE 16 + +#define EXT4_POLICY_FLAGS_PAD_4 0x00 +#define EXT4_POLICY_FLAGS_PAD_8 0x01 +#define EXT4_POLICY_FLAGS_PAD_16 0x02 +#define EXT4_POLICY_FLAGS_PAD_32 0x03 +#define EXT4_POLICY_FLAGS_PAD_MASK 0x03 +#define EXT4_POLICY_FLAGS_VALID 0x03 + +/** + * Encryption context for inode + * + * Protector format: + * 1 byte: Protector format (1 = this version) + * 1 byte: File contents encryption mode + * 1 byte: File names encryption mode + * 1 byte: Reserved + * 8 bytes: Master Key descriptor + * 16 bytes: Encryption Key derivation nonce + */ +struct ext4_encryption_context { + char format; + char contents_encryption_mode; + char filenames_encryption_mode; + char flags; + char master_key_descriptor[EXT4_KEY_DESCRIPTOR_SIZE]; + char nonce[EXT4_KEY_DERIVATION_NONCE_SIZE]; +} __attribute__((__packed__)); + +/* Encryption parameters */ +#define EXT4_XTS_TWEAK_SIZE 16 +#define EXT4_AES_128_ECB_KEY_SIZE 16 +#define EXT4_AES_256_GCM_KEY_SIZE 32 +#define EXT4_AES_256_CBC_KEY_SIZE 32 +#define EXT4_AES_256_CTS_KEY_SIZE 32 +#define EXT4_AES_256_XTS_KEY_SIZE 64 +#define EXT4_MAX_KEY_SIZE 64 + +#define EXT4_KEY_DESC_PREFIX "ext4:" +#define EXT4_KEY_DESC_PREFIX_SIZE 5 + +/* This is passed in from userspace into the kernel keyring */ +struct ext4_encryption_key { + __u32 mode; + char raw[EXT4_MAX_KEY_SIZE]; + __u32 size; +} __attribute__((__packed__)); + +struct ext4_crypt_info { + char ci_data_mode; + char ci_filename_mode; + char ci_flags; + struct crypto_ablkcipher *ci_ctfm; + struct key *ci_keyring_key; + char ci_master_key[EXT4_KEY_DESCRIPTOR_SIZE]; +}; + +#define EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 +#define EXT4_WRITE_PATH_FL 0x00000002 + +struct ext4_crypto_ctx { + union { + struct { + struct page *bounce_page; /* Ciphertext page */ + struct page *control_page; /* Original page */ + } w; + struct { + struct bio *bio; + struct work_struct work; + } r; + struct list_head free_list; /* Free list */ + }; + char flags; /* Flags */ + char mode; /* Encryption mode for tfm */ +}; + +struct ext4_completion_result { + struct completion completion; + int res; +}; + +#define DECLARE_EXT4_COMPLETION_RESULT(ecr) \ + struct ext4_completion_result ecr = { \ + COMPLETION_INITIALIZER((ecr).completion), 0 } + +static inline int ext4_encryption_key_size(int mode) +{ + switch (mode) { + case EXT4_ENCRYPTION_MODE_AES_256_XTS: + return EXT4_AES_256_XTS_KEY_SIZE; + case EXT4_ENCRYPTION_MODE_AES_256_GCM: + return EXT4_AES_256_GCM_KEY_SIZE; + case EXT4_ENCRYPTION_MODE_AES_256_CBC: + return EXT4_AES_256_CBC_KEY_SIZE; + case EXT4_ENCRYPTION_MODE_AES_256_CTS: + return EXT4_AES_256_CTS_KEY_SIZE; + default: + BUG(); + } + return 0; +} + +#define EXT4_FNAME_NUM_SCATTER_ENTRIES 4 +#define EXT4_CRYPTO_BLOCK_SIZE 16 +#define EXT4_FNAME_CRYPTO_DIGEST_SIZE 32 + +struct ext4_str { + unsigned char *name; + u32 len; +}; + +/** + * For encrypted symlinks, the ciphertext length is stored at the beginning + * of the string in little-endian format. 
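+ * + * For example, a 20-byte encrypted path is stored as the two bytes + * 0x14 0x00 (the length 20 as a __le16) followed by the 20 ciphertext + * bytes, 22 bytes of payload in all, which matches what + * encrypted_symlink_data_len() computes below.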
+ */ +struct ext4_encrypted_symlink_data { + __le16 len; + char encrypted_path[1]; +} __attribute__((__packed__)); + +/** + * This function is used to calculate the disk space required to + * store a filename of length l in encrypted symlink format. + */ +static inline u32 encrypted_symlink_data_len(u32 l) +{ + if (l < EXT4_CRYPTO_BLOCK_SIZE) + l = EXT4_CRYPTO_BLOCK_SIZE; + return (l + sizeof(struct ext4_encrypted_symlink_data) - 1); +} + +#endif /* _EXT4_CRYPTO_H */ diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 3445035c7e01..d41843181818 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -87,6 +87,12 @@ int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle) ext4_put_nojournal(handle); return 0; } + + if (!handle->h_transaction) { + err = jbd2_journal_stop(handle); + return handle->h_err ? handle->h_err : err; + } + sb = handle->h_transaction->t_journal->j_private; err = handle->h_err; rc = jbd2_journal_stop(handle); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index bed43081720f..2553aa8b608d 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -39,6 +39,7 @@ #include <linux/slab.h> #include <asm/uaccess.h> #include <linux/fiemap.h> +#include <linux/backing-dev.h> #include "ext4_jbd2.h" #include "ext4_extents.h" #include "xattr.h" @@ -377,7 +378,7 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) ext4_lblk_t lblock = le32_to_cpu(ext->ee_block); ext4_lblk_t last = lblock + len - 1; - if (lblock > last) + if (len == 0 || lblock > last) return 0; return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); } @@ -503,7 +504,7 @@ __read_extent_tree_block(const char *function, unsigned int line, struct buffer_head *bh; int err; - bh = sb_getblk(inode->i_sb, pblk); + bh = sb_getblk_gfp(inode->i_sb, pblk, __GFP_MOVABLE | GFP_NOFS); if (unlikely(!bh)) return ERR_PTR(-ENOMEM); @@ -1088,7 +1089,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, err = -EIO; goto cleanup; } - bh = sb_getblk(inode->i_sb, newblock); + bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS); if (unlikely(!bh)) { err = -ENOMEM; goto cleanup; @@ -1282,7 +1283,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, if (newblock == 0) return err; - bh = sb_getblk(inode->i_sb, newblock); + bh = sb_getblk_gfp(inode->i_sb, newblock, __GFP_MOVABLE | GFP_NOFS); if (unlikely(!bh)) return -ENOMEM; lock_buffer(bh); @@ -1717,12 +1718,6 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, { unsigned short ext1_ee_len, ext2_ee_len; - /* - * Make sure that both extents are initialized. We don't merge - * unwritten extents so that we can be sure that end_io code has - * the extent that was written properly split out and conversion to - * initialized is trivial. 
- */ if (ext4_ext_is_unwritten(ex1) != ext4_ext_is_unwritten(ex2)) return 0; @@ -3128,6 +3123,9 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) ee_len = ext4_ext_get_actual_len(ex); ee_pblock = ext4_ext_pblock(ex); + if (ext4_encrypted_inode(inode)) + return ext4_encrypted_zeroout(inode, ex); + ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS); if (ret > 0) ret = 0; @@ -4459,6 +4457,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ar.flags |= EXT4_MB_HINT_NOPREALLOC; if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) ar.flags |= EXT4_MB_DELALLOC_RESERVED; + if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) + ar.flags |= EXT4_MB_USE_RESERVED; newblock = ext4_mb_new_blocks(handle, &ar, &err); if (!newblock) goto out2; @@ -4535,19 +4535,7 @@ got_allocated_blocks: */ reserved_clusters = get_reserved_cluster_alloc(inode, map->m_lblk, allocated); - if (map_from_cluster) { - if (reserved_clusters) { - /* - * We have clusters reserved for this range. - * But since we are not doing actual allocation - * and are simply using blocks from previously - * allocated cluster, we should release the - * reservation and not claim quota. - */ - ext4_da_update_reserve_space(inode, - reserved_clusters, 0); - } - } else { + if (!map_from_cluster) { BUG_ON(allocated_clusters < reserved_clusters); if (reserved_clusters < allocated_clusters) { struct ext4_inode_info *ei = EXT4_I(inode); @@ -4678,6 +4666,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, int ret = 0; int ret2 = 0; int retries = 0; + int depth = 0; struct ext4_map_blocks map; unsigned int credits; loff_t epos; @@ -4692,13 +4681,32 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, if (len <= EXT_UNWRITTEN_MAX_LEN) flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; + /* Wait all existing dio workers, newcomers will block on i_mutex */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + /* * credits to insert 1 extent into extent tree */ credits = ext4_chunk_trans_blocks(inode, len); + /* + * We can only call ext_depth() on extent based inodes + */ + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + depth = ext_depth(inode); + else + depth = -1; retry: while (ret >= 0 && len) { + /* + * Recalculate credits when extent tree depth changes. + */ + if (depth >= 0 && depth != ext_depth(inode)) { + credits = ext4_chunk_trans_blocks(inode, len); + depth = ext_depth(inode); + } + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); if (IS_ERR(handle)) { @@ -4740,6 +4748,8 @@ retry: goto retry; } + ext4_inode_resume_unlocked_dio(inode); + return ret > 0 ? ret2 : ret; } @@ -4803,12 +4813,6 @@ static long ext4_zero_range(struct file *file, loff_t offset, else max_blocks -= lblk; - flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT | - EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | - EXT4_EX_NOCACHE; - if (mode & FALLOC_FL_KEEP_SIZE) - flags |= EXT4_GET_BLOCKS_KEEP_SIZE; - mutex_lock(&inode->i_mutex); /* @@ -4825,15 +4829,28 @@ static long ext4_zero_range(struct file *file, loff_t offset, ret = inode_newsize_ok(inode, new_size); if (ret) goto out_mutex; - /* - * If we have a partial block after EOF we have to allocate - * the entire block. 
- */ - if (partial_end) - max_blocks += 1; } + flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; + if (mode & FALLOC_FL_KEEP_SIZE) + flags |= EXT4_GET_BLOCKS_KEEP_SIZE; + + /* Preallocate the range including the unaligned edges */ + if (partial_begin || partial_end) { + ret = ext4_alloc_file_blocks(file, + round_down(offset, 1 << blkbits) >> blkbits, + (round_up((offset + len), 1 << blkbits) - + round_down(offset, 1 << blkbits)) >> blkbits, + new_size, flags, mode); + if (ret) + goto out_mutex; + + } + + /* Zero range excluding the unaligned edges */ if (max_blocks > 0) { + flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | + EXT4_EX_NOCACHE); /* Now release the pages and zero block aligned part of pages*/ truncate_pagecache_range(inode, start, end - 1); @@ -4847,19 +4864,6 @@ static long ext4_zero_range(struct file *file, loff_t offset, flags, mode); if (ret) goto out_dio; - /* - * Remove entire range from the extent status tree. - * - * ext4_es_remove_extent(inode, lblk, max_blocks) is - * NOT sufficient. I'm not sure why this is the case, - * but let's be conservative and remove the extent - * status tree for the entire inode. There should be - * no outstanding delalloc extents thanks to the - * filemap_write_and_wait_range() call above. - */ - ret = ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); - if (ret) - goto out_dio; } if (!partial_begin && !partial_end) goto out_dio; @@ -4922,9 +4926,25 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) ext4_lblk_t lblk; unsigned int blkbits = inode->i_blkbits; + /* + * Encrypted inodes can't handle collapse range or insert + * range since we would need to re-encrypt blocks with a + * different IV or XTS tweak (which are based on the logical + * block number). + * + * XXX It's not clear why zero range isn't working, but we'll + * leave it disabled for encrypted inodes for now. This is a + * bug we should fix.... 
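+ * + * For example, collapsing a single block moves the data that was at + * logical block N + 1 to logical block N; since the IV/tweak comes from + * the logical block number, every moved block would have to be decrypted + * with tweak N + 1 and re-encrypted with tweak N.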
+ */ + if (ext4_encrypted_inode(inode) && + (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE | + FALLOC_FL_ZERO_RANGE))) + return -EOPNOTSUPP; + /* Return error if mode is not supported */ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | - FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE) + FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | + FALLOC_FL_INSERT_RANGE)) return -EOPNOTSUPP; if (mode & FALLOC_FL_PUNCH_HOLE) @@ -4934,16 +4954,12 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (ret) return ret; - /* - * currently supporting (pre)allocate mode for extent-based - * files _only_ - */ - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) - return -EOPNOTSUPP; - if (mode & FALLOC_FL_COLLAPSE_RANGE) return ext4_collapse_range(inode, offset, len); + if (mode & FALLOC_FL_INSERT_RANGE) + return ext4_insert_range(inode, offset, len); + if (mode & FALLOC_FL_ZERO_RANGE) return ext4_zero_range(file, offset, len, mode); @@ -4962,6 +4978,14 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) mutex_lock(&inode->i_mutex); + /* + * We only support preallocation for extent-based files + */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { + ret = -EOPNOTSUPP; + goto out; + } + if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > i_size_read(inode)) { new_size = offset + len; @@ -5230,13 +5254,13 @@ ext4_access_path(handle_t *handle, struct inode *inode, /* * ext4_ext_shift_path_extents: * Shift the extents of a path structure lying between path[depth].p_ext - * and EXT_LAST_EXTENT(path[depth].p_hdr) downwards, by subtracting shift - * from starting block for each extent. + * and EXT_LAST_EXTENT(path[depth].p_hdr), by @shift blocks. @SHIFT tells + * whether it is a right or left shift operation. */ static int ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, struct inode *inode, handle_t *handle, - ext4_lblk_t *start) + enum SHIFT_DIRECTION SHIFT) { int depth, err = 0; struct ext4_extent *ex_start, *ex_last; @@ -5258,19 +5282,25 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) update = 1; - *start = le32_to_cpu(ex_last->ee_block) + - ext4_ext_get_actual_len(ex_last); - while (ex_start <= ex_last) { - le32_add_cpu(&ex_start->ee_block, -shift); - /* Try to merge to the left. */ - if ((ex_start > - EXT_FIRST_EXTENT(path[depth].p_hdr)) && - ext4_ext_try_to_merge_right(inode, - path, ex_start - 1)) + if (SHIFT == SHIFT_LEFT) { + le32_add_cpu(&ex_start->ee_block, + -shift); + /* Try to merge to the left.
*/ + if ((ex_start > + EXT_FIRST_EXTENT(path[depth].p_hdr)) + && + ext4_ext_try_to_merge_right(inode, + path, ex_start - 1)) + ex_last--; + else + ex_start++; + } else { + le32_add_cpu(&ex_last->ee_block, shift); + ext4_ext_try_to_merge_right(inode, path, + ex_last); ex_last--; - else - ex_start++; + } } err = ext4_ext_dirty(handle, inode, path + depth); if (err) @@ -5285,7 +5315,10 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, if (err) goto out; - le32_add_cpu(&path[depth].p_idx->ei_block, -shift); + if (SHIFT == SHIFT_LEFT) + le32_add_cpu(&path[depth].p_idx->ei_block, -shift); + else + le32_add_cpu(&path[depth].p_idx->ei_block, shift); err = ext4_ext_dirty(handle, inode, path + depth); if (err) goto out; @@ -5303,19 +5336,20 @@ out: /* * ext4_ext_shift_extents: - * All the extents which lies in the range from start to the last allocated - * block for the file are shifted downwards by shift blocks. + * All the extents which lie in the range from @start to the last allocated + * block for the @inode are shifted either left or right (depending + * upon @SHIFT) by @shift blocks. * On success, 0 is returned, error otherwise. */ static int ext4_ext_shift_extents(struct inode *inode, handle_t *handle, - ext4_lblk_t start, ext4_lblk_t shift) + ext4_lblk_t start, ext4_lblk_t shift, + enum SHIFT_DIRECTION SHIFT) { struct ext4_ext_path *path; int ret = 0, depth; struct ext4_extent *extent; - ext4_lblk_t stop_block; - ext4_lblk_t ex_start, ex_end; + ext4_lblk_t stop, *iterator, ex_start, ex_end; /* Let path point to the last extent */ path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0); @@ -5327,58 +5361,84 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, if (!extent) goto out; - stop_block = le32_to_cpu(extent->ee_block) + + stop = le32_to_cpu(extent->ee_block) + ext4_ext_get_actual_len(extent); - /* Nothing to shift, if hole is at the end of file */ - if (start >= stop_block) - goto out; + /* + * In case of left shift, don't start shifting extents until we make + * sure the hole is big enough to accommodate the shift. + */ + if (SHIFT == SHIFT_LEFT) { + path = ext4_find_extent(inode, start - 1, &path, 0); + if (IS_ERR(path)) + return PTR_ERR(path); + depth = path->p_depth; + extent = path[depth].p_ext; + if (extent) { + ex_start = le32_to_cpu(extent->ee_block); + ex_end = le32_to_cpu(extent->ee_block) + + ext4_ext_get_actual_len(extent); + } else { + ex_start = 0; + ex_end = 0; + } - /* - * Don't start shifting extents until we make sure the hole is big - * enough to accomodate the shift. - */ - path = ext4_find_extent(inode, start - 1, &path, 0); - if (IS_ERR(path)) - return PTR_ERR(path); - depth = path->p_depth; - extent = path[depth].p_ext; - if (extent) { - ex_start = le32_to_cpu(extent->ee_block); - ex_end = le32_to_cpu(extent->ee_block) + - ext4_ext_get_actual_len(extent); - } else { - ex_start = 0; - ex_end = 0; + if ((start == ex_start && shift > ex_start) || + (shift > start - ex_end)) { + ext4_ext_drop_refs(path); + kfree(path); + return -EINVAL; + } } - if ((start == ex_start && shift > ex_start) || - (shift > start - ex_end)) - return -EINVAL; + /* + * In case of left shift, the iterator points to start and is increased + * till we reach stop. In case of right shift, the iterator points to + * stop and is decreased till we reach start.
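+ * + * For example, when shifting left the iterator sweeps upward from start, + * so each extent is moved into blocks that were already vacated; when + * shifting right it sweeps downward from stop, so the last extent is + * relocated before the extents in front of it.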
+ */ + if (SHIFT == SHIFT_LEFT) + iterator = &start; + else + iterator = &stop; /* It's safe to start updating extents */ - while (start < stop_block) { - path = ext4_find_extent(inode, start, &path, 0); + while (start < stop) { + path = ext4_find_extent(inode, *iterator, &path, 0); if (IS_ERR(path)) return PTR_ERR(path); depth = path->p_depth; extent = path[depth].p_ext; if (!extent) { EXT4_ERROR_INODE(inode, "unexpected hole at %lu", - (unsigned long) start); + (unsigned long) *iterator); return -EIO; } - if (start > le32_to_cpu(extent->ee_block)) { + if (SHIFT == SHIFT_LEFT && *iterator > + le32_to_cpu(extent->ee_block)) { /* Hole, move to the next extent */ if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) { path[depth].p_ext++; } else { - start = ext4_ext_next_allocated_block(path); + *iterator = ext4_ext_next_allocated_block(path); continue; } } + + if (SHIFT == SHIFT_LEFT) { + extent = EXT_LAST_EXTENT(path[depth].p_hdr); + *iterator = le32_to_cpu(extent->ee_block) + + ext4_ext_get_actual_len(extent); + } else { + extent = EXT_FIRST_EXTENT(path[depth].p_hdr); + *iterator = le32_to_cpu(extent->ee_block) > 0 ? + le32_to_cpu(extent->ee_block) - 1 : 0; + /* Update path extent in case we need to stop */ + while (le32_to_cpu(extent->ee_block) < start) + extent++; + path[depth].p_ext = extent; + } ret = ext4_ext_shift_path_extents(path, shift, inode, - handle, &start); + handle, SHIFT); if (ret) break; } @@ -5402,6 +5462,14 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) loff_t new_size, ioffset; int ret; + /* + * We need to test this early because xfstests assumes that a + * collapse range of (0, 1) will return EOPNOTSUPP if the file + * system does not support collapse range. + */ + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + return -EOPNOTSUPP; + /* Collapse range works only on fs block size aligned offsets. */ if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) || len & (EXT4_CLUSTER_SIZE(sb) - 1)) @@ -5483,7 +5551,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) ext4_discard_preallocations(inode); ret = ext4_ext_shift_extents(inode, handle, punch_stop, - punch_stop - punch_start); + punch_stop - punch_start, SHIFT_LEFT); if (ret) { up_write(&EXT4_I(inode)->i_data_sem); goto out_stop; @@ -5508,6 +5576,174 @@ out_mutex: return ret; } +/* + * ext4_insert_range: + * This function implements the FALLOC_FL_INSERT_RANGE flag of fallocate. + * The data blocks starting from @offset to the EOF are shifted by @len + * towards the right to create a hole in the @inode. Inode size is increased + * by @len bytes. + * Returns 0 on success, error otherwise. + */ +int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) +{ + struct super_block *sb = inode->i_sb; + handle_t *handle; + struct ext4_ext_path *path; + struct ext4_extent *extent; + ext4_lblk_t offset_lblk, len_lblk, ee_start_lblk = 0; + unsigned int credits, ee_len; + int ret = 0, depth, split_flag = 0; + loff_t ioffset; + + /* + * We need to test this early because xfstests assumes that an + * insert range of (0, 1) will return EOPNOTSUPP if the file + * system does not support insert range. + */ + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + return -EOPNOTSUPP; + + /* Insert range works only on fs block size aligned offsets.
*/ + if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) || + len & (EXT4_CLUSTER_SIZE(sb) - 1)) + return -EINVAL; + + if (!S_ISREG(inode->i_mode)) + return -EOPNOTSUPP; + + trace_ext4_insert_range(inode, offset, len); + + offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb); + len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb); + + /* Call ext4_force_commit to flush all data in case of data=journal */ + if (ext4_should_journal_data(inode)) { + ret = ext4_force_commit(inode->i_sb); + if (ret) + return ret; + } + + /* + * Need to round down to align start offset to page size boundary + * for page size > block size. + */ + ioffset = round_down(offset, PAGE_SIZE); + + /* Write out all dirty pages */ + ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, + LLONG_MAX); + if (ret) + return ret; + + /* Take mutex lock */ + mutex_lock(&inode->i_mutex); + + /* Currently just for extent based files */ + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + ret = -EOPNOTSUPP; + goto out_mutex; + } + + /* Check for wrap through zero */ + if (inode->i_size + len > inode->i_sb->s_maxbytes) { + ret = -EFBIG; + goto out_mutex; + } + + /* Offset should be less than i_size */ + if (offset >= i_size_read(inode)) { + ret = -EINVAL; + goto out_mutex; + } + + truncate_pagecache(inode, ioffset); + + /* Wait for existing dio to complete */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + + credits = ext4_writepage_trans_blocks(inode); + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out_dio; + } + + /* Expand file to avoid data loss if there is an error while shifting */ + inode->i_size += len; + EXT4_I(inode)->i_disksize += len; + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ret = ext4_mark_inode_dirty(handle, inode); + if (ret) + goto out_stop; + + down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_preallocations(inode); + + path = ext4_find_extent(inode, offset_lblk, NULL, 0); + if (IS_ERR(path)) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } + + depth = ext_depth(inode); + extent = path[depth].p_ext; + if (extent) { + ee_start_lblk = le32_to_cpu(extent->ee_block); + ee_len = ext4_ext_get_actual_len(extent); + + /* + * If offset_lblk is not the starting block of extent, split + * the extent at @offset_lblk + */ + if ((offset_lblk > ee_start_lblk) && + (offset_lblk < (ee_start_lblk + ee_len))) { + if (ext4_ext_is_unwritten(extent)) + split_flag = EXT4_EXT_MARK_UNWRIT1 | + EXT4_EXT_MARK_UNWRIT2; + ret = ext4_split_extent_at(handle, inode, &path, + offset_lblk, split_flag, + EXT4_EX_NOCACHE | + EXT4_GET_BLOCKS_PRE_IO | + EXT4_GET_BLOCKS_METADATA_NOFAIL); + } + + ext4_ext_drop_refs(path); + kfree(path); + if (ret < 0) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } + } + + ret = ext4_es_remove_extent(inode, offset_lblk, + EXT_MAX_BLOCKS - offset_lblk); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } + + /* + * if offset_lblk lies in a hole which is at the start of the file, use + * ee_start_lblk to shift extents + */ + ret = ext4_ext_shift_extents(inode, handle, + ee_start_lblk > offset_lblk ?
ee_start_lblk : offset_lblk, + len_lblk, SHIFT_RIGHT); + + up_write(&EXT4_I(inode)->i_data_sem); + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + +out_stop: + ext4_journal_stop(handle); +out_dio: + ext4_inode_resume_unlocked_dio(inode); +out_mutex: + mutex_unlock(&inode->i_mutex); + return ret; +} + /** * ext4_swap_extents - Swap extents between two inodes * @@ -5540,7 +5776,7 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem)); BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem)); BUG_ON(!mutex_is_locked(&inode1->i_mutex)); - BUG_ON(!mutex_is_locked(&inode1->i_mutex)); + BUG_ON(!mutex_is_locked(&inode2->i_mutex)); *erp = ext4_es_remove_extent(inode1, lblk1, count); if (unlikely(*erp)) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index e04d45733976..26724aeece73 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -9,12 +9,10 @@ * * Ext4 extents status tree core functions. */ -#include <linux/rbtree.h> #include <linux/list_sort.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include "ext4.h" -#include "extents_status.h" #include <trace/events/ext4.h> @@ -705,6 +703,14 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, BUG_ON(end < lblk); + if ((status & EXTENT_STATUS_DELAYED) && + (status & EXTENT_STATUS_WRITTEN)) { + ext4_warning(inode->i_sb, "Inserting extent [%u/%u] as " + " delayed and written which can potentially " + " cause data loss.\n", lblk, len); + WARN_ON(1); + } + newes.es_lblk = lblk; newes.es_len = len; ext4_es_store_pblock_status(&newes, pblk, status); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 33a09da16c9c..bc313ac5d3fa 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -20,12 +20,11 @@ #include <linux/time.h> #include <linux/fs.h> -#include <linux/jbd2.h> #include <linux/mount.h> #include <linux/path.h> -#include <linux/aio.h> #include <linux/quotaops.h> #include <linux/pagevec.h> +#include <linux/uio.h> #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" @@ -95,11 +94,9 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = file_inode(iocb->ki_filp); struct mutex *aio_mutex = NULL; struct blk_plug plug; - int o_direct = io_is_direct(file); + int o_direct = iocb->ki_flags & IOCB_DIRECT; int overwrite = 0; - size_t length = iov_iter_count(from); ssize_t ret; - loff_t pos = iocb->ki_pos; /* * Unaligned direct AIO must be serialized; see comment above @@ -108,16 +105,17 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (o_direct && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && !is_sync_kiocb(iocb) && - (file->f_flags & O_APPEND || - ext4_unaligned_aio(inode, from, pos))) { + (iocb->ki_flags & IOCB_APPEND || + ext4_unaligned_aio(inode, from, iocb->ki_pos))) { aio_mutex = ext4_aio_mutex(inode); mutex_lock(aio_mutex); ext4_unwritten_wait(inode); } mutex_lock(&inode->i_mutex); - if (file->f_flags & O_APPEND) - iocb->ki_pos = pos = i_size_read(inode); + ret = generic_write_checks(iocb, from); + if (ret <= 0) + goto out; /* * If we have encountered a bitmap-format file, the size limit @@ -126,22 +124,19 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - if ((pos > sbi->s_bitmap_maxbytes) || - (pos == sbi->s_bitmap_maxbytes && length > 0)) { - mutex_unlock(&inode->i_mutex); + if (iocb->ki_pos >= sbi->s_bitmap_maxbytes) { ret = -EFBIG; - goto errout; + 
goto out; } - - if (pos + length > sbi->s_bitmap_maxbytes) - iov_iter_truncate(from, sbi->s_bitmap_maxbytes - pos); + iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos); } iocb->private = &overwrite; if (o_direct) { + size_t length = iov_iter_count(from); + loff_t pos = iocb->ki_pos; blk_start_plug(&plug); - /* check whether we do a DIO overwrite or not */ if (ext4_should_dioread_nolock(inode) && !aio_mutex && !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) { @@ -185,27 +180,45 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (o_direct) blk_finish_plug(&plug); -errout: + if (aio_mutex) + mutex_unlock(aio_mutex); + return ret; + +out: + mutex_unlock(&inode->i_mutex); if (aio_mutex) mutex_unlock(aio_mutex); return ret; } #ifdef CONFIG_FS_DAX +static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate) +{ + struct inode *inode = bh->b_assoc_map->host; + /* XXX: breaks on 32-bit > 16GB. Is that even supported? */ + loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits; + int err; + if (!uptodate) + return; + WARN_ON(!buffer_unwritten(bh)); + err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size); +} + static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { - return dax_fault(vma, vmf, ext4_get_block); + return dax_fault(vma, vmf, ext4_get_block, ext4_end_io_unwritten); /* Is this the right get_block? */ } static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { - return dax_mkwrite(vma, vmf, ext4_get_block); + return dax_mkwrite(vma, vmf, ext4_get_block, ext4_end_io_unwritten); } static const struct vm_operations_struct ext4_dax_vm_ops = { .fault = ext4_dax_fault, .page_mkwrite = ext4_dax_mkwrite, + .pfn_mkwrite = dax_pfn_mkwrite, }; #else #define ext4_dax_vm_ops ext4_file_vm_ops @@ -219,6 +232,15 @@ static const struct vm_operations_struct ext4_file_vm_ops = { static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) { + struct inode *inode = file->f_mapping->host; + + if (ext4_encrypted_inode(inode)) { + int err = ext4_get_encryption_info(inode); + if (err) + return 0; + if (ext4_encryption_info(inode) == NULL) + return -ENOKEY; + } file_accessed(file); if (IS_DAX(file_inode(file))) { vma->vm_ops = &ext4_dax_vm_ops; @@ -236,6 +258,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp) struct vfsmount *mnt = filp->f_path.mnt; struct path path; char buf[64], *cp; + int ret; if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) && !(sb->s_flags & MS_RDONLY))) { @@ -269,12 +292,19 @@ static int ext4_file_open(struct inode * inode, struct file * filp) ext4_journal_stop(handle); } } + if (ext4_encrypted_inode(inode)) { + ret = ext4_get_encryption_info(inode); + if (ret) + return -EACCES; + if (ext4_encryption_info(inode) == NULL) + return -ENOKEY; + } /* * Set up the jbd2_inode if we are opening the inode for * writing and the journal is present */ if (filp->f_mode & FMODE_WRITE) { - int ret = ext4_inode_attach_jinode(inode); + ret = ext4_inode_attach_jinode(inode); if (ret < 0) return ret; } @@ -607,8 +637,6 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int whence) const struct file_operations ext4_file_operations = { .llseek = ext4_llseek, - .read = new_sync_read, - .write = new_sync_write, .read_iter = generic_file_read_iter, .write_iter = ext4_file_write_iter, .unlocked_ioctl = ext4_ioctl, @@ -624,26 +652,6 @@ const struct file_operations ext4_file_operations = { .fallocate = ext4_fallocate, }; -#ifdef 
CONFIG_FS_DAX -const struct file_operations ext4_dax_file_operations = { - .llseek = ext4_llseek, - .read = new_sync_read, - .write = new_sync_write, - .read_iter = generic_file_read_iter, - .write_iter = ext4_file_write_iter, - .unlocked_ioctl = ext4_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = ext4_compat_ioctl, -#endif - .mmap = ext4_file_mmap, - .open = ext4_file_open, - .release = ext4_release_file, - .fsync = ext4_sync_file, - /* Splice not yet supported with DAX */ - .fallocate = ext4_fallocate, -}; -#endif - const struct inode_operations ext4_file_inode_operations = { .setattr = ext4_setattr, .getattr = ext4_getattr, diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index a8bc47f75fa0..8850254136ae 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -26,7 +26,6 @@ #include <linux/fs.h> #include <linux/sched.h> #include <linux/writeback.h> -#include <linux/jbd2.h> #include <linux/blkdev.h> #include "ext4.h" @@ -56,7 +55,7 @@ static int ext4_sync_parent(struct inode *inode) dentry = d_find_any_alias(inode); if (!dentry) break; - next = igrab(dentry->d_parent->d_inode); + next = igrab(d_inode(dentry->d_parent)); dput(dentry); if (!next) break; diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index 3d586f02883e..e026aa941fd5 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -10,7 +10,6 @@ */ #include <linux/fs.h> -#include <linux/jbd2.h> #include <linux/cryptohash.h> #include "ext4.h" diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index ac644c31ca67..173c1ae21395 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -14,7 +14,6 @@ #include <linux/time.h> #include <linux/fs.h> -#include <linux/jbd2.h> #include <linux/stat.h> #include <linux/string.h> #include <linux/quotaops.h> @@ -444,7 +443,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); if (S_ISDIR(mode) && - ((parent == sb->s_root->d_inode) || + ((parent == d_inode(sb->s_root)) || (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) { int best_ndir = inodes_per_group; int ret = -1; @@ -727,11 +726,25 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, ext4_group_t i; ext4_group_t flex_group; struct ext4_group_info *grp; + int encrypt = 0; /* Cannot create files in a deleted directory */ if (!dir || !dir->i_nlink) return ERR_PTR(-EPERM); + if ((ext4_encrypted_inode(dir) || + DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))) && + (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) { + err = ext4_get_encryption_info(dir); + if (err) + return ERR_PTR(err); + if (ext4_encryption_info(dir) == NULL) + return ERR_PTR(-EPERM); + if (!handle) + nblocks += EXT4_DATA_TRANS_BLOCKS(dir->i_sb); + encrypt = 1; + } + sb = dir->i_sb; ngroups = ext4_get_groups_count(sb); trace_ext4_request_inode(dir, mode); @@ -1029,11 +1042,9 @@ got: ext4_set_inode_state(inode, EXT4_STATE_NEW); ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; - ei->i_inline_off = 0; if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_INLINE_DATA)) ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); - ret = inode; err = dquot_alloc_inode(inode); if (err) @@ -1060,6 +1071,12 @@ got: ei->i_datasync_tid = handle->h_transaction->t_tid; } + if (encrypt) { + err = ext4_inherit_context(dir, inode); + if (err) + goto fail_free_drop; + } + err = ext4_mark_inode_dirty(handle, inode); if (err) { ext4_std_error(sb, err); diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 45fe924f82bc..4f6ac499f09e 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -20,9 +20,9 @@ 
* (sct@redhat.com), 1993, 1998 */ -#include <linux/aio.h> #include "ext4_jbd2.h" #include "truncate.h" +#include <linux/uio.h> #include <trace/events/ext4.h> @@ -565,7 +565,7 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, EXT4_FEATURE_RO_COMPAT_BIGALLOC)) { EXT4_ERROR_INODE(inode, "Can't allocate blocks for " "non-extent mapped inodes with bigalloc"); - return -ENOSPC; + return -EUCLEAN; } /* Set up for the direct block allocation */ @@ -576,6 +576,8 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, ar.flags = EXT4_MB_HINT_DATA; if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) ar.flags |= EXT4_MB_DELALLOC_RESERVED; + if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) + ar.flags |= EXT4_MB_USE_RESERVED; ar.goal = ext4_find_goal(inode, map->m_lblk, partial); @@ -642,8 +644,8 @@ out: * crashes then stale disk data _may_ be exposed inside the file. But current * VFS code falls back into buffered path in that case so we are safe. */ -ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, - struct iov_iter *iter, loff_t offset) +ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter, + loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; @@ -654,7 +656,7 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, size_t count = iov_iter_count(iter); int retries = 0; - if (rw == WRITE) { + if (iov_iter_rw(iter) == WRITE) { loff_t final_size = offset + count; if (final_size > inode->i_size) { @@ -676,37 +678,38 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, } retry: - if (rw == READ && ext4_should_dioread_nolock(inode)) { + if (iov_iter_rw(iter) == READ && ext4_should_dioread_nolock(inode)) { /* * Nolock dioread optimization may be dynamically disabled * via ext4_inode_block_unlocked_dio(). Check inode's state * while holding extra i_dio_count ref. */ - atomic_inc(&inode->i_dio_count); + inode_dio_begin(inode); smp_mb(); if (unlikely(ext4_test_inode_state(inode, EXT4_STATE_DIOREAD_LOCK))) { - inode_dio_done(inode); + inode_dio_end(inode); goto locked; } if (IS_DAX(inode)) - ret = dax_do_io(rw, iocb, inode, iter, offset, + ret = dax_do_io(iocb, inode, iter, offset, ext4_get_block, NULL, 0); else - ret = __blockdev_direct_IO(rw, iocb, inode, - inode->i_sb->s_bdev, iter, offset, - ext4_get_block, NULL, NULL, 0); - inode_dio_done(inode); + ret = __blockdev_direct_IO(iocb, inode, + inode->i_sb->s_bdev, iter, + offset, ext4_get_block, NULL, + NULL, 0); + inode_dio_end(inode); } else { locked: if (IS_DAX(inode)) - ret = dax_do_io(rw, iocb, inode, iter, offset, + ret = dax_do_io(iocb, inode, iter, offset, ext4_get_block, NULL, DIO_LOCKING); else - ret = blockdev_direct_IO(rw, iocb, inode, iter, - offset, ext4_get_block); + ret = blockdev_direct_IO(iocb, inode, iter, offset, + ext4_get_block); - if (unlikely((rw & WRITE) && ret < 0)) { + if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) { loff_t isize = i_size_read(inode); loff_t end = offset + count; diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 4b143febf21f..cd944a7a99cd 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -11,11 +11,13 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
*/ + +#include <linux/fiemap.h> + #include "ext4_jbd2.h" #include "ext4.h" #include "xattr.h" #include "truncate.h" -#include <linux/fiemap.h> #define EXT4_XATTR_SYSTEM_DATA "data" #define EXT4_MIN_INLINE_DATA_SIZE ((sizeof(__le32) * EXT4_N_BLOCKS)) @@ -972,7 +974,7 @@ void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh, offset = 0; while ((void *)de < dlimit) { de_len = ext4_rec_len_from_disk(de->rec_len, inline_size); - trace_printk("de: off %u rlen %u name %*.s nlen %u ino %u\n", + trace_printk("de: off %u rlen %u name %.*s nlen %u ino %u\n", offset, de_len, de->name_len, de->name, de->name_len, le32_to_cpu(de->inode)); if (ext4_check_dir_entry(dir, NULL, de, bh, @@ -993,20 +995,18 @@ void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh, * and -EEXIST if directory entry already exists. */ static int ext4_add_dirent_to_inline(handle_t *handle, + struct ext4_filename *fname, struct dentry *dentry, struct inode *inode, struct ext4_iloc *iloc, void *inline_start, int inline_size) { - struct inode *dir = dentry->d_parent->d_inode; - const char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; + struct inode *dir = d_inode(dentry->d_parent); int err; struct ext4_dir_entry_2 *de; - err = ext4_find_dest_de(dir, inode, iloc->bh, - inline_start, inline_size, - name, namelen, &de); + err = ext4_find_dest_de(dir, inode, iloc->bh, inline_start, + inline_size, fname, &de); if (err) return err; @@ -1014,7 +1014,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle, err = ext4_journal_get_write_access(handle, iloc->bh); if (err) return err; - ext4_insert_dentry(inode, de, inline_size, name, namelen); + ext4_insert_dentry(dir, inode, de, inline_size, fname); ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size); @@ -1245,13 +1245,13 @@ out: * If succeeds, return 0. If not, extended the inline dir and copied data to * the new created block. 
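One small but genuine bugfix above: the trace_printk() format in ext4_show_inline_dir() changes from "%*.s" to "%.*s". The former consumes de->name_len as a field width with an empty (hence zero) precision, so it prints name_len spaces and none of the name; the latter takes it as the precision, which is what is needed for de->name, a byte array that is not NUL-terminated. In plain userspace C, for illustration:

	printf("[%*.s]\n", 5, "abcdefg");	/* width 5, precision 0: prints "[     ]" */
	printf("[%.*s]\n", 5, "abcdefg");	/* precision 5:          prints "[abcde]" */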
*/ -int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry, - struct inode *inode) +int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, + struct dentry *dentry, struct inode *inode) { int ret, inline_size; void *inline_start; struct ext4_iloc iloc; - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); ret = ext4_get_inode_loc(dir, &iloc); if (ret) @@ -1265,7 +1265,7 @@ int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry, EXT4_INLINE_DOTDOT_SIZE; inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; - ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc, + ret = ext4_add_dirent_to_inline(handle, fname, dentry, inode, &iloc, inline_start, inline_size); if (ret != -ENOSPC) goto out; @@ -1286,8 +1286,9 @@ int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry, if (inline_size) { inline_start = ext4_get_inline_xattr_pos(dir, &iloc); - ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc, - inline_start, inline_size); + ret = ext4_add_dirent_to_inline(handle, fname, dentry, + inode, &iloc, inline_start, + inline_size); if (ret != -ENOSPC) goto out; @@ -1327,6 +1328,7 @@ int htree_inlinedir_to_tree(struct file *dir_file, struct ext4_iloc iloc; void *dir_buf = NULL; struct ext4_dir_entry_2 fake; + struct ext4_str tmp_str; ret = ext4_get_inode_loc(inode, &iloc); if (ret) @@ -1398,8 +1400,10 @@ int htree_inlinedir_to_tree(struct file *dir_file, continue; if (de->inode == 0) continue; - err = ext4_htree_store_dirent(dir_file, - hinfo->hash, hinfo->minor_hash, de); + tmp_str.name = de->name; + tmp_str.len = de->name_len; + err = ext4_htree_store_dirent(dir_file, hinfo->hash, + hinfo->minor_hash, de, &tmp_str); if (err) { count = err; goto out; @@ -1605,6 +1609,7 @@ out: } struct buffer_head *ext4_find_inline_entry(struct inode *dir, + struct ext4_filename *fname, const struct qstr *d_name, struct ext4_dir_entry_2 **res_dir, int *has_inline_data) @@ -1626,8 +1631,8 @@ struct buffer_head *ext4_find_inline_entry(struct inode *dir, inline_start = (void *)ext4_raw_inode(&iloc)->i_block + EXT4_INLINE_DOTDOT_SIZE; inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; - ret = search_dir(iloc.bh, inline_start, inline_size, - dir, d_name, 0, res_dir); + ret = ext4_search_dir(iloc.bh, inline_start, inline_size, + dir, fname, d_name, 0, res_dir); if (ret == 1) goto out_find; if (ret < 0) @@ -1639,8 +1644,8 @@ struct buffer_head *ext4_find_inline_entry(struct inode *dir, inline_start = ext4_get_inline_xattr_pos(dir, &iloc); inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE; - ret = search_dir(iloc.bh, inline_start, inline_size, - dir, d_name, 0, res_dir); + ret = ext4_search_dir(iloc.bh, inline_start, inline_size, + dir, fname, d_name, 0, res_dir); if (ret == 1) goto out_find; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5cb9a212b86f..cecf9aa10811 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -20,7 +20,6 @@ #include <linux/fs.h> #include <linux/time.h> -#include <linux/jbd2.h> #include <linux/highuid.h> #include <linux/pagemap.h> #include <linux/quotaops.h> @@ -36,8 +35,6 @@ #include <linux/kernel.h> #include <linux/printk.h> #include <linux/slab.h> -#include <linux/ratelimit.h> -#include <linux/aio.h> #include <linux/bitops.h> #include "ext4_jbd2.h" @@ -141,7 +138,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, /* * Test whether an inode is a fast symlink. 
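Several directory helpers in these hunks (ext4_add_dirent_to_inline(), ext4_find_inline_entry(), and more below in namei.c) grow a struct ext4_filename parameter. Judging from the accessors the patch uses (fname_name(), fname_len(), fname->usr_fname, fname->hinfo, fname->crypto_buf), the structure bundles the name as the VFS passed it with the form that actually appears on disk, which differ once directory encryption is in play. Roughly, as a sketch rather than the verbatim declaration:

	struct ext4_filename {
		const struct qstr *usr_fname;	/* name as seen by the VFS */
		struct ext4_str disk_name;	/* form that is matched on disk */
		struct dx_hash_info hinfo;	/* hash of the on-disk name */
	#ifdef CONFIG_EXT4_FS_ENCRYPTION
		struct ext4_str crypto_buf;	/* backing store for the ciphertext */
	#endif
	};

It is built by ext4_fname_setup_filename() and released by ext4_fname_free_filename(), both visible in ext4_find_entry() below.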
*/ -static int ext4_inode_is_fast_symlink(struct inode *inode) +int ext4_inode_is_fast_symlink(struct inode *inode) { int ea_blocks = EXT4_I(inode)->i_file_acl ? EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0; @@ -534,6 +531,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && + !(status & EXTENT_STATUS_WRITTEN) && ext4_find_delalloc_range(inode, map->m_lblk, map->m_lblk + map->m_len - 1)) status |= EXTENT_STATUS_DELAYED; @@ -638,6 +636,7 @@ found: status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && + !(status & EXTENT_STATUS_WRITTEN) && ext4_find_delalloc_range(inode, map->m_lblk, map->m_lblk + map->m_len - 1)) status |= EXTENT_STATUS_DELAYED; @@ -657,18 +656,6 @@ has_zeroout: return retval; } -static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate) -{ - struct inode *inode = bh->b_assoc_map->host; - /* XXX: breaks on 32-bit > 16GB. Is that even supported? */ - loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits; - int err; - if (!uptodate) - return; - WARN_ON(!buffer_unwritten(bh)); - err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size); -} - /* Maximum number of blocks we map for direct IO at once. */ #define DIO_MAX_BLOCKS 4096 @@ -706,10 +693,15 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock, map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; - if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) { + if (IS_DAX(inode) && buffer_unwritten(bh)) { + /* + * dgc: I suspect unwritten conversion on ext4+DAX is + * fundamentally broken here when there are concurrent + * read/write in progress on this inode. + */ + WARN_ON_ONCE(io_end); bh->b_assoc_map = inode->i_mapping; bh->b_private = (void *)(unsigned long)iblock; - bh->b_end_io = ext4_end_io_unwritten; } if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN) set_buffer_defer_completion(bh); @@ -732,18 +724,18 @@ int ext4_get_block(struct inode *inode, sector_t iblock, * `handle' can be NULL if create is zero */ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, - ext4_lblk_t block, int create) + ext4_lblk_t block, int map_flags) { struct ext4_map_blocks map; struct buffer_head *bh; + int create = map_flags & EXT4_GET_BLOCKS_CREATE; int err; J_ASSERT(handle != NULL || create == 0); map.m_lblk = block; map.m_len = 1; - err = ext4_map_blocks(handle, inode, &map, - create ? EXT4_GET_BLOCKS_CREATE : 0); + err = ext4_map_blocks(handle, inode, &map, map_flags); if (err == 0) return create ? 
ERR_PTR(-ENOSPC) : NULL; @@ -789,11 +781,11 @@ errout: } struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, - ext4_lblk_t block, int create) + ext4_lblk_t block, int map_flags) { struct buffer_head *bh; - bh = ext4_getblk(handle, inode, block, create); + bh = ext4_getblk(handle, inode, block, map_flags); if (IS_ERR(bh)) return bh; if (!bh || buffer_uptodate(bh)) @@ -888,6 +880,95 @@ int do_journal_get_write_access(handle_t *handle, static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); + +#ifdef CONFIG_EXT4_FS_ENCRYPTION +static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, + get_block_t *get_block) +{ + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + unsigned to = from + len; + struct inode *inode = page->mapping->host; + unsigned block_start, block_end; + sector_t block; + int err = 0; + unsigned blocksize = inode->i_sb->s_blocksize; + unsigned bbits; + struct buffer_head *bh, *head, *wait[2], **wait_bh = wait; + bool decrypt = false; + + BUG_ON(!PageLocked(page)); + BUG_ON(from > PAGE_CACHE_SIZE); + BUG_ON(to > PAGE_CACHE_SIZE); + BUG_ON(from > to); + + if (!page_has_buffers(page)) + create_empty_buffers(page, blocksize, 0); + head = page_buffers(page); + bbits = ilog2(blocksize); + block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits); + + for (bh = head, block_start = 0; bh != head || !block_start; + block++, block_start = block_end, bh = bh->b_this_page) { + block_end = block_start + blocksize; + if (block_end <= from || block_start >= to) { + if (PageUptodate(page)) { + if (!buffer_uptodate(bh)) + set_buffer_uptodate(bh); + } + continue; + } + if (buffer_new(bh)) + clear_buffer_new(bh); + if (!buffer_mapped(bh)) { + WARN_ON(bh->b_size != blocksize); + err = get_block(inode, block, bh, 1); + if (err) + break; + if (buffer_new(bh)) { + unmap_underlying_metadata(bh->b_bdev, + bh->b_blocknr); + if (PageUptodate(page)) { + clear_buffer_new(bh); + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + continue; + } + if (block_end > to || block_start < from) + zero_user_segments(page, to, block_end, + block_start, from); + continue; + } + } + if (PageUptodate(page)) { + if (!buffer_uptodate(bh)) + set_buffer_uptodate(bh); + continue; + } + if (!buffer_uptodate(bh) && !buffer_delay(bh) && + !buffer_unwritten(bh) && + (block_start < from || block_end > to)) { + ll_rw_block(READ, 1, &bh); + *wait_bh++ = bh; + decrypt = ext4_encrypted_inode(inode) && + S_ISREG(inode->i_mode); + } + } + /* + * If we issued read requests, let them complete. 
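 *
 * (This helper otherwise mirrors __block_write_begin(); the extra step
 *  below is that once the reads complete, a page of an encrypted
 *  regular file holds ciphertext, so it is decrypted in place via
 *  ext4_decrypt_one() before the caller is allowed to modify it.)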
+ */ + while (wait_bh > wait) { + wait_on_buffer(*--wait_bh); + if (!buffer_uptodate(*wait_bh)) + err = -EIO; + } + if (unlikely(err)) + page_zero_new_buffers(page, from, to); + else if (decrypt) + err = ext4_decrypt_one(inode, page); + return err; +} +#endif + static int ext4_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -950,11 +1031,19 @@ retry_journal: /* In case writeback began while the page was unlocked */ wait_for_stable_page(page); +#ifdef CONFIG_EXT4_FS_ENCRYPTION + if (ext4_should_dioread_nolock(inode)) + ret = ext4_block_write_begin(page, pos, len, + ext4_get_block_write); + else + ret = ext4_block_write_begin(page, pos, len, + ext4_get_block); +#else if (ext4_should_dioread_nolock(inode)) ret = __block_write_begin(page, pos, len, ext4_get_block_write); else ret = __block_write_begin(page, pos, len, ext4_get_block); - +#endif if (!ret && ext4_should_journal_data(inode)) { ret = ext4_walk_page_buffers(handle, page_buffers(page), from, to, NULL, @@ -1165,13 +1254,12 @@ static int ext4_journalled_write_end(struct file *file, } /* - * Reserve a single cluster located at lblock + * Reserve space for a single cluster */ -static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) +static int ext4_da_reserve_space(struct inode *inode) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); - unsigned int md_needed; int ret; /* @@ -1183,25 +1271,14 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) if (ret) return ret; - /* - * recalculate the amount of metadata blocks to reserve - * in order to allocate nrblocks - * worse case is one extent per block - */ spin_lock(&ei->i_block_reservation_lock); - /* - * ext4_calc_metadata_amount() has side effects, which we have - * to be prepared undo if we fail to claim space. - */ - md_needed = 0; - trace_ext4_da_reserve_space(inode, 0); - if (ext4_claim_free_clusters(sbi, 1, 0)) { spin_unlock(&ei->i_block_reservation_lock); dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); return -ENOSPC; } ei->i_reserved_data_blocks++; + trace_ext4_da_reserve_space(inode); spin_unlock(&ei->i_block_reservation_lock); return 0; /* success */ @@ -1246,7 +1323,7 @@ static void ext4_da_page_release_reservation(struct page *page, unsigned int offset, unsigned int length) { - int to_release = 0; + int to_release = 0, contiguous_blks = 0; struct buffer_head *head, *bh; unsigned int curr_off = 0; struct inode *inode = page->mapping->host; @@ -1267,14 +1344,23 @@ static void ext4_da_page_release_reservation(struct page *page, if ((offset <= curr_off) && (buffer_delay(bh))) { to_release++; + contiguous_blks++; clear_buffer_delay(bh); + } else if (contiguous_blks) { + lblk = page->index << + (PAGE_CACHE_SHIFT - inode->i_blkbits); + lblk += (curr_off >> inode->i_blkbits) - + contiguous_blks; + ext4_es_remove_extent(inode, lblk, contiguous_blks); + contiguous_blks = 0; } curr_off = next_off; } while ((bh = bh->b_this_page) != head); - if (to_release) { + if (contiguous_blks) { lblk = page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits); - ext4_es_remove_extent(inode, lblk, to_release); + lblk += (curr_off >> inode->i_blkbits) - contiguous_blks; + ext4_es_remove_extent(inode, lblk, contiguous_blks); } /* If we have released all the blocks belonging to a cluster, then we @@ -1470,9 +1556,9 @@ add_delayed: * then we don't need to reserve it again. 
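 * (With bigalloc, the check below asks the extent status tree whether
 *  the cluster already carries a delayed block, so each cluster is
 *  reserved at most once; the cluster-ratio comparison also becomes
 *  "== 1" rather than "<= 1", since the ratio is never less than one.)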
However we still need * to reserve metadata for every block we're going to write. */ - if (EXT4_SB(inode->i_sb)->s_cluster_ratio <= 1 || + if (EXT4_SB(inode->i_sb)->s_cluster_ratio == 1 || !ext4_find_delalloc_cluster(inode, map->m_lblk)) { - ret = ext4_da_reserve_space(inode, iblock); + ret = ext4_da_reserve_space(inode); if (ret) { /* not enough space to reserve */ retval = ret; @@ -1605,19 +1691,32 @@ static int __ext4_journalled_writepage(struct page *page, ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); } - /* As soon as we unlock the page, it can go away, but we have - * references to buffers so we are safe */ + /* + * We need to release the page lock before we start the + * journal, so grab a reference so the page won't disappear + * out from under us. + */ + get_page(page); unlock_page(page); handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, ext4_writepage_trans_blocks(inode)); if (IS_ERR(handle)) { ret = PTR_ERR(handle); - goto out; + put_page(page); + goto out_no_pagelock; } - BUG_ON(!ext4_handle_valid(handle)); + lock_page(page); + put_page(page); + if (page->mapping != mapping) { + /* The page got truncated from under us */ + ext4_journal_stop(handle); + ret = 0; + goto out; + } + if (inline_data) { BUFFER_TRACE(inode_bh, "get write access"); ret = ext4_journal_get_write_access(handle, inode_bh); @@ -1643,6 +1742,8 @@ static int __ext4_journalled_writepage(struct page *page, NULL, bput_one); ext4_set_inode_state(inode, EXT4_STATE_JDATA); out: + unlock_page(page); +out_no_pagelock: brelse(inode_bh); return ret; } @@ -2576,7 +2677,12 @@ retry_journal: /* In case writeback began while the page was unlocked */ wait_for_stable_page(page); +#ifdef CONFIG_EXT4_FS_ENCRYPTION + ret = ext4_block_write_begin(page, pos, len, + ext4_da_get_block_prep); +#else ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep); +#endif if (ret < 0) { unlock_page(page); ext4_journal_stop(handle); @@ -2822,7 +2928,7 @@ static int ext4_readpage(struct file *file, struct page *page) ret = ext4_readpage_inline(inode, page); if (ret == -EAGAIN) - return mpage_readpage(page, ext4_get_block); + return ext4_mpage_readpages(page->mapping, NULL, page, 1); return ret; } @@ -2837,7 +2943,7 @@ ext4_readpages(struct file *file, struct address_space *mapping, if (ext4_has_inline_data(inode)) return 0; - return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); + return ext4_mpage_readpages(mapping, pages, NULL, nr_pages); } static void ext4_invalidatepage(struct page *page, unsigned int offset, @@ -2953,8 +3059,8 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, * if the machine crashes during the write. * */ -static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, - struct iov_iter *iter, loff_t offset) +static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, + loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; @@ -2967,8 +3073,8 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, ext4_io_end_t *io_end = NULL; /* Use the old path for reads and writes beyond i_size. */ - if (rw != WRITE || final_size > inode->i_size) - return ext4_ind_direct_IO(rw, iocb, iter, offset); + if (iov_iter_rw(iter) != WRITE || final_size > inode->i_size) + return ext4_ind_direct_IO(iocb, iter, offset); BUG_ON(iocb->private == NULL); @@ -2977,8 +3083,8 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, * conversion. 
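 *
 * Encrypted regular files never reach this code: with
 * CONFIG_EXT4_FS_ENCRYPTION, ext4_direct_IO() bails out first,
 *
 *	if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode))
 *		return 0;
 *
 * and a 0 return makes the generic code fall back to buffered I/O,
 * where pages are en/decrypted transparently.
 *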
This also disallows race between truncate() and * overwrite DIO as i_dio_count needs to be incremented under i_mutex. */ - if (rw == WRITE) - atomic_inc(&inode->i_dio_count); + if (iov_iter_rw(iter) == WRITE) + inode_dio_begin(inode); /* If we do a overwrite dio, i_mutex locking can be released */ overwrite = *((int *)iocb->private); @@ -3034,11 +3140,14 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, get_block_func = ext4_get_block_write; dio_flags = DIO_LOCKING; } +#ifdef CONFIG_EXT4_FS_ENCRYPTION + BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)); +#endif if (IS_DAX(inode)) - ret = dax_do_io(rw, iocb, inode, iter, offset, get_block_func, + ret = dax_do_io(iocb, inode, iter, offset, get_block_func, ext4_end_io_dio, dio_flags); else - ret = __blockdev_direct_IO(rw, iocb, inode, + ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, offset, get_block_func, ext4_end_io_dio, NULL, dio_flags); @@ -3079,8 +3188,8 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, } retake_lock: - if (rw == WRITE) - inode_dio_done(inode); + if (iov_iter_rw(iter) == WRITE) + inode_dio_end(inode); /* take i_mutex locking again if we do a ovewrite dio */ if (overwrite) { up_read(&EXT4_I(inode)->i_data_sem); @@ -3090,14 +3199,19 @@ retake_lock: return ret; } -static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, - struct iov_iter *iter, loff_t offset) +static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter, + loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; size_t count = iov_iter_count(iter); ssize_t ret; +#ifdef CONFIG_EXT4_FS_ENCRYPTION + if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)) + return 0; +#endif + /* * If we are doing data journalling we don't support O_DIRECT */ @@ -3108,12 +3222,12 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb, if (ext4_has_inline_data(inode)) return 0; - trace_ext4_direct_IO_enter(inode, offset, count, rw); + trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - ret = ext4_ext_direct_IO(rw, iocb, iter, offset); + ret = ext4_ext_direct_IO(iocb, iter, offset); else - ret = ext4_ind_direct_IO(rw, iocb, iter, offset); - trace_ext4_direct_IO_exit(inode, offset, count, rw, ret); + ret = ext4_ind_direct_IO(iocb, iter, offset); + trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret); return ret; } @@ -3262,6 +3376,13 @@ static int __ext4_block_zero_page_range(handle_t *handle, /* Uhhuh. Read error. Complain and punt. */ if (!buffer_uptodate(bh)) goto unlock; + if (S_ISREG(inode->i_mode) && + ext4_encrypted_inode(inode)) { + /* We expect the key to be set. 
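 * (Without the key, no valid ciphertext could be produced for the
 *  rewritten block; and since the crypto code works on whole pages,
 *  zeroing part of a block is only possible when block size equals
 *  page size, which the BUG_ON below enforces.)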
*/ + BUG_ON(!ext4_has_encryption_key(inode)); + BUG_ON(blocksize != PAGE_CACHE_SIZE); + WARN_ON_ONCE(ext4_decrypt_one(inode, page)); + } } if (ext4_should_journal_data(inode)) { BUFFER_TRACE(bh, "get write access"); @@ -4091,16 +4212,17 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) if (S_ISREG(inode->i_mode)) { inode->i_op = &ext4_file_inode_operations; - if (test_opt(inode->i_sb, DAX)) - inode->i_fop = &ext4_dax_file_operations; - else - inode->i_fop = &ext4_file_operations; + inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); } else if (S_ISDIR(inode->i_mode)) { inode->i_op = &ext4_dir_inode_operations; inode->i_fop = &ext4_dir_operations; } else if (S_ISLNK(inode->i_mode)) { - if (ext4_inode_is_fast_symlink(inode)) { + if (ext4_encrypted_inode(inode)) { + inode->i_op = &ext4_encrypted_symlink_inode_operations; + ext4_set_aops(inode); + } else if (ext4_inode_is_fast_symlink(inode)) { + inode->i_link = (char *)ei->i_data; inode->i_op = &ext4_fast_symlink_inode_operations; nd_terminate_link(ei->i_data, inode->i_size, sizeof(ei->i_data) - 1); @@ -4231,7 +4353,12 @@ static void ext4_update_other_inodes_time(struct super_block *sb, int inode_size = EXT4_INODE_SIZE(sb); oi.orig_ino = orig_ino; - ino = orig_ino & ~(inodes_per_block - 1); + /* + * Calculate the first inode in the inode table block. Inode + * numbers are one-based. That is, the first inode in a block + * (assuming 4k blocks and 256 byte inodes) is (n*16 + 1). + */ + ino = ((orig_ino - 1) & ~(inodes_per_block - 1)) + 1; for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) { if (ino == orig_ino) continue; @@ -4525,7 +4652,7 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) */ int ext4_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int error, rc = 0; int orphan = 0; const unsigned int ia_valid = attr->ia_valid; @@ -4564,8 +4691,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) ext4_journal_stop(handle); } - if (attr->ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) { + if (attr->ia_valid & ATTR_SIZE) { handle_t *handle; + loff_t oldsize = inode->i_size; + int shrink = (attr->ia_size <= inode->i_size); if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); @@ -4573,24 +4702,26 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_size > sbi->s_bitmap_maxbytes) return -EFBIG; } + if (!S_ISREG(inode->i_mode)) + return -EINVAL; if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size) inode_inc_iversion(inode); - if (S_ISREG(inode->i_mode) && + if (ext4_should_order_data(inode) && (attr->ia_size < inode->i_size)) { - if (ext4_should_order_data(inode)) { - error = ext4_begin_ordered_truncate(inode, + error = ext4_begin_ordered_truncate(inode, attr->ia_size); - if (error) - goto err_out; - } + if (error) + goto err_out; + } + if (attr->ia_size != inode->i_size) { handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto err_out; } - if (ext4_handle_valid(handle)) { + if (ext4_handle_valid(handle) && shrink) { error = ext4_orphan_add(handle, inode); orphan = 1; } @@ -4609,15 +4740,13 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) up_write(&EXT4_I(inode)->i_data_sem); ext4_journal_stop(handle); if (error) { - ext4_orphan_del(NULL, inode); + if (orphan) + ext4_orphan_del(NULL, inode); goto err_out; } - } else { - loff_t oldsize = 
inode->i_size; - - i_size_write(inode, attr->ia_size); - pagecache_isize_extended(inode, oldsize, inode->i_size); } + if (!shrink) + pagecache_isize_extended(inode, oldsize, inode->i_size); /* * Blocks are going to be removed from the inode. Wait @@ -4637,13 +4766,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) * in data=journal mode to make pages freeable. */ truncate_pagecache(inode, inode->i_size); + if (shrink) + ext4_truncate(inode); } - /* - * We want to call ext4_truncate() even if attr->ia_size == - * inode->i_size for cases like truncation of fallocated space - */ - if (attr->ia_valid & ATTR_SIZE) - ext4_truncate(inode); if (!rc) { setattr_copy(inode, attr); @@ -4673,7 +4798,7 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, struct inode *inode; unsigned long long delalloc_blocks; - inode = dentry->d_inode; + inode = d_inode(dentry); generic_fillattr(inode, stat); /* diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index f58a0d106726..1346cfa355d0 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -8,12 +8,12 @@ */ #include <linux/fs.h> -#include <linux/jbd2.h> #include <linux/capability.h> #include <linux/time.h> #include <linux/compat.h> #include <linux/mount.h> #include <linux/file.h> +#include <linux/random.h> #include <asm/uaccess.h> #include "ext4_jbd2.h" #include "ext4.h" @@ -31,14 +31,11 @@ static void memswap(void *a, void *b, size_t len) { unsigned char *ap, *bp; - unsigned char tmp; ap = (unsigned char *)a; bp = (unsigned char *)b; while (len-- > 0) { - tmp = *ap; - *ap = *bp; - *bp = tmp; + swap(*ap, *bp); ap++; bp++; } @@ -196,6 +193,16 @@ journal_err_out: return err; } +static int uuid_is_zero(__u8 u[16]) +{ + int i; + + for (i = 0; i < 16; i++) + if (u[i]) + return 0; + return 1; +} + long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -615,7 +622,78 @@ resizefs_out: } case EXT4_IOC_PRECACHE_EXTENTS: return ext4_ext_precache(inode); + case EXT4_IOC_SET_ENCRYPTION_POLICY: { +#ifdef CONFIG_EXT4_FS_ENCRYPTION + struct ext4_encryption_policy policy; + int err = 0; + + if (copy_from_user(&policy, + (struct ext4_encryption_policy __user *)arg, + sizeof(policy))) { + err = -EFAULT; + goto encryption_policy_out; + } + err = ext4_process_policy(&policy, inode); +encryption_policy_out: + return err; +#else + return -EOPNOTSUPP; +#endif + } + case EXT4_IOC_GET_ENCRYPTION_PWSALT: { + int err, err2; + struct ext4_sb_info *sbi = EXT4_SB(sb); + handle_t *handle; + + if (!ext4_sb_has_crypto(sb)) + return -EOPNOTSUPP; + if (uuid_is_zero(sbi->s_es->s_encrypt_pw_salt)) { + err = mnt_want_write_file(filp); + if (err) + return err; + handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto pwsalt_err_exit; + } + err = ext4_journal_get_write_access(handle, sbi->s_sbh); + if (err) + goto pwsalt_err_journal; + generate_random_uuid(sbi->s_es->s_encrypt_pw_salt); + err = ext4_handle_dirty_metadata(handle, NULL, + sbi->s_sbh); + pwsalt_err_journal: + err2 = ext4_journal_stop(handle); + if (err2 && !err) + err = err2; + pwsalt_err_exit: + mnt_drop_write_file(filp); + if (err) + return err; + } + if (copy_to_user((void __user *) arg, + sbi->s_es->s_encrypt_pw_salt, 16)) + return -EFAULT; + return 0; + } + case EXT4_IOC_GET_ENCRYPTION_POLICY: { +#ifdef CONFIG_EXT4_FS_ENCRYPTION + struct ext4_encryption_policy policy; + int err = 0; + + if (!ext4_encrypted_inode(inode)) + return -ENOENT; + err = ext4_get_policy(inode, &policy); + if (err) + return 
err; + if (copy_to_user((void __user *)arg, &policy, sizeof(policy))) + return -EFAULT; + return 0; +#else + return -EOPNOTSUPP; +#endif + } default: return -ENOTTY; } @@ -677,9 +755,11 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return err; } case EXT4_IOC_MOVE_EXT: - case FITRIM: case EXT4_IOC_RESIZE_FS: case EXT4_IOC_PRECACHE_EXTENTS: + case EXT4_IOC_SET_ENCRYPTION_POLICY: + case EXT4_IOC_GET_ENCRYPTION_PWSALT: + case EXT4_IOC_GET_ENCRYPTION_POLICY: break; default: return -ENOIOCTLCMD; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 8d1e60214ef0..34b610ea5030 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -26,6 +26,7 @@ #include <linux/log2.h> #include <linux/module.h> #include <linux/slab.h> +#include <linux/backing-dev.h> #include <trace/events/ext4.h> #ifdef CONFIG_EXT4_DEBUG @@ -882,10 +883,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore) /* wait for I/O completion */ for (i = 0, group = first_group; i < groups_per_page; i++, group++) { - if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i])) { + if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i])) err = -EIO; - goto out; - } } first_block = page->index * blocks_per_page; @@ -898,6 +897,11 @@ static int ext4_mb_init_cache(struct page *page, char *incore) /* skip initialized uptodate buddy */ continue; + if (!buffer_verified(bh[group - first_group])) + /* Skip faulty bitmaps */ + continue; + err = 0; + /* * data carry information regarding this * particular group in the format specified @@ -2008,7 +2012,12 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, } } -/* This is now called BEFORE we load the buddy bitmap. */ +/* + * This is now called BEFORE we load the buddy bitmap. + * Returns either 1 or 0 indicating that the group is either suitable + * for the allocation or not. In addition it can also return negative + * error code when something goes wrong. 
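 *
 * Callers must therefore distinguish "group unsuitable" (0) from
 * "lookup failed" (negative). ext4_mb_regular_allocator() below keeps
 * scanning but remembers the first failure:
 *
 *	ret = ext4_mb_good_group(ac, group, cr);
 *	if (ret <= 0) {
 *		if (!first_err)
 *			first_err = ret;
 *		continue;
 *	}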
+ */ static int ext4_mb_good_group(struct ext4_allocation_context *ac, ext4_group_t group, int cr) { @@ -2031,7 +2040,7 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { int ret = ext4_mb_init_group(ac->ac_sb, group); if (ret) - return 0; + return ret; } fragments = grp->bb_fragments; @@ -2078,7 +2087,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) { ext4_group_t ngroups, group, i; int cr; - int err = 0; + int err = 0, first_err = 0; struct ext4_sb_info *sbi; struct super_block *sb; struct ext4_buddy e4b; @@ -2145,6 +2154,7 @@ repeat: group = ac->ac_g_ex.fe_group; for (i = 0; i < ngroups; group++, i++) { + int ret = 0; cond_resched(); /* * Artificially restricted ngroups for non-extent @@ -2154,8 +2164,12 @@ repeat: group = 0; /* This now checks without needing the buddy page */ - if (!ext4_mb_good_group(ac, group, cr)) + ret = ext4_mb_good_group(ac, group, cr); + if (ret <= 0) { + if (!first_err) + first_err = ret; continue; + } err = ext4_mb_load_buddy(sb, group, &e4b); if (err) @@ -2167,9 +2181,12 @@ repeat: * We need to check again after locking the * block group */ - if (!ext4_mb_good_group(ac, group, cr)) { + ret = ext4_mb_good_group(ac, group, cr); + if (ret <= 0) { ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); + if (!first_err) + first_err = ret; continue; } @@ -2216,6 +2233,8 @@ repeat: } } out: + if (!err && ac->ac_status != AC_STATUS_FOUND && first_err) + err = first_err; return err; } @@ -2257,12 +2276,9 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) group--; if (group == 0) - seq_printf(seq, "#%-5s: %-5s %-5s %-5s " - "[ %-5s %-5s %-5s %-5s %-5s %-5s %-5s " - "%-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n", - "group", "free", "frags", "first", - "2^0", "2^1", "2^2", "2^3", "2^4", "2^5", "2^6", - "2^7", "2^8", "2^9", "2^10", "2^11", "2^12", "2^13"); + seq_puts(seq, "#group: free frags first [" + " 2^0 2^1 2^2 2^3 2^4 2^5 2^6 " + " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]"); i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + sizeof(struct ext4_group_info); @@ -4800,18 +4816,12 @@ do_more: /* * blocks being freed are metadata. these blocks shouldn't * be used until this transaction is committed + * + * We use __GFP_NOFAIL because ext4_free_blocks() is not allowed + * to fail. */ - retry: - new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS); - if (!new_entry) { - /* - * We use a retry loop because - * ext4_free_blocks() is not allowed to fail. 
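The open-coded retry loop being deleted here (cond_resched() plus congestion_wait() around an allocation that must not fail) is replaced by a single allocation flag; the new call is simply

	new_entry = kmem_cache_alloc(ext4_free_data_cachep,
				     GFP_NOFS | __GFP_NOFAIL);

__GFP_NOFAIL states the may-not-fail requirement once and lets the allocator apply its own back-off heuristics instead of a hand-rolled loop.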
- */ - cond_resched(); - congestion_wait(BLK_RW_ASYNC, HZ/50); - goto retry; - } + new_entry = kmem_cache_alloc(ext4_free_data_cachep, + GFP_NOFS|__GFP_NOFAIL); new_entry->efd_start_cluster = bit; new_entry->efd_group = block_group; new_entry->efd_count = count_clusters; diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 3cb267aee802..6163ad21cb0e 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -475,7 +475,7 @@ int ext4_ext_migrate(struct inode *inode) EXT4_INODES_PER_GROUP(inode->i_sb)) + 1; owner[0] = i_uid_read(inode); owner[1] = i_gid_read(inode); - tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, + tmp_inode = ext4_new_inode(handle, d_inode(inode->i_sb->s_root), S_IFREG, NULL, goal, owner); if (IS_ERR(tmp_inode)) { retval = PTR_ERR(tmp_inode); @@ -620,6 +620,7 @@ int ext4_ind_migrate(struct inode *inode) struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_extent *ex; unsigned int i, len; + ext4_lblk_t start, end; ext4_fsblk_t blk; handle_t *handle; int ret; @@ -633,6 +634,14 @@ int ext4_ind_migrate(struct inode *inode) EXT4_FEATURE_RO_COMPAT_BIGALLOC)) return -EOPNOTSUPP; + /* + * In order to get correct extent info, force all delayed allocation + * blocks to be allocated, otherwise delayed allocation blocks may not + * be reflected and bypass the checks on extent header. + */ + if (test_opt(inode->i_sb, DELALLOC)) + ext4_alloc_da_blocks(inode); + handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -650,11 +659,13 @@ int ext4_ind_migrate(struct inode *inode) goto errout; } if (eh->eh_entries == 0) - blk = len = 0; + blk = len = start = end = 0; else { len = le16_to_cpu(ex->ee_len); blk = ext4_ext_pblock(ex); - if (len > EXT4_NDIR_BLOCKS) { + start = le32_to_cpu(ex->ee_block); + end = start + len - 1; + if (end >= EXT4_NDIR_BLOCKS) { ret = -EOPNOTSUPP; goto errout; } @@ -662,7 +673,7 @@ int ext4_ind_migrate(struct inode *inode) ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); memset(ei->i_data, 0, sizeof(ei->i_data)); - for (i=0; i < len; i++) + for (i = start; i <= end; i++) ei->i_data[i] = cpu_to_le32(blk++); ext4_mark_inode_dirty(handle, inode); errout: diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 370420bfae8d..fb6f11709ae6 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -166,12 +166,9 @@ mext_page_double_lock(struct inode *inode1, struct inode *inode2, */ wait_on_page_writeback(page[0]); wait_on_page_writeback(page[1]); - if (inode1 > inode2) { - struct page *tmp; - tmp = page[0]; - page[0] = page[1]; - page[1] = tmp; - } + if (inode1 > inode2) + swap(page[0], page[1]); + return 0; } @@ -574,12 +571,16 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } - /* TODO: This is non obvious task to swap blocks for inodes with full - jornaling enabled */ + + /* TODO: it's not obvious how to swap blocks for inodes with full + journaling enabled */ if (ext4_should_journal_data(orig_inode) || ext4_should_journal_data(donor_inode)) { - return -EINVAL; + ext4_msg(orig_inode->i_sb, KERN_ERR, + "Online defrag not supported with data journaling"); + return -EOPNOTSUPP; } + /* Protect orig and donor inodes against a truncate */ lock_two_nondirectories(orig_inode, donor_inode); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 28fe71a2904c..011dcfb5cce3 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -26,7 +26,6 @@ #include <linux/fs.h> #include <linux/pagemap.h> -#include 
<linux/jbd2.h> #include <linux/time.h> #include <linux/fcntl.h> #include <linux/stat.h> @@ -62,7 +61,7 @@ static struct buffer_head *ext4_append(handle_t *handle, *block = inode->i_size >> inode->i_sb->s_blocksize_bits; - bh = ext4_bread(handle, inode, *block, 1); + bh = ext4_bread(handle, inode, *block, EXT4_GET_BLOCKS_CREATE); if (IS_ERR(bh)) return bh; inode->i_size += inode->i_sb->s_blocksize; @@ -85,12 +84,13 @@ typedef enum { } dirblock_type_t; #define ext4_read_dirblock(inode, block, type) \ - __ext4_read_dirblock((inode), (block), (type), __LINE__) + __ext4_read_dirblock((inode), (block), (type), __func__, __LINE__) static struct buffer_head *__ext4_read_dirblock(struct inode *inode, - ext4_lblk_t block, - dirblock_type_t type, - unsigned int line) + ext4_lblk_t block, + dirblock_type_t type, + const char *func, + unsigned int line) { struct buffer_head *bh; struct ext4_dir_entry *dirent; @@ -98,15 +98,17 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, bh = ext4_bread(NULL, inode, block, 0); if (IS_ERR(bh)) { - __ext4_warning(inode->i_sb, __func__, line, - "error %ld reading directory block " - "(ino %lu, block %lu)", PTR_ERR(bh), inode->i_ino, - (unsigned long) block); + __ext4_warning(inode->i_sb, func, line, + "inode #%lu: lblock %lu: comm %s: " + "error %ld reading directory block", + inode->i_ino, (unsigned long)block, + current->comm, PTR_ERR(bh)); return bh; } if (!bh) { - ext4_error_inode(inode, __func__, line, block, "Directory hole found"); + ext4_error_inode(inode, func, line, block, + "Directory hole found"); return ERR_PTR(-EIO); } dirent = (struct ext4_dir_entry *) bh->b_data; @@ -120,7 +122,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, is_dx_block = 1; } if (!is_dx_block && type == INDEX) { - ext4_error_inode(inode, __func__, line, block, + ext4_error_inode(inode, func, line, block, "directory leaf block found instead of index block"); return ERR_PTR(-EIO); } @@ -137,8 +139,8 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, if (ext4_dx_csum_verify(inode, dirent)) set_buffer_verified(bh); else { - ext4_error_inode(inode, __func__, line, block, - "Directory index failed checksum"); + ext4_error_inode(inode, func, line, block, + "Directory index failed checksum"); brelse(bh); return ERR_PTR(-EIO); } @@ -147,8 +149,8 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, if (ext4_dirent_csum_verify(inode, dirent)) set_buffer_verified(bh); else { - ext4_error_inode(inode, __func__, line, block, - "Directory block failed checksum"); + ext4_error_inode(inode, func, line, block, + "Directory block failed checksum"); brelse(bh); return ERR_PTR(-EIO); } @@ -249,13 +251,14 @@ static void dx_set_count(struct dx_entry *entries, unsigned value); static void dx_set_limit(struct dx_entry *entries, unsigned value); static unsigned dx_root_limit(struct inode *dir, unsigned infosize); static unsigned dx_node_limit(struct inode *dir); -static struct dx_frame *dx_probe(const struct qstr *d_name, +static struct dx_frame *dx_probe(struct ext4_filename *fname, struct inode *dir, struct dx_hash_info *hinfo, struct dx_frame *frame); static void dx_release(struct dx_frame *frames); -static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize, - struct dx_hash_info *hinfo, struct dx_map_entry map[]); +static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de, + unsigned blocksize, struct dx_hash_info *hinfo, + struct dx_map_entry map[]); static void dx_sort_map(struct dx_map_entry 
*map, unsigned count); static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to, struct dx_map_entry *offsets, int count, unsigned blocksize); @@ -267,10 +270,10 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash, struct dx_frame *frames, __u32 *start_hash); static struct buffer_head * ext4_dx_find_entry(struct inode *dir, - const struct qstr *d_name, + struct ext4_filename *fname, struct ext4_dir_entry_2 **res_dir); -static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, - struct inode *inode); +static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, + struct dentry *dentry, struct inode *inode); /* checksumming functions */ void initialize_dirent_tail(struct ext4_dir_entry_tail *t, @@ -327,10 +330,14 @@ static __le32 ext4_dirent_csum(struct inode *inode, return cpu_to_le32(csum); } -static void warn_no_space_for_csum(struct inode *inode) +#define warn_no_space_for_csum(inode) \ + __warn_no_space_for_csum((inode), __func__, __LINE__) + +static void __warn_no_space_for_csum(struct inode *inode, const char *func, + unsigned int line) { - ext4_warning(inode->i_sb, "no space in directory inode %lu leaf for " - "checksum. Please run e2fsck -D.", inode->i_ino); + __ext4_warning_inode(inode, func, line, + "No space for directory leaf checksum. Please run e2fsck -D."); } int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent) @@ -586,8 +593,10 @@ struct stats unsigned bcount; }; -static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_entry_2 *de, - int size, int show_names) +static struct stats dx_show_leaf(struct inode *dir, + struct dx_hash_info *hinfo, + struct ext4_dir_entry_2 *de, + int size, int show_names) { unsigned names = 0, space = 0; char *base = (char *) de; @@ -600,12 +609,69 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_ent { if (show_names) { +#ifdef CONFIG_EXT4_FS_ENCRYPTION + int len; + char *name; + struct ext4_str fname_crypto_str + = {.name = NULL, .len = 0}; + int res = 0; + + name = de->name; + len = de->name_len; + if (ext4_encrypted_inode(inode)) + res = ext4_get_encryption_info(dir); + if (res) { + printk(KERN_WARNING "Error setting up" + " fname crypto: %d\n", res); + } + if (ctx == NULL) { + /* Directory is not encrypted */ + ext4fs_dirhash(de->name, + de->name_len, &h); + printk("%*.s:(U)%x.%u ", len, + name, h.hash, + (unsigned) ((char *) de + - base)); + } else { + /* Directory is encrypted */ + res = ext4_fname_crypto_alloc_buffer( + ctx, de->name_len, + &fname_crypto_str); + if (res < 0) { + printk(KERN_WARNING "Error " + "allocating crypto " + "buffer--skipping " + "crypto\n"); + ctx = NULL; + } + res = ext4_fname_disk_to_usr(ctx, NULL, de, + &fname_crypto_str); + if (res < 0) { + printk(KERN_WARNING "Error " + "converting filename " + "from disk to usr" + "\n"); + name = "??"; + len = 2; + } else { + name = fname_crypto_str.name; + len = fname_crypto_str.len; + } + ext4fs_dirhash(de->name, de->name_len, + &h); + printk("%*.s:(E)%x.%u ", len, name, + h.hash, (unsigned) ((char *) de + - base)); + ext4_fname_crypto_free_buffer( + &fname_crypto_str); + } +#else int len = de->name_len; char *name = de->name; - while (len--) printk("%c", *name++); ext4fs_dirhash(de->name, de->name_len, &h); - printk(":%x.%u ", h.hash, + printk("%*.s:%x.%u ", len, name, h.hash, (unsigned) ((char *) de - base)); +#endif } space += EXT4_DIR_REC_LEN(de->name_len); names++; @@ -623,7 +689,6 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, 
struct inode *dir, unsigned count = dx_get_count(entries), names = 0, space = 0, i; unsigned bcount = 0; struct buffer_head *bh; - int err; printk("%i indexed blocks...\n", count); for (i = 0; i < count; i++, entries++) { @@ -637,7 +702,8 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, continue; stats = levels? dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1): - dx_show_leaf(hinfo, (struct ext4_dir_entry_2 *) bh->b_data, blocksize, 0); + dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) + bh->b_data, blocksize, 0); names += stats.names; space += stats.space; bcount += stats.bcount; @@ -661,7 +727,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, * back to userspace. */ static struct dx_frame * -dx_probe(const struct qstr *d_name, struct inode *dir, +dx_probe(struct ext4_filename *fname, struct inode *dir, struct dx_hash_info *hinfo, struct dx_frame *frame_in) { unsigned count, indirect; @@ -679,36 +745,41 @@ dx_probe(const struct qstr *d_name, struct inode *dir, if (root->info.hash_version != DX_HASH_TEA && root->info.hash_version != DX_HASH_HALF_MD4 && root->info.hash_version != DX_HASH_LEGACY) { - ext4_warning(dir->i_sb, "Unrecognised inode hash code %d", - root->info.hash_version); + ext4_warning_inode(dir, "Unrecognised inode hash code %u", + root->info.hash_version); goto fail; } + if (fname) + hinfo = &fname->hinfo; hinfo->hash_version = root->info.hash_version; if (hinfo->hash_version <= DX_HASH_TEA) hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; - if (d_name) - ext4fs_dirhash(d_name->name, d_name->len, hinfo); + if (fname && fname_name(fname)) + ext4fs_dirhash(fname_name(fname), fname_len(fname), hinfo); hash = hinfo->hash; if (root->info.unused_flags & 1) { - ext4_warning(dir->i_sb, "Unimplemented inode hash flags: %#06x", - root->info.unused_flags); + ext4_warning_inode(dir, "Unimplemented hash flags: %#06x", + root->info.unused_flags); goto fail; } - if ((indirect = root->info.indirect_levels) > 1) { - ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x", - root->info.indirect_levels); + indirect = root->info.indirect_levels; + if (indirect > 1) { + ext4_warning_inode(dir, "Unimplemented hash depth: %#06x", + root->info.indirect_levels); goto fail; } - entries = (struct dx_entry *) (((char *)&root->info) + - root->info.info_length); + entries = (struct dx_entry *)(((char *)&root->info) + + root->info.info_length); if (dx_get_limit(entries) != dx_root_limit(dir, root->info.info_length)) { - ext4_warning(dir->i_sb, "dx entry: limit != root limit"); + ext4_warning_inode(dir, "dx entry: limit %u != root limit %u", + dx_get_limit(entries), + dx_root_limit(dir, root->info.info_length)); goto fail; } @@ -716,15 +787,16 @@ dx_probe(const struct qstr *d_name, struct inode *dir, while (1) { count = dx_get_count(entries); if (!count || count > dx_get_limit(entries)) { - ext4_warning(dir->i_sb, - "dx entry: no count or count > limit"); + ext4_warning_inode(dir, + "dx entry: count %u beyond limit %u", + count, dx_get_limit(entries)); goto fail; } p = entries + 1; q = entries + count - 1; while (p <= q) { - m = p + (q - p)/2; + m = p + (q - p) / 2; dxtrace(printk(".")); if (dx_get_hash(m) > hash) q = m - 1; @@ -748,7 +820,8 @@ dx_probe(const struct qstr *d_name, struct inode *dir, } at = p - 1; - dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at))); + dxtrace(printk(" %x->%u\n", at == entries ? 
0 : dx_get_hash(at), + dx_get_block(at))); frame->entries = entries; frame->at = at; if (!indirect--) @@ -762,9 +835,10 @@ dx_probe(const struct qstr *d_name, struct inode *dir, } entries = ((struct dx_node *) frame->bh->b_data)->entries; - if (dx_get_limit(entries) != dx_node_limit (dir)) { - ext4_warning(dir->i_sb, - "dx entry: limit != node limit"); + if (dx_get_limit(entries) != dx_node_limit(dir)) { + ext4_warning_inode(dir, + "dx entry: limit %u != node limit %u", + dx_get_limit(entries), dx_node_limit(dir)); goto fail; } } @@ -773,19 +847,19 @@ fail: brelse(frame->bh); frame--; } + if (ret_err == ERR_PTR(ERR_BAD_DX_DIR)) - ext4_warning(dir->i_sb, - "Corrupt dir inode %lu, running e2fsck is " - "recommended.", dir->i_ino); + ext4_warning_inode(dir, + "Corrupt directory, running e2fsck is recommended"); return ret_err; } -static void dx_release (struct dx_frame *frames) +static void dx_release(struct dx_frame *frames) { if (frames[0].bh == NULL) return; - if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels) + if (((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels) brelse(frames[1].bh); brelse(frames[0].bh); } @@ -878,6 +952,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, struct buffer_head *bh; struct ext4_dir_entry_2 *de, *top; int err = 0, count = 0; + struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}, tmp_str; dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n", (unsigned long)block)); @@ -889,6 +964,22 @@ static int htree_dirblock_to_tree(struct file *dir_file, top = (struct ext4_dir_entry_2 *) ((char *) de + dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0)); +#ifdef CONFIG_EXT4_FS_ENCRYPTION + /* Check if the directory is encrypted */ + if (ext4_encrypted_inode(dir)) { + err = ext4_get_encryption_info(dir); + if (err < 0) { + brelse(bh); + return err; + } + err = ext4_fname_crypto_alloc_buffer(dir, EXT4_NAME_LEN, + &fname_crypto_str); + if (err < 0) { + brelse(bh); + return err; + } + } +#endif for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data, bh->b_size, @@ -904,14 +995,38 @@ static int htree_dirblock_to_tree(struct file *dir_file, continue; if (de->inode == 0) continue; - if ((err = ext4_htree_store_dirent(dir_file, - hinfo->hash, hinfo->minor_hash, de)) != 0) { - brelse(bh); - return err; + if (!ext4_encrypted_inode(dir)) { + tmp_str.name = de->name; + tmp_str.len = de->name_len; + err = ext4_htree_store_dirent(dir_file, + hinfo->hash, hinfo->minor_hash, de, + &tmp_str); + } else { + int save_len = fname_crypto_str.len; + + /* Directory is encrypted */ + err = ext4_fname_disk_to_usr(dir, hinfo, de, + &fname_crypto_str); + if (err < 0) { + count = err; + goto errout; + } + err = ext4_htree_store_dirent(dir_file, + hinfo->hash, hinfo->minor_hash, de, + &fname_crypto_str); + fname_crypto_str.len = save_len; + } + if (err != 0) { + count = err; + goto errout; } count++; } +errout: brelse(bh); +#ifdef CONFIG_EXT4_FS_ENCRYPTION + ext4_fname_crypto_free_buffer(&fname_crypto_str); +#endif return count; } @@ -935,6 +1050,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, int count = 0; int ret, err; __u32 hashval; + struct ext4_str tmp_str; dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n", start_hash, start_minor_hash)); @@ -970,14 +1086,22 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, /* Add '.' and '..' 
from the htree header */ if (!start_hash && !start_minor_hash) { de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; - if ((err = ext4_htree_store_dirent(dir_file, 0, 0, de)) != 0) + tmp_str.name = de->name; + tmp_str.len = de->name_len; + err = ext4_htree_store_dirent(dir_file, 0, 0, + de, &tmp_str); + if (err != 0) goto errout; count++; } if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) { de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data; de = ext4_next_entry(de, dir->i_sb->s_blocksize); - if ((err = ext4_htree_store_dirent(dir_file, 2, 0, de)) != 0) + tmp_str.name = de->name; + tmp_str.len = de->name_len; + err = ext4_htree_store_dirent(dir_file, 2, 0, + de, &tmp_str); + if (err != 0) goto errout; count++; } @@ -1019,12 +1143,13 @@ errout: static inline int search_dirblock(struct buffer_head *bh, struct inode *dir, + struct ext4_filename *fname, const struct qstr *d_name, unsigned int offset, struct ext4_dir_entry_2 **res_dir) { - return search_dir(bh, bh->b_data, dir->i_sb->s_blocksize, dir, - d_name, offset, res_dir); + return ext4_search_dir(bh, bh->b_data, dir->i_sb->s_blocksize, dir, + fname, d_name, offset, res_dir); } /* @@ -1035,8 +1160,8 @@ static inline int search_dirblock(struct buffer_head *bh, * Create map of hash values, offsets, and sizes, stored at end of block. * Returns number of entries mapped. */ -static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize, - struct dx_hash_info *hinfo, +static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de, + unsigned blocksize, struct dx_hash_info *hinfo, struct dx_map_entry *map_tail) { int count = 0; @@ -1106,57 +1231,87 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block) * `len <= EXT4_NAME_LEN' is guaranteed by caller. * `de != NULL' is guaranteed by caller. */ -static inline int ext4_match (int len, const char * const name, - struct ext4_dir_entry_2 * de) +static inline int ext4_match(struct ext4_filename *fname, + struct ext4_dir_entry_2 *de) { - if (len != de->name_len) - return 0; + const void *name = fname_name(fname); + u32 len = fname_len(fname); + if (!de->inode) return 0; - return !memcmp(name, de->name, len); + +#ifdef CONFIG_EXT4_FS_ENCRYPTION + if (unlikely(!name)) { + if (fname->usr_fname->name[0] == '_') { + int ret; + if (de->name_len < 16) + return 0; + ret = memcmp(de->name + de->name_len - 16, + fname->crypto_buf.name + 8, 16); + return (ret == 0) ? 1 : 0; + } + name = fname->crypto_buf.name; + len = fname->crypto_buf.len; + } +#endif + if (de->name_len != len) + return 0; + return (memcmp(de->name, name, len) == 0) ? 
1 : 0; } /* * Returns 0 if not found, -1 on failure, and 1 on success */ -int search_dir(struct buffer_head *bh, - char *search_buf, - int buf_size, - struct inode *dir, - const struct qstr *d_name, - unsigned int offset, - struct ext4_dir_entry_2 **res_dir) +int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size, + struct inode *dir, struct ext4_filename *fname, + const struct qstr *d_name, + unsigned int offset, struct ext4_dir_entry_2 **res_dir) { struct ext4_dir_entry_2 * de; char * dlimit; int de_len; - const char *name = d_name->name; - int namelen = d_name->len; + int res; de = (struct ext4_dir_entry_2 *)search_buf; dlimit = search_buf + buf_size; while ((char *) de < dlimit) { /* this code is executed quadratically often */ /* do minimal checking `by hand' */ + if ((char *) de + de->name_len <= dlimit) { + res = ext4_match(fname, de); + if (res < 0) { + res = -1; + goto return_result; + } + if (res > 0) { + /* found a match - just to be sure, do + * a full check */ + if (ext4_check_dir_entry(dir, NULL, de, bh, + bh->b_data, + bh->b_size, offset)) { + res = -1; + goto return_result; + } + *res_dir = de; + res = 1; + goto return_result; + } - if ((char *) de + namelen <= dlimit && - ext4_match (namelen, name, de)) { - /* found a match - just to be sure, do a full check */ - if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data, - bh->b_size, offset)) - return -1; - *res_dir = de; - return 1; } /* prevent looping on a bad block */ de_len = ext4_rec_len_from_disk(de->rec_len, dir->i_sb->s_blocksize); - if (de_len <= 0) - return -1; + if (de_len <= 0) { + res = -1; + goto return_result; + } offset += de_len; de = (struct ext4_dir_entry_2 *) ((char *) de + de_len); } - return 0; + + res = 0; +return_result: + return res; } static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block, @@ -1202,7 +1357,8 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, buffer */ int num = 0; ext4_lblk_t nblocks; - int i, namelen; + int i, namelen, retval; + struct ext4_filename fname; *res_dir = NULL; sb = dir->i_sb; @@ -1210,14 +1366,18 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, if (namelen > EXT4_NAME_LEN) return NULL; + retval = ext4_fname_setup_filename(dir, d_name, 1, &fname); + if (retval) + return ERR_PTR(retval); + if (ext4_has_inline_data(dir)) { int has_inline_data = 1; - ret = ext4_find_inline_entry(dir, d_name, res_dir, + ret = ext4_find_inline_entry(dir, &fname, d_name, res_dir, &has_inline_data); if (has_inline_data) { if (inlined) *inlined = 1; - return ret; + goto cleanup_and_exit; } } @@ -1232,14 +1392,14 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, goto restart; } if (is_dx(dir)) { - bh = ext4_dx_find_entry(dir, d_name, res_dir); + ret = ext4_dx_find_entry(dir, &fname, res_dir); /* * On success, or if the error was file not found, * return. Otherwise, fall back to doing a search the * old fashioned way. 
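A note on the new ext4_match() above: when the directory is encrypted and no key is available, fname_name(fname) is NULL and the lookup operates on escaped names. For long names the user-visible form starts with '_' and, as far as one can tell from this hunk, encodes the directory-hash values followed by the last 16 bytes of the on-disk (encrypted) name, so matching reduces to comparing that tail:

	if (de->name_len < 16)
		return 0;
	ret = memcmp(de->name + de->name_len - 16,
		     fname->crypto_buf.name + 8, 16);
	return (ret == 0) ? 1 : 0;

(The "+ 8" skipping the two hash words is my reading of the layout; the escaping scheme itself lives in the fname crypto helpers, not in this diff.)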
*/ - if (!IS_ERR(bh) || PTR_ERR(bh) != ERR_BAD_DX_DIR) - return bh; + if (!IS_ERR(ret) || PTR_ERR(ret) != ERR_BAD_DX_DIR) + goto cleanup_and_exit; dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " "falling back\n")); } @@ -1270,8 +1430,10 @@ restart: num++; bh = ext4_getblk(NULL, dir, b++, 0); if (unlikely(IS_ERR(bh))) { - if (ra_max == 0) - return bh; + if (ra_max == 0) { + ret = bh; + goto cleanup_and_exit; + } break; } bh_use[ra_max] = bh; @@ -1301,7 +1463,7 @@ restart: goto next; } set_buffer_verified(bh); - i = search_dirblock(bh, dir, d_name, + i = search_dirblock(bh, dir, &fname, d_name, block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); if (i == 1) { EXT4_I(dir)->i_dir_start_lookup = block; @@ -1332,20 +1494,25 @@ cleanup_and_exit: /* Clean up the read-ahead blocks */ for (; ra_ptr < ra_max; ra_ptr++) brelse(bh_use[ra_ptr]); + ext4_fname_free_filename(&fname); return ret; } -static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name, - struct ext4_dir_entry_2 **res_dir) +static struct buffer_head * ext4_dx_find_entry(struct inode *dir, + struct ext4_filename *fname, + struct ext4_dir_entry_2 **res_dir) { struct super_block * sb = dir->i_sb; - struct dx_hash_info hinfo; struct dx_frame frames[2], *frame; + const struct qstr *d_name = fname->usr_fname; struct buffer_head *bh; ext4_lblk_t block; int retval; - frame = dx_probe(d_name, dir, &hinfo, frames); +#ifdef CONFIG_EXT4_FS_ENCRYPTION + *res_dir = NULL; +#endif + frame = dx_probe(fname, dir, NULL, frames); if (IS_ERR(frame)) return (struct buffer_head *) frame; do { @@ -1354,7 +1521,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q if (IS_ERR(bh)) goto errout; - retval = search_dirblock(bh, dir, d_name, + retval = search_dirblock(bh, dir, fname, d_name, block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); if (retval == 1) @@ -1366,12 +1533,12 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q } /* Check to see if we should continue to search */ - retval = ext4_htree_next_block(dir, hinfo.hash, frame, + retval = ext4_htree_next_block(dir, fname->hinfo.hash, frame, frames, NULL); if (retval < 0) { - ext4_warning(sb, - "error %d reading index page in directory #%lu", - retval, dir->i_ino); + ext4_warning_inode(dir, + "error %d reading directory index block", + retval); bh = ERR_PTR(retval); goto errout; } @@ -1417,6 +1584,18 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi ino); return ERR_PTR(-EIO); } + if (!IS_ERR(inode) && ext4_encrypted_inode(dir) && + (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) && + !ext4_is_child_context_consistent_with_parent(dir, + inode)) { + iput(inode); + ext4_warning(inode->i_sb, + "Inconsistent encryption contexts: %lu/%lu\n", + (unsigned long) dir->i_ino, + (unsigned long) inode->i_ino); + return ERR_PTR(-EPERM); + } } return d_splice_alias(inode, dentry); } @@ -1429,7 +1608,7 @@ struct dentry *ext4_get_parent(struct dentry *child) struct ext4_dir_entry_2 * de; struct buffer_head *bh; - bh = ext4_find_entry(child->d_inode, &dotdot, &de, NULL); + bh = ext4_find_entry(d_inode(child), &dotdot, &de, NULL); if (IS_ERR(bh)) return (struct dentry *) bh; if (!bh) @@ -1437,13 +1616,13 @@ struct dentry *ext4_get_parent(struct dentry *child) ino = le32_to_cpu(de->inode); brelse(bh); - if (!ext4_valid_inum(child->d_inode->i_sb, ino)) { - EXT4_ERROR_INODE(child->d_inode, + if (!ext4_valid_inum(d_inode(child)->i_sb, ino)) { + 
EXT4_ERROR_INODE(d_inode(child), "bad parent inode number: %u", ino); return ERR_PTR(-EIO); } - return d_obtain_alias(ext4_iget_normal(child->d_inode->i_sb, ino)); + return d_obtain_alias(ext4_iget_normal(d_inode(child)->i_sb, ino)); } /* @@ -1541,7 +1720,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, /* create map in the end of data2 block */ map = (struct dx_map_entry *) (data2 + blocksize); - count = dx_make_map((struct ext4_dir_entry_2 *) data1, + count = dx_make_map(dir, (struct ext4_dir_entry_2 *) data1, blocksize, hinfo, map); map -= count; dx_sort_map(map, count); @@ -1564,7 +1743,8 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, hash2, split, count-split)); /* Fancy dance to stay within two buffers */ - de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize); + de2 = dx_move_dirents(data1, data2, map + split, count - split, + blocksize); de = dx_pack_dirents(data1, blocksize); de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) - (char *) de, @@ -1580,8 +1760,10 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, initialize_dirent_tail(t, blocksize); } - dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1)); - dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1)); + dxtrace(dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) data1, + blocksize, 1)); + dxtrace(dx_show_leaf(dir, hinfo, (struct ext4_dir_entry_2 *) data2, + blocksize, 1)); /* Which block gets the new entry? */ if (hinfo->hash >= hash2) { @@ -1610,23 +1792,32 @@ journal_error: int ext4_find_dest_de(struct inode *dir, struct inode *inode, struct buffer_head *bh, void *buf, int buf_size, - const char *name, int namelen, + struct ext4_filename *fname, struct ext4_dir_entry_2 **dest_de) { struct ext4_dir_entry_2 *de; - unsigned short reclen = EXT4_DIR_REC_LEN(namelen); + unsigned short reclen = EXT4_DIR_REC_LEN(fname_len(fname)); int nlen, rlen; unsigned int offset = 0; char *top; + int res; de = (struct ext4_dir_entry_2 *)buf; top = buf + buf_size - reclen; while ((char *) de <= top) { if (ext4_check_dir_entry(dir, NULL, de, bh, - buf, buf_size, offset)) - return -EIO; - if (ext4_match(namelen, name, de)) - return -EEXIST; + buf, buf_size, offset)) { + res = -EIO; + goto return_result; + } + /* Provide crypto context and crypto buffer to ext4 match */ + res = ext4_match(fname, de); + if (res < 0) + goto return_result; + if (res > 0) { + res = -EEXIST; + goto return_result; + } nlen = EXT4_DIR_REC_LEN(de->name_len); rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); if ((de->inode ? 
rlen - nlen : rlen) >= reclen) @@ -1634,17 +1825,22 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, de = (struct ext4_dir_entry_2 *)((char *)de + rlen); offset += rlen; } - if ((char *) de > top) - return -ENOSPC; - *dest_de = de; - return 0; + if ((char *) de > top) + res = -ENOSPC; + else { + *dest_de = de; + res = 0; + } +return_result: + return res; } -void ext4_insert_dentry(struct inode *inode, - struct ext4_dir_entry_2 *de, - int buf_size, - const char *name, int namelen) +int ext4_insert_dentry(struct inode *dir, + struct inode *inode, + struct ext4_dir_entry_2 *de, + int buf_size, + struct ext4_filename *fname) { int nlen, rlen; @@ -1653,7 +1849,7 @@ void ext4_insert_dentry(struct inode *inode, rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); if (de->inode) { struct ext4_dir_entry_2 *de1 = - (struct ext4_dir_entry_2 *)((char *)de + nlen); + (struct ext4_dir_entry_2 *)((char *)de + nlen); de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, buf_size); de->rec_len = ext4_rec_len_to_disk(nlen, buf_size); de = de1; @@ -1661,9 +1857,11 @@ void ext4_insert_dentry(struct inode *inode, de->file_type = EXT4_FT_UNKNOWN; de->inode = cpu_to_le32(inode->i_ino); ext4_set_de_type(inode->i_sb, de, inode->i_mode); - de->name_len = namelen; - memcpy(de->name, name, namelen); + de->name_len = fname_len(fname); + memcpy(de->name, fname_name(fname), fname_len(fname)); + return 0; } + /* * Add a new entry into a directory (leaf) block. If de is non-NULL, * it points to a directory entry which is guaranteed to be large @@ -1672,13 +1870,11 @@ void ext4_insert_dentry(struct inode *inode, * space. It will return -ENOSPC if no space is available, and -EIO * and -EEXIST if directory entry already exists. */ -static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, +static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, + struct inode *dir, struct inode *inode, struct ext4_dir_entry_2 *de, struct buffer_head *bh) { - struct inode *dir = dentry->d_parent->d_inode; - const char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; unsigned int blocksize = dir->i_sb->s_blocksize; int csum_size = 0; int err; @@ -1687,9 +1883,8 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, csum_size = sizeof(struct ext4_dir_entry_tail); if (!de) { - err = ext4_find_dest_de(dir, inode, - bh, bh->b_data, blocksize - csum_size, - name, namelen, &de); + err = ext4_find_dest_de(dir, inode, bh, bh->b_data, + blocksize - csum_size, fname, &de); if (err) return err; } @@ -1700,8 +1895,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, return err; } - /* By now the buffer is marked for journaling */ - ext4_insert_dentry(inode, de, blocksize, name, namelen); + /* By now the buffer is marked for journaling. Due to crypto operations, + * the following function call may fail */ + err = ext4_insert_dentry(dir, inode, de, blocksize, fname); + if (err < 0) + return err; /* * XXX shouldn't update any times until successful @@ -1729,12 +1927,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, * This converts a one block unindexed directory to a 3 block indexed * directory, and adds the dentry to the indexed directory. 
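[Editor's note] For reference, the slack test kept in ext4_find_dest_de() above -- (de->inode ? rlen - nlen : rlen) >= reclen -- means a live entry can only donate its record length minus what its own name occupies, while a deleted entry (inode == 0) donates its whole record. A worked example; the padding macro mirrors EXT4_DIR_REC_LEN (8-byte fixed part, 4-byte rounding) but is illustrative, not the kernel definition:

#include <stdio.h>

#define REC_LEN(name_len) (((name_len) + 8 + 3) & ~3U)

int main(void)
{
    unsigned rlen = 40;            /* record length of a live entry  */
    unsigned nlen = REC_LEN(5);    /* its own name occupies 16 bytes */
    unsigned reclen = REC_LEN(11); /* the new entry needs 20 bytes   */

    printf("slack %u, need %u -> %s\n", rlen - nlen, reclen,
           rlen - nlen >= reclen ? "fits" : "does not fit");
    return 0;
}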
*/ -static int make_indexed_dir(handle_t *handle, struct dentry *dentry, +static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, + struct dentry *dentry, struct inode *inode, struct buffer_head *bh) { - struct inode *dir = dentry->d_parent->d_inode; - const char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; + struct inode *dir = d_inode(dentry->d_parent); struct buffer_head *bh2; struct dx_root *root; struct dx_frame frames[2], *frame; @@ -1745,10 +1942,9 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, unsigned len; int retval; unsigned blocksize; - struct dx_hash_info hinfo; ext4_lblk_t block; struct fake_dirent *fde; - int csum_size = 0; + int csum_size = 0; if (ext4_has_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); @@ -1811,11 +2007,12 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info))); /* Initialize as for dx_probe */ - hinfo.hash_version = root->info.hash_version; - if (hinfo.hash_version <= DX_HASH_TEA) - hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; - hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; - ext4fs_dirhash(name, namelen, &hinfo); + fname->hinfo.hash_version = root->info.hash_version; + if (fname->hinfo.hash_version <= DX_HASH_TEA) + fname->hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; + fname->hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; + ext4fs_dirhash(fname_name(fname), fname_len(fname), &fname->hinfo); + memset(frames, 0, sizeof(frames)); frame = frames; frame->entries = entries; @@ -1830,14 +2027,14 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, if (retval) goto out_frames; - de = do_split(handle,dir, &bh, frame, &hinfo); + de = do_split(handle,dir, &bh, frame, &fname->hinfo); if (IS_ERR(de)) { retval = PTR_ERR(de); goto out_frames; } dx_release(frames); - retval = add_dirent_to_buf(handle, dentry, inode, de, bh); + retval = add_dirent_to_buf(handle, fname, dir, inode, de, bh); brelse(bh); return retval; out_frames: @@ -1864,11 +2061,12 @@ out_frames: static int ext4_add_entry(handle_t *handle, struct dentry *dentry, struct inode *inode) { - struct inode *dir = dentry->d_parent->d_inode; - struct buffer_head *bh; + struct inode *dir = d_inode(dentry->d_parent); + struct buffer_head *bh = NULL; struct ext4_dir_entry_2 *de; struct ext4_dir_entry_tail *t; struct super_block *sb; + struct ext4_filename fname; int retval; int dx_fallback=0; unsigned blocksize; @@ -1883,20 +2081,25 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, if (!dentry->d_name.len) return -EINVAL; + retval = ext4_fname_setup_filename(dir, &dentry->d_name, 0, &fname); + if (retval) + return retval; + if (ext4_has_inline_data(dir)) { - retval = ext4_try_add_inline_entry(handle, dentry, inode); + retval = ext4_try_add_inline_entry(handle, &fname, + dentry, inode); if (retval < 0) - return retval; + goto out; if (retval == 1) { retval = 0; - return retval; + goto out; } } if (is_dx(dir)) { - retval = ext4_dx_add_entry(handle, dentry, inode); + retval = ext4_dx_add_entry(handle, &fname, dentry, inode); if (!retval || (retval != ERR_BAD_DX_DIR)) - return retval; + goto out; ext4_clear_inode_flag(dir, EXT4_INODE_INDEX); dx_fallback++; ext4_mark_inode_dirty(handle, dir); @@ -1904,23 +2107,31 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, blocks = dir->i_size >> sb->s_blocksize_bits; for (block = 0; block < blocks; block++) { bh = 
ext4_read_dirblock(dir, block, DIRENT); - if (IS_ERR(bh)) - return PTR_ERR(bh); - - retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh); - if (retval != -ENOSPC) { - brelse(bh); - return retval; + if (IS_ERR(bh)) { + retval = PTR_ERR(bh); + bh = NULL; + goto out; } + retval = add_dirent_to_buf(handle, &fname, dir, inode, + NULL, bh); + if (retval != -ENOSPC) + goto out; if (blocks == 1 && !dx_fallback && - EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) - return make_indexed_dir(handle, dentry, inode, bh); + EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) { + retval = make_indexed_dir(handle, &fname, dentry, + inode, bh); + bh = NULL; /* make_indexed_dir releases bh */ + goto out; + } brelse(bh); } bh = ext4_append(handle, dir, &block); - if (IS_ERR(bh)) - return PTR_ERR(bh); + if (IS_ERR(bh)) { + retval = PTR_ERR(bh); + bh = NULL; + goto out; + } de = (struct ext4_dir_entry_2 *) bh->b_data; de->inode = 0; de->rec_len = ext4_rec_len_to_disk(blocksize - csum_size, blocksize); @@ -1930,7 +2141,9 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, initialize_dirent_tail(t, blocksize); } - retval = add_dirent_to_buf(handle, dentry, inode, de, bh); + retval = add_dirent_to_buf(handle, &fname, dir, inode, de, bh); +out: + ext4_fname_free_filename(&fname); brelse(bh); if (retval == 0) ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY); @@ -1940,19 +2153,18 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, /* * Returns 0 for success, or a negative error value */ -static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, - struct inode *inode) +static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, + struct dentry *dentry, struct inode *inode) { struct dx_frame frames[2], *frame; struct dx_entry *entries, *at; - struct dx_hash_info hinfo; struct buffer_head *bh; - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); struct super_block *sb = dir->i_sb; struct ext4_dir_entry_2 *de; int err; - frame = dx_probe(&dentry->d_name, dir, &hinfo, frames); + frame = dx_probe(fname, dir, NULL, frames); if (IS_ERR(frame)) return PTR_ERR(frame); entries = frame->entries; @@ -1969,7 +2181,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (err) goto journal_error; - err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); + err = add_dirent_to_buf(handle, fname, dir, inode, NULL, bh); if (err != -ENOSPC) goto cleanup; @@ -1987,7 +2199,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (levels && (dx_get_count(frames->entries) == dx_get_limit(frames->entries))) { - ext4_warning(sb, "Directory index full!"); + ext4_warning_inode(dir, "Directory index full!"); err = -ENOSPC; goto cleanup; } @@ -2065,12 +2277,12 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, goto cleanup; } } - de = do_split(handle, dir, &bh, frame, &hinfo); + de = do_split(handle, dir, &bh, frame, &fname->hinfo); if (IS_ERR(de)) { err = PTR_ERR(de); goto cleanup; } - err = add_dirent_to_buf(handle, dentry, inode, de, bh); + err = add_dirent_to_buf(handle, fname, dir, inode, de, bh); goto cleanup; journal_error: @@ -2235,10 +2447,7 @@ retry: err = PTR_ERR(inode); if (!IS_ERR(inode)) { inode->i_op = &ext4_file_inode_operations; - if (test_opt(inode->i_sb, DAX)) - inode->i_fop = &ext4_dax_file_operations; - else - inode->i_fop = &ext4_file_operations; + inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); err = 
ext4_add_nondir(handle, dentry, inode); if (!err && IS_DIRSYNC(dir)) @@ -2302,10 +2511,7 @@ retry: err = PTR_ERR(inode); if (!IS_ERR(inode)) { inode->i_op = &ext4_file_inode_operations; - if (test_opt(inode->i_sb, DAX)) - inode->i_fop = &ext4_dax_file_operations; - else - inode->i_fop = &ext4_file_operations; + inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); d_tmpfile(dentry, inode); err = ext4_orphan_add(handle, inode); @@ -2456,7 +2662,7 @@ out_stop: /* * routine to check that the specified directory is empty (for rmdir) */ -static int empty_dir(struct inode *inode) +int ext4_empty_dir(struct inode *inode) { unsigned int offset; struct buffer_head *bh; @@ -2484,12 +2690,9 @@ static int empty_dir(struct inode *inode) de = (struct ext4_dir_entry_2 *) bh->b_data; de1 = ext4_next_entry(de, sb->s_blocksize); if (le32_to_cpu(de->inode) != inode->i_ino || - !le32_to_cpu(de1->inode) || - strcmp(".", de->name) || - strcmp("..", de1->name)) { - ext4_warning(inode->i_sb, - "bad directory (dir #%lu) - no `.' or `..'", - inode->i_ino); + le32_to_cpu(de1->inode) == 0 || + strcmp(".", de->name) || strcmp("..", de1->name)) { + ext4_warning_inode(inode, "directory missing '.' and/or '..'"); brelse(bh); return 1; } @@ -2708,7 +2911,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) /* Initialize quotas before so that eventual writes go in * separate transaction */ dquot_initialize(dir); - dquot_initialize(dentry->d_inode); + dquot_initialize(d_inode(dentry)); retval = -ENOENT; bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); @@ -2717,14 +2920,14 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) if (!bh) goto end_rmdir; - inode = dentry->d_inode; + inode = d_inode(dentry); retval = -EIO; if (le32_to_cpu(de->inode) != inode->i_ino) goto end_rmdir; retval = -ENOTEMPTY; - if (!empty_dir(inode)) + if (!ext4_empty_dir(inode)) goto end_rmdir; handle = ext4_journal_start(dir, EXT4_HT_DIR, @@ -2742,8 +2945,9 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) if (retval) goto end_rmdir; if (!EXT4_DIR_LINK_EMPTY(inode)) - ext4_warning(inode->i_sb, - "empty directory has too many links (%d)", + ext4_warning_inode(inode, + "empty directory '%.*s' has too many links (%u)", + dentry->d_name.len, dentry->d_name.name, inode->i_nlink); inode->i_version++; clear_nlink(inode); @@ -2777,7 +2981,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) /* Initialize quotas before so that eventual writes go * in separate transaction */ dquot_initialize(dir); - dquot_initialize(dentry->d_inode); + dquot_initialize(d_inode(dentry)); retval = -ENOENT; bh = ext4_find_entry(dir, &dentry->d_name, &de, NULL); @@ -2786,7 +2990,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) if (!bh) goto end_unlink; - inode = dentry->d_inode; + inode = d_inode(dentry); retval = -EIO; if (le32_to_cpu(de->inode) != inode->i_ino) @@ -2803,10 +3007,9 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); - if (!inode->i_nlink) { - ext4_warning(inode->i_sb, - "Deleting nonexistent file (%lu), %d", - inode->i_ino, inode->i_nlink); + if (inode->i_nlink == 0) { + ext4_warning_inode(inode, "Deleting file '%.*s' with no links", + dentry->d_name.len, dentry->d_name.name); set_nlink(inode, 1); } retval = ext4_delete_entry(handle, dir, de, bh); @@ -2834,16 +3037,38 @@ static int ext4_symlink(struct inode *dir, { handle_t *handle; struct inode *inode; - int l, err, retries = 0; + int err, len = 
strlen(symname); int credits; + bool encryption_required; + struct ext4_str disk_link; + struct ext4_encrypted_symlink_data *sd = NULL; - l = strlen(symname)+1; - if (l > dir->i_sb->s_blocksize) - return -ENAMETOOLONG; + disk_link.len = len + 1; + disk_link.name = (char *) symname; + + encryption_required = (ext4_encrypted_inode(dir) || + DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))); + if (encryption_required) { + err = ext4_get_encryption_info(dir); + if (err) + return err; + if (ext4_encryption_info(dir) == NULL) + return -EPERM; + disk_link.len = (ext4_fname_encrypted_size(dir, len) + + sizeof(struct ext4_encrypted_symlink_data)); + sd = kzalloc(disk_link.len, GFP_KERNEL); + if (!sd) + return -ENOMEM; + } + + if (disk_link.len > dir->i_sb->s_blocksize) { + err = -ENAMETOOLONG; + goto err_free_sd; + } dquot_initialize(dir); - if (l > EXT4_N_BLOCKS * 4) { + if ((disk_link.len > EXT4_N_BLOCKS * 4)) { /* * For non-fast symlinks, we just allocate inode and put it on * orphan list in the first transaction => we need bitmap, @@ -2862,17 +3087,37 @@ static int ext4_symlink(struct inode *dir, credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3; } -retry: + inode = ext4_new_inode_start_handle(dir, S_IFLNK|S_IRWXUGO, &dentry->d_name, 0, NULL, EXT4_HT_DIR, credits); handle = ext4_journal_current_handle(); - err = PTR_ERR(inode); - if (IS_ERR(inode)) - goto out_stop; + if (IS_ERR(inode)) { + if (handle) + ext4_journal_stop(handle); + err = PTR_ERR(inode); + goto err_free_sd; + } + + if (encryption_required) { + struct qstr istr; + struct ext4_str ostr; + + istr.name = (const unsigned char *) symname; + istr.len = len; + ostr.name = sd->encrypted_path; + ostr.len = disk_link.len; + err = ext4_fname_usr_to_disk(inode, &istr, &ostr); + if (err < 0) + goto err_drop_inode; + sd->len = cpu_to_le16(ostr.len); + disk_link.name = (char *) sd; + inode->i_op = &ext4_encrypted_symlink_inode_operations; + } - if (l > EXT4_N_BLOCKS * 4) { - inode->i_op = &ext4_symlink_inode_operations; + if ((disk_link.len > EXT4_N_BLOCKS * 4)) { + if (!encryption_required) + inode->i_op = &ext4_symlink_inode_operations; ext4_set_aops(inode); /* * We cannot call page_symlink() with transaction started @@ -2887,9 +3132,10 @@ retry: drop_nlink(inode); err = ext4_orphan_add(handle, inode); ext4_journal_stop(handle); + handle = NULL; if (err) goto err_drop_inode; - err = __page_symlink(inode, symname, l, 1); + err = __page_symlink(inode, disk_link.name, disk_link.len, 1); if (err) goto err_drop_inode; /* @@ -2901,36 +3147,41 @@ retry: EXT4_INDEX_EXTRA_TRANS_BLOCKS + 1); if (IS_ERR(handle)) { err = PTR_ERR(handle); + handle = NULL; goto err_drop_inode; } set_nlink(inode, 1); err = ext4_orphan_del(handle, inode); - if (err) { - ext4_journal_stop(handle); - clear_nlink(inode); + if (err) goto err_drop_inode; - } } else { /* clear the extent format for fast symlink */ ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); - inode->i_op = &ext4_fast_symlink_inode_operations; - memcpy((char *)&EXT4_I(inode)->i_data, symname, l); - inode->i_size = l-1; + if (!encryption_required) { + inode->i_op = &ext4_fast_symlink_inode_operations; + inode->i_link = (char *)&EXT4_I(inode)->i_data; + } + memcpy((char *)&EXT4_I(inode)->i_data, disk_link.name, + disk_link.len); + inode->i_size = disk_link.len - 1; } EXT4_I(inode)->i_disksize = inode->i_size; err = ext4_add_nondir(handle, dentry, inode); if (!err && IS_DIRSYNC(dir)) ext4_handle_sync(handle); -out_stop: if (handle) ext4_journal_stop(handle); - if (err == -ENOSPC && 
ext4_should_retry_alloc(dir->i_sb, &retries)) - goto retry; + kfree(sd); return err; err_drop_inode: + if (handle) + ext4_journal_stop(handle); + clear_nlink(inode); unlock_new_inode(inode); iput(inode); +err_free_sd: + kfree(sd); return err; } @@ -2938,12 +3189,14 @@ static int ext4_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { handle_t *handle; - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); int err, retries = 0; if (inode->i_nlink >= EXT4_LINK_MAX) return -EMLINK; - + if (ext4_encrypted_inode(dir) && + !ext4_is_child_context_consistent_with_parent(dir, inode)) + return -EPERM; dquot_initialize(dir); retry: @@ -3144,9 +3397,9 @@ static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent, } if (retval) { - ext4_warning(ent->dir->i_sb, - "Deleting old file (%lu), %d, error=%d", - ent->dir->i_ino, ent->dir->i_nlink, retval); + ext4_warning_inode(ent->dir, + "Deleting old file: nlink %d, error=%d", + ent->dir->i_nlink, retval); } } @@ -3210,12 +3463,12 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, struct ext4_renament old = { .dir = old_dir, .dentry = old_dentry, - .inode = old_dentry->d_inode, + .inode = d_inode(old_dentry), }; struct ext4_renament new = { .dir = new_dir, .dentry = new_dentry, - .inode = new_dentry->d_inode, + .inode = d_inode(new_dentry), }; int force_reread; int retval; @@ -3244,6 +3497,14 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino) goto end_rename; + if ((old.dir != new.dir) && + ext4_encrypted_inode(new.dir) && + !ext4_is_child_context_consistent_with_parent(new.dir, + old.inode)) { + retval = -EPERM; + goto end_rename; + } + new.bh = ext4_find_entry(new.dir, &new.dentry->d_name, &new.de, &new.inlined); if (IS_ERR(new.bh)) { @@ -3264,12 +3525,18 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2); if (!(flags & RENAME_WHITEOUT)) { handle = ext4_journal_start(old.dir, EXT4_HT_DIR, credits); - if (IS_ERR(handle)) - return PTR_ERR(handle); + if (IS_ERR(handle)) { + retval = PTR_ERR(handle); + handle = NULL; + goto end_rename; + } } else { whiteout = ext4_whiteout_for_rename(&old, credits, &handle); - if (IS_ERR(whiteout)) - return PTR_ERR(whiteout); + if (IS_ERR(whiteout)) { + retval = PTR_ERR(whiteout); + whiteout = NULL; + goto end_rename; + } } if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir)) @@ -3278,7 +3545,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, if (S_ISDIR(old.inode->i_mode)) { if (new.inode) { retval = -ENOTEMPTY; - if (!empty_dir(new.inode)) + if (!ext4_empty_dir(new.inode)) goto end_rename; } else { retval = -EMLINK; @@ -3352,8 +3619,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, ext4_dec_count(handle, old.dir); if (new.inode) { - /* checked empty_dir above, can't have another parent, - * ext4_dec_count() won't work for many-linked dirs */ + /* checked ext4_empty_dir above, can't have another + * parent, ext4_dec_count() won't work for many-linked + * dirs */ clear_nlink(new.inode); } else { ext4_inc_count(handle, new.dir); @@ -3391,16 +3659,25 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, struct ext4_renament old = { .dir = old_dir, .dentry = old_dentry, - .inode = old_dentry->d_inode, + .inode = d_inode(old_dentry), }; struct ext4_renament new = { .dir = new_dir, .dentry = new_dentry, - 
.inode = new_dentry->d_inode, + .inode = d_inode(new_dentry), }; u8 new_file_type; int retval; + if ((ext4_encrypted_inode(old_dir) || + ext4_encrypted_inode(new_dir)) && + (old_dir != new_dir) && + (!ext4_is_child_context_consistent_with_parent(new_dir, + old.inode) || + !ext4_is_child_context_consistent_with_parent(old_dir, + new.inode))) + return -EPERM; + dquot_initialize(old.dir); dquot_initialize(new.dir); @@ -3433,8 +3710,11 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, handle = ext4_journal_start(old.dir, EXT4_HT_DIR, (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) + 2 * EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2)); - if (IS_ERR(handle)) - return PTR_ERR(handle); + if (IS_ERR(handle)) { + retval = PTR_ERR(handle); + handle = NULL; + goto end_rename; + } if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir)) ext4_handle_sync(handle); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index b24a2541a9ba..5602450f03f6 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -8,7 +8,6 @@ #include <linux/fs.h> #include <linux/time.h> -#include <linux/jbd2.h> #include <linux/highuid.h> #include <linux/pagemap.h> #include <linux/quotaops.h> @@ -18,14 +17,12 @@ #include <linux/pagevec.h> #include <linux/mpage.h> #include <linux/namei.h> -#include <linux/aio.h> #include <linux/uio.h> #include <linux/bio.h> #include <linux/workqueue.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/mm.h> -#include <linux/ratelimit.h> #include "ext4_jbd2.h" #include "xattr.h" @@ -69,6 +66,10 @@ static void ext4_finish_bio(struct bio *bio) bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; +#ifdef CONFIG_EXT4_FS_ENCRYPTION + struct page *data_page = NULL; + struct ext4_crypto_ctx *ctx = NULL; +#endif struct buffer_head *bh, *head; unsigned bio_start = bvec->bv_offset; unsigned bio_end = bio_start + bvec->bv_len; @@ -78,6 +79,15 @@ static void ext4_finish_bio(struct bio *bio) if (!page) continue; +#ifdef CONFIG_EXT4_FS_ENCRYPTION + if (!page->mapping) { + /* The bounce data pages are unmapped. 
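[Editor's note] In the ext4_finish_bio() hunk here, a page with no mapping is recognized as an encryption bounce page; its page_private holds the crypto context, whose control_page points back at the real pagecache page that writeback accounting must be completed on. A userspace-style sketch of that back-pointer arrangement (all types illustrative):

#include <stdio.h>

struct page_like {
    void *mapping;      /* NULL for a bounce page */
    void *private;      /* crypto ctx for a bounce page */
};

struct crypto_ctx_like {
    struct page_like *control_page; /* the real pagecache page */
};

/* Resolve the page whose writeback state must be completed. */
static struct page_like *writeback_page(struct page_like *p)
{
    if (p->mapping == NULL) {
        struct crypto_ctx_like *ctx = p->private;
        return ctx->control_page;
    }
    return p;
}

int main(void)
{
    struct page_like real = { .mapping = (void *)1, .private = NULL };
    struct crypto_ctx_like ctx = { .control_page = &real };
    struct page_like bounce = { .mapping = NULL, .private = &ctx };

    printf("%s\n", writeback_page(&bounce) == &real ? "resolved" : "bug");
    return 0;
}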
*/ + data_page = page; + ctx = (struct ext4_crypto_ctx *)page_private(data_page); + page = ctx->w.control_page; + } +#endif + if (error) { SetPageError(page); set_bit(AS_EIO, &page->mapping->flags); @@ -102,8 +112,13 @@ static void ext4_finish_bio(struct bio *bio) } while ((bh = bh->b_this_page) != head); bit_spin_unlock(BH_Uptodate_Lock, &head->b_state); local_irq_restore(flags); - if (!under_io) + if (!under_io) { +#ifdef CONFIG_EXT4_FS_ENCRYPTION + if (ctx) + ext4_restore_control_page(data_page); +#endif end_page_writeback(page); + } } } @@ -344,7 +359,6 @@ void ext4_io_submit(struct ext4_io_submit *io) if (bio) { bio_get(io->io_bio); submit_bio(io->io_op, io->io_bio); - BUG_ON(bio_flagged(io->io_bio, BIO_EOPNOTSUPP)); bio_put(io->io_bio); } io->io_bio = NULL; @@ -378,6 +392,7 @@ static int io_submit_init_bio(struct ext4_io_submit *io, static int io_submit_add_bh(struct ext4_io_submit *io, struct inode *inode, + struct page *page, struct buffer_head *bh) { int ret; @@ -391,7 +406,7 @@ submit_and_retry: if (ret) return ret; } - ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); + ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh)); if (ret != bh->b_size) goto submit_and_retry; io->io_next_block++; @@ -404,6 +419,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, struct writeback_control *wbc, bool keep_towrite) { + struct page *data_page = NULL; struct inode *inode = page->mapping->host; unsigned block_start, blocksize; struct buffer_head *bh, *head; @@ -463,19 +479,29 @@ int ext4_bio_write_page(struct ext4_io_submit *io, set_buffer_async_write(bh); } while ((bh = bh->b_this_page) != head); - /* Now submit buffers to write */ bh = head = page_buffers(page); + + if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { + data_page = ext4_encrypt(inode, page); + if (IS_ERR(data_page)) { + ret = PTR_ERR(data_page); + data_page = NULL; + goto out; + } + } + + /* Now submit buffers to write */ do { if (!buffer_async_write(bh)) continue; - ret = io_submit_add_bh(io, inode, bh); + ret = io_submit_add_bh(io, inode, + data_page ? data_page : page, bh); if (ret) { /* * We only get here on ENOMEM. Not much else * we can do but mark the page as dirty, and * better luck next time. */ - redirty_page_for_writepage(wbc, page); break; } nr_submitted++; @@ -484,6 +510,11 @@ int ext4_bio_write_page(struct ext4_io_submit *io, /* Error stopped previous loop? Clean up buffers... */ if (ret) { + out: + if (data_page) + ext4_restore_control_page(data_page); + printk_ratelimited(KERN_ERR "%s: ret = %d\n", __func__, ret); + redirty_page_for_writepage(wbc, page); do { clear_buffer_async_write(bh); bh = bh->b_this_page; diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c new file mode 100644 index 000000000000..ec3ef93a52db --- /dev/null +++ b/fs/ext4/readpage.c @@ -0,0 +1,328 @@ +/* + * linux/fs/ext4/readpage.c + * + * Copyright (C) 2002, Linus Torvalds. + * Copyright (C) 2015, Google, Inc. + * + * This was originally taken from fs/mpage.c + * + * The ext4_mpage_readpages() function here is intended + * to replace mpage_readpages() in the general case, not just for + * encrypted files. It has some limitations (see below), where it + * will fall back to block_read_full_page(), but these limitations + * should only be hit when page_size != block_size. + * + * This will allow us to attach a callback function to support ext4 + * encryption. 
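[Editor's note] The ext4_bio_write_page() hunk above encrypts into a bounce page before any buffer is submitted, then hands io_submit_add_bh() the bounce page while the buffer_head bookkeeping stays on the original pagecache page. A condensed sketch of that ordering; encrypt() and submit() are hypothetical stand-ins (the toy XOR is obviously not the real cipher):

#include <stdio.h>

struct page_like { char data[16]; };

/* Stand-in for ext4_encrypt(): fill a bounce page with ciphertext. */
static struct page_like *encrypt(const struct page_like *in,
                                 struct page_like *out)
{
    for (int i = 0; i < 16; i++)
        out->data[i] = in->data[i] ^ 0x5a;
    return out;
}

static void submit(const struct page_like *src)
{
    printf("submitting %p\n", (const void *)src);
}

int main(void)
{
    struct page_like page = { "hello" }, bounce;
    struct page_like *data_page = NULL;
    int encrypted = 1;  /* ext4_encrypted_inode() && S_ISREG() */

    if (encrypted)
        data_page = encrypt(&page, &bounce);

    /* the bio gets the bounce page's bytes when one exists */
    submit(data_page ? data_page : &page);
    return 0;
}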
+ * + * If anything unusual happens, such as: + * + * - encountering a page which has buffers + * - encountering a page which has a non-hole after a hole + * - encountering a page with non-contiguous blocks + * + * then this code just gives up and calls the buffer_head-based read function. + * It does handle a page which has holes at the end - that is a common case: + * the end-of-file on blocksize < PAGE_CACHE_SIZE setups. + * + */ + +#include <linux/kernel.h> +#include <linux/export.h> +#include <linux/mm.h> +#include <linux/kdev_t.h> +#include <linux/gfp.h> +#include <linux/bio.h> +#include <linux/fs.h> +#include <linux/buffer_head.h> +#include <linux/blkdev.h> +#include <linux/highmem.h> +#include <linux/prefetch.h> +#include <linux/mpage.h> +#include <linux/writeback.h> +#include <linux/backing-dev.h> +#include <linux/pagevec.h> +#include <linux/cleancache.h> + +#include "ext4.h" + +/* + * Call ext4_decrypt on every single page, reusing the encryption + * context. + */ +static void completion_pages(struct work_struct *work) +{ +#ifdef CONFIG_EXT4_FS_ENCRYPTION + struct ext4_crypto_ctx *ctx = + container_of(work, struct ext4_crypto_ctx, r.work); + struct bio *bio = ctx->r.bio; + struct bio_vec *bv; + int i; + + bio_for_each_segment_all(bv, bio, i) { + struct page *page = bv->bv_page; + + int ret = ext4_decrypt(ctx, page); + if (ret) { + WARN_ON_ONCE(1); + SetPageError(page); + } else + SetPageUptodate(page); + unlock_page(page); + } + ext4_release_crypto_ctx(ctx); + bio_put(bio); +#else + BUG(); +#endif +} + +static inline bool ext4_bio_encrypted(struct bio *bio) +{ +#ifdef CONFIG_EXT4_FS_ENCRYPTION + return unlikely(bio->bi_private != NULL); +#else + return false; +#endif +} + +/* + * I/O completion handler for multipage BIOs. + * + * The mpage code never puts partial pages into a BIO (except for end-of-file). + * If a page does not map to a contiguous run of blocks then it simply falls + * back to block_read_full_page(). + * + * Why is this? If a page's completion depends on a number of different BIOs + * which can complete in any order (or at the same time) then determining the + * status of that page is hard. See end_buffer_async_read() for the details. + * There is no point in duplicating all that complexity. 
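[Editor's note] ext4_bio_encrypted() above keys off bio->bi_private: the submit path stores the crypto context there, so a non-NULL private pointer marks a bio whose pages must be decrypted before being marked up to date, and that happens in a workqueue, since decryption may sleep while a bio completion handler must not. A compact sketch of the dispatch, with illustrative types:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct bio_like { void *bi_private; };  /* crypto ctx or NULL */

static bool bio_encrypted(const struct bio_like *b)
{
    return b->bi_private != NULL;
}

static void end_io(struct bio_like *b, int err)
{
    if (!err && bio_encrypted(b)) {
        /* defer: queue decryption work; pages finish there */
        printf("queued for decryption\n");
        return;
    }
    /* plaintext (or failed) bio: finish pages inline */
    printf("finished inline, err=%d\n", err);
}

int main(void)
{
    struct bio_like plain = { NULL }, enc = { (void *)1 };

    end_io(&plain, 0);
    end_io(&enc, 0);
    return 0;
}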
+ */ +static void mpage_end_io(struct bio *bio, int err) +{ + struct bio_vec *bv; + int i; + + if (ext4_bio_encrypted(bio)) { + struct ext4_crypto_ctx *ctx = bio->bi_private; + + if (err) { + ext4_release_crypto_ctx(ctx); + } else { + INIT_WORK(&ctx->r.work, completion_pages); + ctx->r.bio = bio; + queue_work(ext4_read_workqueue, &ctx->r.work); + return; + } + } + bio_for_each_segment_all(bv, bio, i) { + struct page *page = bv->bv_page; + + if (!err) { + SetPageUptodate(page); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + unlock_page(page); + } + + bio_put(bio); +} + +int ext4_mpage_readpages(struct address_space *mapping, + struct list_head *pages, struct page *page, + unsigned nr_pages) +{ + struct bio *bio = NULL; + unsigned page_idx; + sector_t last_block_in_bio = 0; + + struct inode *inode = mapping->host; + const unsigned blkbits = inode->i_blkbits; + const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits; + const unsigned blocksize = 1 << blkbits; + sector_t block_in_file; + sector_t last_block; + sector_t last_block_in_file; + sector_t blocks[MAX_BUF_PER_PAGE]; + unsigned page_block; + struct block_device *bdev = inode->i_sb->s_bdev; + int length; + unsigned relative_block = 0; + struct ext4_map_blocks map; + + map.m_pblk = 0; + map.m_lblk = 0; + map.m_len = 0; + map.m_flags = 0; + + for (page_idx = 0; nr_pages; page_idx++, nr_pages--) { + int fully_mapped = 1; + unsigned first_hole = blocks_per_page; + + prefetchw(&page->flags); + if (pages) { + page = list_entry(pages->prev, struct page, lru); + list_del(&page->lru); + if (add_to_page_cache_lru(page, mapping, + page->index, GFP_KERNEL)) + goto next_page; + } + + if (page_has_buffers(page)) + goto confused; + + block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits); + last_block = block_in_file + nr_pages * blocks_per_page; + last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits; + if (last_block > last_block_in_file) + last_block = last_block_in_file; + page_block = 0; + + /* + * Map blocks using the previous result first. + */ + if ((map.m_flags & EXT4_MAP_MAPPED) && + block_in_file > map.m_lblk && + block_in_file < (map.m_lblk + map.m_len)) { + unsigned map_offset = block_in_file - map.m_lblk; + unsigned last = map.m_len - map_offset; + + for (relative_block = 0; ; relative_block++) { + if (relative_block == last) { + /* needed? */ + map.m_flags &= ~EXT4_MAP_MAPPED; + break; + } + if (page_block == blocks_per_page) + break; + blocks[page_block] = map.m_pblk + map_offset + + relative_block; + page_block++; + block_in_file++; + } + } + + /* + * Then do more ext4_map_blocks() calls until we are + * done with this page. + */ + while (page_block < blocks_per_page) { + if (block_in_file < last_block) { + map.m_lblk = block_in_file; + map.m_len = last_block - block_in_file; + + if (ext4_map_blocks(NULL, inode, &map, 0) < 0) { + set_error_page: + SetPageError(page); + zero_user_segment(page, 0, + PAGE_CACHE_SIZE); + unlock_page(page); + goto next_page; + } + } + if ((map.m_flags & EXT4_MAP_MAPPED) == 0) { + fully_mapped = 0; + if (first_hole == blocks_per_page) + first_hole = page_block; + page_block++; + block_in_file++; + continue; + } + if (first_hole != blocks_per_page) + goto confused; /* hole -> non-hole */ + + /* Contiguous blocks? */ + if (page_block && blocks[page_block-1] != map.m_pblk-1) + goto confused; + for (relative_block = 0; ; relative_block++) { + if (relative_block == map.m_len) { + /* needed? 
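[Editor's note] The first loop of ext4_mpage_readpages() above reuses the previous ext4_map_blocks() result: when the cached extent (m_lblk, m_len, m_pblk) still covers the current file block, the physical block is computed by offset arithmetic instead of another mapping call. A worked example with made-up numbers:

#include <stdio.h>

int main(void)
{
    unsigned long m_lblk = 100, m_len = 8, m_pblk = 5000; /* cached map */
    unsigned long block_in_file = 103;

    if (block_in_file > m_lblk && block_in_file < m_lblk + m_len) {
        unsigned long map_offset = block_in_file - m_lblk;
        printf("phys = %lu\n", m_pblk + map_offset);  /* 5003 */
    }
    return 0;
}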
*/ + map.m_flags &= ~EXT4_MAP_MAPPED; + break; + } else if (page_block == blocks_per_page) + break; + blocks[page_block] = map.m_pblk+relative_block; + page_block++; + block_in_file++; + } + } + if (first_hole != blocks_per_page) { + zero_user_segment(page, first_hole << blkbits, + PAGE_CACHE_SIZE); + if (first_hole == 0) { + SetPageUptodate(page); + unlock_page(page); + goto next_page; + } + } else if (fully_mapped) { + SetPageMappedToDisk(page); + } + if (fully_mapped && blocks_per_page == 1 && + !PageUptodate(page) && cleancache_get_page(page) == 0) { + SetPageUptodate(page); + goto confused; + } + + /* + * This page will go to BIO. Do we need to send this + * BIO off first? + */ + if (bio && (last_block_in_bio != blocks[0] - 1)) { + submit_and_realloc: + submit_bio(READ, bio); + bio = NULL; + } + if (bio == NULL) { + struct ext4_crypto_ctx *ctx = NULL; + + if (ext4_encrypted_inode(inode) && + S_ISREG(inode->i_mode)) { + ctx = ext4_get_crypto_ctx(inode); + if (IS_ERR(ctx)) + goto set_error_page; + } + bio = bio_alloc(GFP_KERNEL, + min_t(int, nr_pages, bio_get_nr_vecs(bdev))); + if (!bio) { + if (ctx) + ext4_release_crypto_ctx(ctx); + goto set_error_page; + } + bio->bi_bdev = bdev; + bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9); + bio->bi_end_io = mpage_end_io; + bio->bi_private = ctx; + } + + length = first_hole << blkbits; + if (bio_add_page(bio, page, length, 0) < length) + goto submit_and_realloc; + + if (((map.m_flags & EXT4_MAP_BOUNDARY) && + (relative_block == map.m_len)) || + (first_hole != blocks_per_page)) { + submit_bio(READ, bio); + bio = NULL; + } else + last_block_in_bio = blocks[blocks_per_page - 1]; + goto next_page; + confused: + if (bio) { + submit_bio(READ, bio); + bio = NULL; + } + if (!PageUptodate(page)) + block_read_full_page(page, ext4_get_block); + else + unlock_page(page); + next_page: + if (pages) + page_cache_release(page); + } + BUG_ON(pages && !list_empty(pages)); + if (bio) + submit_bio(READ, bio); + return 0; +} diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 8a8ec6293b19..cf0c472047e3 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1432,12 +1432,15 @@ static int ext4_flex_group_add(struct super_block *sb, goto exit; /* * We will always be modifying at least the superblock and GDT - * block. If we are adding a group past the last current GDT block, + * blocks. If we are adding a group past the last current GDT block, * we will also modify the inode and the dindirect block. If we * are adding a group with superblock/GDT backups we will also * modify each of the reserved GDT dindirect blocks. 
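[Editor's note] When the mapping loop above stops at first_hole, only the mapped prefix of the page is read; the tail is zeroed in memory via zero_user_segment() rather than read from disk. The byte range for 4K pages over 1K blocks, as a quick check:

#include <stdio.h>

int main(void)
{
    unsigned blkbits = 10;      /* 1K blocks */
    unsigned page_size = 4096;  /* PAGE_CACHE_SIZE here */
    unsigned first_hole = 3;    /* first unmapped block in the page */

    printf("zero bytes [%u, %u)\n", first_hole << blkbits, page_size);
    /* -> zero bytes [3072, 4096) */
    return 0;
}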
*/ - credit = flex_gd->count * 4 + reserved_gdb; + credit = 3; /* sb, resize inode, resize inode dindirect */ + /* GDT blocks */ + credit += 1 + DIV_ROUND_UP(flex_gd->count, EXT4_DESC_PER_BLOCK(sb)); + credit += reserved_gdb; /* Reserved GDT dindirect blocks */ handle = ext4_journal_start_sb(sb, EXT4_HT_RESIZE, credit); if (IS_ERR(handle)) { err = PTR_ERR(handle); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index e061e66c8280..58987b5c514b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -21,10 +21,10 @@ #include <linux/fs.h> #include <linux/time.h> #include <linux/vmalloc.h> -#include <linux/jbd2.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/parser.h> #include <linux/buffer_head.h> #include <linux/exportfs.h> @@ -295,6 +295,8 @@ static void __save_error_info(struct super_block *sb, const char *func, struct ext4_super_block *es = EXT4_SB(sb)->s_es; EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; + if (bdev_read_only(sb->s_bdev)) + return; es->s_state |= cpu_to_le16(EXT4_ERROR_FS); es->s_last_error_time = cpu_to_le32(get_seconds()); strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func)); @@ -323,22 +325,6 @@ static void save_error_info(struct super_block *sb, const char *func, ext4_commit_super(sb, 1); } -/* - * The del_gendisk() function uninitializes the disk-specific data - * structures, including the bdi structure, without telling anyone - * else. Once this happens, any attempt to call mark_buffer_dirty() - * (for example, by ext4_commit_super), will cause a kernel OOPS. - * This is a kludge to prevent these oops until we can put in a proper - * hook in del_gendisk() to inform the VFS and file system layers. - */ -static int block_device_ejected(struct super_block *sb) -{ - struct inode *bd_inode = sb->s_bdev->bd_inode; - struct backing_dev_info *bdi = inode_to_bdi(bd_inode); - - return bdi->dev == NULL; -} - static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) { struct super_block *sb = journal->j_private; @@ -466,7 +452,7 @@ void __ext4_error_file(struct file *file, const char *function, es = EXT4_SB(inode->i_sb)->s_es; es->s_last_error_ino = cpu_to_le32(inode->i_ino); if (ext4_error_ratelimit(inode->i_sb)) { - path = d_path(&(file->f_path), pathname, sizeof(pathname)); + path = file_path(file, pathname, sizeof(pathname)); if (IS_ERR(path)) path = "(unknown)"; va_start(args, fmt); @@ -606,14 +592,17 @@ void __ext4_msg(struct super_block *sb, va_end(args); } +#define ext4_warning_ratelimit(sb) \ + ___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state), \ + "EXT4-fs warning") + void __ext4_warning(struct super_block *sb, const char *function, unsigned int line, const char *fmt, ...) { struct va_format vaf; va_list args; - if (!___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state), - "EXT4-fs warning")) + if (!ext4_warning_ratelimit(sb)) return; va_start(args, fmt); @@ -624,6 +613,24 @@ void __ext4_warning(struct super_block *sb, const char *function, va_end(args); } +void __ext4_warning_inode(const struct inode *inode, const char *function, + unsigned int line, const char *fmt, ...) 
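[Editor's note] The new credit computation in ext4_flex_group_add() above charges 3 blocks (superblock, resize inode, its dindirect), 1 + ceil(count / EXT4_DESC_PER_BLOCK) GDT blocks, and one per reserved GDT block, instead of a flat 4 per group. A worked comparison, assuming 64 descriptors per block (4K blocks with 64-byte descriptors; the real value depends on the filesystem):

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
    unsigned count = 16;          /* groups being added */
    unsigned reserved_gdb = 5;    /* reserved GDT blocks */
    unsigned desc_per_block = 64; /* assumed EXT4_DESC_PER_BLOCK(sb) */

    unsigned credit = 3 + 1 + DIV_ROUND_UP(count, desc_per_block)
                        + reserved_gdb;
    printf("new: %u credits, old: %u credits\n",
           credit, count * 4 + reserved_gdb);  /* new: 10, old: 69 */
    return 0;
}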
+{ + struct va_format vaf; + va_list args; + + if (!ext4_warning_ratelimit(inode->i_sb)) + return; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: " + "inode #%lu: comm %s: %pV\n", inode->i_sb->s_id, + function, line, inode->i_ino, current->comm, &vaf); + va_end(args); +} + void __ext4_grp_locked_error(const char *function, unsigned int line, struct super_block *sb, ext4_group_t grp, unsigned long ino, ext4_fsblk_t block, @@ -822,6 +829,7 @@ static void ext4_put_super(struct super_block *sb) dump_orphan_list(sb, sbi); J_ASSERT(list_empty(&sbi->s_orphan)); + sync_blockdev(sb->s_bdev); invalidate_bdev(sb->s_bdev); if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) { /* @@ -893,7 +901,9 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) atomic_set(&ei->i_ioend_count, 0); atomic_set(&ei->i_unwritten, 0); INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); - +#ifdef CONFIG_EXT4_FS_ENCRYPTION + ei->i_crypt_info = NULL; +#endif return &ei->vfs_inode; } @@ -970,6 +980,10 @@ void ext4_clear_inode(struct inode *inode) jbd2_free_inode(EXT4_I(inode)->jinode); EXT4_I(inode)->jinode = NULL; } +#ifdef CONFIG_EXT4_FS_ENCRYPTION + if (EXT4_I(inode)->i_crypt_info) + ext4_free_encryption_info(inode, EXT4_I(inode)->i_crypt_info); +#endif } static struct inode *ext4_nfs_get_inode(struct super_block *sb, @@ -1076,7 +1090,7 @@ static const struct quotactl_ops ext4_qctl_operations = { .quota_on = ext4_quota_on, .quota_off = ext4_quota_off, .quota_sync = dquot_quota_sync, - .get_info = dquot_get_dqinfo, + .get_state = dquot_get_state, .set_info = dquot_set_dqinfo, .get_dqblk = dquot_get_dqblk, .set_dqblk = dquot_set_dqblk @@ -1120,7 +1134,7 @@ enum { Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev, Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, - Opt_data_err_abort, Opt_data_err_ignore, + Opt_data_err_abort, Opt_data_err_ignore, Opt_test_dummy_encryption, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, @@ -1211,6 +1225,7 @@ static const match_table_t tokens = { {Opt_init_itable, "init_itable"}, {Opt_noinit_itable, "noinit_itable"}, {Opt_max_dir_size_kb, "max_dir_size_kb=%u"}, + {Opt_test_dummy_encryption, "test_dummy_encryption"}, {Opt_removed, "check=none"}, /* mount option from ext2/3 */ {Opt_removed, "nocheck"}, /* mount option from ext2/3 */ {Opt_removed, "reservation"}, /* mount option from ext2/3 */ @@ -1412,6 +1427,7 @@ static const struct mount_opts { {Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT}, {Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT}, {Opt_max_dir_size_kb, 0, MOPT_GTE0}, + {Opt_test_dummy_encryption, 0, MOPT_GTE0}, {Opt_err, 0, 0} }; @@ -1568,7 +1584,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, return -1; } - journal_inode = path.dentry->d_inode; + journal_inode = d_inode(path.dentry); if (!S_ISBLK(journal_inode->i_mode)) { ext4_msg(sb, KERN_ERR, "error: journal path %s " "is not a block device", journal_path); @@ -1588,6 +1604,15 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, } *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg); + } else if (token == Opt_test_dummy_encryption) { +#ifdef CONFIG_EXT4_FS_ENCRYPTION + sbi->s_mount_flags |= EXT4_MF_TEST_DUMMY_ENCRYPTION; + ext4_msg(sb, KERN_WARNING, + "Test 
dummy encryption mode enabled"); +#else + ext4_msg(sb, KERN_WARNING, + "Test dummy encryption mount option ignored"); +#endif } else if (m->flags & MOPT_DATAJ) { if (is_remount) { if (!sbi->s_journal) @@ -2685,11 +2710,13 @@ static struct attribute *ext4_attrs[] = { EXT4_INFO_ATTR(lazy_itable_init); EXT4_INFO_ATTR(batched_discard); EXT4_INFO_ATTR(meta_bg_resize); +EXT4_INFO_ATTR(encryption); static struct attribute *ext4_feat_attrs[] = { ATTR_LIST(lazy_itable_init), ATTR_LIST(batched_discard), ATTR_LIST(meta_bg_resize), + ATTR_LIST(encryption), NULL, }; @@ -3419,7 +3446,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) unsigned long journal_devnum = 0; unsigned long def_mount_opts; struct inode *root; - char *cp; const char *descr; int ret = -ENOMEM; int blocksize, clustersize; @@ -3450,8 +3476,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) part_stat_read(sb->s_bdev->bd_part, sectors[1]); /* Cleanup superblock name */ - for (cp = sb->s_id; (cp = strchr(cp, '/'));) - *cp = '!'; + strreplace(sb->s_id, '/', '!'); /* -EINVAL is default */ ret = -EINVAL; @@ -3692,6 +3717,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } } + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT) && + es->s_encryption_level) { + ext4_msg(sb, KERN_ERR, "Unsupported encryption level %d", + es->s_encryption_level); + goto failed_mount; + } + if (sb->s_blocksize != blocksize) { /* Validate the filesystem blocksize */ if (!sb_set_blocksize(sb, blocksize)) { @@ -4054,6 +4086,21 @@ no_journal: } } + if ((DUMMY_ENCRYPTION_ENABLED(sbi) || + EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT)) && + (blocksize != PAGE_CACHE_SIZE)) { + ext4_msg(sb, KERN_ERR, + "Unsupported blocksize for fs encryption"); + goto failed_mount_wq; + } + + if (DUMMY_ENCRYPTION_ENABLED(sbi) && + !(sb->s_flags & MS_RDONLY) && + !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT)) { + EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT); + ext4_commit_super(sb, 1); + } + /* * Get the # of file system overhead blocks from the * superblock if present. @@ -4570,7 +4617,7 @@ static int ext4_commit_super(struct super_block *sb, int sync) struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; int error = 0; - if (!sbh || block_device_ejected(sb)) + if (!sbh) return error; if (buffer_write_io_error(sbh)) { /* @@ -4923,6 +4970,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); } + if (*flags & MS_LAZYTIME) + sb->s_flags |= MS_LAZYTIME; + if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) { err = -EROFS; @@ -5199,7 +5249,7 @@ static int ext4_write_info(struct super_block *sb, int type) handle_t *handle; /* Data block + inode block */ - handle = ext4_journal_start(sb->s_root->d_inode, EXT4_HT_QUOTA, 2); + handle = ext4_journal_start(d_inode(sb->s_root), EXT4_HT_QUOTA, 2); if (IS_ERR(handle)) return PTR_ERR(handle); ret = dquot_commit_info(sb, type); @@ -5247,7 +5297,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, * all updates to the file when we bypass pagecache... */ if (EXT4_SB(sb)->s_journal && - ext4_should_journal_data(path->dentry->d_inode)) { + ext4_should_journal_data(d_inode(path->dentry))) { /* * We don't need to lock updates but journal_flush() could * otherwise be livelocked... 
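[Editor's note] The superblock-name cleanup in ext4_fill_super() above replaces a strchr() loop with strreplace(), the lib/string.c helper that rewrites every occurrence in place and (as I read it) returns a pointer to the terminating NUL. A userspace twin for reference:

#include <stdio.h>

static char *strreplace(char *s, char old, char new)
{
    for (; *s; ++s)
        if (*s == old)
            *s = new;
    return s;       /* points at the terminating NUL */
}

int main(void)
{
    char s_id[] = "cciss/c0d0p1";

    strreplace(s_id, '/', '!');
    printf("%s\n", s_id);   /* cciss!c0d0p1 */
    return 0;
}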
@@ -5390,6 +5440,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, struct inode *inode = sb_dqopt(sb)->files[type]; ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); int err, offset = off & (sb->s_blocksize - 1); + int retries = 0; struct buffer_head *bh; handle_t *handle = journal_current_handle(); @@ -5410,7 +5461,12 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, return -EIO; } - bh = ext4_bread(handle, inode, blk, 1); + do { + bh = ext4_bread(handle, inode, blk, + EXT4_GET_BLOCKS_CREATE | + EXT4_GET_BLOCKS_METADATA_NOFAIL); + } while (IS_ERR(bh) && (PTR_ERR(bh) == -ENOSPC) && + ext4_should_retry_alloc(inode->i_sb, &retries)); if (IS_ERR(bh)) return PTR_ERR(bh); if (!bh) @@ -5627,6 +5683,7 @@ out7: static void __exit ext4_exit_fs(void) { + ext4_exit_crypto(); ext4_destroy_lazyinit_thread(); unregister_as_ext2(); unregister_as_ext3(); diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index ff3711932018..c677f2c1044b 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -18,18 +18,89 @@ */ #include <linux/fs.h> -#include <linux/jbd2.h> #include <linux/namei.h> #include "ext4.h" #include "xattr.h" -static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd) +#ifdef CONFIG_EXT4_FS_ENCRYPTION +static const char *ext4_encrypted_follow_link(struct dentry *dentry, void **cookie) { - struct ext4_inode_info *ei = EXT4_I(dentry->d_inode); - nd_set_link(nd, (char *) ei->i_data); - return NULL; + struct page *cpage = NULL; + char *caddr, *paddr = NULL; + struct ext4_str cstr, pstr; + struct inode *inode = d_inode(dentry); + struct ext4_encrypted_symlink_data *sd; + loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1); + int res; + u32 plen, max_size = inode->i_sb->s_blocksize; + + res = ext4_get_encryption_info(inode); + if (res) + return ERR_PTR(res); + + if (ext4_inode_is_fast_symlink(inode)) { + caddr = (char *) EXT4_I(inode)->i_data; + max_size = sizeof(EXT4_I(inode)->i_data); + } else { + cpage = read_mapping_page(inode->i_mapping, 0, NULL); + if (IS_ERR(cpage)) + return ERR_CAST(cpage); + caddr = kmap(cpage); + caddr[size] = 0; + } + + /* Symlink is encrypted */ + sd = (struct ext4_encrypted_symlink_data *)caddr; + cstr.name = sd->encrypted_path; + cstr.len = le32_to_cpu(sd->len); + if ((cstr.len + + sizeof(struct ext4_encrypted_symlink_data) - 1) > + max_size) { + /* Symlink data on the disk is corrupted */ + res = -EIO; + goto errout; + } + plen = (cstr.len < EXT4_FNAME_CRYPTO_DIGEST_SIZE*2) ? 
+ EXT4_FNAME_CRYPTO_DIGEST_SIZE*2 : cstr.len; + paddr = kmalloc(plen + 1, GFP_NOFS); + if (!paddr) { + res = -ENOMEM; + goto errout; + } + pstr.name = paddr; + pstr.len = plen; + res = _ext4_fname_disk_to_usr(inode, NULL, &cstr, &pstr); + if (res < 0) + goto errout; + /* Null-terminate the name */ + if (res <= plen) + paddr[res] = '\0'; + if (cpage) { + kunmap(cpage); + page_cache_release(cpage); + } + return *cookie = paddr; +errout: + if (cpage) { + kunmap(cpage); + page_cache_release(cpage); + } + kfree(paddr); + return ERR_PTR(res); } +const struct inode_operations ext4_encrypted_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = ext4_encrypted_follow_link, + .put_link = kfree_put_link, + .setattr = ext4_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext4_listxattr, + .removexattr = generic_removexattr, +}; +#endif + const struct inode_operations ext4_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = page_follow_link_light, @@ -43,7 +114,7 @@ const struct inode_operations ext4_symlink_inode_operations = { const struct inode_operations ext4_fast_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = ext4_follow_link, + .follow_link = simple_follow_link, .setattr = ext4_setattr, .setxattr = generic_setxattr, .getxattr = generic_getxattr, diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 1e09fc77395c..16e28c08d1e8 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -55,7 +55,6 @@ #include <linux/slab.h> #include <linux/mbcache.h> #include <linux/quotaops.h> -#include <linux/rwsem.h> #include "ext4_jbd2.h" #include "ext4.h" #include "xattr.h" @@ -179,7 +178,7 @@ ext4_xattr_handler(int name_index) /* * Inode operation listxattr() * - * dentry->d_inode->i_mutex: don't care + * d_inode(dentry)->i_mutex: don't care */ ssize_t ext4_listxattr(struct dentry *dentry, char *buffer, size_t size) @@ -424,7 +423,7 @@ ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry, static int ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct buffer_head *bh = NULL; int error; struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode); @@ -461,7 +460,7 @@ cleanup: static int ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ext4_xattr_ibody_header *header; struct ext4_inode *raw_inode; struct ext4_iloc iloc; @@ -502,7 +501,7 @@ ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) { int ret, ret2; - down_read(&EXT4_I(dentry->d_inode)->xattr_sem); + down_read(&EXT4_I(d_inode(dentry))->xattr_sem); ret = ret2 = ext4_xattr_ibody_list(dentry, buffer, buffer_size); if (ret < 0) goto errout; @@ -515,7 +514,7 @@ ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) goto errout; ret += ret2; errout: - up_read(&EXT4_I(dentry->d_inode)->xattr_sem); + up_read(&EXT4_I(d_inode(dentry))->xattr_sem); return ret; } @@ -639,8 +638,7 @@ ext4_xattr_set_entry(struct ext4_xattr_info *i, struct ext4_xattr_search *s) free += EXT4_XATTR_LEN(name_len); } if (i->value) { - if (free < EXT4_XATTR_SIZE(i->value_len) || - free < EXT4_XATTR_LEN(name_len) + + if (free < EXT4_XATTR_LEN(name_len) + EXT4_XATTR_SIZE(i->value_len)) return -ENOSPC; } diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 29bedf5589f6..ddc0957760ba 100644 --- 
a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -23,6 +23,7 @@ #define EXT4_XATTR_INDEX_SECURITY 6 #define EXT4_XATTR_INDEX_SYSTEM 7 #define EXT4_XATTR_INDEX_RICHACL 8 +#define EXT4_XATTR_INDEX_ENCRYPTION 9 struct ext4_xattr_header { __le32 h_magic; /* magic number for identification */ @@ -98,6 +99,8 @@ extern const struct xattr_handler ext4_xattr_user_handler; extern const struct xattr_handler ext4_xattr_trusted_handler; extern const struct xattr_handler ext4_xattr_security_handler; +#define EXT4_XATTR_NAME_ENCRYPTION_CONTEXT "c" + extern ssize_t ext4_listxattr(struct dentry *, char *, size_t); extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t); diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c index d2a200624af5..95d90e0560f0 100644 --- a/fs/ext4/xattr_security.c +++ b/fs/ext4/xattr_security.c @@ -33,7 +33,7 @@ ext4_xattr_security_get(struct dentry *dentry, const char *name, { if (strcmp(name, "") == 0) return -EINVAL; - return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_SECURITY, + return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_SECURITY, name, buffer, size); } @@ -43,7 +43,7 @@ ext4_xattr_security_set(struct dentry *dentry, const char *name, { if (strcmp(name, "") == 0) return -EINVAL; - return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_SECURITY, + return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_SECURITY, name, value, size, flags); } diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c index 95f1f4ab59a4..891ee2ddfbd6 100644 --- a/fs/ext4/xattr_trusted.c +++ b/fs/ext4/xattr_trusted.c @@ -36,7 +36,7 @@ ext4_xattr_trusted_get(struct dentry *dentry, const char *name, void *buffer, { if (strcmp(name, "") == 0) return -EINVAL; - return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_TRUSTED, + return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_TRUSTED, name, buffer, size); } @@ -46,7 +46,7 @@ ext4_xattr_trusted_set(struct dentry *dentry, const char *name, { if (strcmp(name, "") == 0) return -EINVAL; - return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_TRUSTED, + return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_TRUSTED, name, value, size, flags); } diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c index 0edb7611ffbe..6ed932b3c043 100644 --- a/fs/ext4/xattr_user.c +++ b/fs/ext4/xattr_user.c @@ -37,7 +37,7 @@ ext4_xattr_user_get(struct dentry *dentry, const char *name, return -EINVAL; if (!test_opt(dentry->d_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext4_xattr_get(dentry->d_inode, EXT4_XATTR_INDEX_USER, + return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_USER, name, buffer, size); } @@ -49,7 +49,7 @@ ext4_xattr_user_set(struct dentry *dentry, const char *name, return -EINVAL; if (!test_opt(dentry->d_sb, XATTR_USER)) return -EOPNOTSUPP; - return ext4_xattr_set(dentry->d_inode, EXT4_XATTR_INDEX_USER, + return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_USER, name, value, size, flags); } diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index 94e2d2ffabe1..c629762005bc 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -1,5 +1,5 @@ config F2FS_FS - tristate "F2FS filesystem support (EXPERIMENTAL)" + tristate "F2FS filesystem support" depends on BLOCK help F2FS is based on Log-structured File System (LFS), which supports @@ -72,6 +72,25 @@ config F2FS_CHECK_FS If you want to improve the performance, say N. 
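[Editor's note] The ext4_xattr_set_entry() change a little above drops the redundant value-only comparison and keeps the combined test: one attribute needs room for its entry header plus the padded value in a single sum. Worked numbers with illustrative padding macros (the real EXT4_XATTR_LEN/EXT4_XATTR_SIZE live in fs/ext4/xattr.h):

#include <stdio.h>

#define PAD4(x)          (((x) + 3) & ~3U)
#define XATTR_LEN(nlen)  (PAD4(nlen) + 16)  /* ~16-byte entry header */
#define XATTR_SIZE(vlen) PAD4(vlen)

int main(void)
{
    unsigned free_bytes = 32, name_len = 4, value_len = 20;
    unsigned need = XATTR_LEN(name_len) + XATTR_SIZE(value_len);

    /* the padded value alone (20) would fit, the full entry (40)
     * does not -- which is why the combined check is the one kept */
    printf("need %u, free %u -> %s\n", need, free_bytes,
           free_bytes < need ? "-ENOSPC" : "ok");
    return 0;
}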
+config F2FS_FS_ENCRYPTION + bool "F2FS Encryption" + depends on F2FS_FS + depends on F2FS_FS_XATTR + select CRYPTO_AES + select CRYPTO_CBC + select CRYPTO_ECB + select CRYPTO_XTS + select CRYPTO_CTS + select CRYPTO_CTR + select CRYPTO_SHA256 + select KEYS + select ENCRYPTED_KEYS + help + Enable encryption of f2fs files and directories. This + feature is similar to ecryptfs, but it is more memory + efficient since it avoids caching the encrypted and + decrypted pages in the page cache. + config F2FS_IO_TRACE bool "F2FS IO tracer" depends on F2FS_FS diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile index d92397731db8..396be1a39e55 100644 --- a/fs/f2fs/Makefile +++ b/fs/f2fs/Makefile @@ -6,3 +6,5 @@ f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o f2fs-$(CONFIG_F2FS_IO_TRACE) += trace.o +f2fs-$(CONFIG_F2FS_FS_ENCRYPTION) += crypto_policy.o crypto.o \ + crypto_key.o crypto_fname.o diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 742202779bd5..c8f25f7241f0 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -334,50 +334,48 @@ static int f2fs_acl_create(struct inode *dir, umode_t *mode, struct page *dpage) { struct posix_acl *p; + struct posix_acl *clone; int ret; + *acl = NULL; + *default_acl = NULL; + if (S_ISLNK(*mode) || !IS_POSIXACL(dir)) - goto no_acl; + return 0; p = __f2fs_get_acl(dir, ACL_TYPE_DEFAULT, dpage); - if (IS_ERR(p)) { - if (p == ERR_PTR(-EOPNOTSUPP)) - goto apply_umask; - return PTR_ERR(p); + if (!p || p == ERR_PTR(-EOPNOTSUPP)) { + *mode &= ~current_umask(); + return 0; } + if (IS_ERR(p)) + return PTR_ERR(p); - if (!p) - goto apply_umask; - - *acl = f2fs_acl_clone(p, GFP_NOFS); - if (!*acl) - return -ENOMEM; + clone = f2fs_acl_clone(p, GFP_NOFS); + if (!clone) + goto no_mem; - ret = f2fs_acl_create_masq(*acl, mode); - if (ret < 0) { - posix_acl_release(*acl); - return -ENOMEM; - } + ret = f2fs_acl_create_masq(clone, mode); + if (ret < 0) + goto no_mem_clone; - if (ret == 0) { - posix_acl_release(*acl); - *acl = NULL; - } + if (ret == 0) + posix_acl_release(clone); + else + *acl = clone; - if (!S_ISDIR(*mode)) { + if (!S_ISDIR(*mode)) posix_acl_release(p); - *default_acl = NULL; - } else { + else *default_acl = p; - } - return 0; -apply_umask: - *mode &= ~current_umask(); -no_acl: - *default_acl = NULL; - *acl = NULL; return 0; + +no_mem_clone: + posix_acl_release(clone); +no_mem: + posix_acl_release(p); + return -ENOMEM; } int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage, diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 7f794b72b3b7..b70bbe1a6a8c 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -52,9 +52,11 @@ struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) struct address_space *mapping = META_MAPPING(sbi); struct page *page; struct f2fs_io_info fio = { + .sbi = sbi, .type = META, .rw = READ_SYNC | REQ_META | REQ_PRIO, .blk_addr = index, + .encrypted_page = NULL, }; repeat: page = grab_cache_page(mapping, index); @@ -65,7 +67,9 @@ repeat: if (PageUptodate(page)) goto out; - if (f2fs_submit_page_bio(sbi, page, &fio)) + fio.page = page; + + if (f2fs_submit_page_bio(&fio)) goto repeat; lock_page(page); @@ -77,8 +81,7 @@ out: return page; } -static inline bool is_valid_blkaddr(struct f2fs_sb_info *sbi, - block_t blkaddr, int type) +bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type) { switch (type) { case META_NAT: @@ -118,8 +121,10 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int 
type struct page *page; block_t blkno = start; struct f2fs_io_info fio = { + .sbi = sbi, .type = META, - .rw = READ_SYNC | REQ_META | REQ_PRIO + .rw = READ_SYNC | REQ_META | REQ_PRIO, + .encrypted_page = NULL, }; for (; nrpages-- > 0; blkno++) { @@ -161,7 +166,8 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type continue; } - f2fs_submit_page_mbio(sbi, page, &fio); + fio.page = page; + f2fs_submit_page_mbio(&fio); f2fs_put_page(page, 0); } out: @@ -276,7 +282,7 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; - if (f2fs_write_meta_page(page, &wbc)) { + if (mapping->a_ops->writepage(page, &wbc)) { unlock_page(page); break; } @@ -464,20 +470,19 @@ static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) void recover_orphan_inodes(struct f2fs_sb_info *sbi) { - block_t start_blk, orphan_blkaddr, i, j; + block_t start_blk, orphan_blocks, i, j; if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG)) return; set_sbi_flag(sbi, SBI_POR_DOING); - start_blk = __start_cp_addr(sbi) + 1 + - le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload); - orphan_blkaddr = __start_sum_addr(sbi) - 1; + start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi); + orphan_blocks = __start_sum_addr(sbi) - 1 - __cp_payload(sbi); - ra_meta_pages(sbi, start_blk, orphan_blkaddr, META_CP); + ra_meta_pages(sbi, start_blk, orphan_blocks, META_CP); - for (i = 0; i < orphan_blkaddr; i++) { + for (i = 0; i < orphan_blocks; i++) { struct page *page = get_meta_page(sbi, start_blk + i); struct f2fs_orphan_block *orphan_blk; @@ -511,7 +516,12 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) grab_meta_page(sbi, start_blk + index); index = 1; - spin_lock(&im->ino_lock); + + /* + * we don't need to do spin_lock(&im->ino_lock) here, since all the + * orphan inode operations are covered under f2fs_lock_op(). + * And, spin_lock should be avoided due to page operations below. + */ head = &im->ino_list; /* loop for each orphan inode entry and write them in Journal block */ @@ -551,8 +561,6 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) set_page_dirty(page); f2fs_put_page(page, 1); } - - spin_unlock(&im->ino_lock); } static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, @@ -615,7 +623,7 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) unsigned long blk_size = sbi->blocksize; unsigned long long cp1_version = 0, cp2_version = 0; unsigned long long cp_start_blk_no; - unsigned int cp_blks = 1 + le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload); + unsigned int cp_blks = 1 + __cp_payload(sbi); block_t cp_blk_no; int i; @@ -796,6 +804,7 @@ retry: * writebacking dentry pages in the freeing inode. 
*/ f2fs_submit_merged_bio(sbi, DATA, WRITE); + cond_resched(); } goto retry; } @@ -879,12 +888,10 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num; nid_t last_nid = nm_i->next_scan_nid; block_t start_blk; - struct page *cp_page; unsigned int data_sum_blocks, orphan_blocks; __u32 crc32 = 0; - void *kaddr; int i; - int cp_payload_blks = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload); + int cp_payload_blks = __cp_payload(sbi); /* * This avoids to conduct wrong roll-forward operations and uses @@ -979,19 +986,11 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) start_blk = __start_cp_addr(sbi); /* write out checkpoint buffer at block 0 */ - cp_page = grab_meta_page(sbi, start_blk++); - kaddr = page_address(cp_page); - memcpy(kaddr, ckpt, F2FS_BLKSIZE); - set_page_dirty(cp_page); - f2fs_put_page(cp_page, 1); - - for (i = 1; i < 1 + cp_payload_blks; i++) { - cp_page = grab_meta_page(sbi, start_blk++); - kaddr = page_address(cp_page); - memcpy(kaddr, (char *)ckpt + i * F2FS_BLKSIZE, F2FS_BLKSIZE); - set_page_dirty(cp_page); - f2fs_put_page(cp_page, 1); - } + update_meta_page(sbi, ckpt, start_blk++); + + for (i = 1; i < 1 + cp_payload_blks; i++) + update_meta_page(sbi, (char *)ckpt + i * F2FS_BLKSIZE, + start_blk++); if (orphan_num) { write_orphan_inodes(sbi, start_blk); @@ -1006,11 +1005,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) } /* writeout checkpoint block */ - cp_page = grab_meta_page(sbi, start_blk); - kaddr = page_address(cp_page); - memcpy(kaddr, ckpt, F2FS_BLKSIZE); - set_page_dirty(cp_page); - f2fs_put_page(cp_page, 1); + update_meta_page(sbi, ckpt, start_blk); /* wait for previous submitted node/meta pages writeback */ wait_on_all_pages_writeback(sbi); @@ -1036,7 +1031,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) if (unlikely(f2fs_cp_error(sbi))) return; - clear_prefree_segments(sbi); + clear_prefree_segments(sbi, cpc); clear_sbi_flag(sbi, SBI_IS_DIRTY); } @@ -1048,17 +1043,19 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); unsigned long long ckpt_ver; - trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "start block_ops"); - mutex_lock(&sbi->cp_mutex); if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) && - cpc->reason != CP_DISCARD && cpc->reason != CP_UMOUNT) + (cpc->reason == CP_FASTBOOT || cpc->reason == CP_SYNC || + (cpc->reason == CP_DISCARD && !sbi->discard_blks))) goto out; if (unlikely(f2fs_cp_error(sbi))) goto out; if (f2fs_readonly(sbi->sb)) goto out; + + trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "start block_ops"); + if (block_operations(sbi)) goto out; @@ -1085,6 +1082,10 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); + + if (cpc->reason == CP_RECOVERY) + f2fs_msg(sbi->sb, KERN_NOTICE, + "checkpoint: version = %llx", ckpt_ver); out: mutex_unlock(&sbi->cp_mutex); trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint"); @@ -1103,14 +1104,9 @@ void init_ino_entry_info(struct f2fs_sb_info *sbi) im->ino_num = 0; } - /* - * considering 512 blocks in a segment 8 blocks are needed for cp - * and log segment summaries. 
Remaining blocks are used to keep - * orphan entries with the limitation one reserved segment - * for cp pack we can have max 1020*504 orphan entries - */ sbi->max_orphans = (sbi->blocks_per_seg - F2FS_CP_PACKS - - NR_CURSEG_TYPE) * F2FS_ORPHANS_PER_BLOCK; + NR_CURSEG_TYPE - __cp_payload(sbi)) * + F2FS_ORPHANS_PER_BLOCK; } int __init create_checkpoint_caches(void) diff --git a/fs/f2fs/crypto.c b/fs/f2fs/crypto.c new file mode 100644 index 000000000000..4a62ef14e932 --- /dev/null +++ b/fs/f2fs/crypto.c @@ -0,0 +1,491 @@ +/* + * linux/fs/f2fs/crypto.c + * + * Copied from linux/fs/ext4/crypto.c + * + * Copyright (C) 2015, Google, Inc. + * Copyright (C) 2015, Motorola Mobility + * + * This contains encryption functions for f2fs + * + * Written by Michael Halcrow, 2014. + * + * Filename encryption additions + * Uday Savagaonkar, 2014 + * Encryption policy handling additions + * Ildar Muslukhov, 2014 + * Remove ext4_encrypted_zeroout(), + * add f2fs_restore_and_release_control_page() + * Jaegeuk Kim, 2015. + * + * This has not yet undergone a rigorous security audit. + * + * The usage of AES-XTS should conform to recommendations in NIST + * Special Publication 800-38E and IEEE P1619/D16. + */ +#include <crypto/hash.h> +#include <crypto/sha.h> +#include <keys/user-type.h> +#include <keys/encrypted-type.h> +#include <linux/crypto.h> +#include <linux/ecryptfs.h> +#include <linux/gfp.h> +#include <linux/kernel.h> +#include <linux/key.h> +#include <linux/list.h> +#include <linux/mempool.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/random.h> +#include <linux/scatterlist.h> +#include <linux/spinlock_types.h> +#include <linux/f2fs_fs.h> +#include <linux/ratelimit.h> +#include <linux/bio.h> + +#include "f2fs.h" +#include "xattr.h" + +/* Encryption added and removed here! (L: */ + +static unsigned int num_prealloc_crypto_pages = 32; +static unsigned int num_prealloc_crypto_ctxs = 128; + +module_param(num_prealloc_crypto_pages, uint, 0444); +MODULE_PARM_DESC(num_prealloc_crypto_pages, + "Number of crypto pages to preallocate"); +module_param(num_prealloc_crypto_ctxs, uint, 0444); +MODULE_PARM_DESC(num_prealloc_crypto_ctxs, + "Number of crypto contexts to preallocate"); + +static mempool_t *f2fs_bounce_page_pool; + +static LIST_HEAD(f2fs_free_crypto_ctxs); +static DEFINE_SPINLOCK(f2fs_crypto_ctx_lock); + +static struct workqueue_struct *f2fs_read_workqueue; +static DEFINE_MUTEX(crypto_init); + +static struct kmem_cache *f2fs_crypto_ctx_cachep; +struct kmem_cache *f2fs_crypt_info_cachep; + +/** + * f2fs_release_crypto_ctx() - Releases an encryption context + * @ctx: The encryption context to release. + * + * If the encryption context was allocated from the pre-allocated pool, returns + * it to that pool. Else, frees it. + * + * If there's a bounce page in the context, this frees that. 
+ */ +void f2fs_release_crypto_ctx(struct f2fs_crypto_ctx *ctx) +{ + unsigned long flags; + + if (ctx->flags & F2FS_WRITE_PATH_FL && ctx->w.bounce_page) { + mempool_free(ctx->w.bounce_page, f2fs_bounce_page_pool); + ctx->w.bounce_page = NULL; + } + ctx->w.control_page = NULL; + if (ctx->flags & F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL) { + kmem_cache_free(f2fs_crypto_ctx_cachep, ctx); + } else { + spin_lock_irqsave(&f2fs_crypto_ctx_lock, flags); + list_add(&ctx->free_list, &f2fs_free_crypto_ctxs); + spin_unlock_irqrestore(&f2fs_crypto_ctx_lock, flags); + } +} + +/** + * f2fs_get_crypto_ctx() - Gets an encryption context + * @inode: The inode for which we are doing the crypto + * + * Allocates and initializes an encryption context. + * + * Return: An allocated and initialized encryption context on success; error + * value or NULL otherwise. + */ +struct f2fs_crypto_ctx *f2fs_get_crypto_ctx(struct inode *inode) +{ + struct f2fs_crypto_ctx *ctx = NULL; + unsigned long flags; + struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; + + if (ci == NULL) + return ERR_PTR(-ENOKEY); + + /* + * We first try getting the ctx from a free list because in + * the common case the ctx will have an allocated and + * initialized crypto tfm, so it's probably a worthwhile + * optimization. For the bounce page, we first try getting it + * from the kernel allocator because that's just about as fast + * as getting it from a list and because a cache of free pages + * should generally be a "last resort" option for a filesystem + * to be able to do its job. + */ + spin_lock_irqsave(&f2fs_crypto_ctx_lock, flags); + ctx = list_first_entry_or_null(&f2fs_free_crypto_ctxs, + struct f2fs_crypto_ctx, free_list); + if (ctx) + list_del(&ctx->free_list); + spin_unlock_irqrestore(&f2fs_crypto_ctx_lock, flags); + if (!ctx) { + ctx = kmem_cache_zalloc(f2fs_crypto_ctx_cachep, GFP_NOFS); + if (!ctx) + return ERR_PTR(-ENOMEM); + ctx->flags |= F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL; + } else { + ctx->flags &= ~F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL; + } + ctx->flags &= ~F2FS_WRITE_PATH_FL; + return ctx; +} + +/* + * Call f2fs_decrypt on every single page, reusing the encryption + * context. + */ +static void completion_pages(struct work_struct *work) +{ + struct f2fs_crypto_ctx *ctx = + container_of(work, struct f2fs_crypto_ctx, r.work); + struct bio *bio = ctx->r.bio; + struct bio_vec *bv; + int i; + + bio_for_each_segment_all(bv, bio, i) { + struct page *page = bv->bv_page; + int ret = f2fs_decrypt(ctx, page); + + if (ret) { + WARN_ON_ONCE(1); + SetPageError(page); + } else + SetPageUptodate(page); + unlock_page(page); + } + f2fs_release_crypto_ctx(ctx); + bio_put(bio); +} + +void f2fs_end_io_crypto_work(struct f2fs_crypto_ctx *ctx, struct bio *bio) +{ + INIT_WORK(&ctx->r.work, completion_pages); + ctx->r.bio = bio; + queue_work(f2fs_read_workqueue, &ctx->r.work); +} + +static void f2fs_crypto_destroy(void) +{ + struct f2fs_crypto_ctx *pos, *n; + + list_for_each_entry_safe(pos, n, &f2fs_free_crypto_ctxs, free_list) + kmem_cache_free(f2fs_crypto_ctx_cachep, pos); + INIT_LIST_HEAD(&f2fs_free_crypto_ctxs); + if (f2fs_bounce_page_pool) + mempool_destroy(f2fs_bounce_page_pool); + f2fs_bounce_page_pool = NULL; +} + +/** + * f2fs_crypto_initialize() - Set up for f2fs encryption. + * + * We only call this when we start accessing encrypted files, since it + * results in memory getting allocated that wouldn't otherwise be used. + * + * Return: Zero on success, non-zero otherwise. 
+ */ +int f2fs_crypto_initialize(void) +{ + int i, res = -ENOMEM; + + if (f2fs_bounce_page_pool) + return 0; + + mutex_lock(&crypto_init); + if (f2fs_bounce_page_pool) + goto already_initialized; + + for (i = 0; i < num_prealloc_crypto_ctxs; i++) { + struct f2fs_crypto_ctx *ctx; + + ctx = kmem_cache_zalloc(f2fs_crypto_ctx_cachep, GFP_KERNEL); + if (!ctx) + goto fail; + list_add(&ctx->free_list, &f2fs_free_crypto_ctxs); + } + + /* must be allocated at the last step to avoid race condition above */ + f2fs_bounce_page_pool = + mempool_create_page_pool(num_prealloc_crypto_pages, 0); + if (!f2fs_bounce_page_pool) + goto fail; + +already_initialized: + mutex_unlock(&crypto_init); + return 0; +fail: + f2fs_crypto_destroy(); + mutex_unlock(&crypto_init); + return res; +} + +/** + * f2fs_exit_crypto() - Shutdown the f2fs encryption system + */ +void f2fs_exit_crypto(void) +{ + f2fs_crypto_destroy(); + + if (f2fs_read_workqueue) + destroy_workqueue(f2fs_read_workqueue); + if (f2fs_crypto_ctx_cachep) + kmem_cache_destroy(f2fs_crypto_ctx_cachep); + if (f2fs_crypt_info_cachep) + kmem_cache_destroy(f2fs_crypt_info_cachep); +} + +int __init f2fs_init_crypto(void) +{ + int res = -ENOMEM; + + f2fs_read_workqueue = alloc_workqueue("f2fs_crypto", WQ_HIGHPRI, 0); + if (!f2fs_read_workqueue) + goto fail; + + f2fs_crypto_ctx_cachep = KMEM_CACHE(f2fs_crypto_ctx, + SLAB_RECLAIM_ACCOUNT); + if (!f2fs_crypto_ctx_cachep) + goto fail; + + f2fs_crypt_info_cachep = KMEM_CACHE(f2fs_crypt_info, + SLAB_RECLAIM_ACCOUNT); + if (!f2fs_crypt_info_cachep) + goto fail; + + return 0; +fail: + f2fs_exit_crypto(); + return res; +} + +void f2fs_restore_and_release_control_page(struct page **page) +{ + struct f2fs_crypto_ctx *ctx; + struct page *bounce_page; + + /* The bounce data pages are unmapped. */ + if ((*page)->mapping) + return; + + /* The bounce data page is unmapped. 
*/ + bounce_page = *page; + ctx = (struct f2fs_crypto_ctx *)page_private(bounce_page); + + /* restore control page */ + *page = ctx->w.control_page; + + f2fs_restore_control_page(bounce_page); +} + +void f2fs_restore_control_page(struct page *data_page) +{ + struct f2fs_crypto_ctx *ctx = + (struct f2fs_crypto_ctx *)page_private(data_page); + + set_page_private(data_page, (unsigned long)NULL); + ClearPagePrivate(data_page); + unlock_page(data_page); + f2fs_release_crypto_ctx(ctx); +} + +/** + * f2fs_crypt_complete() - The completion callback for page encryption + * @req: The asynchronous encryption request context + * @res: The result of the encryption operation + */ +static void f2fs_crypt_complete(struct crypto_async_request *req, int res) +{ + struct f2fs_completion_result *ecr = req->data; + + if (res == -EINPROGRESS) + return; + ecr->res = res; + complete(&ecr->completion); +} + +typedef enum { + F2FS_DECRYPT = 0, + F2FS_ENCRYPT, +} f2fs_direction_t; + +static int f2fs_page_crypto(struct f2fs_crypto_ctx *ctx, + struct inode *inode, + f2fs_direction_t rw, + pgoff_t index, + struct page *src_page, + struct page *dest_page) +{ + u8 xts_tweak[F2FS_XTS_TWEAK_SIZE]; + struct ablkcipher_request *req = NULL; + DECLARE_F2FS_COMPLETION_RESULT(ecr); + struct scatterlist dst, src; + struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; + struct crypto_ablkcipher *tfm = ci->ci_ctfm; + int res = 0; + + req = ablkcipher_request_alloc(tfm, GFP_NOFS); + if (!req) { + printk_ratelimited(KERN_ERR + "%s: crypto_request_alloc() failed\n", + __func__); + return -ENOMEM; + } + ablkcipher_request_set_callback( + req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + f2fs_crypt_complete, &ecr); + + BUILD_BUG_ON(F2FS_XTS_TWEAK_SIZE < sizeof(index)); + memcpy(xts_tweak, &index, sizeof(index)); + memset(&xts_tweak[sizeof(index)], 0, + F2FS_XTS_TWEAK_SIZE - sizeof(index)); + + sg_init_table(&dst, 1); + sg_set_page(&dst, dest_page, PAGE_CACHE_SIZE, 0); + sg_init_table(&src, 1); + sg_set_page(&src, src_page, PAGE_CACHE_SIZE, 0); + ablkcipher_request_set_crypt(req, &src, &dst, PAGE_CACHE_SIZE, + xts_tweak); + if (rw == F2FS_DECRYPT) + res = crypto_ablkcipher_decrypt(req); + else + res = crypto_ablkcipher_encrypt(req); + if (res == -EINPROGRESS || res == -EBUSY) { + BUG_ON(req->base.data != &ecr); + wait_for_completion(&ecr.completion); + res = ecr.res; + } + ablkcipher_request_free(req); + if (res) { + printk_ratelimited(KERN_ERR + "%s: crypto_ablkcipher_encrypt() returned %d\n", + __func__, res); + return res; + } + return 0; +} + +static struct page *alloc_bounce_page(struct f2fs_crypto_ctx *ctx) +{ + ctx->w.bounce_page = mempool_alloc(f2fs_bounce_page_pool, GFP_NOWAIT); + if (ctx->w.bounce_page == NULL) + return ERR_PTR(-ENOMEM); + ctx->flags |= F2FS_WRITE_PATH_FL; + return ctx->w.bounce_page; +} + +/** + * f2fs_encrypt() - Encrypts a page + * @inode: The inode for which the encryption should take place + * @plaintext_page: The page to encrypt. Must be locked. + * + * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx + * encryption context. + * + * Called on the page write path. The caller must call + * f2fs_restore_control_page() on the returned ciphertext page to + * release the bounce buffer and the encryption context. + * + * Return: An allocated page with the encrypted content on success. Else, an + * error value or NULL. 
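+ * + * A typical write-path sequence, sketched from the data.c hunks below: the + * caller submits the returned bounce page for I/O, and on bio completion + * f2fs_write_end_io() calls f2fs_restore_and_release_control_page() to swap + * the original plaintext page back and release the context.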
+ */ +struct page *f2fs_encrypt(struct inode *inode, + struct page *plaintext_page) +{ + struct f2fs_crypto_ctx *ctx; + struct page *ciphertext_page = NULL; + int err; + + BUG_ON(!PageLocked(plaintext_page)); + + ctx = f2fs_get_crypto_ctx(inode); + if (IS_ERR(ctx)) + return (struct page *)ctx; + + /* The encryption operation will require a bounce page. */ + ciphertext_page = alloc_bounce_page(ctx); + if (IS_ERR(ciphertext_page)) + goto err_out; + + ctx->w.control_page = plaintext_page; + err = f2fs_page_crypto(ctx, inode, F2FS_ENCRYPT, plaintext_page->index, + plaintext_page, ciphertext_page); + if (err) { + ciphertext_page = ERR_PTR(err); + goto err_out; + } + + SetPagePrivate(ciphertext_page); + set_page_private(ciphertext_page, (unsigned long)ctx); + lock_page(ciphertext_page); + return ciphertext_page; + +err_out: + f2fs_release_crypto_ctx(ctx); + return ciphertext_page; +} + +/** + * f2fs_decrypt() - Decrypts a page in-place + * @ctx: The encryption context. + * @page: The page to decrypt. Must be locked. + * + * Decrypts page in-place using the ctx encryption context. + * + * Called from the read completion callback. + * + * Return: Zero on success, non-zero otherwise. + */ +int f2fs_decrypt(struct f2fs_crypto_ctx *ctx, struct page *page) +{ + BUG_ON(!PageLocked(page)); + + return f2fs_page_crypto(ctx, page->mapping->host, + F2FS_DECRYPT, page->index, page, page); +} + +/* + * Convenience function which takes care of allocating and + * deallocating the encryption context + */ +int f2fs_decrypt_one(struct inode *inode, struct page *page) +{ + struct f2fs_crypto_ctx *ctx = f2fs_get_crypto_ctx(inode); + int ret; + + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + ret = f2fs_decrypt(ctx, page); + f2fs_release_crypto_ctx(ctx); + return ret; +} + +bool f2fs_valid_contents_enc_mode(uint32_t mode) +{ + return (mode == F2FS_ENCRYPTION_MODE_AES_256_XTS); +} + +/** + * f2fs_validate_encryption_key_size() - Validate the encryption key size + * @mode: The key mode. + * @size: The key size to validate. + * + * Return: The validated key size for @mode. Zero if invalid. + */ +uint32_t f2fs_validate_encryption_key_size(uint32_t mode, uint32_t size) +{ + if (size == f2fs_encryption_key_size(mode)) + return size; + return 0; +} diff --git a/fs/f2fs/crypto_fname.c b/fs/f2fs/crypto_fname.c new file mode 100644 index 000000000000..ab377d496a39 --- /dev/null +++ b/fs/f2fs/crypto_fname.c @@ -0,0 +1,440 @@ +/* + * linux/fs/f2fs/crypto_fname.c + * + * Copied from linux/fs/ext4/crypto.c + * + * Copyright (C) 2015, Google, Inc. + * Copyright (C) 2015, Motorola Mobility + * + * This contains functions for filename crypto management in f2fs + * + * Written by Uday Savagaonkar, 2014. + * + * Adjust f2fs dentry structure + * Jaegeuk Kim, 2015. + * + * This has not yet undergone a rigorous security audit. 
+ */ +#include <crypto/hash.h> +#include <crypto/sha.h> +#include <keys/encrypted-type.h> +#include <keys/user-type.h> +#include <linux/crypto.h> +#include <linux/gfp.h> +#include <linux/kernel.h> +#include <linux/key.h> +#include <linux/list.h> +#include <linux/mempool.h> +#include <linux/random.h> +#include <linux/scatterlist.h> +#include <linux/spinlock_types.h> +#include <linux/f2fs_fs.h> +#include <linux/ratelimit.h> + +#include "f2fs.h" +#include "f2fs_crypto.h" +#include "xattr.h" + +/** + * f2fs_dir_crypt_complete() - + */ +static void f2fs_dir_crypt_complete(struct crypto_async_request *req, int res) +{ + struct f2fs_completion_result *ecr = req->data; + + if (res == -EINPROGRESS) + return; + ecr->res = res; + complete(&ecr->completion); +} + +bool f2fs_valid_filenames_enc_mode(uint32_t mode) +{ + return (mode == F2FS_ENCRYPTION_MODE_AES_256_CTS); +} + +static unsigned max_name_len(struct inode *inode) +{ + return S_ISLNK(inode->i_mode) ? inode->i_sb->s_blocksize : + F2FS_NAME_LEN; +} + +/** + * f2fs_fname_encrypt() - + * + * This function encrypts the input filename, and returns the length of the + * ciphertext. Errors are returned as negative numbers. We trust the caller to + * allocate sufficient memory to oname string. + */ +static int f2fs_fname_encrypt(struct inode *inode, + const struct qstr *iname, struct f2fs_str *oname) +{ + u32 ciphertext_len; + struct ablkcipher_request *req = NULL; + DECLARE_F2FS_COMPLETION_RESULT(ecr); + struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; + struct crypto_ablkcipher *tfm = ci->ci_ctfm; + int res = 0; + char iv[F2FS_CRYPTO_BLOCK_SIZE]; + struct scatterlist src_sg, dst_sg; + int padding = 4 << (ci->ci_flags & F2FS_POLICY_FLAGS_PAD_MASK); + char *workbuf, buf[32], *alloc_buf = NULL; + unsigned lim = max_name_len(inode); + + if (iname->len <= 0 || iname->len > lim) + return -EIO; + + ciphertext_len = (iname->len < F2FS_CRYPTO_BLOCK_SIZE) ? + F2FS_CRYPTO_BLOCK_SIZE : iname->len; + ciphertext_len = f2fs_fname_crypto_round_up(ciphertext_len, padding); + ciphertext_len = (ciphertext_len > lim) ? 
lim : ciphertext_len; + + if (ciphertext_len <= sizeof(buf)) { + workbuf = buf; + } else { + alloc_buf = kmalloc(ciphertext_len, GFP_NOFS); + if (!alloc_buf) + return -ENOMEM; + workbuf = alloc_buf; + } + + /* Allocate request */ + req = ablkcipher_request_alloc(tfm, GFP_NOFS); + if (!req) { + printk_ratelimited(KERN_ERR + "%s: crypto_request_alloc() failed\n", __func__); + kfree(alloc_buf); + return -ENOMEM; + } + ablkcipher_request_set_callback(req, + CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + f2fs_dir_crypt_complete, &ecr); + + /* Copy the input */ + memcpy(workbuf, iname->name, iname->len); + if (iname->len < ciphertext_len) + memset(workbuf + iname->len, 0, ciphertext_len - iname->len); + + /* Initialize IV */ + memset(iv, 0, F2FS_CRYPTO_BLOCK_SIZE); + + /* Create encryption request */ + sg_init_one(&src_sg, workbuf, ciphertext_len); + sg_init_one(&dst_sg, oname->name, ciphertext_len); + ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv); + res = crypto_ablkcipher_encrypt(req); + if (res == -EINPROGRESS || res == -EBUSY) { + BUG_ON(req->base.data != &ecr); + wait_for_completion(&ecr.completion); + res = ecr.res; + } + kfree(alloc_buf); + ablkcipher_request_free(req); + if (res < 0) { + printk_ratelimited(KERN_ERR + "%s: Error (error code %d)\n", __func__, res); + } + oname->len = ciphertext_len; + return res; +} + +/* + * f2fs_fname_decrypt() + * This function decrypts the input filename, and returns + * the length of the plaintext. + * Errors are returned as negative numbers. + * We trust the caller to allocate sufficient memory to oname string. + */ +static int f2fs_fname_decrypt(struct inode *inode, + const struct f2fs_str *iname, struct f2fs_str *oname) +{ + struct ablkcipher_request *req = NULL; + DECLARE_F2FS_COMPLETION_RESULT(ecr); + struct scatterlist src_sg, dst_sg; + struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; + struct crypto_ablkcipher *tfm = ci->ci_ctfm; + int res = 0; + char iv[F2FS_CRYPTO_BLOCK_SIZE]; + unsigned lim = max_name_len(inode); + + if (iname->len <= 0 || iname->len > lim) + return -EIO; + + /* Allocate request */ + req = ablkcipher_request_alloc(tfm, GFP_NOFS); + if (!req) { + printk_ratelimited(KERN_ERR + "%s: crypto_request_alloc() failed\n", __func__); + return -ENOMEM; + } + ablkcipher_request_set_callback(req, + CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + f2fs_dir_crypt_complete, &ecr); + + /* Initialize IV */ + memset(iv, 0, F2FS_CRYPTO_BLOCK_SIZE); + + /* Create decryption request */ + sg_init_one(&src_sg, iname->name, iname->len); + sg_init_one(&dst_sg, oname->name, oname->len); + ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv); + res = crypto_ablkcipher_decrypt(req); + if (res == -EINPROGRESS || res == -EBUSY) { + BUG_ON(req->base.data != &ecr); + wait_for_completion(&ecr.completion); + res = ecr.res; + } + ablkcipher_request_free(req); + if (res < 0) { + printk_ratelimited(KERN_ERR + "%s: Error in f2fs_fname_decrypt (error code %d)\n", + __func__, res); + return res; + } + + oname->len = strnlen(oname->name, iname->len); + return oname->len; +} + +static const char *lookup_table = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,"; + +/** + * digest_encode() - + * + * Encodes the input digest using characters from the set [A-Za-z0-9+,]. + * The encoded string is roughly 4/3 times the size of the input string. 
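+ * + * For example: each output character carries 6 bits, so the 24-byte buffer + * that f2fs_fname_disk_to_usr() encodes becomes (24 * 8) / 6 = 32 characters, + * and the leading '_' added there yields the 33-character "bigname" form that + * f2fs_fname_setup_filename() later checks for.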
+ */ +static int digest_encode(const char *src, int len, char *dst) +{ + int i = 0, bits = 0, ac = 0; + char *cp = dst; + + while (i < len) { + ac += (((unsigned char) src[i]) << bits); + bits += 8; + do { + *cp++ = lookup_table[ac & 0x3f]; + ac >>= 6; + bits -= 6; + } while (bits >= 6); + i++; + } + if (bits) + *cp++ = lookup_table[ac & 0x3f]; + return cp - dst; +} + +static int digest_decode(const char *src, int len, char *dst) +{ + int i = 0, bits = 0, ac = 0; + const char *p; + char *cp = dst; + + while (i < len) { + p = strchr(lookup_table, src[i]); + if (p == NULL || src[i] == 0) + return -2; + ac += (p - lookup_table) << bits; + bits += 6; + if (bits >= 8) { + *cp++ = ac & 0xff; + ac >>= 8; + bits -= 8; + } + i++; + } + if (ac) + return -1; + return cp - dst; +} + +/** + * f2fs_fname_crypto_round_up() - + * + * Return: The next multiple of block size + */ +u32 f2fs_fname_crypto_round_up(u32 size, u32 blksize) +{ + return ((size + blksize - 1) / blksize) * blksize; +} + +/** + * f2fs_fname_crypto_alloc_buffer() - + * + * Allocates an output buffer that is sufficient for the crypto operation + * specified by the context and the direction. + */ +int f2fs_fname_crypto_alloc_buffer(struct inode *inode, + u32 ilen, struct f2fs_str *crypto_str) +{ + unsigned int olen; + int padding = 16; + struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; + + if (ci) + padding = 4 << (ci->ci_flags & F2FS_POLICY_FLAGS_PAD_MASK); + if (padding < F2FS_CRYPTO_BLOCK_SIZE) + padding = F2FS_CRYPTO_BLOCK_SIZE; + olen = f2fs_fname_crypto_round_up(ilen, padding); + crypto_str->len = olen; + if (olen < F2FS_FNAME_CRYPTO_DIGEST_SIZE * 2) + olen = F2FS_FNAME_CRYPTO_DIGEST_SIZE * 2; + /* Allocated buffer can hold one more character to null-terminate the + * string */ + crypto_str->name = kmalloc(olen + 1, GFP_NOFS); + if (!(crypto_str->name)) + return -ENOMEM; + return 0; +} + +/** + * f2fs_fname_crypto_free_buffer() - + * + * Frees the buffer allocated for crypto operation. 
+ */ +void f2fs_fname_crypto_free_buffer(struct f2fs_str *crypto_str) +{ + if (!crypto_str) + return; + kfree(crypto_str->name); + crypto_str->name = NULL; +} + +/** + * f2fs_fname_disk_to_usr() - converts a filename from disk space to user space + */ +int f2fs_fname_disk_to_usr(struct inode *inode, + f2fs_hash_t *hash, + const struct f2fs_str *iname, + struct f2fs_str *oname) +{ + const struct qstr qname = FSTR_TO_QSTR(iname); + char buf[24]; + int ret; + + if (is_dot_dotdot(&qname)) { + oname->name[0] = '.'; + oname->name[iname->len - 1] = '.'; + oname->len = iname->len; + return oname->len; + } + + if (F2FS_I(inode)->i_crypt_info) + return f2fs_fname_decrypt(inode, iname, oname); + + if (iname->len <= F2FS_FNAME_CRYPTO_DIGEST_SIZE) { + ret = digest_encode(iname->name, iname->len, oname->name); + oname->len = ret; + return ret; + } + if (hash) { + memcpy(buf, hash, 4); + memset(buf + 4, 0, 4); + } else + memset(buf, 0, 8); + memcpy(buf + 8, iname->name + iname->len - 16, 16); + oname->name[0] = '_'; + ret = digest_encode(buf, 24, oname->name + 1); + oname->len = ret + 1; + return ret + 1; +} + +/** + * f2fs_fname_usr_to_disk() - converts a filename from user space to disk space + */ +int f2fs_fname_usr_to_disk(struct inode *inode, + const struct qstr *iname, + struct f2fs_str *oname) +{ + int res; + struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; + + if (is_dot_dotdot(iname)) { + oname->name[0] = '.'; + oname->name[iname->len - 1] = '.'; + oname->len = iname->len; + return oname->len; + } + + if (ci) { + res = f2fs_fname_encrypt(inode, iname, oname); + return res; + } + /* Without a proper key, a user is not allowed to modify the filenames + * in a directory. Consequently, a user space name cannot be mapped to + * a disk-space name */ + return -EACCES; +} + +int f2fs_fname_setup_filename(struct inode *dir, const struct qstr *iname, + int lookup, struct f2fs_filename *fname) +{ + struct f2fs_crypt_info *ci; + int ret = 0, bigname = 0; + + memset(fname, 0, sizeof(struct f2fs_filename)); + fname->usr_fname = iname; + + if (!f2fs_encrypted_inode(dir) || is_dot_dotdot(iname)) { + fname->disk_name.name = (unsigned char *)iname->name; + fname->disk_name.len = iname->len; + return 0; + } + ret = f2fs_get_encryption_info(dir); + if (ret) + return ret; + ci = F2FS_I(dir)->i_crypt_info; + if (ci) { + ret = f2fs_fname_crypto_alloc_buffer(dir, iname->len, + &fname->crypto_buf); + if (ret < 0) + return ret; + ret = f2fs_fname_encrypt(dir, iname, &fname->crypto_buf); + if (ret < 0) + goto errout; + fname->disk_name.name = fname->crypto_buf.name; + fname->disk_name.len = fname->crypto_buf.len; + return 0; + } + if (!lookup) + return -EACCES; + + /* We don't have the key and we are doing a lookup; decode the + * user-supplied name + */ + if (iname->name[0] == '_') + bigname = 1; + if ((bigname && (iname->len != 33)) || + (!bigname && (iname->len > 43))) + return -ENOENT; + + fname->crypto_buf.name = kmalloc(32, GFP_KERNEL); + if (fname->crypto_buf.name == NULL) + return -ENOMEM; + ret = digest_decode(iname->name + bigname, iname->len - bigname, + fname->crypto_buf.name); + if (ret < 0) { + ret = -ENOENT; + goto errout; + } + fname->crypto_buf.len = ret; + if (bigname) { + memcpy(&fname->hash, fname->crypto_buf.name, 4); + } else { + fname->disk_name.name = fname->crypto_buf.name; + fname->disk_name.len = fname->crypto_buf.len; + } + return 0; +errout: + f2fs_fname_crypto_free_buffer(&fname->crypto_buf); + return ret; +} + +void f2fs_fname_free_filename(struct f2fs_filename *fname) +{ + 
kfree(fname->crypto_buf.name); + fname->crypto_buf.name = NULL; + fname->usr_fname = NULL; + fname->disk_name.name = NULL; +} diff --git a/fs/f2fs/crypto_key.c b/fs/f2fs/crypto_key.c new file mode 100644 index 000000000000..95b8f936f00b --- /dev/null +++ b/fs/f2fs/crypto_key.c @@ -0,0 +1,255 @@ +/* + * linux/fs/f2fs/crypto_key.c + * + * Copied from linux/fs/ext4/crypto_key.c + * + * Copyright (C) 2015, Google, Inc. + * + * This contains encryption key functions for f2fs + * + * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015. + */ +#include <keys/encrypted-type.h> +#include <keys/user-type.h> +#include <linux/random.h> +#include <linux/scatterlist.h> +#include <uapi/linux/keyctl.h> +#include <crypto/hash.h> +#include <linux/f2fs_fs.h> + +#include "f2fs.h" +#include "xattr.h" + +static void derive_crypt_complete(struct crypto_async_request *req, int rc) +{ + struct f2fs_completion_result *ecr = req->data; + + if (rc == -EINPROGRESS) + return; + + ecr->res = rc; + complete(&ecr->completion); } + +/** + * f2fs_derive_key_aes() - Derive a key using AES-128-ECB + * @deriving_key: Encryption key used for derivation. + * @source_key: Source key to which to apply derivation. + * @derived_key: Derived key. + * + * Return: Zero on success; non-zero otherwise. + */ +static int f2fs_derive_key_aes(char deriving_key[F2FS_AES_128_ECB_KEY_SIZE], + char source_key[F2FS_AES_256_XTS_KEY_SIZE], + char derived_key[F2FS_AES_256_XTS_KEY_SIZE]) +{ + int res = 0; + struct ablkcipher_request *req = NULL; + DECLARE_F2FS_COMPLETION_RESULT(ecr); + struct scatterlist src_sg, dst_sg; + struct crypto_ablkcipher *tfm = crypto_alloc_ablkcipher("ecb(aes)", 0, + 0); + + if (IS_ERR(tfm)) { + res = PTR_ERR(tfm); + tfm = NULL; + goto out; + } + crypto_ablkcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY); + req = ablkcipher_request_alloc(tfm, GFP_NOFS); + if (!req) { + res = -ENOMEM; + goto out; + } + ablkcipher_request_set_callback(req, + CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + derive_crypt_complete, &ecr); + res = crypto_ablkcipher_setkey(tfm, deriving_key, + F2FS_AES_128_ECB_KEY_SIZE); + if (res < 0) + goto out; + + sg_init_one(&src_sg, source_key, F2FS_AES_256_XTS_KEY_SIZE); + sg_init_one(&dst_sg, derived_key, F2FS_AES_256_XTS_KEY_SIZE); + ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, + F2FS_AES_256_XTS_KEY_SIZE, NULL); + res = crypto_ablkcipher_encrypt(req); + if (res == -EINPROGRESS || res == -EBUSY) { + BUG_ON(req->base.data != &ecr); + wait_for_completion(&ecr.completion); + res = ecr.res; + } +out: + if (req) + ablkcipher_request_free(req); + if (tfm) + crypto_free_ablkcipher(tfm); + return res; +} + +static void f2fs_free_crypt_info(struct f2fs_crypt_info *ci) +{ + if (!ci) + return; + + if (ci->ci_keyring_key) + key_put(ci->ci_keyring_key); + crypto_free_ablkcipher(ci->ci_ctfm); + kmem_cache_free(f2fs_crypt_info_cachep, ci); +} + +void f2fs_free_encryption_info(struct inode *inode, struct f2fs_crypt_info *ci) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_crypt_info *prev; + + if (ci == NULL) + ci = ACCESS_ONCE(fi->i_crypt_info); + if (ci == NULL) + return; + prev = cmpxchg(&fi->i_crypt_info, ci, NULL); + if (prev != ci) + return; + + f2fs_free_crypt_info(ci); +} + +int _f2fs_get_encryption_info(struct inode *inode) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_crypt_info *crypt_info; + char full_key_descriptor[F2FS_KEY_DESC_PREFIX_SIZE + + (F2FS_KEY_DESCRIPTOR_SIZE * 2) + 1]; + struct key *keyring_key = NULL; + struct f2fs_encryption_key 
*master_key; + struct f2fs_encryption_context ctx; + struct user_key_payload *ukp; + struct crypto_ablkcipher *ctfm; + const char *cipher_str; + char raw_key[F2FS_MAX_KEY_SIZE]; + char mode; + int res; + + res = f2fs_crypto_initialize(); + if (res) + return res; +retry: + crypt_info = ACCESS_ONCE(fi->i_crypt_info); + if (crypt_info) { + if (!crypt_info->ci_keyring_key || + key_validate(crypt_info->ci_keyring_key) == 0) + return 0; + f2fs_free_encryption_info(inode, crypt_info); + goto retry; + } + + res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, + F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, + &ctx, sizeof(ctx), NULL); + if (res < 0) + return res; + else if (res != sizeof(ctx)) + return -EINVAL; + res = 0; + + crypt_info = kmem_cache_alloc(f2fs_crypt_info_cachep, GFP_NOFS); + if (!crypt_info) + return -ENOMEM; + + crypt_info->ci_flags = ctx.flags; + crypt_info->ci_data_mode = ctx.contents_encryption_mode; + crypt_info->ci_filename_mode = ctx.filenames_encryption_mode; + crypt_info->ci_ctfm = NULL; + crypt_info->ci_keyring_key = NULL; + memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, + sizeof(crypt_info->ci_master_key)); + if (S_ISREG(inode->i_mode)) + mode = crypt_info->ci_data_mode; + else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + mode = crypt_info->ci_filename_mode; + else + BUG(); + + switch (mode) { + case F2FS_ENCRYPTION_MODE_AES_256_XTS: + cipher_str = "xts(aes)"; + break; + case F2FS_ENCRYPTION_MODE_AES_256_CTS: + cipher_str = "cts(cbc(aes))"; + break; + default: + printk_once(KERN_WARNING + "f2fs: unsupported key mode %d (ino %u)\n", + mode, (unsigned) inode->i_ino); + res = -ENOKEY; + goto out; + } + + memcpy(full_key_descriptor, F2FS_KEY_DESC_PREFIX, + F2FS_KEY_DESC_PREFIX_SIZE); + sprintf(full_key_descriptor + F2FS_KEY_DESC_PREFIX_SIZE, + "%*phN", F2FS_KEY_DESCRIPTOR_SIZE, + ctx.master_key_descriptor); + full_key_descriptor[F2FS_KEY_DESC_PREFIX_SIZE + + (2 * F2FS_KEY_DESCRIPTOR_SIZE)] = '\0'; + keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL); + if (IS_ERR(keyring_key)) { + res = PTR_ERR(keyring_key); + keyring_key = NULL; + goto out; + } + crypt_info->ci_keyring_key = keyring_key; + BUG_ON(keyring_key->type != &key_type_logon); + ukp = ((struct user_key_payload *)keyring_key->payload.data); + if (ukp->datalen != sizeof(struct f2fs_encryption_key)) { + res = -EINVAL; + goto out; + } + master_key = (struct f2fs_encryption_key *)ukp->data; + BUILD_BUG_ON(F2FS_AES_128_ECB_KEY_SIZE != + F2FS_KEY_DERIVATION_NONCE_SIZE); + BUG_ON(master_key->size != F2FS_AES_256_XTS_KEY_SIZE); + res = f2fs_derive_key_aes(ctx.nonce, master_key->raw, + raw_key); + if (res) + goto out; + + ctfm = crypto_alloc_ablkcipher(cipher_str, 0, 0); + if (!ctfm || IS_ERR(ctfm)) { + res = ctfm ? 
PTR_ERR(ctfm) : -ENOMEM; + printk(KERN_DEBUG + "%s: error %d (inode %u) allocating crypto tfm\n", + __func__, res, (unsigned) inode->i_ino); + goto out; + } + crypt_info->ci_ctfm = ctfm; + crypto_ablkcipher_clear_flags(ctfm, ~0); + crypto_tfm_set_flags(crypto_ablkcipher_tfm(ctfm), + CRYPTO_TFM_REQ_WEAK_KEY); + res = crypto_ablkcipher_setkey(ctfm, raw_key, + f2fs_encryption_key_size(mode)); + if (res) + goto out; + + memzero_explicit(raw_key, sizeof(raw_key)); + if (cmpxchg(&fi->i_crypt_info, NULL, crypt_info) != NULL) { + f2fs_free_crypt_info(crypt_info); + goto retry; + } + return 0; + +out: + if (res == -ENOKEY && !S_ISREG(inode->i_mode)) + res = 0; + + f2fs_free_crypt_info(crypt_info); + memzero_explicit(raw_key, sizeof(raw_key)); + return res; +} + +int f2fs_has_encryption_key(struct inode *inode) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + + return (fi->i_crypt_info != NULL); +} diff --git a/fs/f2fs/crypto_policy.c b/fs/f2fs/crypto_policy.c new file mode 100644 index 000000000000..d4a96af513c2 --- /dev/null +++ b/fs/f2fs/crypto_policy.c @@ -0,0 +1,209 @@ +/* + * copied from linux/fs/ext4/crypto_policy.c + * + * Copyright (C) 2015, Google, Inc. + * Copyright (C) 2015, Motorola Mobility. + * + * This contains encryption policy functions for f2fs with some modifications + * to support f2fs-specific xattr APIs. + * + * Written by Michael Halcrow, 2015. + * Modified by Jaegeuk Kim, 2015. + */ +#include <linux/random.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/f2fs_fs.h> + +#include "f2fs.h" +#include "xattr.h" + +static int f2fs_inode_has_encryption_context(struct inode *inode) +{ + int res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, + F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, NULL, 0, NULL); + return (res > 0); +} + +/* + * check whether the policy is consistent with the encryption context + * for the inode + */ +static int f2fs_is_encryption_context_consistent_with_policy( + struct inode *inode, const struct f2fs_encryption_policy *policy) +{ + struct f2fs_encryption_context ctx; + int res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, + F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, + sizeof(ctx), NULL); + + if (res != sizeof(ctx)) + return 0; + + return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor, + F2FS_KEY_DESCRIPTOR_SIZE) == 0 && + (ctx.flags == policy->flags) && + (ctx.contents_encryption_mode == + policy->contents_encryption_mode) && + (ctx.filenames_encryption_mode == + policy->filenames_encryption_mode)); +} + +static int f2fs_create_encryption_context_from_policy( + struct inode *inode, const struct f2fs_encryption_policy *policy) +{ + struct f2fs_encryption_context ctx; + + ctx.format = F2FS_ENCRYPTION_CONTEXT_FORMAT_V1; + memcpy(ctx.master_key_descriptor, policy->master_key_descriptor, + F2FS_KEY_DESCRIPTOR_SIZE); + + if (!f2fs_valid_contents_enc_mode(policy->contents_encryption_mode)) { + printk(KERN_WARNING + "%s: Invalid contents encryption mode %d\n", __func__, + policy->contents_encryption_mode); + return -EINVAL; + } + + if (!f2fs_valid_filenames_enc_mode(policy->filenames_encryption_mode)) { + printk(KERN_WARNING + "%s: Invalid filenames encryption mode %d\n", __func__, + policy->filenames_encryption_mode); + return -EINVAL; + } + + if (policy->flags & ~F2FS_POLICY_FLAGS_VALID) + return -EINVAL; + + ctx.contents_encryption_mode = policy->contents_encryption_mode; + ctx.filenames_encryption_mode = policy->filenames_encryption_mode; + ctx.flags = policy->flags; + BUILD_BUG_ON(sizeof(ctx.nonce) != 
F2FS_KEY_DERIVATION_NONCE_SIZE); + get_random_bytes(ctx.nonce, F2FS_KEY_DERIVATION_NONCE_SIZE); + + return f2fs_setxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, + F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, + sizeof(ctx), NULL, XATTR_CREATE); +} + +int f2fs_process_policy(const struct f2fs_encryption_policy *policy, + struct inode *inode) +{ + if (policy->version != 0) + return -EINVAL; + + if (!S_ISDIR(inode->i_mode)) + return -EINVAL; + + if (!f2fs_inode_has_encryption_context(inode)) { + if (!f2fs_empty_dir(inode)) + return -ENOTEMPTY; + return f2fs_create_encryption_context_from_policy(inode, + policy); + } + + if (f2fs_is_encryption_context_consistent_with_policy(inode, policy)) + return 0; + + printk(KERN_WARNING "%s: Policy inconsistent with encryption context\n", + __func__); + return -EINVAL; +} + +int f2fs_get_policy(struct inode *inode, struct f2fs_encryption_policy *policy) +{ + struct f2fs_encryption_context ctx; + int res; + + if (!f2fs_encrypted_inode(inode)) + return -ENODATA; + + res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, + F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, + &ctx, sizeof(ctx), NULL); + if (res != sizeof(ctx)) + return -ENODATA; + if (ctx.format != F2FS_ENCRYPTION_CONTEXT_FORMAT_V1) + return -EINVAL; + + policy->version = 0; + policy->contents_encryption_mode = ctx.contents_encryption_mode; + policy->filenames_encryption_mode = ctx.filenames_encryption_mode; + policy->flags = ctx.flags; + memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor, + F2FS_KEY_DESCRIPTOR_SIZE); + return 0; +} + +int f2fs_is_child_context_consistent_with_parent(struct inode *parent, + struct inode *child) +{ + struct f2fs_crypt_info *parent_ci, *child_ci; + int res; + + if ((parent == NULL) || (child == NULL)) { + pr_err("parent %p child %p\n", parent, child); + BUG_ON(1); + } + + /* no restrictions if the parent directory is not encrypted */ + if (!f2fs_encrypted_inode(parent)) + return 1; + /* if the child directory is not encrypted, this is always a problem */ + if (!f2fs_encrypted_inode(child)) + return 0; + res = f2fs_get_encryption_info(parent); + if (res) + return 0; + res = f2fs_get_encryption_info(child); + if (res) + return 0; + parent_ci = F2FS_I(parent)->i_crypt_info; + child_ci = F2FS_I(child)->i_crypt_info; + if (!parent_ci && !child_ci) + return 1; + if (!parent_ci || !child_ci) + return 0; + + return (memcmp(parent_ci->ci_master_key, + child_ci->ci_master_key, + F2FS_KEY_DESCRIPTOR_SIZE) == 0 && + (parent_ci->ci_data_mode == child_ci->ci_data_mode) && + (parent_ci->ci_filename_mode == child_ci->ci_filename_mode) && + (parent_ci->ci_flags == child_ci->ci_flags)); +} + +/** + * f2fs_inherit_context() - Sets a child context from its parent + * @parent: Parent inode from which the context is inherited. + * @child: Child inode that inherits the context from @parent. 
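+ * @ipage: Inode page of @child, passed straight through to f2fs_setxattr().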
+ * + * Return: Zero on success, non-zero otherwise + */ +int f2fs_inherit_context(struct inode *parent, struct inode *child, + struct page *ipage) +{ + struct f2fs_encryption_context ctx; + struct f2fs_crypt_info *ci; + int res; + + res = f2fs_get_encryption_info(parent); + if (res < 0) + return res; + + ci = F2FS_I(parent)->i_crypt_info; + BUG_ON(ci == NULL); + + ctx.format = F2FS_ENCRYPTION_CONTEXT_FORMAT_V1; + + ctx.contents_encryption_mode = ci->ci_data_mode; + ctx.filenames_encryption_mode = ci->ci_filename_mode; + ctx.flags = ci->ci_flags; + memcpy(ctx.master_key_descriptor, ci->ci_master_key, + F2FS_KEY_DESCRIPTOR_SIZE); + + get_random_bytes(ctx.nonce, F2FS_KEY_DERIVATION_NONCE_SIZE); + return f2fs_setxattr(child, F2FS_XATTR_INDEX_ENCRYPTION, + F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, + sizeof(ctx), ipage, XATTR_CREATE); +} diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 985ed023a750..f71e19a9dd3c 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -12,12 +12,13 @@ #include <linux/f2fs_fs.h> #include <linux/buffer_head.h> #include <linux/mpage.h> -#include <linux/aio.h> #include <linux/writeback.h> #include <linux/backing-dev.h> #include <linux/blkdev.h> #include <linux/bio.h> #include <linux/prefetch.h> +#include <linux/uio.h> +#include <linux/cleancache.h> #include "f2fs.h" #include "node.h" @@ -25,11 +26,23 @@ #include "trace.h" #include <trace/events/f2fs.h> +static struct kmem_cache *extent_tree_slab; +static struct kmem_cache *extent_node_slab; + static void f2fs_read_end_io(struct bio *bio, int err) { struct bio_vec *bvec; int i; + if (f2fs_bio_encrypted(bio)) { + if (err) { + f2fs_release_crypto_ctx(bio->bi_private); + } else { + f2fs_end_io_crypto_work(bio->bi_private, bio); + return; + } + } + bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; @@ -53,6 +66,8 @@ static void f2fs_write_end_io(struct bio *bio, int err) bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; + f2fs_restore_and_release_control_page(&page); + if (unlikely(err)) { set_page_dirty(page); set_bit(AS_EIO, &page->mapping->flags); @@ -83,7 +98,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, bio->bi_bdev = sbi->sb->s_bdev; bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr); bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io; - bio->bi_private = sbi; + bio->bi_private = is_read ? NULL : sbi; return bio; } @@ -130,16 +145,16 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, * Fill the locked page with data located in the block address. * Return unlocked page. */ -int f2fs_submit_page_bio(struct f2fs_sb_info *sbi, struct page *page, - struct f2fs_io_info *fio) +int f2fs_submit_page_bio(struct f2fs_io_info *fio) { struct bio *bio; + struct page *page = fio->encrypted_page ? 
fio->encrypted_page : fio->page; trace_f2fs_submit_page_bio(page, fio); - f2fs_trace_ios(page, fio, 0); + f2fs_trace_ios(fio, 0); /* Allocate a new bio */ - bio = __bio_alloc(sbi, fio->blk_addr, 1, is_read_io(fio->rw)); + bio = __bio_alloc(fio->sbi, fio->blk_addr, 1, is_read_io(fio->rw)); if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { bio_put(bio); @@ -151,12 +166,13 @@ int f2fs_submit_page_bio(struct f2fs_sb_info *sbi, struct page *page, return 0; } -void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page, - struct f2fs_io_info *fio) +void f2fs_submit_page_mbio(struct f2fs_io_info *fio) { + struct f2fs_sb_info *sbi = fio->sbi; enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); struct f2fs_bio_info *io; bool is_read = is_read_io(fio->rw); + struct page *bio_page; io = is_read ? &sbi->read_io : &sbi->write_io[btype]; @@ -178,17 +194,19 @@ alloc_new: io->fio = *fio; } - if (bio_add_page(io->bio, page, PAGE_CACHE_SIZE, 0) < + bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page; + + if (bio_add_page(io->bio, bio_page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { __submit_merged_bio(io); goto alloc_new; } io->last_block_in_bio = fio->blk_addr; - f2fs_trace_ios(page, fio, 0); + f2fs_trace_ios(fio, 0); up_write(&io->io_rwsem); - trace_f2fs_submit_page_mbio(page, fio); + trace_f2fs_submit_page_mbio(fio->page, fio); } /* @@ -197,7 +215,7 @@ alloc_new: * ->node_page * update block addresses in the node page */ -static void __set_data_blkaddr(struct dnode_of_data *dn) +void set_data_blkaddr(struct dnode_of_data *dn) { struct f2fs_node *rn; __le32 *addr_array; @@ -226,7 +244,7 @@ int reserve_new_block(struct dnode_of_data *dn) trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node); dn->data_blkaddr = NEW_ADDR; - __set_data_blkaddr(dn); + set_data_blkaddr(dn); mark_inode_dirty(dn->inode); sync_inode_page(dn); return 0; @@ -248,73 +266,49 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) return err; } -static int check_extent_cache(struct inode *inode, pgoff_t pgofs, - struct buffer_head *bh_result) +static bool lookup_extent_info(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei) { struct f2fs_inode_info *fi = F2FS_I(inode); pgoff_t start_fofs, end_fofs; block_t start_blkaddr; - if (is_inode_flag_set(fi, FI_NO_EXTENT)) - return 0; - - read_lock(&fi->ext.ext_lock); + read_lock(&fi->ext_lock); if (fi->ext.len == 0) { - read_unlock(&fi->ext.ext_lock); - return 0; + read_unlock(&fi->ext_lock); + return false; } stat_inc_total_hit(inode->i_sb); start_fofs = fi->ext.fofs; end_fofs = fi->ext.fofs + fi->ext.len - 1; - start_blkaddr = fi->ext.blk_addr; + start_blkaddr = fi->ext.blk; if (pgofs >= start_fofs && pgofs <= end_fofs) { - unsigned int blkbits = inode->i_sb->s_blocksize_bits; - size_t count; - - set_buffer_new(bh_result); - map_bh(bh_result, inode->i_sb, - start_blkaddr + pgofs - start_fofs); - count = end_fofs - pgofs + 1; - if (count < (UINT_MAX >> blkbits)) - bh_result->b_size = (count << blkbits); - else - bh_result->b_size = UINT_MAX; - + *ei = fi->ext; stat_inc_read_hit(inode->i_sb); - read_unlock(&fi->ext.ext_lock); - return 1; + read_unlock(&fi->ext_lock); + return true; } - read_unlock(&fi->ext.ext_lock); - return 0; + read_unlock(&fi->ext_lock); + return false; } -void update_extent_cache(struct dnode_of_data *dn) +static bool update_extent_info(struct inode *inode, pgoff_t fofs, + block_t blkaddr) { - struct f2fs_inode_info *fi = F2FS_I(dn->inode); - pgoff_t fofs, start_fofs, end_fofs; + struct 
f2fs_inode_info *fi = F2FS_I(inode); + pgoff_t start_fofs, end_fofs; block_t start_blkaddr, end_blkaddr; int need_update = true; - f2fs_bug_on(F2FS_I_SB(dn->inode), dn->data_blkaddr == NEW_ADDR); - - /* Update the page address in the parent node */ - __set_data_blkaddr(dn); - - if (is_inode_flag_set(fi, FI_NO_EXTENT)) - return; - - fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + - dn->ofs_in_node; - - write_lock(&fi->ext.ext_lock); + write_lock(&fi->ext_lock); start_fofs = fi->ext.fofs; end_fofs = fi->ext.fofs + fi->ext.len - 1; - start_blkaddr = fi->ext.blk_addr; - end_blkaddr = fi->ext.blk_addr + fi->ext.len - 1; + start_blkaddr = fi->ext.blk; + end_blkaddr = fi->ext.blk + fi->ext.len - 1; /* Drop and initialize the matched extent */ if (fi->ext.len == 1 && fofs == start_fofs) @@ -322,24 +316,24 @@ void update_extent_cache(struct dnode_of_data *dn) /* Initial extent */ if (fi->ext.len == 0) { - if (dn->data_blkaddr != NULL_ADDR) { + if (blkaddr != NULL_ADDR) { fi->ext.fofs = fofs; - fi->ext.blk_addr = dn->data_blkaddr; + fi->ext.blk = blkaddr; fi->ext.len = 1; } goto end_update; } /* Front merge */ - if (fofs == start_fofs - 1 && dn->data_blkaddr == start_blkaddr - 1) { + if (fofs == start_fofs - 1 && blkaddr == start_blkaddr - 1) { fi->ext.fofs--; - fi->ext.blk_addr--; + fi->ext.blk--; fi->ext.len++; goto end_update; } /* Back merge */ - if (fofs == end_fofs + 1 && dn->data_blkaddr == end_blkaddr + 1) { + if (fofs == end_fofs + 1 && blkaddr == end_blkaddr + 1) { fi->ext.len++; goto end_update; } @@ -351,8 +345,7 @@ void update_extent_cache(struct dnode_of_data *dn) fi->ext.len = fofs - start_fofs; } else { fi->ext.fofs = fofs + 1; - fi->ext.blk_addr = start_blkaddr + - fofs - start_fofs + 1; + fi->ext.blk = start_blkaddr + fofs - start_fofs + 1; fi->ext.len -= fofs - start_fofs + 1; } } else { @@ -366,85 +359,580 @@ void update_extent_cache(struct dnode_of_data *dn) need_update = true; } end_update: - write_unlock(&fi->ext.ext_lock); - if (need_update) - sync_inode_page(dn); - return; + write_unlock(&fi->ext_lock); + return need_update; } -struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync) +static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_info *ei, + struct rb_node *parent, struct rb_node **p) { - struct address_space *mapping = inode->i_mapping; - struct dnode_of_data dn; - struct page *page; - int err; - struct f2fs_io_info fio = { - .type = DATA, - .rw = sync ? 
READ_SYNC : READA, - }; + struct extent_node *en; - page = find_get_page(mapping, index); - if (page && PageUptodate(page)) - return page; - f2fs_put_page(page, 0); + en = kmem_cache_alloc(extent_node_slab, GFP_ATOMIC); + if (!en) + return NULL; - set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, index, LOOKUP_NODE); - if (err) - return ERR_PTR(err); - f2fs_put_dnode(&dn); + en->ei = *ei; + INIT_LIST_HEAD(&en->list); - if (dn.data_blkaddr == NULL_ADDR) - return ERR_PTR(-ENOENT); + rb_link_node(&en->rb_node, parent, p); + rb_insert_color(&en->rb_node, &et->root); + et->count++; + atomic_inc(&sbi->total_ext_node); + return en; +} - /* By fallocate(), there is no cached page, but with NEW_ADDR */ - if (unlikely(dn.data_blkaddr == NEW_ADDR)) - return ERR_PTR(-EINVAL); +static void __detach_extent_node(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_node *en) +{ + rb_erase(&en->rb_node, &et->root); + et->count--; + atomic_dec(&sbi->total_ext_node); - page = grab_cache_page(mapping, index); - if (!page) - return ERR_PTR(-ENOMEM); + if (et->cached_en == en) + et->cached_en = NULL; +} - if (PageUptodate(page)) { - unlock_page(page); - return page; +static struct extent_tree *__find_extent_tree(struct f2fs_sb_info *sbi, + nid_t ino) +{ + struct extent_tree *et; + + down_read(&sbi->extent_tree_lock); + et = radix_tree_lookup(&sbi->extent_tree_root, ino); + if (!et) { + up_read(&sbi->extent_tree_lock); + return NULL; } + atomic_inc(&et->refcount); + up_read(&sbi->extent_tree_lock); - fio.blk_addr = dn.data_blkaddr; - err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, &fio); - if (err) - return ERR_PTR(err); + return et; +} - if (sync) { - wait_on_page_locked(page); - if (unlikely(!PageUptodate(page))) { - f2fs_put_page(page, 0); - return ERR_PTR(-EIO); +static struct extent_tree *__grab_extent_tree(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et; + nid_t ino = inode->i_ino; + + down_write(&sbi->extent_tree_lock); + et = radix_tree_lookup(&sbi->extent_tree_root, ino); + if (!et) { + et = f2fs_kmem_cache_alloc(extent_tree_slab, GFP_NOFS); + f2fs_radix_tree_insert(&sbi->extent_tree_root, ino, et); + memset(et, 0, sizeof(struct extent_tree)); + et->ino = ino; + et->root = RB_ROOT; + et->cached_en = NULL; + rwlock_init(&et->lock); + atomic_set(&et->refcount, 0); + et->count = 0; + sbi->total_ext_tree++; + } + atomic_inc(&et->refcount); + up_write(&sbi->extent_tree_lock); + + return et; +} + +static struct extent_node *__lookup_extent_tree(struct extent_tree *et, + unsigned int fofs) +{ + struct rb_node *node = et->root.rb_node; + struct extent_node *en; + + if (et->cached_en) { + struct extent_info *cei = &et->cached_en->ei; + + if (cei->fofs <= fofs && cei->fofs + cei->len > fofs) + return et->cached_en; + } + + while (node) { + en = rb_entry(node, struct extent_node, rb_node); + + if (fofs < en->ei.fofs) { + node = node->rb_left; + } else if (fofs >= en->ei.fofs + en->ei.len) { + node = node->rb_right; + } else { + et->cached_en = en; + return en; } } - return page; + return NULL; } -/* - * If it tries to access a hole, return an error. - * Because, the callers, functions in dir.c and GC, should be able to know - * whether this page exists or not. 
- */ -struct page *get_lock_data_page(struct inode *inode, pgoff_t index) +static struct extent_node *__try_back_merge(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_node *en) +{ + struct extent_node *prev; + struct rb_node *node; + + node = rb_prev(&en->rb_node); + if (!node) + return NULL; + + prev = rb_entry(node, struct extent_node, rb_node); + if (__is_back_mergeable(&en->ei, &prev->ei)) { + en->ei.fofs = prev->ei.fofs; + en->ei.blk = prev->ei.blk; + en->ei.len += prev->ei.len; + __detach_extent_node(sbi, et, prev); + return prev; + } + return NULL; +} + +static struct extent_node *__try_front_merge(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_node *en) +{ + struct extent_node *next; + struct rb_node *node; + + node = rb_next(&en->rb_node); + if (!node) + return NULL; + + next = rb_entry(node, struct extent_node, rb_node); + if (__is_front_mergeable(&en->ei, &next->ei)) { + en->ei.len += next->ei.len; + __detach_extent_node(sbi, et, next); + return next; + } + return NULL; +} + +static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_info *ei, + struct extent_node **den) +{ + struct rb_node **p = &et->root.rb_node; + struct rb_node *parent = NULL; + struct extent_node *en; + + while (*p) { + parent = *p; + en = rb_entry(parent, struct extent_node, rb_node); + + if (ei->fofs < en->ei.fofs) { + if (__is_front_mergeable(ei, &en->ei)) { + f2fs_bug_on(sbi, !den); + en->ei.fofs = ei->fofs; + en->ei.blk = ei->blk; + en->ei.len += ei->len; + *den = __try_back_merge(sbi, et, en); + return en; + } + p = &(*p)->rb_left; + } else if (ei->fofs >= en->ei.fofs + en->ei.len) { + if (__is_back_mergeable(ei, &en->ei)) { + f2fs_bug_on(sbi, !den); + en->ei.len += ei->len; + *den = __try_front_merge(sbi, et, en); + return en; + } + p = &(*p)->rb_right; + } else { + f2fs_bug_on(sbi, 1); + } + } + + return __attach_extent_node(sbi, et, ei, parent, p); +} + +static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi, + struct extent_tree *et, bool free_all) +{ + struct rb_node *node, *next; + struct extent_node *en; + unsigned int count = et->count; + + node = rb_first(&et->root); + while (node) { + next = rb_next(node); + en = rb_entry(node, struct extent_node, rb_node); + + if (free_all) { + spin_lock(&sbi->extent_lock); + if (!list_empty(&en->list)) + list_del_init(&en->list); + spin_unlock(&sbi->extent_lock); + } + + if (free_all || list_empty(&en->list)) { + __detach_extent_node(sbi, et, en); + kmem_cache_free(extent_node_slab, en); + } + node = next; + } + + return count - et->count; +} + +static void f2fs_init_extent_tree(struct inode *inode, + struct f2fs_extent *i_ext) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et; + struct extent_node *en; + struct extent_info ei; + + if (le32_to_cpu(i_ext->len) < F2FS_MIN_EXTENT_LEN) + return; + + et = __grab_extent_tree(inode); + + write_lock(&et->lock); + if (et->count) + goto out; + + set_extent_info(&ei, le32_to_cpu(i_ext->fofs), + le32_to_cpu(i_ext->blk), le32_to_cpu(i_ext->len)); + + en = __insert_extent_tree(sbi, et, &ei, NULL); + if (en) { + et->cached_en = en; + + spin_lock(&sbi->extent_lock); + list_add_tail(&en->list, &sbi->extent_list); + spin_unlock(&sbi->extent_lock); + } +out: + write_unlock(&et->lock); + atomic_dec(&et->refcount); +} + +static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et; + struct 
extent_node *en; + + trace_f2fs_lookup_extent_tree_start(inode, pgofs); + + et = __find_extent_tree(sbi, inode->i_ino); + if (!et) + return false; + + read_lock(&et->lock); + en = __lookup_extent_tree(et, pgofs); + if (en) { + *ei = en->ei; + spin_lock(&sbi->extent_lock); + if (!list_empty(&en->list)) + list_move_tail(&en->list, &sbi->extent_list); + spin_unlock(&sbi->extent_lock); + stat_inc_read_hit(sbi->sb); + } + stat_inc_total_hit(sbi->sb); + read_unlock(&et->lock); + + trace_f2fs_lookup_extent_tree_end(inode, pgofs, en); + + atomic_dec(&et->refcount); + return en ? true : false; +} + +static void f2fs_update_extent_tree(struct inode *inode, pgoff_t fofs, + block_t blkaddr) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et; + struct extent_node *en = NULL, *en1 = NULL, *en2 = NULL, *en3 = NULL; + struct extent_node *den = NULL; + struct extent_info ei, dei; + unsigned int endofs; + + trace_f2fs_update_extent_tree(inode, fofs, blkaddr); + + et = __grab_extent_tree(inode); + + write_lock(&et->lock); + + /* 1. lookup and remove existing extent info in cache */ + en = __lookup_extent_tree(et, fofs); + if (!en) + goto update_extent; + + dei = en->ei; + __detach_extent_node(sbi, et, en); + + /* 2. if extent can be split more, split and insert the left part */ + if (dei.len > 1) { + /* insert left part of split extent into cache */ + if (fofs - dei.fofs >= F2FS_MIN_EXTENT_LEN) { + set_extent_info(&ei, dei.fofs, dei.blk, + fofs - dei.fofs); + en1 = __insert_extent_tree(sbi, et, &ei, NULL); + } + + /* insert right part of split extent into cache */ + endofs = dei.fofs + dei.len - 1; + if (endofs - fofs >= F2FS_MIN_EXTENT_LEN) { + set_extent_info(&ei, fofs + 1, + fofs - dei.fofs + dei.blk, endofs - fofs); + en2 = __insert_extent_tree(sbi, et, &ei, NULL); + } + } + +update_extent: + /* 3. update extent in extent cache */ + if (blkaddr) { + set_extent_info(&ei, fofs, blkaddr, 1); + en3 = __insert_extent_tree(sbi, et, &ei, &den); + } + + /* 4. update in global extent list */ + spin_lock(&sbi->extent_lock); + if (en && !list_empty(&en->list)) + list_del(&en->list); + /* + * en1 and en2 split from en, they will become more and more smaller + * fragments after splitting several times. So if the length is smaller + * than F2FS_MIN_EXTENT_LEN, we will not add them into extent tree. + */ + if (en1) + list_add_tail(&en1->list, &sbi->extent_list); + if (en2) + list_add_tail(&en2->list, &sbi->extent_list); + if (en3) { + if (list_empty(&en3->list)) + list_add_tail(&en3->list, &sbi->extent_list); + else + list_move_tail(&en3->list, &sbi->extent_list); + } + if (den && !list_empty(&den->list)) + list_del(&den->list); + spin_unlock(&sbi->extent_lock); + + /* 5. 
release extent node */ + if (en) + kmem_cache_free(extent_node_slab, en); + if (den) + kmem_cache_free(extent_node_slab, den); + + write_unlock(&et->lock); + atomic_dec(&et->refcount); +} + +void f2fs_preserve_extent_tree(struct inode *inode) +{ + struct extent_tree *et; + struct extent_info *ext = &F2FS_I(inode)->ext; + bool sync = false; + + if (!test_opt(F2FS_I_SB(inode), EXTENT_CACHE)) + return; + + et = __find_extent_tree(F2FS_I_SB(inode), inode->i_ino); + if (!et) { + if (ext->len) { + ext->len = 0; + update_inode_page(inode); + } + return; + } + + read_lock(&et->lock); + if (et->count) { + struct extent_node *en; + + if (et->cached_en) { + en = et->cached_en; + } else { + struct rb_node *node = rb_first(&et->root); + + if (!node) + node = rb_last(&et->root); + en = rb_entry(node, struct extent_node, rb_node); + } + + if (__is_extent_same(ext, &en->ei)) + goto out; + + *ext = en->ei; + sync = true; + } else if (ext->len) { + ext->len = 0; + sync = true; + } +out: + read_unlock(&et->lock); + atomic_dec(&et->refcount); + + if (sync) + update_inode_page(inode); +} + +void f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) +{ + struct extent_tree *treevec[EXT_TREE_VEC_SIZE]; + struct extent_node *en, *tmp; + unsigned long ino = F2FS_ROOT_INO(sbi); + struct radix_tree_iter iter; + void **slot; + unsigned int found; + unsigned int node_cnt = 0, tree_cnt = 0; + + if (!test_opt(sbi, EXTENT_CACHE)) + return; + + if (available_free_memory(sbi, EXTENT_CACHE)) + return; + + spin_lock(&sbi->extent_lock); + list_for_each_entry_safe(en, tmp, &sbi->extent_list, list) { + if (!nr_shrink--) + break; + list_del_init(&en->list); + } + spin_unlock(&sbi->extent_lock); + + down_read(&sbi->extent_tree_lock); + while ((found = radix_tree_gang_lookup(&sbi->extent_tree_root, + (void **)treevec, ino, EXT_TREE_VEC_SIZE))) { + unsigned i; + + ino = treevec[found - 1]->ino + 1; + for (i = 0; i < found; i++) { + struct extent_tree *et = treevec[i]; + + atomic_inc(&et->refcount); + write_lock(&et->lock); + node_cnt += __free_extent_tree(sbi, et, false); + write_unlock(&et->lock); + atomic_dec(&et->refcount); + } + } + up_read(&sbi->extent_tree_lock); + + down_write(&sbi->extent_tree_lock); + radix_tree_for_each_slot(slot, &sbi->extent_tree_root, &iter, + F2FS_ROOT_INO(sbi)) { + struct extent_tree *et = (struct extent_tree *)*slot; + + if (!atomic_read(&et->refcount) && !et->count) { + radix_tree_delete(&sbi->extent_tree_root, et->ino); + kmem_cache_free(extent_tree_slab, et); + sbi->total_ext_tree--; + tree_cnt++; + } + } + up_write(&sbi->extent_tree_lock); + + trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt); +} + +void f2fs_destroy_extent_tree(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et; + unsigned int node_cnt = 0; + + if (!test_opt(sbi, EXTENT_CACHE)) + return; + + et = __find_extent_tree(sbi, inode->i_ino); + if (!et) + goto out; + + /* free all extent info belong to this extent tree */ + write_lock(&et->lock); + node_cnt = __free_extent_tree(sbi, et, true); + write_unlock(&et->lock); + + atomic_dec(&et->refcount); + + /* try to find and delete extent tree entry in radix tree */ + down_write(&sbi->extent_tree_lock); + et = radix_tree_lookup(&sbi->extent_tree_root, inode->i_ino); + if (!et) { + up_write(&sbi->extent_tree_lock); + goto out; + } + f2fs_bug_on(sbi, atomic_read(&et->refcount) || et->count); + radix_tree_delete(&sbi->extent_tree_root, inode->i_ino); + kmem_cache_free(extent_tree_slab, et); + sbi->total_ext_tree--; + 
up_write(&sbi->extent_tree_lock); +out: + trace_f2fs_destroy_extent_tree(inode, node_cnt); + return; +} + +void f2fs_init_extent_cache(struct inode *inode, struct f2fs_extent *i_ext) +{ + if (test_opt(F2FS_I_SB(inode), EXTENT_CACHE)) + f2fs_init_extent_tree(inode, i_ext); + + write_lock(&F2FS_I(inode)->ext_lock); + get_extent_info(&F2FS_I(inode)->ext, *i_ext); + write_unlock(&F2FS_I(inode)->ext_lock); +} + +static bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei) +{ + if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) + return false; + + if (test_opt(F2FS_I_SB(inode), EXTENT_CACHE)) + return f2fs_lookup_extent_tree(inode, pgofs, ei); + + return lookup_extent_info(inode, pgofs, ei); +} + +void f2fs_update_extent_cache(struct dnode_of_data *dn) +{ + struct f2fs_inode_info *fi = F2FS_I(dn->inode); + pgoff_t fofs; + + f2fs_bug_on(F2FS_I_SB(dn->inode), dn->data_blkaddr == NEW_ADDR); + + if (is_inode_flag_set(fi, FI_NO_EXTENT)) + return; + + fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + + dn->ofs_in_node; + + if (test_opt(F2FS_I_SB(dn->inode), EXTENT_CACHE)) + return f2fs_update_extent_tree(dn->inode, fofs, + dn->data_blkaddr); + + if (update_extent_info(dn->inode, fofs, dn->data_blkaddr)) + sync_inode_page(dn); +} + +struct page *get_read_data_page(struct inode *inode, pgoff_t index, int rw) { struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; struct page *page; + struct extent_info ei; int err; struct f2fs_io_info fio = { + .sbi = F2FS_I_SB(inode), .type = DATA, - .rw = READ_SYNC, + .rw = rw, + .encrypted_page = NULL, }; -repeat: + + if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) + return read_mapping_page(mapping, index, NULL); + page = grab_cache_page(mapping, index); if (!page) return ERR_PTR(-ENOMEM); + if (f2fs_lookup_extent_cache(inode, index, &ei)) { + dn.data_blkaddr = ei.blk + index - ei.fofs; + goto got_it; + } + set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, index, LOOKUP_NODE); if (err) { @@ -457,9 +945,11 @@ repeat: f2fs_put_page(page, 1); return ERR_PTR(-ENOENT); } - - if (PageUptodate(page)) +got_it: + if (PageUptodate(page)) { + unlock_page(page); return page; + } /* * A new dentry page is allocated but not able to be written, since its @@ -470,14 +960,58 @@ repeat: if (dn.data_blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_CACHE_SIZE); SetPageUptodate(page); + unlock_page(page); return page; } fio.blk_addr = dn.data_blkaddr; - err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, &fio); + fio.page = page; + err = f2fs_submit_page_bio(&fio); if (err) return ERR_PTR(err); + return page; +} +struct page *find_data_page(struct inode *inode, pgoff_t index) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page; + + page = find_get_page(mapping, index); + if (page && PageUptodate(page)) + return page; + f2fs_put_page(page, 0); + + page = get_read_data_page(inode, index, READ_SYNC); + if (IS_ERR(page)) + return page; + + if (PageUptodate(page)) + return page; + + wait_on_page_locked(page); + if (unlikely(!PageUptodate(page))) { + f2fs_put_page(page, 0); + return ERR_PTR(-EIO); + } + return page; +} + +/* + * If it tries to access a hole, return an error. + * Because, the callers, functions in dir.c and GC, should be able to know + * whether this page exists or not. 
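+ *
+ * A "hole" here is a block whose address is still NULL_ADDR, for which
+ * get_read_data_page() hands back ERR_PTR(-ENOENT) rather than a
+ * zero-filled page.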
+ */ +struct page *get_lock_data_page(struct inode *inode, pgoff_t index) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page; +repeat: + page = get_read_data_page(inode, index, READ_SYNC); + if (IS_ERR(page)) + return page; + + /* wait for read completion */ lock_page(page); if (unlikely(!PageUptodate(page))) { f2fs_put_page(page, 1); @@ -505,46 +1039,37 @@ struct page *get_new_data_page(struct inode *inode, struct page *page; struct dnode_of_data dn; int err; +repeat: + page = grab_cache_page(mapping, index); + if (!page) + return ERR_PTR(-ENOMEM); set_new_dnode(&dn, inode, ipage, NULL, 0); err = f2fs_reserve_block(&dn, index); - if (err) + if (err) { + f2fs_put_page(page, 1); return ERR_PTR(err); -repeat: - page = grab_cache_page(mapping, index); - if (!page) { - err = -ENOMEM; - goto put_err; } + if (!ipage) + f2fs_put_dnode(&dn); if (PageUptodate(page)) - return page; + goto got_it; if (dn.data_blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_CACHE_SIZE); SetPageUptodate(page); } else { - struct f2fs_io_info fio = { - .type = DATA, - .rw = READ_SYNC, - .blk_addr = dn.data_blkaddr, - }; - err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, &fio); - if (err) - goto put_err; + f2fs_put_page(page, 1); - lock_page(page); - if (unlikely(!PageUptodate(page))) { - f2fs_put_page(page, 1); - err = -EIO; - goto put_err; - } - if (unlikely(page->mapping != mapping)) { - f2fs_put_page(page, 1); + page = get_read_data_page(inode, index, READ_SYNC); + if (IS_ERR(page)) goto repeat; - } - } + /* wait for read completion */ + lock_page(page); + } +got_it: if (new_i_size && i_size_read(inode) < ((index + 1) << PAGE_CACHE_SHIFT)) { i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT)); @@ -552,10 +1077,6 @@ repeat: set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR); } return page; - -put_err: - f2fs_put_dnode(&dn); - return ERR_PTR(err); } static int __allocate_data_block(struct dnode_of_data *dn) @@ -569,19 +1090,26 @@ static int __allocate_data_block(struct dnode_of_data *dn) if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) return -EPERM; + + dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node); + if (dn->data_blkaddr == NEW_ADDR) + goto alloc; + if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1))) return -ENOSPC; +alloc: get_node_info(sbi, dn->nid, &ni); set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); if (dn->ofs_in_node == 0 && dn->inode_page == dn->node_page) seg = CURSEG_DIRECT_IO; - allocate_data_block(sbi, NULL, NULL_ADDR, &dn->data_blkaddr, &sum, seg); + allocate_data_block(sbi, NULL, dn->data_blkaddr, &dn->data_blkaddr, + &sum, seg); /* direct IO doesn't use extent cache to maximize the performance */ - __set_data_blkaddr(dn); + set_data_blkaddr(dn); /* update i_size */ fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + @@ -615,7 +1143,10 @@ static void __allocate_data_blocks(struct inode *inode, loff_t offset, end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); while (dn.ofs_in_node < end_offset && len) { - if (dn.data_blkaddr == NULL_ADDR) { + block_t blkaddr; + + blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + if (blkaddr == NULL_ADDR || blkaddr == NEW_ADDR) { if (__allocate_data_block(&dn)) goto sync_out; allocated = true; @@ -643,29 +1174,37 @@ out: } /* - * get_data_block() now supported readahead/bmap/rw direct_IO with mapped bh. + * f2fs_map_blocks() now supported readahead/bmap/rw direct_IO with + * f2fs_map_blocks structure. * If original data blocks are allocated, then give them to blockdev. 
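 * (that is, the already-mapped block addresses are reported back as-is).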
* Otherwise, * a. preallocate requested block addresses * b. do not use extent cache for better performance * c. give the block addresses to blockdev */ -static int __get_data_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create, bool fiemap) +static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, + int create, bool fiemap) { - unsigned int blkbits = inode->i_sb->s_blocksize_bits; - unsigned maxblocks = bh_result->b_size >> blkbits; + unsigned int maxblocks = map->m_len; struct dnode_of_data dn; int mode = create ? ALLOC_NODE : LOOKUP_NODE_RA; pgoff_t pgofs, end_offset; int err = 0, ofs = 1; + struct extent_info ei; bool allocated = false; - /* Get the page offset from the block offset(iblock) */ - pgofs = (pgoff_t)(iblock >> (PAGE_CACHE_SHIFT - blkbits)); + map->m_len = 0; + map->m_flags = 0; + + /* it only supports block size == page size */ + pgofs = (pgoff_t)map->m_lblk; - if (check_extent_cache(inode, pgofs, bh_result)) + if (f2fs_lookup_extent_cache(inode, pgofs, &ei)) { + map->m_pblk = ei.blk + pgofs - ei.fofs; + map->m_len = min((pgoff_t)maxblocks, ei.fofs + ei.len - pgofs); + map->m_flags = F2FS_MAP_MAPPED; goto out; + } if (create) f2fs_lock_op(F2FS_I_SB(inode)); @@ -682,21 +1221,23 @@ static int __get_data_block(struct inode *inode, sector_t iblock, goto put_out; if (dn.data_blkaddr != NULL_ADDR) { - set_buffer_new(bh_result); - map_bh(bh_result, inode->i_sb, dn.data_blkaddr); + map->m_flags = F2FS_MAP_MAPPED; + map->m_pblk = dn.data_blkaddr; + if (dn.data_blkaddr == NEW_ADDR) + map->m_flags |= F2FS_MAP_UNWRITTEN; } else if (create) { err = __allocate_data_block(&dn); if (err) goto put_out; allocated = true; - set_buffer_new(bh_result); - map_bh(bh_result, inode->i_sb, dn.data_blkaddr); + map->m_flags = F2FS_MAP_NEW | F2FS_MAP_MAPPED; + map->m_pblk = dn.data_blkaddr; } else { goto put_out; } end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); - bh_result->b_size = (((size_t)1) << blkbits); + map->m_len = 1; dn.ofs_in_node++; pgofs++; @@ -720,21 +1261,25 @@ get_next: end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); } - if (maxblocks > (bh_result->b_size >> blkbits)) { + if (maxblocks > map->m_len) { block_t blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); if (blkaddr == NULL_ADDR && create) { err = __allocate_data_block(&dn); if (err) goto sync_out; allocated = true; + map->m_flags |= F2FS_MAP_NEW; blkaddr = dn.data_blkaddr; } /* Give more consecutive addresses for the readahead */ - if (blkaddr == (bh_result->b_blocknr + ofs)) { + if ((map->m_pblk != NEW_ADDR && + blkaddr == (map->m_pblk + ofs)) || + (map->m_pblk == NEW_ADDR && + blkaddr == NEW_ADDR)) { ofs++; dn.ofs_in_node++; pgofs++; - bh_result->b_size += (((size_t)1) << blkbits); + map->m_len++; goto get_next; } } @@ -747,10 +1292,28 @@ unlock_out: if (create) f2fs_unlock_op(F2FS_I_SB(inode)); out: - trace_f2fs_get_data_block(inode, iblock, bh_result, err); + trace_f2fs_map_blocks(inode, map, err); return err; } +static int __get_data_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create, bool fiemap) +{ + struct f2fs_map_blocks map; + int ret; + + map.m_lblk = iblock; + map.m_len = bh->b_size >> inode->i_blkbits; + + ret = f2fs_map_blocks(inode, &map, create, fiemap); + if (!ret) { + map_bh(bh, inode->i_sb, map.m_pblk); + bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags; + bh->b_size = map.m_len << inode->i_blkbits; + } + return ret; +} + static int get_data_block(struct inode *inode, sector_t iblock, struct 
buffer_head *bh_result, int create) { @@ -763,11 +1326,268 @@ static int get_data_block_fiemap(struct inode *inode, sector_t iblock, return __get_data_block(inode, iblock, bh_result, create, true); } +static inline sector_t logical_to_blk(struct inode *inode, loff_t offset) +{ + return (offset >> inode->i_blkbits); +} + +static inline loff_t blk_to_logical(struct inode *inode, sector_t blk) +{ + return (blk << inode->i_blkbits); +} + int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { - return generic_block_fiemap(inode, fieinfo, - start, len, get_data_block_fiemap); + struct buffer_head map_bh; + sector_t start_blk, last_blk; + loff_t isize = i_size_read(inode); + u64 logical = 0, phys = 0, size = 0; + u32 flags = 0; + bool past_eof = false, whole_file = false; + int ret = 0; + + ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); + if (ret) + return ret; + + mutex_lock(&inode->i_mutex); + + if (len >= isize) { + whole_file = true; + len = isize; + } + + if (logical_to_blk(inode, len) == 0) + len = blk_to_logical(inode, 1); + + start_blk = logical_to_blk(inode, start); + last_blk = logical_to_blk(inode, start + len - 1); +next: + memset(&map_bh, 0, sizeof(struct buffer_head)); + map_bh.b_size = len; + + ret = get_data_block_fiemap(inode, start_blk, &map_bh, 0); + if (ret) + goto out; + + /* HOLE */ + if (!buffer_mapped(&map_bh)) { + start_blk++; + + if (!past_eof && blk_to_logical(inode, start_blk) >= isize) + past_eof = 1; + + if (past_eof && size) { + flags |= FIEMAP_EXTENT_LAST; + ret = fiemap_fill_next_extent(fieinfo, logical, + phys, size, flags); + } else if (size) { + ret = fiemap_fill_next_extent(fieinfo, logical, + phys, size, flags); + size = 0; + } + + /* if we have holes up to/past EOF then we're done */ + if (start_blk > last_blk || past_eof || ret) + goto out; + } else { + if (start_blk > last_blk && !whole_file) { + ret = fiemap_fill_next_extent(fieinfo, logical, + phys, size, flags); + goto out; + } + + /* + * if size != 0 then we know we already have an extent + * to add, so add it. + */ + if (size) { + ret = fiemap_fill_next_extent(fieinfo, logical, + phys, size, flags); + if (ret) + goto out; + } + + logical = blk_to_logical(inode, start_blk); + phys = blk_to_logical(inode, map_bh.b_blocknr); + size = map_bh.b_size; + flags = 0; + if (buffer_unwritten(&map_bh)) + flags = FIEMAP_EXTENT_UNWRITTEN; + + start_blk += logical_to_blk(inode, size); + + /* + * If we are past the EOF, then we need to make sure as + * soon as we find a hole that the last extent we found + * is marked with FIEMAP_EXTENT_LAST + */ + if (!past_eof && logical + size >= isize) + past_eof = true; + } + cond_resched(); + if (fatal_signal_pending(current)) + ret = -EINTR; + else + goto next; +out: + if (ret == 1) + ret = 0; + + mutex_unlock(&inode->i_mutex); + return ret; +} + +/* + * This function was originally taken from fs/mpage.c, and customized for f2fs. + * Major change was from block_size == page_size in f2fs by default. 
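+ *
+ * Since every block is one page, a single f2fs_map_blocks() result can
+ * be reused for the pages that follow, as long as they still fall
+ * inside the mapped run [m_lblk, m_lblk + m_len).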
+ */ +static int f2fs_mpage_readpages(struct address_space *mapping, + struct list_head *pages, struct page *page, + unsigned nr_pages) +{ + struct bio *bio = NULL; + unsigned page_idx; + sector_t last_block_in_bio = 0; + struct inode *inode = mapping->host; + const unsigned blkbits = inode->i_blkbits; + const unsigned blocksize = 1 << blkbits; + sector_t block_in_file; + sector_t last_block; + sector_t last_block_in_file; + sector_t block_nr; + struct block_device *bdev = inode->i_sb->s_bdev; + struct f2fs_map_blocks map; + + map.m_pblk = 0; + map.m_lblk = 0; + map.m_len = 0; + map.m_flags = 0; + + for (page_idx = 0; nr_pages; page_idx++, nr_pages--) { + + prefetchw(&page->flags); + if (pages) { + page = list_entry(pages->prev, struct page, lru); + list_del(&page->lru); + if (add_to_page_cache_lru(page, mapping, + page->index, GFP_KERNEL)) + goto next_page; + } + + block_in_file = (sector_t)page->index; + last_block = block_in_file + nr_pages; + last_block_in_file = (i_size_read(inode) + blocksize - 1) >> + blkbits; + if (last_block > last_block_in_file) + last_block = last_block_in_file; + + /* + * Map blocks using the previous result first. + */ + if ((map.m_flags & F2FS_MAP_MAPPED) && + block_in_file > map.m_lblk && + block_in_file < (map.m_lblk + map.m_len)) + goto got_it; + + /* + * Then do more f2fs_map_blocks() calls until we are + * done with this page. + */ + map.m_flags = 0; + + if (block_in_file < last_block) { + map.m_lblk = block_in_file; + map.m_len = last_block - block_in_file; + + if (f2fs_map_blocks(inode, &map, 0, false)) + goto set_error_page; + } +got_it: + if ((map.m_flags & F2FS_MAP_MAPPED)) { + block_nr = map.m_pblk + block_in_file - map.m_lblk; + SetPageMappedToDisk(page); + + if (!PageUptodate(page) && !cleancache_get_page(page)) { + SetPageUptodate(page); + goto confused; + } + } else { + zero_user_segment(page, 0, PAGE_CACHE_SIZE); + SetPageUptodate(page); + unlock_page(page); + goto next_page; + } + + /* + * This page will go to BIO. Do we need to send this + * BIO off first? 
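+ *
+ * The bio is kept open only while the block addresses stay physically
+ * contiguous; on any discontiguity the pending bio is submitted and a
+ * fresh one is allocated (see submit_and_realloc).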
+ */ + if (bio && (last_block_in_bio != block_nr - 1)) { +submit_and_realloc: + submit_bio(READ, bio); + bio = NULL; + } + if (bio == NULL) { + struct f2fs_crypto_ctx *ctx = NULL; + + if (f2fs_encrypted_inode(inode) && + S_ISREG(inode->i_mode)) { + struct page *cpage; + + ctx = f2fs_get_crypto_ctx(inode); + if (IS_ERR(ctx)) + goto set_error_page; + + /* wait the page to be moved by cleaning */ + cpage = find_lock_page( + META_MAPPING(F2FS_I_SB(inode)), + block_nr); + if (cpage) { + f2fs_wait_on_page_writeback(cpage, + DATA); + f2fs_put_page(cpage, 1); + } + } + + bio = bio_alloc(GFP_KERNEL, + min_t(int, nr_pages, bio_get_nr_vecs(bdev))); + if (!bio) { + if (ctx) + f2fs_release_crypto_ctx(ctx); + goto set_error_page; + } + bio->bi_bdev = bdev; + bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(block_nr); + bio->bi_end_io = f2fs_read_end_io; + bio->bi_private = ctx; + } + + if (bio_add_page(bio, page, blocksize, 0) < blocksize) + goto submit_and_realloc; + + last_block_in_bio = block_nr; + goto next_page; +set_error_page: + SetPageError(page); + zero_user_segment(page, 0, PAGE_CACHE_SIZE); + unlock_page(page); + goto next_page; +confused: + if (bio) { + submit_bio(READ, bio); + bio = NULL; + } + unlock_page(page); +next_page: + if (pages) + page_cache_release(page); + } + BUG_ON(pages && !list_empty(pages)); + if (bio) + submit_bio(READ, bio); + return 0; } static int f2fs_read_data_page(struct file *file, struct page *page) @@ -781,8 +1601,7 @@ static int f2fs_read_data_page(struct file *file, struct page *page) if (f2fs_has_inline_data(inode)) ret = f2fs_read_inline_data(inode, page); if (ret == -EAGAIN) - ret = mpage_readpage(page, get_data_block); - + ret = f2fs_mpage_readpages(page->mapping, NULL, page, 1); return ret; } @@ -796,11 +1615,12 @@ static int f2fs_read_data_pages(struct file *file, if (f2fs_has_inline_data(inode)) return 0; - return mpage_readpages(mapping, pages, nr_pages, get_data_block); + return f2fs_mpage_readpages(mapping, pages, NULL, nr_pages); } -int do_write_data_page(struct page *page, struct f2fs_io_info *fio) +int do_write_data_page(struct f2fs_io_info *fio) { + struct page *page = fio->page; struct inode *inode = page->mapping->host; struct dnode_of_data dn; int err = 0; @@ -813,8 +1633,18 @@ int do_write_data_page(struct page *page, struct f2fs_io_info *fio) fio->blk_addr = dn.data_blkaddr; /* This page is already truncated */ - if (fio->blk_addr == NULL_ADDR) + if (fio->blk_addr == NULL_ADDR) { + ClearPageUptodate(page); goto out_writepage; + } + + if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { + fio->encrypted_page = f2fs_encrypt(inode, fio->page); + if (IS_ERR(fio->encrypted_page)) { + err = PTR_ERR(fio->encrypted_page); + goto out_writepage; + } + } set_page_writeback(page); @@ -825,12 +1655,17 @@ int do_write_data_page(struct page *page, struct f2fs_io_info *fio) if (unlikely(fio->blk_addr != NEW_ADDR && !is_cold_data(page) && need_inplace_update(inode))) { - rewrite_data_page(page, fio); + rewrite_data_page(fio); set_inode_flag(F2FS_I(inode), FI_UPDATE_WRITE); + trace_f2fs_do_write_data_page(page, IPU); } else { - write_data_page(page, &dn, fio); - update_extent_cache(&dn); + write_data_page(&dn, fio); + set_data_blkaddr(&dn); + f2fs_update_extent_cache(&dn); + trace_f2fs_do_write_data_page(page, OPU); set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE); + if (page->index == 0) + set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN); } out_writepage: f2fs_put_dnode(&dn); @@ -849,8 +1684,11 @@ static int f2fs_write_data_page(struct page *page, bool 
need_balance_fs = false; int err = 0; struct f2fs_io_info fio = { + .sbi = sbi, .type = DATA, .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE, + .page = page, + .encrypted_page = NULL, }; trace_f2fs_writepage(page, DATA); @@ -880,7 +1718,7 @@ write: if (S_ISDIR(inode->i_mode)) { if (unlikely(f2fs_cp_error(sbi))) goto redirty_out; - err = do_write_data_page(page, &fio); + err = do_write_data_page(&fio); goto done; } @@ -900,7 +1738,7 @@ write: if (f2fs_has_inline_data(inode)) err = f2fs_write_inline_data(inode, page); if (err == -EAGAIN) - err = do_write_data_page(page, &fio); + err = do_write_data_page(&fio); f2fs_unlock_op(sbi); done: if (err && err != -ENOENT) @@ -909,6 +1747,8 @@ done: clear_cold_data(page); out: inode_dec_dirty_pages(inode); + if (err) + ClearPageUptodate(page); unlock_page(page); if (need_balance_fs) f2fs_balance_fs(sbi); @@ -950,6 +1790,10 @@ static int f2fs_write_data_pages(struct address_space *mapping, available_free_memory(sbi, DIRTY_DENTS)) goto skip_write; + /* during POR, we don't need to trigger writepage at all. */ + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto skip_write; + diff = nr_pages_to_write(sbi, DATA, wbc); if (!S_ISDIR(inode->i_mode)) { @@ -1063,11 +1907,14 @@ put_next: zero_user_segment(page, 0, PAGE_CACHE_SIZE); } else { struct f2fs_io_info fio = { + .sbi = sbi, .type = DATA, .rw = READ_SYNC, .blk_addr = dn.data_blkaddr, + .page = page, + .encrypted_page = NULL, }; - err = f2fs_submit_page_bio(sbi, page, &fio); + err = f2fs_submit_page_bio(&fio); if (err) goto fail; @@ -1081,6 +1928,15 @@ put_next: f2fs_put_page(page, 1); goto repeat; } + + /* avoid symlink page */ + if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { + err = f2fs_decrypt_one(inode, page); + if (err) { + f2fs_put_page(page, 1); + goto fail; + } + } } out: SetPageUptodate(page); @@ -1118,12 +1974,12 @@ static int f2fs_write_end(struct file *file, return copied; } -static int check_direct_IO(struct inode *inode, int rw, - struct iov_iter *iter, loff_t offset) +static int check_direct_IO(struct inode *inode, struct iov_iter *iter, + loff_t offset) { unsigned blocksize_mask = inode->i_sb->s_blocksize - 1; - if (rw == READ) + if (iov_iter_rw(iter) == READ) return 0; if (offset & blocksize_mask) @@ -1135,8 +1991,8 @@ static int check_direct_IO(struct inode *inode, int rw, return 0; } -static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb, - struct iov_iter *iter, loff_t offset) +static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, + loff_t offset) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -1151,19 +2007,22 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb, return err; } - if (check_direct_IO(inode, rw, iter, offset)) + if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) + return 0; + + if (check_direct_IO(inode, iter, offset)) return 0; - trace_f2fs_direct_IO_enter(inode, offset, count, rw); + trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); - if (rw & WRITE) + if (iov_iter_rw(iter) == WRITE) __allocate_data_blocks(inode, offset, count); - err = blockdev_direct_IO(rw, iocb, inode, iter, offset, get_data_block); - if (err < 0 && (rw & WRITE)) + err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block); + if (err < 0 && iov_iter_rw(iter) == WRITE) f2fs_write_failed(mapping, offset + count); - trace_f2fs_direct_IO_exit(inode, offset, count, rw, err); + trace_f2fs_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), err); return err; } @@ 
-1213,8 +2072,6 @@ static int f2fs_set_data_page_dirty(struct page *page) return 1; } - mark_inode_dirty(inode); - if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); update_dirty_page(inode, page); @@ -1236,6 +2093,37 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) return generic_block_bmap(mapping, block, get_data_block); } +void init_extent_cache_info(struct f2fs_sb_info *sbi) +{ + INIT_RADIX_TREE(&sbi->extent_tree_root, GFP_NOIO); + init_rwsem(&sbi->extent_tree_lock); + INIT_LIST_HEAD(&sbi->extent_list); + spin_lock_init(&sbi->extent_lock); + sbi->total_ext_tree = 0; + atomic_set(&sbi->total_ext_node, 0); +} + +int __init create_extent_cache(void) +{ + extent_tree_slab = f2fs_kmem_cache_create("f2fs_extent_tree", + sizeof(struct extent_tree)); + if (!extent_tree_slab) + return -ENOMEM; + extent_node_slab = f2fs_kmem_cache_create("f2fs_extent_node", + sizeof(struct extent_node)); + if (!extent_node_slab) { + kmem_cache_destroy(extent_tree_slab); + return -ENOMEM; + } + return 0; +} + +void destroy_extent_cache(void) +{ + kmem_cache_destroy(extent_node_slab); + kmem_cache_destroy(extent_tree_slab); +} + const struct address_space_operations f2fs_dblock_aops = { .readpage = f2fs_read_data_page, .readpages = f2fs_read_data_pages, diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index e671373cc8ab..75176e0dd6c8 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -35,6 +35,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) /* validation check of the segment numbers */ si->hit_ext = sbi->read_hit_ext; si->total_ext = sbi->total_hit_ext; + si->ext_tree = sbi->total_ext_tree; + si->ext_node = atomic_read(&sbi->total_ext_node); si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES); si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS); si->ndirty_dirs = sbi->n_dirty_dirs; @@ -92,7 +94,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) static void update_sit_info(struct f2fs_sb_info *sbi) { struct f2fs_stat_info *si = F2FS_STAT(sbi); - unsigned int blks_per_sec, hblks_per_sec, total_vblocks, bimodal, dist; + unsigned long long blks_per_sec, hblks_per_sec, total_vblocks; + unsigned long long bimodal, dist; unsigned int segno, vblocks; int ndirty = 0; @@ -110,10 +113,10 @@ static void update_sit_info(struct f2fs_sb_info *sbi) ndirty++; } } - dist = MAIN_SECS(sbi) * hblks_per_sec * hblks_per_sec / 100; - si->bimodal = bimodal / dist; + dist = div_u64(MAIN_SECS(sbi) * hblks_per_sec * hblks_per_sec, 100); + si->bimodal = div_u64(bimodal, dist); if (si->dirty_count) - si->avg_vblocks = total_vblocks / ndirty; + si->avg_vblocks = div_u64(total_vblocks, ndirty); else si->avg_vblocks = 0; } @@ -141,7 +144,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->base_mem += sizeof(struct sit_info); si->base_mem += MAIN_SEGS(sbi) * sizeof(struct seg_entry); si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi)); - si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); + si->base_mem += 3 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); si->base_mem += SIT_VBLOCK_MAP_SIZE; if (sbi->segs_per_sec > 1) si->base_mem += MAIN_SECS(sbi) * sizeof(struct sec_entry); @@ -185,6 +188,9 @@ get_cache: si->cache_mem += sbi->n_dirty_dirs * sizeof(struct inode_entry); for (i = 0; i <= UPDATE_INO; i++) si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry); + si->cache_mem += sbi->total_ext_tree * sizeof(struct extent_tree); + si->cache_mem += atomic_read(&sbi->total_ext_node) * + sizeof(struct extent_node); si->page_mem = 0; npages = NODE_MAPPING(sbi)->nrpages; @@ 
-260,13 +266,20 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, "CP calls: %d\n", si->cp_count); seq_printf(s, "GC calls: %d (BG: %d)\n", si->call_count, si->bg_gc); - seq_printf(s, " - data segments : %d\n", si->data_segs); - seq_printf(s, " - node segments : %d\n", si->node_segs); - seq_printf(s, "Try to move %d blocks\n", si->tot_blks); - seq_printf(s, " - data blocks : %d\n", si->data_blks); - seq_printf(s, " - node blocks : %d\n", si->node_blks); + seq_printf(s, " - data segments : %d (%d)\n", + si->data_segs, si->bg_data_segs); + seq_printf(s, " - node segments : %d (%d)\n", + si->node_segs, si->bg_node_segs); + seq_printf(s, "Try to move %d blocks (BG: %d)\n", si->tot_blks, + si->bg_data_blks + si->bg_node_blks); + seq_printf(s, " - data blocks : %d (%d)\n", si->data_blks, + si->bg_data_blks); + seq_printf(s, " - node blocks : %d (%d)\n", si->node_blks, + si->bg_node_blks); seq_printf(s, "\nExtent Hit Ratio: %d / %d\n", si->hit_ext, si->total_ext); + seq_printf(s, "\nExtent Tree Count: %d\n", si->ext_tree); + seq_printf(s, "\nExtent Node Count: %d\n", si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); seq_printf(s, " - inmem: %4d, wb: %4d\n", si->inmem_pages, si->wb_pages); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index b74097a7f6d9..a34ebd8312ab 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -59,9 +59,8 @@ static unsigned char f2fs_type_by_mode[S_IFMT >> S_SHIFT] = { [S_IFLNK >> S_SHIFT] = F2FS_FT_SYMLINK, }; -void set_de_type(struct f2fs_dir_entry *de, struct inode *inode) +void set_de_type(struct f2fs_dir_entry *de, umode_t mode) { - umode_t mode = inode->i_mode; de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT]; } @@ -77,20 +76,10 @@ static unsigned long dir_block_index(unsigned int level, return bidx; } -static bool early_match_name(size_t namelen, f2fs_hash_t namehash, - struct f2fs_dir_entry *de) -{ - if (le16_to_cpu(de->name_len) != namelen) - return false; - - if (de->hash_code != namehash) - return false; - - return true; -} - static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, - struct qstr *name, int *max_slots, + struct f2fs_filename *fname, + f2fs_hash_t namehash, + int *max_slots, struct page **res_page) { struct f2fs_dentry_block *dentry_blk; @@ -99,9 +88,8 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, dentry_blk = (struct f2fs_dentry_block *)kmap(dentry_page); - make_dentry_ptr(&d, (void *)dentry_blk, 1); - de = find_target_dentry(name, max_slots, &d); - + make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1); + de = find_target_dentry(fname, namehash, max_slots, &d); if (de) *res_page = dentry_page; else @@ -115,34 +103,43 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, return de; } -struct f2fs_dir_entry *find_target_dentry(struct qstr *name, int *max_slots, - struct f2fs_dentry_ptr *d) +struct f2fs_dir_entry *find_target_dentry(struct f2fs_filename *fname, + f2fs_hash_t namehash, int *max_slots, + struct f2fs_dentry_ptr *d) { struct f2fs_dir_entry *de; unsigned long bit_pos = 0; - f2fs_hash_t namehash = f2fs_dentry_hash(name); int max_len = 0; + struct f2fs_str de_name = FSTR_INIT(NULL, 0); + struct f2fs_str *name = &fname->disk_name; if (max_slots) *max_slots = 0; while (bit_pos < d->max) { if (!test_bit_le(bit_pos, d->bitmap)) { - if (bit_pos == 0) - max_len = 1; - else if (!test_bit_le(bit_pos - 1, d->bitmap)) - max_len++; bit_pos++; + max_len++; continue; } + de = &d->dentry[bit_pos]; - if (early_match_name(name->len, namehash, de) && - 
!memcmp(d->filename[bit_pos], name->name, name->len)) + + /* encrypted case */ + de_name.name = d->filename[bit_pos]; + de_name.len = le16_to_cpu(de->name_len); + + /* show encrypted name */ + if (fname->hash) { + if (de->hash_code == fname->hash) + goto found; + } else if (de_name.len == name->len && + de->hash_code == namehash && + !memcmp(de_name.name, name->name, name->len)) goto found; - if (max_slots && *max_slots >= 0 && max_len > *max_slots) { + if (max_slots && max_len > *max_slots) *max_slots = max_len; - max_len = 0; - } + max_len = 0; /* remain bug on condition */ if (unlikely(!de->name_len)) @@ -159,16 +156,21 @@ found: } static struct f2fs_dir_entry *find_in_level(struct inode *dir, - unsigned int level, struct qstr *name, - f2fs_hash_t namehash, struct page **res_page) + unsigned int level, + struct f2fs_filename *fname, + struct page **res_page) { - int s = GET_DENTRY_SLOTS(name->len); + struct qstr name = FSTR_TO_QSTR(&fname->disk_name); + int s = GET_DENTRY_SLOTS(name.len); unsigned int nbucket, nblock; unsigned int bidx, end_block; struct page *dentry_page; struct f2fs_dir_entry *de = NULL; bool room = false; int max_slots; + f2fs_hash_t namehash; + + namehash = f2fs_dentry_hash(&name); f2fs_bug_on(F2FS_I_SB(dir), level > MAX_DIR_HASH_DEPTH); @@ -181,13 +183,14 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, for (; bidx < end_block; bidx++) { /* no need to allocate new dentry pages to all the indices */ - dentry_page = find_data_page(dir, bidx, true); + dentry_page = find_data_page(dir, bidx); if (IS_ERR(dentry_page)) { room = true; continue; } - de = find_in_block(dentry_page, name, &max_slots, res_page); + de = find_in_block(dentry_page, fname, namehash, &max_slots, + res_page); if (de) break; @@ -215,30 +218,34 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, { unsigned long npages = dir_blocks(dir); struct f2fs_dir_entry *de = NULL; - f2fs_hash_t name_hash; unsigned int max_depth; unsigned int level; + struct f2fs_filename fname; + int err; - if (f2fs_has_inline_dentry(dir)) - return find_in_inline_dir(dir, child, res_page); + *res_page = NULL; - if (npages == 0) + err = f2fs_fname_setup_filename(dir, child, 1, &fname); + if (err) return NULL; - *res_page = NULL; + if (f2fs_has_inline_dentry(dir)) { + de = find_in_inline_dir(dir, &fname, res_page); + goto out; + } + + if (npages == 0) + goto out; - name_hash = f2fs_dentry_hash(child); max_depth = F2FS_I(dir)->i_current_depth; for (level = 0; level < max_depth; level++) { - de = find_in_level(dir, level, child, name_hash, res_page); + de = find_in_level(dir, level, &fname, res_page); if (de) break; } - if (!de && F2FS_I(dir)->chash != name_hash) { - F2FS_I(dir)->chash = name_hash; - F2FS_I(dir)->clevel = level - 1; - } +out: + f2fs_fname_free_filename(&fname); return de; } @@ -285,7 +292,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, lock_page(page); f2fs_wait_on_page_writeback(page, type); de->ino = cpu_to_le32(inode->i_ino); - set_de_type(de, inode); + set_de_type(de, inode->i_mode); f2fs_dentry_kunmap(dir, page); set_page_dirty(page); dir->i_mtime = dir->i_ctime = CURRENT_TIME; @@ -307,10 +314,14 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage) set_page_dirty(ipage); } -int update_dent_inode(struct inode *inode, const struct qstr *name) +int update_dent_inode(struct inode *inode, struct inode *to, + const struct qstr *name) { struct page *page; + if (file_enc_name(to)) + return 0; + page = get_node_page(F2FS_I_SB(inode), inode->i_ino); 
if (IS_ERR(page)) return PTR_ERR(page); @@ -331,14 +342,14 @@ void do_make_empty_dir(struct inode *inode, struct inode *parent, de->hash_code = 0; de->ino = cpu_to_le32(inode->i_ino); memcpy(d->filename[0], ".", 1); - set_de_type(de, inode); + set_de_type(de, inode->i_mode); de = &d->dentry[1]; de->hash_code = 0; de->name_len = cpu_to_le16(2); de->ino = cpu_to_le32(parent->i_ino); memcpy(d->filename[1], "..", 2); - set_de_type(de, inode); + set_de_type(de, parent->i_mode); test_and_set_bit_le(0, (void *)d->bitmap); test_and_set_bit_le(1, (void *)d->bitmap); @@ -360,7 +371,7 @@ static int make_empty_dir(struct inode *inode, dentry_blk = kmap_atomic(dentry_page); - make_dentry_ptr(&d, (void *)dentry_blk, 1); + make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1); do_make_empty_dir(inode, parent, &d); kunmap_atomic(dentry_blk); @@ -394,6 +405,12 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, err = f2fs_init_security(inode, dir, name, page); if (err) goto put_error; + + if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) { + err = f2fs_inherit_context(dir, inode, page); + if (err) + goto put_error; + } } else { page = get_node_page(F2FS_I_SB(dir), inode->i_ino); if (IS_ERR(page)) @@ -435,7 +452,7 @@ error: void update_parent_metadata(struct inode *dir, struct inode *inode, unsigned int current_depth) { - if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { + if (inode && is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { if (S_ISDIR(inode->i_mode)) { inc_nlink(dir); set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); @@ -450,7 +467,7 @@ void update_parent_metadata(struct inode *dir, struct inode *inode, set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); } - if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) + if (inode && is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) clear_inode_flag(F2FS_I(inode), FI_INC_LINK); } @@ -474,38 +491,64 @@ next: goto next; } +void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, + const struct qstr *name, f2fs_hash_t name_hash, + unsigned int bit_pos) +{ + struct f2fs_dir_entry *de; + int slots = GET_DENTRY_SLOTS(name->len); + int i; + + de = &d->dentry[bit_pos]; + de->hash_code = name_hash; + de->name_len = cpu_to_le16(name->len); + memcpy(d->filename[bit_pos], name->name, name->len); + de->ino = cpu_to_le32(ino); + set_de_type(de, mode); + for (i = 0; i < slots; i++) + test_and_set_bit_le(bit_pos + i, (void *)d->bitmap); +} + /* * Caller should grab and release a rwsem by calling f2fs_lock_op() and * f2fs_unlock_op(). 
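 *
 * The op lock keeps a concurrent checkpoint from observing a
 * half-updated directory while the dentry block and the inode
 * page are modified below.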
*/ int __f2fs_add_link(struct inode *dir, const struct qstr *name, - struct inode *inode) + struct inode *inode, nid_t ino, umode_t mode) { unsigned int bit_pos; unsigned int level; unsigned int current_depth; unsigned long bidx, block; f2fs_hash_t dentry_hash; - struct f2fs_dir_entry *de; unsigned int nbucket, nblock; - size_t namelen = name->len; struct page *dentry_page = NULL; struct f2fs_dentry_block *dentry_blk = NULL; - int slots = GET_DENTRY_SLOTS(namelen); - struct page *page; - int err = 0; - int i; + struct f2fs_dentry_ptr d; + struct page *page = NULL; + struct f2fs_filename fname; + struct qstr new_name; + int slots, err; + + err = f2fs_fname_setup_filename(dir, name, 0, &fname); + if (err) + return err; + + new_name.name = fname_name(&fname); + new_name.len = fname_len(&fname); if (f2fs_has_inline_dentry(dir)) { - err = f2fs_add_inline_entry(dir, name, inode); + err = f2fs_add_inline_entry(dir, &new_name, inode, ino, mode); if (!err || err != -EAGAIN) - return err; + goto out; else err = 0; } - dentry_hash = f2fs_dentry_hash(name); level = 0; + slots = GET_DENTRY_SLOTS(new_name.len); + dentry_hash = f2fs_dentry_hash(&new_name); + current_depth = F2FS_I(dir)->i_current_depth; if (F2FS_I(dir)->chash == dentry_hash) { level = F2FS_I(dir)->clevel; @@ -513,8 +556,10 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, } start: - if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) - return -ENOSPC; + if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) { + err = -ENOSPC; + goto out; + } /* Increase the depth, if required */ if (level == current_depth) @@ -528,8 +573,10 @@ start: for (block = bidx; block <= (bidx + nblock - 1); block++) { dentry_page = get_new_data_page(dir, NULL, block, true); - if (IS_ERR(dentry_page)) - return PTR_ERR(dentry_page); + if (IS_ERR(dentry_page)) { + err = PTR_ERR(dentry_page); + goto out; + } dentry_blk = kmap(dentry_page); bit_pos = room_for_filename(&dentry_blk->dentry_bitmap, @@ -547,30 +594,33 @@ start: add_dentry: f2fs_wait_on_page_writeback(dentry_page, DATA); - down_write(&F2FS_I(inode)->i_sem); - page = init_inode_metadata(inode, dir, name, NULL); - if (IS_ERR(page)) { - err = PTR_ERR(page); - goto fail; + if (inode) { + down_write(&F2FS_I(inode)->i_sem); + page = init_inode_metadata(inode, dir, &new_name, NULL); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto fail; + } + if (f2fs_encrypted_inode(dir)) + file_set_enc_name(inode); } - de = &dentry_blk->dentry[bit_pos]; - de->hash_code = dentry_hash; - de->name_len = cpu_to_le16(namelen); - memcpy(dentry_blk->filename[bit_pos], name->name, name->len); - de->ino = cpu_to_le32(inode->i_ino); - set_de_type(de, inode); - for (i = 0; i < slots; i++) - test_and_set_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap); + + make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1); + f2fs_update_dentry(ino, mode, &d, &new_name, dentry_hash, bit_pos); + set_page_dirty(dentry_page); - /* we don't need to mark_inode_dirty now */ - F2FS_I(inode)->i_pino = dir->i_ino; - update_inode(inode, page); - f2fs_put_page(page, 1); + if (inode) { + /* we don't need to mark_inode_dirty now */ + F2FS_I(inode)->i_pino = dir->i_ino; + update_inode(inode, page); + f2fs_put_page(page, 1); + } update_parent_metadata(dir, inode, current_depth); fail: - up_write(&F2FS_I(inode)->i_sem); + if (inode) + up_write(&F2FS_I(inode)->i_sem); if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) { update_inode_page(dir); @@ -578,6 +628,8 @@ fail: } kunmap(dentry_page); f2fs_put_page(dentry_page, 1); +out: + 
f2fs_fname_free_filename(&fname); return err; } @@ -669,6 +721,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, if (bit_pos == NR_DENTRY_IN_BLOCK) { truncate_hole(dir, page->index, page->index + 1); clear_page_dirty_for_io(page); + ClearPagePrivate(page); ClearPageUptodate(page); inode_dec_dirty_pages(dir); } @@ -714,11 +767,12 @@ bool f2fs_empty_dir(struct inode *dir) } bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, - unsigned int start_pos) + unsigned int start_pos, struct f2fs_str *fstr) { unsigned char d_type = DT_UNKNOWN; unsigned int bit_pos; struct f2fs_dir_entry *de = NULL; + struct f2fs_str de_name = FSTR_INIT(NULL, 0); bit_pos = ((unsigned long)ctx->pos % d->max); @@ -732,8 +786,24 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, d_type = f2fs_filetype_table[de->file_type]; else d_type = DT_UNKNOWN; - if (!dir_emit(ctx, d->filename[bit_pos], - le16_to_cpu(de->name_len), + + /* encrypted case */ + de_name.name = d->filename[bit_pos]; + de_name.len = le16_to_cpu(de->name_len); + + if (f2fs_encrypted_inode(d->inode)) { + int save_len = fstr->len; + int ret; + + ret = f2fs_fname_disk_to_usr(d->inode, &de->hash_code, + &de_name, fstr); + de_name = *fstr; + fstr->len = save_len; + if (ret < 0) + return true; + } + + if (!dir_emit(ctx, de_name.name, de_name.len, le32_to_cpu(de->ino), d_type)) return true; @@ -752,9 +822,24 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) struct file_ra_state *ra = &file->f_ra; unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK); struct f2fs_dentry_ptr d; + struct f2fs_str fstr = FSTR_INIT(NULL, 0); + int err = 0; - if (f2fs_has_inline_dentry(inode)) - return f2fs_read_inline_dir(file, ctx); + if (f2fs_encrypted_inode(inode)) { + err = f2fs_get_encryption_info(inode); + if (err) + return err; + + err = f2fs_fname_crypto_alloc_buffer(inode, F2FS_NAME_LEN, + &fstr); + if (err < 0) + return err; + } + + if (f2fs_has_inline_dentry(inode)) { + err = f2fs_read_inline_dir(file, ctx, &fstr); + goto out; + } /* readahead for multi pages of dir */ if (npages - n > 1 && !ra_has_index(ra, n)) @@ -768,9 +853,9 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) dentry_blk = kmap(dentry_page); - make_dentry_ptr(&d, (void *)dentry_blk, 1); + make_dentry_ptr(inode, &d, (void *)dentry_blk, 1); - if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK)) + if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr)) goto stop; ctx->pos = (n + 1) * NR_DENTRY_IN_BLOCK; @@ -783,8 +868,9 @@ stop: kunmap(dentry_page); f2fs_put_page(dentry_page, 1); } - - return 0; +out: + f2fs_fname_crypto_free_buffer(&fstr); + return err; } const struct file_operations f2fs_dir_operations = { @@ -793,4 +879,7 @@ const struct file_operations f2fs_dir_operations = { .iterate = f2fs_readdir, .fsync = f2fs_sync_file, .unlocked_ioctl = f2fs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = f2fs_compat_ioctl, +#endif }; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 7fa3313ab0e2..a8327ed73898 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -50,6 +50,7 @@ #define F2FS_MOUNT_FLUSH_MERGE 0x00000400 #define F2FS_MOUNT_NOBARRIER 0x00000800 #define F2FS_MOUNT_FASTBOOT 0x00001000 +#define F2FS_MOUNT_EXTENT_CACHE 0x00002000 #define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) @@ -69,6 +70,15 @@ struct f2fs_mount_info { unsigned int opt; }; +#define 
F2FS_FEATURE_ENCRYPT 0x0001 + +#define F2FS_HAS_FEATURE(sb, mask) \ + ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) +#define F2FS_SET_FEATURE(sb, mask) \ + F2FS_SB(sb)->raw_super->feature |= cpu_to_le32(mask) +#define F2FS_CLEAR_FEATURE(sb, mask) \ + F2FS_SB(sb)->raw_super->feature &= ~cpu_to_le32(mask) + #define CRCPOLY_LE 0xedb88320 static inline __u32 f2fs_crc32(void *buf, size_t len) @@ -102,12 +112,15 @@ enum { CP_UMOUNT, CP_FASTBOOT, CP_SYNC, + CP_RECOVERY, CP_DISCARD, }; #define DEF_BATCHED_TRIM_SECTIONS 32 #define BATCHED_TRIM_SEGMENTS(sbi) \ (SM_I(sbi)->trim_sections * (sbi)->segs_per_sec) +#define BATCHED_TRIM_BLOCKS(sbi) \ + (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg) struct cp_control { int reason; @@ -216,6 +229,22 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size, #define F2FS_IOC_RELEASE_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 4) #define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5) +#define F2FS_IOC_SET_ENCRYPTION_POLICY \ + _IOR('f', 19, struct f2fs_encryption_policy) +#define F2FS_IOC_GET_ENCRYPTION_PWSALT \ + _IOW('f', 20, __u8[16]) +#define F2FS_IOC_GET_ENCRYPTION_POLICY \ + _IOW('f', 21, struct f2fs_encryption_policy) + +/* + * should be same as XFS_IOC_GOINGDOWN. + * Flags for going down operation used by FS_IOC_GOINGDOWN + */ +#define F2FS_IOC_SHUTDOWN _IOR('X', 125, __u32) /* Shutdown */ +#define F2FS_GOING_DOWN_FULLSYNC 0x0 /* going down with full sync */ +#define F2FS_GOING_DOWN_METASYNC 0x1 /* going down with metadata */ +#define F2FS_GOING_DOWN_NOSYNC 0x2 /* going down */ + #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* * ioctl commands in 32 bit emulation @@ -228,16 +257,38 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size, * For INODE and NODE manager */ /* for directory operations */ +struct f2fs_str { + unsigned char *name; + u32 len; +}; + +struct f2fs_filename { + const struct qstr *usr_fname; + struct f2fs_str disk_name; + f2fs_hash_t hash; +#ifdef CONFIG_F2FS_FS_ENCRYPTION + struct f2fs_str crypto_buf; +#endif +}; + +#define FSTR_INIT(n, l) { .name = n, .len = l } +#define FSTR_TO_QSTR(f) QSTR_INIT((f)->name, (f)->len) +#define fname_name(p) ((p)->disk_name.name) +#define fname_len(p) ((p)->disk_name.len) + struct f2fs_dentry_ptr { + struct inode *inode; const void *bitmap; struct f2fs_dir_entry *dentry; __u8 (*filename)[F2FS_SLOT_LEN]; int max; }; -static inline void make_dentry_ptr(struct f2fs_dentry_ptr *d, - void *src, int type) +static inline void make_dentry_ptr(struct inode *inode, + struct f2fs_dentry_ptr *d, void *src, int type) { + d->inode = inode; + if (type == 1) { struct f2fs_dentry_block *t = (struct f2fs_dentry_block *)src; d->max = NR_DENTRY_IN_BLOCK; @@ -273,14 +324,52 @@ enum { #define MAX_DIR_RA_PAGES 4 /* maximum ra pages of dir */ +/* vector size for gang look-up from extent cache that consists of radix tree */ +#define EXT_TREE_VEC_SIZE 64 + /* for in-memory extent cache entry */ -#define F2FS_MIN_EXTENT_LEN 16 /* minimum extent length */ +#define F2FS_MIN_EXTENT_LEN 64 /* minimum extent length */ + +/* number of extent info in extent cache we try to shrink */ +#define EXTENT_CACHE_SHRINK_NUMBER 128 struct extent_info { - rwlock_t ext_lock; /* rwlock for consistency */ - unsigned int fofs; /* start offset in a file */ - u32 blk_addr; /* start block address of the extent */ - unsigned int len; /* length of the extent */ + unsigned int fofs; /* start offset in a file */ + u32 blk; /* start block address of the extent */ + unsigned 
int len; /* length of the extent */ +}; + +struct extent_node { + struct rb_node rb_node; /* rb node located in rb-tree */ + struct list_head list; /* node in global extent list of sbi */ + struct extent_info ei; /* extent info */ +}; + +struct extent_tree { + nid_t ino; /* inode number */ + struct rb_root root; /* root of extent info rb-tree */ + struct extent_node *cached_en; /* recently accessed extent node */ + rwlock_t lock; /* protect extent info rb-tree */ + atomic_t refcount; /* reference count of rb-tree */ + unsigned int count; /* # of extent node in rb-tree*/ +}; + +/* + * This structure is taken from ext4_map_blocks. + * + * Note that, however, f2fs uses NEW and MAPPED flags for f2fs_map_blocks(). + */ +#define F2FS_MAP_NEW (1 << BH_New) +#define F2FS_MAP_MAPPED (1 << BH_Mapped) +#define F2FS_MAP_UNWRITTEN (1 << BH_Unwritten) +#define F2FS_MAP_FLAGS (F2FS_MAP_NEW | F2FS_MAP_MAPPED |\ + F2FS_MAP_UNWRITTEN) + +struct f2fs_map_blocks { + block_t m_pblk; + block_t m_lblk; + unsigned int m_len; + unsigned int m_flags; }; /* @@ -288,6 +377,29 @@ struct extent_info { */ #define FADVISE_COLD_BIT 0x01 #define FADVISE_LOST_PINO_BIT 0x02 +#define FADVISE_ENCRYPT_BIT 0x04 +#define FADVISE_ENC_NAME_BIT 0x08 + +#define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT) +#define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT) +#define file_set_cold(inode) set_file(inode, FADVISE_COLD_BIT) +#define file_lost_pino(inode) set_file(inode, FADVISE_LOST_PINO_BIT) +#define file_clear_cold(inode) clear_file(inode, FADVISE_COLD_BIT) +#define file_got_pino(inode) clear_file(inode, FADVISE_LOST_PINO_BIT) +#define file_is_encrypt(inode) is_file(inode, FADVISE_ENCRYPT_BIT) +#define file_set_encrypt(inode) set_file(inode, FADVISE_ENCRYPT_BIT) +#define file_clear_encrypt(inode) clear_file(inode, FADVISE_ENCRYPT_BIT) +#define file_enc_name(inode) is_file(inode, FADVISE_ENC_NAME_BIT) +#define file_set_enc_name(inode) set_file(inode, FADVISE_ENC_NAME_BIT) + +/* Encryption algorithms */ +#define F2FS_ENCRYPTION_MODE_INVALID 0 +#define F2FS_ENCRYPTION_MODE_AES_256_XTS 1 +#define F2FS_ENCRYPTION_MODE_AES_256_GCM 2 +#define F2FS_ENCRYPTION_MODE_AES_256_CBC 3 +#define F2FS_ENCRYPTION_MODE_AES_256_CTS 4 + +#include "f2fs_crypto.h" #define DEF_DIR_LEVEL 0 @@ -309,31 +421,67 @@ struct f2fs_inode_info { nid_t i_xattr_nid; /* node id that contains xattrs */ unsigned long long xattr_ver; /* cp version of xattr modification */ struct extent_info ext; /* in-memory extent cache entry */ + rwlock_t ext_lock; /* rwlock for single extent cache */ struct inode_entry *dirty_dir; /* the pointer of dirty dir */ struct radix_tree_root inmem_root; /* radix tree for inmem pages */ struct list_head inmem_pages; /* inmemory pages managed by f2fs */ struct mutex inmem_lock; /* lock for inmemory pages */ + +#ifdef CONFIG_F2FS_FS_ENCRYPTION + /* Encryption params */ + struct f2fs_crypt_info *i_crypt_info; +#endif }; static inline void get_extent_info(struct extent_info *ext, struct f2fs_extent i_ext) { - write_lock(&ext->ext_lock); ext->fofs = le32_to_cpu(i_ext.fofs); - ext->blk_addr = le32_to_cpu(i_ext.blk_addr); + ext->blk = le32_to_cpu(i_ext.blk); ext->len = le32_to_cpu(i_ext.len); - write_unlock(&ext->ext_lock); } static inline void set_raw_extent(struct extent_info *ext, struct f2fs_extent *i_ext) { - read_lock(&ext->ext_lock); i_ext->fofs = cpu_to_le32(ext->fofs); - i_ext->blk_addr = cpu_to_le32(ext->blk_addr); + i_ext->blk = cpu_to_le32(ext->blk); i_ext->len = cpu_to_le32(ext->len); - read_unlock(&ext->ext_lock); +} 
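+
+/*
+ * Worked example for __is_extent_mergeable() and friends below:
+ * back  = {fofs = 0, blk = 100, len = 4} and
+ * front = {fofs = 4, blk = 104, len = 2}
+ * are mergeable (0 + 4 == 4 and 100 + 4 == 104), and merging them
+ * yields {fofs = 0, blk = 100, len = 6}.
+ */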
+ +static inline void set_extent_info(struct extent_info *ei, unsigned int fofs, + u32 blk, unsigned int len) +{ + ei->fofs = fofs; + ei->blk = blk; + ei->len = len; +} + +static inline bool __is_extent_same(struct extent_info *ei1, + struct extent_info *ei2) +{ + return (ei1->fofs == ei2->fofs && ei1->blk == ei2->blk && + ei1->len == ei2->len); +} + +static inline bool __is_extent_mergeable(struct extent_info *back, + struct extent_info *front) +{ + return (back->fofs + back->len == front->fofs && + back->blk + back->len == front->blk); +} + +static inline bool __is_back_mergeable(struct extent_info *cur, + struct extent_info *back) +{ + return __is_extent_mergeable(back, cur); +} + +static inline bool __is_front_mergeable(struct extent_info *cur, + struct extent_info *front) +{ + return __is_extent_mergeable(cur, front); } struct f2fs_nm_info { @@ -502,12 +650,19 @@ enum page_type { META, NR_PAGE_TYPE, META_FLUSH, + INMEM, /* the below types are used by tracepoints only. */ + INMEM_DROP, + IPU, + OPU, }; struct f2fs_io_info { + struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ int rw; /* contains R/RS/W/WS with REQ_META/REQ_PRIO */ block_t blk_addr; /* block address to be written */ + struct page *page; /* page to be written */ + struct page *encrypted_page; /* encrypted page */ }; #define is_read_io(rw) (((rw) & 1) == READ) @@ -571,6 +726,14 @@ struct f2fs_sb_info { struct list_head dir_inode_list; /* dir inode list */ spinlock_t dir_inode_lock; /* for dir inode list lock */ + /* for extent tree cache */ + struct radix_tree_root extent_tree_root; /* cache extent cache entries */ + struct rw_semaphore extent_tree_lock; /* locking extent radix tree */ + struct list_head extent_list; /* lru list for shrinker */ + spinlock_t extent_lock; /* locking extent lru list */ + int total_ext_tree; /* extent tree count */ + atomic_t total_ext_node; /* extent info count */ + /* basic filesystem units */ unsigned int log_sectors_per_block; /* log2 sectors per block */ unsigned int log_blocksize; /* log2 block size */ @@ -592,6 +755,7 @@ struct f2fs_sb_info { block_t user_block_count; /* # of user blocks */ block_t total_valid_block_count; /* # of valid blocks */ block_t alloc_valid_block_count; /* # of allocated blocks */ + block_t discard_blks; /* discard command candidates */ block_t last_valid_block_count; /* for recovery */ u32 s_next_generation; /* for NFS support */ atomic_t nr_pages[NR_COUNT_TYPE]; /* # of pages, see count_type */ @@ -920,12 +1084,17 @@ static inline unsigned long __bitmap_size(struct f2fs_sb_info *sbi, int flag) return 0; } +static inline block_t __cp_payload(struct f2fs_sb_info *sbi) +{ + return le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload); +} + static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); int offset; - if (le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload) > 0) { + if (__cp_payload(sbi) > 0) { if (flag == NAT_BITMAP) return &ckpt->sit_nat_version_bitmap; else @@ -1114,6 +1283,24 @@ static inline int f2fs_test_bit(unsigned int nr, char *addr) return mask & *addr; } +static inline void f2fs_set_bit(unsigned int nr, char *addr) +{ + int mask; + + addr += (nr >> 3); + mask = 1 << (7 - (nr & 0x07)); + *addr |= mask; +} + +static inline void f2fs_clear_bit(unsigned int nr, char *addr) +{ + int mask; + + addr += (nr >> 3); + mask = 1 << (7 - (nr & 0x07)); + *addr &= ~mask; +} + static inline int f2fs_test_and_set_bit(unsigned int nr, char *addr) { int
mask; @@ -1166,8 +1353,10 @@ enum { FI_NEED_IPU, /* used for ipu per file */ FI_ATOMIC_FILE, /* indicate atomic file */ FI_VOLATILE_FILE, /* indicate volatile file */ + FI_FIRST_BLOCK_WRITTEN, /* indicate #0 data block was written */ FI_DROP_CACHE, /* drop dirty page cache */ FI_DATA_EXIST, /* indicate data exists */ + FI_INLINE_DOTS, /* indicate inline dot dentries */ }; static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag) @@ -1204,6 +1393,8 @@ static inline void get_inline_info(struct f2fs_inode_info *fi, set_inode_flag(fi, FI_INLINE_DENTRY); if (ri->i_inline & F2FS_DATA_EXIST) set_inode_flag(fi, FI_DATA_EXIST); + if (ri->i_inline & F2FS_INLINE_DOTS) + set_inode_flag(fi, FI_INLINE_DOTS); } static inline void set_raw_inline(struct f2fs_inode_info *fi, @@ -1219,6 +1410,8 @@ static inline void set_raw_inline(struct f2fs_inode_info *fi, ri->i_inline |= F2FS_INLINE_DENTRY; if (is_inode_flag_set(fi, FI_DATA_EXIST)) ri->i_inline |= F2FS_DATA_EXIST; + if (is_inode_flag_set(fi, FI_INLINE_DOTS)) + ri->i_inline |= F2FS_INLINE_DOTS; } static inline int f2fs_has_inline_xattr(struct inode *inode) @@ -1264,6 +1457,11 @@ static inline int f2fs_exist_data(struct inode *inode) return is_inode_flag_set(F2FS_I(inode), FI_DATA_EXIST); } +static inline int f2fs_has_inline_dots(struct inode *inode) +{ + return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DOTS); +} + static inline bool f2fs_is_atomic_file(struct inode *inode) { return is_inode_flag_set(F2FS_I(inode), FI_ATOMIC_FILE); @@ -1274,6 +1472,11 @@ static inline bool f2fs_is_volatile_file(struct inode *inode) return is_inode_flag_set(F2FS_I(inode), FI_VOLATILE_FILE); } +static inline bool f2fs_is_first_block_written(struct inode *inode) +{ + return is_inode_flag_set(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN); +} + static inline bool f2fs_is_drop_cache(struct inode *inode) { return is_inode_flag_set(F2FS_I(inode), FI_DROP_CACHE); @@ -1290,18 +1493,27 @@ static inline int f2fs_has_inline_dentry(struct inode *inode) return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DENTRY); } -static inline void *inline_dentry_addr(struct page *page) -{ - struct f2fs_inode *ri = F2FS_INODE(page); - return (void *)&(ri->i_addr[1]); -} - static inline void f2fs_dentry_kunmap(struct inode *dir, struct page *page) { if (!f2fs_has_inline_dentry(dir)) kunmap(page); } +static inline int is_file(struct inode *inode, int type) +{ + return F2FS_I(inode)->i_advise & type; +} + +static inline void set_file(struct inode *inode, int type) +{ + F2FS_I(inode)->i_advise |= type; +} + +static inline void clear_file(struct inode *inode, int type) +{ + F2FS_I(inode)->i_advise &= ~type; +} + static inline int f2fs_readonly(struct super_block *sb) { return sb->s_flags & MS_RDONLY; @@ -1318,6 +1530,17 @@ static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi) sbi->sb->s_flags |= MS_RDONLY; } +static inline bool is_dot_dotdot(const struct qstr *str) +{ + if (str->len == 1 && str->name[0] == '.') + return true; + + if (str->len == 2 && str->name[0] == '.' && str->name[1] == '.') + return true; + + return false; +} + #define get_inode_mode(i) \ ((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? 
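The new f2fs_set_bit()/f2fs_clear_bit() helpers above follow the same MSB-first convention as the existing f2fs_test_bit(): bit 0 is the most significant bit of byte 0, matching f2fs's on-disk bitmap layout rather than the CPU-native order of set_bit(). A standalone check, with the helper bodies copied verbatim from the patch:

#include <assert.h>
#include <stdio.h>
#include <string.h>

/* bodies as in the patch: byte nr/8, mask built from the MSB side */
static inline void f2fs_set_bit(unsigned int nr, char *addr)
{
	int mask;

	addr += (nr >> 3);
	mask = 1 << (7 - (nr & 0x07));
	*addr |= mask;
}

static inline void f2fs_clear_bit(unsigned int nr, char *addr)
{
	int mask;

	addr += (nr >> 3);
	mask = 1 << (7 - (nr & 0x07));
	*addr &= ~mask;
}

int main(void)
{
	char map[2];

	memset(map, 0, sizeof(map));
	f2fs_set_bit(0, map);		/* MSB of byte 0 */
	f2fs_set_bit(9, map);		/* second-highest bit of byte 1 */
	assert((unsigned char)map[0] == 0x80);
	assert((unsigned char)map[1] == 0x40);
	f2fs_clear_bit(0, map);
	assert(map[0] == 0);
	printf("MSB-first bitmap helpers behave as expected\n");
	return 0;
}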
\ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) @@ -1363,11 +1586,12 @@ struct dentry *f2fs_get_parent(struct dentry *child); * dir.c */ extern unsigned char f2fs_filetype_table[F2FS_FT_MAX]; -void set_de_type(struct f2fs_dir_entry *, struct inode *); -struct f2fs_dir_entry *find_target_dentry(struct qstr *, int *, - struct f2fs_dentry_ptr *); +void set_de_type(struct f2fs_dir_entry *, umode_t); + +struct f2fs_dir_entry *find_target_dentry(struct f2fs_filename *, + f2fs_hash_t, int *, struct f2fs_dentry_ptr *); bool f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *, - unsigned int); + unsigned int, struct f2fs_str *); void do_make_empty_dir(struct inode *, struct inode *, struct f2fs_dentry_ptr *); struct page *init_inode_metadata(struct inode *, struct inode *, @@ -1381,23 +1605,26 @@ struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **); ino_t f2fs_inode_by_name(struct inode *, struct qstr *); void f2fs_set_link(struct inode *, struct f2fs_dir_entry *, struct page *, struct inode *); -int update_dent_inode(struct inode *, const struct qstr *); -int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *); +int update_dent_inode(struct inode *, struct inode *, const struct qstr *); +void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *, + const struct qstr *, f2fs_hash_t , unsigned int); +int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *, nid_t, + umode_t); void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *, struct inode *); int f2fs_do_tmpfile(struct inode *, struct inode *); -int f2fs_make_empty(struct inode *, struct inode *); bool f2fs_empty_dir(struct inode *); static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) { - return __f2fs_add_link(dentry->d_parent->d_inode, &dentry->d_name, - inode); + return __f2fs_add_link(d_inode(dentry->d_parent), &dentry->d_name, + inode, inode->i_ino, inode->i_mode); } /* * super.c */ +int f2fs_commit_super(struct f2fs_sb_info *, bool); int f2fs_sync_fs(struct super_block *, int); extern __printf(3, 4) void f2fs_msg(struct super_block *, const char *, const char *, ...); @@ -1414,8 +1641,8 @@ struct dnode_of_data; struct node_info; bool available_free_memory(struct f2fs_sb_info *, int); +int need_dentry_mark(struct f2fs_sb_info *, nid_t); bool is_checkpointed_node(struct f2fs_sb_info *, nid_t); -bool has_fsynced_inode(struct f2fs_sb_info *, nid_t); bool need_inode_block_update(struct f2fs_sb_info *, nid_t); void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *); int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); @@ -1456,21 +1683,20 @@ int create_flush_cmd_control(struct f2fs_sb_info *); void destroy_flush_cmd_control(struct f2fs_sb_info *); void invalidate_blocks(struct f2fs_sb_info *, block_t); void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); -void clear_prefree_segments(struct f2fs_sb_info *); +void clear_prefree_segments(struct f2fs_sb_info *, struct cp_control *); void release_discard_addrs(struct f2fs_sb_info *); void discard_next_dnode(struct f2fs_sb_info *, block_t); int npages_for_summary_flush(struct f2fs_sb_info *, bool); void allocate_new_segments(struct f2fs_sb_info *); int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *); struct page *get_sum_page(struct f2fs_sb_info *, unsigned int); +void update_meta_page(struct f2fs_sb_info *, void *, block_t); void write_meta_page(struct f2fs_sb_info *, struct page *); -void write_node_page(struct f2fs_sb_info *, struct 
page *, - unsigned int, struct f2fs_io_info *); -void write_data_page(struct page *, struct dnode_of_data *, - struct f2fs_io_info *); -void rewrite_data_page(struct page *, struct f2fs_io_info *); -void recover_data_page(struct f2fs_sb_info *, struct page *, - struct f2fs_summary *, block_t, block_t); +void write_node_page(unsigned int, struct f2fs_io_info *); +void write_data_page(struct dnode_of_data *, struct f2fs_io_info *); +void rewrite_data_page(struct f2fs_io_info *); +void f2fs_replace_block(struct f2fs_sb_info *, struct dnode_of_data *, + block_t, block_t, unsigned char, bool); void allocate_data_block(struct f2fs_sb_info *, struct page *, block_t, block_t *, struct f2fs_summary *, int); void f2fs_wait_on_page_writeback(struct page *, enum page_type); @@ -1489,6 +1715,7 @@ void destroy_segment_manager_caches(void); */ struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t); struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t); +bool is_valid_blkaddr(struct f2fs_sb_info *, block_t, int); int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int); void ra_meta_pages_cond(struct f2fs_sb_info *, pgoff_t); long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); @@ -1515,18 +1742,25 @@ void destroy_checkpoint_caches(void); * data.c */ void f2fs_submit_merged_bio(struct f2fs_sb_info *, enum page_type, int); -int f2fs_submit_page_bio(struct f2fs_sb_info *, struct page *, - struct f2fs_io_info *); -void f2fs_submit_page_mbio(struct f2fs_sb_info *, struct page *, - struct f2fs_io_info *); +int f2fs_submit_page_bio(struct f2fs_io_info *); +void f2fs_submit_page_mbio(struct f2fs_io_info *); +void set_data_blkaddr(struct dnode_of_data *); int reserve_new_block(struct dnode_of_data *); int f2fs_reserve_block(struct dnode_of_data *, pgoff_t); -void update_extent_cache(struct dnode_of_data *); -struct page *find_data_page(struct inode *, pgoff_t, bool); +void f2fs_shrink_extent_tree(struct f2fs_sb_info *, int); +void f2fs_destroy_extent_tree(struct inode *); +void f2fs_init_extent_cache(struct inode *, struct f2fs_extent *); +void f2fs_update_extent_cache(struct dnode_of_data *); +void f2fs_preserve_extent_tree(struct inode *); +struct page *get_read_data_page(struct inode *, pgoff_t, int); +struct page *find_data_page(struct inode *, pgoff_t); struct page *get_lock_data_page(struct inode *, pgoff_t); struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool); -int do_write_data_page(struct page *, struct f2fs_io_info *); +int do_write_data_page(struct f2fs_io_info *); int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64); +void init_extent_cache_info(struct f2fs_sb_info *); +int __init create_extent_cache(void); +void destroy_extent_cache(void); void f2fs_invalidate_page(struct page *, unsigned int, unsigned int); int f2fs_release_page(struct page *, gfp_t); @@ -1554,7 +1788,7 @@ struct f2fs_stat_info { struct f2fs_sb_info *sbi; int all_area_segs, sit_area_segs, nat_area_segs, ssa_area_segs; int main_area_segs, main_area_sections, main_area_zones; - int hit_ext, total_ext; + int hit_ext, total_ext, ext_tree, ext_node; int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta; int nats, dirty_nats, sits, dirty_sits, fnids; int total_count, utilization; @@ -1566,7 +1800,9 @@ struct f2fs_stat_info { int dirty_count, node_pages, meta_pages; int prefree_count, call_count, cp_count; int tot_segs, node_segs, data_segs, free_segs, free_secs; + int bg_node_segs, bg_data_segs; int tot_blks, data_blks, node_blks; + int bg_data_blks, bg_node_blks; 
int curseg[NR_CURSEG_TYPE]; int cursec[NR_CURSEG_TYPE]; int curzone[NR_CURSEG_TYPE]; @@ -1615,31 +1851,36 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) ((sbi)->block_count[(curseg)->alloc_type]++) #define stat_inc_inplace_blocks(sbi) \ (atomic_inc(&(sbi)->inplace_count)) -#define stat_inc_seg_count(sbi, type) \ +#define stat_inc_seg_count(sbi, type, gc_type) \ do { \ struct f2fs_stat_info *si = F2FS_STAT(sbi); \ (si)->tot_segs++; \ - if (type == SUM_TYPE_DATA) \ + if (type == SUM_TYPE_DATA) { \ si->data_segs++; \ - else \ + si->bg_data_segs += (gc_type == BG_GC) ? 1 : 0; \ + } else { \ si->node_segs++; \ + si->bg_node_segs += (gc_type == BG_GC) ? 1 : 0; \ + } \ } while (0) #define stat_inc_tot_blk_count(si, blks) \ (si->tot_blks += (blks)) -#define stat_inc_data_blk_count(sbi, blks) \ +#define stat_inc_data_blk_count(sbi, blks, gc_type) \ do { \ struct f2fs_stat_info *si = F2FS_STAT(sbi); \ stat_inc_tot_blk_count(si, blks); \ si->data_blks += (blks); \ + si->bg_data_blks += (gc_type == BG_GC) ? (blks) : 0; \ } while (0) -#define stat_inc_node_blk_count(sbi, blks) \ +#define stat_inc_node_blk_count(sbi, blks, gc_type) \ do { \ struct f2fs_stat_info *si = F2FS_STAT(sbi); \ stat_inc_tot_blk_count(si, blks); \ si->node_blks += (blks); \ + si->bg_node_blks += (gc_type == BG_GC) ? (blks) : 0; \ } while (0) int f2fs_build_stats(struct f2fs_sb_info *); @@ -1661,10 +1902,10 @@ void f2fs_destroy_root_stats(void); #define stat_inc_seg_type(sbi, curseg) #define stat_inc_block_count(sbi, curseg) #define stat_inc_inplace_blocks(sbi) -#define stat_inc_seg_count(si, type) +#define stat_inc_seg_count(sbi, type, gc_type) #define stat_inc_tot_blk_count(si, blks) -#define stat_inc_data_blk_count(si, blks) -#define stat_inc_node_blk_count(sbi, blks) +#define stat_inc_data_blk_count(sbi, blks, gc_type) +#define stat_inc_node_blk_count(sbi, blks, gc_type) static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { } @@ -1680,26 +1921,162 @@ extern const struct address_space_operations f2fs_node_aops; extern const struct address_space_operations f2fs_meta_aops; extern const struct inode_operations f2fs_dir_inode_operations; extern const struct inode_operations f2fs_symlink_inode_operations; +extern const struct inode_operations f2fs_encrypted_symlink_inode_operations; extern const struct inode_operations f2fs_special_inode_operations; extern struct kmem_cache *inode_entry_slab; /* * inline.c */ -bool f2fs_may_inline(struct inode *); +bool f2fs_may_inline_data(struct inode *); +bool f2fs_may_inline_dentry(struct inode *); void read_inline_data(struct page *, struct page *); +bool truncate_inline_inode(struct page *, u64); int f2fs_read_inline_data(struct inode *, struct page *); int f2fs_convert_inline_page(struct dnode_of_data *, struct page *); int f2fs_convert_inline_inode(struct inode *); int f2fs_write_inline_data(struct inode *, struct page *); bool recover_inline_data(struct inode *, struct page *); -struct f2fs_dir_entry *find_in_inline_dir(struct inode *, struct qstr *, - struct page **); +struct f2fs_dir_entry *find_in_inline_dir(struct inode *, + struct f2fs_filename *, struct page **); struct f2fs_dir_entry *f2fs_parent_inline_dir(struct inode *, struct page **); int make_empty_inline_dir(struct inode *inode, struct inode *, struct page *); -int f2fs_add_inline_entry(struct inode *, const struct qstr *, struct inode *); +int f2fs_add_inline_entry(struct inode *, const struct qstr *, struct 
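Each stat macro above is wrapped in do { ... } while (0) so a multi-statement macro still behaves as a single statement after an unbraced if/else, and the new gc_type argument lets blocks and segments moved by background GC be accounted separately. The pattern in miniature; BG_GC/FG_GC and the stat fields are simplified stand-ins for the kernel definitions:

#include <stdio.h>

enum { FG_GC, BG_GC };		/* stand-ins for the kernel enums */

struct stat_info { int data_blks, bg_data_blks, tot_blks; };

/* do-while(0) makes the multi-statement macro expand as one statement */
#define stat_inc_data_blk_count(si, blks, gc_type)		\
	do {							\
		(si)->tot_blks += (blks);			\
		(si)->data_blks += (blks);			\
		(si)->bg_data_blks +=				\
			((gc_type) == BG_GC) ? (blks) : 0;	\
	} while (0)

int main(void)
{
	struct stat_info si = { 0, 0, 0 };
	int background = 1;

	if (background)
		stat_inc_data_blk_count(&si, 4, BG_GC);	/* safe without {} */
	else
		stat_inc_data_blk_count(&si, 4, FG_GC);

	printf("data=%d bg=%d tot=%d\n",
	       si.data_blks, si.bg_data_blks, si.tot_blks);
	return 0;
}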
inode *, + nid_t, umode_t); void f2fs_delete_inline_entry(struct f2fs_dir_entry *, struct page *, struct inode *, struct inode *); bool f2fs_empty_inline_dir(struct inode *); -int f2fs_read_inline_dir(struct file *, struct dir_context *); +int f2fs_read_inline_dir(struct file *, struct dir_context *, + struct f2fs_str *); + +/* + * crypto support + */ +static inline int f2fs_encrypted_inode(struct inode *inode) +{ +#ifdef CONFIG_F2FS_FS_ENCRYPTION + return file_is_encrypt(inode); +#else + return 0; +#endif +} + +static inline void f2fs_set_encrypted_inode(struct inode *inode) +{ +#ifdef CONFIG_F2FS_FS_ENCRYPTION + file_set_encrypt(inode); +#endif +} + +static inline bool f2fs_bio_encrypted(struct bio *bio) +{ +#ifdef CONFIG_F2FS_FS_ENCRYPTION + return unlikely(bio->bi_private != NULL); +#else + return false; +#endif +} + +static inline int f2fs_sb_has_crypto(struct super_block *sb) +{ +#ifdef CONFIG_F2FS_FS_ENCRYPTION + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_ENCRYPT); +#else + return 0; +#endif +} + +static inline bool f2fs_may_encrypt(struct inode *inode) +{ +#ifdef CONFIG_F2FS_FS_ENCRYPTION + mode_t mode = inode->i_mode; + + return (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)); +#else + return 0; +#endif +} + +/* crypto_policy.c */ +int f2fs_is_child_context_consistent_with_parent(struct inode *, + struct inode *); +int f2fs_inherit_context(struct inode *, struct inode *, struct page *); +int f2fs_process_policy(const struct f2fs_encryption_policy *, struct inode *); +int f2fs_get_policy(struct inode *, struct f2fs_encryption_policy *); + +/* crypt.c */ +extern struct kmem_cache *f2fs_crypt_info_cachep; +bool f2fs_valid_contents_enc_mode(uint32_t); +uint32_t f2fs_validate_encryption_key_size(uint32_t, uint32_t); +struct f2fs_crypto_ctx *f2fs_get_crypto_ctx(struct inode *); +void f2fs_release_crypto_ctx(struct f2fs_crypto_ctx *); +struct page *f2fs_encrypt(struct inode *, struct page *); +int f2fs_decrypt(struct f2fs_crypto_ctx *, struct page *); +int f2fs_decrypt_one(struct inode *, struct page *); +void f2fs_end_io_crypto_work(struct f2fs_crypto_ctx *, struct bio *); + +/* crypto_key.c */ +void f2fs_free_encryption_info(struct inode *, struct f2fs_crypt_info *); +int _f2fs_get_encryption_info(struct inode *inode); + +/* crypto_fname.c */ +bool f2fs_valid_filenames_enc_mode(uint32_t); +u32 f2fs_fname_crypto_round_up(u32, u32); +int f2fs_fname_crypto_alloc_buffer(struct inode *, u32, struct f2fs_str *); +int f2fs_fname_disk_to_usr(struct inode *, f2fs_hash_t *, + const struct f2fs_str *, struct f2fs_str *); +int f2fs_fname_usr_to_disk(struct inode *, const struct qstr *, + struct f2fs_str *); + +#ifdef CONFIG_F2FS_FS_ENCRYPTION +void f2fs_restore_and_release_control_page(struct page **); +void f2fs_restore_control_page(struct page *); + +int __init f2fs_init_crypto(void); +int f2fs_crypto_initialize(void); +void f2fs_exit_crypto(void); + +int f2fs_has_encryption_key(struct inode *); + +static inline int f2fs_get_encryption_info(struct inode *inode) +{ + struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; + + if (!ci || + (ci->ci_keyring_key && + (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | + (1 << KEY_FLAG_REVOKED) | + (1 << KEY_FLAG_DEAD))))) + return _f2fs_get_encryption_info(inode); + return 0; +} + +void f2fs_fname_crypto_free_buffer(struct f2fs_str *); +int f2fs_fname_setup_filename(struct inode *, const struct qstr *, + int lookup, struct f2fs_filename *); +void f2fs_fname_free_filename(struct f2fs_filename *); +#else +static inline void 
f2fs_restore_and_release_control_page(struct page **p) { } +static inline void f2fs_restore_control_page(struct page *p) { } + +static inline int __init f2fs_init_crypto(void) { return 0; } +static inline void f2fs_exit_crypto(void) { } + +static inline int f2fs_has_encryption_key(struct inode *i) { return 0; } +static inline int f2fs_get_encryption_info(struct inode *i) { return 0; } +static inline void f2fs_fname_crypto_free_buffer(struct f2fs_str *p) { } + +static inline int f2fs_fname_setup_filename(struct inode *dir, + const struct qstr *iname, + int lookup, struct f2fs_filename *fname) +{ + memset(fname, 0, sizeof(struct f2fs_filename)); + fname->usr_fname = iname; + fname->disk_name.name = (unsigned char *)iname->name; + fname->disk_name.len = iname->len; + return 0; +} + +static inline void f2fs_fname_free_filename(struct f2fs_filename *fname) { } +#endif #endif diff --git a/fs/f2fs/f2fs_crypto.h b/fs/f2fs/f2fs_crypto.h new file mode 100644 index 000000000000..c2c1c2b63b25 --- /dev/null +++ b/fs/f2fs/f2fs_crypto.h @@ -0,0 +1,151 @@ +/* + * linux/fs/f2fs/f2fs_crypto.h + * + * Copied from linux/fs/ext4/ext4_crypto.h + * + * Copyright (C) 2015, Google, Inc. + * + * This contains encryption header content for f2fs + * + * Written by Michael Halcrow, 2015. + * Modified by Jaegeuk Kim, 2015. + */ +#ifndef _F2FS_CRYPTO_H +#define _F2FS_CRYPTO_H + +#include <linux/fs.h> + +#define F2FS_KEY_DESCRIPTOR_SIZE 8 + +/* Policy provided via an ioctl on the topmost directory */ +struct f2fs_encryption_policy { + char version; + char contents_encryption_mode; + char filenames_encryption_mode; + char flags; + char master_key_descriptor[F2FS_KEY_DESCRIPTOR_SIZE]; +} __attribute__((__packed__)); + +#define F2FS_ENCRYPTION_CONTEXT_FORMAT_V1 1 +#define F2FS_KEY_DERIVATION_NONCE_SIZE 16 + +#define F2FS_POLICY_FLAGS_PAD_4 0x00 +#define F2FS_POLICY_FLAGS_PAD_8 0x01 +#define F2FS_POLICY_FLAGS_PAD_16 0x02 +#define F2FS_POLICY_FLAGS_PAD_32 0x03 +#define F2FS_POLICY_FLAGS_PAD_MASK 0x03 +#define F2FS_POLICY_FLAGS_VALID 0x03 + +/** + * Encryption context for inode + * + * Protector format: + * 1 byte: Protector format (1 = this version) + * 1 byte: File contents encryption mode + * 1 byte: File names encryption mode + * 1 byte: Flags + * 8 bytes: Master Key descriptor + * 16 bytes: Encryption Key derivation nonce + */ +struct f2fs_encryption_context { + char format; + char contents_encryption_mode; + char filenames_encryption_mode; + char flags; + char master_key_descriptor[F2FS_KEY_DESCRIPTOR_SIZE]; + char nonce[F2FS_KEY_DERIVATION_NONCE_SIZE]; +} __attribute__((__packed__)); + +/* Encryption parameters */ +#define F2FS_XTS_TWEAK_SIZE 16 +#define F2FS_AES_128_ECB_KEY_SIZE 16 +#define F2FS_AES_256_GCM_KEY_SIZE 32 +#define F2FS_AES_256_CBC_KEY_SIZE 32 +#define F2FS_AES_256_CTS_KEY_SIZE 32 +#define F2FS_AES_256_XTS_KEY_SIZE 64 +#define F2FS_MAX_KEY_SIZE 64 + +#define F2FS_KEY_DESC_PREFIX "f2fs:" +#define F2FS_KEY_DESC_PREFIX_SIZE 5 + +struct f2fs_encryption_key { + __u32 mode; + char raw[F2FS_MAX_KEY_SIZE]; + __u32 size; +} __attribute__((__packed__)); + +struct f2fs_crypt_info { + char ci_data_mode; + char ci_filename_mode; + char ci_flags; + struct crypto_ablkcipher *ci_ctfm; + struct key *ci_keyring_key; + char ci_master_key[F2FS_KEY_DESCRIPTOR_SIZE]; +}; + +#define F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 +#define F2FS_WRITE_PATH_FL 0x00000002 + +struct f2fs_crypto_ctx { + union { + struct { + struct page *bounce_page; /* Ciphertext page */ + struct page *control_page; /* Original page */ + } w; + struct { 
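The #ifdef CONFIG_F2FS_FS_ENCRYPTION section above pairs each real crypto entry point with a static inline stub in the #else branch, so call sites compile unchanged whether or not encryption is built in and cost nothing when it is not. A compilable miniature of that gating pattern, using a hypothetical FEATURE_X in place of the Kconfig symbol:

#include <stdio.h>

/* #define FEATURE_X 1 */		/* flip on to build the real path */

#ifdef FEATURE_X
static inline int feature_get_info(int id)
{
	printf("real lookup for %d\n", id);	/* heavyweight work here */
	return 0;
}
#else
/* stub: same signature, zero cost, callers need no #ifdef of their own */
static inline int feature_get_info(int id)
{
	(void)id;
	return 0;
}
#endif

int main(void)
{
	/* the call site is identical with the feature on or off */
	return feature_get_info(42);
}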
+ struct bio *bio; + struct work_struct work; + } r; + struct list_head free_list; /* Free list */ + }; + char flags; /* Flags */ +}; + +struct f2fs_completion_result { + struct completion completion; + int res; +}; + +#define DECLARE_F2FS_COMPLETION_RESULT(ecr) \ + struct f2fs_completion_result ecr = { \ + COMPLETION_INITIALIZER((ecr).completion), 0 } + +static inline int f2fs_encryption_key_size(int mode) +{ + switch (mode) { + case F2FS_ENCRYPTION_MODE_AES_256_XTS: + return F2FS_AES_256_XTS_KEY_SIZE; + case F2FS_ENCRYPTION_MODE_AES_256_GCM: + return F2FS_AES_256_GCM_KEY_SIZE; + case F2FS_ENCRYPTION_MODE_AES_256_CBC: + return F2FS_AES_256_CBC_KEY_SIZE; + case F2FS_ENCRYPTION_MODE_AES_256_CTS: + return F2FS_AES_256_CTS_KEY_SIZE; + default: + BUG(); + } + return 0; +} + +#define F2FS_FNAME_NUM_SCATTER_ENTRIES 4 +#define F2FS_CRYPTO_BLOCK_SIZE 16 +#define F2FS_FNAME_CRYPTO_DIGEST_SIZE 32 + +/** + * For encrypted symlinks, the ciphertext length is stored at the beginning + * of the string in little-endian format. + */ +struct f2fs_encrypted_symlink_data { + __le16 len; + char encrypted_path[1]; +} __attribute__((__packed__)); + +/** + * This function is used to calculate the disk space required to + * store a filename of length l in encrypted symlink format. + */ +static inline u32 encrypted_symlink_data_len(u32 l) +{ + return (l + sizeof(struct f2fs_encrypted_symlink_data) - 1); +} +#endif /* _F2FS_CRYPTO_H */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 98dac27bc3f7..b0f38c3b37f4 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -20,6 +20,7 @@ #include <linux/uaccess.h> #include <linux/mount.h> #include <linux/pagevec.h> +#include <linux/random.h> #include "f2fs.h" #include "node.h" @@ -105,7 +106,7 @@ static int get_parent_ino(struct inode *inode, nid_t *pino) if (!dentry) return 0; - if (update_dent_inode(inode, &dentry->d_name)) { + if (update_dent_inode(inode, inode, &dentry->d_name)) { dput(dentry); return 0; } @@ -122,6 +123,8 @@ static inline bool need_do_checkpoint(struct inode *inode) if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1) need_cp = true; + else if (file_enc_name(inode) && need_dentry_mark(sbi, inode->i_ino)) + need_cp = true; else if (file_wrong_pino(inode)) need_cp = true; else if (!space_for_roll_forward(sbi)) @@ -241,6 +244,8 @@ go_write: * will be used only for fsynced inodes after checkpoint. 
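Stepping back to the f2fs_crypto.h structures above: they are declared __packed because they are serialized to disk byte-for-byte, e.g. the encryption context must always be 4 mode/flag bytes, an 8-byte master-key descriptor, and a 16-byte nonce (28 bytes total). A quick userspace check of that layout and of the encrypted_symlink_data_len() arithmetic, with uint16_t standing in for __le16 on a little-endian host:

#include <stdint.h>
#include <stdio.h>

#define F2FS_KEY_DESCRIPTOR_SIZE	8
#define F2FS_KEY_DERIVATION_NONCE_SIZE	16

/* as in the patch: packed so the layout matches the bytes on disk */
struct f2fs_encryption_context {
	char format;
	char contents_encryption_mode;
	char filenames_encryption_mode;
	char flags;
	char master_key_descriptor[F2FS_KEY_DESCRIPTOR_SIZE];
	char nonce[F2FS_KEY_DERIVATION_NONCE_SIZE];
} __attribute__((__packed__));

struct f2fs_encrypted_symlink_data {
	uint16_t len;			/* __le16 in the kernel */
	char encrypted_path[1];
} __attribute__((__packed__));

/* same arithmetic as encrypted_symlink_data_len(): header + payload - 1 */
static uint32_t encrypted_symlink_data_len(uint32_t l)
{
	return l + sizeof(struct f2fs_encrypted_symlink_data) - 1;
}

int main(void)
{
	/* 4 mode/flag bytes + 8-byte descriptor + 16-byte nonce = 28 */
	printf("context: %zu bytes\n",
	       sizeof(struct f2fs_encryption_context));
	printf("symlink(10): %u bytes\n", encrypted_symlink_data_len(10));
	return 0;
}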
*/ try_to_fix_pino(inode); + clear_inode_flag(fi, FI_APPEND_WRITE); + clear_inode_flag(fi, FI_UPDATE_WRITE); goto out; } sync_nodes: @@ -269,7 +274,7 @@ flush_out: ret = f2fs_issue_flush(sbi); out: trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); - f2fs_trace_ios(NULL, NULL, 1); + f2fs_trace_ios(NULL, 1); return ret; } @@ -405,6 +410,12 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); + if (f2fs_encrypted_inode(inode)) { + int err = f2fs_get_encryption_info(inode); + if (err) + return 0; + } + /* we don't need to use inline_data strictly */ if (f2fs_has_inline_data(inode)) { int err = f2fs_convert_inline_inode(inode); @@ -417,6 +428,18 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) return 0; } +static int f2fs_file_open(struct inode *inode, struct file *filp) +{ + int ret = generic_file_open(inode, filp); + + if (!ret && f2fs_encrypted_inode(inode)) { + ret = f2fs_get_encryption_info(inode); + if (ret) + ret = -EACCES; + } + return ret; +} + int truncate_data_blocks_range(struct dnode_of_data *dn, int count) { int nr_free = 0, ofs = dn->ofs_in_node; @@ -433,8 +456,12 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count) continue; dn->data_blkaddr = NULL_ADDR; - update_extent_cache(dn); + set_data_blkaddr(dn); + f2fs_update_extent_cache(dn); invalidate_blocks(sbi, blkaddr); + if (dn->ofs_in_node == 0 && IS_INODE(dn->node_page)) + clear_inode_flag(F2FS_I(dn->inode), + FI_FIRST_BLOCK_WRITTEN); nr_free++; } if (nr_free) { @@ -454,27 +481,33 @@ void truncate_data_blocks(struct dnode_of_data *dn) truncate_data_blocks_range(dn, ADDRS_PER_BLOCK); } -static int truncate_partial_data_page(struct inode *inode, u64 from) +static int truncate_partial_data_page(struct inode *inode, u64 from, + bool cache_only) { unsigned offset = from & (PAGE_CACHE_SIZE - 1); + pgoff_t index = from >> PAGE_CACHE_SHIFT; + struct address_space *mapping = inode->i_mapping; struct page *page; - if (!offset) + if (!offset && !cache_only) return 0; - page = find_data_page(inode, from >> PAGE_CACHE_SHIFT, false); - if (IS_ERR(page)) + if (cache_only) { + page = grab_cache_page(mapping, index); + if (page && PageUptodate(page)) + goto truncate_out; + f2fs_put_page(page, 1); return 0; + } - lock_page(page); - if (unlikely(!PageUptodate(page) || - page->mapping != inode->i_mapping)) - goto out; - + page = get_lock_data_page(inode, index); + if (IS_ERR(page)) + return 0; +truncate_out: f2fs_wait_on_page_writeback(page, DATA); zero_user(page, offset, PAGE_CACHE_SIZE - offset); - set_page_dirty(page); -out: + if (!cache_only || !f2fs_encrypted_inode(inode) || !S_ISREG(inode->i_mode)) + set_page_dirty(page); f2fs_put_page(page, 1); return 0; } @@ -487,6 +520,7 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) pgoff_t free_from; int count = 0, err = 0; struct page *ipage; + bool truncate_page = false; trace_f2fs_truncate_blocks_enter(inode, from); @@ -502,7 +536,10 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) } if (f2fs_has_inline_data(inode)) { + if (truncate_inline_inode(ipage, from)) + set_page_dirty(ipage); f2fs_put_page(ipage, 1); + truncate_page = true; goto out; } @@ -533,7 +570,7 @@ out: /* lastly zero out the first data page */ if (!err) - err = truncate_partial_data_page(inode, from); + err = truncate_partial_data_page(inode, from, truncate_page); trace_f2fs_truncate_blocks_exit(inode, err); return err; @@ -548,7 +585,7 @@ void f2fs_truncate(struct inode *inode) 
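truncate_partial_data_page() above splits the new size into a page index (from >> PAGE_CACHE_SHIFT) and an in-page offset (from & (PAGE_CACHE_SIZE - 1)), then zeroes from that offset to the end of the page. The same arithmetic in isolation, assuming 4 KiB pages:

#include <assert.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SIZE	4096UL
#define PAGE_SHIFT	12

int main(void)
{
	unsigned long long from = 10000;	/* new i_size in bytes */
	unsigned long offset = from & (PAGE_SIZE - 1);	/* 1808 */
	unsigned long long index = from >> PAGE_SHIFT;	/* page 2 */
	char page[PAGE_SIZE];

	memset(page, 0xab, sizeof(page));	/* pretend old file data */
	/* equivalent of zero_user(page, offset, PAGE_SIZE - offset) */
	memset(page + offset, 0, PAGE_SIZE - offset);

	assert(index == 2 && offset == 10000 % 4096);
	assert(page[offset - 1] != 0 && page[offset] == 0);
	printf("page %llu zeroed from byte %lu\n", index, offset);
	return 0;
}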
trace_f2fs_truncate(inode); /* we should check inline_data size */ - if (f2fs_has_inline_data(inode) && !f2fs_may_inline(inode)) { + if (f2fs_has_inline_data(inode) && !f2fs_may_inline_data(inode)) { if (f2fs_convert_inline_inode(inode)) return; } @@ -562,7 +599,7 @@ void f2fs_truncate(struct inode *inode) int f2fs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); generic_fillattr(inode, stat); stat->blocks <<= 3; return 0; @@ -601,7 +638,7 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr) int f2fs_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct f2fs_inode_info *fi = F2FS_I(inode); int err; @@ -610,16 +647,20 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) return err; if (attr->ia_valid & ATTR_SIZE) { - if (attr->ia_size != i_size_read(inode)) { + if (f2fs_encrypted_inode(inode) && + f2fs_get_encryption_info(inode)) + return -EACCES; + + if (attr->ia_size <= i_size_read(inode)) { truncate_setsize(inode, attr->ia_size); f2fs_truncate(inode); f2fs_balance_fs(F2FS_I_SB(inode)); } else { /* - * giving a chance to truncate blocks past EOF which - * are fallocated with FALLOC_FL_KEEP_SIZE. + * do not trim all blocks after i_size if target size is + * larger than i_size. */ - f2fs_truncate(inode); + truncate_setsize(inode, attr->ia_size); } } @@ -706,10 +747,6 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) if (!S_ISREG(inode->i_mode)) return -EOPNOTSUPP; - /* skip punching hole beyond i_size */ - if (offset >= inode->i_size) - return ret; - if (f2fs_has_inline_data(inode)) { ret = f2fs_convert_inline_inode(inode); if (ret) @@ -753,6 +790,320 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) return ret; } +static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct dnode_of_data dn; + pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; + int ret = 0; + + f2fs_lock_op(sbi); + + for (; end < nrpages; start++, end++) { + block_t new_addr, old_addr; + + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = get_dnode_of_data(&dn, end, LOOKUP_NODE_RA); + if (ret && ret != -ENOENT) { + goto out; + } else if (ret == -ENOENT) { + new_addr = NULL_ADDR; + } else { + new_addr = dn.data_blkaddr; + truncate_data_blocks_range(&dn, 1); + f2fs_put_dnode(&dn); + } + + if (new_addr == NULL_ADDR) { + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = get_dnode_of_data(&dn, start, LOOKUP_NODE_RA); + if (ret && ret != -ENOENT) + goto out; + else if (ret == -ENOENT) + continue; + + if (dn.data_blkaddr == NULL_ADDR) { + f2fs_put_dnode(&dn); + continue; + } else { + truncate_data_blocks_range(&dn, 1); + } + + f2fs_put_dnode(&dn); + } else { + struct page *ipage; + + ipage = get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) { + ret = PTR_ERR(ipage); + goto out; + } + + set_new_dnode(&dn, inode, ipage, NULL, 0); + ret = f2fs_reserve_block(&dn, start); + if (ret) + goto out; + + old_addr = dn.data_blkaddr; + if (old_addr != NEW_ADDR && new_addr == NEW_ADDR) { + dn.data_blkaddr = NULL_ADDR; + f2fs_update_extent_cache(&dn); + invalidate_blocks(sbi, old_addr); + + dn.data_blkaddr = new_addr; + set_data_blkaddr(&dn); + } else if (new_addr != NEW_ADDR) { + struct node_info ni; + + get_node_info(sbi, dn.nid, &ni); + f2fs_replace_block(sbi, &dn, old_addr, 
new_addr, + ni.version, true); + } + + f2fs_put_dnode(&dn); + } + } + ret = 0; +out: + f2fs_unlock_op(sbi); + return ret; +} + +static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) +{ + pgoff_t pg_start, pg_end; + loff_t new_size; + int ret; + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + if (offset + len >= i_size_read(inode)) + return -EINVAL; + + /* collapse range should be aligned to block size of f2fs. */ + if (offset & (F2FS_BLKSIZE - 1) || len & (F2FS_BLKSIZE - 1)) + return -EINVAL; + + pg_start = offset >> PAGE_CACHE_SHIFT; + pg_end = (offset + len) >> PAGE_CACHE_SHIFT; + + /* write out all dirty pages from offset */ + ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); + if (ret) + return ret; + + truncate_pagecache(inode, offset); + + ret = f2fs_do_collapse(inode, pg_start, pg_end); + if (ret) + return ret; + + new_size = i_size_read(inode) - len; + + ret = truncate_blocks(inode, new_size, true); + if (!ret) + i_size_write(inode, new_size); + + return ret; +} + +static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, + int mode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct address_space *mapping = inode->i_mapping; + pgoff_t index, pg_start, pg_end; + loff_t new_size = i_size_read(inode); + loff_t off_start, off_end; + int ret = 0; + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + ret = inode_newsize_ok(inode, (len + offset)); + if (ret) + return ret; + + f2fs_balance_fs(sbi); + + if (f2fs_has_inline_data(inode)) { + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + } + + ret = filemap_write_and_wait_range(mapping, offset, offset + len - 1); + if (ret) + return ret; + + truncate_pagecache_range(inode, offset, offset + len - 1); + + pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; + pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; + + off_start = offset & (PAGE_CACHE_SIZE - 1); + off_end = (offset + len) & (PAGE_CACHE_SIZE - 1); + + if (pg_start == pg_end) { + fill_zero(inode, pg_start, off_start, off_end - off_start); + if (offset + len > new_size) + new_size = offset + len; + new_size = max_t(loff_t, new_size, offset + len); + } else { + if (off_start) { + fill_zero(inode, pg_start++, off_start, + PAGE_CACHE_SIZE - off_start); + new_size = max_t(loff_t, new_size, + pg_start << PAGE_CACHE_SHIFT); + } + + for (index = pg_start; index < pg_end; index++) { + struct dnode_of_data dn; + struct page *ipage; + + f2fs_lock_op(sbi); + + ipage = get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) { + ret = PTR_ERR(ipage); + f2fs_unlock_op(sbi); + goto out; + } + + set_new_dnode(&dn, inode, ipage, NULL, 0); + ret = f2fs_reserve_block(&dn, index); + if (ret) { + f2fs_unlock_op(sbi); + goto out; + } + + if (dn.data_blkaddr != NEW_ADDR) { + invalidate_blocks(sbi, dn.data_blkaddr); + + dn.data_blkaddr = NEW_ADDR; + set_data_blkaddr(&dn); + + dn.data_blkaddr = NULL_ADDR; + f2fs_update_extent_cache(&dn); + } + f2fs_put_dnode(&dn); + f2fs_unlock_op(sbi); + + new_size = max_t(loff_t, new_size, + (index + 1) << PAGE_CACHE_SHIFT); + } + + if (off_end) { + fill_zero(inode, pg_end, 0, off_end); + new_size = max_t(loff_t, new_size, offset + len); + } + } + +out: + if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) { + i_size_write(inode, new_size); + mark_inode_dirty(inode); + update_inode_page(inode); + } + + return ret; +} + +static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) +{ + struct f2fs_sb_info *sbi = 
F2FS_I_SB(inode); + pgoff_t pg_start, pg_end, delta, nrpages, idx; + loff_t new_size; + int ret; + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + new_size = i_size_read(inode) + len; + if (new_size > inode->i_sb->s_maxbytes) + return -EFBIG; + + if (offset >= i_size_read(inode)) + return -EINVAL; + + /* insert range should be aligned to block size of f2fs. */ + if (offset & (F2FS_BLKSIZE - 1) || len & (F2FS_BLKSIZE - 1)) + return -EINVAL; + + f2fs_balance_fs(sbi); + + ret = truncate_blocks(inode, i_size_read(inode), true); + if (ret) + return ret; + + /* write out all dirty pages from offset */ + ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); + if (ret) + return ret; + + truncate_pagecache(inode, offset); + + pg_start = offset >> PAGE_CACHE_SHIFT; + pg_end = (offset + len) >> PAGE_CACHE_SHIFT; + delta = pg_end - pg_start; + nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; + + for (idx = nrpages - 1; idx >= pg_start && idx != -1; idx--) { + struct dnode_of_data dn; + struct page *ipage; + block_t new_addr, old_addr; + + f2fs_lock_op(sbi); + + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = get_dnode_of_data(&dn, idx, LOOKUP_NODE_RA); + if (ret && ret != -ENOENT) { + goto out; + } else if (ret == -ENOENT) { + goto next; + } else if (dn.data_blkaddr == NULL_ADDR) { + f2fs_put_dnode(&dn); + goto next; + } else { + new_addr = dn.data_blkaddr; + truncate_data_blocks_range(&dn, 1); + f2fs_put_dnode(&dn); + } + + ipage = get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) { + ret = PTR_ERR(ipage); + goto out; + } + + set_new_dnode(&dn, inode, ipage, NULL, 0); + ret = f2fs_reserve_block(&dn, idx + delta); + if (ret) + goto out; + + old_addr = dn.data_blkaddr; + f2fs_bug_on(sbi, old_addr != NEW_ADDR); + + if (new_addr != NEW_ADDR) { + struct node_info ni; + + get_node_info(sbi, dn.nid, &ni); + f2fs_replace_block(sbi, &dn, old_addr, new_addr, + ni.version, true); + } + f2fs_put_dnode(&dn); +next: + f2fs_unlock_op(sbi); + } + + i_size_write(inode, new_size); + return 0; +out: + f2fs_unlock_op(sbi); + return ret; +} + static int expand_inode_data(struct inode *inode, loff_t offset, loff_t len, int mode) { @@ -818,23 +1169,40 @@ static long f2fs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); - long ret; + long ret = 0; + + if (f2fs_encrypted_inode(inode) && + (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE))) + return -EOPNOTSUPP; - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | + FALLOC_FL_INSERT_RANGE)) return -EOPNOTSUPP; mutex_lock(&inode->i_mutex); - if (mode & FALLOC_FL_PUNCH_HOLE) + if (mode & FALLOC_FL_PUNCH_HOLE) { + if (offset >= inode->i_size) + goto out; + ret = punch_hole(inode, offset, len); - else + } else if (mode & FALLOC_FL_COLLAPSE_RANGE) { + ret = f2fs_collapse_range(inode, offset, len); + } else if (mode & FALLOC_FL_ZERO_RANGE) { + ret = f2fs_zero_range(inode, offset, len, mode); + } else if (mode & FALLOC_FL_INSERT_RANGE) { + ret = f2fs_insert_range(inode, offset, len); + } else { ret = expand_inode_data(inode, offset, len, mode); + } if (!ret) { inode->i_mtime = inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); } +out: mutex_unlock(&inode->i_mutex); trace_f2fs_fallocate(inode, mode, offset, len, ret); @@ -963,12 +1331,13 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) if (ret) return ret; - if (f2fs_is_atomic_file(inode)) 
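With f2fs_collapse_range(), f2fs_zero_range() and f2fs_insert_range() wired into f2fs_fallocate() above, userspace can manipulate block-aligned ranges directly. A hedged usage sketch: Linux-specific, the file name is illustrative, unaligned offsets or an unsupported mode make the kernel return EINVAL, and FALLOC_FL_INSERT_RANGE is newer than many installed headers, so the sketch sticks to collapse and zero:

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("testfile", O_RDWR | O_CREAT, 0600);

	if (fd < 0) { perror("open"); return 1; }

	/* 16 blocks of 4 KiB, then drop blocks [4,8) and shift the rest left */
	if (ftruncate(fd, 16 * 4096) ||
	    fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, 4 * 4096, 4 * 4096))
		perror("collapse");	/* EINVAL if unaligned/unsupported */

	/* zero blocks [0,2) in place without changing i_size */
	if (fallocate(fd, FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE,
		      0, 2 * 4096))
		perror("zero");

	close(fd);
	return 0;
}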
+ if (f2fs_is_atomic_file(inode)) { + clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); commit_inmem_pages(inode, false); + } ret = f2fs_sync_file(filp, 0, LONG_MAX, 0); mnt_drop_write_file(filp); - clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); return ret; } @@ -997,6 +1366,9 @@ static int f2fs_ioc_release_volatile_write(struct file *filp) if (!f2fs_is_volatile_file(inode)) return 0; + if (!f2fs_is_first_block_written(inode)) + return truncate_partial_data_page(inode, 0, true); + punch_hole(inode, 0, F2FS_BLKSIZE); return 0; } @@ -1016,19 +1388,52 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp) f2fs_balance_fs(F2FS_I_SB(inode)); if (f2fs_is_atomic_file(inode)) { - commit_inmem_pages(inode, false); clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); + commit_inmem_pages(inode, false); } - if (f2fs_is_volatile_file(inode)) { + if (f2fs_is_volatile_file(inode)) clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); - filemap_fdatawrite(inode->i_mapping); - set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); - } + mnt_drop_write_file(filp); return ret; } +static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct super_block *sb = sbi->sb; + __u32 in; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (get_user(in, (__u32 __user *)arg)) + return -EFAULT; + + switch (in) { + case F2FS_GOING_DOWN_FULLSYNC: + sb = freeze_bdev(sb->s_bdev); + if (sb && !IS_ERR(sb)) { + f2fs_stop_checkpoint(sbi); + thaw_bdev(sb->s_bdev, sb); + } + break; + case F2FS_GOING_DOWN_METASYNC: + /* do checkpoint only */ + f2fs_sync_fs(sb, 1); + f2fs_stop_checkpoint(sbi); + break; + case F2FS_GOING_DOWN_NOSYNC: + f2fs_stop_checkpoint(sbi); + break; + default: + return -EINVAL; + } + return 0; +} + static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -1059,6 +1464,86 @@ static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) return 0; } +static bool uuid_is_nonzero(__u8 u[16]) +{ + int i; + + for (i = 0; i < 16; i++) + if (u[i]) + return true; + return false; +} + +static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) +{ +#ifdef CONFIG_F2FS_FS_ENCRYPTION + struct f2fs_encryption_policy policy; + struct inode *inode = file_inode(filp); + + if (copy_from_user(&policy, (struct f2fs_encryption_policy __user *)arg, + sizeof(policy))) + return -EFAULT; + + return f2fs_process_policy(&policy, inode); +#else + return -EOPNOTSUPP; +#endif +} + +static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg) +{ +#ifdef CONFIG_F2FS_FS_ENCRYPTION + struct f2fs_encryption_policy policy; + struct inode *inode = file_inode(filp); + int err; + + err = f2fs_get_policy(inode, &policy); + if (err) + return err; + + if (copy_to_user((struct f2fs_encryption_policy __user *)arg, &policy, + sizeof(policy))) + return -EFAULT; + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int err; + + if (!f2fs_sb_has_crypto(inode->i_sb)) + return -EOPNOTSUPP; + + if (uuid_is_nonzero(sbi->raw_super->encrypt_pw_salt)) + goto got_it; + + err = mnt_want_write_file(filp); + if (err) + return err; + + /* update superblock with uuid */ + generate_random_uuid(sbi->raw_super->encrypt_pw_salt); + + err = f2fs_commit_super(sbi, false); + + mnt_drop_write_file(filp); + if 
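F2FS_IOC_SHUTDOWN above takes one of the F2FS_GOING_DOWN_* values and stops the filesystem after, respectively, a frozen-bdev full sync, a checkpoint only, or nothing at all. A sketch of the call from userspace; the ioctl number below mirrors what the f2fs headers of this era define and should be double-checked against your kernel, and /mnt/f2fs is a placeholder mount point:

#include <fcntl.h>
#include <linux/types.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

/* assumed from the f2fs headers of this era -- verify before use */
#define F2FS_IOC_SHUTDOWN		_IOR('X', 125, __u32)
#define F2FS_GOING_DOWN_METASYNC	0x1	/* checkpoint, then stop */

int main(void)
{
	__u32 how = F2FS_GOING_DOWN_METASYNC;
	int fd = open("/mnt/f2fs", O_RDONLY);	/* any file on the fs */

	if (fd < 0) { perror("open"); return 1; }
	/* requires CAP_SYS_ADMIN; the fs issues no further writes after this */
	if (ioctl(fd, F2FS_IOC_SHUTDOWN, &how))
		perror("F2FS_IOC_SHUTDOWN");
	close(fd);
	return 0;
}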
(err) { + /* undo new data */ + memset(sbi->raw_super->encrypt_pw_salt, 0, 16); + return err; + } +got_it: + if (copy_to_user((__u8 __user *)arg, sbi->raw_super->encrypt_pw_salt, + 16)) + return -EFAULT; + return 0; +} + long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { switch (cmd) { @@ -1078,13 +1563,33 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_release_volatile_write(filp); case F2FS_IOC_ABORT_VOLATILE_WRITE: return f2fs_ioc_abort_volatile_write(filp); + case F2FS_IOC_SHUTDOWN: + return f2fs_ioc_shutdown(filp, arg); case FITRIM: return f2fs_ioc_fitrim(filp, arg); + case F2FS_IOC_SET_ENCRYPTION_POLICY: + return f2fs_ioc_set_encryption_policy(filp, arg); + case F2FS_IOC_GET_ENCRYPTION_POLICY: + return f2fs_ioc_get_encryption_policy(filp, arg); + case F2FS_IOC_GET_ENCRYPTION_PWSALT: + return f2fs_ioc_get_encryption_pwsalt(filp, arg); default: return -ENOTTY; } } +static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + + if (f2fs_encrypted_inode(inode) && + !f2fs_has_encryption_key(inode) && + f2fs_get_encryption_info(inode)) + return -EACCES; + + return generic_file_write_iter(iocb, from); +} + #ifdef CONFIG_COMPAT long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -1104,11 +1609,9 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) const struct file_operations f2fs_file_operations = { .llseek = f2fs_llseek, - .read = new_sync_read, - .write = new_sync_write, .read_iter = generic_file_read_iter, - .write_iter = generic_file_write_iter, - .open = generic_file_open, + .write_iter = f2fs_file_write_iter, + .open = f2fs_file_open, .release = f2fs_release_file, .mmap = f2fs_file_mmap, .fsync = f2fs_sync_file, diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 76adbc3641f1..22fb5ef37966 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -435,7 +435,7 @@ next_step: set_page_dirty(node_page); } f2fs_put_page(node_page, 1); - stat_inc_node_blk_count(sbi, 1); + stat_inc_node_blk_count(sbi, 1, gc_type); } if (initial) { @@ -518,12 +518,91 @@ static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, return 1; } -static void move_data_page(struct inode *inode, struct page *page, int gc_type) +static void move_encrypted_block(struct inode *inode, block_t bidx) { struct f2fs_io_info fio = { + .sbi = F2FS_I_SB(inode), .type = DATA, - .rw = WRITE_SYNC, + .rw = READ_SYNC, + .encrypted_page = NULL, }; + struct dnode_of_data dn; + struct f2fs_summary sum; + struct node_info ni; + struct page *page; + int err; + + /* do not read out */ + page = grab_cache_page(inode->i_mapping, bidx); + if (!page) + return; + + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = get_dnode_of_data(&dn, bidx, LOOKUP_NODE); + if (err) + goto out; + + if (unlikely(dn.data_blkaddr == NULL_ADDR)) + goto put_out; + + get_node_info(fio.sbi, dn.nid, &ni); + set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); + + /* read page */ + fio.page = page; + fio.blk_addr = dn.data_blkaddr; + + fio.encrypted_page = grab_cache_page(META_MAPPING(fio.sbi), fio.blk_addr); + if (!fio.encrypted_page) + goto put_out; + + err = f2fs_submit_page_bio(&fio); + if (err) + goto put_page_out; + + /* write page */ + lock_page(fio.encrypted_page); + + if (unlikely(!PageUptodate(fio.encrypted_page))) + goto put_page_out; + if (unlikely(fio.encrypted_page->mapping != META_MAPPING(fio.sbi))) + goto put_page_out; + + 
set_page_dirty(fio.encrypted_page); + f2fs_wait_on_page_writeback(fio.encrypted_page, META); + if (clear_page_dirty_for_io(fio.encrypted_page)) + dec_page_count(fio.sbi, F2FS_DIRTY_META); + + set_page_writeback(fio.encrypted_page); + + /* allocate block address */ + f2fs_wait_on_page_writeback(dn.node_page, NODE); + allocate_data_block(fio.sbi, NULL, fio.blk_addr, + &fio.blk_addr, &sum, CURSEG_COLD_DATA); + fio.rw = WRITE_SYNC; + f2fs_submit_page_mbio(&fio); + + dn.data_blkaddr = fio.blk_addr; + set_data_blkaddr(&dn); + f2fs_update_extent_cache(&dn); + set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE); + if (page->index == 0) + set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN); +put_page_out: + f2fs_put_page(fio.encrypted_page, 1); +put_out: + f2fs_put_dnode(&dn); +out: + f2fs_put_page(page, 1); +} + +static void move_data_page(struct inode *inode, block_t bidx, int gc_type) +{ + struct page *page; + + page = get_lock_data_page(inode, bidx); + if (IS_ERR(page)) + return; if (gc_type == BG_GC) { if (PageWriteback(page)) @@ -531,12 +610,19 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type) set_page_dirty(page); set_cold_data(page); } else { + struct f2fs_io_info fio = { + .sbi = F2FS_I_SB(inode), + .type = DATA, + .rw = WRITE_SYNC, + .page = page, + .encrypted_page = NULL, + }; + set_page_dirty(page); f2fs_wait_on_page_writeback(page, DATA); - if (clear_page_dirty_for_io(page)) inode_dec_dirty_pages(inode); set_cold_data(page); - do_write_data_page(page, &fio); + do_write_data_page(&fio); clear_cold_data(page); } out: @@ -599,10 +685,16 @@ next_step: if (IS_ERR(inode) || is_bad_inode(inode)) continue; - start_bidx = start_bidx_of_node(nofs, F2FS_I(inode)); + /* if encrypted inode, let's go phase 3 */ + if (f2fs_encrypted_inode(inode) && + S_ISREG(inode->i_mode)) { + add_gc_inode(gc_list, inode); + continue; + } - data_page = find_data_page(inode, - start_bidx + ofs_in_node, false); + start_bidx = start_bidx_of_node(nofs, F2FS_I(inode)); + data_page = get_read_data_page(inode, + start_bidx + ofs_in_node, READA); if (IS_ERR(data_page)) { iput(inode); continue; @@ -616,13 +708,13 @@ next_step: /* phase 3 */ inode = find_gc_inode(gc_list, dni.ino); if (inode) { - start_bidx = start_bidx_of_node(nofs, F2FS_I(inode)); - data_page = get_lock_data_page(inode, - start_bidx + ofs_in_node); - if (IS_ERR(data_page)) - continue; - move_data_page(inode, data_page, gc_type); - stat_inc_data_blk_count(sbi, 1); + start_bidx = start_bidx_of_node(nofs, F2FS_I(inode)) + + ofs_in_node; + if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) + move_encrypted_block(inode, start_bidx); + else + move_data_page(inode, start_bidx, gc_type); + stat_inc_data_blk_count(sbi, 1, gc_type); } } @@ -670,6 +762,15 @@ static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, sum = page_address(sum_page); + /* + * this is to avoid deadlock: + * - lock_page(sum_page) - f2fs_replace_block + * - check_valid_map() - mutex_lock(sentry_lock) + * - mutex_lock(sentry_lock) - change_curseg() + * - lock_page(sum_page) + */ + unlock_page(sum_page); + switch (GET_SUM_TYPE((&sum->footer))) { case SUM_TYPE_NODE: gc_node_segment(sbi, sum->entries, segno, gc_type); @@ -680,10 +781,10 @@ static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, } blk_finish_plug(&plug); - stat_inc_seg_count(sbi, GET_SUM_TYPE((&sum->footer))); + stat_inc_seg_count(sbi, GET_SUM_TYPE((&sum->footer)), gc_type); stat_inc_call_count(sbi->stat_info); - f2fs_put_page(sum_page, 1); + 
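The deadlock comment above describes a classic ABBA inversion: GC held the locked summary page and then wanted sentry_lock, while f2fs_replace_block() takes sentry_lock and then wants the page lock; unlocking sum_page early restores a single global order. The discipline in miniature with two pthread mutexes: both threads below take the locks in the same fixed order, which is the fix, and swapping the order in just one of them is exactly the hang being avoided:

/* build: cc -pthread demo.c */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t sentry_lock = PTHREAD_MUTEX_INITIALIZER;

/* every path takes page_lock -> sentry_lock, never the reverse */
static void *worker(void *name)
{
	pthread_mutex_lock(&page_lock);
	pthread_mutex_lock(&sentry_lock);
	printf("%s: did work under both locks\n", (const char *)name);
	pthread_mutex_unlock(&sentry_lock);
	pthread_mutex_unlock(&page_lock);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, worker, "gc");
	pthread_create(&b, NULL, worker, "replace_block");
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}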
f2fs_put_page(sum_page, 0); } int f2fs_gc(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index a844fcfb9a8d..71b7206c431e 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c @@ -79,8 +79,7 @@ f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info) const unsigned char *name = name_info->name; size_t len = name_info->len; - if ((len <= 2) && (name[0] == '.') && - (name[1] == '.' || name[1] == '\0')) + if (is_dot_dotdot(name_info)) return 0; /* Initialize the default seed for the hash checksum functions */ diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 1484c00133cd..a13ffcc32992 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -13,7 +13,7 @@ #include "f2fs.h" -bool f2fs_may_inline(struct inode *inode) +bool f2fs_may_inline_data(struct inode *inode) { if (!test_opt(F2FS_I_SB(inode), INLINE_DATA)) return false; @@ -21,12 +21,26 @@ bool f2fs_may_inline(struct inode *inode) if (f2fs_is_atomic_file(inode)) return false; - if (!S_ISREG(inode->i_mode)) + if (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode)) return false; if (i_size_read(inode) > MAX_INLINE_DATA) return false; + if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) + return false; + + return true; +} + +bool f2fs_may_inline_dentry(struct inode *inode) +{ + if (!test_opt(F2FS_I_SB(inode), INLINE_DENTRY)) + return false; + + if (!S_ISDIR(inode->i_mode)) + return false; + return true; } @@ -50,10 +64,19 @@ void read_inline_data(struct page *page, struct page *ipage) SetPageUptodate(page); } -static void truncate_inline_data(struct page *ipage) +bool truncate_inline_inode(struct page *ipage, u64 from) { + void *addr; + + if (from >= MAX_INLINE_DATA) + return false; + + addr = inline_data_addr(ipage); + f2fs_wait_on_page_writeback(ipage, NODE); - memset(inline_data_addr(ipage), 0, MAX_INLINE_DATA); + memset(addr + from, 0, MAX_INLINE_DATA - from); + + return true; } int f2fs_read_inline_data(struct inode *inode, struct page *page) @@ -86,8 +109,11 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) { void *src_addr, *dst_addr; struct f2fs_io_info fio = { + .sbi = F2FS_I_SB(dn->inode), .type = DATA, .rw = WRITE_SYNC | REQ_PRIO, + .page = page, + .encrypted_page = NULL, }; int dirty, err; @@ -115,14 +141,17 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) kunmap_atomic(dst_addr); SetPageUptodate(page); no_update: + set_page_dirty(page); + /* clear dirty state */ dirty = clear_page_dirty_for_io(page); /* write data page to try to make data consistent */ set_page_writeback(page); fio.blk_addr = dn->data_blkaddr; - write_data_page(page, dn, &fio); - update_extent_cache(dn); + write_data_page(dn, &fio); + set_data_blkaddr(dn); + f2fs_update_extent_cache(dn); f2fs_wait_on_page_writeback(page, DATA); if (dirty) inode_dec_dirty_pages(dn->inode); @@ -131,7 +160,7 @@ no_update: set_inode_flag(F2FS_I(dn->inode), FI_APPEND_WRITE); /* clear inline data and flag after data writeback */ - truncate_inline_data(dn->inode_page); + truncate_inline_inode(dn->inode_page, 0); clear_out: stat_dec_inline_inode(dn->inode); f2fs_clear_inline_inode(dn->inode); @@ -245,7 +274,7 @@ process_inline: if (f2fs_has_inline_data(inode)) { ipage = get_node_page(sbi, inode->i_ino); f2fs_bug_on(sbi, IS_ERR(ipage)); - truncate_inline_data(ipage); + truncate_inline_inode(ipage, 0); f2fs_clear_inline_inode(inode); update_inode(inode, ipage); f2fs_put_page(ipage, 1); @@ -257,23 +286,26 @@ process_inline: } struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, - struct qstr 
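f2fs_dentry_hash() above now delegates the "." and ".." special case to the shared is_dot_dotdot() helper instead of open-coding the byte comparisons, so both the hash and the new dot-dentry recovery code agree on the check. The helper's logic is easy to verify on its own:

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

struct qstr { const char *name; unsigned int len; };

/* same logic as the is_dot_dotdot() helper the hash code now uses */
static bool is_dot_dotdot(const struct qstr *str)
{
	if (str->len == 1 && str->name[0] == '.')
		return true;
	if (str->len == 2 && str->name[0] == '.' && str->name[1] == '.')
		return true;
	return false;
}

int main(void)
{
	struct qstr dot = { ".", 1 }, dotdot = { "..", 2 };
	struct qstr hidden = { ".x", 2 };

	assert(is_dot_dotdot(&dot));
	assert(is_dot_dotdot(&dotdot));
	assert(!is_dot_dotdot(&hidden));	/* ".x" is a normal name */
	printf("dot/dotdot detection ok\n");
	return 0;
}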
*name, struct page **res_page) + struct f2fs_filename *fname, struct page **res_page) { struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); struct f2fs_inline_dentry *inline_dentry; + struct qstr name = FSTR_TO_QSTR(&fname->disk_name); struct f2fs_dir_entry *de; struct f2fs_dentry_ptr d; struct page *ipage; + f2fs_hash_t namehash; ipage = get_node_page(sbi, dir->i_ino); if (IS_ERR(ipage)) return NULL; - inline_dentry = inline_data_addr(ipage); + namehash = f2fs_dentry_hash(&name); - make_dentry_ptr(&d, (void *)inline_dentry, 2); - de = find_target_dentry(name, NULL, &d); + inline_dentry = inline_data_addr(ipage); + make_dentry_ptr(NULL, &d, (void *)inline_dentry, 2); + de = find_target_dentry(fname, namehash, NULL, &d); unlock_page(ipage); if (de) *res_page = ipage; @@ -315,7 +347,7 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent, dentry_blk = inline_data_addr(ipage); - make_dentry_ptr(&d, (void *)dentry_blk, 2); + make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2); do_make_empty_dir(inode, parent, &d); set_page_dirty(ipage); @@ -363,7 +395,7 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, set_page_dirty(page); /* clear inline dir and flag after data writeback */ - truncate_inline_data(ipage); + truncate_inline_inode(ipage, 0); stat_dec_inline_dir(dir); clear_inode_flag(F2FS_I(dir), FI_INLINE_DENTRY); @@ -380,21 +412,18 @@ out: } int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name, - struct inode *inode) + struct inode *inode, nid_t ino, umode_t mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct page *ipage; unsigned int bit_pos; f2fs_hash_t name_hash; - struct f2fs_dir_entry *de; size_t namelen = name->len; struct f2fs_inline_dentry *dentry_blk = NULL; + struct f2fs_dentry_ptr d; int slots = GET_DENTRY_SLOTS(namelen); - struct page *page; + struct page *page = NULL; int err = 0; - int i; - - name_hash = f2fs_dentry_hash(name); ipage = get_node_page(sbi, dir->i_ino); if (IS_ERR(ipage)) @@ -410,32 +439,34 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name, goto out; } - down_write(&F2FS_I(inode)->i_sem); - page = init_inode_metadata(inode, dir, name, ipage); - if (IS_ERR(page)) { - err = PTR_ERR(page); - goto fail; + if (inode) { + down_write(&F2FS_I(inode)->i_sem); + page = init_inode_metadata(inode, dir, name, ipage); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto fail; + } } f2fs_wait_on_page_writeback(ipage, NODE); - de = &dentry_blk->dentry[bit_pos]; - de->hash_code = name_hash; - de->name_len = cpu_to_le16(namelen); - memcpy(dentry_blk->filename[bit_pos], name->name, name->len); - de->ino = cpu_to_le32(inode->i_ino); - set_de_type(de, inode); - for (i = 0; i < slots; i++) - test_and_set_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap); + + name_hash = f2fs_dentry_hash(name); + make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2); + f2fs_update_dentry(ino, mode, &d, name, name_hash, bit_pos); + set_page_dirty(ipage); /* we don't need to mark_inode_dirty now */ - F2FS_I(inode)->i_pino = dir->i_ino; - update_inode(inode, page); - f2fs_put_page(page, 1); + if (inode) { + F2FS_I(inode)->i_pino = dir->i_ino; + update_inode(inode, page); + f2fs_put_page(page, 1); + } update_parent_metadata(dir, inode, 0); fail: - up_write(&F2FS_I(inode)->i_sem); + if (inode) + up_write(&F2FS_I(inode)->i_sem); if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) { update_inode(dir, ipage); @@ -497,7 +528,8 @@ bool f2fs_empty_inline_dir(struct inode *dir) return true; } -int f2fs_read_inline_dir(struct file *file, struct 
dir_context *ctx) +int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, + struct f2fs_str *fstr) { struct inode *inode = file_inode(file); struct f2fs_inline_dentry *inline_dentry = NULL; @@ -513,9 +545,9 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx) inline_dentry = inline_data_addr(ipage); - make_dentry_ptr(&d, (void *)inline_dentry, 2); + make_dentry_ptr(inode, &d, (void *)inline_dentry, 2); - if (!f2fs_fill_dentries(ctx, &d, 0)) + if (!f2fs_fill_dentries(ctx, &d, 0, fstr)) ctx->pos = NR_INLINE_DENTRY; f2fs_put_page(ipage, 1); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 2d002e3738a7..2550868dc651 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -51,6 +51,15 @@ static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) } } +static bool __written_first_block(struct f2fs_inode *ri) +{ + block_t addr = le32_to_cpu(ri->i_addr[0]); + + if (addr != NEW_ADDR && addr != NULL_ADDR) + return true; + return false; +} + static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri) { if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { @@ -130,7 +139,8 @@ static int do_read_inode(struct inode *inode) fi->i_pino = le32_to_cpu(ri->i_pino); fi->i_dir_level = ri->i_dir_level; - get_extent_info(&fi->ext, ri->i_ext); + f2fs_init_extent_cache(inode, &ri->i_ext); + get_inline_info(fi, ri); /* check data exist */ @@ -140,6 +150,9 @@ static int do_read_inode(struct inode *inode) /* get rdev by using inline_info */ __get_inode_rdev(inode, ri); + if (__written_first_block(ri)) + set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN); + f2fs_put_page(node_page, 1); stat_inc_inline_inode(inode); @@ -185,7 +198,10 @@ make_now: inode->i_mapping->a_ops = &f2fs_dblock_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO); } else if (S_ISLNK(inode->i_mode)) { - inode->i_op = &f2fs_symlink_inode_operations; + if (f2fs_encrypted_inode(inode)) + inode->i_op = &f2fs_encrypted_symlink_inode_operations; + else + inode->i_op = &f2fs_symlink_inode_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { @@ -220,7 +236,11 @@ void update_inode(struct inode *inode, struct page *node_page) ri->i_links = cpu_to_le32(inode->i_nlink); ri->i_size = cpu_to_le64(i_size_read(inode)); ri->i_blocks = cpu_to_le64(inode->i_blocks); + + read_lock(&F2FS_I(inode)->ext_lock); set_raw_extent(&F2FS_I(inode)->ext, &ri->i_ext); + read_unlock(&F2FS_I(inode)->ext_lock); + set_raw_inline(F2FS_I(inode), ri); ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec); @@ -328,6 +348,12 @@ void f2fs_evict_inode(struct inode *inode) no_delete: stat_dec_inline_dir(inode); stat_dec_inline_inode(inode); + + /* update extent info in inode */ + if (inode->i_nlink) + f2fs_preserve_extent_tree(inode); + f2fs_destroy_extent_tree(inode); + invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino); if (xnid) invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid); @@ -336,6 +362,10 @@ no_delete: if (is_inode_flag_set(F2FS_I(inode), FI_UPDATE_WRITE)) add_dirty_inode(sbi, inode->i_ino, UPDATE_INO); out_clear: +#ifdef CONFIG_F2FS_FS_ENCRYPTION + if (F2FS_I(inode)->i_crypt_info) + f2fs_free_encryption_info(inode, F2FS_I(inode)->i_crypt_info); +#endif clear_inode(inode); } diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index e79639a9787a..fdbae21ee8fb 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -14,6 +14,7 @@ #include <linux/sched.h> #include 
<linux/ctype.h> #include <linux/dcache.h> +#include <linux/namei.h> #include "f2fs.h" #include "node.h" @@ -55,11 +56,18 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) goto out; } - if (f2fs_may_inline(inode)) + /* If the directory encrypted, then we should encrypt the inode. */ + if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) + f2fs_set_encrypted_inode(inode); + + if (f2fs_may_inline_data(inode)) set_inode_flag(F2FS_I(inode), FI_INLINE_DATA); - if (test_opt(sbi, INLINE_DENTRY) && S_ISDIR(inode->i_mode)) + if (f2fs_may_inline_dentry(inode)) set_inode_flag(F2FS_I(inode), FI_INLINE_DENTRY); + stat_inc_inline_inode(inode); + stat_inc_inline_dir(inode); + trace_f2fs_new_inode(inode, 0); mark_inode_dirty(inode); return inode; @@ -135,7 +143,6 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, alloc_nid_done(sbi, ino); - stat_inc_inline_inode(inode); d_instantiate(dentry, inode); unlock_new_inode(inode); @@ -150,10 +157,14 @@ out: static int f2fs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); struct f2fs_sb_info *sbi = F2FS_I_SB(dir); int err; + if (f2fs_encrypted_inode(dir) && + !f2fs_is_child_context_consistent_with_parent(dir, inode)) + return -EPERM; + f2fs_balance_fs(sbi); inode->i_ctime = CURRENT_TIME; @@ -181,10 +192,48 @@ out: struct dentry *f2fs_get_parent(struct dentry *child) { struct qstr dotdot = QSTR_INIT("..", 2); - unsigned long ino = f2fs_inode_by_name(child->d_inode, &dotdot); + unsigned long ino = f2fs_inode_by_name(d_inode(child), &dotdot); if (!ino) return ERR_PTR(-ENOENT); - return d_obtain_alias(f2fs_iget(child->d_inode->i_sb, ino)); + return d_obtain_alias(f2fs_iget(d_inode(child)->i_sb, ino)); +} + +static int __recover_dot_dentries(struct inode *dir, nid_t pino) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct qstr dot = QSTR_INIT(".", 1); + struct qstr dotdot = QSTR_INIT("..", 2); + struct f2fs_dir_entry *de; + struct page *page; + int err = 0; + + f2fs_lock_op(sbi); + + de = f2fs_find_entry(dir, &dot, &page); + if (de) { + f2fs_dentry_kunmap(dir, page); + f2fs_put_page(page, 0); + } else { + err = __f2fs_add_link(dir, &dot, NULL, dir->i_ino, S_IFDIR); + if (err) + goto out; + } + + de = f2fs_find_entry(dir, &dotdot, &page); + if (de) { + f2fs_dentry_kunmap(dir, page); + f2fs_put_page(page, 0); + } else { + err = __f2fs_add_link(dir, &dotdot, NULL, pino, S_IFDIR); + } +out: + if (!err) { + clear_inode_flag(F2FS_I(dir), FI_INLINE_DOTS); + mark_inode_dirty(dir); + } + + f2fs_unlock_op(sbi); + return err; } static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, @@ -193,28 +242,40 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, struct inode *inode = NULL; struct f2fs_dir_entry *de; struct page *page; + nid_t ino; + int err = 0; if (dentry->d_name.len > F2FS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); de = f2fs_find_entry(dir, &dentry->d_name, &page); - if (de) { - nid_t ino = le32_to_cpu(de->ino); - f2fs_dentry_kunmap(dir, page); - f2fs_put_page(page, 0); + if (!de) + return d_splice_alias(inode, dentry); - inode = f2fs_iget(dir->i_sb, ino); - if (IS_ERR(inode)) - return ERR_CAST(inode); - } + ino = le32_to_cpu(de->ino); + f2fs_dentry_kunmap(dir, page); + f2fs_put_page(page, 0); + inode = f2fs_iget(dir->i_sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + if (f2fs_has_inline_dots(inode)) { + err = 
__recover_dot_dentries(inode, dir->i_ino); + if (err) + goto err_out; + } return d_splice_alias(inode, dentry); + +err_out: + iget_failed(inode); + return ERR_PTR(err); } static int f2fs_unlink(struct inode *dir, struct dentry *dentry) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct f2fs_dir_entry *de; struct page *page; int err = -ENOENT; @@ -247,21 +308,42 @@ fail: return err; } +static const char *f2fs_follow_link(struct dentry *dentry, void **cookie) +{ + const char *link = page_follow_link_light(dentry, cookie); + if (!IS_ERR(link) && !*link) { + /* this is broken symlink case */ + page_put_link(NULL, *cookie); + link = ERR_PTR(-ENOENT); + } + return link; +} + static int f2fs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode; - size_t symlen = strlen(symname) + 1; + size_t len = strlen(symname); + size_t p_len; + char *p_str; + struct f2fs_str disk_link = FSTR_INIT(NULL, 0); + struct f2fs_encrypted_symlink_data *sd = NULL; int err; + if (len > dir->i_sb->s_blocksize) + return -ENAMETOOLONG; + f2fs_balance_fs(sbi); inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO); if (IS_ERR(inode)) return PTR_ERR(inode); - inode->i_op = &f2fs_symlink_inode_operations; + if (f2fs_encrypted_inode(inode)) + inode->i_op = &f2fs_encrypted_symlink_inode_operations; + else + inode->i_op = &f2fs_symlink_inode_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; f2fs_lock_op(sbi); @@ -269,15 +351,66 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, if (err) goto out; f2fs_unlock_op(sbi); - - err = page_symlink(inode, symname, symlen); alloc_nid_done(sbi, inode->i_ino); + if (f2fs_encrypted_inode(dir)) { + struct qstr istr = QSTR_INIT(symname, len); + + err = f2fs_get_encryption_info(inode); + if (err) + goto err_out; + + err = f2fs_fname_crypto_alloc_buffer(inode, len, &disk_link); + if (err) + goto err_out; + + err = f2fs_fname_usr_to_disk(inode, &istr, &disk_link); + if (err < 0) + goto err_out; + + p_len = encrypted_symlink_data_len(disk_link.len) + 1; + + if (p_len > dir->i_sb->s_blocksize) { + err = -ENAMETOOLONG; + goto err_out; + } + + sd = kzalloc(p_len, GFP_NOFS); + if (!sd) { + err = -ENOMEM; + goto err_out; + } + memcpy(sd->encrypted_path, disk_link.name, disk_link.len); + sd->len = cpu_to_le16(disk_link.len); + p_str = (char *)sd; + } else { + p_len = len + 1; + p_str = (char *)symname; + } + + err = page_symlink(inode, p_str, p_len); + +err_out: d_instantiate(dentry, inode); unlock_new_inode(inode); + /* + * Let's flush symlink data in order to avoid broken symlink as much as + * possible. Nevertheless, fsyncing is the best way, but there is no + * way to get a file descriptor in order to flush that. + * + * Note that, it needs to do dir->fsync to make this recoverable. + * If the symlink path is stored into inline_data, there is no + * performance regression. 
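For reference, the sd packing earlier in f2fs_symlink() implies a small length-prefixed blob on disk. A minimal sketch of that shape, inferred from the memcpy() and cpu_to_le16() calls above (the real struct f2fs_encrypted_symlink_data is defined in a header outside this diff, so both names below are assumptions):

/* Assumed layout, reconstructed from f2fs_symlink(); kernel types. */
struct enc_symlink_blob {
	__le16 len;		/* ciphertext length, little-endian */
	char enc_path[];	/* ciphertext bytes, not NUL-terminated */
};

/* Bytes page_symlink() stores for c_len ciphertext bytes; the +1 is
 * the trailing NUL that p_len accounts for above. */
static inline size_t enc_symlink_bytes(size_t c_len)
{
	return sizeof(struct enc_symlink_blob) + c_len + 1;
}

The s_blocksize checks on both len and p_len then guarantee the whole blob, NUL included, fits inside the single block that page_symlink() writes.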
+ */ + if (!err) + filemap_write_and_wait_range(inode->i_mapping, 0, p_len - 1); + if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); + + kfree(sd); + f2fs_fname_crypto_free_buffer(&disk_link); return err; out: handle_failed_inode(inode); @@ -308,7 +441,6 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) goto out_fail; f2fs_unlock_op(sbi); - stat_inc_inline_dir(inode); alloc_nid_done(sbi, inode->i_ino); d_instantiate(dentry, inode); @@ -326,7 +458,7 @@ out_fail: static int f2fs_rmdir(struct inode *dir, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); if (f2fs_empty_dir(inode)) return f2fs_unlink(dir, dentry); return -ENOTEMPTY; @@ -370,19 +502,101 @@ out: return err; } +static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, + umode_t mode, struct inode **whiteout) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct inode *inode; + int err; + + if (!whiteout) + f2fs_balance_fs(sbi); + + inode = f2fs_new_inode(dir, mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + if (whiteout) { + init_special_inode(inode, inode->i_mode, WHITEOUT_DEV); + inode->i_op = &f2fs_special_inode_operations; + } else { + inode->i_op = &f2fs_file_inode_operations; + inode->i_fop = &f2fs_file_operations; + inode->i_mapping->a_ops = &f2fs_dblock_aops; + } + + f2fs_lock_op(sbi); + err = acquire_orphan_inode(sbi); + if (err) + goto out; + + err = f2fs_do_tmpfile(inode, dir); + if (err) + goto release_out; + + /* + * add this non-linked tmpfile to orphan list, in this way we could + * remove all unused data of tmpfile after abnormal power-off. + */ + add_orphan_inode(sbi, inode->i_ino); + f2fs_unlock_op(sbi); + + alloc_nid_done(sbi, inode->i_ino); + + if (whiteout) { + inode_dec_link_count(inode); + *whiteout = inode; + } else { + d_tmpfile(dentry, inode); + } + unlock_new_inode(inode); + return 0; + +release_out: + release_orphan_inode(sbi); +out: + handle_failed_inode(inode); + return err; +} + +static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + if (f2fs_encrypted_inode(dir)) { + int err = f2fs_get_encryption_info(dir); + if (err) + return err; + } + + return __f2fs_tmpfile(dir, dentry, mode, NULL); +} + +static int f2fs_create_whiteout(struct inode *dir, struct inode **whiteout) +{ + return __f2fs_tmpfile(dir, NULL, S_IFCHR | WHITEOUT_MODE, whiteout); +} + static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir); - struct inode *old_inode = old_dentry->d_inode; - struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); + struct inode *whiteout = NULL; struct page *old_dir_page; - struct page *old_page, *new_page; + struct page *old_page, *new_page = NULL; struct f2fs_dir_entry *old_dir_entry = NULL; struct f2fs_dir_entry *old_entry; struct f2fs_dir_entry *new_entry; int err = -ENOENT; + if ((old_dir != new_dir) && f2fs_encrypted_inode(new_dir) && + !f2fs_is_child_context_consistent_with_parent(new_dir, + old_inode)) { + err = -EPERM; + goto out; + } + f2fs_balance_fs(sbi); old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); @@ -396,17 +610,23 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_old; } + if (flags & RENAME_WHITEOUT) { + err = 
f2fs_create_whiteout(old_dir, &whiteout); + if (err) + goto out_dir; + } + if (new_inode) { err = -ENOTEMPTY; if (old_dir_entry && !f2fs_empty_dir(new_inode)) - goto out_dir; + goto out_whiteout; err = -ENOENT; new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name, &new_page); if (!new_entry) - goto out_dir; + goto out_whiteout; f2fs_lock_op(sbi); @@ -414,7 +634,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (err) goto put_out_dir; - if (update_dent_inode(old_inode, &new_dentry->d_name)) { + if (update_dent_inode(old_inode, new_inode, + &new_dentry->d_name)) { release_orphan_inode(sbi); goto put_out_dir; } @@ -443,7 +664,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, err = f2fs_add_link(new_dentry, old_inode); if (err) { f2fs_unlock_op(sbi); - goto out_dir; + goto out_whiteout; } if (old_dir_entry) { @@ -454,6 +675,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, down_write(&F2FS_I(old_inode)->i_sem); file_lost_pino(old_inode); + if (new_inode && file_enc_name(new_inode)) + file_set_enc_name(old_inode); up_write(&F2FS_I(old_inode)->i_sem); old_inode->i_ctime = CURRENT_TIME; @@ -461,8 +684,18 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_delete_entry(old_entry, old_page, old_dir, NULL); + if (whiteout) { + whiteout->i_state |= I_LINKABLE; + set_inode_flag(F2FS_I(whiteout), FI_INC_LINK); + err = f2fs_add_link(old_dentry, whiteout); + if (err) + goto put_out_dir; + whiteout->i_state &= ~I_LINKABLE; + iput(whiteout); + } + if (old_dir_entry) { - if (old_dir != new_dir) { + if (old_dir != new_dir && !whiteout) { f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); update_inode_page(old_inode); @@ -483,8 +716,13 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, put_out_dir: f2fs_unlock_op(sbi); - f2fs_dentry_kunmap(new_dir, new_page); - f2fs_put_page(new_page, 0); + if (new_page) { + f2fs_dentry_kunmap(new_dir, new_page); + f2fs_put_page(new_page, 0); + } +out_whiteout: + if (whiteout) + iput(whiteout); out_dir: if (old_dir_entry) { f2fs_dentry_kunmap(old_inode, old_dir_page); @@ -501,8 +739,8 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir); - struct inode *old_inode = old_dentry->d_inode; - struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); struct page *old_dir_page, *new_dir_page; struct page *old_page, *new_page; struct f2fs_dir_entry *old_dir_entry = NULL, *new_dir_entry = NULL; @@ -510,6 +748,14 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, int old_nlink = 0, new_nlink = 0; int err = -ENOENT; + if ((f2fs_encrypted_inode(old_dir) || f2fs_encrypted_inode(new_dir)) && + (old_dir != new_dir) && + (!f2fs_is_child_context_consistent_with_parent(new_dir, + old_inode) || + !f2fs_is_child_context_consistent_with_parent(old_dir, + new_inode))) + return -EPERM; + f2fs_balance_fs(sbi); old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); @@ -556,13 +802,17 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_lock_op(sbi); - err = update_dent_inode(old_inode, &new_dentry->d_name); + err = update_dent_inode(old_inode, new_inode, &new_dentry->d_name); if (err) goto out_unlock; + if (file_enc_name(new_inode)) + 
file_set_enc_name(old_inode); - err = update_dent_inode(new_inode, &old_dentry->d_name); + err = update_dent_inode(new_inode, old_inode, &old_dentry->d_name); if (err) goto out_undo; + if (file_enc_name(old_inode)) + file_set_enc_name(new_inode); /* update ".." directory entry info of old dentry */ if (old_dir_entry) @@ -620,8 +870,11 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_sync_fs(sbi->sb, 1); return 0; out_undo: - /* Still we may fail to recover name info of f2fs_inode here */ - update_dent_inode(old_inode, &old_dentry->d_name); + /* + * Still we may fail to recover name info of f2fs_inode here + * Drop it, once its name is set as encrypted + */ + update_dent_inode(old_inode, old_inode, &old_dentry->d_name); out_unlock: f2fs_unlock_op(sbi); out_new_dir: @@ -648,7 +901,7 @@ static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { - if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; if (flags & RENAME_EXCHANGE) { @@ -659,51 +912,85 @@ static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry, * VFS has already handled the new dentry existence case, * here, we just deal with "RENAME_NOREPLACE" as regular rename. */ - return f2fs_rename(old_dir, old_dentry, new_dir, new_dentry); + return f2fs_rename(old_dir, old_dentry, new_dir, new_dentry, flags); } -static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +#ifdef CONFIG_F2FS_FS_ENCRYPTION +static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cookie) { - struct f2fs_sb_info *sbi = F2FS_I_SB(dir); - struct inode *inode; - int err; - - inode = f2fs_new_inode(dir, mode); - if (IS_ERR(inode)) - return PTR_ERR(inode); - - inode->i_op = &f2fs_file_inode_operations; - inode->i_fop = &f2fs_file_operations; - inode->i_mapping->a_ops = &f2fs_dblock_aops; - - f2fs_lock_op(sbi); - err = acquire_orphan_inode(sbi); - if (err) - goto out; - - err = f2fs_do_tmpfile(inode, dir); - if (err) - goto release_out; - - /* - * add this non-linked tmpfile to orphan list, in this way we could - * remove all unused data of tmpfile after abnormal power-off. 
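__f2fs_tmpfile() above keeps this orphan trick when it replaces the older copy being removed here; stripped to its ordering, the pattern looks like the following (a condensed sketch built from helpers named in this diff, with the real error unwinding via handle_failed_inode() elided):

static int tmpfile_orphan_sketch(struct f2fs_sb_info *sbi,
				 struct inode *inode, struct inode *dir)
{
	int err;

	f2fs_lock_op(sbi);			/* hold off checkpoint */
	err = acquire_orphan_inode(sbi);	/* reserve an orphan slot */
	if (err)
		goto unlock;

	err = f2fs_do_tmpfile(inode, dir);
	if (err) {
		release_orphan_inode(sbi);
		goto unlock;
	}

	/* nlink == 0: record the inode so post-crash recovery reclaims it */
	add_orphan_inode(sbi, inode->i_ino);
unlock:
	f2fs_unlock_op(sbi);
	return err;
}

The whiteout caller leans on the same path: the whiteout inode starts life as such an orphan and only gains a link inside f2fs_rename().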
- */ - add_orphan_inode(sbi, inode->i_ino); - f2fs_unlock_op(sbi); - - alloc_nid_done(sbi, inode->i_ino); - d_tmpfile(dentry, inode); - unlock_new_inode(inode); - return 0; + struct page *cpage = NULL; + char *caddr, *paddr = NULL; + struct f2fs_str cstr; + struct f2fs_str pstr = FSTR_INIT(NULL, 0); + struct inode *inode = d_inode(dentry); + struct f2fs_encrypted_symlink_data *sd; + loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1); + u32 max_size = inode->i_sb->s_blocksize; + int res; + + res = f2fs_get_encryption_info(inode); + if (res) + return ERR_PTR(res); + + cpage = read_mapping_page(inode->i_mapping, 0, NULL); + if (IS_ERR(cpage)) + return ERR_CAST(cpage); + caddr = kmap(cpage); + caddr[size] = 0; + + /* Symlink is encrypted */ + sd = (struct f2fs_encrypted_symlink_data *)caddr; + cstr.name = sd->encrypted_path; + cstr.len = le16_to_cpu(sd->len); + + /* this is broken symlink case */ + if (cstr.name[0] == 0 && cstr.len == 0) { + res = -ENOENT; + goto errout; + } -release_out: - release_orphan_inode(sbi); -out: - handle_failed_inode(inode); - return err; + if ((cstr.len + sizeof(struct f2fs_encrypted_symlink_data) - 1) > + max_size) { + /* Symlink data on the disk is corrupted */ + res = -EIO; + goto errout; + } + res = f2fs_fname_crypto_alloc_buffer(inode, cstr.len, &pstr); + if (res) + goto errout; + + res = f2fs_fname_disk_to_usr(inode, NULL, &cstr, &pstr); + if (res < 0) + goto errout; + + paddr = pstr.name; + + /* Null-terminate the name */ + paddr[res] = '\0'; + + kunmap(cpage); + page_cache_release(cpage); + return *cookie = paddr; +errout: + f2fs_fname_crypto_free_buffer(&pstr); + kunmap(cpage); + page_cache_release(cpage); + return ERR_PTR(res); } +const struct inode_operations f2fs_encrypted_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = f2fs_encrypted_follow_link, + .put_link = kfree_put_link, + .getattr = f2fs_getattr, + .setattr = f2fs_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = f2fs_listxattr, + .removexattr = generic_removexattr, +}; +#endif + const struct inode_operations f2fs_dir_inode_operations = { .create = f2fs_create, .lookup = f2fs_lookup, @@ -729,7 +1016,7 @@ const struct inode_operations f2fs_dir_inode_operations = { const struct inode_operations f2fs_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = page_follow_link_light, + .follow_link = f2fs_follow_link, .put_link = page_put_link, .getattr = f2fs_getattr, .setattr = f2fs_setattr, diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 97bd9d3db882..7dd63b794bfb 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -41,7 +41,9 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) /* only uses low memory */ avail_ram = val.totalram - val.totalhigh; - /* give 25%, 25%, 50%, 50% memory for each components respectively */ + /* + * give 25%, 25%, 50%, 50%, 50% memory for each components respectively + */ if (type == FREE_NIDS) { mem_size = (nm_i->fcnt * sizeof(struct free_nid)) >> PAGE_CACHE_SHIFT; @@ -51,7 +53,7 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) PAGE_CACHE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); } else if (type == DIRTY_DENTS) { - if (sbi->sb->s_bdi->dirty_exceeded) + if (sbi->sb->s_bdi->wb.dirty_exceeded) return false; mem_size = get_pages(sbi, F2FS_DIRTY_DENTS); res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); @@ -62,8 +64,13 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) mem_size += 
(sbi->im[i].ino_num * sizeof(struct ino_entry)) >> PAGE_CACHE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); + } else if (type == EXTENT_CACHE) { + mem_size = (sbi->total_ext_tree * sizeof(struct extent_tree) + + atomic_read(&sbi->total_ext_node) * + sizeof(struct extent_node)) >> PAGE_CACHE_SHIFT; + res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } else { - if (sbi->sb->s_bdi->dirty_exceeded) + if (sbi->sb->s_bdi->wb.dirty_exceeded) return false; } return res; @@ -188,32 +195,35 @@ static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i, start, nr); } -bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) +int need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; - bool is_cp = true; + bool need = false; down_read(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); - if (e && !get_nat_flag(e, IS_CHECKPOINTED)) - is_cp = false; + if (e) { + if (!get_nat_flag(e, IS_CHECKPOINTED) && + !get_nat_flag(e, HAS_FSYNCED_INODE)) + need = true; + } up_read(&nm_i->nat_tree_lock); - return is_cp; + return need; } -bool has_fsynced_inode(struct f2fs_sb_info *sbi, nid_t ino) +bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; - bool fsynced = false; + bool is_cp = true; down_read(&nm_i->nat_tree_lock); - e = __lookup_nat_cache(nm_i, ino); - if (e && get_nat_flag(e, HAS_FSYNCED_INODE)) - fsynced = true; + e = __lookup_nat_cache(nm_i, nid); + if (e && !get_nat_flag(e, IS_CHECKPOINTED)) + is_cp = false; up_read(&nm_i->nat_tree_lock); - return fsynced; + return is_cp; } bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino) @@ -305,7 +315,8 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, __set_nat_cache_dirty(nm_i, e); /* update fsync_mark if its inode nat entry is still alive */ - e = __lookup_nat_cache(nm_i, ni->ino); + if (ni->nid != ni->ino) + e = __lookup_nat_cache(nm_i, ni->ino); if (e) { if (fsync_done && ni->nid == ni->ino) set_nat_flag(e, HAS_FSYNCED_INODE, true); @@ -494,7 +505,7 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) /* if inline_data is set, should not report any block indices */ if (f2fs_has_inline_data(dn->inode) && index) { - err = -EINVAL; + err = -ENOENT; f2fs_put_page(npage[0], 1); goto release_out; } @@ -988,13 +999,17 @@ static int read_node_page(struct page *page, int rw) struct f2fs_sb_info *sbi = F2FS_P_SB(page); struct node_info ni; struct f2fs_io_info fio = { + .sbi = sbi, .type = NODE, .rw = rw, + .page = page, + .encrypted_page = NULL, }; get_node_info(sbi, page->index, &ni); if (unlikely(ni.blk_addr == NULL_ADDR)) { + ClearPageUptodate(page); f2fs_put_page(page, 1); return -ENOENT; } @@ -1003,7 +1018,7 @@ static int read_node_page(struct page *page, int rw) return LOCKED_PAGE; fio.blk_addr = ni.blk_addr; - return f2fs_submit_page_bio(sbi, page, &fio); + return f2fs_submit_page_bio(&fio); } /* @@ -1196,13 +1211,9 @@ continue_unlock: /* called by fsync() */ if (ino && IS_DNODE(page)) { set_fsync_mark(page, 1); - if (IS_INODE(page)) { - if (!is_checkpointed_node(sbi, ino) && - !has_fsynced_inode(sbi, ino)) - set_dentry_mark(page, 1); - else - set_dentry_mark(page, 0); - } + if (IS_INODE(page)) + set_dentry_mark(page, + need_dentry_mark(sbi, ino)); nwritten++; } else { set_fsync_mark(page, 0); @@ -1285,8 +1296,11 @@ static int f2fs_write_node_page(struct page *page, nid_t nid; struct node_info ni; struct 
f2fs_io_info fio = { + .sbi = sbi, .type = NODE, .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE, + .page = page, + .encrypted_page = NULL, }; trace_f2fs_writepage(page, NODE); @@ -1306,6 +1320,7 @@ static int f2fs_write_node_page(struct page *page, /* This page is already truncated */ if (unlikely(ni.blk_addr == NULL_ADDR)) { + ClearPageUptodate(page); dec_page_count(sbi, F2FS_DIRTY_NODES); unlock_page(page); return 0; @@ -1320,7 +1335,7 @@ static int f2fs_write_node_page(struct page *page, set_page_writeback(page); fio.blk_addr = ni.blk_addr; - write_node_page(sbi, page, nid, &fio); + write_node_page(nid, &fio); set_node_addr(sbi, &ni, fio.blk_addr, is_fsync_dnode(page)); dec_page_count(sbi, F2FS_DIRTY_NODES); up_read(&sbi->node_write); @@ -1821,6 +1836,7 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, struct f2fs_nat_block *nat_blk; struct nat_entry *ne, *cur; struct page *page = NULL; + struct f2fs_nm_info *nm_i = NM_I(sbi); /* * there are two steps to flush nat entries: @@ -1874,7 +1890,9 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, f2fs_bug_on(sbi, set->entry_cnt); + down_write(&nm_i->nat_tree_lock); radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set); + up_write(&nm_i->nat_tree_lock); kmem_cache_free(nat_entry_set_slab, set); } @@ -1902,6 +1920,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL)) remove_nats_in_journal(sbi); + down_write(&nm_i->nat_tree_lock); while ((found = __gang_lookup_nat_set(nm_i, set_idx, SETVEC_SIZE, setvec))) { unsigned idx; @@ -1910,6 +1929,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) __adjust_nat_entry_set(setvec[idx], &sets, MAX_NAT_JENTRIES(sum)); } + up_write(&nm_i->nat_tree_lock); /* flush dirty nats in nat entry set */ list_for_each_entry_safe(set, tmp, &sets, set_list) diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index f405bbf2435a..7427e956ad81 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -120,6 +120,7 @@ enum mem_type { NAT_ENTRIES, /* indicates the cached nat entry */ DIRTY_DENTS, /* indicates dirty dentry pages */ INO_ENTRIES, /* indicates inode entries */ + EXTENT_CACHE, /* indicates extent cache */ BASE_CHECK, /* check kernel status */ }; @@ -342,28 +343,6 @@ static inline nid_t get_nid(struct page *p, int off, bool i) * - Mark cold node blocks in their node footer * - Mark cold data pages in page cache */ -static inline int is_file(struct inode *inode, int type) -{ - return F2FS_I(inode)->i_advise & type; -} - -static inline void set_file(struct inode *inode, int type) -{ - F2FS_I(inode)->i_advise |= type; -} - -static inline void clear_file(struct inode *inode, int type) -{ - F2FS_I(inode)->i_advise &= ~type; -} - -#define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT) -#define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT) -#define file_set_cold(inode) set_file(inode, FADVISE_COLD_BIT) -#define file_lost_pino(inode) set_file(inode, FADVISE_LOST_PINO_BIT) -#define file_clear_cold(inode) clear_file(inode, FADVISE_COLD_BIT) -#define file_got_pino(inode) clear_file(inode, FADVISE_LOST_PINO_BIT) - static inline int is_cold_data(struct page *page) { return PageChecked(page); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 41afb9534bbd..24a8c1d4f45f 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -83,6 +83,11 @@ static int recover_dentry(struct inode *inode, struct page *ipage) goto out; } + if (file_enc_name(inode)) { + iput(dir); + return 0; + } + name.len = 
le32_to_cpu(raw_inode->i_namelen); name.name = raw_inode->i_name; @@ -93,10 +98,9 @@ static int recover_dentry(struct inode *inode, struct page *ipage) } retry: de = f2fs_find_entry(dir, &name, &page); - if (de && inode->i_ino == le32_to_cpu(de->ino)) { - clear_inode_flag(F2FS_I(inode), FI_INC_LINK); + if (de && inode->i_ino == le32_to_cpu(de->ino)) goto out_unmap_put; - } + if (de) { einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino)); if (IS_ERR(einode)) { @@ -115,7 +119,7 @@ retry: iput(einode); goto retry; } - err = __f2fs_add_link(dir, &name, inode); + err = __f2fs_add_link(dir, &name, inode, inode->i_ino, inode->i_mode); if (err) goto out_err; @@ -144,6 +148,7 @@ out: static void recover_inode(struct inode *inode, struct page *page) { struct f2fs_inode *raw = F2FS_INODE(page); + char *name; inode->i_mode = le16_to_cpu(raw->i_mode); i_size_write(inode, le64_to_cpu(raw->i_size)); @@ -154,8 +159,13 @@ static void recover_inode(struct inode *inode, struct page *page) inode->i_ctime.tv_nsec = le32_to_cpu(raw->i_ctime_nsec); inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec); + if (file_enc_name(inode)) + name = "<encrypted>"; + else + name = F2FS_INODE(page)->i_name; + f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode: ino = %x, name = %s", - ino_of_node(page), F2FS_INODE(page)->i_name); + ino_of_node(page), name); } static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) @@ -175,7 +185,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) while (1) { struct fsync_inode_entry *entry; - if (blkaddr < MAIN_BLKADDR(sbi) || blkaddr >= MAX_BLKADDR(sbi)) + if (!is_valid_blkaddr(sbi, blkaddr, META_POR)) return 0; page = get_meta_page(sbi, blkaddr); @@ -187,11 +197,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) goto next; entry = get_fsync_inode(head, ino_of_node(page)); - if (entry) { - if (IS_INODE(page) && is_dent_dnode(page)) - set_inode_flag(F2FS_I(entry->inode), - FI_INC_LINK); - } else { + if (!entry) { if (IS_INODE(page) && is_dent_dnode(page)) { err = recover_inode_page(sbi, page); if (err) @@ -212,8 +218,10 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) if (IS_ERR(entry->inode)) { err = PTR_ERR(entry->inode); kmem_cache_free(fsync_entry_slab, entry); - if (err == -ENOENT) + if (err == -ENOENT) { + err = 0; goto next; + } break; } list_add_tail(&entry->list, head); @@ -256,6 +264,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, struct f2fs_summary_block *sum_node; struct f2fs_summary sum; struct page *sum_page, *node_page; + struct dnode_of_data tdn = *dn; nid_t ino, nid; struct inode *inode; unsigned int offset; @@ -283,17 +292,15 @@ got_it: /* Use the locked dnode page and inode */ nid = le32_to_cpu(sum.nid); if (dn->inode->i_ino == nid) { - struct dnode_of_data tdn = *dn; tdn.nid = nid; + if (!dn->inode_page_locked) + lock_page(dn->inode_page); tdn.node_page = dn->inode_page; tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node); - truncate_data_blocks_range(&tdn, 1); - return 0; + goto truncate_out; } else if (dn->nid == nid) { - struct dnode_of_data tdn = *dn; tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node); - truncate_data_blocks_range(&tdn, 1); - return 0; + goto truncate_out; } /* Get the node page */ @@ -317,18 +324,33 @@ got_it: bidx = start_bidx_of_node(offset, F2FS_I(inode)) + le16_to_cpu(sum.ofs_in_node); - if (ino != dn->inode->i_ino) { - truncate_hole(inode, bidx, bidx + 1); + /* + * if inode page is locked, unlock temporarily, but 
its reference + * count keeps alive. + */ + if (ino == dn->inode->i_ino && dn->inode_page_locked) + unlock_page(dn->inode_page); + + set_new_dnode(&tdn, inode, NULL, NULL, 0); + if (get_dnode_of_data(&tdn, bidx, LOOKUP_NODE)) + goto out; + + if (tdn.data_blkaddr == blkaddr) + truncate_data_blocks_range(&tdn, 1); + + f2fs_put_dnode(&tdn); +out: + if (ino != dn->inode->i_ino) iput(inode); - } else { - struct dnode_of_data tdn; - set_new_dnode(&tdn, inode, dn->inode_page, NULL, 0); - if (get_dnode_of_data(&tdn, bidx, LOOKUP_NODE)) - return 0; - if (tdn.data_blkaddr != NULL_ADDR) - truncate_data_blocks_range(&tdn, 1); - f2fs_put_page(tdn.node_page, 1); - } + else if (dn->inode_page_locked) + lock_page(dn->inode_page); + return 0; + +truncate_out: + if (datablock_addr(tdn.node_page, tdn.ofs_in_node) == blkaddr) + truncate_data_blocks_range(&tdn, 1); + if (dn->inode->i_ino == nid && !dn->inode_page_locked) + unlock_page(dn->inode_page); return 0; } @@ -338,7 +360,6 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, struct f2fs_inode_info *fi = F2FS_I(inode); unsigned int start, end; struct dnode_of_data dn; - struct f2fs_summary sum; struct node_info ni; int err = 0, recovered = 0; @@ -384,7 +405,9 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, src = datablock_addr(dn.node_page, dn.ofs_in_node); dest = datablock_addr(page, dn.ofs_in_node); - if (src != dest && dest != NEW_ADDR && dest != NULL_ADDR) { + if (src != dest && dest != NEW_ADDR && dest != NULL_ADDR && + is_valid_blkaddr(sbi, dest, META_POR)) { + if (src == NULL_ADDR) { err = reserve_new_block(&dn); /* We should not get -ENOSPC */ @@ -396,19 +419,14 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, if (err) goto err; - set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); - /* write dummy data page */ - recover_data_page(sbi, NULL, &sum, src, dest); - dn.data_blkaddr = dest; - update_extent_cache(&dn); + f2fs_replace_block(sbi, &dn, src, dest, + ni.version, false); recovered++; } dn.ofs_in_node++; } - /* write node page in place */ - set_summary(&sum, dn.nid, 0, 0); if (IS_INODE(dn.node_page)) sync_inode_page(&dn); @@ -442,7 +460,7 @@ static int recover_data(struct f2fs_sb_info *sbi, while (1) { struct fsync_inode_entry *entry; - if (blkaddr < MAIN_BLKADDR(sbi) || blkaddr >= MAX_BLKADDR(sbi)) + if (!is_valid_blkaddr(sbi, blkaddr, META_POR)) break; ra_meta_pages_cond(sbi, blkaddr); @@ -552,7 +570,7 @@ out: mutex_unlock(&sbi->cp_mutex); } else if (need_writecp) { struct cp_control cpc = { - .reason = CP_SYNC, + .reason = CP_RECOVERY, }; mutex_unlock(&sbi->cp_mutex); write_checkpoint(sbi, &cpc); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index daee4ab913da..61b97f9cb9f6 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -75,6 +75,14 @@ static inline unsigned long __reverse_ffs(unsigned long word) static unsigned long __find_rev_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { + while (!f2fs_test_bit(offset, (unsigned char *)addr)) + offset++; + + if (offset > size) + offset = size; + + return offset; +#if 0 const unsigned long *p = addr + BIT_WORD(offset); unsigned long result = offset & ~(BITS_PER_LONG - 1); unsigned long tmp; @@ -121,11 +129,20 @@ found_first: return result + size; /* Nope. 
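(The word-at-a-time scanner above is what the new #if 0 retires.) Its replacement at the top of the function walks the bitmap one bit at a time through f2fs_test_bit(), testing the bit before checking the bound and clamping to size only afterwards. A clamped variant of the same linear idea, with the bound checked inside the loop, would read as follows (a sketch for comparison, not the patch's code):

/* Linear "find next set bit": bound checked on every step. */
static unsigned long find_next_bit_linear(char *addr, unsigned long size,
					  unsigned long offset)
{
	while (offset < size && !f2fs_test_bit(offset, addr))
		offset++;
	return offset;		/* equals size when no bit is set */
}

The zero-bit search below gets the identical treatment with the test inverted.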
*/ found_middle: return result + __reverse_ffs(tmp); +#endif } static unsigned long __find_rev_next_zero_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { + while (f2fs_test_bit(offset, (unsigned char *)addr)) + offset++; + + if (offset > size) + offset = size; + + return offset; +#if 0 const unsigned long *p = addr + BIT_WORD(offset); unsigned long result = offset & ~(BITS_PER_LONG - 1); unsigned long tmp; @@ -173,6 +190,7 @@ found_first: return result + size; /* Nope. */ found_middle: return result + __reverse_ffz(tmp); +#endif } void register_inmem_page(struct inode *inode, struct page *page) @@ -205,6 +223,8 @@ retry: list_add_tail(&new->list, &fi->inmem_pages); inc_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); mutex_unlock(&fi->inmem_lock); + + trace_f2fs_register_inmem_page(page, INMEM); } void commit_inmem_pages(struct inode *inode, bool abort) @@ -214,8 +234,10 @@ void commit_inmem_pages(struct inode *inode, bool abort) struct inmem_pages *cur, *tmp; bool submit_bio = false; struct f2fs_io_info fio = { + .sbi = sbi, .type = DATA, .rw = WRITE_SYNC | REQ_PRIO, + .encrypted_page = NULL, }; /* @@ -235,14 +257,18 @@ void commit_inmem_pages(struct inode *inode, bool abort) if (!abort) { lock_page(cur->page); if (cur->page->mapping == inode->i_mapping) { + set_page_dirty(cur->page); f2fs_wait_on_page_writeback(cur->page, DATA); if (clear_page_dirty_for_io(cur->page)) inode_dec_dirty_pages(inode); - do_write_data_page(cur->page, &fio); + trace_f2fs_commit_inmem_page(cur->page, INMEM); + fio.page = cur->page; + do_write_data_page(&fio); submit_bio = true; } f2fs_put_page(cur->page, 1); } else { + trace_f2fs_commit_inmem_page(cur->page, INMEM_DROP); put_page(cur->page); } radix_tree_delete(&fi->inmem_root, cur->page->index); @@ -277,6 +303,9 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi) void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) { + /* try to shrink extent cache when there is no enough memory */ + f2fs_shrink_extent_tree(sbi, EXTENT_CACHE_SHRINK_NUMBER); + /* check the # of cached NAT entries and prefree segments */ if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK) || excess_prefree_segs(sbi) || @@ -459,22 +488,43 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi, { sector_t start = SECTOR_FROM_BLOCK(blkstart); sector_t len = SECTOR_FROM_BLOCK(blklen); + struct seg_entry *se; + unsigned int offset; + block_t i; + + for (i = blkstart; i < blkstart + blklen; i++) { + se = get_seg_entry(sbi, GET_SEGNO(sbi, i)); + offset = GET_BLKOFF_FROM_SEG0(sbi, i); + + if (!f2fs_test_and_set_bit(offset, se->discard_map)) + sbi->discard_blks--; + } trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0); } void discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr) { - if (f2fs_issue_discard(sbi, blkaddr, 1)) { - struct page *page = grab_meta_page(sbi, blkaddr); - /* zero-filled page */ - set_page_dirty(page); - f2fs_put_page(page, 1); + int err = -ENOTSUPP; + + if (test_opt(sbi, DISCARD)) { + struct seg_entry *se = get_seg_entry(sbi, + GET_SEGNO(sbi, blkaddr)); + unsigned int offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); + + if (f2fs_test_bit(offset, se->discard_map)) + return; + + err = f2fs_issue_discard(sbi, blkaddr, 1); } + + if (err) + update_meta_page(sbi, NULL, blkaddr); } static void __add_discard_entry(struct f2fs_sb_info *sbi, - struct cp_control *cpc, unsigned int start, unsigned int end) + struct cp_control *cpc, struct seg_entry *se, + unsigned int start, unsigned int end) { struct 
list_head *head = &SM_I(sbi)->discard_list; struct discard_entry *new, *last; @@ -495,7 +545,6 @@ static void __add_discard_entry(struct f2fs_sb_info *sbi, list_add_tail(&new->list, head); done: SM_I(sbi)->nr_discards += end - start; - cpc->trimmed += end - start; } static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) @@ -505,41 +554,24 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) struct seg_entry *se = get_seg_entry(sbi, cpc->trim_start); unsigned long *cur_map = (unsigned long *)se->cur_valid_map; unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; + unsigned long *discard_map = (unsigned long *)se->discard_map; unsigned long *dmap = SIT_I(sbi)->tmp_map; unsigned int start = 0, end = -1; bool force = (cpc->reason == CP_DISCARD); int i; - if (!force && (!test_opt(sbi, DISCARD) || - SM_I(sbi)->nr_discards >= SM_I(sbi)->max_discards)) + if (se->valid_blocks == max_blocks) return; - if (force && !se->valid_blocks) { - struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - /* - * if this segment is registered in the prefree list, then - * we should skip adding a discard candidate, and let the - * checkpoint do that later. - */ - mutex_lock(&dirty_i->seglist_lock); - if (test_bit(cpc->trim_start, dirty_i->dirty_segmap[PRE])) { - mutex_unlock(&dirty_i->seglist_lock); - cpc->trimmed += sbi->blocks_per_seg; + if (!force) { + if (!test_opt(sbi, DISCARD) || !se->valid_blocks || + SM_I(sbi)->nr_discards >= SM_I(sbi)->max_discards) return; - } - mutex_unlock(&dirty_i->seglist_lock); - - __add_discard_entry(sbi, cpc, 0, sbi->blocks_per_seg); - return; } - /* zero block will be discarded through the prefree list */ - if (!se->valid_blocks || se->valid_blocks == max_blocks) - return; - /* SIT_VBLOCK_MAP_SIZE should be multiple of sizeof(unsigned long) */ for (i = 0; i < entries; i++) - dmap[i] = force ? ~ckpt_map[i] : + dmap[i] = force ? 
~ckpt_map[i] & ~discard_map[i] : (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i]; while (force || SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) { @@ -548,11 +580,7 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) break; end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1); - - if (end - start < cpc->trim_minlen) - continue; - - __add_discard_entry(sbi, cpc, start, end); + __add_discard_entry(sbi, cpc, se, start, end); } } @@ -582,7 +610,7 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) mutex_unlock(&dirty_i->seglist_lock); } -void clear_prefree_segments(struct f2fs_sb_info *sbi) +void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct list_head *head = &(SM_I(sbi)->discard_list); struct discard_entry *entry, *this; @@ -615,7 +643,11 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi) /* send small discards */ list_for_each_entry_safe(entry, this, head, list) { + if (cpc->reason == CP_DISCARD && entry->len < cpc->trim_minlen) + goto skip; f2fs_issue_discard(sbi, entry->blkaddr, entry->len); + cpc->trimmed += entry->len; +skip: list_del(&entry->list); SM_I(sbi)->nr_discards -= entry->len; kmem_cache_free(discard_entry_slab, entry); @@ -666,9 +698,13 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) if (del > 0) { if (f2fs_test_and_set_bit(offset, se->cur_valid_map)) f2fs_bug_on(sbi, 1); + if (!f2fs_test_and_set_bit(offset, se->discard_map)) + sbi->discard_blks--; } else { if (!f2fs_test_and_clear_bit(offset, se->cur_valid_map)) f2fs_bug_on(sbi, 1); + if (f2fs_test_and_clear_bit(offset, se->discard_map)) + sbi->discard_blks++; } if (!f2fs_test_bit(offset, se->ckpt_valid_map)) se->ckpt_valid_blocks += del; @@ -762,16 +798,25 @@ struct page *get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno) return get_meta_page(sbi, GET_SUM_BLOCK(sbi, segno)); } -static void write_sum_page(struct f2fs_sb_info *sbi, - struct f2fs_summary_block *sum_blk, block_t blk_addr) +void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr) { struct page *page = grab_meta_page(sbi, blk_addr); - void *kaddr = page_address(page); - memcpy(kaddr, sum_blk, PAGE_CACHE_SIZE); + void *dst = page_address(page); + + if (src) + memcpy(dst, src, PAGE_CACHE_SIZE); + else + memset(dst, 0, PAGE_CACHE_SIZE); set_page_dirty(page); f2fs_put_page(page, 1); } +static void write_sum_page(struct f2fs_sb_info *sbi, + struct f2fs_summary_block *sum_blk, block_t blk_addr) +{ + update_meta_page(sbi, (void *)sum_blk, blk_addr); +} + static int is_next_segment_free(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -1053,8 +1098,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) unsigned int start_segno, end_segno; struct cp_control cpc; - if (range->minlen > SEGMENT_SIZE(sbi) || start >= MAX_BLKADDR(sbi) || - range->len < sbi->blocksize) + if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize) return -EINVAL; cpc.trimmed = 0; @@ -1066,12 +1110,19 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) end_segno = (end >= MAX_BLKADDR(sbi)) ? 
MAIN_SEGS(sbi) - 1 : GET_SEGNO(sbi, end); cpc.reason = CP_DISCARD; - cpc.trim_minlen = F2FS_BYTES_TO_BLK(range->minlen); + cpc.trim_minlen = max_t(__u64, 1, F2FS_BYTES_TO_BLK(range->minlen)); /* do checkpoint to issue discard commands safely */ for (; start_segno <= end_segno; start_segno = cpc.trim_end + 1) { cpc.trim_start = start_segno; - cpc.trim_end = min_t(unsigned int, rounddown(start_segno + + + if (sbi->discard_blks == 0) + break; + else if (sbi->discard_blks < BATCHED_TRIM_BLOCKS(sbi)) + cpc.trim_end = end_segno; + else + cpc.trim_end = min_t(unsigned int, + rounddown(start_segno + BATCHED_TRIM_SEGMENTS(sbi), sbi->segs_per_sec) - 1, end_segno); @@ -1164,6 +1215,7 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); + mutex_lock(&sit_i->sentry_lock); /* direct_io'ed data is aligned to the segment for better performance */ if (direct_io && curseg->next_blkoff) @@ -1178,7 +1230,6 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, */ __add_sum_entry(sbi, type, sum); - mutex_lock(&sit_i->sentry_lock); __refresh_next_blkoff(sbi, curseg); stat_inc_block_count(sbi, curseg); @@ -1199,84 +1250,95 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, mutex_unlock(&curseg->curseg_mutex); } -static void do_write_page(struct f2fs_sb_info *sbi, struct page *page, - struct f2fs_summary *sum, - struct f2fs_io_info *fio) +static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) { - int type = __get_segment_type(page, fio->type); + int type = __get_segment_type(fio->page, fio->type); - allocate_data_block(sbi, page, fio->blk_addr, &fio->blk_addr, sum, type); + allocate_data_block(fio->sbi, fio->page, fio->blk_addr, + &fio->blk_addr, sum, type); /* writeout dirty page into bdev */ - f2fs_submit_page_mbio(sbi, page, fio); + f2fs_submit_page_mbio(fio); } void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) { struct f2fs_io_info fio = { + .sbi = sbi, .type = META, .rw = WRITE_SYNC | REQ_META | REQ_PRIO, .blk_addr = page->index, + .page = page, + .encrypted_page = NULL, }; set_page_writeback(page); - f2fs_submit_page_mbio(sbi, page, &fio); + f2fs_submit_page_mbio(&fio); } -void write_node_page(struct f2fs_sb_info *sbi, struct page *page, - unsigned int nid, struct f2fs_io_info *fio) +void write_node_page(unsigned int nid, struct f2fs_io_info *fio) { struct f2fs_summary sum; + set_summary(&sum, nid, 0, 0); - do_write_page(sbi, page, &sum, fio); + do_write_page(&sum, fio); } -void write_data_page(struct page *page, struct dnode_of_data *dn, - struct f2fs_io_info *fio) +void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio) { - struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + struct f2fs_sb_info *sbi = fio->sbi; struct f2fs_summary sum; struct node_info ni; f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR); get_node_info(sbi, dn->nid, &ni); set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); - do_write_page(sbi, page, &sum, fio); + do_write_page(&sum, fio); dn->data_blkaddr = fio->blk_addr; } -void rewrite_data_page(struct page *page, struct f2fs_io_info *fio) +void rewrite_data_page(struct f2fs_io_info *fio) { - stat_inc_inplace_blocks(F2FS_P_SB(page)); - f2fs_submit_page_mbio(F2FS_P_SB(page), page, fio); + stat_inc_inplace_blocks(fio->sbi); + f2fs_submit_page_mbio(fio); } -void recover_data_page(struct f2fs_sb_info *sbi, - struct page *page, struct f2fs_summary *sum, - block_t old_blkaddr, block_t new_blkaddr) +static void 
__f2fs_replace_block(struct f2fs_sb_info *sbi, + struct f2fs_summary *sum, + block_t old_blkaddr, block_t new_blkaddr, + bool recover_curseg) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg; unsigned int segno, old_cursegno; struct seg_entry *se; int type; + unsigned short old_blkoff; segno = GET_SEGNO(sbi, new_blkaddr); se = get_seg_entry(sbi, segno); type = se->type; - if (se->valid_blocks == 0 && !IS_CURSEG(sbi, segno)) { - if (old_blkaddr == NULL_ADDR) - type = CURSEG_COLD_DATA; - else + if (!recover_curseg) { + /* for recovery flow */ + if (se->valid_blocks == 0 && !IS_CURSEG(sbi, segno)) { + if (old_blkaddr == NULL_ADDR) + type = CURSEG_COLD_DATA; + else + type = CURSEG_WARM_DATA; + } + } else { + if (!IS_CURSEG(sbi, segno)) type = CURSEG_WARM_DATA; } + curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); mutex_lock(&sit_i->sentry_lock); old_cursegno = curseg->segno; + old_blkoff = curseg->next_blkoff; /* change the current segment */ if (segno != curseg->segno) { @@ -1290,30 +1352,67 @@ void recover_data_page(struct f2fs_sb_info *sbi, refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); locate_dirty_segment(sbi, old_cursegno); + if (recover_curseg) { + if (old_cursegno != curseg->segno) { + curseg->next_segno = old_cursegno; + change_curseg(sbi, type, true); + } + curseg->next_blkoff = old_blkoff; + } + mutex_unlock(&sit_i->sentry_lock); mutex_unlock(&curseg->curseg_mutex); } +void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, + block_t old_addr, block_t new_addr, + unsigned char version, bool recover_curseg) +{ + struct f2fs_summary sum; + + set_summary(&sum, dn->nid, dn->ofs_in_node, version); + + __f2fs_replace_block(sbi, &sum, old_addr, new_addr, recover_curseg); + + dn->data_blkaddr = new_addr; + set_data_blkaddr(dn); + f2fs_update_extent_cache(dn); +} + static inline bool is_merged_page(struct f2fs_sb_info *sbi, struct page *page, enum page_type type) { enum page_type btype = PAGE_TYPE_OF_BIO(type); struct f2fs_bio_info *io = &sbi->write_io[btype]; struct bio_vec *bvec; + struct page *target; int i; down_read(&io->io_rwsem); - if (!io->bio) - goto out; + if (!io->bio) { + up_read(&io->io_rwsem); + return false; + } bio_for_each_segment_all(bvec, io->bio, i) { - if (page == bvec->bv_page) { + + if (bvec->bv_page->mapping) { + target = bvec->bv_page; + } else { + struct f2fs_crypto_ctx *ctx; + + /* encrypted page */ + ctx = (struct f2fs_crypto_ctx *)page_private( + bvec->bv_page); + target = ctx->w.control_page; + } + + if (page == target) { up_read(&io->io_rwsem); return true; } } -out: up_read(&io->io_rwsem); return false; } @@ -1730,6 +1829,9 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) mutex_lock(&curseg->curseg_mutex); mutex_lock(&sit_i->sentry_lock); + if (!sit_i->dirty_sentries) + goto out; + /* * add and account sit entries of dirty bitmap in sit entry * set temporarily @@ -1744,9 +1846,6 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) if (!__has_cursum_space(sum, sit_i->dirty_sentries, SIT_JOURNAL)) remove_sits_in_journal(sbi); - if (!sit_i->dirty_sentries) - goto out; - /* * there are two steps to flush sit entries: * #1, flush sit entries to journal in current cold data summary block. 
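Taken on its own, the recover_curseg handling added to __f2fs_replace_block() above is a plain save, repoint, restore idiom around a temporary segment switch. Condensed, it reduces to roughly this (locking, summary writing and SIT refresh elided; the names come from the diff, but the function itself is only a sketch):

static void replace_block_sketch(struct f2fs_sb_info *sbi,
				 struct curseg_info *curseg,
				 unsigned int segno, int type,
				 bool recover_curseg)
{
	unsigned int old_cursegno = curseg->segno;
	unsigned short old_blkoff = curseg->next_blkoff;

	if (segno != curseg->segno) {
		curseg->next_segno = segno;
		change_curseg(sbi, type, true);	/* temporary switch */
	}

	/* ... __add_sum_entry() and refresh_sit_entry() run here ... */

	if (recover_curseg) {
		if (old_cursegno != curseg->segno) {
			curseg->next_segno = old_cursegno;
			change_curseg(sbi, type, true);	/* switch back */
		}
		curseg->next_blkoff = old_blkoff;
	}
}

The roll-forward recovery call site passes recover_curseg = false, so recovered blocks leave the allocator pointing at the segment they landed in, while a caller that wants the allocation cursor untouched would pass true to get it back.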
@@ -1850,8 +1949,11 @@ static int build_sit_info(struct f2fs_sb_info *sbi) = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); sit_i->sentries[start].ckpt_valid_map = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); - if (!sit_i->sentries[start].cur_valid_map - || !sit_i->sentries[start].ckpt_valid_map) + sit_i->sentries[start].discard_map + = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); + if (!sit_i->sentries[start].cur_valid_map || + !sit_i->sentries[start].ckpt_valid_map || + !sit_i->sentries[start].discard_map) return -ENOMEM; } @@ -1989,6 +2091,11 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) got_it: check_block_count(sbi, start, &sit); seg_info_from_raw_sit(se, &sit); + + /* build discard map only one time */ + memcpy(se->discard_map, se->cur_valid_map, SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += sbi->blocks_per_seg - se->valid_blocks; + if (sbi->segs_per_sec > 1) { struct sec_entry *e = get_sec_entry(sbi, start); e->valid_blocks += se->valid_blocks; @@ -2238,6 +2345,7 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi) for (start = 0; start < MAIN_SEGS(sbi); start++) { kfree(sit_i->sentries[start].cur_valid_map); kfree(sit_i->sentries[start].ckpt_valid_map); + kfree(sit_i->sentries[start].discard_map); } } kfree(sit_i->tmp_map); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 7fd35111cf62..79e7b879a753 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -9,6 +9,7 @@ * published by the Free Software Foundation. */ #include <linux/blkdev.h> +#include <linux/backing-dev.h> /* constant macro */ #define NULL_SEGNO ((unsigned int)(~0)) @@ -163,6 +164,7 @@ struct seg_entry { */ unsigned short ckpt_valid_blocks; unsigned char *ckpt_valid_map; + unsigned char *discard_map; unsigned char type; /* segment type like CURSEG_XXX_TYPE */ unsigned long long mtime; /* modification time of the segment */ }; @@ -336,7 +338,8 @@ static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno) clear_bit(segno, free_i->free_segmap); free_i->free_segments++; - next = find_next_bit(free_i->free_segmap, MAIN_SEGS(sbi), start_segno); + next = find_next_bit(free_i->free_segmap, + start_segno + sbi->segs_per_sec, start_segno); if (next >= start_segno + sbi->segs_per_sec) { clear_bit(secno, free_i->free_secmap); free_i->free_sections++; @@ -712,7 +715,7 @@ static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi) */ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) { - if (sbi->sb->s_bdi->dirty_exceeded) + if (sbi->sb->s_bdi->wb.dirty_exceeded) return 0; if (type == DATA) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index f2fe666a6ea9..a06b0b46fe69 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -57,6 +57,8 @@ enum { Opt_flush_merge, Opt_nobarrier, Opt_fastboot, + Opt_extent_cache, + Opt_noinline_data, Opt_err, }; @@ -78,6 +80,8 @@ static match_table_t f2fs_tokens = { {Opt_flush_merge, "flush_merge"}, {Opt_nobarrier, "nobarrier"}, {Opt_fastboot, "fastboot"}, + {Opt_extent_cache, "extent_cache"}, + {Opt_noinline_data, "noinline_data"}, {Opt_err, NULL}, }; @@ -254,6 +258,7 @@ static void init_once(void *foo) static int parse_options(struct super_block *sb, char *options) { struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct request_queue *q; substring_t args[MAX_OPT_ARGS]; char *p, *name; int arg = 0; @@ -298,7 +303,14 @@ static int parse_options(struct super_block *sb, char *options) return -EINVAL; break; case Opt_discard: - set_opt(sbi, DISCARD); + q = bdev_get_queue(sb->s_bdev); + if (blk_queue_discard(q)) { + set_opt(sbi, DISCARD); + } else { + 
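/*
 * The mount still succeeds on this branch: DISCARD simply stays clear,
 * so checkpoint-time paths that test test_opt(sbi, DISCARD) skip
 * issuing discards, though an explicit fstrim may still attempt them.
 * The message below is a warning, not an error.
 */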
f2fs_msg(sb, KERN_WARNING, + "mounting with \"discard\" option, but " + "the device does not support discard"); + } break; case Opt_noheap: set_opt(sbi, NOHEAP); @@ -367,6 +379,12 @@ static int parse_options(struct super_block *sb, char *options) case Opt_fastboot: set_opt(sbi, FASTBOOT); break; + case Opt_extent_cache: + set_opt(sbi, EXTENT_CACHE); + break; + case Opt_noinline_data: + clear_opt(sbi, INLINE_DATA); + break; default: f2fs_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" or missing value", @@ -392,7 +410,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) atomic_set(&fi->dirty_pages, 0); fi->i_current_depth = 1; fi->i_advise = 0; - rwlock_init(&fi->ext.ext_lock); + rwlock_init(&fi->ext_lock); init_rwsem(&fi->i_sem); INIT_RADIX_TREE(&fi->inmem_root, GFP_NOFS); INIT_LIST_HEAD(&fi->inmem_pages); @@ -406,6 +424,9 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) /* Will be used by directory only */ fi->i_dir_level = F2FS_SB(sb)->dir_level; +#ifdef CONFIG_F2FS_FS_ENCRYPTION + fi->i_crypt_info = NULL; +#endif return &fi->vfs_inode; } @@ -418,8 +439,31 @@ static int f2fs_drop_inode(struct inode *inode) * - f2fs_gc -> iput -> evict * - inode_wait_for_writeback(inode) */ - if (!inode_unhashed(inode) && inode->i_state & I_SYNC) + if (!inode_unhashed(inode) && inode->i_state & I_SYNC) { + if (!inode->i_nlink && !is_bad_inode(inode)) { + spin_unlock(&inode->i_lock); + + /* some remained atomic pages should discarded */ + if (f2fs_is_atomic_file(inode)) + commit_inmem_pages(inode, true); + + sb_start_intwrite(inode->i_sb); + i_size_write(inode, 0); + + if (F2FS_HAS_BLOCKS(inode)) + f2fs_truncate(inode); + + sb_end_intwrite(inode->i_sb); + +#ifdef CONFIG_F2FS_FS_ENCRYPTION + if (F2FS_I(inode)->i_crypt_info) + f2fs_free_encryption_info(inode, + F2FS_I(inode)->i_crypt_info); +#endif + spin_lock(&inode->i_lock); + } return 0; + } return generic_drop_inode(inode); } @@ -510,7 +554,7 @@ int f2fs_sync_fs(struct super_block *sb, int sync) } else { f2fs_balance_fs(sbi); } - f2fs_trace_ios(NULL, NULL, 1); + f2fs_trace_ios(NULL, 1); return 0; } @@ -591,6 +635,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",disable_ext_identify"); if (test_opt(sbi, INLINE_DATA)) seq_puts(seq, ",inline_data"); + else + seq_puts(seq, ",noinline_data"); if (test_opt(sbi, INLINE_DENTRY)) seq_puts(seq, ",inline_dentry"); if (!f2fs_readonly(sbi->sb) && test_opt(sbi, FLUSH_MERGE)) @@ -599,6 +645,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",nobarrier"); if (test_opt(sbi, FASTBOOT)) seq_puts(seq, ",fastboot"); + if (test_opt(sbi, EXTENT_CACHE)) + seq_puts(seq, ",extent_cache"); seq_printf(seq, ",active_logs=%u", sbi->active_logs); return 0; @@ -644,6 +692,22 @@ static const struct file_operations f2fs_seq_segment_info_fops = { .release = single_release, }; +static void default_options(struct f2fs_sb_info *sbi) +{ + /* init some FS parameters */ + sbi->active_logs = NR_CURSEG_TYPE; + + set_opt(sbi, BG_GC); + set_opt(sbi, INLINE_DATA); + +#ifdef CONFIG_F2FS_FS_XATTR + set_opt(sbi, XATTR_USER); +#endif +#ifdef CONFIG_F2FS_FS_POSIX_ACL + set_opt(sbi, POSIX_ACL); +#endif +} + static int f2fs_remount(struct super_block *sb, int *flags, char *data) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -662,7 +726,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) active_logs = sbi->active_logs; sbi->mount_opt.opt = 0; - sbi->active_logs = NR_CURSEG_TYPE; + default_options(sbi); /* parse mount 
options */ err = parse_options(sb, data); @@ -915,29 +979,36 @@ static void init_sb_info(struct f2fs_sb_info *sbi) */ static int read_raw_super_block(struct super_block *sb, struct f2fs_super_block **raw_super, - struct buffer_head **raw_super_buf) + struct buffer_head **raw_super_buf, + int *recovery) { int block = 0; + struct buffer_head *buffer; + struct f2fs_super_block *super; + int err = 0; retry: - *raw_super_buf = sb_bread(sb, block); - if (!*raw_super_buf) { + buffer = sb_bread(sb, block); + if (!buffer) { + *recovery = 1; f2fs_msg(sb, KERN_ERR, "Unable to read %dth superblock", block + 1); if (block == 0) { block++; goto retry; } else { - return -EIO; + err = -EIO; + goto out; } } - *raw_super = (struct f2fs_super_block *) - ((char *)(*raw_super_buf)->b_data + F2FS_SUPER_OFFSET); + super = (struct f2fs_super_block *) + ((char *)(buffer)->b_data + F2FS_SUPER_OFFSET); /* sanity checking of raw super */ - if (sanity_check_raw_super(sb, *raw_super)) { - brelse(*raw_super_buf); + if (sanity_check_raw_super(sb, super)) { + brelse(buffer); + *recovery = 1; f2fs_msg(sb, KERN_ERR, "Can't find valid F2FS filesystem in %dth superblock", block + 1); @@ -945,25 +1016,76 @@ retry: block++; goto retry; } else { - return -EINVAL; + err = -EINVAL; + goto out; } } + if (!*raw_super) { + *raw_super_buf = buffer; + *raw_super = super; + } else { + /* already have a valid superblock */ + brelse(buffer); + } + + /* check the validity of the second superblock */ + if (block == 0) { + block++; + goto retry; + } + +out: + /* No valid superblock */ + if (!*raw_super) + return err; + return 0; } +int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) +{ + struct buffer_head *sbh = sbi->raw_super_buf; + sector_t block = sbh->b_blocknr; + int err; + + /* write back-up superblock first */ + sbh->b_blocknr = block ? 
0 : 1; + mark_buffer_dirty(sbh); + err = sync_dirty_buffer(sbh); + + sbh->b_blocknr = block; + + /* if we are in recovery path, skip writing valid superblock */ + if (recover || err) + goto out; + + /* write current valid superblock */ + mark_buffer_dirty(sbh); + err = sync_dirty_buffer(sbh); +out: + clear_buffer_write_io_error(sbh); + set_buffer_uptodate(sbh); + return err; +} + static int f2fs_fill_super(struct super_block *sb, void *data, int silent) { struct f2fs_sb_info *sbi; - struct f2fs_super_block *raw_super = NULL; + struct f2fs_super_block *raw_super; struct buffer_head *raw_super_buf; struct inode *root; - long err = -EINVAL; - bool retry = true; + long err; + bool retry = true, need_fsck = false; char *options = NULL; - int i; + int recovery, i; try_onemore: + err = -EINVAL; + raw_super = NULL; + raw_super_buf = NULL; + recovery = 0; + /* allocate memory for f2fs-specific super block info */ sbi = kzalloc(sizeof(struct f2fs_sb_info), GFP_KERNEL); if (!sbi) @@ -975,22 +1097,12 @@ try_onemore: goto free_sbi; } - err = read_raw_super_block(sb, &raw_super, &raw_super_buf); + err = read_raw_super_block(sb, &raw_super, &raw_super_buf, &recovery); if (err) goto free_sbi; sb->s_fs_info = sbi; - /* init some FS parameters */ - sbi->active_logs = NR_CURSEG_TYPE; - - set_opt(sbi, BG_GC); - -#ifdef CONFIG_F2FS_FS_XATTR - set_opt(sbi, XATTR_USER); -#endif -#ifdef CONFIG_F2FS_FS_POSIX_ACL - set_opt(sbi, POSIX_ACL); -#endif + default_options(sbi); /* parse mount options */ options = kstrdup((const char *)data, GFP_KERNEL); if (data && !options) { @@ -1072,6 +1184,8 @@ try_onemore: INIT_LIST_HEAD(&sbi->dir_inode_list); spin_lock_init(&sbi->dir_inode_lock); + init_extent_cache_info(sbi); + init_ino_entry_info(sbi); /* setup f2fs internal modules */ @@ -1131,14 +1245,6 @@ try_onemore: proc_create_data("segment_info", S_IRUGO, sbi->s_proc, &f2fs_seq_segment_info_fops, sb); - if (test_opt(sbi, DISCARD)) { - struct request_queue *q = bdev_get_queue(sb->s_bdev); - if (!blk_queue_discard(q)) - f2fs_msg(sb, KERN_WARNING, - "mounting with \"discard\" option, but " - "the device does not support discard"); - } - sbi->s_kobj.kset = f2fs_kset; init_completion(&sbi->s_kobj_unregister); err = kobject_init_and_add(&sbi->s_kobj, &f2fs_ktype, NULL, @@ -1146,9 +1252,6 @@ try_onemore: if (err) goto free_proc; - if (!retry) - set_sbi_flag(sbi, SBI_NEED_FSCK); - /* recover fsynced data */ if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { /* @@ -1160,8 +1263,13 @@ try_onemore: err = -EROFS; goto free_kobj; } + + if (need_fsck) + set_sbi_flag(sbi, SBI_NEED_FSCK); + err = recover_fsync_data(sbi); if (err) { + need_fsck = true; f2fs_msg(sb, KERN_ERR, "Cannot recover all fsync data errno=%ld", err); goto free_kobj; @@ -1179,6 +1287,13 @@ try_onemore: goto free_kobj; } kfree(options); + + /* recover broken superblock */ + if (recovery && !f2fs_readonly(sb) && !bdev_read_only(sb->s_bdev)) { + f2fs_msg(sb, KERN_INFO, "Recover invalid superblock"); + f2fs_commit_super(sbi, true); + } + return 0; free_kobj: @@ -1212,7 +1327,7 @@ free_sbi: /* give only one another chance */ if (retry) { - retry = 0; + retry = false; shrink_dcache_sb(sb); goto try_onemore; } @@ -1278,20 +1393,30 @@ static int __init init_f2fs_fs(void) err = create_checkpoint_caches(); if (err) goto free_segment_manager_caches; + err = create_extent_cache(); + if (err) + goto free_checkpoint_caches; f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj); if (!f2fs_kset) { err = -ENOMEM; - goto free_checkpoint_caches; + goto free_extent_cache; } - err = 
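read_raw_super_block() now validates both on-disk superblock copies, keeps the first valid one, and sets *recovery when either copy was unreadable or failed the sanity check; a successful read-write mount then calls f2fs_commit_super(sbi, true) to rewrite the damaged copy. The retargeting trick in f2fs_commit_super() deserves spelling out: the same buffer_head is pointed at the other block just long enough to write the back-up, then restored. Condensed from the hunk above, with error handling trimmed (the two copies sit in blocks 0 and 1):

/* Condensed sketch of the f2fs_commit_super() write ordering. */
static int commit_both_supers(struct buffer_head *sbh, bool recover)
{
        sector_t blk = sbh->b_blocknr;
        int err;

        sbh->b_blocknr = blk ? 0 : 1;   /* aim at the other copy first */
        mark_buffer_dirty(sbh);
        err = sync_dirty_buffer(sbh);   /* back-up copy hits disk      */

        sbh->b_blocknr = blk;           /* restore the real target     */
        if (recover || err)
                return err;             /* recovery path stops here    */

        mark_buffer_dirty(sbh);         /* then the known-valid copy   */
        return sync_dirty_buffer(sbh);
}

Writing the back-up first means a crash in mid-update still leaves one consistent superblock on disk.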
register_filesystem(&f2fs_fs_type); + err = f2fs_init_crypto(); if (err) goto free_kset; + err = register_filesystem(&f2fs_fs_type); + if (err) + goto free_crypto; f2fs_create_root_stats(); f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); return 0; +free_crypto: + f2fs_exit_crypto(); free_kset: kset_unregister(f2fs_kset); +free_extent_cache: + destroy_extent_cache(); free_checkpoint_caches: destroy_checkpoint_caches(); free_segment_manager_caches: @@ -1309,6 +1434,8 @@ static void __exit exit_f2fs_fs(void) remove_proc_entry("fs/f2fs", NULL); f2fs_destroy_root_stats(); unregister_filesystem(&f2fs_fs_type); + f2fs_exit_crypto(); + destroy_extent_cache(); destroy_checkpoint_caches(); destroy_segment_manager_caches(); destroy_node_manager_caches(); diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c index 875aa8179bc1..145fb659ad44 100644 --- a/fs/f2fs/trace.c +++ b/fs/f2fs/trace.c @@ -80,7 +80,7 @@ out: radix_tree_preload_end(); } -void f2fs_trace_ios(struct page *page, struct f2fs_io_info *fio, int flush) +void f2fs_trace_ios(struct f2fs_io_info *fio, int flush) { struct inode *inode; pid_t pid; @@ -91,8 +91,8 @@ void f2fs_trace_ios(struct page *page, struct f2fs_io_info *fio, int flush) return; } - inode = page->mapping->host; - pid = page_private(page); + inode = fio->page->mapping->host; + pid = page_private(fio->page); major = MAJOR(inode->i_sb->s_dev); minor = MINOR(inode->i_sb->s_dev); diff --git a/fs/f2fs/trace.h b/fs/f2fs/trace.h index 1041dbeb52ae..67db24ac1e85 100644 --- a/fs/f2fs/trace.h +++ b/fs/f2fs/trace.h @@ -33,12 +33,12 @@ struct last_io_info { }; extern void f2fs_trace_pid(struct page *); -extern void f2fs_trace_ios(struct page *, struct f2fs_io_info *, int); +extern void f2fs_trace_ios(struct f2fs_io_info *, int); extern void f2fs_build_trace_ios(void); extern void f2fs_destroy_trace_ios(void); #else #define f2fs_trace_pid(p) -#define f2fs_trace_ios(p, i, n) +#define f2fs_trace_ios(i, n) #define f2fs_build_trace_ios() #define f2fs_destroy_trace_ios() diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 5072bf9ae0ef..07449b980acb 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -83,7 +83,7 @@ static int f2fs_xattr_generic_get(struct dentry *dentry, const char *name, } if (strcmp(name, "") == 0) return -EINVAL; - return f2fs_getxattr(dentry->d_inode, type, name, buffer, size, NULL); + return f2fs_getxattr(d_inode(dentry), type, name, buffer, size, NULL); } static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name, @@ -108,7 +108,7 @@ static int f2fs_xattr_generic_set(struct dentry *dentry, const char *name, if (strcmp(name, "") == 0) return -EINVAL; - return f2fs_setxattr(dentry->d_inode, type, name, + return f2fs_setxattr(d_inode(dentry), type, name, value, size, NULL, flags); } @@ -130,19 +130,20 @@ static size_t f2fs_xattr_advise_list(struct dentry *dentry, char *list, static int f2fs_xattr_advise_get(struct dentry *dentry, const char *name, void *buffer, size_t size, int type) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); if (strcmp(name, "") != 0) return -EINVAL; - *((char *)buffer) = F2FS_I(inode)->i_advise; + if (buffer) + *((char *)buffer) = F2FS_I(inode)->i_advise; return sizeof(char); } static int f2fs_xattr_advise_set(struct dentry *dentry, const char *name, const void *value, size_t size, int flags, int type) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); if (strcmp(name, "") != 0) return -EINVAL; @@ -152,6 +153,7 @@ static int f2fs_xattr_advise_set(struct dentry *dentry, 
const char *name, return -EINVAL; F2FS_I(inode)->i_advise |= *(char *)value; + mark_inode_dirty(inode); return 0; } @@ -442,7 +444,7 @@ cleanup: ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct f2fs_xattr_entry *entry; void *base_addr; int error = 0; @@ -582,6 +584,9 @@ static int __f2fs_setxattr(struct inode *inode, int index, inode->i_ctime = CURRENT_TIME; clear_inode_flag(fi, FI_ACL_MODE); } + if (index == F2FS_XATTR_INDEX_ENCRYPTION && + !strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT)) + f2fs_set_encrypted_inode(inode); if (ipage) update_inode(inode, ipage); diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index 969d792ca362..71a7100d5492 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -35,6 +35,10 @@ #define F2FS_XATTR_INDEX_LUSTRE 5 #define F2FS_XATTR_INDEX_SECURITY 6 #define F2FS_XATTR_INDEX_ADVISE 7 +/* Should be same as EXT4_XATTR_INDEX_ENCRYPTION */ +#define F2FS_XATTR_INDEX_ENCRYPTION 9 + +#define F2FS_XATTR_NAME_ENCRYPTION_CONTEXT "c" struct f2fs_xattr_header { __le32 h_magic; /* magic number for identification */ diff --git a/fs/fat/cache.c b/fs/fat/cache.c index 91ad9e1c9441..93fc62232ec2 100644 --- a/fs/fat/cache.c +++ b/fs/fat/cache.c @@ -8,9 +8,7 @@ * May 1999. AV. Fixed the bogosity with FAT32 (read "FAT28"). Fscking lusers. */ -#include <linux/fs.h> #include <linux/slab.h> -#include <linux/buffer_head.h> #include "fat.h" /* this must be > 0. */ diff --git a/fs/fat/dir.c b/fs/fat/dir.c index c5d6bb939d19..4afc4d9d2e41 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -13,13 +13,9 @@ * Short name translation 1999, 2001 by Wolfram Pienkoss <wp@bszh.de> */ -#include <linux/module.h> #include <linux/slab.h> -#include <linux/time.h> -#include <linux/buffer_head.h> #include <linux/compat.h> #include <linux/uaccess.h> -#include <linux/kernel.h> #include "fat.h" /* diff --git a/fs/fat/fat.h b/fs/fat/fat.h index 64e295e8ff38..be5e15323bab 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -2,11 +2,8 @@ #define _FAT_H #include <linux/buffer_head.h> -#include <linux/string.h> #include <linux/nls.h> -#include <linux/fs.h> #include <linux/hash.h> -#include <linux/mutex.h> #include <linux/ratelimit.h> #include <linux/msdos_fs.h> @@ -66,7 +63,7 @@ struct msdos_sb_info { unsigned short sec_per_clus; /* sectors/cluster */ unsigned short cluster_bits; /* log2(cluster_size) */ unsigned int cluster_size; /* cluster size */ - unsigned char fats, fat_bits; /* number of FATs, FAT bits (12 or 16) */ + unsigned char fats, fat_bits; /* number of FATs, FAT bits (12,16 or 32) */ unsigned short fat_start; unsigned long fat_length; /* FAT start & length (sec.) */ unsigned long dir_start; diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index 260705c58062..8226557130a2 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -3,9 +3,6 @@ * Released under GPL v2. 
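The "if (buffer)" guard added to f2fs_xattr_advise_get() above matters because getxattr(2) is conventionally a two-step call: userspace first passes a NULL buffer to learn the value's size, then calls again with real storage. A userspace sketch of that probe-then-fetch pattern (the attribute name and helper are only examples):

#include <stdlib.h>
#include <sys/types.h>
#include <sys/xattr.h>

/* Probe-then-fetch: size the value first, then copy it out. */
static void *read_xattr(const char *path, const char *name, ssize_t *len)
{
        ssize_t n = getxattr(path, name, NULL, 0);      /* size probe */
        void *buf;

        if (n < 0)
                return NULL;
        buf = malloc(n);
        if (!buf)
                return NULL;
        *len = getxattr(path, name, buf, n);            /* real fetch */
        return buf;
}

Without the guard, the size-probe call would have dereferenced a NULL buffer.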
*/ -#include <linux/module.h> -#include <linux/fs.h> -#include <linux/msdos_fs.h> #include <linux/blkdev.h> #include "fat.h" diff --git a/fs/fat/file.c b/fs/fat/file.c index 8429c68e3057..a08f1039909a 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -10,11 +10,8 @@ #include <linux/module.h> #include <linux/compat.h> #include <linux/mount.h> -#include <linux/time.h> -#include <linux/buffer_head.h> -#include <linux/writeback.h> -#include <linux/backing-dev.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/fsnotify.h> #include <linux/security.h> #include "fat.h" @@ -170,8 +167,6 @@ int fat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) const struct file_operations fat_file_operations = { .llseek = generic_file_llseek, - .read = new_sync_read, - .write = new_sync_write, .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, @@ -311,7 +306,7 @@ void fat_truncate_blocks(struct inode *inode, loff_t offset) int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); generic_fillattr(inode, stat); stat->blksize = MSDOS_SB(inode->i_sb)->cluster_size; @@ -383,7 +378,7 @@ static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode) int fat_setattr(struct dentry *dentry, struct iattr *attr) { struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb); - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); unsigned int ia_valid; int error; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 497c7c5263c7..509411dd3698 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -11,22 +11,14 @@ */ #include <linux/module.h> -#include <linux/init.h> -#include <linux/time.h> -#include <linux/slab.h> -#include <linux/seq_file.h> #include <linux/pagemap.h> #include <linux/mpage.h> -#include <linux/buffer_head.h> -#include <linux/mount.h> -#include <linux/aio.h> #include <linux/vfs.h> +#include <linux/seq_file.h> #include <linux/parser.h> #include <linux/uio.h> -#include <linux/writeback.h> -#include <linux/log2.h> -#include <linux/hash.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <asm/unaligned.h> #include "fat.h" @@ -246,8 +238,7 @@ static int fat_write_end(struct file *file, struct address_space *mapping, return err; } -static ssize_t fat_direct_IO(int rw, struct kiocb *iocb, - struct iov_iter *iter, +static ssize_t fat_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; @@ -256,7 +247,7 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb, size_t count = iov_iter_count(iter); ssize_t ret; - if (rw == WRITE) { + if (iov_iter_rw(iter) == WRITE) { /* * FIXME: blockdev_direct_IO() doesn't use ->write_begin(), * so we need to update the ->mmu_private to block boundary. @@ -275,8 +266,8 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb, * FAT need to use the DIO_LOCKING for avoiding the race * condition of fat_get_block() and ->truncate(). 
*/ - ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, fat_get_block); - if (ret < 0 && (rw & WRITE)) + ret = blockdev_direct_IO(iocb, inode, iter, offset, fat_get_block); + if (ret < 0 && iov_iter_rw(iter) == WRITE) fat_write_failed(mapping, offset + count); return ret; @@ -1280,8 +1271,7 @@ out: static int fat_read_root(struct inode *inode) { - struct super_block *sb = inode->i_sb; - struct msdos_sb_info *sbi = MSDOS_SB(sb); + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); int error; MSDOS_I(inode)->i_pos = MSDOS_ROOT_INO; diff --git a/fs/fat/misc.c b/fs/fat/misc.c index d8da2d2e30ae..c4589e981760 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -6,10 +6,6 @@ * and date_dos2unix for date==0 by Igor Zhbanov(bsg@uniyar.ac.ru) */ -#include <linux/module.h> -#include <linux/fs.h> -#include <linux/buffer_head.h> -#include <linux/time.h> #include "fat.h" /* diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index a783b0e1272a..b7e2b33aa793 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -7,8 +7,6 @@ */ #include <linux/module.h> -#include <linux/time.h> -#include <linux/buffer_head.h> #include "fat.h" /* Characters that are undesirable in an MS-DOS file name */ @@ -310,7 +308,7 @@ out: static int msdos_rmdir(struct inode *dir, struct dentry *dentry) { struct super_block *sb = dir->i_sb; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct fat_slot_info sinfo; int err; @@ -404,7 +402,7 @@ out: /***** Unlink a file */ static int msdos_unlink(struct inode *dir, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct super_block *sb = inode->i_sb; struct fat_slot_info sinfo; int err; @@ -442,8 +440,8 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name, int err, old_attrs, is_dir, update_dotdot, corrupt = 0; old_sinfo.bh = sinfo.bh = dotdot_bh = NULL; - old_inode = old_dentry->d_inode; - new_inode = new_dentry->d_inode; + old_inode = d_inode(old_dentry); + new_inode = d_inode(new_dentry); err = fat_scan(old_dir, old_name, &old_sinfo); if (err) { diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index b8b92c2f9683..7092584f424a 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -16,10 +16,8 @@ */ #include <linux/module.h> -#include <linux/jiffies.h> #include <linux/ctype.h> #include <linux/slab.h> -#include <linux/buffer_head.h> #include <linux/namei.h> #include "fat.h" @@ -35,7 +33,7 @@ static int vfat_revalidate_shortname(struct dentry *dentry) { int ret = 1; spin_lock(&dentry->d_lock); - if (dentry->d_time != dentry->d_parent->d_inode->i_version) + if (dentry->d_time != d_inode(dentry->d_parent)->i_version) ret = 0; spin_unlock(&dentry->d_lock); return ret; @@ -47,7 +45,7 @@ static int vfat_revalidate(struct dentry *dentry, unsigned int flags) return -ECHILD; /* This is not negative dentry. Always valid. */ - if (dentry->d_inode) + if (d_really_is_positive(dentry)) return 1; return vfat_revalidate_shortname(dentry); } @@ -67,7 +65,7 @@ static int vfat_revalidate_ci(struct dentry *dentry, unsigned int flags) * positive dentry isn't good idea. So it's unsupported like * rename("filename", "FILENAME") for now. 
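The fat_direct_IO() hunks above track a VFS-wide interface change: ->direct_IO lost its explicit rw argument, and implementations now ask the iterator for the direction via iov_iter_rw(). The new shape of such a method, reduced to a skeleton; myfs_get_block and myfs_write_failed are placeholders for a filesystem's own helpers, not real API:

static int myfs_get_block(struct inode *inode, sector_t iblock,
                          struct buffer_head *bh_result, int create);

static void myfs_write_failed(struct address_space *mapping, loff_t to)
{
        /* filesystem-specific cleanup for a failed DIO write (stub) */
}

static ssize_t myfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
                              loff_t offset)
{
        struct address_space *mapping = iocb->ki_filp->f_mapping;
        struct inode *inode = mapping->host;
        size_t count = iov_iter_count(iter);
        ssize_t ret;

        ret = blockdev_direct_IO(iocb, inode, iter, offset,
                                 myfs_get_block);
        if (ret < 0 && iov_iter_rw(iter) == WRITE)
                myfs_write_failed(mapping, offset + count);
        return ret;
}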
*/ - if (dentry->d_inode) + if (d_really_is_positive(dentry)) return 1; /* @@ -803,7 +801,7 @@ out: static int vfat_rmdir(struct inode *dir, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct super_block *sb = dir->i_sb; struct fat_slot_info sinfo; int err; @@ -834,7 +832,7 @@ out: static int vfat_unlink(struct inode *dir, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct super_block *sb = dir->i_sb; struct fat_slot_info sinfo; int err; @@ -917,8 +915,8 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry, struct super_block *sb = old_dir->i_sb; old_sinfo.bh = sinfo.bh = dotdot_bh = NULL; - old_inode = old_dentry->d_inode; - new_inode = new_dentry->d_inode; + old_inode = d_inode(old_dentry); + new_inode = d_inode(new_dentry); mutex_lock(&MSDOS_SB(sb)->s_lock); err = vfat_find(old_dir, &old_dentry->d_name, &old_sinfo); if (err) diff --git a/fs/fat/nfs.c b/fs/fat/nfs.c index 93e14933dcb6..eb192656fba2 100644 --- a/fs/fat/nfs.c +++ b/fs/fat/nfs.c @@ -266,7 +266,7 @@ struct inode *fat_rebuild_parent(struct super_block *sb, int parent_logstart) * Find the parent for a directory that is not currently connected to * the filesystem root. * - * On entry, the caller holds child_dir->d_inode->i_mutex. + * On entry, the caller holds d_inode(child_dir)->i_mutex. */ static struct dentry *fat_get_parent(struct dentry *child_dir) { @@ -276,7 +276,7 @@ static struct dentry *fat_get_parent(struct dentry *child_dir) struct inode *parent_inode = NULL; struct msdos_sb_info *sbi = MSDOS_SB(sb); - if (!fat_get_dotdot_entry(child_dir->d_inode, &bh, &de)) { + if (!fat_get_dotdot_entry(d_inode(child_dir), &bh, &de)) { int parent_logstart = fat_get_start(sbi, de); parent_inode = fat_dget(sb, parent_logstart); if (!parent_inode && sbi->options.nfs == FAT_NFS_NOSTALE_RO) diff --git a/fs/fhandle.c b/fs/fhandle.c index 999ff5c3cab0..d59712dfa3e7 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -195,8 +195,9 @@ static int handle_to_path(int mountdirfd, struct file_handle __user *ufh, goto out_err; } /* copy the full handle */ - if (copy_from_user(handle, ufh, - sizeof(struct file_handle) + + *handle = f_handle; + if (copy_from_user(&handle->f_handle, + &ufh->f_handle, f_handle.handle_bytes)) { retval = -EFAULT; goto out_handle; diff --git a/fs/file.c b/fs/file.c index ee738ea028fa..6c672ad329e9 100644 --- a/fs/file.c +++ b/fs/file.c @@ -147,6 +147,13 @@ static int expand_fdtable(struct files_struct *files, int nr) spin_unlock(&files->file_lock); new_fdt = alloc_fdtable(nr); + + /* make sure all __fd_install() have seen resize_in_progress + * or have finished their rcu_read_lock_sched() section. 
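The fs/fhandle.c change above closes a double-fetch window: handle_to_path() had already copied and validated the fixed-size struct file_handle header, so copying "header plus payload" from userspace a second time would let a racing thread enlarge handle_bytes between the check and the use. The fixed pattern in general form, with illustrative names:

/* Sketch: never re-read a user-supplied header after validating it. */
struct u_msg { unsigned int len; unsigned char body[]; };

static int fetch_msg(struct u_msg *dst, const struct u_msg __user *src,
                     const struct u_msg *validated_hdr)
{
        *dst = *validated_hdr;          /* reuse the checked header    */
        if (copy_from_user(dst->body, src->body, validated_hdr->len))
                return -EFAULT;         /* only the payload is fetched */
        return 0;
}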
+ */ + if (atomic_read(&files->count) > 1) + synchronize_sched(); + spin_lock(&files->file_lock); if (!new_fdt) return -ENOMEM; @@ -158,21 +165,14 @@ static int expand_fdtable(struct files_struct *files, int nr) __free_fdtable(new_fdt); return -EMFILE; } - /* - * Check again since another task may have expanded the fd table while - * we dropped the lock - */ cur_fdt = files_fdtable(files); - if (nr >= cur_fdt->max_fds) { - /* Continue as planned */ - copy_fdtable(new_fdt, cur_fdt); - rcu_assign_pointer(files->fdt, new_fdt); - if (cur_fdt != &files->fdtab) - call_rcu(&cur_fdt->rcu, free_fdtable_rcu); - } else { - /* Somebody else expanded, so undo our attempt */ - __free_fdtable(new_fdt); - } + BUG_ON(nr < cur_fdt->max_fds); + copy_fdtable(new_fdt, cur_fdt); + rcu_assign_pointer(files->fdt, new_fdt); + if (cur_fdt != &files->fdtab) + call_rcu(&cur_fdt->rcu, free_fdtable_rcu); + /* coupled with smp_rmb() in __fd_install() */ + smp_wmb(); return 1; } @@ -185,21 +185,38 @@ static int expand_fdtable(struct files_struct *files, int nr) * The files->file_lock should be held on entry, and will be held on exit. */ static int expand_files(struct files_struct *files, int nr) + __releases(files->file_lock) + __acquires(files->file_lock) { struct fdtable *fdt; + int expanded = 0; +repeat: fdt = files_fdtable(files); /* Do we need to expand? */ if (nr < fdt->max_fds) - return 0; + return expanded; /* Can we expand? */ if (nr >= sysctl_nr_open) return -EMFILE; + if (unlikely(files->resize_in_progress)) { + spin_unlock(&files->file_lock); + expanded = 1; + wait_event(files->resize_wait, !files->resize_in_progress); + spin_lock(&files->file_lock); + goto repeat; + } + /* All good, so we try */ - return expand_fdtable(files, nr); + files->resize_in_progress = true; + expanded = expand_fdtable(files, nr); + files->resize_in_progress = false; + + wake_up_all(&files->resize_wait); + return expanded; } static inline void __set_close_on_exec(int fd, struct fdtable *fdt) @@ -256,6 +273,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) atomic_set(&newf->count, 1); spin_lock_init(&newf->file_lock); + newf->resize_in_progress = false; + init_waitqueue_head(&newf->resize_wait); newf->next_fd = 0; new_fdt = &newf->fdtab; new_fdt->max_fds = NR_OPEN_DEFAULT; @@ -553,11 +572,21 @@ void __fd_install(struct files_struct *files, unsigned int fd, struct file *file) { struct fdtable *fdt; - spin_lock(&files->file_lock); - fdt = files_fdtable(files); + + might_sleep(); + rcu_read_lock_sched(); + + while (unlikely(files->resize_in_progress)) { + rcu_read_unlock_sched(); + wait_event(files->resize_wait, !files->resize_in_progress); + rcu_read_lock_sched(); + } + /* coupled with smp_wmb() in expand_fdtable() */ + smp_rmb(); + fdt = rcu_dereference_sched(files->fdt); BUG_ON(fdt->fd[fd] != NULL); rcu_assign_pointer(fdt->fd[fd], file); - spin_unlock(&files->file_lock); + rcu_read_unlock_sched(); } void fd_install(unsigned int fd, struct file *file) @@ -635,12 +664,17 @@ static struct file *__fget(unsigned int fd, fmode_t mask) struct file *file; rcu_read_lock(); +loop: file = fcheck_files(files, fd); if (file) { - /* File object ref couldn't be taken */ - if ((file->f_mode & mask) || - !atomic_long_inc_not_zero(&file->f_count)) + /* File object ref couldn't be taken. 
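Condensing the two sides of the fs/file.c hunk shows how its ordering primitives pair up: expand_fdtable() copies the old table into the new one and publishes it before clearing resize_in_progress, while __fd_install() spins out of the resize window, issues the matching smp_rmb(), and only then dereferences the table:

/* Writer (expand_fdtable), file_lock dropped around the allocation: */
if (atomic_read(&files->count) > 1)
        synchronize_sched();            /* drain rcu_read_lock_sched()
                                           sections that predate the flag */
/* ... allocation and sanity checks elided ... */
copy_fdtable(new_fdt, cur_fdt);
rcu_assign_pointer(files->fdt, new_fdt);
smp_wmb();                              /* pairs with smp_rmb() below    */

/* Reader (__fd_install), no file_lock taken any more: */
rcu_read_lock_sched();
while (unlikely(files->resize_in_progress)) {
        rcu_read_unlock_sched();
        wait_event(files->resize_wait, !files->resize_in_progress);
        rcu_read_lock_sched();
}
smp_rmb();                              /* see the fully copied table    */
fdt = rcu_dereference_sched(files->fdt);
rcu_assign_pointer(fdt->fd[fd], file);
rcu_read_unlock_sched();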
+ * dup2() atomicity guarantee is the reason + * we loop to catch the new file (or NULL pointer) + */ + if (file->f_mode & mask) file = NULL; + else if (!get_file_rcu(file)) + goto loop; } rcu_read_unlock(); diff --git a/fs/file_table.c b/fs/file_table.c index 3f85411b03ce..ad17e05ebf95 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -20,12 +20,12 @@ #include <linux/cdev.h> #include <linux/fsnotify.h> #include <linux/sysctl.h> -#include <linux/lglock.h> #include <linux/percpu_counter.h> #include <linux/percpu.h> #include <linux/hardirq.h> #include <linux/task_work.h> #include <linux/ima.h> +#include <linux/swap.h> #include <linux/atomic.h> @@ -168,10 +168,10 @@ struct file *alloc_file(struct path *path, fmode_t mode, file->f_inode = path->dentry->d_inode; file->f_mapping = path->dentry->d_inode->i_mapping; if ((mode & FMODE_READ) && - likely(fop->read || fop->aio_read || fop->read_iter)) + likely(fop->read || fop->read_iter)) mode |= FMODE_CAN_READ; if ((mode & FMODE_WRITE) && - likely(fop->write || fop->aio_write || fop->write_iter)) + likely(fop->write || fop->write_iter)) mode |= FMODE_CAN_WRITE; file->f_mode = mode; file->f_op = fop; @@ -309,19 +309,24 @@ void put_filp(struct file *file) } } -void __init files_init(unsigned long mempages) +void __init files_init(void) { - unsigned long n; - filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + percpu_counter_init(&nr_files, 0, GFP_KERNEL); +} - /* - * One file with associated inode and dcache is very roughly 1K. - * Per default don't use more than 10% of our memory for files. - */ +/* + * One file with associated inode and dcache is very roughly 1K. Per default + * do not use more than 10% of our memory for files. + */ +void __init files_maxfiles_init(void) +{ + unsigned long n; + unsigned long memreserve = (totalram_pages - nr_free_pages()) * 3/2; + + memreserve = min(memreserve, totalram_pages - 1); + n = ((totalram_pages - memreserve) * (PAGE_SIZE / 1024)) / 10; - n = (mempages * (PAGE_SIZE / 1024)) / 10; files_stat.max_files = max_t(unsigned long, n, NR_FILE); - percpu_counter_init(&nr_files, 0, GFP_KERNEL); } diff --git a/fs/freevxfs/vxfs_extern.h b/fs/freevxfs/vxfs_extern.h index 881aa3d217f0..e3dcb4467d92 100644 --- a/fs/freevxfs/vxfs_extern.h +++ b/fs/freevxfs/vxfs_extern.h @@ -50,9 +50,6 @@ extern daddr_t vxfs_bmap1(struct inode *, long); /* vxfs_fshead.c */ extern int vxfs_read_fshead(struct super_block *); -/* vxfs_immed.c */ -extern const struct inode_operations vxfs_immed_symlink_iops; - /* vxfs_inode.c */ extern const struct address_space_operations vxfs_immed_aops; extern struct kmem_cache *vxfs_inode_cachep; diff --git a/fs/freevxfs/vxfs_immed.c b/fs/freevxfs/vxfs_immed.c index c36aeaf92e41..cb84f0fcc72a 100644 --- a/fs/freevxfs/vxfs_immed.c +++ b/fs/freevxfs/vxfs_immed.c @@ -32,29 +32,15 @@ */ #include <linux/fs.h> #include <linux/pagemap.h> -#include <linux/namei.h> #include "vxfs.h" #include "vxfs_extern.h" #include "vxfs_inode.h" -static void * vxfs_immed_follow_link(struct dentry *, struct nameidata *); - static int vxfs_immed_readpage(struct file *, struct page *); /* - * Inode operations for immed symlinks. - * - * Unliked all other operations we do not go through the pagecache, - * but do all work directly on the inode. - */ -const struct inode_operations vxfs_immed_symlink_iops = { - .readlink = generic_readlink, - .follow_link = vxfs_immed_follow_link, -}; - -/* * Address space operations for immed files and directories. 
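The __fget() hunk above turns a one-shot refcount grab into a retry loop: observing f_count at zero no longer means "return NULL", because dup2() may have already installed a new file in the same slot, and the dup2() atomicity guarantee requires the lookup to see it. The resulting lookup-validate-retry idiom in isolation (get_file_rcu() is, per the hunk, an inc-not-zero on f_count):

static struct file *lookup_fd_rcu(struct files_struct *files,
                                  unsigned int fd, fmode_t mask)
{
        struct file *file;

        rcu_read_lock();
loop:
        file = fcheck_files(files, fd);
        if (file) {
                if (file->f_mode & mask)
                        file = NULL;    /* excluded by open mode        */
                else if (!get_file_rcu(file))
                        goto loop;      /* died under us; the slot may
                                           hold a new file by now       */
        }
        rcu_read_unlock();
        return file;
}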
*/ const struct address_space_operations vxfs_immed_aops = { @@ -62,26 +48,6 @@ const struct address_space_operations vxfs_immed_aops = { }; /** - * vxfs_immed_follow_link - follow immed symlink - * @dp: dentry for the link - * @np: pathname lookup data for the current path walk - * - * Description: - * vxfs_immed_follow_link restarts the pathname lookup with - * the data obtained from @dp. - * - * Returns: - * Zero on success, else a negative error code. - */ -static void * -vxfs_immed_follow_link(struct dentry *dp, struct nameidata *np) -{ - struct vxfs_inode_info *vip = VXFS_INO(dp->d_inode); - nd_set_link(np, vip->vii_immed.vi_immed); - return NULL; -} - -/** * vxfs_immed_readpage - read part of an immed inode into pagecache * @file: file context (unused) * @page: page frame to fill in. diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c index 363e3ae25f6b..ef73ed674a27 100644 --- a/fs/freevxfs/vxfs_inode.c +++ b/fs/freevxfs/vxfs_inode.c @@ -35,6 +35,7 @@ #include <linux/pagemap.h> #include <linux/kernel.h> #include <linux/slab.h> +#include <linux/namei.h> #include "vxfs.h" #include "vxfs_inode.h" @@ -327,8 +328,10 @@ vxfs_iget(struct super_block *sbp, ino_t ino) ip->i_op = &page_symlink_inode_operations; ip->i_mapping->a_ops = &vxfs_aops; } else { - ip->i_op = &vxfs_immed_symlink_iops; - vip->vii_immed.vi_immed[ip->i_size] = '\0'; + ip->i_op = &simple_symlink_inode_operations; + ip->i_link = vip->vii_immed.vi_immed; + nd_terminate_link(ip->i_link, ip->i_size, + sizeof(vip->vii_immed.vi_immed) - 1); } } else init_special_inode(ip, ip->i_mode, old_decode_dev(vip->vii_rdev)); diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c index 99c7f0a37af4..484b32d3234a 100644 --- a/fs/freevxfs/vxfs_lookup.c +++ b/fs/freevxfs/vxfs_lookup.c @@ -61,13 +61,6 @@ const struct file_operations vxfs_dir_operations = { .iterate = vxfs_readdir, }; - -static inline u_long -dir_pages(struct inode *inode) -{ - return (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; -} - static inline u_long dir_blocks(struct inode *ip) { diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 32a8bbd7a9ad..518c6294bf6c 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -27,6 +27,7 @@ #include <linux/backing-dev.h> #include <linux/tracepoint.h> #include <linux/device.h> +#include <linux/memcontrol.h> #include "internal.h" /* @@ -34,6 +35,10 @@ */ #define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10)) +struct wb_completion { + atomic_t cnt; +}; + /* * Passed into wb_writeback(), essentially a subset of writeback_control */ @@ -47,13 +52,29 @@ struct wb_writeback_work { unsigned int range_cyclic:1; unsigned int for_background:1; unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ + unsigned int auto_free:1; /* free on completion */ + unsigned int single_wait:1; + unsigned int single_done:1; enum wb_reason reason; /* why was writeback initiated? */ struct list_head list; /* pending work list */ - struct completion *done; /* set if the caller waits */ + struct wb_completion *done; /* set if the caller waits */ }; /* + * If one wants to wait for one or more wb_writeback_works, each work's + * ->done should be set to a wb_completion defined using the following + * macro. Once all work items are issued with wb_queue_work(), the caller + * can wait for the completion of all using wb_wait_for_completion(). Work + * items which are waited upon aren't freed automatically on completion. 
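The freevxfs hunks retire the filesystem-private follow_link: when a symlink's target already lives in the in-core inode (the "immed" inodes here), the generic VFS path can follow it directly once inode->i_link points at the string. The pattern, for a hypothetical filesystem that embeds the target in its inode info:

/* Sketch: inline symlink wired up through i_link (hypothetical fs). */
struct myfs_inode_info {
        char target[64];                /* inline symlink body          */
        struct inode vfs_inode;
};

static void myfs_init_symlink(struct inode *inode,
                              struct myfs_inode_info *mi)
{
        inode->i_op = &simple_symlink_inode_operations;
        inode->i_link = mi->target;     /* VFS reads this directly      */
        nd_terminate_link(inode->i_link, inode->i_size,
                          sizeof(mi->target) - 1);  /* force NUL end    */
}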
+ */ +#define DEFINE_WB_COMPLETION_ONSTACK(cmpl) \ + struct wb_completion cmpl = { \ + .cnt = ATOMIC_INIT(1), \ + } + + +/* * If an inode is constantly having its pages dirtied, but then the * updates stop dirtytime_expire_interval seconds in the past, it's * possible for the worst case time between when an inode has its @@ -65,35 +86,6 @@ struct wb_writeback_work { */ unsigned int dirtytime_expire_interval = 12 * 60 * 60; -/** - * writeback_in_progress - determine whether there is writeback in progress - * @bdi: the device's backing_dev_info structure. - * - * Determine whether there is writeback waiting to be handled against a - * backing device. - */ -int writeback_in_progress(struct backing_dev_info *bdi) -{ - return test_bit(BDI_writeback_running, &bdi->state); -} -EXPORT_SYMBOL(writeback_in_progress); - -struct backing_dev_info *inode_to_bdi(struct inode *inode) -{ - struct super_block *sb; - - if (!inode) - return &noop_backing_dev_info; - - sb = inode->i_sb; -#ifdef CONFIG_BLOCK - if (sb_is_blkdev_sb(sb)) - return blk_get_backing_dev_info(I_BDEV(inode)); -#endif - return sb->s_bdi; -} -EXPORT_SYMBOL_GPL(inode_to_bdi); - static inline struct inode *wb_inode(struct list_head *head) { return list_entry(head, struct inode, i_wb_list); @@ -109,45 +101,831 @@ static inline struct inode *wb_inode(struct list_head *head) EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage); -static void bdi_wakeup_thread(struct backing_dev_info *bdi) +static bool wb_io_lists_populated(struct bdi_writeback *wb) +{ + if (wb_has_dirty_io(wb)) { + return false; + } else { + set_bit(WB_has_dirty_io, &wb->state); + WARN_ON_ONCE(!wb->avg_write_bandwidth); + atomic_long_add(wb->avg_write_bandwidth, + &wb->bdi->tot_write_bandwidth); + return true; + } +} + +static void wb_io_lists_depopulated(struct bdi_writeback *wb) { - spin_lock_bh(&bdi->wb_lock); - if (test_bit(BDI_registered, &bdi->state)) - mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); - spin_unlock_bh(&bdi->wb_lock); + if (wb_has_dirty_io(wb) && list_empty(&wb->b_dirty) && + list_empty(&wb->b_io) && list_empty(&wb->b_more_io)) { + clear_bit(WB_has_dirty_io, &wb->state); + WARN_ON_ONCE(atomic_long_sub_return(wb->avg_write_bandwidth, + &wb->bdi->tot_write_bandwidth) < 0); + } } -static void bdi_queue_work(struct backing_dev_info *bdi, - struct wb_writeback_work *work) +/** + * inode_wb_list_move_locked - move an inode onto a bdi_writeback IO list + * @inode: inode to be moved + * @wb: target bdi_writeback + * @head: one of @wb->b_{dirty|io|more_io} + * + * Move @inode->i_wb_list to @list of @wb and set %WB_has_dirty_io. + * Returns %true if @inode is the first occupant of the !dirty_time IO + * lists; otherwise, %false. + */ +static bool inode_wb_list_move_locked(struct inode *inode, + struct bdi_writeback *wb, + struct list_head *head) { - trace_writeback_queue(bdi, work); + assert_spin_locked(&wb->list_lock); + + list_move(&inode->i_wb_list, head); - spin_lock_bh(&bdi->wb_lock); - if (!test_bit(BDI_registered, &bdi->state)) { - if (work->done) - complete(work->done); + /* dirty_time doesn't count as dirty_io until expiration */ + if (head != &wb->b_dirty_time) + return wb_io_lists_populated(wb); + + wb_io_lists_depopulated(wb); + return false; +} + +/** + * inode_wb_list_del_locked - remove an inode from its bdi_writeback IO list + * @inode: inode to be removed + * @wb: bdi_writeback @inode is being removed from + * + * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and + * clear %WB_has_dirty_io if all are empty afterwards. 
+ */ +static void inode_wb_list_del_locked(struct inode *inode, + struct bdi_writeback *wb) +{ + assert_spin_locked(&wb->list_lock); + + list_del_init(&inode->i_wb_list); + wb_io_lists_depopulated(wb); +} + +static void wb_wakeup(struct bdi_writeback *wb) +{ + spin_lock_bh(&wb->work_lock); + if (test_bit(WB_registered, &wb->state)) + mod_delayed_work(bdi_wq, &wb->dwork, 0); + spin_unlock_bh(&wb->work_lock); +} + +static void wb_queue_work(struct bdi_writeback *wb, + struct wb_writeback_work *work) +{ + trace_writeback_queue(wb->bdi, work); + + spin_lock_bh(&wb->work_lock); + if (!test_bit(WB_registered, &wb->state)) { + if (work->single_wait) + work->single_done = 1; goto out_unlock; } - list_add_tail(&work->list, &bdi->work_list); - mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); + if (work->done) + atomic_inc(&work->done->cnt); + list_add_tail(&work->list, &wb->work_list); + mod_delayed_work(bdi_wq, &wb->dwork, 0); out_unlock: - spin_unlock_bh(&bdi->wb_lock); + spin_unlock_bh(&wb->work_lock); +} + +/** + * wb_wait_for_completion - wait for completion of bdi_writeback_works + * @bdi: bdi work items were issued to + * @done: target wb_completion + * + * Wait for one or more work items issued to @bdi with their ->done field + * set to @done, which should have been defined with + * DEFINE_WB_COMPLETION_ONSTACK(). This function returns after all such + * work items are completed. Work items which are waited upon aren't freed + * automatically on completion. + */ +static void wb_wait_for_completion(struct backing_dev_info *bdi, + struct wb_completion *done) +{ + atomic_dec(&done->cnt); /* put down the initial count */ + wait_event(bdi->wb_waitq, !atomic_read(&done->cnt)); +} + +#ifdef CONFIG_CGROUP_WRITEBACK + +/* parameters for foreign inode detection, see wb_detach_inode() */ +#define WB_FRN_TIME_SHIFT 13 /* 1s = 2^13, upto 8 secs w/ 16bit */ +#define WB_FRN_TIME_AVG_SHIFT 3 /* avg = avg * 7/8 + new * 1/8 */ +#define WB_FRN_TIME_CUT_DIV 2 /* ignore rounds < avg / 2 */ +#define WB_FRN_TIME_PERIOD (2 * (1 << WB_FRN_TIME_SHIFT)) /* 2s */ + +#define WB_FRN_HIST_SLOTS 16 /* inode->i_wb_frn_history is 16bit */ +#define WB_FRN_HIST_UNIT (WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS) + /* each slot's duration is 2s / 16 */ +#define WB_FRN_HIST_THR_SLOTS (WB_FRN_HIST_SLOTS / 2) + /* if foreign slots >= 8, switch */ +#define WB_FRN_HIST_MAX_SLOTS (WB_FRN_HIST_THR_SLOTS / 2 + 1) + /* one round can affect upto 5 slots */ + +void __inode_attach_wb(struct inode *inode, struct page *page) +{ + struct backing_dev_info *bdi = inode_to_bdi(inode); + struct bdi_writeback *wb = NULL; + + if (inode_cgwb_enabled(inode)) { + struct cgroup_subsys_state *memcg_css; + + if (page) { + memcg_css = mem_cgroup_css_from_page(page); + wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); + } else { + /* must pin memcg_css, see wb_get_create() */ + memcg_css = task_get_css(current, memory_cgrp_id); + wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); + css_put(memcg_css); + } + } + + if (!wb) + wb = &bdi->wb; + + /* + * There may be multiple instances of this function racing to + * update the same inode. Use cmpxchg() to tell the winner. + */ + if (unlikely(cmpxchg(&inode->i_wb, NULL, wb))) + wb_put(wb); +} + +/** + * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it + * @inode: inode of interest with i_lock held + * + * Returns @inode's wb with its list_lock held. @inode->i_lock must be + * held on entry and is released on return. 
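wb_completion replaces a plain struct completion so that one waiter can cover an arbitrary number of queued works: the counter starts at 1 as the issuer's bias, wb_queue_work() increments it for every item carrying this ->done, and wb_wait_for_completion() drops the bias and sleeps on bdi->wb_waitq until the count reaches zero (the completing side, which does the matching decrement and wakeup, is not shown in this hunk). Intended usage per the comments above, as a sketch:

DEFINE_WB_COMPLETION_ONSTACK(done);
struct wb_writeback_work work = {
        .sync_mode = WB_SYNC_ALL,
        .nr_pages  = LONG_MAX,
        .done      = &done,             /* every queued copy bumps done.cnt */
};

wb_queue_work(wb, &work);               /* may be issued to several wbs     */
wb_wait_for_completion(bdi, &done);     /* returns once done.cnt == 0       */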
The returned wb is guaranteed + * to stay @inode's associated wb until its list_lock is released. + */ +static struct bdi_writeback * +locked_inode_to_wb_and_lock_list(struct inode *inode) + __releases(&inode->i_lock) + __acquires(&wb->list_lock) +{ + while (true) { + struct bdi_writeback *wb = inode_to_wb(inode); + + /* + * inode_to_wb() association is protected by both + * @inode->i_lock and @wb->list_lock but list_lock nests + * outside i_lock. Drop i_lock and verify that the + * association hasn't changed after acquiring list_lock. + */ + wb_get(wb); + spin_unlock(&inode->i_lock); + spin_lock(&wb->list_lock); + wb_put(wb); /* not gonna deref it anymore */ + + /* i_wb may have changed inbetween, can't use inode_to_wb() */ + if (likely(wb == inode->i_wb)) + return wb; /* @inode already has ref */ + + spin_unlock(&wb->list_lock); + cpu_relax(); + spin_lock(&inode->i_lock); + } +} + +/** + * inode_to_wb_and_lock_list - determine an inode's wb and lock it + * @inode: inode of interest + * + * Same as locked_inode_to_wb_and_lock_list() but @inode->i_lock isn't held + * on entry. + */ +static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode) + __acquires(&wb->list_lock) +{ + spin_lock(&inode->i_lock); + return locked_inode_to_wb_and_lock_list(inode); +} + +struct inode_switch_wbs_context { + struct inode *inode; + struct bdi_writeback *new_wb; + + struct rcu_head rcu_head; + struct work_struct work; +}; + +static void inode_switch_wbs_work_fn(struct work_struct *work) +{ + struct inode_switch_wbs_context *isw = + container_of(work, struct inode_switch_wbs_context, work); + struct inode *inode = isw->inode; + struct address_space *mapping = inode->i_mapping; + struct bdi_writeback *old_wb = inode->i_wb; + struct bdi_writeback *new_wb = isw->new_wb; + struct radix_tree_iter iter; + bool switched = false; + void **slot; + + /* + * By the time control reaches here, RCU grace period has passed + * since I_WB_SWITCH assertion and all wb stat update transactions + * between unlocked_inode_to_wb_begin/end() are guaranteed to be + * synchronizing against mapping->tree_lock. + * + * Grabbing old_wb->list_lock, inode->i_lock and mapping->tree_lock + * gives us exclusion against all wb related operations on @inode + * including IO list manipulations and stat updates. + */ + if (old_wb < new_wb) { + spin_lock(&old_wb->list_lock); + spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING); + } else { + spin_lock(&new_wb->list_lock); + spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING); + } + spin_lock(&inode->i_lock); + spin_lock_irq(&mapping->tree_lock); + + /* + * Once I_FREEING is visible under i_lock, the eviction path owns + * the inode and we shouldn't modify ->i_wb_list. + */ + if (unlikely(inode->i_state & I_FREEING)) + goto skip_switch; + + /* + * Count and transfer stats. Note that PAGECACHE_TAG_DIRTY points + * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to + * pages actually under underwriteback. 
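The wb-switching work function above needs the list_locks of both the old and the new wb; taking them in a globally consistent order, here by comparing the two pointers, prevents an ABBA deadlock against a switcher running in the opposite direction. The idiom in isolation (assumes a != b, as the switch path guarantees old_wb != new_wb):

/* Lock two same-class locks in address order to avoid ABBA. */
static void lock_pair(spinlock_t *a, spinlock_t *b)
{
        if (a < b) {
                spin_lock(a);
                spin_lock_nested(b, SINGLE_DEPTH_NESTING);
        } else {
                spin_lock(b);
                spin_lock_nested(a, SINGLE_DEPTH_NESTING);
        }
}

spin_lock_nested() tells lockdep that the second acquisition is a deliberate nesting within the same lock class.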
+ */ + radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0, + PAGECACHE_TAG_DIRTY) { + struct page *page = radix_tree_deref_slot_protected(slot, + &mapping->tree_lock); + if (likely(page) && PageDirty(page)) { + __dec_wb_stat(old_wb, WB_RECLAIMABLE); + __inc_wb_stat(new_wb, WB_RECLAIMABLE); + } + } + + radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0, + PAGECACHE_TAG_WRITEBACK) { + struct page *page = radix_tree_deref_slot_protected(slot, + &mapping->tree_lock); + if (likely(page)) { + WARN_ON_ONCE(!PageWriteback(page)); + __dec_wb_stat(old_wb, WB_WRITEBACK); + __inc_wb_stat(new_wb, WB_WRITEBACK); + } + } + + wb_get(new_wb); + + /* + * Transfer to @new_wb's IO list if necessary. The specific list + * @inode was on is ignored and the inode is put on ->b_dirty which + * is always correct including from ->b_dirty_time. The transfer + * preserves @inode->dirtied_when ordering. + */ + if (!list_empty(&inode->i_wb_list)) { + struct inode *pos; + + inode_wb_list_del_locked(inode, old_wb); + inode->i_wb = new_wb; + list_for_each_entry(pos, &new_wb->b_dirty, i_wb_list) + if (time_after_eq(inode->dirtied_when, + pos->dirtied_when)) + break; + inode_wb_list_move_locked(inode, new_wb, pos->i_wb_list.prev); + } else { + inode->i_wb = new_wb; + } + + /* ->i_wb_frn updates may race wbc_detach_inode() but doesn't matter */ + inode->i_wb_frn_winner = 0; + inode->i_wb_frn_avg_time = 0; + inode->i_wb_frn_history = 0; + switched = true; +skip_switch: + /* + * Paired with load_acquire in unlocked_inode_to_wb_begin() and + * ensures that the new wb is visible if they see !I_WB_SWITCH. + */ + smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH); + + spin_unlock_irq(&mapping->tree_lock); + spin_unlock(&inode->i_lock); + spin_unlock(&new_wb->list_lock); + spin_unlock(&old_wb->list_lock); + + if (switched) { + wb_wakeup(new_wb); + wb_put(old_wb); + } + wb_put(new_wb); + + iput(inode); + kfree(isw); +} + +static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head) +{ + struct inode_switch_wbs_context *isw = container_of(rcu_head, + struct inode_switch_wbs_context, rcu_head); + + /* needs to grab bh-unsafe locks, bounce to work item */ + INIT_WORK(&isw->work, inode_switch_wbs_work_fn); + schedule_work(&isw->work); +} + +/** + * inode_switch_wbs - change the wb association of an inode + * @inode: target inode + * @new_wb_id: ID of the new wb + * + * Switch @inode's wb association to the wb identified by @new_wb_id. The + * switching is performed asynchronously and may fail silently. 
+ */ +static void inode_switch_wbs(struct inode *inode, int new_wb_id) +{ + struct backing_dev_info *bdi = inode_to_bdi(inode); + struct cgroup_subsys_state *memcg_css; + struct inode_switch_wbs_context *isw; + + /* noop if seems to be already in progress */ + if (inode->i_state & I_WB_SWITCH) + return; + + isw = kzalloc(sizeof(*isw), GFP_ATOMIC); + if (!isw) + return; + + /* find and pin the new wb */ + rcu_read_lock(); + memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys); + if (memcg_css) + isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); + rcu_read_unlock(); + if (!isw->new_wb) + goto out_free; + + /* while holding I_WB_SWITCH, no one else can update the association */ + spin_lock(&inode->i_lock); + if (inode->i_state & (I_WB_SWITCH | I_FREEING) || + inode_to_wb(inode) == isw->new_wb) { + spin_unlock(&inode->i_lock); + goto out_free; + } + inode->i_state |= I_WB_SWITCH; + spin_unlock(&inode->i_lock); + + ihold(inode); + isw->inode = inode; + + /* + * In addition to synchronizing among switchers, I_WB_SWITCH tells + * the RCU protected stat update paths to grab the mapping's + * tree_lock so that stat transfer can synchronize against them. + * Let's continue after I_WB_SWITCH is guaranteed to be visible. + */ + call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn); + return; + +out_free: + if (isw->new_wb) + wb_put(isw->new_wb); + kfree(isw); +} + +/** + * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it + * @wbc: writeback_control of interest + * @inode: target inode + * + * @inode is locked and about to be written back under the control of @wbc. + * Record @inode's writeback context into @wbc and unlock the i_lock. On + * writeback completion, wbc_detach_inode() should be called. This is used + * to track the cgroup writeback context. + */ +void wbc_attach_and_unlock_inode(struct writeback_control *wbc, + struct inode *inode) +{ + if (!inode_cgwb_enabled(inode)) { + spin_unlock(&inode->i_lock); + return; + } + + wbc->wb = inode_to_wb(inode); + wbc->inode = inode; + + wbc->wb_id = wbc->wb->memcg_css->id; + wbc->wb_lcand_id = inode->i_wb_frn_winner; + wbc->wb_tcand_id = 0; + wbc->wb_bytes = 0; + wbc->wb_lcand_bytes = 0; + wbc->wb_tcand_bytes = 0; + + wb_get(wbc->wb); + spin_unlock(&inode->i_lock); + + /* + * A dying wb indicates that the memcg-blkcg mapping has changed + * and a new wb is already serving the memcg. Switch immediately. + */ + if (unlikely(wb_dying(wbc->wb))) + inode_switch_wbs(inode, wbc->wb_id); +} + +/** + * wbc_detach_inode - disassociate wbc from inode and perform foreign detection + * @wbc: writeback_control of the just finished writeback + * + * To be called after a writeback attempt of an inode finishes and undoes + * wbc_attach_and_unlock_inode(). Can be called under any context. + * + * As concurrent write sharing of an inode is expected to be very rare and + * memcg only tracks page ownership on first-use basis severely confining + * the usefulness of such sharing, cgroup writeback tracks ownership + * per-inode. While the support for concurrent write sharing of an inode + * is deemed unnecessary, an inode being written to by different cgroups at + * different points in time is a lot more common, and, more importantly, + * charging only by first-use can too readily lead to grossly incorrect + * behaviors (single foreign page can lead to gigabytes of writeback to be + * incorrectly attributed). + * + * To resolve this issue, cgroup writeback detects the majority dirtier of + * an inode and transfers the ownership to it. 
To avoid unnnecessary + * oscillation, the detection mechanism keeps track of history and gives + * out the switch verdict only if the foreign usage pattern is stable over + * a certain amount of time and/or writeback attempts. + * + * On each writeback attempt, @wbc tries to detect the majority writer + * using Boyer-Moore majority vote algorithm. In addition to the byte + * count from the majority voting, it also counts the bytes written for the + * current wb and the last round's winner wb (max of last round's current + * wb, the winner from two rounds ago, and the last round's majority + * candidate). Keeping track of the historical winner helps the algorithm + * to semi-reliably detect the most active writer even when it's not the + * absolute majority. + * + * Once the winner of the round is determined, whether the winner is + * foreign or not and how much IO time the round consumed is recorded in + * inode->i_wb_frn_history. If the amount of recorded foreign IO time is + * over a certain threshold, the switch verdict is given. + */ +void wbc_detach_inode(struct writeback_control *wbc) +{ + struct bdi_writeback *wb = wbc->wb; + struct inode *inode = wbc->inode; + unsigned long avg_time, max_bytes, max_time; + u16 history; + int max_id; + + if (!wb) + return; + + history = inode->i_wb_frn_history; + avg_time = inode->i_wb_frn_avg_time; + + /* pick the winner of this round */ + if (wbc->wb_bytes >= wbc->wb_lcand_bytes && + wbc->wb_bytes >= wbc->wb_tcand_bytes) { + max_id = wbc->wb_id; + max_bytes = wbc->wb_bytes; + } else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) { + max_id = wbc->wb_lcand_id; + max_bytes = wbc->wb_lcand_bytes; + } else { + max_id = wbc->wb_tcand_id; + max_bytes = wbc->wb_tcand_bytes; + } + + /* + * Calculate the amount of IO time the winner consumed and fold it + * into the running average kept per inode. If the consumed IO + * time is lower than avag / WB_FRN_TIME_CUT_DIV, ignore it for + * deciding whether to switch or not. This is to prevent one-off + * small dirtiers from skewing the verdict. + */ + max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT, + wb->avg_write_bandwidth); + if (avg_time) + avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) - + (avg_time >> WB_FRN_TIME_AVG_SHIFT); + else + avg_time = max_time; /* immediate catch up on first run */ + + if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) { + int slots; + + /* + * The switch verdict is reached if foreign wb's consume + * more than a certain proportion of IO time in a + * WB_FRN_TIME_PERIOD. This is loosely tracked by 16 slot + * history mask where each bit represents one sixteenth of + * the period. Determine the number of slots to shift into + * history from @max_time. + */ + slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT), + (unsigned long)WB_FRN_HIST_MAX_SLOTS); + history <<= slots; + if (wbc->wb_id != max_id) + history |= (1U << slots) - 1; + + /* + * Switch if the current wb isn't the consistent winner. + * If there are multiple closely competing dirtiers, the + * inode may switch across them repeatedly over time, which + * is okay. The main goal is avoiding keeping an inode on + * the wrong wb for an extended period of time. + */ + if (hweight32(history) > WB_FRN_HIST_THR_SLOTS) + inode_switch_wbs(inode, max_id); + } + + /* + * Multiple instances of this function may race to update the + * following fields but we don't mind occassional inaccuracies. 
+ */ + inode->i_wb_frn_winner = max_id; + inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX); + inode->i_wb_frn_history = history; + + wb_put(wbc->wb); + wbc->wb = NULL; +} + +/** + * wbc_account_io - account IO issued during writeback + * @wbc: writeback_control of the writeback in progress + * @page: page being written out + * @bytes: number of bytes being written out + * + * @bytes from @page are about to written out during the writeback + * controlled by @wbc. Keep the book for foreign inode detection. See + * wbc_detach_inode(). + */ +void wbc_account_io(struct writeback_control *wbc, struct page *page, + size_t bytes) +{ + int id; + + /* + * pageout() path doesn't attach @wbc to the inode being written + * out. This is intentional as we don't want the function to block + * behind a slow cgroup. Ultimately, we want pageout() to kick off + * regular writeback instead of writing things out itself. + */ + if (!wbc->wb) + return; + + rcu_read_lock(); + id = mem_cgroup_css_from_page(page)->id; + rcu_read_unlock(); + + if (id == wbc->wb_id) { + wbc->wb_bytes += bytes; + return; + } + + if (id == wbc->wb_lcand_id) + wbc->wb_lcand_bytes += bytes; + + /* Boyer-Moore majority vote algorithm */ + if (!wbc->wb_tcand_bytes) + wbc->wb_tcand_id = id; + if (id == wbc->wb_tcand_id) + wbc->wb_tcand_bytes += bytes; + else + wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes); } +EXPORT_SYMBOL_GPL(wbc_account_io); -static void -__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, - bool range_cyclic, enum wb_reason reason) +/** + * inode_congested - test whether an inode is congested + * @inode: inode to test for congestion + * @cong_bits: mask of WB_[a]sync_congested bits to test + * + * Tests whether @inode is congested. @cong_bits is the mask of congestion + * bits to test and the return value is the mask of set bits. + * + * If cgroup writeback is enabled for @inode, the congestion state is + * determined by whether the cgwb (cgroup bdi_writeback) for the blkcg + * associated with @inode is congested; otherwise, the root wb's congestion + * state is used. + */ +int inode_congested(struct inode *inode, int cong_bits) +{ + /* + * Once set, ->i_wb never becomes NULL while the inode is alive. + * Start transaction iff ->i_wb is visible. + */ + if (inode && inode_to_wb_is_valid(inode)) { + struct bdi_writeback *wb; + bool locked, congested; + + wb = unlocked_inode_to_wb_begin(inode, &locked); + congested = wb_congested(wb, cong_bits); + unlocked_inode_to_wb_end(inode, locked); + return congested; + } + + return wb_congested(&inode_to_bdi(inode)->wb, cong_bits); +} +EXPORT_SYMBOL_GPL(inode_congested); + +/** + * wb_wait_for_single_work - wait for completion of a single bdi_writeback_work + * @bdi: bdi the work item was issued to + * @work: work item to wait for + * + * Wait for the completion of @work which was issued to one of @bdi's + * bdi_writeback's. The caller must have set @work->single_wait before + * issuing it. This wait operates independently fo + * wb_wait_for_completion() and also disables automatic freeing of @work. + */ +static void wb_wait_for_single_work(struct backing_dev_info *bdi, + struct wb_writeback_work *work) +{ + if (WARN_ON_ONCE(!work->single_wait)) + return; + + wait_event(bdi->wb_waitq, work->single_done); + + /* + * Paired with smp_wmb() in wb_do_writeback() and ensures that all + * modifications to @work prior to assertion of ->single_done is + * visible to the caller once this function returns. 
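wbc_account_io() above is a byte-weighted Boyer-Moore majority vote: one candidate and one counter suffice to find a strict majority in a single pass, which is why the per-inode bookkeeping stays at a few words. The bare algorithm, stripped of the writeback bookkeeping, as a standalone sketch:

/* Byte-weighted Boyer-Moore majority vote, the same scheme
 * wbc_account_io() uses for its "third candidate" (tcand) tracking. */
struct vote { int id; unsigned long bytes; };

static int majority_candidate(const struct vote *v, int n)
{
        int cand = -1;
        unsigned long cnt = 0;

        for (int i = 0; i < n; i++) {
                if (cnt == 0)
                        cand = v[i].id;         /* adopt a new candidate */
                if (v[i].id == cand)
                        cnt += v[i].bytes;      /* weight in favour      */
                else
                        cnt -= v[i].bytes < cnt ? v[i].bytes : cnt;
        }
        /* cand is the true majority only if some id really holds more
         * than half of the total weight; otherwise it is just a guess,
         * which is why the caller keeps a history before switching.    */
        return cand;
}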
+ */ + smp_rmb(); +} + +/** + * wb_split_bdi_pages - split nr_pages to write according to bandwidth + * @wb: target bdi_writeback to split @nr_pages to + * @nr_pages: number of pages to write for the whole bdi + * + * Split @wb's portion of @nr_pages according to @wb's write bandwidth in + * relation to the total write bandwidth of all wb's w/ dirty inodes on + * @wb->bdi. + */ +static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages) +{ + unsigned long this_bw = wb->avg_write_bandwidth; + unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth); + + if (nr_pages == LONG_MAX) + return LONG_MAX; + + /* + * This may be called on clean wb's and proportional distribution + * may not make sense, just use the original @nr_pages in those + * cases. In general, we wanna err on the side of writing more. + */ + if (!tot_bw || this_bw >= tot_bw) + return nr_pages; + else + return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw); +} + +/** + * wb_clone_and_queue_work - clone a wb_writeback_work and issue it to a wb + * @wb: target bdi_writeback + * @base_work: source wb_writeback_work + * + * Try to make a clone of @base_work and issue it to @wb. If cloning + * succeeds, %true is returned; otherwise, @base_work is issued directly + * and %false is returned. In the latter case, the caller is required to + * wait for @base_work's completion using wb_wait_for_single_work(). + * + * A clone is auto-freed on completion. @base_work never is. + */ +static bool wb_clone_and_queue_work(struct bdi_writeback *wb, + struct wb_writeback_work *base_work) { struct wb_writeback_work *work; + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (work) { + *work = *base_work; + work->auto_free = 1; + work->single_wait = 0; + } else { + work = base_work; + work->auto_free = 0; + work->single_wait = 1; + } + work->single_done = 0; + wb_queue_work(wb, work); + return work != base_work; +} + +/** + * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi + * @bdi: target backing_dev_info + * @base_work: wb_writeback_work to issue + * @skip_if_busy: skip wb's which already have writeback in progress + * + * Split and issue @base_work to all wb's (bdi_writeback's) of @bdi which + * have dirty inodes. If @base_work->nr_page isn't %LONG_MAX, it's + * distributed to the busy wbs according to each wb's proportion in the + * total active write bandwidth of @bdi. 
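wb_split_bdi_pages() above apportions work by measured bandwidth. With made-up figures: if a bdi writes 120 MB/s in total and this wb contributes 90 MB/s of that, a 1200-page request splits as follows:

/* Worked example with illustrative bandwidth numbers. */
u64 nr_pages = 1200;
u64 this_bw  = 90u << 20;       /* this wb:  ~90 MB/s */
u64 tot_bw   = 120u << 20;      /* all wbs:  120 MB/s */

/* DIV_ROUND_UP_ULL(1200 * 90M, 120M) == 900 pages for this wb;
 * a sibling wb at 30 MB/s would get the remaining 300.          */
u64 share = DIV_ROUND_UP_ULL(nr_pages * this_bw, tot_bw);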
+ */ +static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, + struct wb_writeback_work *base_work, + bool skip_if_busy) +{ + long nr_pages = base_work->nr_pages; + int next_blkcg_id = 0; + struct bdi_writeback *wb; + struct wb_iter iter; + + might_sleep(); + + if (!bdi_has_dirty_io(bdi)) + return; +restart: + rcu_read_lock(); + bdi_for_each_wb(wb, bdi, &iter, next_blkcg_id) { + if (!wb_has_dirty_io(wb) || + (skip_if_busy && writeback_in_progress(wb))) + continue; + + base_work->nr_pages = wb_split_bdi_pages(wb, nr_pages); + if (!wb_clone_and_queue_work(wb, base_work)) { + next_blkcg_id = wb->blkcg_css->id + 1; + rcu_read_unlock(); + wb_wait_for_single_work(bdi, base_work); + goto restart; + } + } + rcu_read_unlock(); +} + +#else /* CONFIG_CGROUP_WRITEBACK */ + +static struct bdi_writeback * +locked_inode_to_wb_and_lock_list(struct inode *inode) + __releases(&inode->i_lock) + __acquires(&wb->list_lock) +{ + struct bdi_writeback *wb = inode_to_wb(inode); + + spin_unlock(&inode->i_lock); + spin_lock(&wb->list_lock); + return wb; +} + +static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode) + __acquires(&wb->list_lock) +{ + struct bdi_writeback *wb = inode_to_wb(inode); + + spin_lock(&wb->list_lock); + return wb; +} + +static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages) +{ + return nr_pages; +} + +static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, + struct wb_writeback_work *base_work, + bool skip_if_busy) +{ + might_sleep(); + + if (bdi_has_dirty_io(bdi) && + (!skip_if_busy || !writeback_in_progress(&bdi->wb))) { + base_work->auto_free = 0; + base_work->single_wait = 0; + base_work->single_done = 0; + wb_queue_work(&bdi->wb, base_work); + } +} + +#endif /* CONFIG_CGROUP_WRITEBACK */ + +void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, + bool range_cyclic, enum wb_reason reason) +{ + struct wb_writeback_work *work; + + if (!wb_has_dirty_io(wb)) + return; + /* * This is WB_SYNC_NONE writeback, so if allocation fails just * wakeup the thread for old dirty data writeback */ work = kzalloc(sizeof(*work), GFP_ATOMIC); if (!work) { - trace_writeback_nowork(bdi); - bdi_wakeup_thread(bdi); + trace_writeback_nowork(wb->bdi); + wb_wakeup(wb); return; } @@ -155,46 +933,29 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, work->nr_pages = nr_pages; work->range_cyclic = range_cyclic; work->reason = reason; + work->auto_free = 1; - bdi_queue_work(bdi, work); + wb_queue_work(wb, work); } /** - * bdi_start_writeback - start writeback - * @bdi: the backing device to write from - * @nr_pages: the number of pages to write - * @reason: reason why some writeback work was initiated - * - * Description: - * This does WB_SYNC_NONE opportunistic writeback. The IO is only - * started when this function returns, we make no guarantees on - * completion. Caller need not hold sb s_umount semaphore. - * - */ -void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, - enum wb_reason reason) -{ - __bdi_start_writeback(bdi, nr_pages, true, reason); -} - -/** - * bdi_start_background_writeback - start background writeback - * @bdi: the backing device to write from + * wb_start_background_writeback - start background writeback + * @wb: bdi_writback to write from * * Description: * This makes sure WB_SYNC_NONE background writeback happens. 
When - * this function returns, it is only guaranteed that for given BDI + * this function returns, it is only guaranteed that for given wb * some IO is happening if we are over background dirty threshold. * Caller need not hold sb s_umount semaphore. */ -void bdi_start_background_writeback(struct backing_dev_info *bdi) +void wb_start_background_writeback(struct bdi_writeback *wb) { /* * We just wake up the flusher thread. It will perform background * writeback as soon as there is no other work to do. */ - trace_writeback_wake_background(bdi); - bdi_wakeup_thread(bdi); + trace_writeback_wake_background(wb->bdi); + wb_wakeup(wb); } /* @@ -202,11 +963,11 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi) */ void inode_wb_list_del(struct inode *inode) { - struct backing_dev_info *bdi = inode_to_bdi(inode); + struct bdi_writeback *wb; - spin_lock(&bdi->wb.list_lock); - list_del_init(&inode->i_wb_list); - spin_unlock(&bdi->wb.list_lock); + wb = inode_to_wb_and_lock_list(inode); + inode_wb_list_del_locked(inode, wb); + spin_unlock(&wb->list_lock); } /* @@ -220,7 +981,6 @@ void inode_wb_list_del(struct inode *inode) */ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) { - assert_spin_locked(&wb->list_lock); if (!list_empty(&wb->b_dirty)) { struct inode *tail; @@ -228,7 +988,7 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) if (time_before(inode->dirtied_when, tail->dirtied_when)) inode->dirtied_when = jiffies; } - list_move(&inode->i_wb_list, &wb->b_dirty); + inode_wb_list_move_locked(inode, wb, &wb->b_dirty); } /* @@ -236,8 +996,7 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) */ static void requeue_io(struct inode *inode, struct bdi_writeback *wb) { - assert_spin_locked(&wb->list_lock); - list_move(&inode->i_wb_list, &wb->b_more_io); + inode_wb_list_move_locked(inode, wb, &wb->b_more_io); } static void inode_sync_complete(struct inode *inode) @@ -346,6 +1105,8 @@ static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work) moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work); moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io, EXPIRE_DIRTY_ATIME, work); + if (moved) + wb_io_lists_populated(wb); trace_writeback_queue_io(wb, work, moved); } @@ -471,10 +1232,10 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, redirty_tail(inode, wb); } else if (inode->i_state & I_DIRTY_TIME) { inode->dirtied_when = jiffies; - list_move(&inode->i_wb_list, &wb->b_dirty_time); + inode_wb_list_move_locked(inode, wb, &wb->b_dirty_time); } else { /* The inode is clean. Remove from writeback lists. */ - list_del_init(&inode->i_wb_list); + inode_wb_list_del_locked(inode, wb); } } @@ -605,10 +1366,11 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK))) goto out; inode->i_state |= I_SYNC; - spin_unlock(&inode->i_lock); + wbc_attach_and_unlock_inode(wbc, inode); ret = __writeback_single_inode(inode, wbc); + wbc_detach_inode(wbc); spin_lock(&wb->list_lock); spin_lock(&inode->i_lock); /* @@ -616,7 +1378,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, * touch it. See comment above for explanation. 
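A small sketch of the ordering invariant redirty_tail() preserves above; struct inode_m stands in for struct inode, and plain '<' replaces the jiffies-wraparound-safe time_before():

	struct inode_m { unsigned long dirtied_when; };

	/* b_dirty is kept oldest-first; an inode re-queued at the tail must
	 * not look older than the current tail, or the list would no longer
	 * be sorted by dirtied_when */
	static void redirty_tail_m(struct inode_m *inode,
				   const struct inode_m *tail,
				   unsigned long now)
	{
		if (tail && inode->dirtied_when < tail->dirtied_when)
			inode->dirtied_when = now;
		/* ...then move the inode to the tail of b_dirty */
	}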
*/ if (!(inode->i_state & I_DIRTY_ALL)) - list_del_init(&inode->i_wb_list); + inode_wb_list_del_locked(inode, wb); spin_unlock(&wb->list_lock); inode_sync_complete(inode); out: @@ -624,7 +1386,7 @@ out: return ret; } -static long writeback_chunk_size(struct backing_dev_info *bdi, +static long writeback_chunk_size(struct bdi_writeback *wb, struct wb_writeback_work *work) { long pages; @@ -645,8 +1407,8 @@ static long writeback_chunk_size(struct backing_dev_info *bdi, if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages) pages = LONG_MAX; else { - pages = min(bdi->avg_write_bandwidth / 2, - global_dirty_limit / DIRTY_SCOPE); + pages = min(wb->avg_write_bandwidth / 2, + global_wb_domain.dirty_limit / DIRTY_SCOPE); pages = min(pages, work->nr_pages); pages = round_down(pages + MIN_WRITEBACK_PAGES, MIN_WRITEBACK_PAGES); @@ -741,9 +1503,9 @@ static long writeback_sb_inodes(struct super_block *sb, continue; } inode->i_state |= I_SYNC; - spin_unlock(&inode->i_lock); + wbc_attach_and_unlock_inode(&wbc, inode); - write_chunk = writeback_chunk_size(wb->bdi, work); + write_chunk = writeback_chunk_size(wb, work); wbc.nr_to_write = write_chunk; wbc.pages_skipped = 0; @@ -753,6 +1515,7 @@ static long writeback_sb_inodes(struct super_block *sb, */ __writeback_single_inode(inode, &wbc); + wbc_detach_inode(&wbc); work->nr_pages -= write_chunk - wbc.nr_to_write; wrote += write_chunk - wbc.nr_to_write; spin_lock(&wb->list_lock); @@ -830,33 +1593,6 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, return nr_pages - work.nr_pages; } -static bool over_bground_thresh(struct backing_dev_info *bdi) -{ - unsigned long background_thresh, dirty_thresh; - - global_dirty_limits(&background_thresh, &dirty_thresh); - - if (global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS) > background_thresh) - return true; - - if (bdi_stat(bdi, BDI_RECLAIMABLE) > - bdi_dirty_limit(bdi, background_thresh)) - return true; - - return false; -} - -/* - * Called under wb->list_lock. If there are multiple wb per bdi, - * only the flusher working on the first wb should do it. - */ -static void wb_update_bandwidth(struct bdi_writeback *wb, - unsigned long start_time) -{ - __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, 0, start_time); -} - /* * Explicit flushing or periodic writeback of "old" data. * @@ -899,14 +1635,14 @@ static long wb_writeback(struct bdi_writeback *wb, * after the other works are all done. */ if ((work->for_background || work->for_kupdate) && - !list_empty(&wb->bdi->work_list)) + !list_empty(&wb->work_list)) break; /* * For background writeout, stop when we are below the * background dirty threshold */ - if (work->for_background && !over_bground_thresh(wb->bdi)) + if (work->for_background && !wb_over_bg_thresh(wb)) break; /* @@ -970,18 +1706,17 @@ static long wb_writeback(struct bdi_writeback *wb, /* * Return the next wb_writeback_work struct that hasn't been processed yet. 
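The chunk sizing in writeback_chunk_size() above can be modeled in userspace as follows; MIN_WRITEBACK_PAGES_M and DIRTY_SCOPE_M are illustrative stand-ins for the kernel constants:

	#include <limits.h>

	#define MIN_WRITEBACK_PAGES_M	1024	/* illustrative granule */
	#define DIRTY_SCOPE_M		8	/* illustrative divisor */

	static long chunk_size(int integrity_sync, long avg_write_bandwidth,
			       long dirty_limit, long nr_pages)
	{
		long pages;

		if (integrity_sync)
			return LONG_MAX;	/* WB_SYNC_ALL: no chunking */
		pages = avg_write_bandwidth / 2;
		if (dirty_limit / DIRTY_SCOPE_M < pages)
			pages = dirty_limit / DIRTY_SCOPE_M;
		if (nr_pages < pages)
			pages = nr_pages;
		/* round_down(pages + MIN, MIN): at least one full granule */
		return (pages + MIN_WRITEBACK_PAGES_M) / MIN_WRITEBACK_PAGES_M
		       * MIN_WRITEBACK_PAGES_M;
	}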
*/ -static struct wb_writeback_work * -get_next_work_item(struct backing_dev_info *bdi) +static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb) { struct wb_writeback_work *work = NULL; - spin_lock_bh(&bdi->wb_lock); - if (!list_empty(&bdi->work_list)) { - work = list_entry(bdi->work_list.next, + spin_lock_bh(&wb->work_lock); + if (!list_empty(&wb->work_list)) { + work = list_entry(wb->work_list.next, struct wb_writeback_work, list); list_del_init(&work->list); } - spin_unlock_bh(&bdi->wb_lock); + spin_unlock_bh(&wb->work_lock); return work; } @@ -998,7 +1733,7 @@ static unsigned long get_nr_dirty_pages(void) static long wb_check_background_flush(struct bdi_writeback *wb) { - if (over_bground_thresh(wb->bdi)) { + if (wb_over_bg_thresh(wb)) { struct wb_writeback_work work = { .nr_pages = LONG_MAX, @@ -1053,25 +1788,33 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb) */ static long wb_do_writeback(struct bdi_writeback *wb) { - struct backing_dev_info *bdi = wb->bdi; struct wb_writeback_work *work; long wrote = 0; - set_bit(BDI_writeback_running, &wb->bdi->state); - while ((work = get_next_work_item(bdi)) != NULL) { + set_bit(WB_writeback_running, &wb->state); + while ((work = get_next_work_item(wb)) != NULL) { + struct wb_completion *done = work->done; + bool need_wake_up = false; - trace_writeback_exec(bdi, work); + trace_writeback_exec(wb->bdi, work); wrote += wb_writeback(wb, work); - /* - * Notify the caller of completion if this is a synchronous - * work item, otherwise just free it. - */ - if (work->done) - complete(work->done); - else + if (work->single_wait) { + WARN_ON_ONCE(work->auto_free); + /* paired w/ rmb in wb_wait_for_single_work() */ + smp_wmb(); + work->single_done = 1; + need_wake_up = true; + } else if (work->auto_free) { kfree(work); + } + + if (done && atomic_dec_and_test(&done->cnt)) + need_wake_up = true; + + if (need_wake_up) + wake_up_all(&wb->bdi->wb_waitq); } /* @@ -1079,7 +1822,7 @@ static long wb_do_writeback(struct bdi_writeback *wb) */ wrote += wb_check_old_data_flush(wb); wrote += wb_check_background_flush(wb); - clear_bit(BDI_writeback_running, &wb->bdi->state); + clear_bit(WB_writeback_running, &wb->state); return wrote; } @@ -1088,43 +1831,42 @@ static long wb_do_writeback(struct bdi_writeback *wb) * Handle writeback of dirty data for the device backed by this bdi. Also * reschedules periodically and does kupdated style flushing. */ -void bdi_writeback_workfn(struct work_struct *work) +void wb_workfn(struct work_struct *work) { struct bdi_writeback *wb = container_of(to_delayed_work(work), struct bdi_writeback, dwork); - struct backing_dev_info *bdi = wb->bdi; long pages_written; - set_worker_desc("flush-%s", dev_name(bdi->dev)); + set_worker_desc("flush-%s", dev_name(wb->bdi->dev)); current->flags |= PF_SWAPWRITE; if (likely(!current_is_workqueue_rescuer() || - !test_bit(BDI_registered, &bdi->state))) { + !test_bit(WB_registered, &wb->state))) { /* - * The normal path. Keep writing back @bdi until its + * The normal path. Keep writing back @wb until its * work_list is empty. Note that this path is also taken - * if @bdi is shutting down even when we're running off the + * if @wb is shutting down even when we're running off the * rescuer as work_list needs to be drained. 
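The done-counting in wb_do_writeback() above follows a counted-completion protocol; a minimal C11 sketch, with busy-waiting standing in for the kernel's wait_event()/wake_up_all() on bdi->wb_waitq:

	#include <stdatomic.h>

	/* model of DEFINE_WB_COMPLETION_ONSTACK / wb_wait_for_completion():
	 * the counter starts at 1 so the waiter holds a reference while
	 * works are being issued; each queued work adds one, each finished
	 * work drops one, and the waiter drops its own before sleeping */
	struct wb_completion_m { atomic_int cnt; };

	static void completion_init(struct wb_completion_m *done)
	{
		atomic_init(&done->cnt, 1);
	}

	static void work_issued(struct wb_completion_m *done)
	{
		atomic_fetch_add(&done->cnt, 1);
	}

	static void work_finished(struct wb_completion_m *done)
	{
		if (atomic_fetch_sub(&done->cnt, 1) == 1)
			;	/* last ref gone: wake_up_all() here */
	}

	static void wait_for_completion_m(struct wb_completion_m *done)
	{
		work_finished(done);	/* drop the initial reference */
		while (atomic_load(&done->cnt))
			;		/* kernel: wait_event() instead */
	}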
*/ do { pages_written = wb_do_writeback(wb); trace_writeback_pages_written(pages_written); - } while (!list_empty(&bdi->work_list)); + } while (!list_empty(&wb->work_list)); } else { /* * bdi_wq can't get enough workers and we're running off * the emergency worker. Don't hog it. Hopefully, 1024 is * enough for efficient IO. */ - pages_written = writeback_inodes_wb(&bdi->wb, 1024, + pages_written = writeback_inodes_wb(wb, 1024, WB_REASON_FORKER_THREAD); trace_writeback_pages_written(pages_written); } - if (!list_empty(&bdi->work_list)) + if (!list_empty(&wb->work_list)) mod_delayed_work(bdi_wq, &wb->dwork, 0); else if (wb_has_dirty_io(wb) && dirty_writeback_interval) - bdi_wakeup_thread_delayed(bdi); + wb_wakeup_delayed(wb); current->flags &= ~PF_SWAPWRITE; } @@ -1142,9 +1884,15 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason) rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { + struct bdi_writeback *wb; + struct wb_iter iter; + if (!bdi_has_dirty_io(bdi)) continue; - __bdi_start_writeback(bdi, nr_pages, false, reason); + + bdi_for_each_wb(wb, bdi, &iter, 0) + wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages), + false, reason); } rcu_read_unlock(); } @@ -1173,9 +1921,12 @@ static void wakeup_dirtytime_writeback(struct work_struct *w) rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { - if (list_empty(&bdi->wb.b_dirty_time)) - continue; - bdi_wakeup_thread(bdi); + struct bdi_writeback *wb; + struct wb_iter iter; + + bdi_for_each_wb(wb, bdi, &iter, 0) + if (!list_empty(&bdi->wb.b_dirty_time)) + wb_wakeup(&bdi->wb); } rcu_read_unlock(); schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ); @@ -1249,7 +2000,6 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode) void __mark_inode_dirty(struct inode *inode, int flags) { struct super_block *sb = inode->i_sb; - struct backing_dev_info *bdi = NULL; int dirtytime; trace_writeback_mark_inode_dirty(inode, flags); @@ -1289,6 +2039,8 @@ void __mark_inode_dirty(struct inode *inode, int flags) if ((inode->i_state & flags) != flags) { const int was_dirty = inode->i_state & I_DIRTY; + inode_attach_wb(inode, NULL); + if (flags & I_DIRTY_INODE) inode->i_state &= ~I_DIRTY_TIME; inode->i_state |= flags; @@ -1317,38 +2069,39 @@ void __mark_inode_dirty(struct inode *inode, int flags) * reposition it (that would break b_dirty time-ordering). */ if (!was_dirty) { + struct bdi_writeback *wb; + struct list_head *dirty_list; bool wakeup_bdi = false; - bdi = inode_to_bdi(inode); - spin_unlock(&inode->i_lock); - spin_lock(&bdi->wb.list_lock); - if (bdi_cap_writeback_dirty(bdi)) { - WARN(!test_bit(BDI_registered, &bdi->state), - "bdi-%s not registered\n", bdi->name); + wb = locked_inode_to_wb_and_lock_list(inode); - /* - * If this is the first dirty inode for this - * bdi, we have to wake-up the corresponding - * bdi thread to make sure background - * write-back happens later. 
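The "first dirty inode kicks the flusher" rule described in the comment above survives the refactor via the return value of inode_wb_list_move_locked(); a toy model, with list lengths standing in for the real list_head lists:

	#include <stdbool.h>
	#include <stddef.h>

	struct wb_m { size_t b_dirty, b_dirty_time; };	/* list lengths */

	/* returns true when the move made the wb's dirty lists non-empty --
	 * i.e. this was the first dirty inode and wb_wakeup_delayed() is due */
	static bool move_to_dirty_list(struct wb_m *wb, bool timestamp_only)
	{
		bool was_empty = (wb->b_dirty + wb->b_dirty_time) == 0;

		if (timestamp_only)
			wb->b_dirty_time++;	/* I_DIRTY_TIME: cheap list */
		else
			wb->b_dirty++;		/* data/metadata: b_dirty */
		return was_empty;
	}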
- */ - if (!wb_has_dirty_io(&bdi->wb)) - wakeup_bdi = true; - } + WARN(bdi_cap_writeback_dirty(wb->bdi) && + !test_bit(WB_registered, &wb->state), + "bdi-%s not registered\n", wb->bdi->name); inode->dirtied_when = jiffies; if (dirtytime) inode->dirtied_time_when = jiffies; + if (inode->i_state & (I_DIRTY_INODE | I_DIRTY_PAGES)) - list_move(&inode->i_wb_list, &bdi->wb.b_dirty); + dirty_list = &wb->b_dirty; else - list_move(&inode->i_wb_list, - &bdi->wb.b_dirty_time); - spin_unlock(&bdi->wb.list_lock); + dirty_list = &wb->b_dirty_time; + + wakeup_bdi = inode_wb_list_move_locked(inode, wb, + dirty_list); + + spin_unlock(&wb->list_lock); trace_writeback_dirty_inode_enqueue(inode); - if (wakeup_bdi) - bdi_wakeup_thread_delayed(bdi); + /* + * If this is the first dirty inode for this bdi, + * we have to wake-up the corresponding bdi thread + * to make sure background write-back happens + * later. + */ + if (bdi_cap_writeback_dirty(wb->bdi) && wakeup_bdi) + wb_wakeup_delayed(wb); return; } } @@ -1411,6 +2164,28 @@ static void wait_sb_inodes(struct super_block *sb) iput(old_inode); } +static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr, + enum wb_reason reason, bool skip_if_busy) +{ + DEFINE_WB_COMPLETION_ONSTACK(done); + struct wb_writeback_work work = { + .sb = sb, + .sync_mode = WB_SYNC_NONE, + .tagged_writepages = 1, + .done = &done, + .nr_pages = nr, + .reason = reason, + }; + struct backing_dev_info *bdi = sb->s_bdi; + + if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info) + return; + WARN_ON(!rwsem_is_locked(&sb->s_umount)); + + bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy); + wb_wait_for_completion(bdi, &done); +} + /** * writeback_inodes_sb_nr - writeback dirty inodes from given super_block * @sb: the superblock @@ -1425,21 +2200,7 @@ void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr, enum wb_reason reason) { - DECLARE_COMPLETION_ONSTACK(done); - struct wb_writeback_work work = { - .sb = sb, - .sync_mode = WB_SYNC_NONE, - .tagged_writepages = 1, - .done = &done, - .nr_pages = nr, - .reason = reason, - }; - - if (sb->s_bdi == &noop_backing_dev_info) - return; - WARN_ON(!rwsem_is_locked(&sb->s_umount)); - bdi_queue_work(sb->s_bdi, &work); - wait_for_completion(&done); + __writeback_inodes_sb_nr(sb, nr, reason, false); } EXPORT_SYMBOL(writeback_inodes_sb_nr); @@ -1467,19 +2228,15 @@ EXPORT_SYMBOL(writeback_inodes_sb); * Invoke writeback_inodes_sb_nr if no writeback is currently underway. * Returns 1 if writeback was started, 0 if not. */ -int try_to_writeback_inodes_sb_nr(struct super_block *sb, - unsigned long nr, - enum wb_reason reason) +bool try_to_writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr, + enum wb_reason reason) { - if (writeback_in_progress(sb->s_bdi)) - return 1; - if (!down_read_trylock(&sb->s_umount)) - return 0; + return false; - writeback_inodes_sb_nr(sb, nr, reason); + __writeback_inodes_sb_nr(sb, nr, reason, true); up_read(&sb->s_umount); - return 1; + return true; } EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr); @@ -1491,7 +2248,7 @@ EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr); * Implement by try_to_writeback_inodes_sb_nr() * Returns 1 if writeback was started, 0 if not. 
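The try_to_writeback_inodes_sb_nr() guard above maps naturally onto a reader trylock; a userspace sketch using pthreads, where the rwlock stands in for sb->s_umount:

	#include <pthread.h>
	#include <stdbool.h>

	/* only start writeback if the superblock isn't being unmounted, i.e.
	 * s_umount can be read-locked without sleeping; report whether
	 * writeback was actually started, as the new bool return does */
	static bool try_writeback(pthread_rwlock_t *s_umount)
	{
		if (pthread_rwlock_tryrdlock(s_umount) != 0)
			return false;	/* contended: skip, don't block */
		/* ... __writeback_inodes_sb_nr(sb, nr, reason, true) ... */
		pthread_rwlock_unlock(s_umount);
		return true;
	}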
*/ -int try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) +bool try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) { return try_to_writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason); } @@ -1506,7 +2263,7 @@ EXPORT_SYMBOL(try_to_writeback_inodes_sb); */ void sync_inodes_sb(struct super_block *sb) { - DECLARE_COMPLETION_ONSTACK(done); + DEFINE_WB_COMPLETION_ONSTACK(done); struct wb_writeback_work work = { .sb = sb, .sync_mode = WB_SYNC_ALL, @@ -1516,14 +2273,15 @@ void sync_inodes_sb(struct super_block *sb) .reason = WB_REASON_SYNC, .for_sync = 1, }; + struct backing_dev_info *bdi = sb->s_bdi; /* Nothing to do? */ - if (sb->s_bdi == &noop_backing_dev_info) + if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info) return; WARN_ON(!rwsem_is_locked(&sb->s_umount)); - bdi_queue_work(sb->s_bdi, &work); - wait_for_completion(&done); + bdi_split_work_to_wbs(bdi, &work, false); + wb_wait_for_completion(bdi, &done); wait_sb_inodes(sb); } diff --git a/fs/fs_pin.c b/fs/fs_pin.c index b06c98796afb..611b5408f6ec 100644 --- a/fs/fs_pin.c +++ b/fs/fs_pin.c @@ -9,8 +9,8 @@ static DEFINE_SPINLOCK(pin_lock); void pin_remove(struct fs_pin *pin) { spin_lock(&pin_lock); - hlist_del(&pin->m_list); - hlist_del(&pin->s_list); + hlist_del_init(&pin->m_list); + hlist_del_init(&pin->s_list); spin_unlock(&pin_lock); spin_lock_irq(&pin->wait.lock); pin->done = 1; diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 89acec742e0b..d403c69bee08 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -327,7 +327,8 @@ static int fscache_alloc_object(struct fscache_cache *cache, object_already_extant: ret = -ENOBUFS; - if (fscache_object_is_dead(object)) { + if (fscache_object_is_dying(object) || + fscache_cache_is_broken(object)) { spin_unlock(&cookie->lock); goto error; } @@ -671,7 +672,7 @@ int __fscache_check_consistency(struct fscache_cookie *cookie) if (!op) return -ENOMEM; - fscache_operation_init(op, NULL, NULL); + fscache_operation_init(op, NULL, NULL, NULL); op->flags = FSCACHE_OP_MYTHREAD | (1 << FSCACHE_OP_WAITING) | (1 << FSCACHE_OP_UNUSE_COOKIE); @@ -695,8 +696,7 @@ int __fscache_check_consistency(struct fscache_cookie *cookie) /* the work queue now carries its own ref on the object */ spin_unlock(&cookie->lock); - ret = fscache_wait_for_operation_activation(object, op, - NULL, NULL, NULL); + ret = fscache_wait_for_operation_activation(object, op, NULL, NULL); if (ret == 0) { /* ask the cache to honour the operation */ ret = object->cache->ops->check_consistency(op); diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index 7872a62ef30c..97ec45110957 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -124,8 +124,7 @@ extern int fscache_submit_exclusive_op(struct fscache_object *, struct fscache_operation *); extern int fscache_submit_op(struct fscache_object *, struct fscache_operation *); -extern int fscache_cancel_op(struct fscache_operation *, - void (*)(struct fscache_operation *)); +extern int fscache_cancel_op(struct fscache_operation *, bool); extern void fscache_cancel_all_ops(struct fscache_object *); extern void fscache_abort_object(struct fscache_object *); extern void fscache_start_operations(struct fscache_object *); @@ -138,8 +137,7 @@ extern int fscache_wait_for_deferred_lookup(struct fscache_cookie *); extern int fscache_wait_for_operation_activation(struct fscache_object *, struct fscache_operation *, atomic_t *, - atomic_t *, - void (*)(struct fscache_operation *)); + atomic_t *); extern void 
fscache_invalidate_writes(struct fscache_cookie *); /* @@ -164,6 +162,7 @@ extern atomic_t fscache_n_op_pend; extern atomic_t fscache_n_op_run; extern atomic_t fscache_n_op_enqueue; extern atomic_t fscache_n_op_deferred_release; +extern atomic_t fscache_n_op_initialised; extern atomic_t fscache_n_op_release; extern atomic_t fscache_n_op_gc; extern atomic_t fscache_n_op_cancelled; @@ -271,6 +270,11 @@ extern atomic_t fscache_n_cop_write_page; extern atomic_t fscache_n_cop_uncache_page; extern atomic_t fscache_n_cop_dissociate_pages; +extern atomic_t fscache_n_cache_no_space_reject; +extern atomic_t fscache_n_cache_stale_objects; +extern atomic_t fscache_n_cache_retired_objects; +extern atomic_t fscache_n_cache_culled_objects; + static inline void fscache_stat(atomic_t *stat) { atomic_inc(stat); diff --git a/fs/fscache/object.c b/fs/fscache/object.c index da032daf0e0d..9e792e30f4db 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -328,6 +328,17 @@ void fscache_object_init(struct fscache_object *object, EXPORT_SYMBOL(fscache_object_init); /* + * Mark the object as no longer being live, making sure that we synchronise + * against op submission. + */ +static inline void fscache_mark_object_dead(struct fscache_object *object) +{ + spin_lock(&object->lock); + clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags); + spin_unlock(&object->lock); +} + +/* * Abort object initialisation before we start it. */ static const struct fscache_state *fscache_abort_initialisation(struct fscache_object *object, @@ -610,6 +621,8 @@ static const struct fscache_state *fscache_lookup_failure(struct fscache_object object->cache->ops->lookup_complete(object); fscache_stat_d(&fscache_n_cop_lookup_complete); + set_bit(FSCACHE_OBJECT_KILLED_BY_CACHE, &object->flags); + cookie = object->cookie; set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags); if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)) @@ -629,7 +642,7 @@ static const struct fscache_state *fscache_kill_object(struct fscache_object *ob _enter("{OBJ%x,%d,%d},%d", object->debug_id, object->n_ops, object->n_children, event); - clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags); + fscache_mark_object_dead(object); object->oob_event_mask = 0; if (list_empty(&object->dependents) && @@ -948,7 +961,8 @@ static const struct fscache_state *_fscache_invalidate_object(struct fscache_obj if (!op) goto nomem; - fscache_operation_init(op, object->cache->ops->invalidate_object, NULL); + fscache_operation_init(op, object->cache->ops->invalidate_object, + NULL, NULL); op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE) | (1 << FSCACHE_OP_UNUSE_COOKIE); @@ -974,13 +988,13 @@ static const struct fscache_state *_fscache_invalidate_object(struct fscache_obj return transit_to(UPDATE_OBJECT); nomem: - clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags); + fscache_mark_object_dead(object); fscache_unuse_cookie(object); _leave(" [ENOMEM]"); return transit_to(KILL_OBJECT); submit_op_failed: - clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags); + fscache_mark_object_dead(object); spin_unlock(&cookie->lock); fscache_unuse_cookie(object); kfree(op); @@ -1016,3 +1030,50 @@ static const struct fscache_state *fscache_update_object(struct fscache_object * _leave(""); return transit_to(WAIT_FOR_CMD); } + +/** + * fscache_object_retrying_stale - Note retrying stale object + * @object: The object that will be retried + * + * Note that an object lookup found an on-disk object that was adjudged to be + * stale and has been deleted. The lookup will be retried. 
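The statistics mapping added just below can be summarised by a plain switch; this userspace model uses ints in place of the atomic_t counters, with local enum names mirroring the FSCACHE_OBJECT_* reasons:

	enum why_killed_m { NO_SPACE, IS_STALE, WAS_RETIRED, WAS_CULLED };

	static int n_no_space, n_stale, n_retired, n_culled;

	static void count_kill(enum why_killed_m why)
	{
		switch (why) {
		case NO_SPACE:    n_no_space++; break;	/* "nsp=" in stats */
		case IS_STALE:    n_stale++;    break;	/* "stl=" */
		case WAS_RETIRED: n_retired++;  break;	/* "rtr=" */
		case WAS_CULLED:  n_culled++;   break;	/* "cul=" */
		}
	}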
+ */ +void fscache_object_retrying_stale(struct fscache_object *object) +{ + fscache_stat(&fscache_n_cache_no_space_reject); +} +EXPORT_SYMBOL(fscache_object_retrying_stale); + +/** + * fscache_object_mark_killed - Note that an object was killed + * @object: The object that was killed + * @why: The reason the object was killed. + * + * Note that an object was killed. If the object has already been marked + * killed, an error is logged and nothing further is done. + */ +void fscache_object_mark_killed(struct fscache_object *object, + enum fscache_why_object_killed why) +{ + if (test_and_set_bit(FSCACHE_OBJECT_KILLED_BY_CACHE, &object->flags)) { + pr_err("Error: Object already killed by cache [%s]\n", + object->cache->identifier); + return; + } + + switch (why) { + case FSCACHE_OBJECT_NO_SPACE: + fscache_stat(&fscache_n_cache_no_space_reject); + break; + case FSCACHE_OBJECT_IS_STALE: + fscache_stat(&fscache_n_cache_stale_objects); + break; + case FSCACHE_OBJECT_WAS_RETIRED: + fscache_stat(&fscache_n_cache_retired_objects); + break; + case FSCACHE_OBJECT_WAS_CULLED: + fscache_stat(&fscache_n_cache_culled_objects); + break; + } +} +EXPORT_SYMBOL(fscache_object_mark_killed); diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index e7b87a0e5185..de67745e1cd7 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -20,6 +20,35 @@ atomic_t fscache_op_debug_id; EXPORT_SYMBOL(fscache_op_debug_id); +static void fscache_operation_dummy_cancel(struct fscache_operation *op) +{ +} + +/** + * fscache_operation_init - Do basic initialisation of an operation + * @op: The operation to initialise + * @processor: The processor function to assign (may be NULL) + * @cancel: The cancellation function to assign (a no-op is used if NULL) + * @release: The release function to assign + * + * Do basic initialisation of an operation. The caller must still set flags + * and the object if needed. + */ +void fscache_operation_init(struct fscache_operation *op, + fscache_operation_processor_t processor, + fscache_operation_cancel_t cancel, + fscache_operation_release_t release) +{ + INIT_WORK(&op->work, fscache_op_work_func); + atomic_set(&op->usage, 1); + op->state = FSCACHE_OP_ST_INITIALISED; + op->debug_id = atomic_inc_return(&fscache_op_debug_id); + op->processor = processor; + op->cancel = cancel ?: fscache_operation_dummy_cancel; + op->release = release; + INIT_LIST_HEAD(&op->pend_link); + fscache_stat(&fscache_n_op_initialised); +} +EXPORT_SYMBOL(fscache_operation_init); + /** * fscache_enqueue_operation - Enqueue an operation for processing * @op: The operation to enqueue @@ -76,6 +105,43 @@ static void fscache_run_op(struct fscache_object *object, } /* + * report an unexpected submission + */ +static void fscache_report_unexpected_submission(struct fscache_object *object, + struct fscache_operation *op, + const struct fscache_state *ostate) +{ + static bool once_only; + struct fscache_operation *p; + unsigned n; + + if (once_only) + return; + once_only = true; + + kdebug("unexpected submission OP%x [OBJ%x %s]", + op->debug_id, object->debug_id, object->state->name); + kdebug("objstate=%s [%s]", object->state->name, ostate->name); + kdebug("objflags=%lx", object->flags); + kdebug("objevent=%lx [%lx]", object->events, object->event_mask); + kdebug("ops=%u inp=%u exc=%u", + object->n_ops, object->n_in_progress, object->n_exclusive); + + if (!list_empty(&object->pending_ops)) { + n = 0; + list_for_each_entry(p, &object->pending_ops, pend_link) { + ASSERTCMP(p->object, ==, object); + kdebug("%p %p", p->processor, p->release); + n++; + } + + kdebug("n=%u", n); + } + + dump_stack(); +} + +/* * submit an exclusive operation for an object * - 
other ops are excluded from running simultaneously with this one * - this gets any extra refs it needs on an op @@ -83,6 +149,8 @@ static void fscache_run_op(struct fscache_object *object, int fscache_submit_exclusive_op(struct fscache_object *object, struct fscache_operation *op) { + const struct fscache_state *ostate; + unsigned long flags; int ret; _enter("{OBJ%x OP%x},", object->debug_id, op->debug_id); @@ -95,8 +163,21 @@ int fscache_submit_exclusive_op(struct fscache_object *object, ASSERTCMP(object->n_ops, >=, object->n_exclusive); ASSERT(list_empty(&op->pend_link)); + ostate = object->state; + smp_rmb(); + op->state = FSCACHE_OP_ST_PENDING; - if (fscache_object_is_active(object)) { + flags = READ_ONCE(object->flags); + if (unlikely(!(flags & BIT(FSCACHE_OBJECT_IS_LIVE)))) { + fscache_stat(&fscache_n_op_rejected); + op->cancel(op); + op->state = FSCACHE_OP_ST_CANCELLED; + ret = -ENOBUFS; + } else if (unlikely(fscache_cache_is_broken(object))) { + op->cancel(op); + op->state = FSCACHE_OP_ST_CANCELLED; + ret = -EIO; + } else if (flags & BIT(FSCACHE_OBJECT_IS_AVAILABLE)) { op->object = object; object->n_ops++; object->n_exclusive++; /* reads and writes must wait */ @@ -118,7 +199,7 @@ int fscache_submit_exclusive_op(struct fscache_object *object, /* need to issue a new write op after this */ clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); ret = 0; - } else if (test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) { + } else if (flags & BIT(FSCACHE_OBJECT_IS_LOOKED_UP)) { op->object = object; object->n_ops++; object->n_exclusive++; /* reads and writes must wait */ @@ -126,12 +207,15 @@ int fscache_submit_exclusive_op(struct fscache_object *object, list_add_tail(&op->pend_link, &object->pending_ops); fscache_stat(&fscache_n_op_pend); ret = 0; + } else if (flags & BIT(FSCACHE_OBJECT_KILLED_BY_CACHE)) { + op->cancel(op); + op->state = FSCACHE_OP_ST_CANCELLED; + ret = -ENOBUFS; } else { - /* If we're in any other state, there must have been an I/O - * error of some nature. 
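The submission paths above now branch on one READ_ONCE() snapshot of object->flags; a condensed userspace model of that decision ladder (bit values illustrative, error codes as in the patch):

	#include <errno.h>

	#define LIVE_M		(1u << 0)	/* FSCACHE_OBJECT_IS_LIVE */
	#define AVAILABLE_M	(1u << 1)	/* FSCACHE_OBJECT_IS_AVAILABLE */
	#define LOOKED_UP_M	(1u << 2)	/* FSCACHE_OBJECT_IS_LOOKED_UP */
	#define KILLED_M	(1u << 3)	/* FSCACHE_OBJECT_KILLED_BY_CACHE */

	static int submit_decision(unsigned int flags, int cache_broken)
	{
		if (!(flags & LIVE_M))
			return -ENOBUFS;	/* dead: count rejection, cancel */
		if (cache_broken)
			return -EIO;		/* cache saw an I/O error */
		if (flags & (AVAILABLE_M | LOOKED_UP_M))
			return 0;		/* run now, or queue as pending */
		if (flags & KILLED_M)
			return -ENOBUFS;	/* killed: cancel quietly */
		return -ENOBUFS;		/* unexpected: report, then cancel */
	}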
- */ - ASSERT(test_bit(FSCACHE_IOERROR, &object->cache->flags)); - ret = -EIO; + fscache_report_unexpected_submission(object, op, ostate); + op->cancel(op); + op->state = FSCACHE_OP_ST_CANCELLED; + ret = -ENOBUFS; } spin_unlock(&object->lock); @@ -139,43 +223,6 @@ int fscache_submit_exclusive_op(struct fscache_object *object, } /* - * report an unexpected submission - */ -static void fscache_report_unexpected_submission(struct fscache_object *object, - struct fscache_operation *op, - const struct fscache_state *ostate) -{ - static bool once_only; - struct fscache_operation *p; - unsigned n; - - if (once_only) - return; - once_only = true; - - kdebug("unexpected submission OP%x [OBJ%x %s]", - op->debug_id, object->debug_id, object->state->name); - kdebug("objstate=%s [%s]", object->state->name, ostate->name); - kdebug("objflags=%lx", object->flags); - kdebug("objevent=%lx [%lx]", object->events, object->event_mask); - kdebug("ops=%u inp=%u exc=%u", - object->n_ops, object->n_in_progress, object->n_exclusive); - - if (!list_empty(&object->pending_ops)) { - n = 0; - list_for_each_entry(p, &object->pending_ops, pend_link) { - ASSERTCMP(p->object, ==, object); - kdebug("%p %p", op->processor, op->release); - n++; - } - - kdebug("n=%u", n); - } - - dump_stack(); -} - -/* * submit an operation for an object * - objects may be submitted only in the following states: * - during object creation (write ops may be submitted) @@ -187,6 +234,7 @@ int fscache_submit_op(struct fscache_object *object, struct fscache_operation *op) { const struct fscache_state *ostate; + unsigned long flags; int ret; _enter("{OBJ%x OP%x},{%u}", @@ -204,7 +252,17 @@ int fscache_submit_op(struct fscache_object *object, smp_rmb(); op->state = FSCACHE_OP_ST_PENDING; - if (fscache_object_is_active(object)) { + flags = READ_ONCE(object->flags); + if (unlikely(!(flags & BIT(FSCACHE_OBJECT_IS_LIVE)))) { + fscache_stat(&fscache_n_op_rejected); + op->cancel(op); + op->state = FSCACHE_OP_ST_CANCELLED; + ret = -ENOBUFS; + } else if (unlikely(fscache_cache_is_broken(object))) { + op->cancel(op); + op->state = FSCACHE_OP_ST_CANCELLED; + ret = -EIO; + } else if (flags & BIT(FSCACHE_OBJECT_IS_AVAILABLE)) { op->object = object; object->n_ops++; @@ -222,23 +280,21 @@ int fscache_submit_op(struct fscache_object *object, fscache_run_op(object, op); } ret = 0; - } else if (test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) { + } else if (flags & BIT(FSCACHE_OBJECT_IS_LOOKED_UP)) { op->object = object; object->n_ops++; atomic_inc(&op->usage); list_add_tail(&op->pend_link, &object->pending_ops); fscache_stat(&fscache_n_op_pend); ret = 0; - } else if (fscache_object_is_dying(object)) { - fscache_stat(&fscache_n_op_rejected); + } else if (flags & BIT(FSCACHE_OBJECT_KILLED_BY_CACHE)) { + op->cancel(op); op->state = FSCACHE_OP_ST_CANCELLED; ret = -ENOBUFS; - } else if (!test_bit(FSCACHE_IOERROR, &object->cache->flags)) { + } else { fscache_report_unexpected_submission(object, op, ostate); ASSERT(!fscache_object_is_active(object)); - op->state = FSCACHE_OP_ST_CANCELLED; - ret = -ENOBUFS; - } else { + op->cancel(op); op->state = FSCACHE_OP_ST_CANCELLED; ret = -ENOBUFS; } @@ -293,9 +349,10 @@ void fscache_start_operations(struct fscache_object *object) * cancel an operation that's pending on an object */ int fscache_cancel_op(struct fscache_operation *op, - void (*do_cancel)(struct fscache_operation *)) + bool cancel_in_progress_op) { struct fscache_object *object = op->object; + bool put = false; int ret; _enter("OBJ%x OP%x}", op->object->debug_id, 
op->debug_id); @@ -309,19 +366,37 @@ int fscache_cancel_op(struct fscache_operation *op, ret = -EBUSY; if (op->state == FSCACHE_OP_ST_PENDING) { ASSERT(!list_empty(&op->pend_link)); - fscache_stat(&fscache_n_op_cancelled); list_del_init(&op->pend_link); - if (do_cancel) - do_cancel(op); + put = true; + + fscache_stat(&fscache_n_op_cancelled); + op->cancel(op); + op->state = FSCACHE_OP_ST_CANCELLED; + if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) + object->n_exclusive--; + if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) + wake_up_bit(&op->flags, FSCACHE_OP_WAITING); + ret = 0; + } else if (op->state == FSCACHE_OP_ST_IN_PROGRESS && cancel_in_progress_op) { + ASSERTCMP(object->n_in_progress, >, 0); + if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) + object->n_exclusive--; + object->n_in_progress--; + if (object->n_in_progress == 0) + fscache_start_operations(object); + + fscache_stat(&fscache_n_op_cancelled); + op->cancel(op); op->state = FSCACHE_OP_ST_CANCELLED; if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) object->n_exclusive--; if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) wake_up_bit(&op->flags, FSCACHE_OP_WAITING); - fscache_put_operation(op); ret = 0; } + if (put) + fscache_put_operation(op); spin_unlock(&object->lock); _leave(" = %d", ret); return ret; @@ -345,6 +420,7 @@ void fscache_cancel_all_ops(struct fscache_object *object) list_del_init(&op->pend_link); ASSERTCMP(op->state, ==, FSCACHE_OP_ST_PENDING); + op->cancel(op); op->state = FSCACHE_OP_ST_CANCELLED; if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) @@ -377,8 +453,12 @@ void fscache_op_complete(struct fscache_operation *op, bool cancelled) spin_lock(&object->lock); - op->state = cancelled ? - FSCACHE_OP_ST_CANCELLED : FSCACHE_OP_ST_COMPLETE; + if (!cancelled) { + op->state = FSCACHE_OP_ST_COMPLETE; + } else { + op->cancel(op); + op->state = FSCACHE_OP_ST_CANCELLED; + } if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) object->n_exclusive--; @@ -409,9 +489,9 @@ void fscache_put_operation(struct fscache_operation *op) return; _debug("PUT OP"); - ASSERTIFCMP(op->state != FSCACHE_OP_ST_COMPLETE, + ASSERTIFCMP(op->state != FSCACHE_OP_ST_INITIALISED && + op->state != FSCACHE_OP_ST_COMPLETE, op->state, ==, FSCACHE_OP_ST_CANCELLED); - op->state = FSCACHE_OP_ST_DEAD; fscache_stat(&fscache_n_op_release); @@ -419,37 +499,39 @@ void fscache_put_operation(struct fscache_operation *op) op->release(op); op->release = NULL; } + op->state = FSCACHE_OP_ST_DEAD; object = op->object; + if (likely(object)) { + if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags)) + atomic_dec(&object->n_reads); + if (test_bit(FSCACHE_OP_UNUSE_COOKIE, &op->flags)) + fscache_unuse_cookie(object); + + /* now... we may get called with the object spinlock held, so we + * complete the cleanup here only if we can immediately acquire the + * lock, and defer it otherwise */ + if (!spin_trylock(&object->lock)) { + _debug("defer put"); + fscache_stat(&fscache_n_op_deferred_release); + + cache = object->cache; + spin_lock(&cache->op_gc_list_lock); + list_add_tail(&op->pend_link, &cache->op_gc_list); + spin_unlock(&cache->op_gc_list_lock); + schedule_work(&cache->op_gc); + _leave(" [defer]"); + return; + } - if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags)) - atomic_dec(&object->n_reads); - if (test_bit(FSCACHE_OP_UNUSE_COOKIE, &op->flags)) - fscache_unuse_cookie(object); - - /* now... 
we may get called with the object spinlock held, so we - * complete the cleanup here only if we can immediately acquire the - * lock, and defer it otherwise */ - if (!spin_trylock(&object->lock)) { - _debug("defer put"); - fscache_stat(&fscache_n_op_deferred_release); + ASSERTCMP(object->n_ops, >, 0); + object->n_ops--; + if (object->n_ops == 0) + fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED); - cache = object->cache; - spin_lock(&cache->op_gc_list_lock); - list_add_tail(&op->pend_link, &cache->op_gc_list); - spin_unlock(&cache->op_gc_list_lock); - schedule_work(&cache->op_gc); - _leave(" [defer]"); - return; + spin_unlock(&object->lock); } - ASSERTCMP(object->n_ops, >, 0); - object->n_ops--; - if (object->n_ops == 0) - fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED); - - spin_unlock(&object->lock); - kfree(op); _leave(" [done]"); } diff --git a/fs/fscache/page.c b/fs/fscache/page.c index de33b3fccca6..483bbc613bf0 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -213,7 +213,7 @@ int __fscache_attr_changed(struct fscache_cookie *cookie) return -ENOMEM; } - fscache_operation_init(op, fscache_attr_changed_op, NULL); + fscache_operation_init(op, fscache_attr_changed_op, NULL, NULL); op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE) | (1 << FSCACHE_OP_UNUSE_COOKIE); @@ -239,7 +239,7 @@ nobufs_dec: wake_cookie = __fscache_unuse_cookie(cookie); nobufs: spin_unlock(&cookie->lock); - kfree(op); + fscache_put_operation(op); if (wake_cookie) __fscache_wake_unused_cookie(cookie); fscache_stat(&fscache_n_attr_changed_nobufs); @@ -249,6 +249,17 @@ nobufs: EXPORT_SYMBOL(__fscache_attr_changed); /* + * Handle cancellation of a pending retrieval op + */ +static void fscache_do_cancel_retrieval(struct fscache_operation *_op) +{ + struct fscache_retrieval *op = + container_of(_op, struct fscache_retrieval, op); + + atomic_set(&op->n_pages, 0); +} + +/* * release a retrieval op reference */ static void fscache_release_retrieval_op(struct fscache_operation *_op) @@ -258,11 +269,12 @@ static void fscache_release_retrieval_op(struct fscache_operation *_op) _enter("{OP%x}", op->op.debug_id); - ASSERTCMP(atomic_read(&op->n_pages), ==, 0); + ASSERTIFCMP(op->op.state != FSCACHE_OP_ST_INITIALISED, + atomic_read(&op->n_pages), ==, 0); fscache_hist(fscache_retrieval_histogram, op->start_time); if (op->context) - fscache_put_context(op->op.object->cookie, op->context); + fscache_put_context(op->cookie, op->context); _leave(""); } @@ -285,15 +297,24 @@ static struct fscache_retrieval *fscache_alloc_retrieval( return NULL; } - fscache_operation_init(&op->op, NULL, fscache_release_retrieval_op); + fscache_operation_init(&op->op, NULL, + fscache_do_cancel_retrieval, + fscache_release_retrieval_op); op->op.flags = FSCACHE_OP_MYTHREAD | (1UL << FSCACHE_OP_WAITING) | (1UL << FSCACHE_OP_UNUSE_COOKIE); + op->cookie = cookie; op->mapping = mapping; op->end_io_func = end_io_func; op->context = context; op->start_time = jiffies; INIT_LIST_HEAD(&op->to_do); + + /* Pin the netfs read context in case we need to do the actual netfs + * read because we've encountered a cache read failure. 
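The deferred-release path in fscache_put_operation() above is a trylock-or-defer pattern; a userspace sketch with a pthread mutex standing in for object->lock and a callback standing in for the op_gc work item:

	#include <pthread.h>

	static void put_op_m(pthread_mutex_t *object_lock,
			     void (*defer_to_gc)(void))
	{
		if (pthread_mutex_trylock(object_lock) != 0) {
			/* lock already held (possibly by our own caller):
			 * hand the cleanup to the op_gc work item instead */
			defer_to_gc();
			return;
		}
		/* drop object->n_ops, raise EV_CLEARED when it reaches zero */
		pthread_mutex_unlock(object_lock);
	}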
+ */ + if (context) + fscache_get_context(op->cookie, context); return op; } @@ -330,24 +351,12 @@ int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie) } /* - * Handle cancellation of a pending retrieval op - */ -static void fscache_do_cancel_retrieval(struct fscache_operation *_op) -{ - struct fscache_retrieval *op = - container_of(_op, struct fscache_retrieval, op); - - atomic_set(&op->n_pages, 0); -} - -/* * wait for an object to become active (or dead) */ int fscache_wait_for_operation_activation(struct fscache_object *object, struct fscache_operation *op, atomic_t *stat_op_waits, - atomic_t *stat_object_dead, - void (*do_cancel)(struct fscache_operation *)) + atomic_t *stat_object_dead) { int ret; @@ -359,7 +368,7 @@ int fscache_wait_for_operation_activation(struct fscache_object *object, fscache_stat(stat_op_waits); if (wait_on_bit(&op->flags, FSCACHE_OP_WAITING, TASK_INTERRUPTIBLE) != 0) { - ret = fscache_cancel_op(op, do_cancel); + ret = fscache_cancel_op(op, false); if (ret == 0) return -ERESTARTSYS; @@ -377,11 +386,13 @@ check_if_dead: _leave(" = -ENOBUFS [cancelled]"); return -ENOBUFS; } - if (unlikely(fscache_object_is_dead(object))) { - pr_err("%s() = -ENOBUFS [obj dead %d]\n", __func__, op->state); - fscache_cancel_op(op, do_cancel); + if (unlikely(fscache_object_is_dying(object) || + fscache_cache_is_broken(object))) { + enum fscache_operation_state state = op->state; + fscache_cancel_op(op, true); if (stat_object_dead) fscache_stat(stat_object_dead); + _leave(" = -ENOBUFS [obj dead %d]", state); return -ENOBUFS; } return 0; @@ -453,17 +464,12 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, fscache_stat(&fscache_n_retrieval_ops); - /* pin the netfs read context in case we need to do the actual netfs - * read because we've encountered a cache read failure */ - fscache_get_context(object->cookie, op->context); - /* we wait for the operation to become active, and then process it * *here*, in this thread, and not in the thread pool */ ret = fscache_wait_for_operation_activation( object, &op->op, __fscache_stat(&fscache_n_retrieval_op_waits), - __fscache_stat(&fscache_n_retrievals_object_dead), - fscache_do_cancel_retrieval); + __fscache_stat(&fscache_n_retrievals_object_dead)); if (ret < 0) goto error; @@ -503,7 +509,7 @@ nobufs_unlock: spin_unlock(&cookie->lock); if (wake_cookie) __fscache_wake_unused_cookie(cookie); - kfree(op); + fscache_put_retrieval(op); nobufs: fscache_stat(&fscache_n_retrievals_nobufs); _leave(" = -ENOBUFS"); @@ -584,17 +590,12 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, fscache_stat(&fscache_n_retrieval_ops); - /* pin the netfs read context in case we need to do the actual netfs - * read because we've encountered a cache read failure */ - fscache_get_context(object->cookie, op->context); - /* we wait for the operation to become active, and then process it * *here*, in this thread, and not in the thread pool */ ret = fscache_wait_for_operation_activation( object, &op->op, __fscache_stat(&fscache_n_retrieval_op_waits), - __fscache_stat(&fscache_n_retrievals_object_dead), - fscache_do_cancel_retrieval); + __fscache_stat(&fscache_n_retrievals_object_dead)); if (ret < 0) goto error; @@ -632,7 +633,7 @@ nobufs_unlock_dec: wake_cookie = __fscache_unuse_cookie(cookie); nobufs_unlock: spin_unlock(&cookie->lock); - kfree(op); + fscache_put_retrieval(op); if (wake_cookie) __fscache_wake_unused_cookie(cookie); nobufs: @@ -700,8 +701,7 @@ int __fscache_alloc_page(struct fscache_cookie *cookie, ret = 
fscache_wait_for_operation_activation( object, &op->op, __fscache_stat(&fscache_n_alloc_op_waits), - __fscache_stat(&fscache_n_allocs_object_dead), - fscache_do_cancel_retrieval); + __fscache_stat(&fscache_n_allocs_object_dead)); if (ret < 0) goto error; @@ -726,7 +726,7 @@ nobufs_unlock_dec: wake_cookie = __fscache_unuse_cookie(cookie); nobufs_unlock: spin_unlock(&cookie->lock); - kfree(op); + fscache_put_retrieval(op); if (wake_cookie) __fscache_wake_unused_cookie(cookie); nobufs: @@ -944,7 +944,7 @@ int __fscache_write_page(struct fscache_cookie *cookie, if (!op) goto nomem; - fscache_operation_init(&op->op, fscache_write_op, + fscache_operation_init(&op->op, fscache_write_op, NULL, fscache_release_write_op); op->op.flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_WAITING) | @@ -1016,7 +1016,7 @@ already_pending: spin_unlock(&object->lock); spin_unlock(&cookie->lock); radix_tree_preload_end(); - kfree(op); + fscache_put_operation(&op->op); fscache_stat(&fscache_n_stores_ok); _leave(" = 0"); return 0; @@ -1036,7 +1036,7 @@ nobufs_unlock_obj: nobufs: spin_unlock(&cookie->lock); radix_tree_preload_end(); - kfree(op); + fscache_put_operation(&op->op); if (wake_cookie) __fscache_wake_unused_cookie(cookie); fscache_stat(&fscache_n_stores_nobufs); @@ -1044,7 +1044,7 @@ nobufs: return -ENOBUFS; nomem_free: - kfree(op); + fscache_put_operation(&op->op); nomem: fscache_stat(&fscache_n_stores_oom); _leave(" = -ENOMEM"); diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c index 40d13c70ef51..7cfa0aacdf6d 100644 --- a/fs/fscache/stats.c +++ b/fs/fscache/stats.c @@ -23,6 +23,7 @@ atomic_t fscache_n_op_run; atomic_t fscache_n_op_enqueue; atomic_t fscache_n_op_requeue; atomic_t fscache_n_op_deferred_release; +atomic_t fscache_n_op_initialised; atomic_t fscache_n_op_release; atomic_t fscache_n_op_gc; atomic_t fscache_n_op_cancelled; @@ -130,6 +131,11 @@ atomic_t fscache_n_cop_write_page; atomic_t fscache_n_cop_uncache_page; atomic_t fscache_n_cop_dissociate_pages; +atomic_t fscache_n_cache_no_space_reject; +atomic_t fscache_n_cache_stale_objects; +atomic_t fscache_n_cache_retired_objects; +atomic_t fscache_n_cache_culled_objects; + /* * display the general statistics */ @@ -246,7 +252,8 @@ static int fscache_stats_show(struct seq_file *m, void *v) atomic_read(&fscache_n_op_enqueue), atomic_read(&fscache_n_op_cancelled), atomic_read(&fscache_n_op_rejected)); - seq_printf(m, "Ops : dfr=%u rel=%u gc=%u\n", + seq_printf(m, "Ops : ini=%u dfr=%u rel=%u gc=%u\n", + atomic_read(&fscache_n_op_initialised), atomic_read(&fscache_n_op_deferred_release), atomic_read(&fscache_n_op_release), atomic_read(&fscache_n_op_gc)); @@ -271,6 +278,11 @@ static int fscache_stats_show(struct seq_file *m, void *v) atomic_read(&fscache_n_cop_write_page), atomic_read(&fscache_n_cop_uncache_page), atomic_read(&fscache_n_cop_dissociate_pages)); + seq_printf(m, "CacheEv: nsp=%d stl=%d rtr=%d cul=%d\n", + atomic_read(&fscache_n_cache_no_space_reject), + atomic_read(&fscache_n_cache_stale_objects), + atomic_read(&fscache_n_cache_retired_objects), + atomic_read(&fscache_n_cache_culled_objects)); return 0; } diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 205e0d5d5307..f863ac6647ac 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -244,7 +244,7 @@ int fuse_ctl_add_conn(struct fuse_conn *fc) return 0; parent = fuse_control_sb->s_root; - inc_nlink(parent->d_inode); + inc_nlink(d_inode(parent)); sprintf(name, "%u", fc->dev); parent = fuse_ctl_add_dentry(parent, fc, name, S_IFDIR | 0500, 2, &simple_dir_inode_operations, @@ -283,11 
+283,11 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc) for (i = fc->ctl_ndents - 1; i >= 0; i--) { struct dentry *dentry = fc->ctl_dentry[i]; - dentry->d_inode->i_private = NULL; + d_inode(dentry)->i_private = NULL; d_drop(dentry); dput(dentry); } - drop_nlink(fuse_control_sb->s_root->d_inode); + drop_nlink(d_inode(fuse_control_sb->s_root)); } static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent) diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 28d0c7abba1c..eae2c11268bc 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -38,7 +38,6 @@ #include <linux/device.h> #include <linux/file.h> #include <linux/fs.h> -#include <linux/aio.h> #include <linux/kdev_t.h> #include <linux/kthread.h> #include <linux/list.h> @@ -48,6 +47,7 @@ #include <linux/slab.h> #include <linux/stat.h> #include <linux/module.h> +#include <linux/uio.h> #include "fuse_i.h" @@ -88,32 +88,23 @@ static struct list_head *cuse_conntbl_head(dev_t devt) * FUSE file. */ -static ssize_t cuse_read(struct file *file, char __user *buf, size_t count, - loff_t *ppos) +static ssize_t cuse_read_iter(struct kiocb *kiocb, struct iov_iter *to) { + struct fuse_io_priv io = { .async = 0, .file = kiocb->ki_filp }; loff_t pos = 0; - struct iovec iov = { .iov_base = buf, .iov_len = count }; - struct fuse_io_priv io = { .async = 0, .file = file }; - struct iov_iter ii; - iov_iter_init(&ii, READ, &iov, 1, count); - return fuse_direct_io(&io, &ii, &pos, FUSE_DIO_CUSE); + return fuse_direct_io(&io, to, &pos, FUSE_DIO_CUSE); } -static ssize_t cuse_write(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) +static ssize_t cuse_write_iter(struct kiocb *kiocb, struct iov_iter *from) { + struct fuse_io_priv io = { .async = 0, .file = kiocb->ki_filp }; loff_t pos = 0; - struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count }; - struct fuse_io_priv io = { .async = 0, .file = file }; - struct iov_iter ii; - iov_iter_init(&ii, WRITE, &iov, 1, count); - /* * No locking or generic_write_checks(), the server is * responsible for locking and sanity checks. 
*/ - return fuse_direct_io(&io, &ii, &pos, + return fuse_direct_io(&io, from, &pos, FUSE_DIO_WRITE | FUSE_DIO_CUSE); } @@ -186,8 +177,8 @@ static long cuse_file_compat_ioctl(struct file *file, unsigned int cmd, static const struct file_operations cuse_frontend_fops = { .owner = THIS_MODULE, - .read = cuse_read, - .write = cuse_write, + .read_iter = cuse_read_iter, + .write_iter = cuse_write_iter, .open = cuse_open, .release = cuse_release, .unlocked_ioctl = cuse_file_ioctl, @@ -498,6 +489,7 @@ static void cuse_fc_release(struct fuse_conn *fc) */ static int cuse_channel_open(struct inode *inode, struct file *file) { + struct fuse_dev *fud; struct cuse_conn *cc; int rc; @@ -508,17 +500,22 @@ static int cuse_channel_open(struct inode *inode, struct file *file) fuse_conn_init(&cc->fc); + fud = fuse_dev_alloc(&cc->fc); + if (!fud) { + kfree(cc); + return -ENOMEM; + } + INIT_LIST_HEAD(&cc->list); cc->fc.release = cuse_fc_release; - cc->fc.connected = 1; cc->fc.initialized = 1; rc = cuse_send_init(cc); if (rc) { - fuse_conn_put(&cc->fc); + fuse_dev_free(fud); return rc; } - file->private_data = &cc->fc; /* channel owns base reference to cc */ + file->private_data = fud; return 0; } @@ -536,7 +533,8 @@ static int cuse_channel_open(struct inode *inode, struct file *file) */ static int cuse_channel_release(struct inode *inode, struct file *file) { - struct cuse_conn *cc = fc_to_cc(file->private_data); + struct fuse_dev *fud = file->private_data; + struct cuse_conn *cc = fc_to_cc(fud->fc); int rc; /* remove from the conntbl, no more access from this point on */ diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 39706c57ad3c..80cc1b35d460 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -19,20 +19,19 @@ #include <linux/pipe_fs_i.h> #include <linux/swap.h> #include <linux/splice.h> -#include <linux/aio.h> MODULE_ALIAS_MISCDEV(FUSE_MINOR); MODULE_ALIAS("devname:fuse"); static struct kmem_cache *fuse_req_cachep; -static struct fuse_conn *fuse_get_conn(struct file *file) +static struct fuse_dev *fuse_get_dev(struct file *file) { /* * Lockless access is OK, because file->private data is set * once during mount and is valid until the file is released. 
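The fuse hunks that follow convert per-request bools into FR_* bits in a single flags word; a minimal C11 model of the test-and-set idiom they rely on (bit numbering illustrative):

	#include <stdatomic.h>
	#include <stdbool.h>

	enum { FR_PENDING_M, FR_WAITING_M, FR_BACKGROUND_M, FR_FINISHED_M };

	/* one atomic RMW per transition; e.g. request_end() tears a request
	 * down only once because only the first caller sees FINISHED clear */
	static bool test_and_set_flag(atomic_ulong *flags, int bit)
	{
		return atomic_fetch_or(flags, 1UL << bit) & (1UL << bit);
	}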
*/ - return file->private_data; + return ACCESS_ONCE(file->private_data); } static void fuse_request_init(struct fuse_req *req, struct page **pages, @@ -49,6 +48,7 @@ static void fuse_request_init(struct fuse_req *req, struct page **pages, req->pages = pages; req->page_descs = page_descs; req->max_pages = npages; + __set_bit(FR_PENDING, &req->flags); } static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags) @@ -169,6 +169,10 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages, if (!fc->connected) goto out; + err = -ECONNREFUSED; + if (fc->conn_error) + goto out; + req = fuse_request_alloc(npages); err = -ENOMEM; if (!req) { @@ -178,8 +182,10 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages, } fuse_req_init_context(req); - req->waiting = 1; - req->background = for_background; + __set_bit(FR_WAITING, &req->flags); + if (for_background) + __set_bit(FR_BACKGROUND, &req->flags); + return req; out: @@ -269,15 +275,15 @@ struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc, req = get_reserved_req(fc, file); fuse_req_init_context(req); - req->waiting = 1; - req->background = 0; + __set_bit(FR_WAITING, &req->flags); + __clear_bit(FR_BACKGROUND, &req->flags); return req; } void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) { if (atomic_dec_and_test(&req->count)) { - if (unlikely(req->background)) { + if (test_bit(FR_BACKGROUND, &req->flags)) { /* * We get here in the unlikely case that a background * request was allocated but not sent @@ -288,8 +294,10 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) spin_unlock(&fc->lock); } - if (req->waiting) + if (test_bit(FR_WAITING, &req->flags)) { + __clear_bit(FR_WAITING, &req->flags); atomic_dec(&fc->num_waiting); + } if (req->stolen_file) put_reserved_req(fc, req); @@ -310,46 +318,38 @@ static unsigned len_args(unsigned numargs, struct fuse_arg *args) return nbytes; } -static u64 fuse_get_unique(struct fuse_conn *fc) +static u64 fuse_get_unique(struct fuse_iqueue *fiq) { - fc->reqctr++; - /* zero is special */ - if (fc->reqctr == 0) - fc->reqctr = 1; - - return fc->reqctr; + return ++fiq->reqctr; } -static void queue_request(struct fuse_conn *fc, struct fuse_req *req) +static void queue_request(struct fuse_iqueue *fiq, struct fuse_req *req) { req->in.h.len = sizeof(struct fuse_in_header) + len_args(req->in.numargs, (struct fuse_arg *) req->in.args); - list_add_tail(&req->list, &fc->pending); - req->state = FUSE_REQ_PENDING; - if (!req->waiting) { - req->waiting = 1; - atomic_inc(&fc->num_waiting); - } - wake_up(&fc->waitq); - kill_fasync(&fc->fasync, SIGIO, POLL_IN); + list_add_tail(&req->list, &fiq->pending); + wake_up_locked(&fiq->waitq); + kill_fasync(&fiq->fasync, SIGIO, POLL_IN); } void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, u64 nodeid, u64 nlookup) { + struct fuse_iqueue *fiq = &fc->iq; + forget->forget_one.nodeid = nodeid; forget->forget_one.nlookup = nlookup; - spin_lock(&fc->lock); - if (fc->connected) { - fc->forget_list_tail->next = forget; - fc->forget_list_tail = forget; - wake_up(&fc->waitq); - kill_fasync(&fc->fasync, SIGIO, POLL_IN); + spin_lock(&fiq->waitq.lock); + if (fiq->connected) { + fiq->forget_list_tail->next = forget; + fiq->forget_list_tail = forget; + wake_up_locked(&fiq->waitq); + kill_fasync(&fiq->fasync, SIGIO, POLL_IN); } else { kfree(forget); } - spin_unlock(&fc->lock); + spin_unlock(&fiq->waitq.lock); } static void flush_bg_queue(struct fuse_conn *fc) @@ -357,12 
+357,15 @@ static void flush_bg_queue(struct fuse_conn *fc) while (fc->active_background < fc->max_background && !list_empty(&fc->bg_queue)) { struct fuse_req *req; + struct fuse_iqueue *fiq = &fc->iq; req = list_entry(fc->bg_queue.next, struct fuse_req, list); list_del(&req->list); fc->active_background++; - req->in.h.unique = fuse_get_unique(fc); - queue_request(fc, req); + spin_lock(&fiq->waitq.lock); + req->in.h.unique = fuse_get_unique(fiq); + queue_request(fiq, req); + spin_unlock(&fiq->waitq.lock); } } @@ -373,20 +376,22 @@ static void flush_bg_queue(struct fuse_conn *fc) * was closed. The requester thread is woken up (if still waiting), * the 'end' callback is called if given, else the reference to the * request is released - * - * Called with fc->lock, unlocks it */ static void request_end(struct fuse_conn *fc, struct fuse_req *req) -__releases(fc->lock) { - void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; - req->end = NULL; - list_del(&req->list); - list_del(&req->intr_entry); - req->state = FUSE_REQ_FINISHED; - if (req->background) { - req->background = 0; + struct fuse_iqueue *fiq = &fc->iq; + if (test_and_set_bit(FR_FINISHED, &req->flags)) + return; + + spin_lock(&fiq->waitq.lock); + list_del_init(&req->intr_entry); + spin_unlock(&fiq->waitq.lock); + WARN_ON(test_bit(FR_PENDING, &req->flags)); + WARN_ON(test_bit(FR_SENT, &req->flags)); + if (test_bit(FR_BACKGROUND, &req->flags)) { + spin_lock(&fc->lock); + clear_bit(FR_BACKGROUND, &req->flags); if (fc->num_background == fc->max_background) fc->blocked = 0; @@ -402,122 +407,105 @@ __releases(fc->lock) fc->num_background--; fc->active_background--; flush_bg_queue(fc); + spin_unlock(&fc->lock); } - spin_unlock(&fc->lock); wake_up(&req->waitq); - if (end) - end(fc, req); + if (req->end) + req->end(fc, req); fuse_put_request(fc, req); } -static void wait_answer_interruptible(struct fuse_conn *fc, - struct fuse_req *req) -__releases(fc->lock) -__acquires(fc->lock) -{ - if (signal_pending(current)) - return; - - spin_unlock(&fc->lock); - wait_event_interruptible(req->waitq, req->state == FUSE_REQ_FINISHED); - spin_lock(&fc->lock); -} - -static void queue_interrupt(struct fuse_conn *fc, struct fuse_req *req) +static void queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) { - list_add_tail(&req->intr_entry, &fc->interrupts); - wake_up(&fc->waitq); - kill_fasync(&fc->fasync, SIGIO, POLL_IN); + spin_lock(&fiq->waitq.lock); + if (list_empty(&req->intr_entry)) { + list_add_tail(&req->intr_entry, &fiq->interrupts); + wake_up_locked(&fiq->waitq); + } + spin_unlock(&fiq->waitq.lock); + kill_fasync(&fiq->fasync, SIGIO, POLL_IN); } static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) -__releases(fc->lock) -__acquires(fc->lock) { + struct fuse_iqueue *fiq = &fc->iq; + int err; + if (!fc->no_interrupt) { /* Any signal may interrupt this */ - wait_answer_interruptible(fc, req); - - if (req->aborted) - goto aborted; - if (req->state == FUSE_REQ_FINISHED) + err = wait_event_interruptible(req->waitq, + test_bit(FR_FINISHED, &req->flags)); + if (!err) return; - req->interrupted = 1; - if (req->state == FUSE_REQ_SENT) - queue_interrupt(fc, req); + set_bit(FR_INTERRUPTED, &req->flags); + /* matches barrier in fuse_dev_do_read() */ + smp_mb__after_atomic(); + if (test_bit(FR_SENT, &req->flags)) + queue_interrupt(fiq, req); } - if (!req->force) { + if (!test_bit(FR_FORCE, &req->flags)) { sigset_t oldset; /* Only fatal signals may interrupt this */ block_sigs(&oldset); - wait_answer_interruptible(fc, req); 
+ err = wait_event_interruptible(req->waitq, + test_bit(FR_FINISHED, &req->flags)); restore_sigs(&oldset); - if (req->aborted) - goto aborted; - if (req->state == FUSE_REQ_FINISHED) + if (!err) return; + spin_lock(&fiq->waitq.lock); /* Request is not yet in userspace, bail out */ - if (req->state == FUSE_REQ_PENDING) { + if (test_bit(FR_PENDING, &req->flags)) { list_del(&req->list); + spin_unlock(&fiq->waitq.lock); __fuse_put_request(req); req->out.h.error = -EINTR; return; } + spin_unlock(&fiq->waitq.lock); } /* * Either request is already in userspace, or it was forced. * Wait it out. */ - spin_unlock(&fc->lock); - wait_event(req->waitq, req->state == FUSE_REQ_FINISHED); - spin_lock(&fc->lock); - - if (!req->aborted) - return; - - aborted: - BUG_ON(req->state != FUSE_REQ_FINISHED); - if (req->locked) { - /* This is uninterruptible sleep, because data is - being copied to/from the buffers of req. During - locked state, there mustn't be any filesystem - operation (e.g. page fault), since that could lead - to deadlock */ - spin_unlock(&fc->lock); - wait_event(req->waitq, !req->locked); - spin_lock(&fc->lock); - } + wait_event(req->waitq, test_bit(FR_FINISHED, &req->flags)); } static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) { - BUG_ON(req->background); - spin_lock(&fc->lock); - if (!fc->connected) + struct fuse_iqueue *fiq = &fc->iq; + + BUG_ON(test_bit(FR_BACKGROUND, &req->flags)); + spin_lock(&fiq->waitq.lock); + if (!fiq->connected) { + spin_unlock(&fiq->waitq.lock); req->out.h.error = -ENOTCONN; - else if (fc->conn_error) - req->out.h.error = -ECONNREFUSED; - else { - req->in.h.unique = fuse_get_unique(fc); - queue_request(fc, req); + } else { + req->in.h.unique = fuse_get_unique(fiq); + queue_request(fiq, req); /* acquire extra reference, since request is still needed after request_end() */ __fuse_get_request(req); + spin_unlock(&fiq->waitq.lock); request_wait_answer(fc, req); + /* Pairs with smp_wmb() in request_end() */ + smp_rmb(); } - spin_unlock(&fc->lock); } void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) { - req->isreply = 1; + __set_bit(FR_ISREPLY, &req->flags); + if (!test_bit(FR_WAITING, &req->flags)) { + __set_bit(FR_WAITING, &req->flags); + atomic_inc(&fc->num_waiting); + } __fuse_request_send(fc, req); } EXPORT_SYMBOL_GPL(fuse_request_send); @@ -587,10 +575,20 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) return ret; } -static void fuse_request_send_nowait_locked(struct fuse_conn *fc, - struct fuse_req *req) +/* + * Called under fc->lock + * + * fc->connected must have been checked previously + */ +void fuse_request_send_background_locked(struct fuse_conn *fc, + struct fuse_req *req) { - BUG_ON(!req->background); + BUG_ON(!test_bit(FR_BACKGROUND, &req->flags)); + if (!test_bit(FR_WAITING, &req->flags)) { + __set_bit(FR_WAITING, &req->flags); + atomic_inc(&fc->num_waiting); + } + __set_bit(FR_ISREPLY, &req->flags); fc->num_background++; if (fc->num_background == fc->max_background) fc->blocked = 1; @@ -603,54 +601,40 @@ static void fuse_request_send_nowait_locked(struct fuse_conn *fc, flush_bg_queue(fc); } -static void fuse_request_send_nowait(struct fuse_conn *fc, struct fuse_req *req) +void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req) { + BUG_ON(!req->end); spin_lock(&fc->lock); if (fc->connected) { - fuse_request_send_nowait_locked(fc, req); + fuse_request_send_background_locked(fc, req); spin_unlock(&fc->lock); } else { + spin_unlock(&fc->lock); req->out.h.error = 
-ENOTCONN; - request_end(fc, req); + req->end(fc, req); + fuse_put_request(fc, req); } } - -void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req) -{ - req->isreply = 1; - fuse_request_send_nowait(fc, req); -} EXPORT_SYMBOL_GPL(fuse_request_send_background); static int fuse_request_send_notify_reply(struct fuse_conn *fc, struct fuse_req *req, u64 unique) { int err = -ENODEV; + struct fuse_iqueue *fiq = &fc->iq; - req->isreply = 0; + __clear_bit(FR_ISREPLY, &req->flags); req->in.h.unique = unique; - spin_lock(&fc->lock); - if (fc->connected) { - queue_request(fc, req); + spin_lock(&fiq->waitq.lock); + if (fiq->connected) { + queue_request(fiq, req); err = 0; } - spin_unlock(&fc->lock); + spin_unlock(&fiq->waitq.lock); return err; } -/* - * Called under fc->lock - * - * fc->connected must have been checked previously - */ -void fuse_request_send_background_locked(struct fuse_conn *fc, - struct fuse_req *req) -{ - req->isreply = 1; - fuse_request_send_nowait_locked(fc, req); -} - void fuse_force_forget(struct file *file, u64 nodeid) { struct inode *inode = file_inode(file); @@ -666,7 +650,7 @@ void fuse_force_forget(struct file *file, u64 nodeid) req->in.numargs = 1; req->in.args[0].size = sizeof(inarg); req->in.args[0].value = &inarg; - req->isreply = 0; + __clear_bit(FR_ISREPLY, &req->flags); __fuse_request_send(fc, req); /* ignore errors */ fuse_put_request(fc, req); @@ -677,62 +661,58 @@ void fuse_force_forget(struct file *file, u64 nodeid) * anything that could cause a page-fault. If the request was already * aborted bail out. */ -static int lock_request(struct fuse_conn *fc, struct fuse_req *req) +static int lock_request(struct fuse_req *req) { int err = 0; if (req) { - spin_lock(&fc->lock); - if (req->aborted) + spin_lock(&req->waitq.lock); + if (test_bit(FR_ABORTED, &req->flags)) err = -ENOENT; else - req->locked = 1; - spin_unlock(&fc->lock); + set_bit(FR_LOCKED, &req->flags); + spin_unlock(&req->waitq.lock); } return err; } /* - * Unlock request. If it was aborted during being locked, the - * requester thread is currently waiting for it to be unlocked, so - * wake it up. + * Unlock request. If it was aborted while locked, caller is responsible + * for unlocking and ending the request. 
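That comment is the heart of the new abort protocol: the copy path brackets every user-memory access with lock_request()/unlock_request(), and an abort may only claim requests that are not FR_LOCKED; a locked request is instead ended by its copier, which learns of the abort from unlock_request()'s return value. A hypothetical pthread reduction of the protocol (struct req, the helper names, and the -2 stand-in for -ENOENT are all invented):

#include <pthread.h>
#include <stdbool.h>

struct req {
	pthread_mutex_t lock;	/* stands in for req->waitq.lock */
	bool aborted;		/* FR_ABORTED */
	bool locked;		/* FR_LOCKED  */
};

/* Copy path: refuse to start copying once the request was aborted. */
static int lock_req(struct req *r)
{
	int err = 0;

	pthread_mutex_lock(&r->lock);
	if (r->aborted)
		err = -2;		/* -ENOENT in the kernel */
	else
		r->locked = true;
	pthread_mutex_unlock(&r->lock);
	return err;
}

/* Copy path: on -2 the caller must now end the request itself,
 * because the aborter deliberately skipped it (see below). */
static int unlock_req(struct req *r)
{
	int err = 0;

	pthread_mutex_lock(&r->lock);
	if (r->aborted)
		err = -2;
	else
		r->locked = false;
	pthread_mutex_unlock(&r->lock);
	return err;
}

/* Abort path: claim only unlocked requests; locked ones are left for
 * the copier, which will see aborted == true when it unlocks. */
static bool claim_for_abort(struct req *r)
{
	bool claimed;

	pthread_mutex_lock(&r->lock);
	r->aborted = true;
	claimed = !r->locked;
	pthread_mutex_unlock(&r->lock);
	return claimed;
}

int main(void)
{
	struct req r = { PTHREAD_MUTEX_INITIALIZER, false, false };

	(void)lock_req(&r);		/* copy in flight */
	(void)claim_for_abort(&r);	/* aborter must back off */
	return unlock_req(&r) == -2 ? 0 : 1;	/* copier ends it */
}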
*/ -static void unlock_request(struct fuse_conn *fc, struct fuse_req *req) +static int unlock_request(struct fuse_req *req) { + int err = 0; if (req) { - spin_lock(&fc->lock); - req->locked = 0; - if (req->aborted) - wake_up(&req->waitq); - spin_unlock(&fc->lock); + spin_lock(&req->waitq.lock); + if (test_bit(FR_ABORTED, &req->flags)) + err = -ENOENT; + else + clear_bit(FR_LOCKED, &req->flags); + spin_unlock(&req->waitq.lock); } + return err; } struct fuse_copy_state { - struct fuse_conn *fc; int write; struct fuse_req *req; - const struct iovec *iov; + struct iov_iter *iter; struct pipe_buffer *pipebufs; struct pipe_buffer *currbuf; struct pipe_inode_info *pipe; unsigned long nr_segs; - unsigned long seglen; - unsigned long addr; struct page *pg; unsigned len; unsigned offset; unsigned move_pages:1; }; -static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc, - int write, - const struct iovec *iov, unsigned long nr_segs) +static void fuse_copy_init(struct fuse_copy_state *cs, int write, + struct iov_iter *iter) { memset(cs, 0, sizeof(*cs)); - cs->fc = fc; cs->write = write; - cs->iov = iov; - cs->nr_segs = nr_segs; + cs->iter = iter; } /* Unmap and put previous page of userspace buffer */ @@ -763,7 +743,10 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) struct page *page; int err; - unlock_request(cs->fc, cs->req); + err = unlock_request(cs->req); + if (err) + return err; + fuse_copy_finish(cs); if (cs->pipebufs) { struct pipe_buffer *buf = cs->pipebufs; @@ -800,25 +783,19 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) cs->nr_segs++; } } else { - if (!cs->seglen) { - BUG_ON(!cs->nr_segs); - cs->seglen = cs->iov[0].iov_len; - cs->addr = (unsigned long) cs->iov[0].iov_base; - cs->iov++; - cs->nr_segs--; - } - err = get_user_pages_fast(cs->addr, 1, cs->write, &page); + size_t off; + err = iov_iter_get_pages(cs->iter, &page, PAGE_SIZE, 1, &off); if (err < 0) return err; - BUG_ON(err != 1); + BUG_ON(!err); + cs->len = err; + cs->offset = off; cs->pg = page; - cs->offset = cs->addr % PAGE_SIZE; - cs->len = min(PAGE_SIZE - cs->offset, cs->seglen); - cs->seglen -= cs->len; - cs->addr += cs->len; + cs->offset = off; + iov_iter_advance(cs->iter, err); } - return lock_request(cs->fc, cs->req); + return lock_request(cs->req); } /* Do as much copy to/from userspace buffer as we can */ @@ -869,7 +846,10 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) struct page *newpage; struct pipe_buffer *buf = cs->pipebufs; - unlock_request(cs->fc, cs->req); + err = unlock_request(cs->req); + if (err) + return err; + fuse_copy_finish(cs); err = buf->ops->confirm(cs->pipe, buf); @@ -923,12 +903,12 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) lru_cache_add_file(newpage); err = 0; - spin_lock(&cs->fc->lock); - if (cs->req->aborted) + spin_lock(&cs->req->waitq.lock); + if (test_bit(FR_ABORTED, &cs->req->flags)) err = -ENOENT; else *pagep = newpage; - spin_unlock(&cs->fc->lock); + spin_unlock(&cs->req->waitq.lock); if (err) { unlock_page(newpage); @@ -948,7 +928,7 @@ out_fallback: cs->pg = buf->page; cs->offset = buf->offset; - err = lock_request(cs->fc, cs->req); + err = lock_request(cs->req); if (err) return err; @@ -959,11 +939,15 @@ static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page, unsigned offset, unsigned count) { struct pipe_buffer *buf; + int err; if (cs->nr_segs == cs->pipe->buffers) return -EIO; - unlock_request(cs->fc, cs->req); + err = unlock_request(cs->req); + if (err) + 
return err; + fuse_copy_finish(cs); buf = cs->pipebufs; @@ -1074,36 +1058,15 @@ static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs, return err; } -static int forget_pending(struct fuse_conn *fc) +static int forget_pending(struct fuse_iqueue *fiq) { - return fc->forget_list_head.next != NULL; + return fiq->forget_list_head.next != NULL; } -static int request_pending(struct fuse_conn *fc) +static int request_pending(struct fuse_iqueue *fiq) { - return !list_empty(&fc->pending) || !list_empty(&fc->interrupts) || - forget_pending(fc); -} - -/* Wait until a request is available on the pending list */ -static void request_wait(struct fuse_conn *fc) -__releases(fc->lock) -__acquires(fc->lock) -{ - DECLARE_WAITQUEUE(wait, current); - - add_wait_queue_exclusive(&fc->waitq, &wait); - while (fc->connected && !request_pending(fc)) { - set_current_state(TASK_INTERRUPTIBLE); - if (signal_pending(current)) - break; - - spin_unlock(&fc->lock); - schedule(); - spin_lock(&fc->lock); - } - set_current_state(TASK_RUNNING); - remove_wait_queue(&fc->waitq, &wait); + return !list_empty(&fiq->pending) || !list_empty(&fiq->interrupts) || + forget_pending(fiq); } /* @@ -1112,11 +1075,12 @@ __acquires(fc->lock) * Unlike other requests this is assembled on demand, without a need * to allocate a separate fuse_req structure. * - * Called with fc->lock held, releases it + * Called with fiq->waitq.lock held, releases it */ -static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_copy_state *cs, +static int fuse_read_interrupt(struct fuse_iqueue *fiq, + struct fuse_copy_state *cs, size_t nbytes, struct fuse_req *req) -__releases(fc->lock) +__releases(fiq->waitq.lock) { struct fuse_in_header ih; struct fuse_interrupt_in arg; @@ -1124,7 +1088,7 @@ __releases(fc->lock) int err; list_del_init(&req->intr_entry); - req->intr_unique = fuse_get_unique(fc); + req->intr_unique = fuse_get_unique(fiq); memset(&ih, 0, sizeof(ih)); memset(&arg, 0, sizeof(arg)); ih.len = reqsize; @@ -1132,7 +1096,7 @@ __releases(fc->lock) ih.unique = req->intr_unique; arg.unique = req->in.h.unique; - spin_unlock(&fc->lock); + spin_unlock(&fiq->waitq.lock); if (nbytes < reqsize) return -EINVAL; @@ -1144,21 +1108,21 @@ __releases(fc->lock) return err ? 
err : reqsize; } -static struct fuse_forget_link *dequeue_forget(struct fuse_conn *fc, +static struct fuse_forget_link *dequeue_forget(struct fuse_iqueue *fiq, unsigned max, unsigned *countp) { - struct fuse_forget_link *head = fc->forget_list_head.next; + struct fuse_forget_link *head = fiq->forget_list_head.next; struct fuse_forget_link **newhead = &head; unsigned count; for (count = 0; *newhead != NULL && count < max; count++) newhead = &(*newhead)->next; - fc->forget_list_head.next = *newhead; + fiq->forget_list_head.next = *newhead; *newhead = NULL; - if (fc->forget_list_head.next == NULL) - fc->forget_list_tail = &fc->forget_list_head; + if (fiq->forget_list_head.next == NULL) + fiq->forget_list_tail = &fiq->forget_list_head; if (countp != NULL) *countp = count; @@ -1166,24 +1130,24 @@ static struct fuse_forget_link *dequeue_forget(struct fuse_conn *fc, return head; } -static int fuse_read_single_forget(struct fuse_conn *fc, +static int fuse_read_single_forget(struct fuse_iqueue *fiq, struct fuse_copy_state *cs, size_t nbytes) -__releases(fc->lock) +__releases(fiq->waitq.lock) { int err; - struct fuse_forget_link *forget = dequeue_forget(fc, 1, NULL); + struct fuse_forget_link *forget = dequeue_forget(fiq, 1, NULL); struct fuse_forget_in arg = { .nlookup = forget->forget_one.nlookup, }; struct fuse_in_header ih = { .opcode = FUSE_FORGET, .nodeid = forget->forget_one.nodeid, - .unique = fuse_get_unique(fc), + .unique = fuse_get_unique(fiq), .len = sizeof(ih) + sizeof(arg), }; - spin_unlock(&fc->lock); + spin_unlock(&fiq->waitq.lock); kfree(forget); if (nbytes < ih.len) return -EINVAL; @@ -1199,9 +1163,9 @@ __releases(fc->lock) return ih.len; } -static int fuse_read_batch_forget(struct fuse_conn *fc, +static int fuse_read_batch_forget(struct fuse_iqueue *fiq, struct fuse_copy_state *cs, size_t nbytes) -__releases(fc->lock) +__releases(fiq->waitq.lock) { int err; unsigned max_forgets; @@ -1210,18 +1174,18 @@ __releases(fc->lock) struct fuse_batch_forget_in arg = { .count = 0 }; struct fuse_in_header ih = { .opcode = FUSE_BATCH_FORGET, - .unique = fuse_get_unique(fc), + .unique = fuse_get_unique(fiq), .len = sizeof(ih) + sizeof(arg), }; if (nbytes < ih.len) { - spin_unlock(&fc->lock); + spin_unlock(&fiq->waitq.lock); return -EINVAL; } max_forgets = (nbytes - ih.len) / sizeof(struct fuse_forget_one); - head = dequeue_forget(fc, max_forgets, &count); - spin_unlock(&fc->lock); + head = dequeue_forget(fiq, max_forgets, &count); + spin_unlock(&fiq->waitq.lock); arg.count = count; ih.len += count * sizeof(struct fuse_forget_one); @@ -1248,14 +1212,15 @@ __releases(fc->lock) return ih.len; } -static int fuse_read_forget(struct fuse_conn *fc, struct fuse_copy_state *cs, +static int fuse_read_forget(struct fuse_conn *fc, struct fuse_iqueue *fiq, + struct fuse_copy_state *cs, size_t nbytes) -__releases(fc->lock) +__releases(fiq->waitq.lock) { - if (fc->minor < 16 || fc->forget_list_head.next->next == NULL) - return fuse_read_single_forget(fc, cs, nbytes); + if (fc->minor < 16 || fiq->forget_list_head.next->next == NULL) + return fuse_read_single_forget(fiq, cs, nbytes); else - return fuse_read_batch_forget(fc, cs, nbytes); + return fuse_read_batch_forget(fiq, cs, nbytes); } /* @@ -1267,46 +1232,51 @@ __releases(fc->lock) * request_end(). Otherwise add it to the processing list, and set * the 'sent' flag. 
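dequeue_forget() above detaches up to max entries from the forget queue, a singly linked list that keeps a tail pointer for O(1) appends and must reset it once drained. Since the pointer-to-pointer walk is easy to get wrong, here is a self-contained sketch of the same logic with invented userspace types; the kernel caller holds fiq->waitq.lock, so no locking appears here:

#include <stddef.h>

struct forget_link {
	struct forget_link *next;
};

struct forget_queue {
	struct forget_link head;	/* dummy; head.next is the first entry */
	struct forget_link **tail;	/* points at the last ->next slot */
};

static struct forget_link *dequeue(struct forget_queue *q,
				   unsigned int max, unsigned int *countp)
{
	struct forget_link *batch = q->head.next;
	struct forget_link **newhead = &batch;
	unsigned int count;

	/* Walk at most max links; *newhead ends up at the first survivor. */
	for (count = 0; *newhead != NULL && count < max; count++)
		newhead = &(*newhead)->next;

	q->head.next = *newhead;	/* remainder stays queued */
	*newhead = NULL;		/* terminate the detached batch */
	if (q->head.next == NULL)
		q->tail = &q->head.next;	/* drained: reset tail */
	if (countp != NULL)
		*countp = count;
	return batch;
}

int main(void)
{
	struct forget_link c = { NULL }, b = { &c }, a = { &b };
	struct forget_queue q = { { &a }, &c.next };
	unsigned int n;
	struct forget_link *batch = dequeue(&q, 2, &n);	/* detaches a, b */

	return (batch == &a && n == 2 && q.head.next == &c) ? 0 : 1;
}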
*/ -static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file, +static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, struct fuse_copy_state *cs, size_t nbytes) { - int err; + ssize_t err; + struct fuse_conn *fc = fud->fc; + struct fuse_iqueue *fiq = &fc->iq; + struct fuse_pqueue *fpq = &fud->pq; struct fuse_req *req; struct fuse_in *in; unsigned reqsize; restart: - spin_lock(&fc->lock); + spin_lock(&fiq->waitq.lock); err = -EAGAIN; - if ((file->f_flags & O_NONBLOCK) && fc->connected && - !request_pending(fc)) + if ((file->f_flags & O_NONBLOCK) && fiq->connected && + !request_pending(fiq)) goto err_unlock; - request_wait(fc); - err = -ENODEV; - if (!fc->connected) + err = wait_event_interruptible_exclusive_locked(fiq->waitq, + !fiq->connected || request_pending(fiq)); + if (err) goto err_unlock; - err = -ERESTARTSYS; - if (!request_pending(fc)) + + err = -ENODEV; + if (!fiq->connected) goto err_unlock; - if (!list_empty(&fc->interrupts)) { - req = list_entry(fc->interrupts.next, struct fuse_req, + if (!list_empty(&fiq->interrupts)) { + req = list_entry(fiq->interrupts.next, struct fuse_req, intr_entry); - return fuse_read_interrupt(fc, cs, nbytes, req); + return fuse_read_interrupt(fiq, cs, nbytes, req); } - if (forget_pending(fc)) { - if (list_empty(&fc->pending) || fc->forget_batch-- > 0) - return fuse_read_forget(fc, cs, nbytes); + if (forget_pending(fiq)) { + if (list_empty(&fiq->pending) || fiq->forget_batch-- > 0) + return fuse_read_forget(fc, fiq, cs, nbytes); - if (fc->forget_batch <= -8) - fc->forget_batch = 16; + if (fiq->forget_batch <= -8) + fiq->forget_batch = 16; } - req = list_entry(fc->pending.next, struct fuse_req, list); - req->state = FUSE_REQ_READING; - list_move(&req->list, &fc->io); + req = list_entry(fiq->pending.next, struct fuse_req, list); + clear_bit(FR_PENDING, &req->flags); + list_del_init(&req->list); + spin_unlock(&fiq->waitq.lock); in = &req->in; reqsize = in->h.len; @@ -1319,37 +1289,48 @@ static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file, request_end(fc, req); goto restart; } - spin_unlock(&fc->lock); + spin_lock(&fpq->lock); + list_add(&req->list, &fpq->io); + spin_unlock(&fpq->lock); cs->req = req; err = fuse_copy_one(cs, &in->h, sizeof(in->h)); if (!err) err = fuse_copy_args(cs, in->numargs, in->argpages, (struct fuse_arg *) in->args, 0); fuse_copy_finish(cs); - spin_lock(&fc->lock); - req->locked = 0; - if (req->aborted) { - request_end(fc, req); - return -ENODEV; + spin_lock(&fpq->lock); + clear_bit(FR_LOCKED, &req->flags); + if (!fpq->connected) { + err = -ENODEV; + goto out_end; } if (err) { req->out.h.error = -EIO; - request_end(fc, req); - return err; + goto out_end; } - if (!req->isreply) - request_end(fc, req); - else { - req->state = FUSE_REQ_SENT; - list_move_tail(&req->list, &fc->processing); - if (req->interrupted) - queue_interrupt(fc, req); - spin_unlock(&fc->lock); + if (!test_bit(FR_ISREPLY, &req->flags)) { + err = reqsize; + goto out_end; } + list_move_tail(&req->list, &fpq->processing); + spin_unlock(&fpq->lock); + set_bit(FR_SENT, &req->flags); + /* matches barrier in request_wait_answer() */ + smp_mb__after_atomic(); + if (test_bit(FR_INTERRUPTED, &req->flags)) + queue_interrupt(fiq, req); + return reqsize; +out_end: + if (!test_bit(FR_PRIVATE, &req->flags)) + list_del_init(&req->list); + spin_unlock(&fpq->lock); + request_end(fc, req); + return err; + err_unlock: - spin_unlock(&fc->lock); + spin_unlock(&fiq->waitq.lock); return err; } @@ -1364,18 +1345,21 @@ static int 
fuse_dev_open(struct inode *inode, struct file *file) return 0; } -static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +static ssize_t fuse_dev_read(struct kiocb *iocb, struct iov_iter *to) { struct fuse_copy_state cs; struct file *file = iocb->ki_filp; - struct fuse_conn *fc = fuse_get_conn(file); - if (!fc) + struct fuse_dev *fud = fuse_get_dev(file); + + if (!fud) return -EPERM; - fuse_copy_init(&cs, fc, 1, iov, nr_segs); + if (!iter_is_iovec(to)) + return -EINVAL; + + fuse_copy_init(&cs, 1, to); - return fuse_dev_do_read(fc, file, &cs, iov_length(iov, nr_segs)); + return fuse_dev_do_read(fud, file, &cs, iov_iter_count(to)); } static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, @@ -1387,18 +1371,19 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, int do_wakeup = 0; struct pipe_buffer *bufs; struct fuse_copy_state cs; - struct fuse_conn *fc = fuse_get_conn(in); - if (!fc) + struct fuse_dev *fud = fuse_get_dev(in); + + if (!fud) return -EPERM; bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL); if (!bufs) return -ENOMEM; - fuse_copy_init(&cs, fc, 1, NULL, 0); + fuse_copy_init(&cs, 1, NULL); cs.pipebufs = bufs; cs.pipe = pipe; - ret = fuse_dev_do_read(fc, in, &cs, len); + ret = fuse_dev_do_read(fud, in, &cs, len); if (ret < 0) goto out; @@ -1837,11 +1822,11 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, } /* Look up request on processing list by unique ID */ -static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique) +static struct fuse_req *request_find(struct fuse_pqueue *fpq, u64 unique) { struct fuse_req *req; - list_for_each_entry(req, &fc->processing, list) { + list_for_each_entry(req, &fpq->processing, list) { if (req->in.h.unique == unique || req->intr_unique == unique) return req; } @@ -1878,10 +1863,12 @@ static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out, * it from the list and copy the rest of the buffer to the request. * The request is finished by calling request_end() */ -static ssize_t fuse_dev_do_write(struct fuse_conn *fc, +static ssize_t fuse_dev_do_write(struct fuse_dev *fud, struct fuse_copy_state *cs, size_t nbytes) { int err; + struct fuse_conn *fc = fud->fc; + struct fuse_pqueue *fpq = &fud->pq; struct fuse_req *req; struct fuse_out_header oh; @@ -1909,79 +1896,79 @@ static ssize_t fuse_dev_do_write(struct fuse_conn *fc, if (oh.error <= -1000 || oh.error > 0) goto err_finish; - spin_lock(&fc->lock); + spin_lock(&fpq->lock); err = -ENOENT; - if (!fc->connected) - goto err_unlock; + if (!fpq->connected) + goto err_unlock_pq; - req = request_find(fc, oh.unique); + req = request_find(fpq, oh.unique); if (!req) - goto err_unlock; + goto err_unlock_pq; - if (req->aborted) { - spin_unlock(&fc->lock); - fuse_copy_finish(cs); - spin_lock(&fc->lock); - request_end(fc, req); - return -ENOENT; - } /* Is it an interrupt reply? 
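Up to this point fuse_dev_do_write() has validated the reply header and located the request by its unique id, where request_find() (above) also matches the id assigned to the request's INTERRUPT. A compact userspace sketch of those two steps; the struct and function names are invented, and -1 stands in for the kernel's -EINVAL:

#include <stddef.h>
#include <stdint.h>

struct out_header {
	uint32_t len;		/* must equal the number of bytes written */
	int32_t  error;		/* 0 or a negative errno in -1..-999 */
	uint64_t unique;
};

struct req {
	uint64_t unique;
	uint64_t intr_unique;
	struct req *next;
};

/* Mirror of request_find(): match either the request's own id or the
 * id assigned to its INTERRUPT. */
static struct req *find_req(struct req *processing, uint64_t unique)
{
	for (struct req *r = processing; r != NULL; r = r->next)
		if (r->unique == unique || r->intr_unique == unique)
			return r;
	return NULL;
}

static int check_header(const struct out_header *oh, size_t nbytes)
{
	if (nbytes < sizeof(*oh) || oh->len != nbytes)
		return -1;		/* -EINVAL in the kernel */
	if (oh->error <= -1000 || oh->error > 0)
		return -1;		/* errno out of range */
	return 0;
}

int main(void)
{
	struct req r2 = { 2, 0, NULL };
	struct req r1 = { 1, 0, &r2 };
	struct out_header oh = { (uint32_t)sizeof(oh), 0, 2 };

	if (check_header(&oh, sizeof(oh)) != 0)
		return 1;
	return find_req(&r1, oh.unique) == &r2 ? 0 : 1;
}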
*/ if (req->intr_unique == oh.unique) { + spin_unlock(&fpq->lock); + err = -EINVAL; if (nbytes != sizeof(struct fuse_out_header)) - goto err_unlock; + goto err_finish; if (oh.error == -ENOSYS) fc->no_interrupt = 1; else if (oh.error == -EAGAIN) - queue_interrupt(fc, req); + queue_interrupt(&fc->iq, req); - spin_unlock(&fc->lock); fuse_copy_finish(cs); return nbytes; } - req->state = FUSE_REQ_WRITING; - list_move(&req->list, &fc->io); + clear_bit(FR_SENT, &req->flags); + list_move(&req->list, &fpq->io); req->out.h = oh; - req->locked = 1; + set_bit(FR_LOCKED, &req->flags); + spin_unlock(&fpq->lock); cs->req = req; if (!req->out.page_replace) cs->move_pages = 0; - spin_unlock(&fc->lock); err = copy_out_args(cs, &req->out, nbytes); fuse_copy_finish(cs); - spin_lock(&fc->lock); - req->locked = 0; - if (!err) { - if (req->aborted) - err = -ENOENT; - } else if (!req->aborted) + spin_lock(&fpq->lock); + clear_bit(FR_LOCKED, &req->flags); + if (!fpq->connected) + err = -ENOENT; + else if (err) req->out.h.error = -EIO; + if (!test_bit(FR_PRIVATE, &req->flags)) + list_del_init(&req->list); + spin_unlock(&fpq->lock); + request_end(fc, req); return err ? err : nbytes; - err_unlock: - spin_unlock(&fc->lock); + err_unlock_pq: + spin_unlock(&fpq->lock); err_finish: fuse_copy_finish(cs); return err; } -static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +static ssize_t fuse_dev_write(struct kiocb *iocb, struct iov_iter *from) { struct fuse_copy_state cs; - struct fuse_conn *fc = fuse_get_conn(iocb->ki_filp); - if (!fc) + struct fuse_dev *fud = fuse_get_dev(iocb->ki_filp); + + if (!fud) return -EPERM; - fuse_copy_init(&cs, fc, 0, iov, nr_segs); + if (!iter_is_iovec(from)) + return -EINVAL; - return fuse_dev_do_write(fc, &cs, iov_length(iov, nr_segs)); + fuse_copy_init(&cs, 0, from); + + return fuse_dev_do_write(fud, &cs, iov_iter_count(from)); } static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, @@ -1992,12 +1979,12 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, unsigned idx; struct pipe_buffer *bufs; struct fuse_copy_state cs; - struct fuse_conn *fc; + struct fuse_dev *fud; size_t rem; ssize_t ret; - fc = fuse_get_conn(out); - if (!fc) + fud = fuse_get_dev(out); + if (!fud) return -EPERM; bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL); @@ -2044,14 +2031,15 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, } pipe_unlock(pipe); - fuse_copy_init(&cs, fc, 0, NULL, nbuf); + fuse_copy_init(&cs, 0, NULL); cs.pipebufs = bufs; + cs.nr_segs = nbuf; cs.pipe = pipe; if (flags & SPLICE_F_MOVE) cs.move_pages = 1; - ret = fuse_dev_do_write(fc, &cs, len); + ret = fuse_dev_do_write(fud, &cs, len); for (idx = 0; idx < nbuf; idx++) { struct pipe_buffer *buf = &bufs[idx]; @@ -2065,18 +2053,21 @@ out: static unsigned fuse_dev_poll(struct file *file, poll_table *wait) { unsigned mask = POLLOUT | POLLWRNORM; - struct fuse_conn *fc = fuse_get_conn(file); - if (!fc) + struct fuse_iqueue *fiq; + struct fuse_dev *fud = fuse_get_dev(file); + + if (!fud) return POLLERR; - poll_wait(file, &fc->waitq, wait); + fiq = &fud->fc->iq; + poll_wait(file, &fiq->waitq, wait); - spin_lock(&fc->lock); - if (!fc->connected) + spin_lock(&fiq->waitq.lock); + if (!fiq->connected) mask = POLLERR; - else if (request_pending(fc)) + else if (request_pending(fiq)) mask |= POLLIN | POLLRDNORM; - spin_unlock(&fc->lock); + spin_unlock(&fiq->waitq.lock); return mask; } @@ -2087,67 +2078,18 @@ static unsigned 
fuse_dev_poll(struct file *file, poll_table *wait) * This function releases and reacquires fc->lock */ static void end_requests(struct fuse_conn *fc, struct list_head *head) -__releases(fc->lock) -__acquires(fc->lock) { while (!list_empty(head)) { struct fuse_req *req; req = list_entry(head->next, struct fuse_req, list); req->out.h.error = -ECONNABORTED; - request_end(fc, req); - spin_lock(&fc->lock); - } -} - -/* - * Abort requests under I/O - * - * The requests are set to aborted and finished, and the request - * waiter is woken up. This will make request_wait_answer() wait - * until the request is unlocked and then return. - * - * If the request is asynchronous, then the end function needs to be - * called after waiting for the request to be unlocked (if it was - * locked). - */ -static void end_io_requests(struct fuse_conn *fc) -__releases(fc->lock) -__acquires(fc->lock) -{ - while (!list_empty(&fc->io)) { - struct fuse_req *req = - list_entry(fc->io.next, struct fuse_req, list); - void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; - - req->aborted = 1; - req->out.h.error = -ECONNABORTED; - req->state = FUSE_REQ_FINISHED; + clear_bit(FR_PENDING, &req->flags); + clear_bit(FR_SENT, &req->flags); list_del_init(&req->list); - wake_up(&req->waitq); - if (end) { - req->end = NULL; - __fuse_get_request(req); - spin_unlock(&fc->lock); - wait_event(req->waitq, !req->locked); - end(fc, req); - fuse_put_request(fc, req); - spin_lock(&fc->lock); - } + request_end(fc, req); } } -static void end_queued_requests(struct fuse_conn *fc) -__releases(fc->lock) -__acquires(fc->lock) -{ - fc->max_background = UINT_MAX; - flush_bg_queue(fc); - end_requests(fc, &fc->pending); - end_requests(fc, &fc->processing); - while (forget_pending(fc)) - kfree(dequeue_forget(fc, 1, NULL)); -} - static void end_polls(struct fuse_conn *fc) { struct rb_node *p; @@ -2166,82 +2108,171 @@ static void end_polls(struct fuse_conn *fc) /* * Abort all requests. * - * Emergency exit in case of a malicious or accidental deadlock, or - * just a hung filesystem. + * Emergency exit in case of a malicious or accidental deadlock, or just a hung + * filesystem. * - * The same effect is usually achievable through killing the - * filesystem daemon and all users of the filesystem. The exception - * is the combination of an asynchronous request and the tricky - * deadlock (see Documentation/filesystems/fuse.txt). + * The same effect is usually achievable through killing the filesystem daemon + * and all users of the filesystem. The exception is the combination of an + * asynchronous request and the tricky deadlock (see + * Documentation/filesystems/fuse.txt). * - * During the aborting, progression of requests from the pending and - * processing lists onto the io list, and progression of new requests - * onto the pending list is prevented by req->connected being false. - * - * Progression of requests under I/O to the processing list is - * prevented by the req->aborted flag being true for these requests. - * For this reason requests on the io list must be aborted first. + * Aborting requests under I/O goes as follows: 1: Separate out unlocked + * requests, they should be finished off immediately. Locked requests will be + * finished after unlock; see unlock_request(). 2: Finish off the unlocked + * requests. It is possible that some request will finish before we can. This + * is OK, the request will in that case be removed from the list before we touch + * it. 
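The rewritten fuse_abort_conn() below implements that recipe as splice-then-process: while holding the locks it only moves requests onto private to_end lists, and every request_end() call happens afterwards with no spinlock held, so end callbacks may sleep. A hypothetical userspace reduction of the shape (all types and names invented):

#include <pthread.h>
#include <stddef.h>
#include <stdio.h>

struct req {
	struct req *next;
	int error;
};

struct queue {
	pthread_mutex_t lock;
	struct req *head;
};

/* Phase 1: under the lock, steal every queued request onto a private
 * list so no reader can pick one up after the connection dies. */
static struct req *steal_all(struct queue *q)
{
	struct req *to_end;

	pthread_mutex_lock(&q->lock);
	to_end = q->head;
	q->head = NULL;
	pthread_mutex_unlock(&q->lock);
	return to_end;
}

/* Phase 2: end the stolen requests with no lock held, so the end
 * callbacks are free to sleep or take other locks. */
static void end_all(struct req *to_end, void (*end)(struct req *))
{
	while (to_end != NULL) {
		struct req *r = to_end;

		to_end = r->next;
		r->error = -103;	/* -ECONNABORTED */
		end(r);
	}
}

static void end_cb(struct req *r)
{
	printf("ended with %d\n", r->error);
}

int main(void)
{
	struct req b = { NULL, 0 }, a = { &b, 0 };
	struct queue q = { PTHREAD_MUTEX_INITIALIZER, &a };

	end_all(steal_all(&q), end_cb);
	return q.head != NULL;
}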
*/ void fuse_abort_conn(struct fuse_conn *fc) { + struct fuse_iqueue *fiq = &fc->iq; + spin_lock(&fc->lock); if (fc->connected) { + struct fuse_dev *fud; + struct fuse_req *req, *next; + LIST_HEAD(to_end1); + LIST_HEAD(to_end2); + fc->connected = 0; fc->blocked = 0; fuse_set_initialized(fc); - end_io_requests(fc); - end_queued_requests(fc); + list_for_each_entry(fud, &fc->devices, entry) { + struct fuse_pqueue *fpq = &fud->pq; + + spin_lock(&fpq->lock); + fpq->connected = 0; + list_for_each_entry_safe(req, next, &fpq->io, list) { + req->out.h.error = -ECONNABORTED; + spin_lock(&req->waitq.lock); + set_bit(FR_ABORTED, &req->flags); + if (!test_bit(FR_LOCKED, &req->flags)) { + set_bit(FR_PRIVATE, &req->flags); + list_move(&req->list, &to_end1); + } + spin_unlock(&req->waitq.lock); + } + list_splice_init(&fpq->processing, &to_end2); + spin_unlock(&fpq->lock); + } + fc->max_background = UINT_MAX; + flush_bg_queue(fc); + + spin_lock(&fiq->waitq.lock); + fiq->connected = 0; + list_splice_init(&fiq->pending, &to_end2); + while (forget_pending(fiq)) + kfree(dequeue_forget(fiq, 1, NULL)); + wake_up_all_locked(&fiq->waitq); + spin_unlock(&fiq->waitq.lock); + kill_fasync(&fiq->fasync, SIGIO, POLL_IN); end_polls(fc); - wake_up_all(&fc->waitq); wake_up_all(&fc->blocked_waitq); - kill_fasync(&fc->fasync, SIGIO, POLL_IN); + spin_unlock(&fc->lock); + + while (!list_empty(&to_end1)) { + req = list_first_entry(&to_end1, struct fuse_req, list); + __fuse_get_request(req); + list_del_init(&req->list); + request_end(fc, req); + } + end_requests(fc, &to_end2); + } else { + spin_unlock(&fc->lock); } - spin_unlock(&fc->lock); } EXPORT_SYMBOL_GPL(fuse_abort_conn); int fuse_dev_release(struct inode *inode, struct file *file) { - struct fuse_conn *fc = fuse_get_conn(file); - if (fc) { - spin_lock(&fc->lock); - fc->connected = 0; - fc->blocked = 0; - fuse_set_initialized(fc); - end_queued_requests(fc); - end_polls(fc); - wake_up_all(&fc->blocked_waitq); - spin_unlock(&fc->lock); - fuse_conn_put(fc); - } + struct fuse_dev *fud = fuse_get_dev(file); + + if (fud) { + struct fuse_conn *fc = fud->fc; + struct fuse_pqueue *fpq = &fud->pq; + WARN_ON(!list_empty(&fpq->io)); + end_requests(fc, &fpq->processing); + /* Are we the last open device? 
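fuse_dev_release() answers that question with a reference count: each cloned device file holds one reference on fc->dev_count, and only the thread that drops the last reference aborts the connection, via atomic_dec_and_test() in the lines that follow. A userspace model of the last-one-out pattern in C11 atomics (names invented):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct conn {
	atomic_int dev_count;
};

/* Mirrors atomic_dec_and_test(): returns true only for the caller
 * that drops the final reference. */
static bool dev_release(struct conn *fc)
{
	if (atomic_fetch_sub(&fc->dev_count, 1) == 1) {
		puts("last device closed: abort connection");
		return true;
	}
	return false;
}

int main(void)
{
	struct conn fc = { 2 };			/* master fd plus one clone */

	dev_release(&fc);			/* clone closes: stays up */
	return dev_release(&fc) ? 0 : 1;	/* master closes: abort */
}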
*/ + if (atomic_dec_and_test(&fc->dev_count)) { + WARN_ON(fc->iq.fasync != NULL); + fuse_abort_conn(fc); + } + fuse_dev_free(fud); + } return 0; } EXPORT_SYMBOL_GPL(fuse_dev_release); static int fuse_dev_fasync(int fd, struct file *file, int on) { - struct fuse_conn *fc = fuse_get_conn(file); - if (!fc) + struct fuse_dev *fud = fuse_get_dev(file); + + if (!fud) return -EPERM; /* No locking - fasync_helper does its own locking */ - return fasync_helper(fd, file, on, &fc->fasync); + return fasync_helper(fd, file, on, &fud->fc->iq.fasync); +} + +static int fuse_device_clone(struct fuse_conn *fc, struct file *new) +{ + struct fuse_dev *fud; + + if (new->private_data) + return -EINVAL; + + fud = fuse_dev_alloc(fc); + if (!fud) + return -ENOMEM; + + new->private_data = fud; + atomic_inc(&fc->dev_count); + + return 0; +} + +static long fuse_dev_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + int err = -ENOTTY; + + if (cmd == FUSE_DEV_IOC_CLONE) { + int oldfd; + + err = -EFAULT; + if (!get_user(oldfd, (__u32 __user *) arg)) { + struct file *old = fget(oldfd); + + err = -EINVAL; + if (old) { + struct fuse_dev *fud = fuse_get_dev(old); + + if (fud) { + mutex_lock(&fuse_mutex); + err = fuse_device_clone(fud->fc, file); + mutex_unlock(&fuse_mutex); + } + fput(old); + } + } + } + return err; } const struct file_operations fuse_dev_operations = { .owner = THIS_MODULE, .open = fuse_dev_open, .llseek = no_llseek, - .read = do_sync_read, - .aio_read = fuse_dev_read, + .read_iter = fuse_dev_read, .splice_read = fuse_dev_splice_read, - .write = do_sync_write, - .aio_write = fuse_dev_write, + .write_iter = fuse_dev_write, .splice_write = fuse_dev_splice_write, .poll = fuse_dev_poll, .release = fuse_dev_release, .fasync = fuse_dev_fasync, + .unlocked_ioctl = fuse_dev_ioctl, + .compat_ioctl = fuse_dev_ioctl, }; EXPORT_SYMBOL_GPL(fuse_dev_operations); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 1545b711ddcf..5e2e08712d3b 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -192,7 +192,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) struct fuse_inode *fi; int ret; - inode = ACCESS_ONCE(entry->d_inode); + inode = d_inode_rcu(entry); if (inode && is_bad_inode(inode)) goto invalid; else if (time_before64(fuse_dentry_time(entry), get_jiffies_64()) || @@ -220,7 +220,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) attr_version = fuse_get_attr_version(fc); parent = dget_parent(entry); - fuse_lookup_init(fc, &args, get_node_id(parent->d_inode), + fuse_lookup_init(fc, &args, get_node_id(d_inode(parent)), &entry->d_name, &outarg); ret = fuse_simple_request(fc, &args); dput(parent); @@ -254,7 +254,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) return -ECHILD; } else if (test_and_clear_bit(FUSE_I_INIT_RDPLUS, &fi->state)) { parent = dget_parent(entry); - fuse_advise_use_readdirplus(parent->d_inode); + fuse_advise_use_readdirplus(d_inode(parent)); dput(parent); } } @@ -487,7 +487,7 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry, entry = res; } - if (!(flags & O_CREAT) || entry->d_inode) + if (!(flags & O_CREAT) || d_really_is_positive(entry)) goto no_open; /* Only creates */ @@ -653,7 +653,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) args.in.args[0].value = entry->d_name.name; err = fuse_simple_request(fc, &args); if (!err) { - struct inode *inode = entry->d_inode; + struct inode *inode = d_inode(entry); struct fuse_inode *fi = get_fuse_inode(inode); 
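Returning to the clone ioctl added above: FUSE_DEV_IOC_CLONE is what lets a multithreaded filesystem daemon give each worker thread its own /dev/fuse descriptor and, with it, a private fuse_pqueue, so reply processing no longer serializes on a single lock. A plausible userspace caller; note the ioctl constant comes from the uapi half of this series, which is outside this fs/-only diff, so its availability in <linux/fuse.h> is assumed:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fuse.h>		/* assumed to provide FUSE_DEV_IOC_CLONE */

/* Open a fresh /dev/fuse fd and attach it to an existing session;
 * reads/writes on the returned fd then use a private processing queue. */
static int clone_fuse_fd(int session_fd)
{
	uint32_t oldfd = (uint32_t)session_fd;	/* kernel get_user()s a __u32 */
	int fd = open("/dev/fuse", O_RDWR | O_CLOEXEC);

	if (fd < 0)
		return -1;
	if (ioctl(fd, FUSE_DEV_IOC_CLONE, &oldfd) == -1) {
		perror("FUSE_DEV_IOC_CLONE");
		close(fd);
		return -1;
	}
	return fd;
}

int main(int argc, char **argv)
{
	if (argc < 2) {
		fprintf(stderr, "usage: %s <fd-of-mounted-session>\n", argv[0]);
		return 2;
	}
	return clone_fuse_fd(atoi(argv[1])) < 0;
}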
spin_lock(&fc->lock); @@ -689,7 +689,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) args.in.args[0].value = entry->d_name.name; err = fuse_simple_request(fc, &args); if (!err) { - clear_nlink(entry->d_inode); + clear_nlink(d_inode(entry)); fuse_invalidate_attr(dir); fuse_invalidate_entry_cache(entry); } else if (err == -EINTR) @@ -721,12 +721,12 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, err = fuse_simple_request(fc, &args); if (!err) { /* ctime changes */ - fuse_invalidate_attr(oldent->d_inode); - fuse_update_ctime(oldent->d_inode); + fuse_invalidate_attr(d_inode(oldent)); + fuse_update_ctime(d_inode(oldent)); if (flags & RENAME_EXCHANGE) { - fuse_invalidate_attr(newent->d_inode); - fuse_update_ctime(newent->d_inode); + fuse_invalidate_attr(d_inode(newent)); + fuse_update_ctime(d_inode(newent)); } fuse_invalidate_attr(olddir); @@ -734,10 +734,10 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, fuse_invalidate_attr(newdir); /* newent will end up negative */ - if (!(flags & RENAME_EXCHANGE) && newent->d_inode) { - fuse_invalidate_attr(newent->d_inode); + if (!(flags & RENAME_EXCHANGE) && d_really_is_positive(newent)) { + fuse_invalidate_attr(d_inode(newent)); fuse_invalidate_entry_cache(newent); - fuse_update_ctime(newent->d_inode); + fuse_update_ctime(d_inode(newent)); } } else if (err == -EINTR) { /* If request was interrupted, DEITY only knows if the @@ -746,7 +746,7 @@ static int fuse_rename_common(struct inode *olddir, struct dentry *oldent, directory), then there can be inconsistency between the dcache and the real filesystem. Tough luck. */ fuse_invalidate_entry(oldent); - if (newent->d_inode) + if (d_really_is_positive(newent)) fuse_invalidate_entry(newent); } @@ -788,7 +788,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, { int err; struct fuse_link_in inarg; - struct inode *inode = entry->d_inode; + struct inode *inode = d_inode(entry); struct fuse_conn *fc = get_fuse_conn(inode); FUSE_ARGS(args); @@ -961,9 +961,9 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, fuse_invalidate_attr(parent); fuse_invalidate_entry(entry); - if (child_nodeid != 0 && entry->d_inode) { - mutex_lock(&entry->d_inode->i_mutex); - if (get_node_id(entry->d_inode) != child_nodeid) { + if (child_nodeid != 0 && d_really_is_positive(entry)) { + mutex_lock(&d_inode(entry)->i_mutex); + if (get_node_id(d_inode(entry)) != child_nodeid) { err = -ENOENT; goto badentry; } @@ -977,13 +977,13 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, err = -ENOTEMPTY; goto badentry; } - entry->d_inode->i_flags |= S_DEAD; + d_inode(entry)->i_flags |= S_DEAD; } dont_mount(entry); - clear_nlink(entry->d_inode); + clear_nlink(d_inode(entry)); err = 0; badentry: - mutex_unlock(&entry->d_inode->i_mutex); + mutex_unlock(&d_inode(entry)->i_mutex); if (!err) d_delete(entry); } else { @@ -1169,7 +1169,7 @@ static int fuse_direntplus_link(struct file *file, struct qstr name = QSTR_INIT(dirent->name, dirent->namelen); struct dentry *dentry; struct dentry *alias; - struct inode *dir = parent->d_inode; + struct inode *dir = d_inode(parent); struct fuse_conn *fc; struct inode *inode; @@ -1205,7 +1205,7 @@ static int fuse_direntplus_link(struct file *file, name.hash = full_name_hash(name.name, name.len); dentry = d_lookup(parent, &name); if (dentry) { - inode = dentry->d_inode; + inode = d_inode(dentry); if (!inode) { d_drop(dentry); } else if (get_node_id(inode) != o->nodeid || @@ -1365,9 
+1365,9 @@ static int fuse_readdir(struct file *file, struct dir_context *ctx) return err; } -static char *read_link(struct dentry *dentry) +static const char *fuse_follow_link(struct dentry *dentry, void **cookie) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct fuse_conn *fc = get_fuse_conn(inode); FUSE_ARGS(args); char *link; @@ -1389,28 +1389,12 @@ static char *read_link(struct dentry *dentry) link = ERR_PTR(ret); } else { link[ret] = '\0'; + *cookie = link; } fuse_invalidate_atime(inode); return link; } -static void free_link(char *link) -{ - if (!IS_ERR(link)) - free_page((unsigned long) link); -} - -static void *fuse_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - nd_set_link(nd, read_link(dentry)); - return NULL; -} - -static void fuse_put_link(struct dentry *dentry, struct nameidata *nd, void *c) -{ - free_link(nd_get_link(nd)); -} - static int fuse_dir_open(struct inode *inode, struct file *file) { return fuse_open_common(inode, file, true); @@ -1712,7 +1696,7 @@ error: static int fuse_setattr(struct dentry *entry, struct iattr *attr) { - struct inode *inode = entry->d_inode; + struct inode *inode = d_inode(entry); if (!fuse_allow_current_process(get_fuse_conn(inode))) return -EACCES; @@ -1726,7 +1710,7 @@ static int fuse_setattr(struct dentry *entry, struct iattr *attr) static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry, struct kstat *stat) { - struct inode *inode = entry->d_inode; + struct inode *inode = d_inode(entry); struct fuse_conn *fc = get_fuse_conn(inode); if (!fuse_allow_current_process(fc)) @@ -1738,7 +1722,7 @@ static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry, static int fuse_setxattr(struct dentry *entry, const char *name, const void *value, size_t size, int flags) { - struct inode *inode = entry->d_inode; + struct inode *inode = d_inode(entry); struct fuse_conn *fc = get_fuse_conn(inode); FUSE_ARGS(args); struct fuse_setxattr_in inarg; @@ -1774,7 +1758,7 @@ static int fuse_setxattr(struct dentry *entry, const char *name, static ssize_t fuse_getxattr(struct dentry *entry, const char *name, void *value, size_t size) { - struct inode *inode = entry->d_inode; + struct inode *inode = d_inode(entry); struct fuse_conn *fc = get_fuse_conn(inode); FUSE_ARGS(args); struct fuse_getxattr_in inarg; @@ -1815,7 +1799,7 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name, static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) { - struct inode *inode = entry->d_inode; + struct inode *inode = d_inode(entry); struct fuse_conn *fc = get_fuse_conn(inode); FUSE_ARGS(args); struct fuse_getxattr_in inarg; @@ -1857,7 +1841,7 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) static int fuse_removexattr(struct dentry *entry, const char *name) { - struct inode *inode = entry->d_inode; + struct inode *inode = d_inode(entry); struct fuse_conn *fc = get_fuse_conn(inode); FUSE_ARGS(args); int err; @@ -1926,7 +1910,7 @@ static const struct inode_operations fuse_common_inode_operations = { static const struct inode_operations fuse_symlink_inode_operations = { .setattr = fuse_setattr, .follow_link = fuse_follow_link, - .put_link = fuse_put_link, + .put_link = free_page_put_link, .readlink = generic_readlink, .getattr = fuse_getattr, .setxattr = fuse_setxattr, diff --git a/fs/fuse/file.c b/fs/fuse/file.c index c01ec3bdcfd8..f523f2f04c19 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -15,8 +15,8 @@ #include <linux/module.h> #include 
<linux/compat.h> #include <linux/swap.h> -#include <linux/aio.h> #include <linux/falloc.h> +#include <linux/uio.h> static const struct file_operations fuse_direct_io_file_operations; @@ -96,17 +96,17 @@ static void fuse_file_put(struct fuse_file *ff, bool sync) * Drop the release request when client does not * implement 'open' */ - req->background = 0; + __clear_bit(FR_BACKGROUND, &req->flags); iput(req->misc.release.inode); fuse_put_request(ff->fc, req); } else if (sync) { - req->background = 0; + __clear_bit(FR_BACKGROUND, &req->flags); fuse_request_send(ff->fc, req); iput(req->misc.release.inode); fuse_put_request(ff->fc, req); } else { req->end = fuse_release_end; - req->background = 1; + __set_bit(FR_BACKGROUND, &req->flags); fuse_request_send_background(ff->fc, req); } kfree(ff); @@ -299,8 +299,8 @@ void fuse_sync_release(struct fuse_file *ff, int flags) { WARN_ON(atomic_read(&ff->count) > 1); fuse_prepare_release(ff, flags, FUSE_RELEASE); - ff->reserved_req->force = 1; - ff->reserved_req->background = 0; + __set_bit(FR_FORCE, &ff->reserved_req->flags); + __clear_bit(FR_BACKGROUND, &ff->reserved_req->flags); fuse_request_send(ff->fc, ff->reserved_req); fuse_put_request(ff->fc, ff->reserved_req); kfree(ff); @@ -426,7 +426,7 @@ static int fuse_flush(struct file *file, fl_owner_t id) req->in.numargs = 1; req->in.args[0].size = sizeof(inarg); req->in.args[0].value = &inarg; - req->force = 1; + __set_bit(FR_FORCE, &req->flags); fuse_request_send(fc, req); err = req->out.h.error; fuse_put_request(fc, req); @@ -528,6 +528,17 @@ static void fuse_release_user_pages(struct fuse_req *req, int write) } } +static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io) +{ + if (io->err) + return io->err; + + if (io->bytes >= 0 && io->write) + return -EIO; + + return io->bytes < 0 ? io->size : io->bytes; +} + /** * In case of short read, the caller sets 'pos' to the position of * actual end of fuse request in IO request. Otherwise, if bytes_requested @@ -546,6 +557,7 @@ static void fuse_release_user_pages(struct fuse_req *req, int write) */ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos) { + bool is_sync = is_sync_kiocb(io->iocb); int left; spin_lock(&io->lock); @@ -555,30 +567,24 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos) io->bytes = pos; left = --io->reqs; + if (!left && is_sync) + complete(io->done); spin_unlock(&io->lock); - if (!left) { - long res; + if (!left && !is_sync) { + ssize_t res = fuse_get_res_by_io(io); - if (io->err) - res = io->err; - else if (io->bytes >= 0 && io->write) - res = -EIO; - else { - res = io->bytes < 0 ? 
io->size : io->bytes; - - if (!is_sync_kiocb(io->iocb)) { - struct inode *inode = file_inode(io->iocb->ki_filp); - struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_inode *fi = get_fuse_inode(inode); + if (res >= 0) { + struct inode *inode = file_inode(io->iocb->ki_filp); + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); - spin_lock(&fc->lock); - fi->attr_version = ++fc->attr_version; - spin_unlock(&fc->lock); - } + spin_lock(&fc->lock); + fi->attr_version = ++fc->attr_version; + spin_unlock(&fc->lock); } - aio_complete(io->iocb, res, 0); + io->iocb->ki_complete(io->iocb, res, 0); kfree(io); } } @@ -1139,13 +1145,11 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; - size_t count = iov_iter_count(from); ssize_t written = 0; ssize_t written_buffered = 0; struct inode *inode = mapping->host; ssize_t err; loff_t endbyte = 0; - loff_t pos = iocb->ki_pos; if (get_fuse_conn(inode)->writeback_cache) { /* Update size (EOF optimization) and mode (SUID clearing) */ @@ -1161,15 +1165,11 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) /* We can write back this queue in page reclaim */ current->backing_dev_info = inode_to_bdi(inode); - err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); - if (err) + err = generic_write_checks(iocb, from); + if (err <= 0) goto out; - if (count == 0) - goto out; - - iov_iter_truncate(from, count); - err = file_remove_suid(file); + err = file_remove_privs(file); if (err) goto out; @@ -1177,7 +1177,8 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (err) goto out; - if (file->f_flags & O_DIRECT) { + if (iocb->ki_flags & IOCB_DIRECT) { + loff_t pos = iocb->ki_pos; written = generic_file_direct_write(iocb, from, pos); if (written < 0 || !iov_iter_count(from)) goto out; @@ -1203,9 +1204,9 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) written += written_buffered; iocb->ki_pos = pos + written_buffered; } else { - written = fuse_perform_write(file, mapping, from, pos); + written = fuse_perform_write(file, mapping, from, iocb->ki_pos); if (written >= 0) - iocb->ki_pos = pos + written; + iocb->ki_pos += written; } out: current->backing_dev_info = NULL; @@ -1395,55 +1396,30 @@ static ssize_t __fuse_direct_read(struct fuse_io_priv *io, return res; } -static ssize_t fuse_direct_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) -{ - struct fuse_io_priv io = { .async = 0, .file = file }; - struct iovec iov = { .iov_base = buf, .iov_len = count }; - struct iov_iter ii; - iov_iter_init(&ii, READ, &iov, 1, count); - return __fuse_direct_read(&io, &ii, ppos); -} - -static ssize_t __fuse_direct_write(struct fuse_io_priv *io, - struct iov_iter *iter, - loff_t *ppos) +static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to) { - struct file *file = io->file; - struct inode *inode = file_inode(file); - size_t count = iov_iter_count(iter); - ssize_t res; - - - res = generic_write_checks(file, ppos, &count, 0); - if (!res) { - iov_iter_truncate(iter, count); - res = fuse_direct_io(io, iter, ppos, FUSE_DIO_WRITE); - } - - fuse_invalidate_attr(inode); - - return res; + struct fuse_io_priv io = { .async = 0, .file = iocb->ki_filp }; + return __fuse_direct_read(&io, to, &iocb->ki_pos); } -static ssize_t fuse_direct_write(struct file *file, const char __user *buf, - 
size_t count, loff_t *ppos) +static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) { - struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count }; + struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); - ssize_t res; struct fuse_io_priv io = { .async = 0, .file = file }; - struct iov_iter ii; - iov_iter_init(&ii, WRITE, &iov, 1, count); + ssize_t res; if (is_bad_inode(inode)) return -EIO; /* Don't allow parallel writes to the same file */ mutex_lock(&inode->i_mutex); - res = __fuse_direct_write(&io, &ii, ppos); + res = generic_write_checks(iocb, from); if (res > 0) - fuse_write_update_size(inode, *ppos); + res = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE); + fuse_invalidate_attr(inode); + if (res > 0) + fuse_write_update_size(inode, iocb->ki_pos); mutex_unlock(&inode->i_mutex); return res; @@ -1469,9 +1445,9 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) list_del(&req->writepages_entry); for (i = 0; i < req->num_pages; i++) { - dec_bdi_stat(bdi, BDI_WRITEBACK); + dec_wb_stat(&bdi->wb, WB_WRITEBACK); dec_zone_page_state(req->pages[i], NR_WRITEBACK_TEMP); - bdi_writeout_inc(bdi); + wb_writeout_inc(&bdi->wb); } wake_up(&fi->page_waitq); } @@ -1635,7 +1611,8 @@ static int fuse_writepage_locked(struct page *page) if (!req) goto err; - req->background = 1; /* writeback always goes to bg_queue */ + /* writeback always goes to bg_queue */ + __set_bit(FR_BACKGROUND, &req->flags); tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); if (!tmp_page) goto err_free; @@ -1658,7 +1635,7 @@ static int fuse_writepage_locked(struct page *page) req->end = fuse_writepage_end; req->inode = inode; - inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK); + inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); spin_lock(&fc->lock); @@ -1766,16 +1743,15 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req, } } - if (old_req->num_pages == 1 && (old_req->state == FUSE_REQ_INIT || - old_req->state == FUSE_REQ_PENDING)) { + if (old_req->num_pages == 1 && test_bit(FR_PENDING, &old_req->flags)) { struct backing_dev_info *bdi = inode_to_bdi(page->mapping->host); copy_highpage(old_req->pages[0], page); spin_unlock(&fc->lock); - dec_bdi_stat(bdi, BDI_WRITEBACK); + dec_wb_stat(&bdi->wb, WB_WRITEBACK); dec_zone_page_state(page, NR_WRITEBACK_TEMP); - bdi_writeout_inc(bdi); + wb_writeout_inc(&bdi->wb); fuse_writepage_free(fc, new_req); fuse_request_free(new_req); goto out; @@ -1854,7 +1830,7 @@ static int fuse_writepages_fill(struct page *page, req->misc.write.in.write_flags |= FUSE_WRITE_CACHE; req->misc.write.next = NULL; req->in.argpages = 1; - req->background = 1; + __set_bit(FR_BACKGROUND, &req->flags); req->num_pages = 0; req->end = fuse_writepage_end; req->inode = inode; @@ -1872,7 +1848,7 @@ static int fuse_writepages_fill(struct page *page, req->page_descs[req->num_pages].offset = 0; req->page_descs[req->num_pages].length = PAGE_SIZE; - inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK); + inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); err = 0; @@ -2798,9 +2774,9 @@ static inline loff_t fuse_round_up(loff_t off) } static ssize_t -fuse_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, - loff_t offset) +fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) { + DECLARE_COMPLETION_ONSTACK(wait); ssize_t ret = 0; struct file *file = iocb->ki_filp; struct fuse_file *ff = 
file->private_data; @@ -2815,15 +2791,15 @@ fuse_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, inode = file->f_mapping->host; i_size = i_size_read(inode); - if ((rw == READ) && (offset > i_size)) + if ((iov_iter_rw(iter) == READ) && (offset > i_size)) return 0; /* optimization for short read */ - if (async_dio && rw != WRITE && offset + count > i_size) { + if (async_dio && iov_iter_rw(iter) != WRITE && offset + count > i_size) { if (offset >= i_size) return 0; - count = min_t(loff_t, count, fuse_round_up(i_size - offset)); - iov_iter_truncate(iter, count); + iov_iter_truncate(iter, fuse_round_up(i_size - offset)); + count = iov_iter_count(iter); } io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL); @@ -2834,7 +2810,7 @@ fuse_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, io->bytes = -1; io->size = 0; io->offset = offset; - io->write = (rw == WRITE); + io->write = (iov_iter_rw(iter) == WRITE); io->err = 0; io->file = file; /* @@ -2849,13 +2825,19 @@ fuse_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, * to wait on real async I/O requests, so we must submit this request * synchronously. */ - if (!is_sync_kiocb(iocb) && (offset + count > i_size) && rw == WRITE) + if (!is_sync_kiocb(iocb) && (offset + count > i_size) && + iov_iter_rw(iter) == WRITE) io->async = false; - if (rw == WRITE) - ret = __fuse_direct_write(io, iter, &pos); - else + if (io->async && is_sync_kiocb(iocb)) + io->done = &wait; + + if (iov_iter_rw(iter) == WRITE) { + ret = fuse_direct_io(io, iter, &pos, FUSE_DIO_WRITE); + fuse_invalidate_attr(inode); + } else { ret = __fuse_direct_read(io, iter, &pos); + } if (io->async) { fuse_aio_complete(io, ret < 0 ? ret : 0, -1); @@ -2864,12 +2846,13 @@ fuse_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, if (!is_sync_kiocb(iocb)) return -EIOCBQUEUED; - ret = wait_on_sync_kiocb(iocb); - } else { - kfree(io); + wait_for_completion(&wait); + ret = fuse_get_res_by_io(io); } - if (rw == WRITE) { + kfree(io); + + if (iov_iter_rw(iter) == WRITE) { if (ret > 0) fuse_write_update_size(inode, pos); else if (ret < 0 && offset + count > i_size) @@ -2957,9 +2940,7 @@ out: static const struct file_operations fuse_file_operations = { .llseek = fuse_file_llseek, - .read = new_sync_read, .read_iter = fuse_file_read_iter, - .write = new_sync_write, .write_iter = fuse_file_write_iter, .mmap = fuse_file_mmap, .open = fuse_open, @@ -2977,8 +2958,8 @@ static const struct file_operations fuse_file_operations = { static const struct file_operations fuse_direct_io_file_operations = { .llseek = fuse_file_llseek, - .read = fuse_direct_read, - .write = fuse_direct_write, + .read_iter = fuse_direct_read_iter, + .write_iter = fuse_direct_write_iter, .mmap = fuse_direct_mmap, .open = fuse_open, .flush = fuse_flush, diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 1cdfb07c1376..405113101db8 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -241,16 +241,6 @@ struct fuse_args { #define FUSE_ARGS(args) struct fuse_args args = {} -/** The request state */ -enum fuse_req_state { - FUSE_REQ_INIT = 0, - FUSE_REQ_PENDING, - FUSE_REQ_READING, - FUSE_REQ_SENT, - FUSE_REQ_WRITING, - FUSE_REQ_FINISHED -}; - /** The request IO state (for asynchronous processing) */ struct fuse_io_priv { int async; @@ -263,10 +253,44 @@ struct fuse_io_priv { int err; struct kiocb *iocb; struct file *file; + struct completion *done; +}; + +/** + * Request flags + * + * FR_ISREPLY: set if the request has reply + * FR_FORCE: force sending of the request even if interrupted + * 
FR_BACKGROUND: request is sent in the background
+ * FR_WAITING: request is counted as "waiting"
+ * FR_ABORTED: the request was aborted
+ * FR_INTERRUPTED: the request has been interrupted
+ * FR_LOCKED: data is being copied to/from the request
+ * FR_PENDING: request is not yet in userspace
+ * FR_SENT: request is in userspace, waiting for an answer
+ * FR_FINISHED: request is finished
+ * FR_PRIVATE: request is on private list
+ */
+enum fuse_req_flag {
+	FR_ISREPLY,
+	FR_FORCE,
+	FR_BACKGROUND,
+	FR_WAITING,
+	FR_ABORTED,
+	FR_INTERRUPTED,
+	FR_LOCKED,
+	FR_PENDING,
+	FR_SENT,
+	FR_FINISHED,
+	FR_PRIVATE,
+};
 
 /**
  * A request to the client
+ *
+ * .waitq.lock protects the following fields:
+ *   - FR_ABORTED
+ *   - FR_LOCKED (may also be modified under fc->lock, tested under both)
  */
 struct fuse_req {
 	/** This can be on either pending processing or io lists in
@@ -282,35 +306,8 @@ struct fuse_req {
 	/** Unique ID for the interrupt request */
 	u64 intr_unique;
 
-	/*
-	 * The following bitfields are either set once before the
-	 * request is queued or setting/clearing them is protected by
-	 * fuse_conn->lock
-	 */
-
-	/** True if the request has reply */
-	unsigned isreply:1;
-
-	/** Force sending of the request even if interrupted */
-	unsigned force:1;
-
-	/** The request was aborted */
-	unsigned aborted:1;
-
-	/** Request is sent in the background */
-	unsigned background:1;
-
-	/** The request has been interrupted */
-	unsigned interrupted:1;
-
-	/** Data is being copied to/from the request */
-	unsigned locked:1;
-
-	/** Request is counted as "waiting" */
-	unsigned waiting:1;
-
-	/** State of the request */
-	enum fuse_req_state state;
+	/* Request flags, updated with test/set/clear_bit() */
+	unsigned long flags;
 
 	/** The request input */
 	struct fuse_in in;
@@ -379,6 +376,61 @@ struct fuse_req {
 	struct file *stolen_file;
 };
 
+struct fuse_iqueue {
+	/** Connection established */
+	unsigned connected;
+
+	/** Readers of the connection are waiting on this */
+	wait_queue_head_t waitq;
+
+	/** The next unique request id */
+	u64 reqctr;
+
+	/** The list of pending requests */
+	struct list_head pending;
+
+	/** Pending interrupts */
+	struct list_head interrupts;
+
+	/** Queue of pending forgets */
+	struct fuse_forget_link forget_list_head;
+	struct fuse_forget_link *forget_list_tail;
+
+	/** Batching of FORGET requests (positive indicates FORGET batch) */
+	int forget_batch;
+
+	/** O_ASYNC requests */
+	struct fasync_struct *fasync;
+};
+
+struct fuse_pqueue {
+	/** Connection established */
+	unsigned connected;
+
+	/** Lock protecting accesses to members of this structure */
+	spinlock_t lock;
+
+	/** The list of requests being processed */
+	struct list_head processing;
+
+	/** The list of requests under I/O */
+	struct list_head io;
+};
+
+/**
+ * Fuse device instance
+ */
+struct fuse_dev {
+	/** Fuse connection for this device */
+	struct fuse_conn *fc;
+
+	/** Processing queue */
+	struct fuse_pqueue pq;
+
+	/** list entry on fc->devices */
+	struct list_head entry;
+};
+
 /**
  * A Fuse connection.
* @@ -393,6 +445,9 @@ struct fuse_conn { /** Refcount */ atomic_t count; + /** Number of fuse_dev's */ + atomic_t dev_count; + struct rcu_head rcu; /** The user id for this mount */ @@ -410,17 +465,8 @@ struct fuse_conn { /** Maximum write size */ unsigned max_write; - /** Readers of the connection are waiting on this */ - wait_queue_head_t waitq; - - /** The list of pending requests */ - struct list_head pending; - - /** The list of requests being processed */ - struct list_head processing; - - /** The list of requests under I/O */ - struct list_head io; + /** Input queue */ + struct fuse_iqueue iq; /** The next unique kernel file handle */ u64 khctr; @@ -443,16 +489,6 @@ struct fuse_conn { /** The list of background requests set aside for later queuing */ struct list_head bg_queue; - /** Pending interrupts */ - struct list_head interrupts; - - /** Queue of pending forgets */ - struct fuse_forget_link forget_list_head; - struct fuse_forget_link *forget_list_tail; - - /** Batching of FORGET requests (positive indicates FORGET batch) */ - int forget_batch; - /** Flag indicating that INIT reply has been received. Allocating * any fuse request will be suspended until the flag is set */ int initialized; @@ -468,9 +504,6 @@ struct fuse_conn { /** waitq for reserved requests */ wait_queue_head_t reserved_req_waitq; - /** The next unique request id */ - u64 reqctr; - /** Connection established, cleared on umount, connection abort and device release */ unsigned connected; @@ -593,9 +626,6 @@ struct fuse_conn { /** number of dentries used in the above array */ int ctl_ndents; - /** O_ASYNC requests */ - struct fasync_struct *fasync; - /** Key for lock owner ID scrambling */ u32 scramble_key[4]; @@ -613,6 +643,9 @@ struct fuse_conn { /** Read/write semaphore to hold when accessing sb. 
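
With fc->iq split out from the per-device processing queues, a device read migrates a request roughly as sketched below (a paraphrase, not verbatim patch code; fc and fud are assumed from context, and this series guards the input queue with fiq->waitq.lock and each processing queue with fpq->lock):

	struct fuse_iqueue *fiq = &fc->iq;
	struct fuse_pqueue *fpq = &fud->pq;
	struct fuse_req *req;

	spin_lock(&fiq->waitq.lock);
	req = list_first_entry(&fiq->pending, struct fuse_req, list);
	clear_bit(FR_PENDING, &req->flags);
	list_del_init(&req->list);
	spin_unlock(&fiq->waitq.lock);

	spin_lock(&fpq->lock);
	list_add(&req->list, &fpq->io);	/* the reply path moves it off again */
	spin_unlock(&fpq->lock);
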
*/ struct rw_semaphore killsb; + + /** List of device instances belonging to this connection */ + struct list_head devices; }; static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb) @@ -825,6 +858,9 @@ void fuse_conn_init(struct fuse_conn *fc); */ void fuse_conn_put(struct fuse_conn *fc); +struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc); +void fuse_dev_free(struct fuse_dev *fud); + /** * Add connection to control filesystem */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index e8799c11424b..2913db2a5b99 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -362,8 +362,8 @@ static void fuse_send_destroy(struct fuse_conn *fc) if (req && fc->conn_init) { fc->destroy_req = NULL; req->in.h.opcode = FUSE_DESTROY; - req->force = 1; - req->background = 0; + __set_bit(FR_FORCE, &req->flags); + __clear_bit(FR_BACKGROUND, &req->flags); fuse_request_send(fc, req); fuse_put_request(fc, req); } @@ -421,7 +421,7 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) memset(&outarg, 0, sizeof(outarg)); args.in.numargs = 0; args.in.h.opcode = FUSE_STATFS; - args.in.h.nodeid = get_node_id(dentry->d_inode); + args.in.h.nodeid = get_node_id(d_inode(dentry)); args.out.numargs = 1; args.out.args[0].size = sizeof(outarg); args.out.args[0].value = &outarg; @@ -567,30 +567,46 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root) return 0; } +static void fuse_iqueue_init(struct fuse_iqueue *fiq) +{ + memset(fiq, 0, sizeof(struct fuse_iqueue)); + init_waitqueue_head(&fiq->waitq); + INIT_LIST_HEAD(&fiq->pending); + INIT_LIST_HEAD(&fiq->interrupts); + fiq->forget_list_tail = &fiq->forget_list_head; + fiq->connected = 1; +} + +static void fuse_pqueue_init(struct fuse_pqueue *fpq) +{ + memset(fpq, 0, sizeof(struct fuse_pqueue)); + spin_lock_init(&fpq->lock); + INIT_LIST_HEAD(&fpq->processing); + INIT_LIST_HEAD(&fpq->io); + fpq->connected = 1; +} + void fuse_conn_init(struct fuse_conn *fc) { memset(fc, 0, sizeof(*fc)); spin_lock_init(&fc->lock); init_rwsem(&fc->killsb); atomic_set(&fc->count, 1); - init_waitqueue_head(&fc->waitq); + atomic_set(&fc->dev_count, 1); init_waitqueue_head(&fc->blocked_waitq); init_waitqueue_head(&fc->reserved_req_waitq); - INIT_LIST_HEAD(&fc->pending); - INIT_LIST_HEAD(&fc->processing); - INIT_LIST_HEAD(&fc->io); - INIT_LIST_HEAD(&fc->interrupts); + fuse_iqueue_init(&fc->iq); INIT_LIST_HEAD(&fc->bg_queue); INIT_LIST_HEAD(&fc->entry); - fc->forget_list_tail = &fc->forget_list_head; + INIT_LIST_HEAD(&fc->devices); atomic_set(&fc->num_waiting, 0); fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND; fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD; fc->khctr = 0; fc->polled_files = RB_ROOT; - fc->reqctr = 0; fc->blocked = 0; fc->initialized = 0; + fc->connected = 1; fc->attr_version = 1; get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); } @@ -740,7 +756,7 @@ static struct dentry *fuse_fh_to_parent(struct super_block *sb, static struct dentry *fuse_get_parent(struct dentry *child) { - struct inode *child_inode = child->d_inode; + struct inode *child_inode = d_inode(child); struct fuse_conn *fc = get_fuse_conn(child_inode); struct inode *inode; struct dentry *parent; @@ -930,6 +946,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) static void fuse_free_conn(struct fuse_conn *fc) { + WARN_ON(!list_empty(&fc->devices)); kfree_rcu(fc, rcu); } @@ -975,8 +992,42 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) return 0; } +struct fuse_dev *fuse_dev_alloc(struct 
fuse_conn *fc) +{ + struct fuse_dev *fud; + + fud = kzalloc(sizeof(struct fuse_dev), GFP_KERNEL); + if (fud) { + fud->fc = fuse_conn_get(fc); + fuse_pqueue_init(&fud->pq); + + spin_lock(&fc->lock); + list_add_tail(&fud->entry, &fc->devices); + spin_unlock(&fc->lock); + } + + return fud; +} +EXPORT_SYMBOL_GPL(fuse_dev_alloc); + +void fuse_dev_free(struct fuse_dev *fud) +{ + struct fuse_conn *fc = fud->fc; + + if (fc) { + spin_lock(&fc->lock); + list_del(&fud->entry); + spin_unlock(&fc->lock); + + fuse_conn_put(fc); + } + kfree(fud); +} +EXPORT_SYMBOL_GPL(fuse_dev_free); + static int fuse_fill_super(struct super_block *sb, void *data, int silent) { + struct fuse_dev *fud; struct fuse_conn *fc; struct inode *root; struct fuse_mount_data d; @@ -1026,12 +1077,17 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) goto err_fput; fuse_conn_init(fc); + fc->release = fuse_free_conn; + + fud = fuse_dev_alloc(fc); + if (!fud) + goto err_put_conn; fc->dev = sb->s_dev; fc->sb = sb; err = fuse_bdi_init(fc, sb); if (err) - goto err_put_conn; + goto err_dev_free; sb->s_bdi = &fc->bdi; @@ -1040,7 +1096,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) fc->dont_mask = 1; sb->s_flags |= MS_POSIXACL; - fc->release = fuse_free_conn; fc->flags = d.flags; fc->user_id = d.user_id; fc->group_id = d.group_id; @@ -1053,14 +1108,14 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) root = fuse_get_root_inode(sb, d.rootmode); root_dentry = d_make_root(root); if (!root_dentry) - goto err_put_conn; + goto err_dev_free; /* only now - we want root dentry with NULL ->d_op */ sb->s_d_op = &fuse_dentry_operations; init_req = fuse_request_alloc(0); if (!init_req) goto err_put_root; - init_req->background = 1; + __set_bit(FR_BACKGROUND, &init_req->flags); if (is_bdev) { fc->destroy_req = fuse_request_alloc(0); @@ -1079,8 +1134,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) list_add_tail(&fc->entry, &fuse_conn_list); sb->s_root = root_dentry; - fc->connected = 1; - file->private_data = fuse_conn_get(fc); + file->private_data = fud; mutex_unlock(&fuse_mutex); /* * atomic_dec_and_test() in fput() provides the necessary @@ -1099,6 +1153,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) fuse_request_free(init_req); err_put_root: dput(root_dentry); + err_dev_free: + fuse_dev_free(fud); err_put_conn: fuse_bdi_destroy(fc); fuse_conn_put(fc); @@ -1238,7 +1294,6 @@ static void fuse_fs_cleanup(void) } static struct kobject *fuse_kobj; -static struct kobject *connections_kobj; static int fuse_sysfs_init(void) { @@ -1250,11 +1305,9 @@ static int fuse_sysfs_init(void) goto out_err; } - connections_kobj = kobject_create_and_add("connections", fuse_kobj); - if (!connections_kobj) { - err = -ENOMEM; + err = sysfs_create_mount_point(fuse_kobj, "connections"); + if (err) goto out_fuse_unregister; - } return 0; @@ -1266,7 +1319,7 @@ static int fuse_sysfs_init(void) static void fuse_sysfs_cleanup(void) { - kobject_put(connections_kobj); + sysfs_remove_mount_point(fuse_kobj, "connections"); kobject_put(fuse_kobj); } diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 7b3143064af1..1be3b061c05c 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -110,11 +110,7 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type) error = __gfs2_xattr_set(inode, name, data, len, 0, GFS2_EATYPE_SYS); if (error) goto out; - - if (acl) - set_cached_acl(inode, type, acl); - else - forget_cached_acl(inode, type); 
+ set_cached_acl(inode, type, acl); out: kfree(data); return error; diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 4ad4f94edebe..1caee0534587 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -20,7 +20,7 @@ #include <linux/swap.h> #include <linux/gfs2_ondisk.h> #include <linux/backing-dev.h> -#include <linux/aio.h> +#include <linux/uio.h> #include <trace/events/writeback.h> #include "gfs2.h" @@ -171,6 +171,7 @@ static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *w /** * gfs2_jdata_writepage - Write complete page * @page: Page to write + * @wbc: The writeback control * * Returns: errno * @@ -221,9 +222,10 @@ static int gfs2_writepages(struct address_space *mapping, * gfs2_write_jdata_pagevec - Write back a pagevec's worth of pages * @mapping: The mapping * @wbc: The writeback control - * @writepage: The writepage function to call for each page * @pvec: The vector of pages * @nr_pages: The number of pages to write + * @end: End position + * @done_index: Page index * * Returns: non-zero if loop should terminate, zero otherwise */ @@ -333,8 +335,6 @@ continue_unlock: * gfs2_write_cache_jdata - Like write_cache_pages but different * @mapping: The mapping to write * @wbc: The writeback control - * @writepage: The writepage function to call - * @data: The data to pass to writepage * * The reason that we use our own function here is that we need to * start transactions before we grab page locks. This allows us @@ -588,6 +588,10 @@ int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos, /** * gfs2_readpages - Read a bunch of pages at once + * @file: The file to read from + * @mapping: Address space info + * @pages: List of pages to read + * @nr_pages: Number of pages to read * * Some notes: * 1. This is only for readahead, so we can simply ignore any things @@ -671,12 +675,12 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, if (alloc_required) { struct gfs2_alloc_parms ap = { .aflags = 0, }; - error = gfs2_quota_lock_check(ip); + requested = data_blocks + ind_blocks; + ap.target = requested; + error = gfs2_quota_lock_check(ip, &ap); if (error) goto out_unlock; - requested = data_blocks + ind_blocks; - ap.target = requested; error = gfs2_inplace_reserve(ip, &ap); if (error) goto out_qunlock; @@ -853,7 +857,7 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, * @mapping: The address space to write to * @pos: The file position * @len: The length of the data - * @copied: + * @copied: How much was actually copied by the VFS * @page: The page that has been written * @fsdata: The fsdata (unused in GFS2) * @@ -1016,13 +1020,12 @@ out: /** * gfs2_ok_for_dio - check that dio is valid on this file * @ip: The inode - * @rw: READ or WRITE * @offset: The offset at which we are reading or writing * * Returns: 0 (to ignore the i/o request and thus fall back to buffered i/o) * 1 (to accept the i/o request) */ -static int gfs2_ok_for_dio(struct gfs2_inode *ip, int rw, loff_t offset) +static int gfs2_ok_for_dio(struct gfs2_inode *ip, loff_t offset) { /* * Should we return an error here? 
I can't see that O_DIRECT for @@ -1039,8 +1042,8 @@ static int gfs2_ok_for_dio(struct gfs2_inode *ip, int rw, loff_t offset) -static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb, - struct iov_iter *iter, loff_t offset) +static ssize_t gfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter, + loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; @@ -1061,7 +1064,7 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb, rv = gfs2_glock_nq(&gh); if (rv) return rv; - rv = gfs2_ok_for_dio(ip, rw, offset); + rv = gfs2_ok_for_dio(ip, offset); if (rv != 1) goto out; /* dio not valid, fall back to buffered i/o */ @@ -1091,13 +1094,12 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb, rv = filemap_write_and_wait_range(mapping, lstart, end); if (rv) goto out; - if (rw == WRITE) + if (iov_iter_rw(iter) == WRITE) truncate_inode_pages_range(mapping, lstart, end); } - rv = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, - iter, offset, - gfs2_get_block_direct, NULL, NULL, 0); + rv = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, + offset, gfs2_get_block_direct, NULL, NULL, 0); out: gfs2_glock_dq(&gh); gfs2_holder_uninit(&gh); diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index f0b945ab853e..61296ecbd0e2 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1224,7 +1224,7 @@ static int do_grow(struct inode *inode, u64 size) if (gfs2_is_stuffed(ip) && (size > (sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)))) { - error = gfs2_quota_lock_check(ip); + error = gfs2_quota_lock_check(ip, &ap); if (error) return error; diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c index 589f4ea9381c..30822b148f3e 100644 --- a/fs/gfs2/dentry.c +++ b/fs/gfs2/dentry.c @@ -48,9 +48,9 @@ static int gfs2_drevalidate(struct dentry *dentry, unsigned int flags) return -ECHILD; parent = dget_parent(dentry); - sdp = GFS2_SB(parent->d_inode); - dip = GFS2_I(parent->d_inode); - inode = dentry->d_inode; + sdp = GFS2_SB(d_inode(parent)); + dip = GFS2_I(d_inode(parent)); + inode = d_inode(dentry); if (inode) { if (is_bad_inode(inode)) @@ -68,7 +68,7 @@ static int gfs2_drevalidate(struct dentry *dentry, unsigned int flags) goto fail; } - error = gfs2_dir_check(parent->d_inode, &dentry->d_name, ip); + error = gfs2_dir_check(d_inode(parent), &dentry->d_name, ip); switch (error) { case 0: if (!inode) @@ -113,10 +113,10 @@ static int gfs2_dentry_delete(const struct dentry *dentry) { struct gfs2_inode *ginode; - if (!dentry->d_inode) + if (d_really_is_negative(dentry)) return 0; - ginode = GFS2_I(dentry->d_inode); + ginode = GFS2_I(d_inode(dentry)); if (!ginode->i_iopen_gh.gh_gl) return 0; diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index c41d255b6a7b..5d15e9498b48 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c @@ -49,7 +49,7 @@ static int gfs2_encode_fh(struct inode *inode, __u32 *p, int *len, fh[3] = cpu_to_be32(ip->i_no_addr & 0xFFFFFFFF); *len = GFS2_SMALL_FH_SIZE; - if (!parent || inode == sb->s_root->d_inode) + if (!parent || inode == d_inode(sb->s_root)) return *len; ip = GFS2_I(parent); @@ -88,8 +88,8 @@ static int get_name_filldir(struct dir_context *ctx, const char *name, static int gfs2_get_name(struct dentry *parent, char *name, struct dentry *child) { - struct inode *dir = parent->d_inode; - struct inode *inode = child->d_inode; + struct inode *dir = d_inode(parent); + struct inode *inode = d_inode(child); struct gfs2_inode *dip, *ip; struct get_name_filldir gnfd = { .ctx.actor = get_name_filldir, @@ -128,7 +128,7 @@ static int 
gfs2_get_name(struct dentry *parent, char *name, static struct dentry *gfs2_get_parent(struct dentry *child) { - return d_obtain_alias(gfs2_lookupi(child->d_inode, &gfs2_qdotdot, 1)); + return d_obtain_alias(gfs2_lookupi(d_inode(child), &gfs2_qdotdot, 1)); } static struct dentry *gfs2_get_dentry(struct super_block *sb, diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 3e32bb8e2d7e..cf4ab89159f4 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -25,7 +25,6 @@ #include <asm/uaccess.h> #include <linux/dlm.h> #include <linux/dlm_plock.h> -#include <linux/aio.h> #include <linux/delay.h> #include "gfs2.h" @@ -181,7 +180,7 @@ void gfs2_set_inode_flags(struct inode *inode) flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_NOSEC); if ((ip->i_eattr == 0) && !is_sxid(inode->i_mode)) - inode->i_flags |= S_NOSEC; + flags |= S_NOSEC; if (ip->i_diskflags & GFS2_DIF_IMMUTABLE) flags |= S_IMMUTABLE; if (ip->i_diskflags & GFS2_DIF_APPENDONLY) @@ -429,11 +428,11 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) if (ret) goto out_unlock; - ret = gfs2_quota_lock_check(ip); - if (ret) - goto out_unlock; gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks); ap.target = data_blocks + ind_blocks; + ret = gfs2_quota_lock_check(ip, &ap); + if (ret) + goto out_unlock; ret = gfs2_inplace_reserve(ip, &ap); if (ret) goto out_quota_unlock; @@ -710,7 +709,7 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) gfs2_size_hint(file, iocb->ki_pos, iov_iter_count(from)); - if (file->f_flags & O_APPEND) { + if (iocb->ki_flags & IOCB_APPEND) { struct gfs2_holder gh; ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh); @@ -765,22 +764,30 @@ out: brelse(dibh); return error; } - -static void calc_max_reserv(struct gfs2_inode *ip, loff_t max, loff_t *len, - unsigned int *data_blocks, unsigned int *ind_blocks) +/** + * calc_max_reserv() - Reverse of write_calc_reserv. Given a number of + * blocks, determine how many bytes can be written. + * @ip: The inode in question. + * @len: Max cap of bytes. What we return in *len must be <= this. + * @data_blocks: Compute and return the number of data blocks needed + * @ind_blocks: Compute and return the number of indirect blocks needed + * @max_blocks: The total blocks available to work with. + * + * Returns: void, but @len, @data_blocks and @ind_blocks are filled in. 
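
To make the calculation below concrete, a worked instance with illustrative figures (a 4KiB block size gives sd_inptrs = 509 and sd_diptrs = 483, and sd_max_height is taken at its cap of 10): for max_blocks = 10000, max_data starts at 10000 - 3 * 9 = 9973; one loop pass reserves DIV_ROUND_UP(9973, 509) = 20 indirect blocks and then stops, since the quotient has dropped below sd_diptrs, so the function fills in *data_blocks = 9953, *ind_blocks = 10000 - 9953 = 47 and *len = (9953 - 3) << sb_bsize_shift, roughly 38.9 MiB of writable bytes.
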
+ */ +static void calc_max_reserv(struct gfs2_inode *ip, loff_t *len, + unsigned int *data_blocks, unsigned int *ind_blocks, + unsigned int max_blocks) { + loff_t max = *len; const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - unsigned int max_blocks = ip->i_rgd->rd_free_clone; unsigned int tmp, max_data = max_blocks - 3 * (sdp->sd_max_height - 1); for (tmp = max_data; tmp > sdp->sd_diptrs;) { tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs); max_data -= tmp; } - /* This calculation isn't the exact reverse of gfs2_write_calc_reserve, - so it might end up with fewer data blocks */ - if (max_data <= *data_blocks) - return; + *data_blocks = max_data; *ind_blocks = max_blocks - max_data; *len = ((loff_t)max_data - 3) << sdp->sd_sb.sb_bsize_shift; @@ -797,7 +804,7 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_alloc_parms ap = { .aflags = 0, }; unsigned int data_blocks = 0, ind_blocks = 0, rblocks; - loff_t bytes, max_bytes; + loff_t bytes, max_bytes, max_blks = UINT_MAX; int error; const loff_t pos = offset; const loff_t count = len; @@ -819,6 +826,9 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t gfs2_size_hint(file, offset, len); + gfs2_write_calc_reserv(ip, PAGE_SIZE, &data_blocks, &ind_blocks); + ap.min_target = data_blocks + ind_blocks; + while (len > 0) { if (len < bytes) bytes = len; @@ -827,27 +837,41 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t offset += bytes; continue; } - error = gfs2_quota_lock_check(ip); + + /* We need to determine how many bytes we can actually + * fallocate without exceeding quota or going over the + * end of the fs. We start off optimistically by assuming + * we can write max_bytes */ + max_bytes = (len > max_chunk_size) ? max_chunk_size : len; + + /* Since max_bytes is most likely a theoretical max, we + * calculate a more realistic 'bytes' to serve as a good + * starting point for the number of bytes we may be able + * to write */ + gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks); + ap.target = data_blocks + ind_blocks; + + error = gfs2_quota_lock_check(ip, &ap); if (error) return error; -retry: - gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks); + /* ap.allowed tells us how many blocks quota will allow + * us to write. Check if this reduces max_blks */ + if (ap.allowed && ap.allowed < max_blks) + max_blks = ap.allowed; - ap.target = data_blocks + ind_blocks; error = gfs2_inplace_reserve(ip, &ap); - if (error) { - if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) { - bytes >>= 1; - bytes &= bsize_mask; - if (bytes == 0) - bytes = sdp->sd_sb.sb_bsize; - goto retry; - } + if (error) goto out_qunlock; - } - max_bytes = bytes; - calc_max_reserv(ip, (len > max_chunk_size)? max_chunk_size: len, - &max_bytes, &data_blocks, &ind_blocks); + + /* check if the selected rgrp limits our max_blks further */ + if (ap.allowed && ap.allowed < max_blks) + max_blks = ap.allowed; + + /* Almost done. Calculate bytes that can be written using + * max_blks. 
We also recompute max_bytes, data_blocks and + * ind_blocks */ + calc_max_reserv(ip, &max_bytes, &data_blocks, + &ind_blocks, max_blks); rblocks = RES_DINODE + ind_blocks + RES_STATFS + RES_QUOTA + RES_RG_HDR + gfs2_rg_blocks(ip, data_blocks + ind_blocks); @@ -893,7 +917,7 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t le struct gfs2_holder gh; int ret; - if (mode & ~FALLOC_FL_KEEP_SIZE) + if ((mode & ~FALLOC_FL_KEEP_SIZE) || gfs2_is_jdata(ip)) return -EOPNOTSUPP; mutex_lock(&inode->i_mutex); @@ -931,6 +955,22 @@ out_uninit: return ret; } +static ssize_t gfs2_file_splice_write(struct pipe_inode_info *pipe, + struct file *out, loff_t *ppos, + size_t len, unsigned int flags) +{ + int error; + struct gfs2_inode *ip = GFS2_I(out->f_mapping->host); + + error = gfs2_rs_alloc(ip); + if (error) + return (ssize_t)error; + + gfs2_size_hint(out, *ppos, len); + + return iter_file_splice_write(pipe, out, ppos, len, flags); +} + #ifdef CONFIG_GFS2_FS_LOCKING_DLM /** @@ -1065,9 +1105,7 @@ static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl) const struct file_operations gfs2_file_fops = { .llseek = gfs2_llseek, - .read = new_sync_read, .read_iter = generic_file_read_iter, - .write = new_sync_write, .write_iter = gfs2_file_write_iter, .unlocked_ioctl = gfs2_ioctl, .mmap = gfs2_mmap, @@ -1077,7 +1115,7 @@ const struct file_operations gfs2_file_fops = { .lock = gfs2_lock, .flock = gfs2_flock, .splice_read = generic_file_splice_read, - .splice_write = iter_file_splice_write, + .splice_write = gfs2_file_splice_write, .setlease = simple_nosetlease, .fallocate = gfs2_fallocate, }; @@ -1097,9 +1135,7 @@ const struct file_operations gfs2_dir_fops = { const struct file_operations gfs2_file_fops_nolock = { .llseek = gfs2_llseek, - .read = new_sync_read, .read_iter = generic_file_read_iter, - .write = new_sync_write, .write_iter = gfs2_file_write_iter, .unlocked_ioctl = gfs2_ioctl, .mmap = gfs2_mmap, @@ -1107,7 +1143,7 @@ const struct file_operations gfs2_file_fops_nolock = { .release = gfs2_release, .fsync = gfs2_fsync, .splice_read = generic_file_splice_read, - .splice_write = iter_file_splice_write, + .splice_write = gfs2_file_splice_write, .setlease = generic_setlease, .fallocate = gfs2_fallocate, }; diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index f42dffba056a..a38e38f7b6fc 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1076,7 +1076,8 @@ void gfs2_glock_dq(struct gfs2_holder *gh) !test_bit(GLF_DEMOTE, &gl->gl_flags)) fast_path = 1; } - if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl)) + if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl) && + (glops->go_flags & GLOF_LRU)) gfs2_glock_add_to_lru(gl); trace_gfs2_glock_queue(gh, 0); @@ -2047,34 +2048,41 @@ static const struct file_operations gfs2_sbstats_fops = { int gfs2_create_debugfs_file(struct gfs2_sbd *sdp) { - sdp->debugfs_dir = debugfs_create_dir(sdp->sd_table_name, gfs2_root); - if (!sdp->debugfs_dir) - return -ENOMEM; - sdp->debugfs_dentry_glocks = debugfs_create_file("glocks", - S_IFREG | S_IRUGO, - sdp->debugfs_dir, sdp, - &gfs2_glocks_fops); - if (!sdp->debugfs_dentry_glocks) + struct dentry *dent; + + dent = debugfs_create_dir(sdp->sd_table_name, gfs2_root); + if (IS_ERR_OR_NULL(dent)) + goto fail; + sdp->debugfs_dir = dent; + + dent = debugfs_create_file("glocks", + S_IFREG | S_IRUGO, + sdp->debugfs_dir, sdp, + &gfs2_glocks_fops); + if (IS_ERR_OR_NULL(dent)) goto fail; + sdp->debugfs_dentry_glocks = dent; - sdp->debugfs_dentry_glstats = debugfs_create_file("glstats", - 
S_IFREG | S_IRUGO, - sdp->debugfs_dir, sdp, - &gfs2_glstats_fops); - if (!sdp->debugfs_dentry_glstats) + dent = debugfs_create_file("glstats", + S_IFREG | S_IRUGO, + sdp->debugfs_dir, sdp, + &gfs2_glstats_fops); + if (IS_ERR_OR_NULL(dent)) goto fail; + sdp->debugfs_dentry_glstats = dent; - sdp->debugfs_dentry_sbstats = debugfs_create_file("sbstats", - S_IFREG | S_IRUGO, - sdp->debugfs_dir, sdp, - &gfs2_sbstats_fops); - if (!sdp->debugfs_dentry_sbstats) + dent = debugfs_create_file("sbstats", + S_IFREG | S_IRUGO, + sdp->debugfs_dir, sdp, + &gfs2_sbstats_fops); + if (IS_ERR_OR_NULL(dent)) goto fail; + sdp->debugfs_dentry_sbstats = dent; return 0; fail: gfs2_delete_debugfs_file(sdp); - return -ENOMEM; + return dent ? PTR_ERR(dent) : -ENOMEM; } void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp) @@ -2100,6 +2108,8 @@ void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp) int gfs2_register_debugfs(void) { gfs2_root = debugfs_create_dir("gfs2", NULL); + if (IS_ERR(gfs2_root)) + return PTR_ERR(gfs2_root); return gfs2_root ? 0 : -ENOMEM; } diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index fe91951c3361..fa3fa5e94553 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -144,6 +144,12 @@ static void rgrp_go_sync(struct gfs2_glock *gl) struct gfs2_rgrpd *rgd; int error; + spin_lock(&gl->gl_spin); + rgd = gl->gl_object; + if (rgd) + gfs2_rgrp_brelse(rgd); + spin_unlock(&gl->gl_spin); + if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) return; GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE); @@ -175,15 +181,17 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags) { struct gfs2_sbd *sdp = gl->gl_sbd; struct address_space *mapping = &sdp->sd_aspace; + struct gfs2_rgrpd *rgd = gl->gl_object; + + if (rgd) + gfs2_rgrp_brelse(rgd); WARN_ON_ONCE(!(flags & DIO_METADATA)); gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count)); truncate_inode_pages_range(mapping, gl->gl_vm.start, gl->gl_vm.end); - if (gl->gl_object) { - struct gfs2_rgrpd *rgd = (struct gfs2_rgrpd *)gl->gl_object; + if (rgd) rgd->rd_flags &= ~GFS2_RDF_UPTODATE; - } } /** @@ -561,7 +569,7 @@ const struct gfs2_glock_operations gfs2_inode_glops = { .go_lock = inode_go_lock, .go_dump = inode_go_dump, .go_type = LM_TYPE_INODE, - .go_flags = GLOF_ASPACE, + .go_flags = GLOF_ASPACE | GLOF_LRU, }; const struct gfs2_glock_operations gfs2_rgrp_glops = { @@ -584,10 +592,12 @@ const struct gfs2_glock_operations gfs2_freeze_glops = { const struct gfs2_glock_operations gfs2_iopen_glops = { .go_type = LM_TYPE_IOPEN, .go_callback = iopen_go_callback, + .go_flags = GLOF_LRU, }; const struct gfs2_glock_operations gfs2_flock_glops = { .go_type = LM_TYPE_FLOCK, + .go_flags = GLOF_LRU, }; const struct gfs2_glock_operations gfs2_nondisk_glops = { @@ -596,7 +606,7 @@ const struct gfs2_glock_operations gfs2_nondisk_glops = { const struct gfs2_glock_operations gfs2_quota_glops = { .go_type = LM_TYPE_QUOTA, - .go_flags = GLOF_LVB, + .go_flags = GLOF_LVB | GLOF_LRU, }; const struct gfs2_glock_operations gfs2_journal_glops = { diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 7a2dbbc0d634..a1ec7c20e498 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -225,6 +225,7 @@ struct gfs2_glock_operations { const unsigned long go_flags; #define GLOF_ASPACE 1 #define GLOF_LVB 2 +#define GLOF_LRU 4 }; enum { @@ -301,8 +302,10 @@ struct gfs2_blkreserv { * to the allocation code. 
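
The parms struct that follows now carries results out as well as requests in: a caller that can settle for less sets both targets and reads back ap.allowed, roughly as the __gfs2_fallocate() hunks above do (a paraphrased sketch, not verbatim patch code; ip and bytes are assumed from context):

	unsigned int data_blocks, ind_blocks;
	loff_t max_blks = UINT_MAX;
	struct gfs2_alloc_parms ap = { .aflags = 0, };
	int error;

	gfs2_write_calc_reserv(ip, PAGE_SIZE, &data_blocks, &ind_blocks);
	ap.min_target = data_blocks + ind_blocks;	/* bare minimum: one page */

	gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks);
	ap.target = data_blocks + ind_blocks;		/* the whole chunk we'd like */
	error = gfs2_quota_lock_check(ip, &ap);
	if (!error && ap.allowed && ap.allowed < max_blks)
		max_blks = ap.allowed;			/* quota capped the request */
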
*/ struct gfs2_alloc_parms { - u32 target; + u64 target; + u32 min_target; u32 aflags; + u64 allowed; }; enum { @@ -430,6 +433,7 @@ enum { QDF_CHANGE = 1, QDF_LOCKED = 2, QDF_REFRESH = 3, + QDF_QMSG_QUIET = 4, }; struct gfs2_quota_data { diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 73c72253faac..063fdfcf8275 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -295,7 +295,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, if ((name->len == 1 && memcmp(name->name, ".", 1) == 0) || (name->len == 2 && memcmp(name->name, "..", 2) == 0 && - dir == sb->s_root->d_inode)) { + dir == d_inode(sb->s_root))) { igrab(dir); return dir; } @@ -382,7 +382,7 @@ static int alloc_dinode(struct gfs2_inode *ip, u32 flags, unsigned *dblocks) struct gfs2_alloc_parms ap = { .target = *dblocks, .aflags = flags, }; int error; - error = gfs2_quota_lock_check(ip); + error = gfs2_quota_lock_check(ip, &ap); if (error) goto out; @@ -525,7 +525,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, int error; if (da->nr_blocks) { - error = gfs2_quota_lock_check(dip); + error = gfs2_quota_lock_check(dip, &ap); if (error) goto fail_quota_locks; @@ -687,7 +687,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, } gfs2_set_inode_flags(inode); - if ((GFS2_I(sdp->sd_root_dir->d_inode) == dip) || + if ((GFS2_I(d_inode(sdp->sd_root_dir)) == dip) || (dip->i_diskflags & GFS2_DIF_TOPDIR)) aflags |= GFS2_AF_ORLOV; @@ -888,7 +888,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, { struct gfs2_inode *dip = GFS2_I(dir); struct gfs2_sbd *sdp = GFS2_SB(dir); - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder ghs[2]; struct buffer_head *dibh; @@ -953,7 +953,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, if (da.nr_blocks) { struct gfs2_alloc_parms ap = { .target = da.nr_blocks, }; - error = gfs2_quota_lock_check(dip); + error = gfs2_quota_lock_check(dip, &ap); if (error) goto out_gunlock; @@ -1055,7 +1055,7 @@ static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, static int gfs2_unlink_inode(struct gfs2_inode *dip, const struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct gfs2_inode *ip = GFS2_I(inode); int error; @@ -1091,7 +1091,7 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry) { struct gfs2_inode *dip = GFS2_I(dir); struct gfs2_sbd *sdp = GFS2_SB(dir); - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder ghs[3]; struct gfs2_rgrpd *rgd; @@ -1227,8 +1227,8 @@ static int gfs2_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, */ static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry, - struct file *file, unsigned flags, - umode_t mode, int *opened) + struct file *file, unsigned flags, + umode_t mode, int *opened) { struct dentry *d; bool excl = !!(flags & O_EXCL); @@ -1241,7 +1241,7 @@ static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry, return PTR_ERR(d); if (d != NULL) dentry = d; - if (dentry->d_inode) { + if (d_really_is_positive(dentry)) { if (!(*opened & FILE_OPENED)) return finish_no_open(file, d); dput(d); @@ -1282,7 +1282,7 @@ static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to) error = -EINVAL; break; } - if (dir == sb->s_root->d_inode) { + if (dir == 
d_inode(sb->s_root)) { error = 0; break; } @@ -1307,6 +1307,35 @@ static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to) } /** + * update_moved_ino - Update an inode that's being moved + * @ip: The inode being moved + * @ndip: The parent directory of the new filename + * @dir_rename: True if ip is a directory + * + * Returns: errno + */ + +static int update_moved_ino(struct gfs2_inode *ip, struct gfs2_inode *ndip, + int dir_rename) +{ + int error; + struct buffer_head *dibh; + + if (dir_rename) + return gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR); + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + return error; + ip->i_inode.i_ctime = CURRENT_TIME; + gfs2_trans_add_meta(ip->i_gl, dibh); + gfs2_dinode_out(ip, dibh->b_data); + brelse(dibh); + return 0; +} + + +/** * gfs2_rename - Rename a file * @odir: Parent directory of old file name * @odentry: The old dentry of the file @@ -1321,7 +1350,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, { struct gfs2_inode *odip = GFS2_I(odir); struct gfs2_inode *ndip = GFS2_I(ndir); - struct gfs2_inode *ip = GFS2_I(odentry->d_inode); + struct gfs2_inode *ip = GFS2_I(d_inode(odentry)); struct gfs2_inode *nip = NULL; struct gfs2_sbd *sdp = GFS2_SB(odir); struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }; @@ -1332,8 +1361,8 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, unsigned int x; int error; - if (ndentry->d_inode) { - nip = GFS2_I(ndentry->d_inode); + if (d_really_is_positive(ndentry)) { + nip = GFS2_I(d_inode(ndentry)); if (ip == nip) return 0; } @@ -1354,7 +1383,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, if (S_ISDIR(ip->i_inode.i_mode)) { dir_rename = 1; - /* don't move a dirctory into it's subdir */ + /* don't move a directory into its subdir */ error = gfs2_ok_to_move(ip, ndip); if (error) goto out_gunlock_r; @@ -1457,7 +1486,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, /* Check out the dir to be renamed */ if (dir_rename) { - error = gfs2_permission(odentry->d_inode, MAY_WRITE); + error = gfs2_permission(d_inode(odentry), MAY_WRITE); if (error) goto out_gunlock; } @@ -1470,7 +1499,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, if (da.nr_blocks) { struct gfs2_alloc_parms ap = { .target = da.nr_blocks, }; - error = gfs2_quota_lock_check(ndip); + error = gfs2_quota_lock_check(ndip, &ap); if (error) goto out_gunlock; @@ -1494,20 +1523,9 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, if (nip) error = gfs2_unlink_inode(ndip, ndentry); - if (dir_rename) { - error = gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR); - if (error) - goto out_end_trans; - } else { - struct buffer_head *dibh; - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) - goto out_end_trans; - ip->i_inode.i_ctime = CURRENT_TIME; - gfs2_trans_add_meta(ip->i_gl, dibh); - gfs2_dinode_out(ip, dibh->b_data); - brelse(dibh); - } + error = update_moved_ino(ip, ndip, dir_rename); + if (error) + goto out_end_trans; error = gfs2_dir_del(odip, odentry); if (error) @@ -1539,6 +1557,161 @@ out: } /** + * gfs2_exchange - exchange two files + * @odir: Parent directory of old file name + * @odentry: The old dentry of the file + * @ndir: Parent directory of new file name + * @ndentry: The new dentry of the file + * @flags: The rename flags + * + * Returns: errno + */ + +static int gfs2_exchange(struct inode *odir, struct dentry *odentry, + struct inode *ndir, struct dentry *ndentry, + unsigned int flags) +{ + 
struct gfs2_inode *odip = GFS2_I(odir); + struct gfs2_inode *ndip = GFS2_I(ndir); + struct gfs2_inode *oip = GFS2_I(odentry->d_inode); + struct gfs2_inode *nip = GFS2_I(ndentry->d_inode); + struct gfs2_sbd *sdp = GFS2_SB(odir); + struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }; + unsigned int num_gh; + unsigned int x; + umode_t old_mode = oip->i_inode.i_mode; + umode_t new_mode = nip->i_inode.i_mode; + int error; + + error = gfs2_rindex_update(sdp); + if (error) + return error; + + if (odip != ndip) { + error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE, + 0, &r_gh); + if (error) + goto out; + + if (S_ISDIR(old_mode)) { + /* don't move a directory into its subdir */ + error = gfs2_ok_to_move(oip, ndip); + if (error) + goto out_gunlock_r; + } + + if (S_ISDIR(new_mode)) { + /* don't move a directory into its subdir */ + error = gfs2_ok_to_move(nip, odip); + if (error) + goto out_gunlock_r; + } + } + + num_gh = 1; + gfs2_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); + if (odip != ndip) { + gfs2_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); + num_gh++; + } + gfs2_holder_init(oip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); + num_gh++; + + gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); + num_gh++; + + for (x = 0; x < num_gh; x++) { + error = gfs2_glock_nq(ghs + x); + if (error) + goto out_gunlock; + } + + error = -ENOENT; + if (oip->i_inode.i_nlink == 0 || nip->i_inode.i_nlink == 0) + goto out_gunlock; + + error = gfs2_unlink_ok(odip, &odentry->d_name, oip); + if (error) + goto out_gunlock; + error = gfs2_unlink_ok(ndip, &ndentry->d_name, nip); + if (error) + goto out_gunlock; + + if (S_ISDIR(old_mode)) { + error = gfs2_permission(odentry->d_inode, MAY_WRITE); + if (error) + goto out_gunlock; + } + if (S_ISDIR(new_mode)) { + error = gfs2_permission(ndentry->d_inode, MAY_WRITE); + if (error) + goto out_gunlock; + } + error = gfs2_trans_begin(sdp, 4 * RES_DINODE + 4 * RES_LEAF, 0); + if (error) + goto out_gunlock; + + error = update_moved_ino(oip, ndip, S_ISDIR(old_mode)); + if (error) + goto out_end_trans; + + error = update_moved_ino(nip, odip, S_ISDIR(new_mode)); + if (error) + goto out_end_trans; + + error = gfs2_dir_mvino(ndip, &ndentry->d_name, oip, + IF2DT(old_mode)); + if (error) + goto out_end_trans; + + error = gfs2_dir_mvino(odip, &odentry->d_name, nip, + IF2DT(new_mode)); + if (error) + goto out_end_trans; + + if (odip != ndip) { + if (S_ISDIR(new_mode) && !S_ISDIR(old_mode)) { + inc_nlink(&odip->i_inode); + drop_nlink(&ndip->i_inode); + } else if (S_ISDIR(old_mode) && !S_ISDIR(new_mode)) { + inc_nlink(&ndip->i_inode); + drop_nlink(&odip->i_inode); + } + } + mark_inode_dirty(&ndip->i_inode); + if (odip != ndip) + mark_inode_dirty(&odip->i_inode); + +out_end_trans: + gfs2_trans_end(sdp); +out_gunlock: + while (x--) { + gfs2_glock_dq(ghs + x); + gfs2_holder_uninit(ghs + x); + } +out_gunlock_r: + if (r_gh.gh_gl) + gfs2_glock_dq_uninit(&r_gh); +out: + return error; +} + +static int gfs2_rename2(struct inode *odir, struct dentry *odentry, + struct inode *ndir, struct dentry *ndentry, + unsigned int flags) +{ + flags &= ~RENAME_NOREPLACE; + + if (flags & ~RENAME_EXCHANGE) + return -EINVAL; + + if (flags & RENAME_EXCHANGE) + return gfs2_exchange(odir, odentry, ndir, ndentry, flags); + + return gfs2_rename(odir, odentry, ndir, ndentry); +} + +/** * gfs2_follow_link - Follow a symbolic link * @dentry: The dentry of the link * @nd: Data that we pass to vfs_follow_link() @@ -1548,9 +1721,9 @@ out: * Returns: 0 on success or error code */ -static void 
*gfs2_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *gfs2_follow_link(struct dentry *dentry, void **cookie) { - struct gfs2_inode *ip = GFS2_I(dentry->d_inode); + struct gfs2_inode *ip = GFS2_I(d_inode(dentry)); struct gfs2_holder i_gh; struct buffer_head *dibh; unsigned int size; @@ -1561,8 +1734,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd) error = gfs2_glock_nq(&i_gh); if (error) { gfs2_holder_uninit(&i_gh); - nd_set_link(nd, ERR_PTR(error)); - return NULL; + return ERR_PTR(error); } size = (unsigned int)i_size_read(&ip->i_inode); @@ -1586,8 +1758,9 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd) brelse(dibh); out: gfs2_glock_dq_uninit(&i_gh); - nd_set_link(nd, buf); - return NULL; + if (!IS_ERR(buf)) + *cookie = buf; + return buf; } /** @@ -1669,6 +1842,7 @@ static int setattr_chown(struct inode *inode, struct iattr *attr) kuid_t ouid, nuid; kgid_t ogid, ngid; int error; + struct gfs2_alloc_parms ap; ouid = inode->i_uid; ogid = inode->i_gid; @@ -1696,9 +1870,11 @@ static int setattr_chown(struct inode *inode, struct iattr *attr) if (error) goto out; + ap.target = gfs2_get_inode_blocks(&ip->i_inode); + if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) || !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) { - error = gfs2_quota_check(ip, nuid, ngid); + error = gfs2_quota_check(ip, nuid, ngid, &ap); if (error) goto out_gunlock_q; } @@ -1713,9 +1889,8 @@ static int setattr_chown(struct inode *inode, struct iattr *attr) if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) || !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) { - u64 blocks = gfs2_get_inode_blocks(&ip->i_inode); - gfs2_quota_change(ip, -blocks, ouid, ogid); - gfs2_quota_change(ip, blocks, nuid, ngid); + gfs2_quota_change(ip, -(s64)ap.target, ouid, ogid); + gfs2_quota_change(ip, ap.target, nuid, ngid); } out_end_trans: @@ -1740,7 +1915,7 @@ out: static int gfs2_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder i_gh; int error; @@ -1796,7 +1971,7 @@ out: static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; int error; @@ -1819,7 +1994,7 @@ static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, static int gfs2_setxattr(struct dentry *dentry, const char *name, const void *data, size_t size, int flags) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; int ret; @@ -1839,7 +2014,7 @@ static int gfs2_setxattr(struct dentry *dentry, const char *name, static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name, void *data, size_t size) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; int ret; @@ -1860,7 +2035,7 @@ static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name, static int gfs2_removexattr(struct dentry *dentry, const char *name) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; int ret; @@ -1941,7 +2116,7 @@ const struct inode_operations gfs2_dir_iops = { .mkdir = gfs2_mkdir, .rmdir = gfs2_unlink, .mknod = gfs2_mknod, - .rename = gfs2_rename, + 
.rename2 = gfs2_rename2, .permission = gfs2_permission, .setattr = gfs2_setattr, .getattr = gfs2_getattr, diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index efc8e254787c..1e3a93f2f71d 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -647,7 +647,7 @@ out_unlock: static int init_journal(struct gfs2_sbd *sdp, int undo) { - struct inode *master = sdp->sd_master_dir->d_inode; + struct inode *master = d_inode(sdp->sd_master_dir); struct gfs2_holder ji_gh; struct gfs2_inode *ip; int jindex = 1; @@ -756,6 +756,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) } } + sdp->sd_log_idle = 1; set_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags); gfs2_glock_dq_uninit(&ji_gh); jindex = 0; @@ -782,7 +783,7 @@ static struct lock_class_key gfs2_quota_imutex_key; static int init_inodes(struct gfs2_sbd *sdp, int undo) { int error = 0; - struct inode *master = sdp->sd_master_dir->d_inode; + struct inode *master = d_inode(sdp->sd_master_dir); if (undo) goto fail_qinode; @@ -848,7 +849,7 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo) char buf[30]; int error = 0; struct gfs2_inode *ip; - struct inode *master = sdp->sd_master_dir->d_inode; + struct inode *master = d_inode(sdp->sd_master_dir); if (sdp->sd_args.ar_spectator) return 0; @@ -1357,7 +1358,7 @@ static struct dentry *gfs2_mount_meta(struct file_system_type *fs_type, return ERR_PTR(error); } s = sget(&gfs2_fs_type, test_gfs2_super, set_meta_super, flags, - path.dentry->d_inode->i_sb->s_bdev); + d_inode(path.dentry)->i_sb->s_bdev); path_put(&path); if (IS_ERR(s)) { pr_warn("gfs2 mount does not exist\n"); diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 3aa17d4d1cfc..9b61f92fcfdf 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -649,9 +649,117 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change) slot_hold(qd); } + if (change < 0) /* Reset quiet flag if we freed some blocks */ + clear_bit(QDF_QMSG_QUIET, &qd->qd_flags); mutex_unlock(&sdp->sd_quota_mutex); } +static int gfs2_write_buf_to_page(struct gfs2_inode *ip, unsigned long index, + unsigned off, void *buf, unsigned bytes) +{ + struct inode *inode = &ip->i_inode; + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct address_space *mapping = inode->i_mapping; + struct page *page; + struct buffer_head *bh; + void *kaddr; + u64 blk; + unsigned bsize = sdp->sd_sb.sb_bsize, bnum = 0, boff = 0; + unsigned to_write = bytes, pg_off = off; + int done = 0; + + blk = index << (PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift); + boff = off % bsize; + + page = find_or_create_page(mapping, index, GFP_NOFS); + if (!page) + return -ENOMEM; + if (!page_has_buffers(page)) + create_empty_buffers(page, bsize, 0); + + bh = page_buffers(page); + while (!done) { + /* Find the beginning block within the page */ + if (pg_off >= ((bnum * bsize) + bsize)) { + bh = bh->b_this_page; + bnum++; + blk++; + continue; + } + if (!buffer_mapped(bh)) { + gfs2_block_map(inode, blk, bh, 1); + if (!buffer_mapped(bh)) + goto unlock_out; + /* If it's a newly allocated disk block, zero it */ + if (buffer_new(bh)) + zero_user(page, bnum * bsize, bh->b_size); + } + if (PageUptodate(page)) + set_buffer_uptodate(bh); + if (!buffer_uptodate(bh)) { + ll_rw_block(READ | REQ_META, 1, &bh); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + goto unlock_out; + } + gfs2_trans_add_data(ip->i_gl, bh); + + /* If we need to write to the next block as well */ + if (to_write > (bsize - boff)) { + pg_off += (bsize - boff); + to_write -= (bsize - boff); + boff = pg_off % bsize; + continue; + } + done = 1; + } + + 
/* Write to the page, now that we have setup the buffer(s) */ + kaddr = kmap_atomic(page); + memcpy(kaddr + off, buf, bytes); + flush_dcache_page(page); + kunmap_atomic(kaddr); + unlock_page(page); + page_cache_release(page); + + return 0; + +unlock_out: + unlock_page(page); + page_cache_release(page); + return -EIO; +} + +static int gfs2_write_disk_quota(struct gfs2_inode *ip, struct gfs2_quota *qp, + loff_t loc) +{ + unsigned long pg_beg; + unsigned pg_off, nbytes, overflow = 0; + int pg_oflow = 0, error; + void *ptr; + + nbytes = sizeof(struct gfs2_quota); + + pg_beg = loc >> PAGE_CACHE_SHIFT; + pg_off = loc % PAGE_CACHE_SIZE; + + /* If the quota straddles a page boundary, split the write in two */ + if ((pg_off + nbytes) > PAGE_CACHE_SIZE) { + pg_oflow = 1; + overflow = (pg_off + nbytes) - PAGE_CACHE_SIZE; + } + + ptr = qp; + error = gfs2_write_buf_to_page(ip, pg_beg, pg_off, ptr, + nbytes - overflow); + /* If there's an overflow, write the remaining bytes to the next page */ + if (!error && pg_oflow) + error = gfs2_write_buf_to_page(ip, pg_beg + 1, 0, + ptr + nbytes - overflow, + overflow); + return error; +} + /** * gfs2_adjust_quota - adjust record of current block usage * @ip: The quota inode @@ -672,15 +780,8 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, { struct inode *inode = &ip->i_inode; struct gfs2_sbd *sdp = GFS2_SB(inode); - struct address_space *mapping = inode->i_mapping; - unsigned long index = loc >> PAGE_CACHE_SHIFT; - unsigned offset = loc & (PAGE_CACHE_SIZE - 1); - unsigned blocksize, iblock, pos; - struct buffer_head *bh; - struct page *page; - void *kaddr, *ptr; struct gfs2_quota q; - int err, nbytes; + int err; u64 size; if (gfs2_is_stuffed(ip)) { @@ -694,8 +795,11 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, if (err < 0) return err; + loc -= sizeof(q); /* gfs2_internal_read would've advanced the loc ptr */ err = -EIO; be64_add_cpu(&q.qu_value, change); + if (((s64)be64_to_cpu(q.qu_value)) < 0) + q.qu_value = 0; /* Never go negative on quota usage */ qd->qd_qb.qb_value = q.qu_value; if (fdq) { if (fdq->d_fieldmask & QC_SPC_SOFT) { @@ -712,79 +816,16 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, } } - /* Write the quota into the quota file on disk */ - ptr = &q; - nbytes = sizeof(struct gfs2_quota); -get_a_page: - page = find_or_create_page(mapping, index, GFP_NOFS); - if (!page) - return -ENOMEM; - - blocksize = inode->i_sb->s_blocksize; - iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); - - if (!page_has_buffers(page)) - create_empty_buffers(page, blocksize, 0); - - bh = page_buffers(page); - pos = blocksize; - while (offset >= pos) { - bh = bh->b_this_page; - iblock++; - pos += blocksize; - } - - if (!buffer_mapped(bh)) { - gfs2_block_map(inode, iblock, bh, 1); - if (!buffer_mapped(bh)) - goto unlock_out; - /* If it's a newly allocated disk block for quota, zero it */ - if (buffer_new(bh)) - zero_user(page, pos - blocksize, bh->b_size); - } - - if (PageUptodate(page)) - set_buffer_uptodate(bh); - - if (!buffer_uptodate(bh)) { - ll_rw_block(READ | REQ_META, 1, &bh); - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) - goto unlock_out; - } - - gfs2_trans_add_data(ip->i_gl, bh); - - kaddr = kmap_atomic(page); - if (offset + sizeof(struct gfs2_quota) > PAGE_CACHE_SIZE) - nbytes = PAGE_CACHE_SIZE - offset; - memcpy(kaddr + offset, ptr, nbytes); - flush_dcache_page(page); - kunmap_atomic(kaddr); - unlock_page(page); - page_cache_release(page); - - /* If quota straddles page boundary, 
we need to update the rest of the - * quota at the beginning of the next page */ - if ((offset + sizeof(struct gfs2_quota)) > PAGE_CACHE_SIZE) { - ptr = ptr + nbytes; - nbytes = sizeof(struct gfs2_quota) - nbytes; - offset = 0; - index++; - goto get_a_page; + err = gfs2_write_disk_quota(ip, &q, loc); + if (!err) { + size = loc + sizeof(struct gfs2_quota); + if (size > inode->i_size) + i_size_write(inode, size); + inode->i_mtime = inode->i_atime = CURRENT_TIME; + mark_inode_dirty(inode); + set_bit(QDF_REFRESH, &qd->qd_flags); } - size = loc + sizeof(struct gfs2_quota); - if (size > inode->i_size) - i_size_write(inode, size); - inode->i_mtime = inode->i_atime = CURRENT_TIME; - mark_inode_dirty(inode); - set_bit(QDF_REFRESH, &qd->qd_flags); - return 0; - -unlock_out: - unlock_page(page); - page_cache_release(page); return err; } @@ -923,6 +964,9 @@ restart: if (error) return error; + if (test_and_clear_bit(QDF_REFRESH, &qd->qd_flags)) + force_refresh = FORCE; + qd->qd_qb = *(struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr; if (force_refresh || qd->qd_qb.qb_magic != cpu_to_be32(GFS2_MAGIC)) { @@ -974,11 +1018,8 @@ int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid) sizeof(struct gfs2_quota_data *), sort_qd, NULL); for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) { - int force = NO_FORCE; qd = ip->i_res->rs_qa_qd[x]; - if (test_and_clear_bit(QDF_REFRESH, &qd->qd_flags)) - force = FORCE; - error = do_glock(qd, force, &ip->i_res->rs_qa_qd_ghs[x]); + error = do_glock(qd, NO_FORCE, &ip->i_res->rs_qa_qd_ghs[x]); if (error) break; } @@ -1094,14 +1135,33 @@ static int print_message(struct gfs2_quota_data *qd, char *type) return 0; } -int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid) +/** + * gfs2_quota_check - check if allocating new blocks will exceed quota + * @ip: The inode for which this check is being performed + * @uid: The uid to check against + * @gid: The gid to check against + * @ap: The allocation parameters. ap->target contains the requested + * blocks. ap->min_target, if set, contains the minimum blks + * requested. + * + * Returns: 0 on success. + * min_req = ap->min_target ? ap->min_target : ap->target; + * quota must allow at least min_req blks for success and + * ap->allowed is set to the number of blocks allowed + * + * -EDQUOT otherwise, quota violation. ap->allowed is set to the number + * of blocks available.
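
Worked illustration of the check that follows (made-up numbers): with qb_limit = 1000 blocks, a current value plus pending qd_change of 990, and ap->target = 64, ap->allowed is first capped to 1000 - 990 = 10; since 1000 < 990 + 64 the target cannot be met, so the call fails with -EDQUOT unless the caller passed ap->min_target <= 10, in which case it returns 0 and the caller may allocate up to the 10 blocks reported in ap->allowed.
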
+ */ +int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid, + struct gfs2_alloc_parms *ap) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_quota_data *qd; - s64 value; + s64 value, warn, limit; unsigned int x; int error = 0; + ap->allowed = UINT_MAX; /* Assume we are permitted a whole lot */ if (!test_bit(GIF_QD_LOCKED, &ip->i_flags)) return 0; @@ -1115,30 +1175,40 @@ int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid) qid_eq(qd->qd_id, make_kqid_gid(gid)))) continue; + warn = (s64)be64_to_cpu(qd->qd_qb.qb_warn); + limit = (s64)be64_to_cpu(qd->qd_qb.qb_limit); value = (s64)be64_to_cpu(qd->qd_qb.qb_value); spin_lock(&qd_lock); value += qd->qd_change; spin_unlock(&qd_lock); - if (be64_to_cpu(qd->qd_qb.qb_limit) && (s64)be64_to_cpu(qd->qd_qb.qb_limit) < value) { - print_message(qd, "exceeded"); - quota_send_warning(qd->qd_id, - sdp->sd_vfs->s_dev, QUOTA_NL_BHARDWARN); - - error = -EDQUOT; - break; - } else if (be64_to_cpu(qd->qd_qb.qb_warn) && - (s64)be64_to_cpu(qd->qd_qb.qb_warn) < value && + if (limit > 0 && (limit - value) < ap->allowed) + ap->allowed = limit - value; + /* If we can't meet the target */ + if (limit && limit < (value + (s64)ap->target)) { + /* If no min_target specified or we don't meet + * min_target, return -EDQUOT */ + if (!ap->min_target || ap->min_target > ap->allowed) { + if (!test_and_set_bit(QDF_QMSG_QUIET, + &qd->qd_flags)) { + print_message(qd, "exceeded"); + quota_send_warning(qd->qd_id, + sdp->sd_vfs->s_dev, + QUOTA_NL_BHARDWARN); + } + error = -EDQUOT; + break; + } + } else if (warn && warn < value && time_after_eq(jiffies, qd->qd_last_warn + - gfs2_tune_get(sdp, - gt_quota_warn_period) * HZ)) { + gfs2_tune_get(sdp, gt_quota_warn_period) + * HZ)) { quota_send_warning(qd->qd_id, sdp->sd_vfs->s_dev, QUOTA_NL_BSOFTWARN); error = print_message(qd, "warning"); qd->qd_last_warn = jiffies; } } - return error; } @@ -1468,32 +1538,34 @@ int gfs2_quotad(void *data) return 0; } -static int gfs2_quota_get_xstate(struct super_block *sb, - struct fs_quota_stat *fqs) +static int gfs2_quota_get_state(struct super_block *sb, struct qc_state *state) { struct gfs2_sbd *sdp = sb->s_fs_info; - memset(fqs, 0, sizeof(struct fs_quota_stat)); - fqs->qs_version = FS_QSTAT_VERSION; + memset(state, 0, sizeof(*state)); switch (sdp->sd_args.ar_quota) { case GFS2_QUOTA_ON: - fqs->qs_flags |= (FS_QUOTA_UDQ_ENFD | FS_QUOTA_GDQ_ENFD); + state->s_state[USRQUOTA].flags |= QCI_LIMITS_ENFORCED; + state->s_state[GRPQUOTA].flags |= QCI_LIMITS_ENFORCED; /*FALLTHRU*/ case GFS2_QUOTA_ACCOUNT: - fqs->qs_flags |= (FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT); + state->s_state[USRQUOTA].flags |= QCI_ACCT_ENABLED | + QCI_SYSFILE; + state->s_state[GRPQUOTA].flags |= QCI_ACCT_ENABLED | + QCI_SYSFILE; break; case GFS2_QUOTA_OFF: break; } - if (sdp->sd_quota_inode) { - fqs->qs_uquota.qfs_ino = GFS2_I(sdp->sd_quota_inode)->i_no_addr; - fqs->qs_uquota.qfs_nblks = sdp->sd_quota_inode->i_blocks; + state->s_state[USRQUOTA].ino = + GFS2_I(sdp->sd_quota_inode)->i_no_addr; + state->s_state[USRQUOTA].blocks = sdp->sd_quota_inode->i_blocks; } - fqs->qs_uquota.qfs_nextents = 1; /* unsupported */ - fqs->qs_gquota = fqs->qs_uquota; /* its the same inode in both cases */ - fqs->qs_incoredqs = list_lru_count(&gfs2_qd_lru); + state->s_state[USRQUOTA].nextents = 1; /* unsupported */ + state->s_state[GRPQUOTA] = state->s_state[USRQUOTA]; + state->s_incoredqs = list_lru_count(&gfs2_qd_lru); return 0; } @@ -1620,6 +1692,8 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid, /* 
Apply changes */ error = gfs2_adjust_quota(ip, offset, 0, qd, fdq); + if (!error) + clear_bit(QDF_QMSG_QUIET, &qd->qd_flags); gfs2_trans_end(sdp); out_release: @@ -1638,7 +1712,7 @@ out_put: const struct quotactl_ops gfs2_quotactl_ops = { .quota_sync = gfs2_quota_sync, - .get_xstate = gfs2_quota_get_xstate, + .get_state = gfs2_quota_get_state, .get_dqblk = gfs2_get_dqblk, .set_dqblk = gfs2_set_dqblk, }; diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index 55d506eb3c4a..ad04b3acae2b 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -24,7 +24,8 @@ extern void gfs2_quota_unhold(struct gfs2_inode *ip); extern int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid); extern void gfs2_quota_unlock(struct gfs2_inode *ip); -extern int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid); +extern int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid, + struct gfs2_alloc_parms *ap); extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change, kuid_t uid, kgid_t gid); @@ -37,7 +38,8 @@ extern int gfs2_quotad(void *data); extern void gfs2_wake_up_statfs(struct gfs2_sbd *sdp); -static inline int gfs2_quota_lock_check(struct gfs2_inode *ip) +static inline int gfs2_quota_lock_check(struct gfs2_inode *ip, + struct gfs2_alloc_parms *ap) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); int ret; @@ -48,7 +50,7 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip) return ret; if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON) return 0; - ret = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid); + ret = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid, ap); if (ret) gfs2_quota_unlock(ip); return ret; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 9150207f365c..c6c62321dfd6 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -978,10 +978,10 @@ static void set_rgrp_preferences(struct gfs2_sbd *sdp) rgd->rd_flags |= GFS2_RDF_PREFERRED; for (i = 0; i < sdp->sd_journals; i++) { rgd = gfs2_rgrpd_get_next(rgd); - if (rgd == first) + if (!rgd || rgd == first) break; } - } while (rgd != first); + } while (rgd && rgd != first); } /** @@ -1244,14 +1244,13 @@ int gfs2_rgrp_go_lock(struct gfs2_holder *gh) } /** - * gfs2_rgrp_go_unlock - Release RG bitmaps read in with gfs2_rgrp_bh_get() - * @gh: The glock holder for the resource group + * gfs2_rgrp_brelse - Release RG bitmaps read in with gfs2_rgrp_bh_get() + * @rgd: The resource group * */ -void gfs2_rgrp_go_unlock(struct gfs2_holder *gh) +void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd) { - struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object; int x, length = rgd->rd_length; for (x = 0; x < length; x++) { @@ -1264,6 +1263,22 @@ void gfs2_rgrp_go_unlock(struct gfs2_holder *gh) } +/** + * gfs2_rgrp_go_unlock - Unlock a rgrp glock + * @gh: The glock holder for the resource group + * + */ + +void gfs2_rgrp_go_unlock(struct gfs2_holder *gh) +{ + struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object; + int demote_requested = test_bit(GLF_DEMOTE, &gh->gh_gl->gl_flags) | + test_bit(GLF_PENDING_DEMOTE, &gh->gh_gl->gl_flags); + + if (rgd && demote_requested) + gfs2_rgrp_brelse(rgd); +} + int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, struct buffer_head *bh, const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed) @@ -1711,10 +1726,8 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext, return ret; bitmap_full: /* Mark bitmap as full and fall through */ - if ((state == GFS2_BLKST_FREE) && initial_offset == 0) { - struct gfs2_bitmap *bi = rbm_bi(rbm); + if ((state == GFS2_BLKST_FREE) && 
initial_offset == 0) set_bit(GBF_FULL, &bi->bi_flags); - } next_bitmap: /* Find next bitmap in the rgrp */ rbm->offset = 0; @@ -1850,14 +1863,23 @@ static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops) const struct gfs2_sbd *sdp = gl->gl_sbd; struct gfs2_lkstats *st; s64 r_dcount, l_dcount; - s64 r_srttb, l_srttb; + s64 l_srttb, a_srttb = 0; s64 srttb_diff; s64 sqr_diff; s64 var; + int cpu, nonzero = 0; preempt_disable(); + for_each_present_cpu(cpu) { + st = &per_cpu_ptr(sdp->sd_lkstats, cpu)->lkstats[LM_TYPE_RGRP]; + if (st->stats[GFS2_LKS_SRTTB]) { + a_srttb += st->stats[GFS2_LKS_SRTTB]; + nonzero++; + } + } st = &this_cpu_ptr(sdp->sd_lkstats)->lkstats[LM_TYPE_RGRP]; - r_srttb = st->stats[GFS2_LKS_SRTTB]; + if (nonzero) + do_div(a_srttb, nonzero); r_dcount = st->stats[GFS2_LKS_DCOUNT]; var = st->stats[GFS2_LKS_SRTTVARB] + gl->gl_stats.stats[GFS2_LKS_SRTTVARB]; @@ -1866,10 +1888,10 @@ static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops) l_srttb = gl->gl_stats.stats[GFS2_LKS_SRTTB]; l_dcount = gl->gl_stats.stats[GFS2_LKS_DCOUNT]; - if ((l_dcount < 1) || (r_dcount < 1) || (r_srttb == 0)) + if ((l_dcount < 1) || (r_dcount < 1) || (a_srttb == 0)) return false; - srttb_diff = r_srttb - l_srttb; + srttb_diff = a_srttb - l_srttb; sqr_diff = srttb_diff * srttb_diff; var *= 2; @@ -1946,10 +1968,18 @@ static inline int fast_to_acquire(struct gfs2_rgrpd *rgd) * @ip: the inode to reserve space for * @ap: the allocation parameters * - * Returns: errno + * We try our best to find an rgrp that has at least ap->target blocks + * available. After a couple of passes (loops == 2), the prospects of finding + * such an rgrp diminish. At this stage, we return the first rgrp that has + * at least ap->min_target blocks available. Either way, we set ap->allowed to + * the number of blocks available in the chosen rgrp.
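The target/min_target/allowed contract spelled out in that comment is easiest to see from the caller's side. Below is a minimal hypothetical sketch, not code from this patch: example_reserve_some and its block count are invented, while gfs2_quota_lock_check, gfs2_inplace_reserve, gfs2_inplace_release, gfs2_quota_unlock and the gfs2_alloc_parms fields are the ones the patch introduces or touches. The caller asks for a large extent, accepts as little as one block, and lets ap.allowed cap what it actually allocates.

static int example_reserve_some(struct gfs2_inode *ip, u32 blocks)
{
	struct gfs2_alloc_parms ap = { .target = blocks, .min_target = 1 };
	int error;

	/* Quota check lowers ap.allowed to what the quota still permits. */
	error = gfs2_quota_lock_check(ip, &ap);
	if (error)
		return error;

	/* May lower ap.allowed further, to what the chosen rgrp holds. */
	error = gfs2_inplace_reserve(ip, &ap);
	if (error)
		goto out_qunlock;

	if (ap.allowed < blocks)
		blocks = ap.allowed;
	/* ... allocate "blocks" blocks with gfs2_alloc_blocks() ... */

	gfs2_inplace_release(ip);
out_qunlock:
	gfs2_quota_unlock(ip);
	return error;
}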
+ * + * Returns: 0 on success, + * -ENOMEM if a suitable rgrp can't be found + * errno otherwise */ -int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *ap) +int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrpd *begin = NULL; @@ -2012,7 +2042,7 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *a /* Skip unuseable resource groups */ if ((rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR)) || - (ap->target > rs->rs_rbm.rgd->rd_extfail_pt)) + (loops == 0 && ap->target > rs->rs_rbm.rgd->rd_extfail_pt)) goto skip_rgrp; if (sdp->sd_args.ar_rgrplvb) @@ -2027,11 +2057,13 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *a goto check_rgrp; /* If rgrp has enough free space, use it */ - if (rs->rs_rbm.rgd->rd_free_clone >= ap->target) { + if (rs->rs_rbm.rgd->rd_free_clone >= ap->target || + (loops == 2 && ap->min_target && + rs->rs_rbm.rgd->rd_free_clone >= ap->min_target)) { ip->i_rgd = rs->rs_rbm.rgd; + ap->allowed = ip->i_rgd->rd_free_clone; return 0; } - check_rgrp: /* Check for unlinked inodes which can be reclaimed */ if (rs->rs_rbm.rgd->rd_flags & GFS2_RDF_CHECK) diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index b104f4af3afd..c0ab33fa3eed 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -36,12 +36,14 @@ extern void gfs2_clear_rgrpd(struct gfs2_sbd *sdp); extern int gfs2_rindex_update(struct gfs2_sbd *sdp); extern void gfs2_free_clones(struct gfs2_rgrpd *rgd); extern int gfs2_rgrp_go_lock(struct gfs2_holder *gh); +extern void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd); extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh); extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); #define GFS2_AF_ORLOV 1 -extern int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *ap); +extern int gfs2_inplace_reserve(struct gfs2_inode *ip, + struct gfs2_alloc_parms *ap); extern void gfs2_inplace_release(struct gfs2_inode *ip); extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n, diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 1666382b198d..2982445947e1 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -748,7 +748,7 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc) if (wbc->sync_mode == WB_SYNC_ALL) gfs2_log_flush(GFS2_SB(inode), ip->i_gl, NORMAL_FLUSH); - if (bdi->dirty_exceeded) + if (bdi->wb.dirty_exceeded) gfs2_ail1_flush(sdp, wbc); else filemap_fdatawrite(metamapping); @@ -1171,7 +1171,7 @@ static int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *s static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf) { - struct super_block *sb = dentry->d_inode->i_sb; + struct super_block *sb = d_inode(dentry)->i_sb; struct gfs2_sbd *sdp = sb->s_fs_info; struct gfs2_statfs_change_host sc; int error; diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index ae8e8811f0e8..c9ff1cf7d4f3 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -101,8 +101,11 @@ static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf) static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len) { - int error; - int n = simple_strtol(buf, NULL, 0); + int error, n; + + error = kstrtoint(buf, 0, &n); + if (error) + return error; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -134,10 +137,16 @@ static ssize_t withdraw_show(struct gfs2_sbd *sdp, char *buf) static ssize_t withdraw_store(struct gfs2_sbd *sdp, const char 
*buf, size_t len) { + int error, val; + if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (simple_strtol(buf, NULL, 0) != 1) + error = kstrtoint(buf, 0, &val); + if (error) + return error; + + if (val != 1) return -EINVAL; gfs2_lm_withdraw(sdp, "withdrawing from cluster at user's request\n"); @@ -148,10 +157,16 @@ static ssize_t withdraw_store(struct gfs2_sbd *sdp, const char *buf, size_t len) static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf, size_t len) { + int error, val; + if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (simple_strtol(buf, NULL, 0) != 1) + error = kstrtoint(buf, 0, &val); + if (error) + return error; + + if (val != 1) return -EINVAL; gfs2_statfs_sync(sdp->sd_vfs, 0); @@ -161,10 +176,16 @@ static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf, static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf, size_t len) { + int error, val; + if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (simple_strtol(buf, NULL, 0) != 1) + error = kstrtoint(buf, 0, &val); + if (error) + return error; + + if (val != 1) return -EINVAL; gfs2_quota_sync(sdp->sd_vfs, 0); @@ -181,7 +202,9 @@ static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - id = simple_strtoul(buf, NULL, 0); + error = kstrtou32(buf, 0, &id); + if (error) + return error; qid = make_kqid(current_user_ns(), USRQUOTA, id); if (!qid_valid(qid)) @@ -201,7 +224,9 @@ static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - id = simple_strtoul(buf, NULL, 0); + error = kstrtou32(buf, 0, &id); + if (error) + return error; qid = make_kqid(current_user_ns(), GRPQUOTA, id); if (!qid_valid(qid)) @@ -324,10 +349,11 @@ static ssize_t block_show(struct gfs2_sbd *sdp, char *buf) static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len) { struct lm_lockstruct *ls = &sdp->sd_lockstruct; - ssize_t ret = len; - int val; + int ret, val; - val = simple_strtol(buf, NULL, 0); + ret = kstrtoint(buf, 0, &val); + if (ret) + return ret; if (val == 1) set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); @@ -336,9 +362,9 @@ static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len) smp_mb__after_atomic(); gfs2_glock_thaw(sdp); } else { - ret = -EINVAL; + return -EINVAL; } - return ret; + return len; } static ssize_t wdack_show(struct gfs2_sbd *sdp, char *buf) @@ -350,17 +376,18 @@ static ssize_t wdack_show(struct gfs2_sbd *sdp, char *buf) static ssize_t wdack_store(struct gfs2_sbd *sdp, const char *buf, size_t len) { - ssize_t ret = len; - int val; + int ret, val; - val = simple_strtol(buf, NULL, 0); + ret = kstrtoint(buf, 0, &val); + if (ret) + return ret; if ((val == 1) && !strcmp(sdp->sd_lockstruct.ls_ops->lm_proto_name, "lock_dlm")) complete(&sdp->sd_wdack); else - ret = -EINVAL; - return ret; + return -EINVAL; + return len; } static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf) @@ -553,11 +580,14 @@ static ssize_t tune_set(struct gfs2_sbd *sdp, unsigned int *field, { struct gfs2_tune *gt = &sdp->sd_tune; unsigned int x; + int error; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - x = simple_strtoul(buf, NULL, 0); + error = kstrtouint(buf, 0, &x); + if (error) + return error; if (check_zero && !x) return -EINVAL; diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 0b81f783f787..4c096fa9e2a1 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -420,7 +420,7 @@ static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh, 
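The sysfs conversions in these hunks all share one shape: simple_strtol(), which silently accepts trailing garbage and cannot report overflow, gives way to kstrtoint()/kstrtouint()/kstrtou32(), which reject malformed input with an errno. A minimal sketch of the resulting store handler; example_store and its val == 1 trigger are illustrative, mirroring statfs_sync_store above:

static ssize_t example_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
{
	int error, val;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/* Fails with -EINVAL/-ERANGE on junk or overflow; no silent zero. */
	error = kstrtoint(buf, 0, &val);
	if (error)
		return error;

	if (val != 1)
		return -EINVAL;

	/* ... perform the one-shot action ... */
	return len;	/* a sysfs store returns the bytes it consumed */
}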
ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size) { - struct gfs2_inode *ip = GFS2_I(dentry->d_inode); + struct gfs2_inode *ip = GFS2_I(d_inode(dentry)); struct gfs2_ea_request er; struct gfs2_holder i_gh; int error; @@ -586,7 +586,7 @@ out: static int gfs2_xattr_get(struct dentry *dentry, const char *name, void *buffer, size_t size, int type) { - struct gfs2_inode *ip = GFS2_I(dentry->d_inode); + struct gfs2_inode *ip = GFS2_I(d_inode(dentry)); struct gfs2_ea_location el; int error; @@ -732,7 +732,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, if (error) return error; - error = gfs2_quota_lock_check(ip); + error = gfs2_quota_lock_check(ip, &ap); if (error) return error; @@ -1230,7 +1230,7 @@ int __gfs2_xattr_set(struct inode *inode, const char *name, static int gfs2_xattr_set(struct dentry *dentry, const char *name, const void *value, size_t size, int flags, int type) { - return __gfs2_xattr_set(dentry->d_inode, name, value, + return __gfs2_xattr_set(d_inode(dentry), name, value, size, flags, type); } diff --git a/fs/hfs/attr.c b/fs/hfs/attr.c index e057ec542a6a..8d931b157bbe 100644 --- a/fs/hfs/attr.c +++ b/fs/hfs/attr.c @@ -16,7 +16,7 @@ int hfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct hfs_find_data fd; hfs_cat_rec rec; struct hfs_cat_file *file; @@ -59,7 +59,7 @@ out: ssize_t hfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct hfs_find_data fd; hfs_cat_rec rec; struct hfs_cat_file *file; @@ -105,7 +105,7 @@ out: ssize_t hfs_listxattr(struct dentry *dentry, char *buffer, size_t size) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); if (!S_ISREG(inode->i_mode) || HFS_IS_RSRC(inode)) return -EOPNOTSUPP; diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index 145566851e7a..70788e03820a 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -197,7 +197,7 @@ static int hfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, inode = hfs_new_inode(dir, &dentry->d_name, mode); if (!inode) - return -ENOSPC; + return -ENOMEM; res = hfs_cat_create(inode->i_ino, dir, &dentry->d_name, inode); if (res) { @@ -226,7 +226,7 @@ static int hfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) inode = hfs_new_inode(dir, &dentry->d_name, S_IFDIR | mode); if (!inode) - return -ENOSPC; + return -ENOMEM; res = hfs_cat_create(inode->i_ino, dir, &dentry->d_name, inode); if (res) { @@ -253,7 +253,7 @@ static int hfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) */ static int hfs_remove(struct inode *dir, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int res; if (S_ISDIR(inode->i_mode) && inode->i_size != 2) @@ -285,18 +285,18 @@ static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry, int res; /* Unlink destination if it already exists */ - if (new_dentry->d_inode) { + if (d_really_is_positive(new_dentry)) { res = hfs_remove(new_dir, new_dentry); if (res) return res; } - res = hfs_cat_move(old_dentry->d_inode->i_ino, + res = hfs_cat_move(d_inode(old_dentry)->i_ino, old_dir, &old_dentry->d_name, new_dir, &new_dentry->d_name); if (!res) hfs_cat_build_key(old_dir->i_sb, - (btree_key *)&HFS_I(old_dentry->d_inode)->cat_key, + (btree_key 
*)&HFS_I(d_inode(old_dentry))->cat_key, new_dir->i_ino, &new_dentry->d_name); return res; } diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index 95d255219b1e..1f1c7dcbcc2f 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -252,7 +252,7 @@ extern void hfs_mark_mdb_dirty(struct super_block *sb); #define __hfs_u_to_mtime(sec) cpu_to_be32(sec + 2082844800U - sys_tz.tz_minuteswest * 60) #define __hfs_m_to_utime(sec) (be32_to_cpu(sec) - 2082844800U + sys_tz.tz_minuteswest * 60) -#define HFS_I(inode) (list_entry(inode, struct hfs_inode_info, vfs_inode)) +#define HFS_I(inode) (container_of(inode, struct hfs_inode_info, vfs_inode)) #define HFS_SB(sb) ((struct hfs_sb_info *)(sb)->s_fs_info) #define hfs_m_to_utime(time) (struct timespec){ .tv_sec = __hfs_m_to_utime(time) } diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index d0929bc81782..b99ebddb10cb 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -14,7 +14,7 @@ #include <linux/pagemap.h> #include <linux/mpage.h> #include <linux/sched.h> -#include <linux/aio.h> +#include <linux/uio.h> #include "hfs_fs.h" #include "btree.h" @@ -124,8 +124,8 @@ static int hfs_releasepage(struct page *page, gfp_t mask) return res ? try_to_free_buffers(page) : 0; } -static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb, - struct iov_iter *iter, loff_t offset) +static ssize_t hfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, + loff_t offset) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -133,13 +133,13 @@ static ssize_t hfs_direct_IO(int rw, struct kiocb *iocb, size_t count = iov_iter_count(iter); ssize_t ret; - ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, hfs_get_block); + ret = blockdev_direct_IO(iocb, inode, iter, offset, hfs_get_block); /* * In case of error extending write may have instantiated a few * blocks outside i_size. Trim these off again. 
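The hfs and hfsplus ->direct_IO hunks here track two VFS API changes at once: the int rw argument is gone from the method signature, and the transfer direction is instead read back out of the iov_iter with iov_iter_rw(). A condensed sketch of the resulting shape; example_direct_IO and example_get_block are placeholders, not functions from this patch:

static ssize_t example_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
				 loff_t offset)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	ret = blockdev_direct_IO(iocb, inode, iter, offset, example_get_block);

	/* iov_iter_rw() returns READ or WRITE, replacing the old rw flag. */
	if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) {
		/* trim any blocks instantiated beyond i_size, as hfs does */
	}
	return ret;
}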
*/ - if (unlikely((rw & WRITE) && ret < 0)) { + if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) { loff_t isize = i_size_read(inode); loff_t end = offset + count; @@ -600,7 +600,7 @@ static int hfs_file_release(struct inode *inode, struct file *file) int hfs_inode_setattr(struct dentry *dentry, struct iattr * attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct hfs_sb_info *hsb = HFS_SB(inode->i_sb); int error; @@ -674,9 +674,7 @@ static int hfs_file_fsync(struct file *filp, loff_t start, loff_t end, static const struct file_operations hfs_file_operations = { .llseek = generic_file_llseek, - .read = new_sync_read, .read_iter = generic_file_read_iter, - .write = new_sync_write, .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .splice_read = generic_file_splice_read, diff --git a/fs/hfs/super.c b/fs/hfs/super.c index eee7206c38d1..55c03b9e9070 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -14,6 +14,7 @@ #include <linux/module.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/mount.h> #include <linux/init.h> #include <linux/nls.h> diff --git a/fs/hfs/sysdep.c b/fs/hfs/sysdep.c index 91b91fd3a901..2875961fdc10 100644 --- a/fs/hfs/sysdep.c +++ b/fs/hfs/sysdep.c @@ -21,7 +21,7 @@ static int hfs_revalidate_dentry(struct dentry *dentry, unsigned int flags) if (flags & LOOKUP_RCU) return -ECHILD; - inode = dentry->d_inode; + inode = d_inode(dentry); if(!inode) return 1; diff --git a/fs/hfsplus/bfind.c b/fs/hfsplus/bfind.c index c1422d91cd36..528e38b5af7f 100644 --- a/fs/hfsplus/bfind.c +++ b/fs/hfsplus/bfind.c @@ -118,9 +118,7 @@ int __hfs_brec_find(struct hfs_bnode *bnode, struct hfs_find_data *fd, int b, e; int res; - if (!rec_found) - BUG(); - + BUG_ON(!rec_found); b = 0; e = bnode->num_recs - 1; res = -ENOENT; diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c index 7892e6fddb66..022974ab6e3c 100644 --- a/fs/hfsplus/catalog.c +++ b/fs/hfsplus/catalog.c @@ -350,10 +350,11 @@ int hfsplus_delete_cat(u32 cnid, struct inode *dir, struct qstr *str) &fd.search_key->cat.name.unicode, off + 2, len); fd.search_key->key_len = cpu_to_be16(6 + len); - } else + } else { err = hfsplus_cat_build_key(sb, fd.search_key, dir->i_ino, str); if (unlikely(err)) goto out; + } err = hfs_brec_find(&fd, hfs_find_rec_by_key); if (err) diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index f0235c1640af..d0f39dcbb58e 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -81,7 +81,7 @@ again: HFSPLUS_I(HFSPLUS_SB(sb)->hidden_dir)-> create_date || entry.file.create_date == - HFSPLUS_I(sb->s_root->d_inode)-> + HFSPLUS_I(d_inode(sb->s_root))-> create_date) && HFSPLUS_SB(sb)->hidden_dir) { struct qstr str; @@ -296,8 +296,8 @@ static int hfsplus_link(struct dentry *src_dentry, struct inode *dst_dir, struct dentry *dst_dentry) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(dst_dir->i_sb); - struct inode *inode = src_dentry->d_inode; - struct inode *src_dir = src_dentry->d_parent->d_inode; + struct inode *inode = d_inode(src_dentry); + struct inode *src_dir = d_inode(src_dentry->d_parent); struct qstr str; char name[32]; u32 cnid, id; @@ -353,7 +353,7 @@ out: static int hfsplus_unlink(struct inode *dir, struct dentry *dentry) { struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb); - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct qstr str; char name[32]; u32 cnid; @@ -410,7 +410,7 @@ out: static int hfsplus_rmdir(struct inode *dir, struct dentry *dentry) { struct hfsplus_sb_info *sbi = 
HFSPLUS_SB(dir->i_sb); - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int res; if (inode->i_size != 2) @@ -434,7 +434,7 @@ static int hfsplus_symlink(struct inode *dir, struct dentry *dentry, { struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb); struct inode *inode; - int res = -ENOSPC; + int res = -ENOMEM; mutex_lock(&sbi->vh_mutex); inode = hfsplus_new_inode(dir->i_sb, S_IFLNK | S_IRWXUGO); @@ -476,7 +476,7 @@ static int hfsplus_mknod(struct inode *dir, struct dentry *dentry, { struct hfsplus_sb_info *sbi = HFSPLUS_SB(dir->i_sb); struct inode *inode; - int res = -ENOSPC; + int res = -ENOMEM; mutex_lock(&sbi->vh_mutex); inode = hfsplus_new_inode(dir->i_sb, mode); @@ -529,7 +529,7 @@ static int hfsplus_rename(struct inode *old_dir, struct dentry *old_dentry, int res; /* Unlink destination if it already exists */ - if (new_dentry->d_inode) { + if (d_really_is_positive(new_dentry)) { if (d_is_dir(new_dentry)) res = hfsplus_rmdir(new_dir, new_dentry); else diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index b0441d65fa54..f91a1faf819e 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -263,7 +263,7 @@ struct hfsplus_inode_info { static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode) { - return list_entry(inode, struct hfsplus_inode_info, vfs_inode); + return container_of(inode, struct hfsplus_inode_info, vfs_inode); } /* diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 0cf786f2d046..6dd107d7421e 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -14,7 +14,7 @@ #include <linux/pagemap.h> #include <linux/mpage.h> #include <linux/sched.h> -#include <linux/aio.h> +#include <linux/uio.h> #include "hfsplus_fs.h" #include "hfsplus_raw.h" @@ -122,8 +122,8 @@ static int hfsplus_releasepage(struct page *page, gfp_t mask) return res ? try_to_free_buffers(page) : 0; } -static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb, - struct iov_iter *iter, loff_t offset) +static ssize_t hfsplus_direct_IO(struct kiocb *iocb, struct iov_iter *iter, + loff_t offset) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -131,14 +131,13 @@ static ssize_t hfsplus_direct_IO(int rw, struct kiocb *iocb, size_t count = iov_iter_count(iter); ssize_t ret; - ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, - hfsplus_get_block); + ret = blockdev_direct_IO(iocb, inode, iter, offset, hfsplus_get_block); /* * In case of error extending write may have instantiated a few * blocks outside i_size. Trim these off again. 
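The HFS_I()/HFSPLUS_I()/hpfs_i() cleanups in this series replace list_entry() with container_of(). Both expand to the same pointer arithmetic (list_entry() is literally defined as container_of()), but the latter names the actual intent: recover the structure that embeds a given member. A self-contained toy version of the pattern; example_inode_info is invented, and the kernel's container_of() additionally type-checks the member pointer, which is omitted here:

#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct inode { int i_dummy; };		/* stand-in for the VFS inode */

struct example_inode_info {
	unsigned long flags;
	struct inode vfs_inode;		/* embedded, as the real fs-infos do */
};

static struct example_inode_info *EXAMPLE_I(struct inode *inode)
{
	return container_of(inode, struct example_inode_info, vfs_inode);
}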
*/ - if (unlikely((rw & WRITE) && ret < 0)) { + if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) { loff_t isize = i_size_read(inode); loff_t end = offset + count; @@ -244,7 +243,7 @@ static int hfsplus_file_release(struct inode *inode, struct file *file) static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int error; error = inode_change_ok(inode, attr); @@ -254,6 +253,12 @@ static int hfsplus_setattr(struct dentry *dentry, struct iattr *attr) if ((attr->ia_valid & ATTR_SIZE) && attr->ia_size != i_size_read(inode)) { inode_dio_wait(inode); + if (attr->ia_size > inode->i_size) { + error = generic_cont_expand_simple(inode, + attr->ia_size); + if (error) + return error; + } truncate_setsize(inode, attr->ia_size); hfsplus_file_truncate(inode); } @@ -341,9 +346,7 @@ static const struct inode_operations hfsplus_file_inode_operations = { static const struct file_operations hfsplus_file_operations = { .llseek = generic_file_llseek, - .read = new_sync_read, .read_iter = generic_file_read_iter, - .write = new_sync_write, .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .splice_read = generic_file_splice_read, diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c index d3ff5cc317d7..0624ce4e0702 100644 --- a/fs/hfsplus/ioctl.c +++ b/fs/hfsplus/ioctl.c @@ -26,7 +26,7 @@ static int hfsplus_ioctl_bless(struct file *file, int __user *user_flags) { struct dentry *dentry = file->f_path.dentry; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct hfsplus_sb_info *sbi = HFSPLUS_SB(inode->i_sb); struct hfsplus_vh *vh = sbi->s_vhdr; struct hfsplus_vh *bvh = sbi->s_backup_vhdr; @@ -76,7 +76,7 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags) { struct inode *inode = file_inode(file); struct hfsplus_inode_info *hip = HFSPLUS_I(inode); - unsigned int flags; + unsigned int flags, new_fl = 0; int err = 0; err = mnt_want_write_file(file); @@ -110,14 +110,12 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags) } if (flags & FS_IMMUTABLE_FL) - inode->i_flags |= S_IMMUTABLE; - else - inode->i_flags &= ~S_IMMUTABLE; + new_fl |= S_IMMUTABLE; if (flags & FS_APPEND_FL) - inode->i_flags |= S_APPEND; - else - inode->i_flags &= ~S_APPEND; + new_fl |= S_APPEND; + + inode_set_flags(inode, new_fl, S_IMMUTABLE | S_APPEND); if (flags & FS_NODUMP_FL) hip->userflags |= HFSPLUS_FLG_NODUMP; diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 593af2fdcc2d..7302d96ae8bf 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -11,6 +11,7 @@ #include <linux/init.h> #include <linux/pagemap.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/fs.h> #include <linux/slab.h> #include <linux/vfs.h> diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index d98094a9f476..416b1dbafe51 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -44,7 +44,7 @@ static int strcmp_xattr_acl(const char *name) return -1; } -static inline int is_known_namespace(const char *name) +static bool is_known_namespace(const char *name) { if (strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) && strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) && @@ -424,6 +424,28 @@ static int copy_name(char *buffer, const char *xattr_name, int name_len) return len; } +int hfsplus_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, + const char *prefix, size_t prefixlen) +{ + 
char *xattr_name; + int res; + + if (!strcmp(name, "")) + return -EINVAL; + + xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1, + GFP_KERNEL); + if (!xattr_name) + return -ENOMEM; + strcpy(xattr_name, prefix); + strcpy(xattr_name + prefixlen, name); + res = __hfsplus_setxattr(d_inode(dentry), xattr_name, value, size, + flags); + kfree(xattr_name); + return res; +} + static ssize_t hfsplus_getxattr_finder_info(struct inode *inode, void *value, size_t size) { @@ -560,6 +582,30 @@ failed_getxattr_init: return res; } +ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, + void *value, size_t size, + const char *prefix, size_t prefixlen) +{ + int res; + char *xattr_name; + + if (!strcmp(name, "")) + return -EINVAL; + + xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1, + GFP_KERNEL); + if (!xattr_name) + return -ENOMEM; + + strcpy(xattr_name, prefix); + strcpy(xattr_name + prefixlen, name); + + res = __hfsplus_getxattr(d_inode(dentry), xattr_name, value, size); + kfree(xattr_name); + return res; + +} + static inline int can_list(const char *xattr_name) { if (!xattr_name) @@ -574,7 +620,7 @@ static ssize_t hfsplus_listxattr_finder_info(struct dentry *dentry, char *buffer, size_t size) { ssize_t res = 0; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct hfs_find_data fd; u16 entry_type; u8 folder_finder_info[sizeof(struct DInfo) + sizeof(struct DXInfo)]; @@ -642,7 +688,7 @@ ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size) { ssize_t err; ssize_t res = 0; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct hfs_find_data fd; u16 key_len = 0; struct hfsplus_attr_key attr_key; @@ -806,9 +852,6 @@ end_removexattr: static int hfsplus_osx_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size, int type) { - char *xattr_name; - int res; - if (!strcmp(name, "")) return -EINVAL; @@ -818,24 +861,19 @@ static int hfsplus_osx_getxattr(struct dentry *dentry, const char *name, */ if (is_known_namespace(name)) return -EOPNOTSUPP; - xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN - + XATTR_MAC_OSX_PREFIX_LEN + 1, GFP_KERNEL); - if (!xattr_name) - return -ENOMEM; - strcpy(xattr_name, XATTR_MAC_OSX_PREFIX); - strcpy(xattr_name + XATTR_MAC_OSX_PREFIX_LEN, name); - res = hfsplus_getxattr(dentry, xattr_name, buffer, size); - kfree(xattr_name); - return res; + /* + * osx is the namespace we use to indicate an unprefixed + * attribute on the filesystem (like the ones that OS X + * creates), so we pass the name through unmodified (after + * ensuring it doesn't conflict with another namespace). 
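The new hfsplus_getxattr()/hfsplus_setxattr() helpers above hoist the per-namespace boilerplate (allocate a buffer, copy the prefix, append the name) out of each handler, which then shrinks to a one-line call carrying its prefix and prefix length. The composition step itself, sketched standalone in userspace C; compose_xattr_name is invented, and note that kernel prefixes such as "user." already include the trailing dot:

#include <stdlib.h>
#include <string.h>

static char *compose_xattr_name(const char *prefix, size_t prefixlen,
				const char *name)
{
	/*
	 * Exact sizing: prefix + name + NUL. The kernel helpers above
	 * instead allocate a fixed worst-case buffer up front.
	 */
	char *full = malloc(prefixlen + strlen(name) + 1);

	if (!full)
		return NULL;
	memcpy(full, prefix, prefixlen);
	strcpy(full + prefixlen, name);	/* e.g. "user." + "comment" */
	return full;
}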
+ */ + return __hfsplus_getxattr(d_inode(dentry), name, buffer, size); } static int hfsplus_osx_setxattr(struct dentry *dentry, const char *name, const void *buffer, size_t size, int flags, int type) { - char *xattr_name; - int res; - if (!strcmp(name, "")) return -EINVAL; @@ -845,16 +883,14 @@ static int hfsplus_osx_setxattr(struct dentry *dentry, const char *name, */ if (is_known_namespace(name)) return -EOPNOTSUPP; - xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN - + XATTR_MAC_OSX_PREFIX_LEN + 1, GFP_KERNEL); - if (!xattr_name) - return -ENOMEM; - strcpy(xattr_name, XATTR_MAC_OSX_PREFIX); - strcpy(xattr_name + XATTR_MAC_OSX_PREFIX_LEN, name); - res = hfsplus_setxattr(dentry, xattr_name, buffer, size, flags); - kfree(xattr_name); - return res; + /* + * osx is the namespace we use to indicate an unprefixed + * attribute on the filesystem (like the ones that OS X + * creates), so we pass the name through unmodified (after + * ensuring it doesn't conflict with another namespace). + */ + return __hfsplus_setxattr(d_inode(dentry), name, buffer, size, flags); } static size_t hfsplus_osx_listxattr(struct dentry *dentry, char *list, diff --git a/fs/hfsplus/xattr.h b/fs/hfsplus/xattr.h index 288530cf80b5..f9b0955b3d28 100644 --- a/fs/hfsplus/xattr.h +++ b/fs/hfsplus/xattr.h @@ -21,22 +21,16 @@ extern const struct xattr_handler *hfsplus_xattr_handlers[]; int __hfsplus_setxattr(struct inode *inode, const char *name, const void *value, size_t size, int flags); -static inline int hfsplus_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) -{ - return __hfsplus_setxattr(dentry->d_inode, name, value, size, flags); -} +int hfsplus_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, + const char *prefix, size_t prefixlen); ssize_t __hfsplus_getxattr(struct inode *inode, const char *name, - void *value, size_t size); - -static inline ssize_t hfsplus_getxattr(struct dentry *dentry, - const char *name, - void *value, - size_t size) -{ - return __hfsplus_getxattr(dentry->d_inode, name, value, size); -} + void *value, size_t size); + +ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, + void *value, size_t size, + const char *prefix, size_t prefixlen); ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size); diff --git a/fs/hfsplus/xattr_security.c b/fs/hfsplus/xattr_security.c index 6ec5e107691f..aacff00a9ff9 100644 --- a/fs/hfsplus/xattr_security.c +++ b/fs/hfsplus/xattr_security.c @@ -16,43 +16,17 @@ static int hfsplus_security_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size, int type) { - char *xattr_name; - int res; - - if (!strcmp(name, "")) - return -EINVAL; - - xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1, - GFP_KERNEL); - if (!xattr_name) - return -ENOMEM; - strcpy(xattr_name, XATTR_SECURITY_PREFIX); - strcpy(xattr_name + XATTR_SECURITY_PREFIX_LEN, name); - - res = hfsplus_getxattr(dentry, xattr_name, buffer, size); - kfree(xattr_name); - return res; + return hfsplus_getxattr(dentry, name, buffer, size, + XATTR_SECURITY_PREFIX, + XATTR_SECURITY_PREFIX_LEN); } static int hfsplus_security_setxattr(struct dentry *dentry, const char *name, const void *buffer, size_t size, int flags, int type) { - char *xattr_name; - int res; - - if (!strcmp(name, "")) - return -EINVAL; - - xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1, - GFP_KERNEL); - if (!xattr_name) - return -ENOMEM; - 
strcpy(xattr_name, XATTR_SECURITY_PREFIX); - strcpy(xattr_name + XATTR_SECURITY_PREFIX_LEN, name); - - res = hfsplus_setxattr(dentry, xattr_name, buffer, size, flags); - kfree(xattr_name); - return res; + return hfsplus_setxattr(dentry, name, buffer, size, flags, + XATTR_SECURITY_PREFIX, + XATTR_SECURITY_PREFIX_LEN); } static size_t hfsplus_security_listxattr(struct dentry *dentry, char *list, diff --git a/fs/hfsplus/xattr_trusted.c b/fs/hfsplus/xattr_trusted.c index 3c5f27e4746a..bcf65089b7f7 100644 --- a/fs/hfsplus/xattr_trusted.c +++ b/fs/hfsplus/xattr_trusted.c @@ -14,43 +14,16 @@ static int hfsplus_trusted_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size, int type) { - char *xattr_name; - int res; - - if (!strcmp(name, "")) - return -EINVAL; - - xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1, - GFP_KERNEL); - if (!xattr_name) - return -ENOMEM; - strcpy(xattr_name, XATTR_TRUSTED_PREFIX); - strcpy(xattr_name + XATTR_TRUSTED_PREFIX_LEN, name); - - res = hfsplus_getxattr(dentry, xattr_name, buffer, size); - kfree(xattr_name); - return res; + return hfsplus_getxattr(dentry, name, buffer, size, + XATTR_TRUSTED_PREFIX, + XATTR_TRUSTED_PREFIX_LEN); } static int hfsplus_trusted_setxattr(struct dentry *dentry, const char *name, const void *buffer, size_t size, int flags, int type) { - char *xattr_name; - int res; - - if (!strcmp(name, "")) - return -EINVAL; - - xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1, - GFP_KERNEL); - if (!xattr_name) - return -ENOMEM; - strcpy(xattr_name, XATTR_TRUSTED_PREFIX); - strcpy(xattr_name + XATTR_TRUSTED_PREFIX_LEN, name); - - res = hfsplus_setxattr(dentry, xattr_name, buffer, size, flags); - kfree(xattr_name); - return res; + return hfsplus_setxattr(dentry, name, buffer, size, flags, + XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); } static size_t hfsplus_trusted_listxattr(struct dentry *dentry, char *list, diff --git a/fs/hfsplus/xattr_user.c b/fs/hfsplus/xattr_user.c index 2b625a538b64..5aa0e6dc4a1e 100644 --- a/fs/hfsplus/xattr_user.c +++ b/fs/hfsplus/xattr_user.c @@ -14,43 +14,16 @@ static int hfsplus_user_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size, int type) { - char *xattr_name; - int res; - if (!strcmp(name, "")) - return -EINVAL; - - xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1, - GFP_KERNEL); - if (!xattr_name) - return -ENOMEM; - strcpy(xattr_name, XATTR_USER_PREFIX); - strcpy(xattr_name + XATTR_USER_PREFIX_LEN, name); - - res = hfsplus_getxattr(dentry, xattr_name, buffer, size); - kfree(xattr_name); - return res; + return hfsplus_getxattr(dentry, name, buffer, size, + XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); } static int hfsplus_user_setxattr(struct dentry *dentry, const char *name, const void *buffer, size_t size, int flags, int type) { - char *xattr_name; - int res; - - if (!strcmp(name, "")) - return -EINVAL; - - xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1, - GFP_KERNEL); - if (!xattr_name) - return -ENOMEM; - strcpy(xattr_name, XATTR_USER_PREFIX); - strcpy(xattr_name + XATTR_USER_PREFIX_LEN, name); - - res = hfsplus_setxattr(dentry, xattr_name, buffer, size, flags); - kfree(xattr_name); - return res; + return hfsplus_setxattr(dentry, name, buffer, size, flags, + XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); } static size_t hfsplus_user_listxattr(struct dentry *dentry, char *list, diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h index 4fcd40d6f308..91e19f9dffe5 100644 --- 
a/fs/hostfs/hostfs.h +++ b/fs/hostfs/hostfs.h @@ -66,7 +66,8 @@ extern int stat_file(const char *path, struct hostfs_stat *p, int fd); extern int access_file(char *path, int r, int w, int x); extern int open_file(char *path, int r, int w, int append); extern void *open_dir(char *path, int *err_out); -extern char *read_dir(void *stream, unsigned long long *pos, +extern void seek_dir(void *stream, unsigned long long pos); +extern char *read_dir(void *stream, unsigned long long *pos_out, unsigned long long *ino_out, int *len_out, unsigned int *type_out); extern void close_file(void *stream); @@ -77,8 +78,7 @@ extern int write_file(int fd, unsigned long long *offset, const char *buf, int len); extern int lseek_file(int fd, long long offset, int whence); extern int fsync_file(int fd, int datasync); -extern int file_create(char *name, int ur, int uw, int ux, int gr, - int gw, int gx, int or, int ow, int ox); +extern int file_create(char *name, int mode); extern int set_attr(const char *file, struct hostfs_iattr *attrs, int fd); extern int make_symlink(const char *from, const char *to); extern int unlink_file(const char *file); diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index fd62cae0fdcb..059597b23f67 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -24,6 +24,7 @@ struct hostfs_inode_info { int fd; fmode_t mode; struct inode vfs_inode; + struct mutex open_mutex; }; static inline struct hostfs_inode_info *HOSTFS_I(struct inode *inode) @@ -92,16 +93,22 @@ static char *__dentry_name(struct dentry *dentry, char *name) __putname(name); return NULL; } + + /* + * This function relies on the fact that dentry_path_raw() will place + * the path name at the end of the provided buffer. + */ + BUG_ON(p + strlen(p) + 1 != name + PATH_MAX); + strlcpy(name, root, PATH_MAX); if (len > p - name) { __putname(name); return NULL; } - if (p > name + len) { - char *s = name + len; - while ((*s++ = *p++) != '\0') - ; - } + + if (p > name + len) + strcpy(name + len, p); + return name; } @@ -135,21 +142,19 @@ static char *follow_link(char *link) int len, n; char *name, *resolved, *end; - len = 64; - while (1) { + name = __getname(); + if (!name) { n = -ENOMEM; - name = kmalloc(len, GFP_KERNEL); - if (name == NULL) - goto out; - - n = hostfs_do_readlink(link, name, len); - if (n < len) - break; - len *= 2; - kfree(name); + goto out_free; } + + n = hostfs_do_readlink(link, name, PATH_MAX); if (n < 0) goto out_free; + else if (n == PATH_MAX) { + n = -E2BIG; + goto out_free; + } if (*name == '/') return name; @@ -168,13 +173,12 @@ static char *follow_link(char *link) } sprintf(resolved, "%s%s", link, name); - kfree(name); + __putname(name); kfree(link); return resolved; out_free: - kfree(name); - out: + __putname(name); return ERR_PTR(n); } @@ -225,6 +229,7 @@ static struct inode *hostfs_alloc_inode(struct super_block *sb) hi->fd = -1; hi->mode = 0; inode_init_once(&hi->vfs_inode); + mutex_init(&hi->open_mutex); return &hi->vfs_inode; } @@ -257,6 +262,9 @@ static int hostfs_show_options(struct seq_file *seq, struct dentry *root) if (strlen(root_path) > offset) seq_printf(seq, ",%s", root_path + offset); + if (append) + seq_puts(seq, ",append"); + return 0; } @@ -284,6 +292,7 @@ static int hostfs_readdir(struct file *file, struct dir_context *ctx) if (dir == NULL) return -error; next = ctx->pos; + seek_dir(dir, next); while ((name = read_dir(dir, &next, &ino, &len, &type)) != NULL) { if (!dir_emit(ctx, name, len, ino, type)) break; @@ -293,13 +302,12 @@ static int hostfs_readdir(struct 
file *file, struct dir_context *ctx) return 0; } -static int hostfs_file_open(struct inode *ino, struct file *file) +static int hostfs_open(struct inode *ino, struct file *file) { - static DEFINE_MUTEX(open_mutex); char *name; - fmode_t mode = 0; + fmode_t mode; int err; - int r = 0, w = 0, fd; + int r, w, fd; mode = file->f_mode & (FMODE_READ | FMODE_WRITE); if ((mode & HOSTFS_I(ino)->mode) == mode) @@ -308,12 +316,12 @@ static int hostfs_file_open(struct inode *ino, struct file *file) mode |= HOSTFS_I(ino)->mode; retry: + r = w = 0; + if (mode & FMODE_READ) r = 1; if (mode & FMODE_WRITE) - w = 1; - if (w) - r = 1; + r = w = 1; name = dentry_name(file->f_path.dentry); if (name == NULL) @@ -324,15 +332,16 @@ retry: if (fd < 0) return fd; - mutex_lock(&open_mutex); + mutex_lock(&HOSTFS_I(ino)->open_mutex); /* somebody else had handled it first? */ if ((mode & HOSTFS_I(ino)->mode) == mode) { - mutex_unlock(&open_mutex); + mutex_unlock(&HOSTFS_I(ino)->open_mutex); + close_file(&fd); return 0; } if ((mode | HOSTFS_I(ino)->mode) != mode) { mode |= HOSTFS_I(ino)->mode; - mutex_unlock(&open_mutex); + mutex_unlock(&HOSTFS_I(ino)->open_mutex); close_file(&fd); goto retry; } @@ -342,12 +351,12 @@ retry: err = replace_file(fd, HOSTFS_I(ino)->fd); close_file(&fd); if (err < 0) { - mutex_unlock(&open_mutex); + mutex_unlock(&HOSTFS_I(ino)->open_mutex); return err; } } HOSTFS_I(ino)->mode = mode; - mutex_unlock(&open_mutex); + mutex_unlock(&HOSTFS_I(ino)->open_mutex); return 0; } @@ -378,13 +387,11 @@ static int hostfs_fsync(struct file *file, loff_t start, loff_t end, static const struct file_operations hostfs_file_fops = { .llseek = generic_file_llseek, - .read = new_sync_read, .splice_read = generic_file_splice_read, .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, - .write = new_sync_write, .mmap = generic_file_mmap, - .open = hostfs_file_open, + .open = hostfs_open, .release = hostfs_file_release, .fsync = hostfs_fsync, }; @@ -393,6 +400,8 @@ static const struct file_operations hostfs_dir_fops = { .llseek = generic_file_llseek, .iterate = hostfs_readdir, .read = generic_read_dir, + .open = hostfs_open, + .fsync = hostfs_fsync, }; static int hostfs_writepage(struct page *page, struct writeback_control *wbc) @@ -400,7 +409,7 @@ static int hostfs_writepage(struct page *page, struct writeback_control *wbc) struct address_space *mapping = page->mapping; struct inode *inode = mapping->host; char *buffer; - unsigned long long base; + loff_t base = page_offset(page); int count = PAGE_CACHE_SIZE; int end_index = inode->i_size >> PAGE_CACHE_SHIFT; int err; @@ -409,7 +418,6 @@ static int hostfs_writepage(struct page *page, struct writeback_control *wbc) count = inode->i_size & (PAGE_CACHE_SIZE-1); buffer = kmap(page); - base = ((unsigned long long) page->index) << PAGE_CACHE_SHIFT; err = write_file(HOSTFS_I(inode)->fd, &base, buffer, count); if (err != count) { @@ -434,26 +442,29 @@ static int hostfs_writepage(struct page *page, struct writeback_control *wbc) static int hostfs_readpage(struct file *file, struct page *page) { char *buffer; - long long start; - int err = 0; + loff_t start = page_offset(page); + int bytes_read, ret = 0; - start = (long long) page->index << PAGE_CACHE_SHIFT; buffer = kmap(page); - err = read_file(FILE_HOSTFS_I(file)->fd, &start, buffer, + bytes_read = read_file(FILE_HOSTFS_I(file)->fd, &start, buffer, PAGE_CACHE_SIZE); - if (err < 0) + if (bytes_read < 0) { + ClearPageUptodate(page); + SetPageError(page); + ret = bytes_read; goto out; + } - 
memset(&buffer[err], 0, PAGE_CACHE_SIZE - err); + memset(buffer + bytes_read, 0, PAGE_CACHE_SIZE - bytes_read); - flush_dcache_page(page); + ClearPageError(page); SetPageUptodate(page); - if (PageError(page)) ClearPageError(page); - err = 0; + out: + flush_dcache_page(page); kunmap(page); unlock_page(page); - return err; + return ret; } static int hostfs_write_begin(struct file *file, struct address_space *mapping, @@ -530,11 +541,13 @@ static int read_name(struct inode *ino, char *name) init_special_inode(ino, st.mode & S_IFMT, rdev); ino->i_op = &hostfs_iops; break; - - default: + case S_IFREG: ino->i_op = &hostfs_iops; ino->i_fop = &hostfs_file_fops; ino->i_mapping->a_ops = &hostfs_aops; + break; + default: + return -EIO; } ino->i_ino = st.ino; @@ -568,10 +581,7 @@ static int hostfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, if (name == NULL) goto out_put; - fd = file_create(name, - mode & S_IRUSR, mode & S_IWUSR, mode & S_IXUSR, - mode & S_IRGRP, mode & S_IWGRP, mode & S_IXGRP, - mode & S_IROTH, mode & S_IWOTH, mode & S_IXOTH); + fd = file_create(name, mode & 0777); if (fd < 0) error = fd; else @@ -797,7 +807,7 @@ static int hostfs_permission(struct inode *ino, int desired) static int hostfs_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct hostfs_iattr attrs; char *name; int err; @@ -882,7 +892,7 @@ static const struct inode_operations hostfs_dir_iops = { .setattr = hostfs_setattr, }; -static void *hostfs_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *hostfs_follow_link(struct dentry *dentry, void **cookie) { char *link = __getname(); if (link) { @@ -896,21 +906,18 @@ static void *hostfs_follow_link(struct dentry *dentry, struct nameidata *nd) } if (err < 0) { __putname(link); - link = ERR_PTR(err); + return ERR_PTR(err); } } else { - link = ERR_PTR(-ENOMEM); + return ERR_PTR(-ENOMEM); } - nd_set_link(nd, link); - return NULL; + return *cookie = link; } -static void hostfs_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) +static void hostfs_put_link(struct inode *unused, void *cookie) { - char *s = nd_get_link(nd); - if (!IS_ERR(s)) - __putname(s); + __putname(cookie); } static const struct inode_operations hostfs_link_iops = { diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c index 9765dab95cbd..9c1e0f019880 100644 --- a/fs/hostfs/hostfs_user.c +++ b/fs/hostfs/hostfs_user.c @@ -97,21 +97,27 @@ void *open_dir(char *path, int *err_out) return dir; } -char *read_dir(void *stream, unsigned long long *pos, +void seek_dir(void *stream, unsigned long long pos) +{ + DIR *dir = stream; + + seekdir(dir, pos); +} + +char *read_dir(void *stream, unsigned long long *pos_out, unsigned long long *ino_out, int *len_out, unsigned int *type_out) { DIR *dir = stream; struct dirent *ent; - seekdir(dir, *pos); ent = readdir(dir); if (ent == NULL) return NULL; *len_out = strlen(ent->d_name); *ino_out = ent->d_ino; *type_out = ent->d_type; - *pos = telldir(dir); + *pos_out = ent->d_off; return ent->d_name; } @@ -175,21 +181,10 @@ void close_dir(void *stream) closedir(stream); } -int file_create(char *name, int ur, int uw, int ux, int gr, - int gw, int gx, int or, int ow, int ox) +int file_create(char *name, int mode) { - int mode, fd; - - mode = 0; - mode |= ur ? S_IRUSR : 0; - mode |= uw ? S_IWUSR : 0; - mode |= ux ? S_IXUSR : 0; - mode |= gr ? S_IRGRP : 0; - mode |= gw ? S_IWGRP : 0; - mode |= gx ? S_IXGRP : 0; - mode |= or ? 
S_IROTH : 0; - mode |= ow ? S_IWOTH : 0; - mode |= ox ? S_IXOTH : 0; + int fd; + fd = open64(name, O_CREAT | O_RDWR, mode); if (fd < 0) return -errno; diff --git a/fs/hpfs/alloc.c b/fs/hpfs/alloc.c index f005046e1591..d6a4b55d2ab0 100644 --- a/fs/hpfs/alloc.c +++ b/fs/hpfs/alloc.c @@ -484,3 +484,98 @@ struct anode *hpfs_alloc_anode(struct super_block *s, secno near, anode_secno *a a->btree.first_free = cpu_to_le16(8); return a; } + +static unsigned find_run(__le32 *bmp, unsigned *idx) +{ + unsigned len; + while (tstbits(bmp, *idx, 1)) { + (*idx)++; + if (unlikely(*idx >= 0x4000)) + return 0; + } + len = 1; + while (!tstbits(bmp, *idx + len, 1)) + len++; + return len; +} + +static int do_trim(struct super_block *s, secno start, unsigned len, secno limit_start, secno limit_end, unsigned minlen, unsigned *result) +{ + int err; + secno end; + if (fatal_signal_pending(current)) + return -EINTR; + end = start + len; + if (start < limit_start) + start = limit_start; + if (end > limit_end) + end = limit_end; + if (start >= end) + return 0; + if (end - start < minlen) + return 0; + err = sb_issue_discard(s, start, end - start, GFP_NOFS, 0); + if (err) + return err; + *result += end - start; + return 0; +} + +int hpfs_trim_fs(struct super_block *s, u64 start, u64 end, u64 minlen, unsigned *result) +{ + int err = 0; + struct hpfs_sb_info *sbi = hpfs_sb(s); + unsigned idx, len, start_bmp, end_bmp; + __le32 *bmp; + struct quad_buffer_head qbh; + + *result = 0; + if (!end || end > sbi->sb_fs_size) + end = sbi->sb_fs_size; + if (start >= sbi->sb_fs_size) + return 0; + if (minlen > 0x4000) + return 0; + if (start < sbi->sb_dirband_start + sbi->sb_dirband_size && end > sbi->sb_dirband_start) { + hpfs_lock(s); + if (s->s_flags & MS_RDONLY) { + err = -EROFS; + goto unlock_1; + } + if (!(bmp = hpfs_map_dnode_bitmap(s, &qbh))) { + err = -EIO; + goto unlock_1; + } + idx = 0; + while ((len = find_run(bmp, &idx)) && !err) { + err = do_trim(s, sbi->sb_dirband_start + idx * 4, len * 4, start, end, minlen, result); + idx += len; + } + hpfs_brelse4(&qbh); +unlock_1: + hpfs_unlock(s); + } + start_bmp = start >> 14; + end_bmp = (end + 0x3fff) >> 14; + while (start_bmp < end_bmp && !err) { + hpfs_lock(s); + if (s->s_flags & MS_RDONLY) { + err = -EROFS; + goto unlock_2; + } + if (!(bmp = hpfs_map_bitmap(s, start_bmp, &qbh, "trim"))) { + err = -EIO; + goto unlock_2; + } + idx = 0; + while ((len = find_run(bmp, &idx)) && !err) { + err = do_trim(s, (start_bmp << 14) + idx, len, start, end, minlen, result); + idx += len; + } + hpfs_brelse4(&qbh); +unlock_2: + hpfs_unlock(s); + start_bmp++; + } + return err; +} diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index 2a8e07425de0..dc540bfcee1d 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -327,4 +327,5 @@ const struct file_operations hpfs_dir_ops = .iterate = hpfs_readdir, .release = hpfs_dir_release, .fsync = hpfs_file_fsync, + .unlocked_ioctl = hpfs_ioctl, }; diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index 7f54e5f76cec..7ca28d604bf7 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -197,14 +197,13 @@ const struct address_space_operations hpfs_aops = { const struct file_operations hpfs_file_ops = { .llseek = generic_file_llseek, - .read = new_sync_read, .read_iter = generic_file_read_iter, - .write = new_sync_write, .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .release = hpfs_file_release, .fsync = hpfs_file_fsync, .splice_read = generic_file_splice_read, + .unlocked_ioctl = hpfs_ioctl, }; const struct inode_operations hpfs_file_iops = diff --git 
a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index b63b75fa00e7..c4867b5116dd 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -18,6 +18,8 @@ #include <linux/pagemap.h> #include <linux/buffer_head.h> #include <linux/slab.h> +#include <linux/sched.h> +#include <linux/blkdev.h> #include <asm/unaligned.h> #include "hpfs.h" @@ -200,6 +202,7 @@ void hpfs_free_dnode(struct super_block *, secno); struct dnode *hpfs_alloc_dnode(struct super_block *, secno, dnode_secno *, struct quad_buffer_head *); struct fnode *hpfs_alloc_fnode(struct super_block *, secno, fnode_secno *, struct buffer_head **); struct anode *hpfs_alloc_anode(struct super_block *, secno, anode_secno *, struct buffer_head **); +int hpfs_trim_fs(struct super_block *, u64, u64, u64, unsigned *); /* anode.c */ @@ -304,7 +307,7 @@ extern const struct address_space_operations hpfs_symlink_aops; static inline struct hpfs_inode_info *hpfs_i(struct inode *inode) { - return list_entry(inode, struct hpfs_inode_info, vfs_inode); + return container_of(inode, struct hpfs_inode_info, vfs_inode); } static inline struct hpfs_sb_info *hpfs_sb(struct super_block *sb) @@ -318,6 +321,7 @@ __printf(2, 3) void hpfs_error(struct super_block *, const char *, ...); int hpfs_stop_cycles(struct super_block *, int, int *, int *, char *); unsigned hpfs_get_free_dnodes(struct super_block *); +long hpfs_ioctl(struct file *file, unsigned cmd, unsigned long arg); /* * local time (HPFS) to GMT (Unix) diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index 7ce4b74234a1..933c73780813 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -257,7 +257,7 @@ void hpfs_write_inode_nolock(struct inode *i) int hpfs_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int error = -EINVAL; hpfs_lock(inode->i_sb); diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index bdbc2c3080a4..a0872f239f04 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -359,7 +359,7 @@ static int hpfs_unlink(struct inode *dir, struct dentry *dentry) unsigned len = dentry->d_name.len; struct quad_buffer_head qbh; struct hpfs_dirent *de; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); dnode_secno dno; int r; int rep = 0; @@ -433,7 +433,7 @@ static int hpfs_rmdir(struct inode *dir, struct dentry *dentry) unsigned len = dentry->d_name.len; struct quad_buffer_head qbh; struct hpfs_dirent *de; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); dnode_secno dno; int n_items = 0; int err; @@ -522,8 +522,8 @@ static int hpfs_rename(struct inode *old_dir, struct dentry *old_dentry, unsigned old_len = old_dentry->d_name.len; const unsigned char *new_name = new_dentry->d_name.name; unsigned new_len = new_dentry->d_name.len; - struct inode *i = old_dentry->d_inode; - struct inode *new_inode = new_dentry->d_inode; + struct inode *i = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); struct quad_buffer_head qbh, qbh1; struct hpfs_dirent *dep, *nde; struct hpfs_dirent de; diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 7cd00d3a7c9b..68a9bed05628 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -52,17 +52,20 @@ static void unmark_dirty(struct super_block *s) } /* Filesystem error... */ -static char err_buf[1024]; - void hpfs_error(struct super_block *s, const char *fmt, ...) 
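With hpfs_ioctl() wired into both hpfs_file_ops and hpfs_dir_ops, hpfs now answers the generic FITRIM ioctl like ext4 or xfs, so the stock fstrim(8) tool works unchanged. A minimal self-contained userspace equivalent; the path argument is whatever mount point you pass in, and CAP_SYS_ADMIN is required, matching the capable() check in the handler that follows:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FITRIM, struct fstrim_range */

int main(int argc, char **argv)
{
	struct fstrim_range range = {
		.start = 0,
		.len = (__u64)-1,	/* trim the whole filesystem */
		.minlen = 0,
	};
	int fd = open(argc > 1 ? argv[1] : ".", O_RDONLY);

	if (fd < 0 || ioctl(fd, FITRIM, &range) != 0) {
		perror("FITRIM");
		return 1;
	}
	/* on return the kernel rewrites .len with the bytes trimmed */
	printf("trimmed %llu bytes\n", (unsigned long long)range.len);
	return 0;
}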
{ + struct va_format vaf; va_list args; va_start(args, fmt); - vsnprintf(err_buf, sizeof(err_buf), fmt, args); + + vaf.fmt = fmt; + vaf.va = &args; + + pr_err("filesystem error: %pV", &vaf); + va_end(args); - pr_err("filesystem error: %s", err_buf); if (!hpfs_sb(s)->sb_was_error) { if (hpfs_sb(s)->sb_err == 2) { pr_cont("; crashing the system because you wanted it\n"); @@ -196,12 +199,39 @@ static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } + +long hpfs_ioctl(struct file *file, unsigned cmd, unsigned long arg) +{ + switch (cmd) { + case FITRIM: { + struct fstrim_range range; + secno n_trimmed; + int r; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(&range, (struct fstrim_range __user *)arg, sizeof(range))) + return -EFAULT; + r = hpfs_trim_fs(file_inode(file)->i_sb, range.start >> 9, (range.start + range.len) >> 9, (range.minlen + 511) >> 9, &n_trimmed); + if (r) + return r; + range.len = (u64)n_trimmed << 9; + if (copy_to_user((struct fstrim_range __user *)arg, &range, sizeof(range))) + return -EFAULT; + return 0; + } + default: { + return -ENOIOCTLCMD; + } + } +} + + static struct kmem_cache * hpfs_inode_cachep; static struct inode *hpfs_alloc_inode(struct super_block *sb) { struct hpfs_inode_info *ei; - ei = (struct hpfs_inode_info *)kmem_cache_alloc(hpfs_inode_cachep, GFP_NOFS); + ei = kmem_cache_alloc(hpfs_inode_cachep, GFP_NOFS); if (!ei) return NULL; ei->vfs_inode.i_version = 1; @@ -424,11 +454,14 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data) int o; struct hpfs_sb_info *sbi = hpfs_sb(s); char *new_opts = kstrdup(data, GFP_KERNEL); - + + if (!new_opts) + return -ENOMEM; + sync_filesystem(s); *flags |= MS_NOATIME; - + hpfs_lock(s); uid = sbi->sb_uid; gid = sbi->sb_gid; umask = 0777 & ~sbi->sb_mode; diff --git a/fs/hppfs/Makefile b/fs/hppfs/Makefile deleted file mode 100644 index 3a982bd975d2..000000000000 --- a/fs/hppfs/Makefile +++ /dev/null @@ -1,6 +0,0 @@ -# -# Copyright (C) 2002 - 2008 Jeff Dike (jdike@{addtoit,linux.intel}.com) -# Licensed under the GPL -# - -obj-$(CONFIG_HPPFS) += hppfs.o diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c deleted file mode 100644 index 043ac9d77262..000000000000 --- a/fs/hppfs/hppfs.c +++ /dev/null @@ -1,766 +0,0 @@ -/* - * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL - */ - -#include <linux/ctype.h> -#include <linux/dcache.h> -#include <linux/file.h> -#include <linux/fs.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/list.h> -#include <linux/module.h> -#include <linux/mount.h> -#include <linux/slab.h> -#include <linux/statfs.h> -#include <linux/types.h> -#include <linux/pid_namespace.h> -#include <linux/namei.h> -#include <asm/uaccess.h> -#include <os.h> - -static struct inode *get_inode(struct super_block *, struct dentry *); - -struct hppfs_data { - struct list_head list; - char contents[PAGE_SIZE - sizeof(struct list_head)]; -}; - -struct hppfs_private { - struct file *proc_file; - int host_fd; - loff_t len; - struct hppfs_data *contents; -}; - -struct hppfs_inode_info { - struct dentry *proc_dentry; - struct inode vfs_inode; -}; - -static inline struct hppfs_inode_info *HPPFS_I(struct inode *inode) -{ - return container_of(inode, struct hppfs_inode_info, vfs_inode); -} - -#define HPPFS_SUPER_MAGIC 0xb00000ee - -static const struct super_operations hppfs_sbops; - -static int is_pid(struct dentry *dentry) -{ - struct super_block *sb; - int i; - - sb = dentry->d_sb; - if (dentry->d_parent != 
sb->s_root) - return 0; - - for (i = 0; i < dentry->d_name.len; i++) { - if (!isdigit(dentry->d_name.name[i])) - return 0; - } - return 1; -} - -static char *dentry_name(struct dentry *dentry, int extra) -{ - struct dentry *parent; - char *root, *name; - const char *seg_name; - int len, seg_len, root_len; - - len = 0; - parent = dentry; - while (parent->d_parent != parent) { - if (is_pid(parent)) - len += strlen("pid") + 1; - else len += parent->d_name.len + 1; - parent = parent->d_parent; - } - - root = "proc"; - root_len = strlen(root); - len += root_len; - name = kmalloc(len + extra + 1, GFP_KERNEL); - if (name == NULL) - return NULL; - - name[len] = '\0'; - parent = dentry; - while (parent->d_parent != parent) { - if (is_pid(parent)) { - seg_name = "pid"; - seg_len = strlen(seg_name); - } - else { - seg_name = parent->d_name.name; - seg_len = parent->d_name.len; - } - - len -= seg_len + 1; - name[len] = '/'; - memcpy(&name[len + 1], seg_name, seg_len); - parent = parent->d_parent; - } - memcpy(name, root, root_len); - return name; -} - -static int file_removed(struct dentry *dentry, const char *file) -{ - char *host_file; - int extra, fd; - - extra = 0; - if (file != NULL) - extra += strlen(file) + 1; - - host_file = dentry_name(dentry, extra + strlen("/remove")); - if (host_file == NULL) { - printk(KERN_ERR "file_removed : allocation failed\n"); - return -ENOMEM; - } - - if (file != NULL) { - strcat(host_file, "/"); - strcat(host_file, file); - } - strcat(host_file, "/remove"); - - fd = os_open_file(host_file, of_read(OPENFLAGS()), 0); - kfree(host_file); - if (fd > 0) { - os_close_file(fd); - return 1; - } - return 0; -} - -static struct dentry *hppfs_lookup(struct inode *ino, struct dentry *dentry, - unsigned int flags) -{ - struct dentry *proc_dentry, *parent; - struct qstr *name = &dentry->d_name; - struct inode *inode; - int err, deleted; - - deleted = file_removed(dentry, NULL); - if (deleted < 0) - return ERR_PTR(deleted); - else if (deleted) - return ERR_PTR(-ENOENT); - - parent = HPPFS_I(ino)->proc_dentry; - mutex_lock(&parent->d_inode->i_mutex); - proc_dentry = lookup_one_len(name->name, parent, name->len); - mutex_unlock(&parent->d_inode->i_mutex); - - if (IS_ERR(proc_dentry)) - return proc_dentry; - - err = -ENOMEM; - inode = get_inode(ino->i_sb, proc_dentry); - if (!inode) - goto out; - - d_add(dentry, inode); - return NULL; - - out: - return ERR_PTR(err); -} - -static const struct inode_operations hppfs_file_iops = { -}; - -static ssize_t read_proc(struct file *file, char __user *buf, ssize_t count, - loff_t *ppos, int is_user) -{ - ssize_t (*read)(struct file *, char __user *, size_t, loff_t *); - ssize_t n; - - read = file_inode(file)->i_fop->read; - - if (!is_user) - set_fs(KERNEL_DS); - - n = (*read)(file, buf, count, &file->f_pos); - - if (!is_user) - set_fs(USER_DS); - - if (ppos) - *ppos = file->f_pos; - return n; -} - -static ssize_t hppfs_read_file(int fd, char __user *buf, ssize_t count) -{ - ssize_t n; - int cur, err; - char *new_buf; - - n = -ENOMEM; - new_buf = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (new_buf == NULL) { - printk(KERN_ERR "hppfs_read_file : kmalloc failed\n"); - goto out; - } - n = 0; - while (count > 0) { - cur = min_t(ssize_t, count, PAGE_SIZE); - err = os_read_file(fd, new_buf, cur); - if (err < 0) { - printk(KERN_ERR "hppfs_read : read failed, " - "errno = %d\n", err); - n = err; - goto out_free; - } else if (err == 0) - break; - - if (copy_to_user(buf, new_buf, err)) { - n = -EFAULT; - goto out_free; - } - n += err; - count -= err; - } - 
out_free: - kfree(new_buf); - out: - return n; -} - -static ssize_t hppfs_read(struct file *file, char __user *buf, size_t count, - loff_t *ppos) -{ - struct hppfs_private *hppfs = file->private_data; - struct hppfs_data *data; - loff_t off; - int err; - - if (hppfs->contents != NULL) { - int rem; - - if (*ppos >= hppfs->len) - return 0; - - data = hppfs->contents; - off = *ppos; - while (off >= sizeof(data->contents)) { - data = list_entry(data->list.next, struct hppfs_data, - list); - off -= sizeof(data->contents); - } - - if (off + count > hppfs->len) - count = hppfs->len - off; - rem = copy_to_user(buf, &data->contents[off], count); - *ppos += count - rem; - if (rem > 0) - return -EFAULT; - } else if (hppfs->host_fd != -1) { - err = os_seek_file(hppfs->host_fd, *ppos); - if (err) { - printk(KERN_ERR "hppfs_read : seek failed, " - "errno = %d\n", err); - return err; - } - err = hppfs_read_file(hppfs->host_fd, buf, count); - if (err < 0) { - printk(KERN_ERR "hppfs_read: read failed: %d\n", err); - return err; - } - count = err; - if (count > 0) - *ppos += count; - } - else count = read_proc(hppfs->proc_file, buf, count, ppos, 1); - - return count; -} - -static ssize_t hppfs_write(struct file *file, const char __user *buf, - size_t len, loff_t *ppos) -{ - struct hppfs_private *data = file->private_data; - struct file *proc_file = data->proc_file; - ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *); - - write = file_inode(proc_file)->i_fop->write; - return (*write)(proc_file, buf, len, ppos); -} - -static int open_host_sock(char *host_file, int *filter_out) -{ - char *end; - int fd; - - end = &host_file[strlen(host_file)]; - strcpy(end, "/rw"); - *filter_out = 1; - fd = os_connect_socket(host_file); - if (fd > 0) - return fd; - - strcpy(end, "/r"); - *filter_out = 0; - fd = os_connect_socket(host_file); - return fd; -} - -static void free_contents(struct hppfs_data *head) -{ - struct hppfs_data *data; - struct list_head *ele, *next; - - if (head == NULL) - return; - - list_for_each_safe(ele, next, &head->list) { - data = list_entry(ele, struct hppfs_data, list); - kfree(data); - } - kfree(head); -} - -static struct hppfs_data *hppfs_get_data(int fd, int filter, - struct file *proc_file, - struct file *hppfs_file, - loff_t *size_out) -{ - struct hppfs_data *data, *new, *head; - int n, err; - - err = -ENOMEM; - data = kmalloc(sizeof(*data), GFP_KERNEL); - if (data == NULL) { - printk(KERN_ERR "hppfs_get_data : head allocation failed\n"); - goto failed; - } - - INIT_LIST_HEAD(&data->list); - - head = data; - *size_out = 0; - - if (filter) { - while ((n = read_proc(proc_file, data->contents, - sizeof(data->contents), NULL, 0)) > 0) - os_write_file(fd, data->contents, n); - err = os_shutdown_socket(fd, 0, 1); - if (err) { - printk(KERN_ERR "hppfs_get_data : failed to shut down " - "socket\n"); - goto failed_free; - } - } - while (1) { - n = os_read_file(fd, data->contents, sizeof(data->contents)); - if (n < 0) { - err = n; - printk(KERN_ERR "hppfs_get_data : read failed, " - "errno = %d\n", err); - goto failed_free; - } else if (n == 0) - break; - - *size_out += n; - - if (n < sizeof(data->contents)) - break; - - new = kmalloc(sizeof(*data), GFP_KERNEL); - if (new == 0) { - printk(KERN_ERR "hppfs_get_data : data allocation " - "failed\n"); - err = -ENOMEM; - goto failed_free; - } - - INIT_LIST_HEAD(&new->list); - list_add(&new->list, &data->list); - data = new; - } - return head; - - failed_free: - free_contents(head); - failed: - return ERR_PTR(err); -} - -static struct 
hppfs_private *hppfs_data(void) -{ - struct hppfs_private *data; - - data = kmalloc(sizeof(*data), GFP_KERNEL); - if (data == NULL) - return data; - - *data = ((struct hppfs_private ) { .host_fd = -1, - .len = -1, - .contents = NULL } ); - return data; -} - -static int file_mode(int fmode) -{ - if (fmode == (FMODE_READ | FMODE_WRITE)) - return O_RDWR; - if (fmode == FMODE_READ) - return O_RDONLY; - if (fmode == FMODE_WRITE) - return O_WRONLY; - return 0; -} - -static int hppfs_open(struct inode *inode, struct file *file) -{ - const struct cred *cred = file->f_cred; - struct hppfs_private *data; - struct path path; - char *host_file; - int err, fd, type, filter; - - err = -ENOMEM; - data = hppfs_data(); - if (data == NULL) - goto out; - - host_file = dentry_name(file->f_path.dentry, strlen("/rw")); - if (host_file == NULL) - goto out_free2; - - path.mnt = inode->i_sb->s_fs_info; - path.dentry = HPPFS_I(inode)->proc_dentry; - - data->proc_file = dentry_open(&path, file_mode(file->f_mode), cred); - err = PTR_ERR(data->proc_file); - if (IS_ERR(data->proc_file)) - goto out_free1; - - type = os_file_type(host_file); - if (type == OS_TYPE_FILE) { - fd = os_open_file(host_file, of_read(OPENFLAGS()), 0); - if (fd >= 0) - data->host_fd = fd; - else - printk(KERN_ERR "hppfs_open : failed to open '%s', " - "errno = %d\n", host_file, -fd); - - data->contents = NULL; - } else if (type == OS_TYPE_DIR) { - fd = open_host_sock(host_file, &filter); - if (fd > 0) { - data->contents = hppfs_get_data(fd, filter, - data->proc_file, - file, &data->len); - if (!IS_ERR(data->contents)) - data->host_fd = fd; - } else - printk(KERN_ERR "hppfs_open : failed to open a socket " - "in '%s', errno = %d\n", host_file, -fd); - } - kfree(host_file); - - file->private_data = data; - return 0; - - out_free1: - kfree(host_file); - out_free2: - free_contents(data->contents); - kfree(data); - out: - return err; -} - -static int hppfs_dir_open(struct inode *inode, struct file *file) -{ - const struct cred *cred = file->f_cred; - struct hppfs_private *data; - struct path path; - int err; - - err = -ENOMEM; - data = hppfs_data(); - if (data == NULL) - goto out; - - path.mnt = inode->i_sb->s_fs_info; - path.dentry = HPPFS_I(inode)->proc_dentry; - data->proc_file = dentry_open(&path, file_mode(file->f_mode), cred); - err = PTR_ERR(data->proc_file); - if (IS_ERR(data->proc_file)) - goto out_free; - - file->private_data = data; - return 0; - - out_free: - kfree(data); - out: - return err; -} - -static loff_t hppfs_llseek(struct file *file, loff_t off, int where) -{ - struct hppfs_private *data = file->private_data; - struct file *proc_file = data->proc_file; - loff_t (*llseek)(struct file *, loff_t, int); - loff_t ret; - - llseek = file_inode(proc_file)->i_fop->llseek; - if (llseek != NULL) { - ret = (*llseek)(proc_file, off, where); - if (ret < 0) - return ret; - } - - return default_llseek(file, off, where); -} - -static int hppfs_release(struct inode *inode, struct file *file) -{ - struct hppfs_private *data = file->private_data; - struct file *proc_file = data->proc_file; - if (proc_file) - fput(proc_file); - kfree(data); - return 0; -} - -static const struct file_operations hppfs_file_fops = { - .owner = NULL, - .llseek = hppfs_llseek, - .read = hppfs_read, - .write = hppfs_write, - .open = hppfs_open, - .release = hppfs_release, -}; - -struct hppfs_dirent { - struct dir_context ctx; - struct dir_context *caller; - struct dentry *dentry; -}; - -static int hppfs_filldir(struct dir_context *ctx, const char *name, int size, - loff_t 
offset, u64 inode, unsigned int type) -{ - struct hppfs_dirent *dirent = - container_of(ctx, struct hppfs_dirent, ctx); - - if (file_removed(dirent->dentry, name)) - return 0; - - dirent->caller->pos = dirent->ctx.pos; - return !dir_emit(dirent->caller, name, size, inode, type); -} - -static int hppfs_readdir(struct file *file, struct dir_context *ctx) -{ - struct hppfs_private *data = file->private_data; - struct file *proc_file = data->proc_file; - struct hppfs_dirent d = { - .ctx.actor = hppfs_filldir, - .caller = ctx, - .dentry = file->f_path.dentry - }; - int err; - proc_file->f_pos = ctx->pos; - err = iterate_dir(proc_file, &d.ctx); - ctx->pos = d.ctx.pos; - return err; -} - -static const struct file_operations hppfs_dir_fops = { - .owner = NULL, - .iterate = hppfs_readdir, - .open = hppfs_dir_open, - .llseek = default_llseek, - .release = hppfs_release, -}; - -static int hppfs_statfs(struct dentry *dentry, struct kstatfs *sf) -{ - sf->f_blocks = 0; - sf->f_bfree = 0; - sf->f_bavail = 0; - sf->f_files = 0; - sf->f_ffree = 0; - sf->f_type = HPPFS_SUPER_MAGIC; - return 0; -} - -static struct inode *hppfs_alloc_inode(struct super_block *sb) -{ - struct hppfs_inode_info *hi; - - hi = kmalloc(sizeof(*hi), GFP_KERNEL); - if (!hi) - return NULL; - - hi->proc_dentry = NULL; - inode_init_once(&hi->vfs_inode); - return &hi->vfs_inode; -} - -void hppfs_evict_inode(struct inode *ino) -{ - clear_inode(ino); - dput(HPPFS_I(ino)->proc_dentry); - mntput(ino->i_sb->s_fs_info); -} - -static void hppfs_i_callback(struct rcu_head *head) -{ - struct inode *inode = container_of(head, struct inode, i_rcu); - kfree(HPPFS_I(inode)); -} - -static void hppfs_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, hppfs_i_callback); -} - -static const struct super_operations hppfs_sbops = { - .alloc_inode = hppfs_alloc_inode, - .destroy_inode = hppfs_destroy_inode, - .evict_inode = hppfs_evict_inode, - .statfs = hppfs_statfs, -}; - -static int hppfs_readlink(struct dentry *dentry, char __user *buffer, - int buflen) -{ - struct dentry *proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry; - return proc_dentry->d_inode->i_op->readlink(proc_dentry, buffer, - buflen); -} - -static void *hppfs_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - struct dentry *proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry; - - return proc_dentry->d_inode->i_op->follow_link(proc_dentry, nd); -} - -static void hppfs_put_link(struct dentry *dentry, struct nameidata *nd, - void *cookie) -{ - struct dentry *proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry; - - if (proc_dentry->d_inode->i_op->put_link) - proc_dentry->d_inode->i_op->put_link(proc_dentry, nd, cookie); -} - -static const struct inode_operations hppfs_dir_iops = { - .lookup = hppfs_lookup, -}; - -static const struct inode_operations hppfs_link_iops = { - .readlink = hppfs_readlink, - .follow_link = hppfs_follow_link, - .put_link = hppfs_put_link, -}; - -static struct inode *get_inode(struct super_block *sb, struct dentry *dentry) -{ - struct inode *proc_ino = dentry->d_inode; - struct inode *inode = new_inode(sb); - - if (!inode) { - dput(dentry); - return NULL; - } - - if (d_is_dir(dentry)) { - inode->i_op = &hppfs_dir_iops; - inode->i_fop = &hppfs_dir_fops; - } else if (d_is_symlink(dentry)) { - inode->i_op = &hppfs_link_iops; - inode->i_fop = &hppfs_file_fops; - } else { - inode->i_op = &hppfs_file_iops; - inode->i_fop = &hppfs_file_fops; - } - - HPPFS_I(inode)->proc_dentry = dentry; - - inode->i_uid = proc_ino->i_uid; - inode->i_gid = 
proc_ino->i_gid; - inode->i_atime = proc_ino->i_atime; - inode->i_mtime = proc_ino->i_mtime; - inode->i_ctime = proc_ino->i_ctime; - inode->i_ino = proc_ino->i_ino; - inode->i_mode = proc_ino->i_mode; - set_nlink(inode, proc_ino->i_nlink); - inode->i_size = proc_ino->i_size; - inode->i_blocks = proc_ino->i_blocks; - - return inode; -} - -static int hppfs_fill_super(struct super_block *sb, void *d, int silent) -{ - struct inode *root_inode; - struct vfsmount *proc_mnt; - int err = -ENOENT; - - proc_mnt = mntget(task_active_pid_ns(current)->proc_mnt); - if (IS_ERR(proc_mnt)) - goto out; - - sb->s_blocksize = 1024; - sb->s_blocksize_bits = 10; - sb->s_magic = HPPFS_SUPER_MAGIC; - sb->s_op = &hppfs_sbops; - sb->s_fs_info = proc_mnt; - - err = -ENOMEM; - root_inode = get_inode(sb, dget(proc_mnt->mnt_root)); - sb->s_root = d_make_root(root_inode); - if (!sb->s_root) - goto out_mntput; - - return 0; - - out_mntput: - mntput(proc_mnt); - out: - return(err); -} - -static struct dentry *hppfs_read_super(struct file_system_type *type, - int flags, const char *dev_name, - void *data) -{ - return mount_nodev(type, flags, data, hppfs_fill_super); -} - -static struct file_system_type hppfs_type = { - .owner = THIS_MODULE, - .name = "hppfs", - .mount = hppfs_read_super, - .kill_sb = kill_anon_super, - .fs_flags = 0, -}; -MODULE_ALIAS_FS("hppfs"); - -static int __init init_hppfs(void) -{ - return register_filesystem(&hppfs_type); -} - -static void __exit exit_hppfs(void) -{ - unregister_filesystem(&hppfs_type); -} - -module_init(init_hppfs) -module_exit(exit_hppfs) -MODULE_LICENSE("GPL"); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index c274aca8e8dc..973c24ce59ad 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -34,6 +34,7 @@ #include <linux/security.h> #include <linux/magic.h> #include <linux/migrate.h> +#include <linux/uio.h> #include <asm/uaccess.h> @@ -47,9 +48,10 @@ struct hugetlbfs_config { kuid_t uid; kgid_t gid; umode_t mode; - long nr_blocks; + long max_hpages; long nr_inodes; struct hstate *hstate; + long min_hpages; }; struct hugetlbfs_inode_info { @@ -67,7 +69,7 @@ int sysctl_hugetlb_shm_group; enum { Opt_size, Opt_nr_inodes, Opt_mode, Opt_uid, Opt_gid, - Opt_pagesize, + Opt_pagesize, Opt_min_size, Opt_err, }; @@ -78,6 +80,7 @@ static const match_table_t tokens = { {Opt_uid, "uid=%u"}, {Opt_gid, "gid=%u"}, {Opt_pagesize, "pagesize=%s"}, + {Opt_min_size, "min_size=%s"}, {Opt_err, NULL}, }; @@ -127,7 +130,6 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) goto out; ret = 0; - hugetlb_prefault_arch_hook(vma->vm_mm); if (vma->vm_flags & VM_WRITE && inode->i_size < len) inode->i_size = len; out: @@ -179,42 +181,33 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, } #endif -static int +static size_t hugetlbfs_read_actor(struct page *page, unsigned long offset, - char __user *buf, unsigned long count, - unsigned long size) + struct iov_iter *to, unsigned long size) { - char *kaddr; - unsigned long left, copied = 0; + size_t copied = 0; int i, chunksize; - if (size > count) - size = count; - /* Find which 4k chunk and offset with in that chunk */ i = offset >> PAGE_CACHE_SHIFT; offset = offset & ~PAGE_CACHE_MASK; while (size) { + size_t n; chunksize = PAGE_CACHE_SIZE; if (offset) chunksize -= offset; if (chunksize > size) chunksize = size; - kaddr = kmap(&page[i]); - left = __copy_to_user(buf, kaddr + offset, chunksize); - kunmap(&page[i]); - if (left) { - copied += (chunksize - left); - break; - } + n = 
copy_page_to_iter(&page[i], offset, chunksize, to); + copied += n; + if (n != chunksize) + return copied; offset = 0; size -= chunksize; - buf += chunksize; - copied += chunksize; i++; } - return copied ? copied : -EFAULT; + return copied; } /* @@ -222,39 +215,34 @@ hugetlbfs_read_actor(struct page *page, unsigned long offset, * data. Its *very* similar to do_generic_mapping_read(), we can't use that * since it has PAGE_CACHE_SIZE assumptions. */ -static ssize_t hugetlbfs_read(struct file *filp, char __user *buf, - size_t len, loff_t *ppos) +static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) { - struct hstate *h = hstate_file(filp); - struct address_space *mapping = filp->f_mapping; + struct file *file = iocb->ki_filp; + struct hstate *h = hstate_file(file); + struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - unsigned long index = *ppos >> huge_page_shift(h); - unsigned long offset = *ppos & ~huge_page_mask(h); + unsigned long index = iocb->ki_pos >> huge_page_shift(h); + unsigned long offset = iocb->ki_pos & ~huge_page_mask(h); unsigned long end_index; loff_t isize; ssize_t retval = 0; - /* validate length */ - if (len == 0) - goto out; - - for (;;) { + while (iov_iter_count(to)) { struct page *page; - unsigned long nr, ret; - int ra; + size_t nr, copied; /* nr is the maximum number of bytes to copy from this page */ nr = huge_page_size(h); isize = i_size_read(inode); if (!isize) - goto out; + break; end_index = (isize - 1) >> huge_page_shift(h); - if (index >= end_index) { - if (index > end_index) - goto out; + if (index > end_index) + break; + if (index == end_index) { nr = ((isize - 1) & ~huge_page_mask(h)) + 1; if (nr <= offset) - goto out; + break; } nr = nr - offset; @@ -265,39 +253,27 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf, * We have a HOLE, zero out the user-buffer for the * length of the hole or request. */ - ret = len < nr ? len : nr; - if (clear_user(buf, ret)) - ra = -EFAULT; - else - ra = 0; + copied = iov_iter_zero(nr, to); } else { unlock_page(page); /* * We have the page, copy it to user space buffer. */ - ra = hugetlbfs_read_actor(page, offset, buf, len, nr); - ret = ra; + copied = hugetlbfs_read_actor(page, offset, to, nr); page_cache_release(page); } - if (ra < 0) { - if (retval == 0) - retval = ra; - goto out; + offset += copied; + retval += copied; + if (copied != nr && iov_iter_count(to)) { + if (!retval) + retval = -EFAULT; + break; } - - offset += ret; - retval += ret; - len -= ret; index += offset >> huge_page_shift(h); offset &= ~huge_page_mask(h); - - /* short read or no more work */ - if ((ret != nr) || (len == 0)) - break; } -out: - *ppos = ((loff_t)index << huge_page_shift(h)) + offset; + iocb->ki_pos = ((loff_t)index << huge_page_shift(h)) + offset; return retval; } @@ -319,7 +295,7 @@ static int hugetlbfs_write_end(struct file *file, struct address_space *mapping, static void truncate_huge_page(struct page *page) { - cancel_dirty_page(page, /* No IO accounting for huge pages? 
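
[Editor's note] The hugetlbfs conversion above replaces the open-coded kmap()/__copy_to_user() loop with copy_page_to_iter(), which also lets a short copy surface as a byte count instead of -EFAULT per chunk. The bookkeeping of scattering one source buffer across an iovec array looks roughly like this userspace analog; copy_to_iov and its cursor arguments are invented for the sketch, not kernel API:

#include <stdio.h>
#include <string.h>
#include <sys/uio.h>

static size_t copy_to_iov(struct iovec *iov, int *idx, size_t *off,
                          const char *src, size_t len)
{
    size_t copied = 0;

    while (len && iov[*idx].iov_len) {
        size_t n = iov[*idx].iov_len - *off;

        if (n > len)
            n = len;
        memcpy((char *)iov[*idx].iov_base + *off, src + copied, n);
        copied += n;
        len -= n;
        *off += n;
        if (*off == iov[*idx].iov_len) { /* advance to the next segment */
            (*idx)++;
            *off = 0;
        }
    }
    return copied;                       /* may be short, like the kernel helper */
}

int main(void)
{
    char a[4], b[8];
    struct iovec iov[] = { { a, sizeof(a) }, { b, sizeof(b) }, { NULL, 0 } };
    int idx = 0;
    size_t off = 0;
    size_t n = copy_to_iov(iov, &idx, &off, "hello world!", 12);

    printf("copied %zu: %.4s%.8s\n", n, a, b); /* copied 12: hello world! */
    return 0;
}
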
*/0); + ClearPageDirty(page); ClearPageUptodate(page); delete_from_page_cache(page); } @@ -416,7 +392,7 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct hstate *h = hstate_inode(inode); int error; unsigned int ia_valid = attr->ia_valid; @@ -610,7 +586,7 @@ static int hugetlbfs_migrate_page(struct address_space *mapping, static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb); - struct hstate *h = hstate_inode(dentry->d_inode); + struct hstate *h = hstate_inode(d_inode(dentry)); buf->f_type = HUGETLBFS_MAGIC; buf->f_bsize = huge_page_size(h); @@ -721,7 +697,7 @@ static void init_once(void *foo) } const struct file_operations hugetlbfs_file_operations = { - .read = hugetlbfs_read, + .read_iter = hugetlbfs_read_iter, .mmap = hugetlbfs_file_mmap, .fsync = noop_fsync, .get_unmapped_area = hugetlb_get_unmapped_area, @@ -754,14 +730,38 @@ static const struct super_operations hugetlbfs_ops = { .show_options = generic_show_options, }; +enum { NO_SIZE, SIZE_STD, SIZE_PERCENT }; + +/* + * Convert size option passed from command line to number of huge pages + * in the pool specified by hstate. Size option could be in bytes + * (val_type == SIZE_STD) or percentage of the pool (val_type == SIZE_PERCENT). + */ +static long long +hugetlbfs_size_to_hpages(struct hstate *h, unsigned long long size_opt, + int val_type) +{ + if (val_type == NO_SIZE) + return -1; + + if (val_type == SIZE_PERCENT) { + size_opt <<= huge_page_shift(h); + size_opt *= h->max_huge_pages; + do_div(size_opt, 100); + } + + size_opt >>= huge_page_shift(h); + return size_opt; +} + static int hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) { char *p, *rest; substring_t args[MAX_OPT_ARGS]; int option; - unsigned long long size = 0; - enum { NO_SIZE, SIZE_STD, SIZE_PERCENT } setsize = NO_SIZE; + unsigned long long max_size_opt = 0, min_size_opt = 0; + int max_val_type = NO_SIZE, min_val_type = NO_SIZE; if (!options) return 0; @@ -799,10 +799,10 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) /* memparse() will accept a K/M/G without a digit */ if (!isdigit(*args[0].from)) goto bad_val; - size = memparse(args[0].from, &rest); - setsize = SIZE_STD; + max_size_opt = memparse(args[0].from, &rest); + max_val_type = SIZE_STD; if (*rest == '%') - setsize = SIZE_PERCENT; + max_val_type = SIZE_PERCENT; break; } @@ -825,6 +825,17 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) break; } + case Opt_min_size: { + /* memparse() will accept a K/M/G without a digit */ + if (!isdigit(*args[0].from)) + goto bad_val; + min_size_opt = memparse(args[0].from, &rest); + min_val_type = SIZE_STD; + if (*rest == '%') + min_val_type = SIZE_PERCENT; + break; + } + default: pr_err("Bad mount option: \"%s\"\n", p); return -EINVAL; @@ -832,15 +843,22 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) } } - /* Do size after hstate is set up */ - if (setsize > NO_SIZE) { - struct hstate *h = pconfig->hstate; - if (setsize == SIZE_PERCENT) { - size <<= huge_page_shift(h); - size *= h->max_huge_pages; - do_div(size, 100); - } - pconfig->nr_blocks = (size >> huge_page_shift(h)); + /* + * Use huge page pool size (in hstate) to convert the size + * options to number of huge pages. If NO_SIZE, -1 is returned. 
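
[Editor's note] hugetlbfs_size_to_hpages() accepts either an absolute byte count or a percentage of the existing pool and normalizes both to a page count. The arithmetic extracted into standalone form; the 2 MiB page size (shift 21) and 512-page pool are example numbers, not defaults:

#include <stdio.h>

enum { NO_SIZE, SIZE_STD, SIZE_PERCENT };

static long long size_to_hpages(unsigned int page_shift,
                                unsigned long long pool_pages,
                                unsigned long long opt, int val_type)
{
    if (val_type == NO_SIZE)
        return -1;
    if (val_type == SIZE_PERCENT) {
        opt <<= page_shift;     /* percent of pool, in bytes */
        opt *= pool_pages;
        opt /= 100;
    }
    return opt >> page_shift;   /* bytes -> huge pages */
}

int main(void)
{
    /* "size=64M" with 2 MiB pages -> 32 pages */
    printf("%lld\n", size_to_hpages(21, 512, 64ULL << 20, SIZE_STD));
    /* "min_size=25%" of a 512-page pool -> 128 pages */
    printf("%lld\n", size_to_hpages(21, 512, 25, SIZE_PERCENT));
    return 0;
}
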
+ */ + pconfig->max_hpages = hugetlbfs_size_to_hpages(pconfig->hstate, + max_size_opt, max_val_type); + pconfig->min_hpages = hugetlbfs_size_to_hpages(pconfig->hstate, + min_size_opt, min_val_type); + + /* + * If max_size was specified, then min_size must be smaller + */ + if (max_val_type > NO_SIZE && + pconfig->min_hpages > pconfig->max_hpages) { + pr_err("minimum size can not be greater than maximum size\n"); + return -EINVAL; } return 0; @@ -859,12 +877,13 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent) save_mount_options(sb, data); - config.nr_blocks = -1; /* No limit on size by default */ + config.max_hpages = -1; /* No limit on size by default */ config.nr_inodes = -1; /* No limit on number of inodes by default */ config.uid = current_fsuid(); config.gid = current_fsgid(); config.mode = 0755; config.hstate = &default_hstate; + config.min_hpages = -1; /* No default minimum size */ ret = hugetlbfs_parse_options(data, &config); if (ret) return ret; @@ -878,8 +897,15 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent) sbinfo->max_inodes = config.nr_inodes; sbinfo->free_inodes = config.nr_inodes; sbinfo->spool = NULL; - if (config.nr_blocks != -1) { - sbinfo->spool = hugepage_new_subpool(config.nr_blocks); + /* + * Allocate and initialize subpool if maximum or minimum size is + * specified. Any needed reservations (for minimim size) are taken + * taken when the subpool is created. + */ + if (config.max_hpages != -1 || config.min_hpages != -1) { + sbinfo->spool = hugepage_new_subpool(config.hstate, + config.max_hpages, + config.min_hpages); if (!sbinfo->spool) goto out_free; } @@ -984,6 +1010,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size, inode = hugetlbfs_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0); if (!inode) goto out_dentry; + if (creat_flags == HUGETLB_SHMFS_INODE) + inode->i_flags |= S_PRIVATE; file = ERR_PTR(-ENOMEM); if (hugetlb_reserve_pages(inode, 0, diff --git a/fs/inode.c b/fs/inode.c index f00b16f45507..d30640f7a193 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -152,6 +152,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) inode->i_pipe = NULL; inode->i_bdev = NULL; inode->i_cdev = NULL; + inode->i_link = NULL; inode->i_rdev = 0; inode->dirtied_when = 0; @@ -223,6 +224,7 @@ EXPORT_SYMBOL(free_inode_nonrcu); void __destroy_inode(struct inode *inode) { BUG_ON(inode_has_buffers(inode)); + inode_detach_wb(inode); security_inode_free(inode); fsnotify_inode_delete(inode); locks_free_lock_context(inode->i_flctx); @@ -839,7 +841,11 @@ unsigned int get_next_ino(void) } #endif - *p = ++res; + res++; + /* get_next_ino should not provide a 0 inode number */ + if (unlikely(!res)) + res++; + *p = res; put_cpu_var(last_ino); return res; } @@ -1584,36 +1590,47 @@ static int update_time(struct inode *inode, struct timespec *time, int flags) * This function automatically handles read only file systems and media, * as well as the "noatime" flag and inode specific "noatime" markers. 
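
[Editor's note] The get_next_ino() hunk above closes a wraparound hole: after 2^32 allocations the counter would hand out inode number 0, which userspace tools commonly treat as "no inode". The guard in miniature (the per-cpu batching is dropped for clarity):

#include <stdio.h>

static unsigned int next_ino(unsigned int *last)
{
    unsigned int res = *last;

    res++;
    if (!res)        /* skip 0 on wraparound */
        res++;
    *last = res;
    return res;
}

int main(void)
{
    unsigned int last = 0xfffffffe;

    printf("%u\n", next_ino(&last));    /* 4294967295 */
    printf("%u\n", next_ino(&last));    /* 1, not 0 */
    printf("%u\n", next_ino(&last));    /* 2 */
    return 0;
}
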
*/ -void touch_atime(const struct path *path) +bool atime_needs_update(const struct path *path, struct inode *inode) { struct vfsmount *mnt = path->mnt; - struct inode *inode = path->dentry->d_inode; struct timespec now; if (inode->i_flags & S_NOATIME) - return; + return false; if (IS_NOATIME(inode)) - return; + return false; if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)) - return; + return false; if (mnt->mnt_flags & MNT_NOATIME) - return; + return false; if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)) - return; + return false; now = current_fs_time(inode->i_sb); if (!relatime_need_update(mnt, inode, now)) - return; + return false; if (timespec_equal(&inode->i_atime, &now)) + return false; + + return true; +} + +void touch_atime(const struct path *path) +{ + struct vfsmount *mnt = path->mnt; + struct inode *inode = d_inode(path->dentry); + struct timespec now; + + if (!atime_needs_update(path, inode)) return; if (!sb_start_write_trylock(inode->i_sb)) return; - if (__mnt_want_write(mnt)) + if (__mnt_want_write(mnt) != 0) goto skip_update; /* * File systems can error out when updating inodes if they need to @@ -1624,6 +1641,7 @@ void touch_atime(const struct path *path) * We may also fail on filesystems that have the ability to make parts * of the fs read only, e.g. subvolumes in Btrfs. */ + now = current_fs_time(inode->i_sb); update_time(inode, &now, S_ATIME); __mnt_drop_write(mnt); skip_update: @@ -1639,7 +1657,7 @@ EXPORT_SYMBOL(touch_atime); */ int should_remove_suid(struct dentry *dentry) { - umode_t mode = dentry->d_inode->i_mode; + umode_t mode = d_inode(dentry)->i_mode; int kill = 0; /* suid always must be killed */ @@ -1660,7 +1678,31 @@ int should_remove_suid(struct dentry *dentry) } EXPORT_SYMBOL(should_remove_suid); -static int __remove_suid(struct dentry *dentry, int kill) +/* + * Return mask of changes for notify_change() that need to be done as a + * response to write or truncate. Return 0 if nothing has to be changed. + * Negative value on error (change should be denied). + */ +int dentry_needs_remove_privs(struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + int mask = 0; + int ret; + + if (IS_NOSEC(inode)) + return 0; + + mask = should_remove_suid(dentry); + ret = security_inode_need_killpriv(dentry); + if (ret < 0) + return ret; + if (ret) + mask |= ATTR_KILL_PRIV; + return mask; +} +EXPORT_SYMBOL(dentry_needs_remove_privs); + +static int __remove_privs(struct dentry *dentry, int kill) { struct iattr newattrs; @@ -1672,33 +1714,32 @@ static int __remove_suid(struct dentry *dentry, int kill) return notify_change(dentry, &newattrs, NULL); } -int file_remove_suid(struct file *file) +/* + * Remove special file priviledges (suid, capabilities) when file is written + * to or truncated. 
+ */ +int file_remove_privs(struct file *file) { struct dentry *dentry = file->f_path.dentry; - struct inode *inode = dentry->d_inode; - int killsuid; - int killpriv; + struct inode *inode = d_inode(dentry); + int kill; int error = 0; /* Fast path for nothing security related */ if (IS_NOSEC(inode)) return 0; - killsuid = should_remove_suid(dentry); - killpriv = security_inode_need_killpriv(dentry); - - if (killpriv < 0) - return killpriv; - if (killpriv) - error = security_inode_killpriv(dentry); - if (!error && killsuid) - error = __remove_suid(dentry, killsuid); - if (!error && (inode->i_sb->s_flags & MS_NOSEC)) - inode->i_flags |= S_NOSEC; + kill = file_needs_remove_privs(file); + if (kill < 0) + return kill; + if (kill) + error = __remove_privs(dentry, kill); + if (!error) + inode_has_no_xattr(inode); return error; } -EXPORT_SYMBOL(file_remove_suid); +EXPORT_SYMBOL(file_remove_privs); /** * file_update_time - update mtime and ctime time @@ -1946,20 +1987,6 @@ void inode_dio_wait(struct inode *inode) EXPORT_SYMBOL(inode_dio_wait); /* - * inode_dio_done - signal finish of a direct I/O requests - * @inode: inode the direct I/O happens on - * - * This is called once we've finished processing a direct I/O request, - * and is used to wake up callers waiting for direct I/O to be quiesced. - */ -void inode_dio_done(struct inode *inode) -{ - if (atomic_dec_and_test(&inode->i_dio_count)) - wake_up_bit(&inode->i_state, __I_DIO_WAKEUP); -} -EXPORT_SYMBOL(inode_dio_done); - -/* * inode_set_flags - atomically set some inode flags * * Note: the caller should be holding i_mutex, or else be sure that @@ -1967,9 +1994,8 @@ EXPORT_SYMBOL(inode_dio_done); * inode is being instantiated). The reason for the cmpxchg() loop * --- which wouldn't be necessary if all code paths which modify * i_flags actually followed this rule, is that there is at least one - * code path which doesn't today --- for example, - * __generic_file_aio_write() calls file_remove_suid() without holding - * i_mutex --- so we use cmpxchg() out of an abundance of caution. + * code path which doesn't today so we use cmpxchg() out of an abundance + * of caution. 
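
[Editor's note] file_remove_privs() is the machinery that makes the setuid bit (and file capabilities) fall off when a file is written to or truncated. The effect is observable from userspace; this demo assumes it runs as the file's unprivileged owner on an ordinary filesystem (a writer with CAP_FSETID keeps the bit):

#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/stat.h>

int main(void)
{
    struct stat st;
    int fd = open("victim", O_CREAT | O_WRONLY | O_TRUNC, 0755);

    fchmod(fd, 04755);      /* set S_ISUID */
    write(fd, "x", 1);      /* write path calls file_remove_privs() */
    fstat(fd, &st);
    printf("suid %s\n", (st.st_mode & S_ISUID) ? "kept" : "dropped");
    close(fd);
    unlink("victim");
    return 0;
}
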
* * In the long run, i_mutex is overkill, and we should probably look * at using the i_lock spinlock to protect i_flags, and then make sure diff --git a/fs/internal.h b/fs/internal.h index 01dce1d1476b..4d5af583ab03 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -107,6 +107,7 @@ extern struct file *do_file_open_root(struct dentry *, struct vfsmount *, extern long do_handle_open(int mountdirfd, struct file_handle __user *ufh, int open_flag); extern int open_check_o_direct(struct file *f); +extern int vfs_open(const struct path *, struct file *, const struct cred *); /* * inode.c diff --git a/fs/isofs/export.c b/fs/isofs/export.c index 12088d8de3fa..0c5f721b4e91 100644 --- a/fs/isofs/export.c +++ b/fs/isofs/export.c @@ -44,7 +44,7 @@ static struct dentry *isofs_export_get_parent(struct dentry *child) { unsigned long parent_block = 0; unsigned long parent_offset = 0; - struct inode *child_inode = child->d_inode; + struct inode *child_inode = d_inode(child); struct iso_inode_info *e_child_inode = ISOFS_I(child_inode); struct iso_directory_record *de = NULL; struct buffer_head * bh = NULL; diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 988b32ed4c87..4227dc4f7437 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -390,7 +390,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal) unsigned long blocknr; if (is_journal_aborted(journal)) - return 1; + return -EIO; if (!jbd2_journal_get_log_tail(journal, &first_tid, &blocknr)) return 1; @@ -405,10 +405,9 @@ int jbd2_cleanup_journal_tail(journal_t *journal) * jbd2_cleanup_journal_tail() doesn't get called all that often. */ if (journal->j_flags & JBD2_BARRIER) - blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL); + blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL); - __jbd2_update_log_tail(journal, first_tid, blocknr); - return 0; + return __jbd2_update_log_tail(journal, first_tid, blocknr); } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index b96bd8076b70..4ff3fad4e9e3 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -371,16 +371,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, */ J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); -retry_alloc: - new_bh = alloc_buffer_head(GFP_NOFS); - if (!new_bh) { - /* - * Failure is not an option, but __GFP_NOFAIL is going - * away; so we retry ourselves here. - */ - congestion_wait(BLK_RW_ASYNC, HZ/50); - goto retry_alloc; - } + new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); /* keep subsequent assertions sane */ atomic_set(&new_bh->b_count, 1); @@ -885,9 +876,10 @@ int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid, * * Requires j_checkpoint_mutex */ -void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) +int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) { unsigned long freed; + int ret; BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); @@ -897,7 +889,10 @@ void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) * space and if we lose sb update during power failure we'd replay * old transaction with possibly newly overwritten data. 
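
[Editor's note] The cmpxchg() loop that the inode_set_flags() comment above refers to is the usual read-modify-write retry: recompute the new value from the freshest snapshot until the compare-and-swap wins. A C11 rendition for reference; set_mask_bits here mirrors the kernel helper of the same name but is a self-contained reimplementation:

#include <stdatomic.h>
#include <stdio.h>

static void set_mask_bits(_Atomic unsigned int *flags,
                          unsigned int mask, unsigned int bits)
{
    unsigned int old = atomic_load(flags);
    unsigned int new;

    do {
        new = (old & ~mask) | bits;  /* recompute from the latest value */
    } while (!atomic_compare_exchange_weak(flags, &old, new));
}

int main(void)
{
    _Atomic unsigned int i_flags = 0x10;

    set_mask_bits(&i_flags, 0x0f, 0x05);
    printf("0x%x\n", atomic_load(&i_flags));  /* 0x15 */
    return 0;
}
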
*/ - jbd2_journal_update_sb_log_tail(journal, tid, block, WRITE_FUA); + ret = jbd2_journal_update_sb_log_tail(journal, tid, block, WRITE_FUA); + if (ret) + goto out; + write_lock(&journal->j_state_lock); freed = block - journal->j_tail; if (block < journal->j_tail) @@ -913,6 +908,9 @@ void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) journal->j_tail_sequence = tid; journal->j_tail = block; write_unlock(&journal->j_state_lock); + +out: + return ret; } /* @@ -1137,7 +1135,6 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev, { journal_t *journal = journal_init_common(); struct buffer_head *bh; - char *p; int n; if (!journal) @@ -1150,9 +1147,7 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev, journal->j_blk_offset = start; journal->j_maxlen = len; bdevname(journal->j_dev, journal->j_devname); - p = journal->j_devname; - while ((p = strchr(p, '/'))) - *p = '!'; + strreplace(journal->j_devname, '/', '!'); jbd2_stats_proc_init(journal); n = journal->j_blocksize / sizeof(journal_block_tag_t); journal->j_wbufsize = n; @@ -1204,10 +1199,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode) journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev; journal->j_inode = inode; bdevname(journal->j_dev, journal->j_devname); - p = journal->j_devname; - while ((p = strchr(p, '/'))) - *p = '!'; - p = journal->j_devname + strlen(journal->j_devname); + p = strreplace(journal->j_devname, '/', '!'); sprintf(p, "-%lu", journal->j_inode->i_ino); jbd_debug(1, "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n", @@ -1331,7 +1323,7 @@ static int journal_reset(journal_t *journal) return jbd2_journal_start_thread(journal); } -static void jbd2_write_superblock(journal_t *journal, int write_op) +static int jbd2_write_superblock(journal_t *journal, int write_op) { struct buffer_head *bh = journal->j_sb_buffer; journal_superblock_t *sb = journal->j_superblock; @@ -1370,7 +1362,10 @@ static void jbd2_write_superblock(journal_t *journal, int write_op) printk(KERN_ERR "JBD2: Error %d detected when updating " "journal superblock for %s.\n", ret, journal->j_devname); + jbd2_journal_abort(journal, ret); } + + return ret; } /** @@ -1383,10 +1378,11 @@ static void jbd2_write_superblock(journal_t *journal, int write_op) * Update a journal's superblock information about log tail and write it to * disk, waiting for the IO to complete. 
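
[Editor's note] Both journal_init_dev() and journal_init_inode() now use strreplace() instead of an open-coded strchr() loop. The lib/string.c helper returns a pointer to the terminating NUL, which is what lets the inode variant append its "-<ino>" suffix without a second strlen(). In miniature:

#include <stdio.h>

static char *strreplace(char *s, char old, char new)
{
    for (; *s; ++s)
        if (*s == old)
            *s = new;
    return s;           /* points at the terminating NUL */
}

int main(void)
{
    char devname[32] = "dm-0/backing";
    char *end = strreplace(devname, '/', '!');

    sprintf(end, "-%lu", 42UL);     /* like journal_init_inode() */
    printf("%s\n", devname);        /* dm-0!backing-42 */
    return 0;
}
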
*/ -void jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, +int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, unsigned long tail_block, int write_op) { journal_superblock_t *sb = journal->j_superblock; + int ret; BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex)); jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n", @@ -1395,13 +1391,18 @@ void jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, sb->s_sequence = cpu_to_be32(tail_tid); sb->s_start = cpu_to_be32(tail_block); - jbd2_write_superblock(journal, write_op); + ret = jbd2_write_superblock(journal, write_op); + if (ret) + goto out; /* Log is no longer empty */ write_lock(&journal->j_state_lock); WARN_ON(!sb->s_sequence); journal->j_flags &= ~JBD2_FLUSHED; write_unlock(&journal->j_state_lock); + +out: + return ret; } /** @@ -1950,7 +1951,14 @@ int jbd2_journal_flush(journal_t *journal) return -EIO; mutex_lock(&journal->j_checkpoint_mutex); - jbd2_cleanup_journal_tail(journal); + if (!err) { + err = jbd2_cleanup_journal_tail(journal); + if (err < 0) { + mutex_unlock(&journal->j_checkpoint_mutex); + goto out; + } + err = 0; + } /* Finally, mark the journal as really needing no recovery. * This sets s_start==0 in the underlying superblock, which is @@ -1966,7 +1974,8 @@ int jbd2_journal_flush(journal_t *journal) J_ASSERT(journal->j_head == journal->j_tail); J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence); write_unlock(&journal->j_state_lock); - return 0; +out: + return err; } /** @@ -2330,7 +2339,7 @@ static int jbd2_journal_init_journal_head_cache(void) jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head", sizeof(struct journal_head), 0, /* offset */ - SLAB_TEMPORARY, /* flags */ + SLAB_TEMPORARY | SLAB_DESTROY_BY_RCU, NULL); /* ctor */ retval = 0; if (!jbd2_journal_head_cache) { @@ -2362,10 +2371,8 @@ static struct journal_head *journal_alloc_journal_head(void) if (!ret) { jbd_debug(1, "out of memory for journal_head\n"); pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__); - while (!ret) { - yield(); - ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS); - } + ret = kmem_cache_zalloc(jbd2_journal_head_cache, + GFP_NOFS | __GFP_NOFAIL); } return ret; } diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index b5128c6e63ad..a9079d035ae5 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -842,15 +842,23 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, { jbd2_journal_revoke_header_t *header; int offset, max; + int csum_size = 0; + __u32 rcount; int record_len = 4; header = (jbd2_journal_revoke_header_t *) bh->b_data; offset = sizeof(jbd2_journal_revoke_header_t); - max = be32_to_cpu(header->r_count); + rcount = be32_to_cpu(header->r_count); if (!jbd2_revoke_block_csum_verify(journal, header)) return -EINVAL; + if (jbd2_journal_has_csum_v2or3(journal)) + csum_size = sizeof(struct jbd2_journal_revoke_tail); + if (rcount > journal->j_blocksize - csum_size) + return -EINVAL; + max = rcount; + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) record_len = 8; diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index c6cbaef2bda1..0abf2e7f725b 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -141,11 +141,13 @@ static int insert_revoke_hash(journal_t *journal, unsigned long long blocknr, { struct list_head *hash_list; struct jbd2_revoke_record_s *record; + gfp_t gfp_mask = GFP_NOFS; -repeat: - record = kmem_cache_alloc(jbd2_revoke_record_cache, GFP_NOFS); + if 
(journal_oom_retry) + gfp_mask |= __GFP_NOFAIL; + record = kmem_cache_alloc(jbd2_revoke_record_cache, gfp_mask); if (!record) - goto oom; + return -ENOMEM; record->sequence = seq; record->blocknr = blocknr; @@ -154,13 +156,6 @@ repeat: list_add(&record->hash, hash_list); spin_unlock(&journal->j_revoke_lock); return 0; - -oom: - if (!journal_oom_retry) - return -ENOMEM; - jbd_debug(1, "ENOMEM in %s, retrying\n", __func__); - yield(); - goto repeat; } /* Find a revoke record in the journal's hash table. */ @@ -577,7 +572,7 @@ static void write_one_revoke_record(journal_t *journal, { int csum_size = 0; struct buffer_head *descriptor; - int offset; + int sz, offset; journal_header_t *header; /* If we are already aborting, this all becomes a noop. We @@ -594,9 +589,14 @@ static void write_one_revoke_record(journal_t *journal, if (jbd2_journal_has_csum_v2or3(journal)) csum_size = sizeof(struct jbd2_journal_revoke_tail); + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) + sz = 8; + else + sz = 4; + /* Make sure we have a descriptor with space left for the record */ if (descriptor) { - if (offset >= journal->j_blocksize - csum_size) { + if (offset + sz > journal->j_blocksize - csum_size) { flush_descriptor(journal, descriptor, offset, write_op); descriptor = NULL; } @@ -619,16 +619,13 @@ static void write_one_revoke_record(journal_t *journal, *descriptorp = descriptor; } - if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) { + if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT)) * ((__be64 *)(&descriptor->b_data[offset])) = cpu_to_be64(record->blocknr); - offset += 8; - - } else { + else * ((__be32 *)(&descriptor->b_data[offset])) = cpu_to_be32(record->blocknr); - offset += 4; - } + offset += sz; *offsetp = offset; } diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 5f09370c90a8..f3d06174b051 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -278,22 +278,16 @@ static int start_this_handle(journal_t *journal, handle_t *handle, alloc_transaction: if (!journal->j_running_transaction) { + /* + * If __GFP_FS is not present, then we may be being called from + * inside the fs writeback layer, so we MUST NOT fail. + */ + if ((gfp_mask & __GFP_FS) == 0) + gfp_mask |= __GFP_NOFAIL; new_transaction = kmem_cache_zalloc(transaction_cache, gfp_mask); - if (!new_transaction) { - /* - * If __GFP_FS is not present, then we may be - * being called from inside the fs writeback - * layer, so we MUST NOT fail. Since - * __GFP_NOFAIL is going away, we will arrange - * to retry the allocation ourselves. - */ - if ((gfp_mask & __GFP_FS) == 0) { - congestion_wait(BLK_RW_ASYNC, HZ/50); - goto alloc_transaction; - } + if (!new_transaction) return -ENOMEM; - } } jbd_debug(3, "New handle %p going live.\n", handle); @@ -551,7 +545,6 @@ int jbd2_journal_extend(handle_t *handle, int nblocks) int result; int wanted; - WARN_ON(!transaction); if (is_handle_aborted(handle)) return -EROFS; journal = transaction->t_journal; @@ -627,7 +620,6 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, gfp_t gfp_mask) tid_t tid; int need_to_start, ret; - WARN_ON(!transaction); /* If we've had an abort of any type, don't even think about * actually doing the restart! */ if (is_handle_aborted(handle)) @@ -763,6 +755,30 @@ static void warn_dirty_buffer(struct buffer_head *bh) bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); } +/* Call t_frozen trigger and copy buffer data into jh->b_frozen_data. 
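
[Editor's note] The write_one_revoke_record() change fixes a subtle bound: the old test (offset >= blocksize - csum_size) only checked that a record could start before the limit, so an 8-byte 64-bit record beginning just under it would overrun the checksum tail. Reduced to arithmetic, with example numbers:

#include <stdio.h>

static int record_fits(int offset, int sz, int blocksize, int csum_size)
{
    return offset + sz <= blocksize - csum_size;
}

int main(void)
{
    int blocksize = 4096, csum_size = 4;

    /* 8-byte record at offset 4088: the old check passed (4088 < 4092),
     * but the record would end at 4096 and clobber the checksum tail. */
    printf("old: %d new: %d\n",
           4088 < blocksize - csum_size,
           record_fits(4088, 8, blocksize, csum_size));  /* old: 1 new: 0 */
    return 0;
}
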
*/ +static void jbd2_freeze_jh_data(struct journal_head *jh) +{ + struct page *page; + int offset; + char *source; + struct buffer_head *bh = jh2bh(jh); + + J_EXPECT_JH(jh, buffer_uptodate(bh), "Possible IO failure.\n"); + page = bh->b_page; + offset = offset_in_page(bh->b_data); + source = kmap_atomic(page); + /* Fire data frozen trigger just before we copy the data */ + jbd2_buffer_frozen_trigger(jh, source + offset, jh->b_triggers); + memcpy(jh->b_frozen_data, source + offset, bh->b_size); + kunmap_atomic(source); + + /* + * Now that the frozen data is saved off, we need to store any matching + * triggers. + */ + jh->b_frozen_triggers = jh->b_triggers; +} + /* * If the buffer is already part of the current transaction, then there * is nothing we need to do. If it is already part of a prior @@ -782,10 +798,8 @@ do_get_write_access(handle_t *handle, struct journal_head *jh, journal_t *journal; int error; char *frozen_buffer = NULL; - int need_copy = 0; unsigned long start_lock, time_lock; - WARN_ON(!transaction); if (is_handle_aborted(handle)) return -EROFS; journal = transaction->t_journal; @@ -870,119 +884,96 @@ repeat: jh->b_modified = 0; /* + * If the buffer is not journaled right now, we need to make sure it + * doesn't get written to disk before the caller actually commits the + * new data + */ + if (!jh->b_transaction) { + JBUFFER_TRACE(jh, "no transaction"); + J_ASSERT_JH(jh, !jh->b_next_transaction); + JBUFFER_TRACE(jh, "file as BJ_Reserved"); + /* + * Make sure all stores to jh (b_modified, b_frozen_data) are + * visible before attaching it to the running transaction. + * Paired with barrier in jbd2_write_access_granted() + */ + smp_wmb(); + spin_lock(&journal->j_list_lock); + __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved); + spin_unlock(&journal->j_list_lock); + goto done; + } + /* * If there is already a copy-out version of this buffer, then we don't * need to make another one */ if (jh->b_frozen_data) { JBUFFER_TRACE(jh, "has frozen data"); J_ASSERT_JH(jh, jh->b_next_transaction == NULL); - jh->b_next_transaction = transaction; - goto done; + goto attach_next; } - /* Is there data here we need to preserve? */ + JBUFFER_TRACE(jh, "owned by older transaction"); + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + J_ASSERT_JH(jh, jh->b_transaction == journal->j_committing_transaction); - if (jh->b_transaction && jh->b_transaction != transaction) { - JBUFFER_TRACE(jh, "owned by older transaction"); - J_ASSERT_JH(jh, jh->b_next_transaction == NULL); - J_ASSERT_JH(jh, jh->b_transaction == - journal->j_committing_transaction); + /* + * There is one case we have to be very careful about. If the + * committing transaction is currently writing this buffer out to disk + * and has NOT made a copy-out, then we cannot modify the buffer + * contents at all right now. The essence of copy-out is that it is + * the extra copy, not the primary copy, which gets journaled. If the + * primary copy is already going to disk then we cannot do copy-out + * here. + */ + if (buffer_shadow(bh)) { + JBUFFER_TRACE(jh, "on shadow: sleep"); + jbd_unlock_bh_state(bh); + wait_on_bit_io(&bh->b_state, BH_Shadow, TASK_UNINTERRUPTIBLE); + goto repeat; + } - /* There is one case we have to be very careful about. - * If the committing transaction is currently writing - * this buffer out to disk and has NOT made a copy-out, - * then we cannot modify the buffer contents at all - * right now. The essence of copy-out is that it is the - * extra copy, not the primary copy, which gets - * journaled. 
If the primary copy is already going to - * disk then we cannot do copy-out here. */ - - if (buffer_shadow(bh)) { - JBUFFER_TRACE(jh, "on shadow: sleep"); + /* + * Only do the copy if the currently-owning transaction still needs it. + * If buffer isn't on BJ_Metadata list, the committing transaction is + * past that stage (here we use the fact that BH_Shadow is set under + * bh_state lock together with refiling to BJ_Shadow list and at this + * point we know the buffer doesn't have BH_Shadow set). + * + * Subtle point, though: if this is a get_undo_access, then we will be + * relying on the frozen_data to contain the new value of the + * committed_data record after the transaction, so we HAVE to force the + * frozen_data copy in that case. + */ + if (jh->b_jlist == BJ_Metadata || force_copy) { + JBUFFER_TRACE(jh, "generate frozen data"); + if (!frozen_buffer) { + JBUFFER_TRACE(jh, "allocate memory for buffer"); jbd_unlock_bh_state(bh); - wait_on_bit_io(&bh->b_state, BH_Shadow, - TASK_UNINTERRUPTIBLE); - goto repeat; - } - - /* - * Only do the copy if the currently-owning transaction still - * needs it. If buffer isn't on BJ_Metadata list, the - * committing transaction is past that stage (here we use the - * fact that BH_Shadow is set under bh_state lock together with - * refiling to BJ_Shadow list and at this point we know the - * buffer doesn't have BH_Shadow set). - * - * Subtle point, though: if this is a get_undo_access, - * then we will be relying on the frozen_data to contain - * the new value of the committed_data record after the - * transaction, so we HAVE to force the frozen_data copy - * in that case. - */ - if (jh->b_jlist == BJ_Metadata || force_copy) { - JBUFFER_TRACE(jh, "generate frozen data"); + frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS); if (!frozen_buffer) { - JBUFFER_TRACE(jh, "allocate memory for buffer"); - jbd_unlock_bh_state(bh); - frozen_buffer = - jbd2_alloc(jh2bh(jh)->b_size, - GFP_NOFS); - if (!frozen_buffer) { - printk(KERN_ERR - "%s: OOM for frozen_buffer\n", - __func__); - JBUFFER_TRACE(jh, "oom!"); - error = -ENOMEM; - jbd_lock_bh_state(bh); - goto done; - } - goto repeat; + printk(KERN_ERR "%s: OOM for frozen_buffer\n", + __func__); + JBUFFER_TRACE(jh, "oom!"); + error = -ENOMEM; + goto out; } - jh->b_frozen_data = frozen_buffer; - frozen_buffer = NULL; - need_copy = 1; + goto repeat; } - jh->b_next_transaction = transaction; + jh->b_frozen_data = frozen_buffer; + frozen_buffer = NULL; + jbd2_freeze_jh_data(jh); } - - +attach_next: /* - * Finally, if the buffer is not journaled right now, we need to make - * sure it doesn't get written to disk before the caller actually - * commits the new data + * Make sure all stores to jh (b_modified, b_frozen_data) are visible + * before attaching it to the running transaction. 
Paired with barrier + * in jbd2_write_access_granted() */ - if (!jh->b_transaction) { - JBUFFER_TRACE(jh, "no transaction"); - J_ASSERT_JH(jh, !jh->b_next_transaction); - JBUFFER_TRACE(jh, "file as BJ_Reserved"); - spin_lock(&journal->j_list_lock); - __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved); - spin_unlock(&journal->j_list_lock); - } + smp_wmb(); + jh->b_next_transaction = transaction; done: - if (need_copy) { - struct page *page; - int offset; - char *source; - - J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)), - "Possible IO failure.\n"); - page = jh2bh(jh)->b_page; - offset = offset_in_page(jh2bh(jh)->b_data); - source = kmap_atomic(page); - /* Fire data frozen trigger just before we copy the data */ - jbd2_buffer_frozen_trigger(jh, source + offset, - jh->b_triggers); - memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); - kunmap_atomic(source); - - /* - * Now that the frozen data is saved off, we need to store - * any matching triggers. - */ - jh->b_frozen_triggers = jh->b_triggers; - } jbd_unlock_bh_state(bh); /* @@ -999,6 +990,55 @@ out: return error; } +/* Fast check whether buffer is already attached to the required transaction */ +static bool jbd2_write_access_granted(handle_t *handle, struct buffer_head *bh) +{ + struct journal_head *jh; + bool ret = false; + + /* Dirty buffers require special handling... */ + if (buffer_dirty(bh)) + return false; + + /* + * RCU protects us from dereferencing freed pages. So the checks we do + * are guaranteed not to oops. However the jh slab object can get freed + * & reallocated while we work with it. So we have to be careful. When + * we see jh attached to the running transaction, we know it must stay + * so until the transaction is committed. Thus jh won't be freed and + * will be attached to the same bh while we run. However it can + * happen jh gets freed, reallocated, and attached to the transaction + * just after we get pointer to it from bh. So we have to be careful + * and recheck jh still belongs to our bh before we return success. + */ + rcu_read_lock(); + if (!buffer_jbd(bh)) + goto out; + /* This should be bh2jh() but that doesn't work with inline functions */ + jh = READ_ONCE(bh->b_private); + if (!jh) + goto out; + if (jh->b_transaction != handle->h_transaction && + jh->b_next_transaction != handle->h_transaction) + goto out; + /* + * There are two reasons for the barrier here: + * 1) Make sure to fetch b_bh after we did previous checks so that we + * detect when jh went through free, realloc, attach to transaction + * while we were checking. Paired with implicit barrier in that path. + * 2) So that access to bh done after jbd2_write_access_granted() + * doesn't get reordered and see inconsistent state of concurrent + * do_get_write_access(). + */ + smp_mb(); + if (unlikely(jh->b_bh != bh)) + goto out; + ret = true; +out: + rcu_read_unlock(); + return ret; +} + /** * int jbd2_journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update. * @handle: transaction to add buffer modifications to @@ -1012,9 +1052,13 @@ out: int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh) { - struct journal_head *jh = jbd2_journal_add_journal_head(bh); + struct journal_head *jh; int rc; + if (jbd2_write_access_granted(handle, bh)) + return 0; + + jh = jbd2_journal_add_journal_head(bh); /* We do not want to get caught playing with fields which the * log thread also manipulates. Make sure that the buffer * completes any outstanding IO before proceeding. 
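
[Editor's note] jbd2_write_access_granted() is a classic lockless publish/observe protocol: the writer issues smp_wmb() before attaching jh to the transaction, and the reader's smp_mb() plus the recheck of jh->b_bh guards against the jh being freed and reused in between. The barrier pairing, translated into C11 release/acquire so it can actually be run; the two-field struct is a stand-in, not the real journal_head:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct jh { int b_modified; };

static struct jh obj;
static _Atomic(struct jh *) published;

static void *writer(void *arg)
{
    (void)arg;
    obj.b_modified = 1;                          /* stores to jh ... */
    atomic_store_explicit(&published, &obj,
                          memory_order_release); /* ... then publish */
    return NULL;
}

int main(void)
{
    pthread_t t;
    struct jh *p;

    pthread_create(&t, NULL, writer, NULL);
    while (!(p = atomic_load_explicit(&published, memory_order_acquire)))
        ;                                        /* spin until visible */
    printf("b_modified=%d\n", p->b_modified);    /* guaranteed to be 1 */
    pthread_join(t, NULL);
    return 0;
}
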
*/ @@ -1051,7 +1095,6 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) int err; jbd_debug(5, "journal_head %p\n", jh); - WARN_ON(!transaction); err = -EROFS; if (is_handle_aborted(handle)) goto out; @@ -1145,11 +1188,14 @@ out: int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh) { int err; - struct journal_head *jh = jbd2_journal_add_journal_head(bh); + struct journal_head *jh; char *committed_data = NULL; JBUFFER_TRACE(jh, "entry"); + if (jbd2_write_access_granted(handle, bh)) + return 0; + jh = jbd2_journal_add_journal_head(bh); /* * Do this first --- it can drop the journal lock, so we want to * make sure that obtaining the committed_data is done @@ -1266,7 +1312,6 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) struct journal_head *jh; int ret = 0; - WARN_ON(!transaction); if (is_handle_aborted(handle)) return -EROFS; journal = transaction->t_journal; @@ -1397,7 +1442,6 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) int err = 0; int was_modified = 0; - WARN_ON(!transaction); if (is_handle_aborted(handle)) return -EROFS; journal = transaction->t_journal; @@ -1530,8 +1574,22 @@ int jbd2_journal_stop(handle_t *handle) tid_t tid; pid_t pid; - if (!transaction) - goto free_and_exit; + if (!transaction) { + /* + * Handle is already detached from the transaction so + * there is nothing to do other than decrease a refcount, + * or free the handle if refcount drops to zero + */ + if (--handle->h_ref > 0) { + jbd_debug(4, "h_ref %d -> %d\n", handle->h_ref + 1, + handle->h_ref); + return err; + } else { + if (handle->h_rsv_handle) + jbd2_free_handle(handle->h_rsv_handle); + goto free_and_exit; + } + } journal = transaction->t_journal; J_ASSERT(journal_current_handle() == handle); @@ -2373,7 +2431,6 @@ int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode) transaction_t *transaction = handle->h_transaction; journal_t *journal; - WARN_ON(!transaction); if (is_handle_aborted(handle)) return -EROFS; journal = transaction->t_journal; diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index f21b6fb5e4c4..81180022923f 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -224,14 +224,14 @@ static int jffs2_unlink(struct inode *dir_i, struct dentry *dentry) { struct jffs2_sb_info *c = JFFS2_SB_INFO(dir_i->i_sb); struct jffs2_inode_info *dir_f = JFFS2_INODE_INFO(dir_i); - struct jffs2_inode_info *dead_f = JFFS2_INODE_INFO(dentry->d_inode); + struct jffs2_inode_info *dead_f = JFFS2_INODE_INFO(d_inode(dentry)); int ret; uint32_t now = get_seconds(); ret = jffs2_do_unlink(c, dir_f, dentry->d_name.name, dentry->d_name.len, dead_f, now); if (dead_f->inocache) - set_nlink(dentry->d_inode, dead_f->inocache->pino_nlink); + set_nlink(d_inode(dentry), dead_f->inocache->pino_nlink); if (!ret) dir_i->i_mtime = dir_i->i_ctime = ITIME(now); return ret; @@ -241,8 +241,8 @@ static int jffs2_unlink(struct inode *dir_i, struct dentry *dentry) static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct dentry *dentry) { - struct jffs2_sb_info *c = JFFS2_SB_INFO(old_dentry->d_inode->i_sb); - struct jffs2_inode_info *f = JFFS2_INODE_INFO(old_dentry->d_inode); + struct jffs2_sb_info *c = JFFS2_SB_INFO(d_inode(old_dentry)->i_sb); + struct jffs2_inode_info *f = JFFS2_INODE_INFO(d_inode(old_dentry)); struct jffs2_inode_info *dir_f = JFFS2_INODE_INFO(dir_i); int ret; uint8_t type; @@ -256,7 +256,7 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de return -EPERM; /* 
XXX: This is ugly */ - type = (old_dentry->d_inode->i_mode & S_IFMT) >> 12; + type = (d_inode(old_dentry)->i_mode & S_IFMT) >> 12; if (!type) type = DT_REG; now = get_seconds(); @@ -264,11 +264,11 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de if (!ret) { mutex_lock(&f->sem); - set_nlink(old_dentry->d_inode, ++f->inocache->pino_nlink); + set_nlink(d_inode(old_dentry), ++f->inocache->pino_nlink); mutex_unlock(&f->sem); - d_instantiate(dentry, old_dentry->d_inode); + d_instantiate(dentry, d_inode(old_dentry)); dir_i->i_mtime = dir_i->i_ctime = ITIME(now); - ihold(old_dentry->d_inode); + ihold(d_inode(old_dentry)); } return ret; } @@ -354,6 +354,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char ret = -ENOMEM; goto fail; } + inode->i_link = f->target; jffs2_dbg(1, "%s(): symlink's target '%s' cached\n", __func__, (char *)f->target); @@ -585,7 +586,7 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry) { struct jffs2_sb_info *c = JFFS2_SB_INFO(dir_i->i_sb); struct jffs2_inode_info *dir_f = JFFS2_INODE_INFO(dir_i); - struct jffs2_inode_info *f = JFFS2_INODE_INFO(dentry->d_inode); + struct jffs2_inode_info *f = JFFS2_INODE_INFO(d_inode(dentry)); struct jffs2_full_dirent *fd; int ret; uint32_t now = get_seconds(); @@ -599,7 +600,7 @@ static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry) dentry->d_name.len, f, now); if (!ret) { dir_i->i_mtime = dir_i->i_ctime = ITIME(now); - clear_nlink(dentry->d_inode); + clear_nlink(d_inode(dentry)); drop_nlink(dir_i); } return ret; @@ -770,8 +771,8 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, * the VFS can't check whether the victim is empty. The filesystem * needs to do that for itself. */ - if (new_dentry->d_inode) { - victim_f = JFFS2_INODE_INFO(new_dentry->d_inode); + if (d_really_is_positive(new_dentry)) { + victim_f = JFFS2_INODE_INFO(d_inode(new_dentry)); if (d_is_dir(new_dentry)) { struct jffs2_full_dirent *fd; @@ -794,12 +795,12 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, /* Make a hard link */ /* XXX: This is ugly */ - type = (old_dentry->d_inode->i_mode & S_IFMT) >> 12; + type = (d_inode(old_dentry)->i_mode & S_IFMT) >> 12; if (!type) type = DT_REG; now = get_seconds(); ret = jffs2_do_link(c, JFFS2_INODE_INFO(new_dir_i), - old_dentry->d_inode->i_ino, type, + d_inode(old_dentry)->i_ino, type, new_dentry->d_name.name, new_dentry->d_name.len, now); if (ret) @@ -808,9 +809,9 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, if (victim_f) { /* There was a victim. Kill it off nicely */ if (d_is_dir(new_dentry)) - clear_nlink(new_dentry->d_inode); + clear_nlink(d_inode(new_dentry)); else - drop_nlink(new_dentry->d_inode); + drop_nlink(d_inode(new_dentry)); /* Don't oops if the victim was a dirent pointing to an inode which didn't exist. */ if (victim_f->inocache) { @@ -836,9 +837,9 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, if (ret) { /* Oh shit. 
We really ought to make a single node which can do both atomically */ - struct jffs2_inode_info *f = JFFS2_INODE_INFO(old_dentry->d_inode); + struct jffs2_inode_info *f = JFFS2_INODE_INFO(d_inode(old_dentry)); mutex_lock(&f->sem); - inc_nlink(old_dentry->d_inode); + inc_nlink(d_inode(old_dentry)); if (f->inocache && !d_is_dir(old_dentry)) f->inocache->pino_nlink++; mutex_unlock(&f->sem); @@ -846,8 +847,8 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, pr_notice("%s(): Link succeeded, unlink failed (err %d). You now have a hard link\n", __func__, ret); /* Might as well let the VFS know */ - d_instantiate(new_dentry, old_dentry->d_inode); - ihold(old_dentry->d_inode); + d_instantiate(new_dentry, d_inode(old_dentry)); + ihold(d_inode(old_dentry)); new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now); return ret; } diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 64989ca9ba90..f509f62e12f6 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -51,9 +51,7 @@ const struct file_operations jffs2_file_operations = { .llseek = generic_file_llseek, .open = generic_file_open, - .read = new_sync_read, .read_iter = generic_file_read_iter, - .write = new_sync_write, .write_iter = generic_file_write_iter, .unlocked_ioctl=jffs2_ioctl, .mmap = generic_file_readonly_mmap, diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 601afd1afddf..2caf1682036d 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -190,7 +190,7 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr) int jffs2_setattr(struct dentry *dentry, struct iattr *iattr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int rc; rc = inode_change_ok(inode, iattr); @@ -272,12 +272,9 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino) mutex_lock(&f->sem); ret = jffs2_do_read_inode(c, f, inode->i_ino, &latest_node); + if (ret) + goto error; - if (ret) { - mutex_unlock(&f->sem); - iget_failed(inode); - return ERR_PTR(ret); - } inode->i_mode = jemode_to_cpu(latest_node.mode); i_uid_write(inode, je16_to_cpu(latest_node.uid)); i_gid_write(inode, je16_to_cpu(latest_node.gid)); @@ -294,6 +291,7 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino) case S_IFLNK: inode->i_op = &jffs2_symlink_inode_operations; + inode->i_link = f->target; break; case S_IFDIR: diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index d200a9b8fd5e..824e61ede465 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h @@ -19,7 +19,7 @@ struct kstatfs; struct kvec; -#define JFFS2_INODE_INFO(i) (list_entry(i, struct jffs2_inode_info, vfs_inode)) +#define JFFS2_INODE_INFO(i) (container_of(i, struct jffs2_inode_info, vfs_inode)) #define OFNI_EDONI_2SFFJ(f) (&(f)->vfs_inode) #define JFFS2_SB_INFO(sb) (sb->s_fs_info) #define OFNI_BS_2SFFJ(c) ((struct super_block *)c->os_priv) diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index dddbde4f56f4..28e0aab42bc3 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -1203,17 +1203,13 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c, JFFS2_ERROR("failed to read from flash: error %d, %zd of %zd bytes read\n", ret, retlen, sizeof(*latest_node)); /* FIXME: If this fails, there seems to be a memory leak. Find it. */ - mutex_unlock(&f->sem); - jffs2_do_clear_inode(c, f); - return ret?ret:-EIO; + return ret ? 
ret : -EIO; } crc = crc32(0, latest_node, sizeof(*latest_node)-8); if (crc != je32_to_cpu(latest_node->node_crc)) { JFFS2_ERROR("CRC failed for read_inode of inode %u at physical location 0x%x\n", f->inocache->ino, ref_offset(rii.latest_ref)); - mutex_unlock(&f->sem); - jffs2_do_clear_inode(c, f); return -EIO; } @@ -1250,16 +1246,11 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c, * keep in RAM to facilitate quick follow symlink * operation. */ uint32_t csize = je32_to_cpu(latest_node->csize); - if (csize > JFFS2_MAX_NAME_LEN) { - mutex_unlock(&f->sem); - jffs2_do_clear_inode(c, f); + if (csize > JFFS2_MAX_NAME_LEN) return -ENAMETOOLONG; - } f->target = kmalloc(csize + 1, GFP_KERNEL); if (!f->target) { JFFS2_ERROR("can't allocate %u bytes of memory for the symlink target path cache\n", csize); - mutex_unlock(&f->sem); - jffs2_do_clear_inode(c, f); return -ENOMEM; } @@ -1271,8 +1262,6 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c, ret = -EIO; kfree(f->target); f->target = NULL; - mutex_unlock(&f->sem); - jffs2_do_clear_inode(c, f); return ret; } @@ -1289,15 +1278,11 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c, if (f->metadata) { JFFS2_ERROR("Argh. Special inode #%u with mode 0%o had metadata node\n", f->inocache->ino, jemode_to_cpu(latest_node->mode)); - mutex_unlock(&f->sem); - jffs2_do_clear_inode(c, f); return -EIO; } if (!frag_first(&f->fragtree)) { JFFS2_ERROR("Argh. Special inode #%u with mode 0%o has no fragments\n", f->inocache->ino, jemode_to_cpu(latest_node->mode)); - mutex_unlock(&f->sem); - jffs2_do_clear_inode(c, f); return -EIO; } /* ASSERT: f->fraglist != NULL */ @@ -1305,8 +1290,6 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c, JFFS2_ERROR("Argh. Special inode #%u with mode 0x%x had more than one node\n", f->inocache->ino, jemode_to_cpu(latest_node->mode)); /* FIXME: Deal with it - check crc32, check for duplicate node, check times and discard the older one */ - mutex_unlock(&f->sem); - jffs2_do_clear_inode(c, f); return -EIO; } /* OK. 
We're happy */ @@ -1400,10 +1383,8 @@ int jffs2_do_crccheck_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *i f->inocache = ic; ret = jffs2_do_read_inode_internal(c, f, &n); - if (!ret) { - mutex_unlock(&f->sem); - jffs2_do_clear_inode(c, f); - } + mutex_unlock(&f->sem); + jffs2_do_clear_inode(c, f); jffs2_xattr_do_crccheck_inode(c, ic); kfree (f); return ret; diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c index aca97f35b292..d4b43fb7adb1 100644 --- a/fs/jffs2/security.c +++ b/fs/jffs2/security.c @@ -54,7 +54,7 @@ static int jffs2_security_getxattr(struct dentry *dentry, const char *name, if (!strcmp(name, "")) return -EINVAL; - return do_jffs2_getxattr(dentry->d_inode, JFFS2_XPREFIX_SECURITY, + return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_SECURITY, name, buffer, size); } @@ -64,7 +64,7 @@ static int jffs2_security_setxattr(struct dentry *dentry, const char *name, if (!strcmp(name, "")) return -EINVAL; - return do_jffs2_setxattr(dentry->d_inode, JFFS2_XPREFIX_SECURITY, + return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_SECURITY, name, buffer, size, flags); } diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 3d76f28a2ba9..d86c5e3176a1 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -140,14 +140,14 @@ static struct dentry *jffs2_get_parent(struct dentry *child) BUG_ON(!d_is_dir(child)); - f = JFFS2_INODE_INFO(child->d_inode); + f = JFFS2_INODE_INFO(d_inode(child)); pino = f->inocache->pino_nlink; JFFS2_DEBUG("Parent of directory ino #%u is #%u\n", f->inocache->ino, pino); - return d_obtain_alias(jffs2_iget(child->d_inode->i_sb, pino)); + return d_obtain_alias(jffs2_iget(d_inode(child)->i_sb, pino)); } static const struct export_operations jffs2_export_ops = { diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c index c7c77b0dfccd..8ce2f240125b 100644 --- a/fs/jffs2/symlink.c +++ b/fs/jffs2/symlink.c @@ -9,58 +9,15 @@ * */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/kernel.h> -#include <linux/fs.h> -#include <linux/namei.h> #include "nodelist.h" -static void *jffs2_follow_link(struct dentry *dentry, struct nameidata *nd); - const struct inode_operations jffs2_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = jffs2_follow_link, + .follow_link = simple_follow_link, .setattr = jffs2_setattr, .setxattr = jffs2_setxattr, .getxattr = jffs2_getxattr, .listxattr = jffs2_listxattr, .removexattr = jffs2_removexattr }; - -static void *jffs2_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - struct jffs2_inode_info *f = JFFS2_INODE_INFO(dentry->d_inode); - char *p = (char *)f->target; - - /* - * We don't acquire the f->sem mutex here since the only data we - * use is f->target. - * - * 1. If we are here the inode has already built and f->target has - * to point to the target path. - * 2. Nobody uses f->target (if the inode is symlink's inode). The - * exception is inode freeing function which frees f->target. But - * it can't be called while we are here and before VFS has - * stopped using our f->target string which we provide by means of - * nd_set_link() call. - */ - - if (!p) { - pr_err("%s(): can't find symlink target\n", __func__); - p = ERR_PTR(-EIO); - } - jffs2_dbg(1, "%s(): target path is '%s'\n", - __func__, (char *)f->target); - - nd_set_link(nd, p); - - /* - * We will unlock the f->sem mutex but VFS will use the f->target string. This is safe - * since the only way that may cause f->target to be changed is iput() operation. - * But VFS will not use f->target after iput() has been called. 
- */ - return NULL; -} - diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c index d72817ac51f6..f092fee5be50 100644 --- a/fs/jffs2/xattr.c +++ b/fs/jffs2/xattr.c @@ -195,7 +195,7 @@ static int do_verify_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_dat /* unchecked xdatum is chained with c->xattr_unchecked */ list_del_init(&xd->xindex); - dbg_xattr("success on verfying xdatum (xid=%u, version=%u)\n", + dbg_xattr("success on verifying xdatum (xid=%u, version=%u)\n", xd->xid, xd->version); return 0; @@ -960,7 +960,7 @@ static const struct xattr_handler *xprefix_to_handler(int xprefix) { ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); struct jffs2_inode_cache *ic = f->inocache; @@ -1266,7 +1266,6 @@ int jffs2_garbage_collect_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ if (rc) { JFFS2_WARNING("%s: jffs2_reserve_space_gc() = %d, request = %u\n", __func__, rc, totlen); - rc = rc ? rc : -EBADFD; goto out; } rc = save_xattr_ref(c, ref); diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c index 1c868194c504..ceaf9c693225 100644 --- a/fs/jffs2/xattr_trusted.c +++ b/fs/jffs2/xattr_trusted.c @@ -21,7 +21,7 @@ static int jffs2_trusted_getxattr(struct dentry *dentry, const char *name, { if (!strcmp(name, "")) return -EINVAL; - return do_jffs2_getxattr(dentry->d_inode, JFFS2_XPREFIX_TRUSTED, + return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_TRUSTED, name, buffer, size); } @@ -30,7 +30,7 @@ static int jffs2_trusted_setxattr(struct dentry *dentry, const char *name, { if (!strcmp(name, "")) return -EINVAL; - return do_jffs2_setxattr(dentry->d_inode, JFFS2_XPREFIX_TRUSTED, + return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_TRUSTED, name, buffer, size, flags); } diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c index 916b5c966039..a71391eba514 100644 --- a/fs/jffs2/xattr_user.c +++ b/fs/jffs2/xattr_user.c @@ -21,7 +21,7 @@ static int jffs2_user_getxattr(struct dentry *dentry, const char *name, { if (!strcmp(name, "")) return -EINVAL; - return do_jffs2_getxattr(dentry->d_inode, JFFS2_XPREFIX_USER, + return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_USER, name, buffer, size); } @@ -30,7 +30,7 @@ static int jffs2_user_setxattr(struct dentry *dentry, const char *name, { if (!strcmp(name, "")) return -EINVAL; - return do_jffs2_setxattr(dentry->d_inode, JFFS2_XPREFIX_USER, + return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_USER, name, buffer, size, flags); } diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 10815f8dfd8b..b9dc23cd04f2 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -76,7 +76,7 @@ static int jfs_open(struct inode *inode, struct file *file) if (ji->active_ag == -1) { struct jfs_sb_info *jfs_sb = JFS_SBI(inode->i_sb); ji->active_ag = BLKTOAG(addressPXD(&ji->ixpxd), jfs_sb); - atomic_inc( &jfs_sb->bmap->db_active[ji->active_ag]); + atomic_inc(&jfs_sb->bmap->db_active[ji->active_ag]); } spin_unlock_irq(&ji->ag_lock); } @@ -100,7 +100,7 @@ static int jfs_release(struct inode *inode, struct file *file) int jfs_setattr(struct dentry *dentry, struct iattr *iattr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int rc; rc = inode_change_ok(inode, iattr); @@ -151,8 +151,6 @@ const struct inode_operations jfs_file_inode_operations = { const struct file_operations jfs_file_operations = { .open 
= jfs_open, .llseek = generic_file_llseek, - .write = new_sync_write, - .read = new_sync_read, .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index bd3df1ca3c9b..41aa3ca6a6a4 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -22,8 +22,8 @@ #include <linux/buffer_head.h> #include <linux/pagemap.h> #include <linux/quotaops.h> +#include <linux/uio.h> #include <linux/writeback.h> -#include <linux/aio.h> #include "jfs_incore.h" #include "jfs_inode.h" #include "jfs_filsys.h" @@ -63,11 +63,12 @@ struct inode *jfs_iget(struct super_block *sb, unsigned long ino) inode->i_mapping->a_ops = &jfs_aops; } else { inode->i_op = &jfs_fast_symlink_inode_operations; + inode->i_link = JFS_IP(inode)->i_inline; /* * The inline data should be null-terminated, but * don't let on-disk corruption crash the kernel */ - JFS_IP(inode)->i_inline[inode->i_size] = '\0'; + inode->i_link[inode->i_size] = '\0'; } } else { inode->i_op = &jfs_file_inode_operations; @@ -133,11 +134,11 @@ int jfs_write_inode(struct inode *inode, struct writeback_control *wbc) * It has been committed since the last change, but was still * on the dirty inode list. */ - if (!test_cflag(COMMIT_Dirty, inode)) { + if (!test_cflag(COMMIT_Dirty, inode)) { /* Make sure committed changes hit the disk */ jfs_flush_journal(JFS_SBI(inode->i_sb)->log, wait); return 0; - } + } if (jfs_commit_inode(inode, wait)) { jfs_err("jfs_write_inode: jfs_commit_inode failed!"); @@ -330,8 +331,8 @@ static sector_t jfs_bmap(struct address_space *mapping, sector_t block) return generic_block_bmap(mapping, block, jfs_get_block); } -static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb, - struct iov_iter *iter, loff_t offset) +static ssize_t jfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, + loff_t offset) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -339,13 +340,13 @@ static ssize_t jfs_direct_IO(int rw, struct kiocb *iocb, size_t count = iov_iter_count(iter); ssize_t ret; - ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, jfs_get_block); + ret = blockdev_direct_IO(iocb, inode, iter, offset, jfs_get_block); /* * In case of error extending write may have instantiated a few * blocks outside i_size. Trim these off again. 
*/ - if (unlikely((rw & WRITE) && ret < 0)) { + if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) { loff_t isize = i_size_read(inode); loff_t end = offset + count; diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c index 93a1232894f6..8db8b7d61e40 100644 --- a/fs/jfs/ioctl.c +++ b/fs/jfs/ioctl.c @@ -180,9 +180,6 @@ long jfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case JFS_IOC_SETFLAGS32: cmd = JFS_IOC_SETFLAGS; break; - case FITRIM: - cmd = FITRIM; - break; } return jfs_ioctl(filp, cmd, arg); } diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h index fa7e795bd8ae..1f26d1910409 100644 --- a/fs/jfs/jfs_incore.h +++ b/fs/jfs/jfs_incore.h @@ -206,7 +206,7 @@ struct jfs_sb_info { static inline struct jfs_inode_info *JFS_IP(struct inode *inode) { - return list_entry(inode, struct jfs_inode_info, vfs_inode); + return container_of(inode, struct jfs_inode_info, vfs_inode); } static inline int jfs_dirtable_inline(struct inode *inode) diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index 49ba7ff1bbb9..16a0922beb59 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -183,30 +183,23 @@ static inline void remove_metapage(struct page *page, struct metapage *mp) #endif -static void init_once(void *foo) -{ - struct metapage *mp = (struct metapage *)foo; - - mp->lid = 0; - mp->lsn = 0; - mp->flag = 0; - mp->data = NULL; - mp->clsn = 0; - mp->log = NULL; - set_bit(META_free, &mp->flag); - init_waitqueue_head(&mp->wait); -} - static inline struct metapage *alloc_metapage(gfp_t gfp_mask) { - return mempool_alloc(metapage_mempool, gfp_mask); + struct metapage *mp = mempool_alloc(metapage_mempool, gfp_mask); + + if (mp) { + mp->lid = 0; + mp->lsn = 0; + mp->data = NULL; + mp->clsn = 0; + mp->log = NULL; + init_waitqueue_head(&mp->wait); + } + return mp; } static inline void free_metapage(struct metapage *mp) { - mp->flag = 0; - set_bit(META_free, &mp->flag); - mempool_free(mp, metapage_mempool); } @@ -216,7 +209,7 @@ int __init metapage_init(void) * Allocate the metapage structures */ metapage_cache = kmem_cache_create("jfs_mp", sizeof(struct metapage), - 0, 0, init_once); + 0, 0, NULL); if (metapage_cache == NULL) return -ENOMEM; diff --git a/fs/jfs/jfs_metapage.h b/fs/jfs/jfs_metapage.h index a78beda85f68..337e9e51ac06 100644 --- a/fs/jfs/jfs_metapage.h +++ b/fs/jfs/jfs_metapage.h @@ -48,7 +48,6 @@ struct metapage { /* metapage flag */ #define META_locked 0 -#define META_free 1 #define META_dirty 2 #define META_sync 3 #define META_discard 4 diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 38fdc533f4ec..a5ac97b9a933 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -346,7 +346,7 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry) { int rc; tid_t tid; /* transaction id */ - struct inode *ip = dentry->d_inode; + struct inode *ip = d_inode(dentry); ino_t ino; struct component_name dname; struct inode *iplist[2]; @@ -472,7 +472,7 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry) { int rc; tid_t tid; /* transaction id */ - struct inode *ip = dentry->d_inode; + struct inode *ip = d_inode(dentry); ino_t ino; struct component_name dname; /* object name */ struct inode *iplist[2]; @@ -791,7 +791,7 @@ static int jfs_link(struct dentry *old_dentry, { int rc; tid_t tid; - struct inode *ip = old_dentry->d_inode; + struct inode *ip = d_inode(old_dentry); ino_t ino; struct component_name dname; struct btstack btstack; @@ -879,8 +879,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, struct component_name dname; int 
ssize; /* source pathname size */ struct btstack btstack; - struct inode *ip = dentry->d_inode; - unchar *i_fastsymlink; + struct inode *ip = d_inode(dentry); s64 xlen = 0; int bmask = 0, xsize; s64 xaddr; @@ -946,8 +945,8 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, if (ssize <= IDATASIZE) { ip->i_op = &jfs_fast_symlink_inode_operations; - i_fastsymlink = JFS_IP(ip)->i_inline; - memcpy(i_fastsymlink, name, ssize); + ip->i_link = JFS_IP(ip)->i_inline; + memcpy(ip->i_link, name, ssize); ip->i_size = ssize - 1; /* @@ -1086,8 +1085,8 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, dquot_initialize(old_dir); dquot_initialize(new_dir); - old_ip = old_dentry->d_inode; - new_ip = new_dentry->d_inode; + old_ip = d_inode(old_dentry); + new_ip = d_inode(new_dentry); if ((rc = get_UCSname(&old_dname, old_dentry))) goto out1; @@ -1161,7 +1160,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, rc = dtModify(tid, new_dir, &new_dname, &ino, old_ip->i_ino, JFS_RENAME); if (rc) - goto out4; + goto out_tx; drop_nlink(new_ip); if (S_ISDIR(new_ip->i_mode)) { drop_nlink(new_ip); @@ -1186,7 +1185,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, if ((new_size = commitZeroLink(tid, new_ip)) < 0) { txAbort(tid, 1); /* Marks FS Dirty */ rc = new_size; - goto out4; + goto out_tx; } tblk = tid_to_tblock(tid); tblk->xflag |= COMMIT_DELETE; @@ -1204,7 +1203,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (rc) { jfs_err("jfs_rename didn't expect dtSearch to fail " "w/rc = %d", rc); - goto out4; + goto out_tx; } ino = old_ip->i_ino; @@ -1212,7 +1211,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (rc) { if (rc == -EIO) jfs_err("jfs_rename: dtInsert returned -EIO"); - goto out4; + goto out_tx; } if (S_ISDIR(old_ip->i_mode)) inc_nlink(new_dir); @@ -1227,7 +1226,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, jfs_err("jfs_rename did not expect dtDelete to return rc = %d", rc); txAbort(tid, 1); /* Marks Filesystem dirty */ - goto out4; + goto out_tx; } if (S_ISDIR(old_ip->i_mode)) { drop_nlink(old_dir); @@ -1286,7 +1285,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, rc = txCommit(tid, ipcount, iplist, commit_flag); - out4: + out_tx: txEnd(tid); if (new_ip) mutex_unlock(&JFS_IP(new_ip)->commit_mutex); @@ -1309,13 +1308,6 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, } if (new_ip && (new_ip->i_nlink == 0)) set_cflag(COMMIT_Nolink, new_ip); - out3: - free_UCSname(&new_dname); - out2: - free_UCSname(&old_dname); - out1: - if (new_ip && !S_ISDIR(new_ip->i_mode)) - IWRITE_UNLOCK(new_ip); /* * Truncating the directory index table is not guaranteed. 
It * may need to be done iteratively @@ -1326,7 +1318,13 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, clear_cflag(COMMIT_Stale, old_dir); } - + if (new_ip && !S_ISDIR(new_ip->i_mode)) + IWRITE_UNLOCK(new_ip); + out3: + free_UCSname(&new_dname); + out2: + free_UCSname(&old_dname); + out1: jfs_info("jfs_rename: returning %d", rc); return rc; } @@ -1500,9 +1498,9 @@ struct dentry *jfs_get_parent(struct dentry *dentry) unsigned long parent_ino; parent_ino = - le32_to_cpu(JFS_IP(dentry->d_inode)->i_dtroot.header.idotdot); + le32_to_cpu(JFS_IP(d_inode(dentry))->i_dtroot.header.idotdot); - return d_obtain_alias(jfs_iget(dentry->d_inode->i_sb, parent_ino)); + return d_obtain_alias(jfs_iget(d_inode(dentry)->i_sb, parent_ino)); } const struct inode_operations jfs_dir_inode_operations = { @@ -1578,7 +1576,7 @@ static int jfs_ci_revalidate(struct dentry *dentry, unsigned int flags) * positive dentry isn't good idea. So it's unsupported like * rename("filename", "FILENAME") for now. */ - if (dentry->d_inode) + if (d_really_is_positive(dentry)) return 1; /* diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 5d30c56ae075..4cd9798f4948 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -102,7 +102,7 @@ void jfs_error(struct super_block *sb, const char *fmt, ...) vaf.fmt = fmt; vaf.va = &args; - pr_err("ERROR: (device %s): %pf: %pV\n", + pr_err("ERROR: (device %s): %ps: %pV\n", sb->s_id, __builtin_return_address(0), &vaf); va_end(args); diff --git a/fs/jfs/symlink.c b/fs/jfs/symlink.c index 205b946d8e0d..5929e2363cb8 100644 --- a/fs/jfs/symlink.c +++ b/fs/jfs/symlink.c @@ -17,21 +17,13 @@ */ #include <linux/fs.h> -#include <linux/namei.h> #include "jfs_incore.h" #include "jfs_inode.h" #include "jfs_xattr.h" -static void *jfs_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - char *s = JFS_IP(dentry->d_inode)->i_inline; - nd_set_link(nd, s); - return NULL; -} - const struct inode_operations jfs_fast_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = jfs_follow_link, + .follow_link = simple_follow_link, .setattr = jfs_setattr, .setxattr = jfs_setxattr, .getxattr = jfs_getxattr, diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index 46325d5c34fc..48b15a6e5558 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -849,7 +849,7 @@ int __jfs_setxattr(tid_t tid, struct inode *inode, const char *name, int jfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t value_len, int flags) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct jfs_inode_info *ji = JFS_IP(inode); int rc; tid_t tid; @@ -872,7 +872,7 @@ int jfs_setxattr(struct dentry *dentry, const char *name, const void *value, tid = txBegin(inode->i_sb, 0); mutex_lock(&ji->commit_mutex); - rc = __jfs_setxattr(tid, dentry->d_inode, name, value, value_len, + rc = __jfs_setxattr(tid, d_inode(dentry), name, value, value_len, flags); if (!rc) rc = txCommit(tid, 1, &inode, 0); @@ -959,7 +959,7 @@ ssize_t jfs_getxattr(struct dentry *dentry, const char *name, void *data, return -EOPNOTSUPP; } - err = __jfs_getxattr(dentry->d_inode, name, data, buf_size); + err = __jfs_getxattr(d_inode(dentry), name, data, buf_size); return err; } @@ -976,7 +976,7 @@ static inline int can_list(struct jfs_ea *ea) ssize_t jfs_listxattr(struct dentry * dentry, char *data, size_t buf_size) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); char *buffer; ssize_t size = 0; int xattr_size; @@ -1029,7 +1029,7 @@ ssize_t 
jfs_listxattr(struct dentry * dentry, char *data, size_t buf_size) int jfs_removexattr(struct dentry *dentry, const char *name) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct jfs_inode_info *ji = JFS_IP(inode); int rc; tid_t tid; @@ -1047,7 +1047,7 @@ int jfs_removexattr(struct dentry *dentry, const char *name) tid = txBegin(inode->i_sb, 0); mutex_lock(&ji->commit_mutex); - rc = __jfs_setxattr(tid, dentry->d_inode, name, NULL, 0, XATTR_REPLACE); + rc = __jfs_setxattr(tid, d_inode(dentry), name, NULL, 0, XATTR_REPLACE); if (!rc) rc = txCommit(tid, 1, &inode, 0); txEnd(tid); diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 6acc9648f986..2d48d28e1640 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -444,7 +444,7 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) return -ECHILD; /* Always perform fresh lookup for negatives */ - if (!dentry->d_inode) + if (d_really_is_negative(dentry)) goto out_bad_unlocked; kn = dentry->d_fsdata; @@ -518,7 +518,14 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, if (!kn) goto err_out1; - ret = ida_simple_get(&root->ino_ida, 1, 0, GFP_KERNEL); + /* + * If the ino of the sysfs entry created for a kmem cache gets + * allocated from an ida layer, which is accounted to the memcg that + * owns the cache, the memcg will get pinned forever. So do not account + * ino ida allocations. + */ + ret = ida_simple_get(&root->ino_ida, 1, 0, + GFP_KERNEL | __GFP_NOACCOUNT); if (ret < 0) goto err_out2; kn->ino = ret; @@ -585,6 +592,9 @@ int kernfs_add_one(struct kernfs_node *kn) goto out_unlock; ret = -ENOENT; + if (parent->flags & KERNFS_EMPTY_DIR) + goto out_unlock; + if ((parent->flags & KERNFS_ACTIVATED) && !kernfs_active(parent)) goto out_unlock; @@ -776,6 +786,38 @@ struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, return ERR_PTR(rc); } +/** + * kernfs_create_empty_dir - create an always empty directory + * @parent: parent in which to create a new directory + * @name: name of the new directory + * + * Returns the created node on success, ERR_PTR() value on failure. 
+ */ +struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent, + const char *name) +{ + struct kernfs_node *kn; + int rc; + + /* allocate */ + kn = kernfs_new_node(parent, name, S_IRUGO|S_IXUGO|S_IFDIR, KERNFS_DIR); + if (!kn) + return ERR_PTR(-ENOMEM); + + kn->flags |= KERNFS_EMPTY_DIR; + kn->dir.root = parent->dir.root; + kn->ns = NULL; + kn->priv = NULL; + + /* link in */ + rc = kernfs_add_one(kn); + if (!rc) + return kn; + + kernfs_put(kn); + return ERR_PTR(rc); +} + static struct dentry *kernfs_iop_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) @@ -1247,7 +1289,8 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, mutex_lock(&kernfs_mutex); error = -ENOENT; - if (!kernfs_active(kn) || !kernfs_active(new_parent)) + if (!kernfs_active(kn) || !kernfs_active(new_parent) || + (new_parent->flags & KERNFS_EMPTY_DIR)) goto out; error = 0; diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 2bacb9988566..7247252ee9b1 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -785,7 +785,6 @@ static unsigned int kernfs_fop_poll(struct file *filp, poll_table *wait) struct kernfs_node *kn = filp->f_path.dentry->d_fsdata; struct kernfs_open_node *on = kn->attr.open; - /* need parent for the kobj, grab both */ if (!kernfs_get_active(kn)) goto trigger; diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index 9000874a945b..756dd56aaf60 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -111,7 +111,7 @@ int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct kernfs_node *kn = dentry->d_fsdata; int error; @@ -172,11 +172,11 @@ int kernfs_iop_setxattr(struct dentry *dentry, const char *name, if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; - error = security_inode_setsecurity(dentry->d_inode, suffix, + error = security_inode_setsecurity(d_inode(dentry), suffix, value, size, flags); if (error) return error; - error = security_inode_getsecctx(dentry->d_inode, + error = security_inode_getsecctx(d_inode(dentry), &secdata, &secdata_len); if (error) return error; @@ -271,7 +271,7 @@ int kernfs_iop_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct kernfs_node *kn = dentry->d_fsdata; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); mutex_lock(&kernfs_mutex); kernfs_refresh_inode(kn, inode); @@ -296,6 +296,8 @@ static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode) case KERNFS_DIR: inode->i_op = &kernfs_dir_iops; inode->i_fop = &kernfs_dir_fops; + if (kn->flags & KERNFS_EMPTY_DIR) + make_empty_dir_inode(inode); break; case KERNFS_FILE: inode->i_size = kn->attr.size; diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index af9fa7499919..6762bfbd8207 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -76,7 +76,6 @@ extern struct kmem_cache *kernfs_node_cache; /* * inode.c */ -struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn); void kernfs_evict_inode(struct inode *inode); int kernfs_iop_permission(struct inode *inode, int mask); int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr); diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c index 8a198898e39a..db272528ab5b 100644 --- a/fs/kernfs/symlink.c +++ b/fs/kernfs/symlink.c 
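
The hunk below is one instance of the tree-wide ->follow_link() API change running through this series: instead of stashing the target string via nd_set_link(), a symlink's ->follow_link() now returns the link body directly and records any allocation in *cookie for ->put_link() to free. A minimal sketch of the new contract, using hypothetical example_* names that are not part of the patch:

	#include <linux/fs.h>
	#include <linux/slab.h>

	static const char *example_follow_link(struct dentry *dentry, void **cookie)
	{
		char *body = kstrdup("some-target", GFP_KERNEL);	/* stand-in source */

		if (!body)
			return ERR_PTR(-ENOMEM);
		return *cookie = body;	/* saved so ->put_link() can free it */
	}

	static void example_put_link(struct inode *unused, void *cookie)
	{
		kfree(cookie);		/* what the generic kfree_put_link() above does */
	}

Filesystems whose link body already lives in the inode (jffs2 and jfs above) can instead set inode->i_link and point ->follow_link at simple_follow_link(), which allocates nothing and so needs no ->put_link() at all.
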
@@ -112,25 +112,18 @@ static int kernfs_getlink(struct dentry *dentry, char *path) return error; } -static void *kernfs_iop_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *kernfs_iop_follow_link(struct dentry *dentry, void **cookie) { int error = -ENOMEM; unsigned long page = get_zeroed_page(GFP_KERNEL); - if (page) { - error = kernfs_getlink(dentry, (char *) page); - if (error < 0) - free_page((unsigned long)page); - } - nd_set_link(nd, error ? ERR_PTR(error) : (char *)page); - return NULL; -} - -static void kernfs_iop_put_link(struct dentry *dentry, struct nameidata *nd, - void *cookie) -{ - char *page = nd_get_link(nd); - if (!IS_ERR(page)) + if (!page) + return ERR_PTR(-ENOMEM); + error = kernfs_getlink(dentry, (char *)page); + if (unlikely(error < 0)) { free_page((unsigned long)page); + return ERR_PTR(error); + } + return *cookie = (char *)page; } const struct inode_operations kernfs_symlink_iops = { @@ -140,7 +133,7 @@ const struct inode_operations kernfs_symlink_iops = { .listxattr = kernfs_iop_listxattr, .readlink = generic_readlink, .follow_link = kernfs_iop_follow_link, - .put_link = kernfs_iop_put_link, + .put_link = free_page_put_link, .setattr = kernfs_iop_setattr, .getattr = kernfs_iop_getattr, .permission = kernfs_iop_permission, diff --git a/fs/libfs.c b/fs/libfs.c index 0ab65122ee45..102edfd39000 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -20,15 +20,10 @@ #include "internal.h" -static inline int simple_positive(struct dentry *dentry) -{ - return dentry->d_inode && !d_unhashed(dentry); -} - int simple_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); generic_fillattr(inode, stat); stat->blocks = inode->i_mapping->nrpages << (PAGE_CACHE_SHIFT - 9); return 0; @@ -94,7 +89,7 @@ EXPORT_SYMBOL(dcache_dir_close); loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) { struct dentry *dentry = file->f_path.dentry; - mutex_lock(&dentry->d_inode->i_mutex); + mutex_lock(&d_inode(dentry)->i_mutex); switch (whence) { case 1: offset += file->f_pos; @@ -102,7 +97,7 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) if (offset >= 0) break; default: - mutex_unlock(&dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(dentry)->i_mutex); return -EINVAL; } if (offset != file->f_pos) { @@ -129,7 +124,7 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) spin_unlock(&dentry->d_lock); } } - mutex_unlock(&dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(dentry)->i_mutex); return offset; } EXPORT_SYMBOL(dcache_dir_lseek); @@ -169,7 +164,7 @@ int dcache_readdir(struct file *file, struct dir_context *ctx) spin_unlock(&next->d_lock); spin_unlock(&dentry->d_lock); if (!dir_emit(ctx, next->d_name.name, next->d_name.len, - next->d_inode->i_ino, dt_type(next->d_inode))) + d_inode(next)->i_ino, dt_type(d_inode(next)))) return 0; spin_lock(&dentry->d_lock); spin_lock_nested(&next->d_lock, DENTRY_D_LOCK_NESTED); @@ -270,7 +265,7 @@ EXPORT_SYMBOL(simple_open); int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; inc_nlink(inode); @@ -304,7 +299,7 @@ EXPORT_SYMBOL(simple_empty); int simple_unlink(struct inode *dir, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); inode->i_ctime = 
dir->i_ctime = dir->i_mtime = CURRENT_TIME; drop_nlink(inode); @@ -318,7 +313,7 @@ int simple_rmdir(struct inode *dir, struct dentry *dentry) if (!simple_empty(dentry)) return -ENOTEMPTY; - drop_nlink(dentry->d_inode); + drop_nlink(d_inode(dentry)); simple_unlink(dir, dentry); drop_nlink(dir); return 0; @@ -328,16 +323,16 @@ EXPORT_SYMBOL(simple_rmdir); int simple_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); int they_are_dirs = d_is_dir(old_dentry); if (!simple_empty(new_dentry)) return -ENOTEMPTY; - if (new_dentry->d_inode) { + if (d_really_is_positive(new_dentry)) { simple_unlink(new_dir, new_dentry); if (they_are_dirs) { - drop_nlink(new_dentry->d_inode); + drop_nlink(d_inode(new_dentry)); drop_nlink(old_dir); } } else if (they_are_dirs) { @@ -368,7 +363,7 @@ EXPORT_SYMBOL(simple_rename); */ int simple_setattr(struct dentry *dentry, struct iattr *iattr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int error; error = inode_change_ok(inode, iattr); @@ -1024,15 +1019,18 @@ int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync) } EXPORT_SYMBOL(noop_fsync); -void kfree_put_link(struct dentry *dentry, struct nameidata *nd, - void *cookie) +void kfree_put_link(struct inode *unused, void *cookie) { - char *s = nd_get_link(nd); - if (!IS_ERR(s)) - kfree(s); + kfree(cookie); } EXPORT_SYMBOL(kfree_put_link); +void free_page_put_link(struct inode *unused, void *cookie) +{ + free_page((unsigned long) cookie); +} +EXPORT_SYMBOL(free_page_put_link); + /* * nop .set_page_dirty method so that people can use .page_mkwrite on * anon inodes. @@ -1093,3 +1091,110 @@ simple_nosetlease(struct file *filp, long arg, struct file_lock **flp, return -EINVAL; } EXPORT_SYMBOL(simple_nosetlease); + +const char *simple_follow_link(struct dentry *dentry, void **cookie) +{ + return d_inode(dentry)->i_link; +} +EXPORT_SYMBOL(simple_follow_link); + +const struct inode_operations simple_symlink_inode_operations = { + .follow_link = simple_follow_link, + .readlink = generic_readlink +}; +EXPORT_SYMBOL(simple_symlink_inode_operations); + +/* + * Operations for a permanently empty directory. 
+ */ +static struct dentry *empty_dir_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) +{ + return ERR_PTR(-ENOENT); +} + +static int empty_dir_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct inode *inode = d_inode(dentry); + generic_fillattr(inode, stat); + return 0; +} + +static int empty_dir_setattr(struct dentry *dentry, struct iattr *attr) +{ + return -EPERM; +} + +static int empty_dir_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + return -EOPNOTSUPP; +} + +static ssize_t empty_dir_getxattr(struct dentry *dentry, const char *name, + void *value, size_t size) +{ + return -EOPNOTSUPP; +} + +static int empty_dir_removexattr(struct dentry *dentry, const char *name) +{ + return -EOPNOTSUPP; +} + +static ssize_t empty_dir_listxattr(struct dentry *dentry, char *list, size_t size) +{ + return -EOPNOTSUPP; +} + +static const struct inode_operations empty_dir_inode_operations = { + .lookup = empty_dir_lookup, + .permission = generic_permission, + .setattr = empty_dir_setattr, + .getattr = empty_dir_getattr, + .setxattr = empty_dir_setxattr, + .getxattr = empty_dir_getxattr, + .removexattr = empty_dir_removexattr, + .listxattr = empty_dir_listxattr, +}; + +static loff_t empty_dir_llseek(struct file *file, loff_t offset, int whence) +{ + /* An empty directory has two entries . and .. at offsets 0 and 1 */ + return generic_file_llseek_size(file, offset, whence, 2, 2); +} + +static int empty_dir_readdir(struct file *file, struct dir_context *ctx) +{ + dir_emit_dots(file, ctx); + return 0; +} + +static const struct file_operations empty_dir_operations = { + .llseek = empty_dir_llseek, + .read = generic_read_dir, + .iterate = empty_dir_readdir, + .fsync = noop_fsync, +}; + + +void make_empty_dir_inode(struct inode *inode) +{ + set_nlink(inode, 2); + inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + inode->i_rdev = 0; + inode->i_size = 2; + inode->i_blkbits = PAGE_SHIFT; + inode->i_blocks = 0; + + inode->i_op = &empty_dir_inode_operations; + inode->i_fop = &empty_dir_operations; +} + +bool is_empty_dir_inode(struct inode *inode) +{ + return (inode->i_fop == &empty_dir_operations) && + (inode->i_op == &empty_dir_inode_operations); +} diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 665ef5a05183..a563ddbc19e6 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -31,7 +31,7 @@ static struct hlist_head nlm_files[FILE_NRHASH]; static DEFINE_MUTEX(nlm_file_mutex); -#ifdef NFSD_DEBUG +#ifdef CONFIG_SUNRPC_DEBUG static inline void nlm_debug_print_fh(char *msg, struct nfs_fh *f) { u32 *fhp = (u32*)f->data; diff --git a/fs/locks.c b/fs/locks.c index 40bc384728c0..d3d558ba4da7 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -203,11 +203,11 @@ static struct kmem_cache *flctx_cache __read_mostly; static struct kmem_cache *filelock_cache __read_mostly; static struct file_lock_context * -locks_get_lock_context(struct inode *inode) +locks_get_lock_context(struct inode *inode, int type) { struct file_lock_context *new; - if (likely(inode->i_flctx)) + if (likely(inode->i_flctx) || type == F_UNLCK) goto out; new = kmem_cache_alloc(flctx_cache, GFP_KERNEL); @@ -223,14 +223,7 @@ locks_get_lock_context(struct inode *inode) * Assign the pointer if it's not already assigned. If it is, then * free the context we just allocated. 
*/ - spin_lock(&inode->i_lock); - if (likely(!inode->i_flctx)) { - inode->i_flctx = new; - new = NULL; - } - spin_unlock(&inode->i_lock); - - if (new) + if (cmpxchg(&inode->i_flctx, NULL, new)) kmem_cache_free(flctx_cache, new); out: return inode->i_flctx; @@ -276,8 +269,10 @@ void locks_release_private(struct file_lock *fl) } if (fl->fl_lmops) { - if (fl->fl_lmops->lm_put_owner) - fl->fl_lmops->lm_put_owner(fl); + if (fl->fl_lmops->lm_put_owner) { + fl->fl_lmops->lm_put_owner(fl->fl_owner); + fl->fl_owner = NULL; + } fl->fl_lmops = NULL; } } @@ -333,7 +328,7 @@ void locks_copy_conflock(struct file_lock *new, struct file_lock *fl) if (fl->fl_lmops) { if (fl->fl_lmops->lm_get_owner) - fl->fl_lmops->lm_get_owner(new, fl); + fl->fl_lmops->lm_get_owner(fl->fl_owner); } } EXPORT_SYMBOL(locks_copy_conflock); @@ -592,11 +587,15 @@ posix_owner_key(struct file_lock *fl) static void locks_insert_global_blocked(struct file_lock *waiter) { + lockdep_assert_held(&blocked_lock_lock); + hash_add(blocked_hash, &waiter->fl_link, posix_owner_key(waiter)); } static void locks_delete_global_blocked(struct file_lock *waiter) { + lockdep_assert_held(&blocked_lock_lock); + hash_del(&waiter->fl_link); } @@ -730,7 +729,7 @@ static int posix_locks_conflict(struct file_lock *caller_fl, struct file_lock *s /* POSIX locks owned by the same process do not conflict with * each other. */ - if (!IS_POSIX(sys_fl) || posix_same_owner(caller_fl, sys_fl)) + if (posix_same_owner(caller_fl, sys_fl)) return (0); /* Check whether they overlap */ @@ -748,7 +747,7 @@ static int flock_locks_conflict(struct file_lock *caller_fl, struct file_lock *s /* FLOCK locks referring to the same filp do not conflict with * each other. */ - if (!IS_FLOCK(sys_fl) || (caller_fl->fl_file == sys_fl->fl_file)) + if (caller_fl->fl_file == sys_fl->fl_file) return (0); if ((caller_fl->fl_type & LOCK_MAND) || (sys_fl->fl_type & LOCK_MAND)) return 0; @@ -838,6 +837,8 @@ static int posix_locks_deadlock(struct file_lock *caller_fl, { int i = 0; + lockdep_assert_held(&blocked_lock_lock); + /* * This deadlock detector can't reasonably detect deadlocks with * FL_OFDLCK locks, since they aren't owned by a process, per-se. @@ -861,19 +862,21 @@ static int posix_locks_deadlock(struct file_lock *caller_fl, * whether or not a lock was successfully freed by testing the return * value for -ENOENT. */ -static int flock_lock_file(struct file *filp, struct file_lock *request) +static int flock_lock_inode(struct inode *inode, struct file_lock *request) { struct file_lock *new_fl = NULL; struct file_lock *fl; struct file_lock_context *ctx; - struct inode *inode = file_inode(filp); int error = 0; bool found = false; LIST_HEAD(dispose); - ctx = locks_get_lock_context(inode); - if (!ctx) - return -ENOMEM; + ctx = locks_get_lock_context(inode, request->fl_type); + if (!ctx) { + if (request->fl_type != F_UNLCK) + return -ENOMEM; + return (request->fl_flags & FL_EXISTS) ? 
-ENOENT : 0; + } if (!(request->fl_flags & FL_ACCESS) && (request->fl_type != F_UNLCK)) { new_fl = locks_alloc_lock(); @@ -886,7 +889,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request) goto find_conflict; list_for_each_entry(fl, &ctx->flc_flock, fl_list) { - if (filp != fl->fl_file) + if (request->fl_file != fl->fl_file) continue; if (request->fl_type == fl->fl_type) goto out; @@ -939,9 +942,9 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str bool added = false; LIST_HEAD(dispose); - ctx = locks_get_lock_context(inode); + ctx = locks_get_lock_context(inode, request->fl_type); if (!ctx) - return -ENOMEM; + return (request->fl_type == F_UNLCK) ? 0 : -ENOMEM; /* * We may need two file_lock structures for this operation, @@ -964,8 +967,6 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str */ if (request->fl_type != F_UNLCK) { list_for_each_entry(fl, &ctx->flc_posix, fl_list) { - if (!IS_POSIX(fl)) - continue; if (!posix_locks_conflict(request, fl)) continue; if (conflock) @@ -1162,20 +1163,19 @@ int posix_lock_file(struct file *filp, struct file_lock *fl, EXPORT_SYMBOL(posix_lock_file); /** - * posix_lock_file_wait - Apply a POSIX-style lock to a file - * @filp: The file to apply the lock to + * posix_lock_inode_wait - Apply a POSIX-style lock to a file + * @inode: inode of file to which lock request should be applied * @fl: The lock to be applied * - * Add a POSIX style lock to a file. - * We merge adjacent & overlapping locks whenever possible. - * POSIX locks are sorted by owner task, then by starting address + * Variant of posix_lock_file_wait that does not take a filp, and so can be + * used after the filp has already been torn down. */ -int posix_lock_file_wait(struct file *filp, struct file_lock *fl) +int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl) { int error; might_sleep (); for (;;) { - error = posix_lock_file(filp, fl, NULL); + error = __posix_lock_file(inode, fl, NULL); if (error != FILE_LOCK_DEFERRED) break; error = wait_event_interruptible(fl->fl_wait, !fl->fl_next); @@ -1187,7 +1187,7 @@ int posix_lock_file_wait(struct file *filp, struct file_lock *fl) } return error; } -EXPORT_SYMBOL(posix_lock_file_wait); +EXPORT_SYMBOL(posix_lock_inode_wait); /** * locks_mandatory_locked - Check for an active lock @@ -1605,7 +1605,8 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr lease = *flp; trace_generic_add_lease(inode, lease); - ctx = locks_get_lock_context(inode); + /* Note that arg is never F_UNLCK here */ + ctx = locks_get_lock_context(inode, arg); if (!ctx) return -ENOMEM; @@ -1848,18 +1849,18 @@ int fcntl_setlease(unsigned int fd, struct file *filp, long arg) } /** - * flock_lock_file_wait - Apply a FLOCK-style lock to a file - * @filp: The file to apply the lock to + * flock_lock_inode_wait - Apply a FLOCK-style lock to a file + * @inode: inode of the file to apply to * @fl: The lock to be applied * - * Add a FLOCK style lock to a file. + * Apply a FLOCK style lock request to an inode. 
*/ -int flock_lock_file_wait(struct file *filp, struct file_lock *fl) +int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl) { int error; might_sleep(); for (;;) { - error = flock_lock_file(filp, fl); + error = flock_lock_inode(inode, fl); if (error != FILE_LOCK_DEFERRED) break; error = wait_event_interruptible(fl->fl_wait, !fl->fl_next); @@ -1871,8 +1872,7 @@ int flock_lock_file_wait(struct file *filp, struct file_lock *fl) } return error; } - -EXPORT_SYMBOL(flock_lock_file_wait); +EXPORT_SYMBOL(flock_lock_inode_wait); /** * sys_flock: - flock() system call. @@ -2398,7 +2398,8 @@ locks_remove_flock(struct file *filp) .fl_type = F_UNLCK, .fl_end = OFFSET_MAX, }; - struct file_lock_context *flctx = file_inode(filp)->i_flctx; + struct inode *inode = file_inode(filp); + struct file_lock_context *flctx = inode->i_flctx; if (list_empty(&flctx->flc_flock)) return; @@ -2406,7 +2407,7 @@ locks_remove_flock(struct file *filp) if (filp->f_op->flock) filp->f_op->flock(filp, F_SETLKW, &fl); else - flock_lock_file(filp, &fl); + flock_lock_inode(inode, &fl); if (fl.fl_ops && fl.fl_ops->fl_release_private) fl.fl_ops->fl_release_private(&fl); @@ -2555,15 +2556,10 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl, : (fl->fl_type == F_WRLCK) ? "WRITE" : "READ "); } if (inode) { -#ifdef WE_CAN_BREAK_LSLK_NOW - seq_printf(f, "%d %s:%ld ", fl_pid, - inode->i_sb->s_id, inode->i_ino); -#else - /* userspace relies on this representation of dev_t ;-( */ + /* userspace relies on this representation of dev_t */ seq_printf(f, "%d %02x:%02x:%ld ", fl_pid, MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), inode->i_ino); -#endif } else { seq_printf(f, "%d <none>:0 ", fl_pid); } @@ -2592,6 +2588,44 @@ static int locks_show(struct seq_file *f, void *v) return 0; } +static void __show_fd_locks(struct seq_file *f, + struct list_head *head, int *id, + struct file *filp, struct files_struct *files) +{ + struct file_lock *fl; + + list_for_each_entry(fl, head, fl_list) { + + if (filp != fl->fl_file) + continue; + if (fl->fl_owner != files && + fl->fl_owner != filp) + continue; + + (*id)++; + seq_puts(f, "lock:\t"); + lock_get_status(f, fl, *id, ""); + } +} + +void show_fd_locks(struct seq_file *f, + struct file *filp, struct files_struct *files) +{ + struct inode *inode = file_inode(filp); + struct file_lock_context *ctx; + int id = 0; + + ctx = inode->i_flctx; + if (!ctx) + return; + + spin_lock(&ctx->flc_lock); + __show_fd_locks(f, &ctx->flc_flock, &id, filp, files); + __show_fd_locks(f, &ctx->flc_posix, &id, filp, files); + __show_fd_locks(f, &ctx->flc_lease, &id, filp, files); + spin_unlock(&ctx->flc_lock); +} + static void *locks_start(struct seq_file *f, loff_t *pos) __acquires(&blocked_lock_lock) { diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c index 6bdc347008f5..f9b45d46d4c4 100644 --- a/fs/logfs/dir.c +++ b/fs/logfs/dir.c @@ -213,7 +213,7 @@ static void abort_transaction(struct inode *inode, struct logfs_transaction *ta) static int logfs_unlink(struct inode *dir, struct dentry *dentry) { struct logfs_super *super = logfs_super(dir->i_sb); - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct logfs_transaction *ta; struct page *page; pgoff_t index; @@ -271,7 +271,7 @@ static inline int logfs_empty_dir(struct inode *dir) static int logfs_rmdir(struct inode *dir, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); if (!logfs_empty_dir(inode)) return -ENOTEMPTY; @@ -537,7 +537,7 @@ static int 
logfs_symlink(struct inode *dir, struct dentry *dentry, static int logfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; ihold(inode); @@ -607,7 +607,7 @@ static int logfs_rename_cross(struct inode *old_dir, struct dentry *old_dentry, /* 2. write target dd */ mutex_lock(&super->s_dirop_mutex); logfs_add_transaction(new_dir, ta); - err = logfs_write_dir(new_dir, new_dentry, old_dentry->d_inode); + err = logfs_write_dir(new_dir, new_dentry, d_inode(old_dentry)); if (!err) err = write_inode(new_dir); @@ -658,8 +658,8 @@ static int logfs_rename_target(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { struct logfs_super *super = logfs_super(old_dir->i_sb); - struct inode *old_inode = old_dentry->d_inode; - struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); int isdir = S_ISDIR(old_inode->i_mode); struct logfs_disk_dentry dd; struct logfs_transaction *ta; @@ -719,7 +719,7 @@ out: static int logfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { - if (new_dentry->d_inode) + if (d_really_is_positive(new_dentry)) return logfs_rename_target(old_dir, old_dentry, new_dir, new_dentry); return logfs_rename_cross(old_dir, old_dentry, new_dir, new_dentry); @@ -779,6 +779,7 @@ fail: const struct inode_operations logfs_symlink_iops = { .readlink = generic_readlink, .follow_link = page_follow_link_light, + .put_link = page_put_link, }; const struct inode_operations logfs_dir_iops = { diff --git a/fs/logfs/file.c b/fs/logfs/file.c index 8538752df2f6..1a6f0167b16a 100644 --- a/fs/logfs/file.c +++ b/fs/logfs/file.c @@ -241,7 +241,7 @@ int logfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) static int logfs_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int err = 0; err = inode_change_ok(inode, attr); @@ -271,8 +271,6 @@ const struct file_operations logfs_reg_fops = { .llseek = generic_file_llseek, .mmap = generic_file_readonly_mmap, .open = generic_file_open, - .read = new_sync_read, - .write = new_sync_write, }; const struct address_space_operations logfs_reg_aops = { diff --git a/fs/minix/dir.c b/fs/minix/dir.c index dfaf6fa9b7b5..d19ac258105a 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -45,11 +45,6 @@ minix_last_byte(struct inode *inode, unsigned long page_nr) return last_byte; } -static inline unsigned long dir_pages(struct inode *inode) -{ - return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; -} - static int dir_commit_chunk(struct page *page, loff_t pos, unsigned len) { struct address_space *mapping = page->mapping; @@ -156,7 +151,7 @@ minix_dirent *minix_find_entry(struct dentry *dentry, struct page **res_page) { const char * name = dentry->d_name.name; int namelen = dentry->d_name.len; - struct inode * dir = dentry->d_parent->d_inode; + struct inode * dir = d_inode(dentry->d_parent); struct super_block * sb = dir->i_sb; struct minix_sb_info * sbi = minix_sb(sb); unsigned long n; @@ -203,7 +198,7 @@ found: int minix_add_link(struct dentry *dentry, struct inode *inode) { - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); const char * name = dentry->d_name.name; 
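
Nearly all of the minix hunks here, like the jffs2, jfs and logfs ones above, are the same mechanical conversion: open-coded dentry->d_inode dereferences become the d_inode() accessor, and positivity tests become d_really_is_positive()/d_really_is_negative(). For reference, these helpers are thin wrappers; as defined in include/linux/dcache.h at this point they amount to:

	static inline struct inode *d_inode(const struct dentry *dentry)
	{
		return dentry->d_inode;
	}

	static inline bool d_really_is_positive(const struct dentry *dentry)
	{
		return dentry->d_inode != NULL;
	}

The conversion changes no behaviour; the point is that once every reader goes through one accessor, any later change to how a dentry's inode is stored or checked can be made in one place.
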
int namelen = dentry->d_name.len; struct super_block * sb = dir->i_sb; diff --git a/fs/minix/file.c b/fs/minix/file.c index a967de085ac0..94f0eb9a6e2c 100644 --- a/fs/minix/file.c +++ b/fs/minix/file.c @@ -14,9 +14,7 @@ */ const struct file_operations minix_file_operations = { .llseek = generic_file_llseek, - .read = new_sync_read, .read_iter = generic_file_read_iter, - .write = new_sync_write, .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .fsync = generic_file_fsync, @@ -25,7 +23,7 @@ const struct file_operations minix_file_operations = { static int minix_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int error; error = inode_change_ok(inode, attr); diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 3f57af196a7d..086cd0a61e80 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -62,7 +62,7 @@ static struct kmem_cache * minix_inode_cachep; static struct inode *minix_alloc_inode(struct super_block *sb) { struct minix_inode_info *ei; - ei = (struct minix_inode_info *)kmem_cache_alloc(minix_inode_cachep, GFP_KERNEL); + ei = kmem_cache_alloc(minix_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; @@ -626,8 +626,8 @@ static int minix_write_inode(struct inode *inode, struct writeback_control *wbc) int minix_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct super_block *sb = dentry->d_sb; - generic_fillattr(dentry->d_inode, stat); - if (INODE_VERSION(dentry->d_inode) == MINIX_V1) + generic_fillattr(d_inode(dentry), stat); + if (INODE_VERSION(d_inode(dentry)) == MINIX_V1) stat->blocks = (BLOCK_SIZE / 512) * V1_minix_blocks(stat->size, sb); else stat->blocks = (sb->s_blocksize / 512) * V2_minix_blocks(stat->size, sb); diff --git a/fs/minix/minix.h b/fs/minix/minix.h index 1ebd11854622..01ad81dcacc5 100644 --- a/fs/minix/minix.h +++ b/fs/minix/minix.h @@ -84,7 +84,7 @@ static inline struct minix_sb_info *minix_sb(struct super_block *sb) static inline struct minix_inode_info *minix_i(struct inode *inode) { - return list_entry(inode, struct minix_inode_info, vfs_inode); + return container_of(inode, struct minix_inode_info, vfs_inode); } static inline unsigned minix_blocks_needed(unsigned bits, unsigned blocksize) diff --git a/fs/minix/namei.c b/fs/minix/namei.c index cd950e2331b6..a795a11e50c7 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -104,7 +104,7 @@ out_fail: static int minix_link(struct dentry * old_dentry, struct inode * dir, struct dentry *dentry) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); inode->i_ctime = CURRENT_TIME_SEC; inode_inc_link_count(inode); @@ -151,7 +151,7 @@ out_dir: static int minix_unlink(struct inode * dir, struct dentry *dentry) { int err = -ENOENT; - struct inode * inode = dentry->d_inode; + struct inode * inode = d_inode(dentry); struct page * page; struct minix_dir_entry * de; @@ -171,7 +171,7 @@ end_unlink: static int minix_rmdir(struct inode * dir, struct dentry *dentry) { - struct inode * inode = dentry->d_inode; + struct inode * inode = d_inode(dentry); int err = -ENOTEMPTY; if (minix_empty_dir(inode)) { @@ -187,8 +187,8 @@ static int minix_rmdir(struct inode * dir, struct dentry *dentry) static int minix_rename(struct inode * old_dir, struct dentry *old_dentry, struct inode * new_dir, struct dentry *new_dentry) { - struct inode * old_inode = old_dentry->d_inode; - struct inode * new_inode = new_dentry->d_inode; + struct inode * old_inode = 
d_inode(old_dentry); + struct inode * new_inode = d_inode(new_dentry); struct page * dir_page = NULL; struct minix_dir_entry * dir_de = NULL; struct page * old_page; diff --git a/fs/mount.h b/fs/mount.h index 6a61c2b3e385..14db05d424f7 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -88,6 +88,7 @@ static inline int is_mounted(struct vfsmount *mnt) extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *); extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *); +extern int __legitimize_mnt(struct vfsmount *, unsigned); extern bool legitimize_mnt(struct vfsmount *, unsigned); extern void __detach_mounts(struct dentry *dentry); @@ -117,7 +118,6 @@ static inline void unlock_mount_hash(void) } struct proc_mounts { - struct seq_file m; struct mnt_namespace *ns; struct path root; int (*show)(struct seq_file *, struct vfsmount *); @@ -126,8 +126,6 @@ struct proc_mounts { loff_t cached_index; }; -#define proc_mounts(p) (container_of((p), struct proc_mounts, m)) - extern const struct seq_operations mounts_op; extern bool __is_local_mountpoint(struct dentry *dentry); diff --git a/fs/mpage.c b/fs/mpage.c index 3e79220babac..ca0244b69de8 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -605,6 +605,8 @@ alloc_new: bio_get_nr_vecs(bdev), GFP_NOFS|__GFP_HIGH); if (bio == NULL) goto confused; + + wbc_init_bio(wbc, bio); } /* @@ -612,6 +614,7 @@ alloc_new: * the confused fail path above (OOM) will be very confused when * it finds all bh marked clean (i.e. it will not write anything) */ + wbc_account_io(wbc, page, PAGE_SIZE); length = first_unmapped << blkbits; if (bio_add_page(bio, page, length, 0) < length) { bio = mpage_bio_submit(WRITE, bio); diff --git a/fs/namei.c b/fs/namei.c index c83145af4bfc..fbbcf0993312 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -119,15 +119,14 @@ * PATH_MAX includes the nul terminator --RR. */ -#define EMBEDDED_NAME_MAX (PATH_MAX - sizeof(struct filename)) +#define EMBEDDED_NAME_MAX (PATH_MAX - offsetof(struct filename, iname)) struct filename * getname_flags(const char __user *filename, int flags, int *empty) { - struct filename *result, *err; - int len; - long max; + struct filename *result; char *kname; + int len; result = audit_reusename(filename); if (result) @@ -136,22 +135,18 @@ getname_flags(const char __user *filename, int flags, int *empty) result = __getname(); if (unlikely(!result)) return ERR_PTR(-ENOMEM); - result->refcnt = 1; /* * First, try to embed the struct filename inside the names_cache * allocation */ - kname = (char *)result + sizeof(*result); + kname = (char *)result->iname; result->name = kname; - result->separate = false; - max = EMBEDDED_NAME_MAX; -recopy: - len = strncpy_from_user(kname, filename, max); + len = strncpy_from_user(kname, filename, EMBEDDED_NAME_MAX); if (unlikely(len < 0)) { - err = ERR_PTR(len); - goto error; + __putname(result); + return ERR_PTR(len); } /* @@ -160,43 +155,49 @@ recopy: * names_cache allocation for the pathname, and re-do the copy from * userland. */ - if (len == EMBEDDED_NAME_MAX && max == EMBEDDED_NAME_MAX) { + if (unlikely(len == EMBEDDED_NAME_MAX)) { + const size_t size = offsetof(struct filename, iname[1]); kname = (char *)result; - result = kzalloc(sizeof(*result), GFP_KERNEL); - if (!result) { - err = ERR_PTR(-ENOMEM); - result = (struct filename *)kname; - goto error; + /* + * size is chosen that way we to guarantee that + * result->iname[0] is within the same object and that + * kname can't be equal to result->iname, no matter what. 
+ */ + result = kzalloc(size, GFP_KERNEL); + if (unlikely(!result)) { + __putname(kname); + return ERR_PTR(-ENOMEM); } result->name = kname; - result->separate = true; - result->refcnt = 1; - max = PATH_MAX; - goto recopy; + len = strncpy_from_user(kname, filename, PATH_MAX); + if (unlikely(len < 0)) { + __putname(kname); + kfree(result); + return ERR_PTR(len); + } + if (unlikely(len == PATH_MAX)) { + __putname(kname); + kfree(result); + return ERR_PTR(-ENAMETOOLONG); + } } + result->refcnt = 1; /* The empty path is special. */ if (unlikely(!len)) { if (empty) *empty = 1; - err = ERR_PTR(-ENOENT); - if (!(flags & LOOKUP_EMPTY)) - goto error; + if (!(flags & LOOKUP_EMPTY)) { + putname(result); + return ERR_PTR(-ENOENT); + } } - err = ERR_PTR(-ENAMETOOLONG); - if (unlikely(len >= PATH_MAX)) - goto error; - result->uptr = filename; result->aname = NULL; audit_getname(result); return result; - -error: - putname(result); - return err; } struct filename * @@ -216,8 +217,7 @@ getname_kernel(const char * filename) return ERR_PTR(-ENOMEM); if (len <= EMBEDDED_NAME_MAX) { - result->name = (char *)(result) + sizeof(*result); - result->separate = false; + result->name = (char *)result->iname; } else if (len <= PATH_MAX) { struct filename *tmp; @@ -227,7 +227,6 @@ getname_kernel(const char * filename) return ERR_PTR(-ENOMEM); } tmp->name = (char *)result; - tmp->separate = true; result = tmp; } else { __putname(result); @@ -249,7 +248,7 @@ void putname(struct filename *name) if (--name->refcnt > 0) return; - if (name->separate) { + if (name->name != name->iname) { __putname(name->name); kfree(name); } else @@ -493,6 +492,7 @@ void path_put(const struct path *path) } EXPORT_SYMBOL(path_put); +#define EMBEDDED_LEVELS 2 struct nameidata { struct path path; struct qstr last; @@ -502,10 +502,139 @@ struct nameidata { unsigned seq, m_seq; int last_type; unsigned depth; - struct file *base; - char *saved_names[MAX_NESTED_LINKS + 1]; + int total_link_count; + struct saved { + struct path link; + void *cookie; + const char *name; + struct inode *inode; + unsigned seq; + } *stack, internal[EMBEDDED_LEVELS]; + struct filename *name; + struct nameidata *saved; + unsigned root_seq; + int dfd; }; +static void set_nameidata(struct nameidata *p, int dfd, struct filename *name) +{ + struct nameidata *old = current->nameidata; + p->stack = p->internal; + p->dfd = dfd; + p->name = name; + p->total_link_count = old ? 
old->total_link_count : 0; + p->saved = old; + current->nameidata = p; +} + +static void restore_nameidata(void) +{ + struct nameidata *now = current->nameidata, *old = now->saved; + + current->nameidata = old; + if (old) + old->total_link_count = now->total_link_count; + if (now->stack != now->internal) { + kfree(now->stack); + now->stack = now->internal; + } +} + +static int __nd_alloc_stack(struct nameidata *nd) +{ + struct saved *p; + + if (nd->flags & LOOKUP_RCU) { + p = kmalloc(MAXSYMLINKS * sizeof(struct saved), + GFP_ATOMIC); + if (unlikely(!p)) + return -ECHILD; + } else { + p = kmalloc(MAXSYMLINKS * sizeof(struct saved), + GFP_KERNEL); + if (unlikely(!p)) + return -ENOMEM; + } + memcpy(p, nd->internal, sizeof(nd->internal)); + nd->stack = p; + return 0; +} + +static inline int nd_alloc_stack(struct nameidata *nd) +{ + if (likely(nd->depth != EMBEDDED_LEVELS)) + return 0; + if (likely(nd->stack != nd->internal)) + return 0; + return __nd_alloc_stack(nd); +} + +static void drop_links(struct nameidata *nd) +{ + int i = nd->depth; + while (i--) { + struct saved *last = nd->stack + i; + struct inode *inode = last->inode; + if (last->cookie && inode->i_op->put_link) { + inode->i_op->put_link(inode, last->cookie); + last->cookie = NULL; + } + } +} + +static void terminate_walk(struct nameidata *nd) +{ + drop_links(nd); + if (!(nd->flags & LOOKUP_RCU)) { + int i; + path_put(&nd->path); + for (i = 0; i < nd->depth; i++) + path_put(&nd->stack[i].link); + if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { + path_put(&nd->root); + nd->root.mnt = NULL; + } + } else { + nd->flags &= ~LOOKUP_RCU; + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; + rcu_read_unlock(); + } + nd->depth = 0; +} + +/* path_put is needed afterwards regardless of success or failure */ +static bool legitimize_path(struct nameidata *nd, + struct path *path, unsigned seq) +{ + int res = __legitimize_mnt(path->mnt, nd->m_seq); + if (unlikely(res)) { + if (res > 0) + path->mnt = NULL; + path->dentry = NULL; + return false; + } + if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) { + path->dentry = NULL; + return false; + } + return !read_seqcount_retry(&path->dentry->d_seq, seq); +} + +static bool legitimize_links(struct nameidata *nd) +{ + int i; + for (i = 0; i < nd->depth; i++) { + struct saved *last = nd->stack + i; + if (unlikely(!legitimize_path(nd, &last->link, last->seq))) { + drop_links(nd); + nd->depth = i + 1; + return false; + } + } + return true; +} + /* * Path walking has 2 modes, rcu-walk and ref-walk (see * Documentation/filesystems/path-lookup.txt). In situations when we can't @@ -521,35 +650,28 @@ struct nameidata { * unlazy_walk - try to switch to ref-walk mode. * @nd: nameidata pathwalk data * @dentry: child of nd->path.dentry or NULL + * @seq: seq number to check dentry against * Returns: 0 on success, -ECHILD on failure * * unlazy_walk attempts to legitimize the current nd->path, nd->root and dentry * for ref-walk mode. @dentry must be a path found by a do_lookup call on * @nd or NULL. Must be called from rcu-walk context. + * Nothing should touch nameidata between unlazy_walk() failure and + * terminate_walk().
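
(Editor's aside, illustration only - not part of the patch. The struct saved machinery above gives each walker two embedded link frames, internal[EMBEDDED_LEVELS], and switches to a kmalloc()ed MAXSYMLINKS-sized array only when nesting gets deeper; restore_nameidata() frees that array iff nd->stack no longer points at nd->internal. A minimal userspace sketch of the same small-buffer pattern; walk_stack, stack_push() and friends are invented names.)

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define EMBEDDED_LEVELS	2
#define MAX_LEVELS	40	/* MAXSYMLINKS is 40 on Linux */

struct frame {
	const char *name;	/* the unfinished tail of a pathname */
};

struct walk_stack {
	int depth;
	struct frame *stack;	/* == internal until the walk spills */
	struct frame internal[EMBEDDED_LEVELS];
};

static void stack_init(struct walk_stack *s)
{
	s->depth = 0;
	s->stack = s->internal;
}

/* Mirrors nd_alloc_stack()/__nd_alloc_stack(): the full-size array is
 * allocated only when a third nested link shows up, and the embedded
 * frames are copied across. */
static int stack_push(struct walk_stack *s, const char *name)
{
	if (s->depth == MAX_LEVELS)
		return -1;		/* the -ELOOP case */
	if (s->depth == EMBEDDED_LEVELS && s->stack == s->internal) {
		struct frame *p = malloc(MAX_LEVELS * sizeof(struct frame));
		if (!p)
			return -1;	/* the -ENOMEM case */
		memcpy(p, s->internal, sizeof(s->internal));
		s->stack = p;
	}
	s->stack[s->depth++].name = name;
	return 0;
}

/* Mirrors the tail of restore_nameidata(). */
static void stack_fini(struct walk_stack *s)
{
	if (s->stack != s->internal)
		free(s->stack);
}

int main(void)
{
	struct walk_stack s;
	int i;

	stack_init(&s);
	for (i = 0; i < 5; i++)		/* the third push triggers the spill */
		stack_push(&s, "rest/of/path");
	printf("depth %d, spilled: %s\n", s.depth,
	       s.stack == s.internal ? "no" : "yes");
	stack_fini(&s);
	return 0;
}

(end of aside)
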
*/ -static int unlazy_walk(struct nameidata *nd, struct dentry *dentry) +static int unlazy_walk(struct nameidata *nd, struct dentry *dentry, unsigned seq) { - struct fs_struct *fs = current->fs; struct dentry *parent = nd->path.dentry; BUG_ON(!(nd->flags & LOOKUP_RCU)); - /* - * After legitimizing the bastards, terminate_walk() - * will do the right thing for non-RCU mode, and all our - * subsequent exit cases should rcu_read_unlock() - * before returning. Do vfsmount first; if dentry - * can't be legitimized, just set nd->path.dentry to NULL - * and rely on dput(NULL) being a no-op. - */ - if (!legitimize_mnt(nd->path.mnt, nd->m_seq)) - return -ECHILD; nd->flags &= ~LOOKUP_RCU; - - if (!lockref_get_not_dead(&parent->d_lockref)) { - nd->path.dentry = NULL; - goto out; - } + if (unlikely(!legitimize_links(nd))) + goto out2; + if (unlikely(!legitimize_mnt(nd->path.mnt, nd->m_seq))) + goto out2; + if (unlikely(!lockref_get_not_dead(&parent->d_lockref))) + goto out1; /* * For a negative lookup, the lookup sequence point is the parents @@ -569,7 +691,7 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry) } else { if (!lockref_get_not_dead(&dentry->d_lockref)) goto out; - if (read_seqcount_retry(&dentry->d_seq, nd->seq)) + if (read_seqcount_retry(&dentry->d_seq, seq)) goto drop_dentry; } @@ -578,22 +700,24 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry) * still valid and get it if required. */ if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { - spin_lock(&fs->lock); - if (nd->root.mnt != fs->root.mnt || nd->root.dentry != fs->root.dentry) - goto unlock_and_drop_dentry; - path_get(&nd->root); - spin_unlock(&fs->lock); + if (unlikely(!legitimize_path(nd, &nd->root, nd->root_seq))) { + rcu_read_unlock(); + dput(dentry); + return -ECHILD; + } } rcu_read_unlock(); return 0; -unlock_and_drop_dentry: - spin_unlock(&fs->lock); drop_dentry: rcu_read_unlock(); dput(dentry); goto drop_root_mnt; +out2: + nd->path.mnt = NULL; +out1: + nd->path.dentry = NULL; out: rcu_read_unlock(); drop_root_mnt: @@ -602,6 +726,24 @@ drop_root_mnt: return -ECHILD; } +static int unlazy_link(struct nameidata *nd, struct path *link, unsigned seq) +{ + if (unlikely(!legitimize_path(nd, link, seq))) { + drop_links(nd); + nd->depth = 0; + nd->flags &= ~LOOKUP_RCU; + nd->path.mnt = NULL; + nd->path.dentry = NULL; + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; + rcu_read_unlock(); + } else if (likely(unlazy_walk(nd, NULL, 0) == 0)) { + return 0; + } + path_put(link); + return -ECHILD; +} + static inline int d_revalidate(struct dentry *dentry, unsigned int flags) { return dentry->d_op->d_revalidate(dentry, flags); @@ -623,26 +765,10 @@ static int complete_walk(struct nameidata *nd) int status; if (nd->flags & LOOKUP_RCU) { - nd->flags &= ~LOOKUP_RCU; if (!(nd->flags & LOOKUP_ROOT)) nd->root.mnt = NULL; - - if (!legitimize_mnt(nd->path.mnt, nd->m_seq)) { - rcu_read_unlock(); - return -ECHILD; - } - if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) { - rcu_read_unlock(); - mntput(nd->path.mnt); - return -ECHILD; - } - if (read_seqcount_retry(&dentry->d_seq, nd->seq)) { - rcu_read_unlock(); - dput(dentry); - mntput(nd->path.mnt); + if (unlikely(unlazy_walk(nd, NULL, 0))) return -ECHILD; - } - rcu_read_unlock(); } if (likely(!(nd->flags & LOOKUP_JUMPED))) @@ -658,28 +784,24 @@ static int complete_walk(struct nameidata *nd) if (!status) status = -ESTALE; - path_put(&nd->path); return status; } -static __always_inline void set_root(struct nameidata *nd) +static void set_root(struct
nameidata *nd) { get_fs_root(current->fs, &nd->root); } -static int link_path_walk(const char *, struct nameidata *); - -static __always_inline unsigned set_root_rcu(struct nameidata *nd) +static void set_root_rcu(struct nameidata *nd) { struct fs_struct *fs = current->fs; - unsigned seq, res; + unsigned seq; do { seq = read_seqcount_begin(&fs->seq); nd->root = fs->root; - res = __read_seqcount_begin(&nd->root.dentry->d_seq); + nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq); } while (read_seqcount_retry(&fs->seq, seq)); - return res; } static void path_put_conditional(struct path *path, struct nameidata *nd) @@ -705,8 +827,9 @@ static inline void path_to_nameidata(const struct path *path, * Helper to directly jump to a known parsed path from ->follow_link, * caller must have taken a reference to path beforehand. */ -void nd_jump_link(struct nameidata *nd, struct path *path) +void nd_jump_link(struct path *path) { + struct nameidata *nd = current->nameidata; path_put(&nd->path); nd->path = *path; @@ -714,24 +837,14 @@ void nd_jump_link(struct nameidata *nd, struct path *path) nd->flags |= LOOKUP_JUMPED; } -void nd_set_link(struct nameidata *nd, char *path) -{ - nd->saved_names[nd->depth] = path; -} -EXPORT_SYMBOL(nd_set_link); - -char *nd_get_link(struct nameidata *nd) -{ - return nd->saved_names[nd->depth]; -} -EXPORT_SYMBOL(nd_get_link); - -static inline void put_link(struct nameidata *nd, struct path *link, void *cookie) +static inline void put_link(struct nameidata *nd) { - struct inode *inode = link->dentry->d_inode; - if (inode->i_op->put_link) - inode->i_op->put_link(link->dentry, nd, cookie); - path_put(link); + struct saved *last = nd->stack + --nd->depth; + struct inode *inode = last->inode; + if (last->cookie && inode->i_op->put_link) + inode->i_op->put_link(inode, last->cookie); + if (!(nd->flags & LOOKUP_RCU)) + path_put(&last->link); } int sysctl_protected_symlinks __read_mostly = 0; @@ -739,7 +852,6 @@ int sysctl_protected_hardlinks __read_mostly = 0; /** * may_follow_link - Check symlink following for unsafe situations - * @link: The path of the symlink * @nd: nameidata pathwalk data * * In the case of the sysctl_protected_symlinks sysctl being enabled, @@ -753,7 +865,7 @@ int sysctl_protected_hardlinks __read_mostly = 0; * * Returns 0 if following the symlink is allowed, -ve on error. */ -static inline int may_follow_link(struct path *link, struct nameidata *nd) +static inline int may_follow_link(struct nameidata *nd) { const struct inode *inode; const struct inode *parent; @@ -762,7 +874,7 @@ static inline int may_follow_link(struct path *link, struct nameidata *nd) return 0; /* Allowed if owner and follower match. 
*/ - inode = link->dentry->d_inode; + inode = nd->stack[0].inode; if (uid_eq(current_cred()->fsuid, inode->i_uid)) return 0; @@ -775,9 +887,10 @@ static inline int may_follow_link(struct path *link, struct nameidata *nd) if (uid_eq(parent->i_uid, inode->i_uid)) return 0; - audit_log_link_denied("follow_link", link); - path_put_conditional(link, nd); - path_put(&nd->path); + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + audit_log_link_denied("follow_link", &nd->stack[0].link); return -EACCES; } @@ -850,82 +963,68 @@ static int may_linkat(struct path *link) return -EPERM; } -static __always_inline int -follow_link(struct path *link, struct nameidata *nd, void **p) +static __always_inline +const char *get_link(struct nameidata *nd) { - struct dentry *dentry = link->dentry; + struct saved *last = nd->stack + nd->depth - 1; + struct dentry *dentry = last->link.dentry; + struct inode *inode = last->inode; int error; - char *s; + const char *res; - BUG_ON(nd->flags & LOOKUP_RCU); - - if (link->mnt == nd->path.mnt) - mntget(link->mnt); - - error = -ELOOP; - if (unlikely(current->total_link_count >= 40)) - goto out_put_nd_path; - - cond_resched(); - current->total_link_count++; - - touch_atime(link); - nd_set_link(nd, NULL); + if (!(nd->flags & LOOKUP_RCU)) { + touch_atime(&last->link); + cond_resched(); + } else if (atime_needs_update(&last->link, inode)) { + if (unlikely(unlazy_walk(nd, NULL, 0))) + return ERR_PTR(-ECHILD); + touch_atime(&last->link); + } - error = security_inode_follow_link(link->dentry, nd); - if (error) - goto out_put_nd_path; + error = security_inode_follow_link(dentry, inode, + nd->flags & LOOKUP_RCU); + if (unlikely(error)) + return ERR_PTR(error); nd->last_type = LAST_BIND; - *p = dentry->d_inode->i_op->follow_link(dentry, nd); - error = PTR_ERR(*p); - if (IS_ERR(*p)) - goto out_put_nd_path; - - error = 0; - s = nd_get_link(nd); - if (s) { - if (unlikely(IS_ERR(s))) { - path_put(&nd->path); - put_link(nd, link, *p); - return PTR_ERR(s); + res = inode->i_link; + if (!res) { + if (nd->flags & LOOKUP_RCU) { + if (unlikely(unlazy_walk(nd, NULL, 0))) + return ERR_PTR(-ECHILD); } - if (*s == '/') { + res = inode->i_op->follow_link(dentry, &last->cookie); + if (IS_ERR_OR_NULL(res)) { + last->cookie = NULL; + return res; + } + } + if (*res == '/') { + if (nd->flags & LOOKUP_RCU) { + struct dentry *d; + if (!nd->root.mnt) + set_root_rcu(nd); + nd->path = nd->root; + d = nd->path.dentry; + nd->inode = d->d_inode; + nd->seq = nd->root_seq; + if (unlikely(read_seqcount_retry(&d->d_seq, nd->seq))) + return ERR_PTR(-ECHILD); + } else { if (!nd->root.mnt) set_root(nd); path_put(&nd->path); nd->path = nd->root; path_get(&nd->root); - nd->flags |= LOOKUP_JUMPED; + nd->inode = nd->path.dentry->d_inode; } - nd->inode = nd->path.dentry->d_inode; - error = link_path_walk(s, nd); - if (unlikely(error)) - put_link(nd, link, *p); + nd->flags |= LOOKUP_JUMPED; + while (unlikely(*++res == '/')) + ; } - - return error; - -out_put_nd_path: - *p = NULL; - path_put(&nd->path); - path_put(link); - return error; -} - -static int follow_up_rcu(struct path *path) -{ - struct mount *mnt = real_mount(path->mnt); - struct mount *parent; - struct dentry *mountpoint; - - parent = mnt->mnt_parent; - if (&parent->mnt == path->mnt) - return 0; - mountpoint = mnt->mnt_mountpoint; - path->dentry = mountpoint; - path->mnt = &parent->mnt; - return 1; + if (!*res) + res = NULL; + return res; } /* @@ -966,7 +1065,7 @@ EXPORT_SYMBOL(follow_up); * - return -EISDIR to tell follow_managed() to stop and return the path 
we * were called with. */ -static int follow_automount(struct path *path, unsigned flags, +static int follow_automount(struct path *path, struct nameidata *nd, bool *need_mntput) { struct vfsmount *mnt; @@ -986,13 +1085,13 @@ static int follow_automount(struct path *path, unsigned flags, * as being automount points. These will need the attentions * of the daemon to instantiate them before they can be used. */ - if (!(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | - LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) && + if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | + LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) && path->dentry->d_inode) return -EISDIR; - current->total_link_count++; - if (current->total_link_count >= 40) + nd->total_link_count++; + if (nd->total_link_count >= 40) return -ELOOP; mnt = path->dentry->d_op->d_automount(path); @@ -1006,7 +1105,7 @@ static int follow_automount(struct path *path, unsigned flags, * the path being looked up; if it wasn't then the remainder of * the path is inaccessible and we should say so. */ - if (PTR_ERR(mnt) == -EISDIR && (flags & LOOKUP_PARENT)) + if (PTR_ERR(mnt) == -EISDIR && (nd->flags & LOOKUP_PARENT)) return -EREMOTE; return PTR_ERR(mnt); } @@ -1046,7 +1145,7 @@ static int follow_automount(struct path *path, unsigned flags, * * Serialization is taken care of in namespace.c */ -static int follow_managed(struct path *path, unsigned flags) +static int follow_managed(struct path *path, struct nameidata *nd) { struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */ unsigned managed; @@ -1090,7 +1189,7 @@ static int follow_managed(struct path *path, unsigned flags) /* Handle an automount point */ if (managed & DCACHE_NEED_AUTOMOUNT) { - ret = follow_automount(path, flags, &need_mntput); + ret = follow_automount(path, nd, &need_mntput); if (ret < 0) break; continue; @@ -1104,7 +1203,11 @@ static int follow_managed(struct path *path, unsigned flags) mntput(path->mnt); if (ret == -EISDIR) ret = 0; - return ret < 0 ? ret : need_mntput; + if (need_mntput) + nd->flags |= LOOKUP_JUMPED; + if (unlikely(ret < 0)) + path_put_conditional(path, nd); + return ret; } int follow_down_one(struct path *path) @@ -1134,7 +1237,7 @@ static inline int managed_dentry_rcu(struct dentry *dentry) * we meet a managed dentry that would need blocking. */ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, - struct inode **inode) + struct inode **inode, unsigned *seqp) { for (;;) { struct mount *mounted; @@ -1161,7 +1264,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, path->mnt = &mounted->mnt; path->dentry = mounted->mnt.mnt_root; nd->flags |= LOOKUP_JUMPED; - nd->seq = read_seqcount_begin(&path->dentry->d_seq); + *seqp = read_seqcount_begin(&path->dentry->d_seq); /* * Update the inode too. 
We don't need to re-check the * dentry sequence number here after this d_inode read, @@ -1180,10 +1283,8 @@ static int follow_dotdot_rcu(struct nameidata *nd) set_root_rcu(nd); while (1) { - if (nd->path.dentry == nd->root.dentry && - nd->path.mnt == nd->root.mnt) { + if (path_equal(&nd->path, &nd->root)) break; - } if (nd->path.dentry != nd->path.mnt->mnt_root) { struct dentry *old = nd->path.dentry; struct dentry *parent = old->d_parent; @@ -1191,38 +1292,42 @@ static int follow_dotdot_rcu(struct nameidata *nd) inode = parent->d_inode; seq = read_seqcount_begin(&parent->d_seq); - if (read_seqcount_retry(&old->d_seq, nd->seq)) - goto failed; + if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq))) + return -ECHILD; nd->path.dentry = parent; nd->seq = seq; break; + } else { + struct mount *mnt = real_mount(nd->path.mnt); + struct mount *mparent = mnt->mnt_parent; + struct dentry *mountpoint = mnt->mnt_mountpoint; + struct inode *inode2 = mountpoint->d_inode; + unsigned seq = read_seqcount_begin(&mountpoint->d_seq); + if (unlikely(read_seqretry(&mount_lock, nd->m_seq))) + return -ECHILD; + if (&mparent->mnt == nd->path.mnt) + break; + /* we know that mountpoint was pinned */ + nd->path.dentry = mountpoint; + nd->path.mnt = &mparent->mnt; + inode = inode2; + nd->seq = seq; } - if (!follow_up_rcu(&nd->path)) - break; - inode = nd->path.dentry->d_inode; - nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); } - while (d_mountpoint(nd->path.dentry)) { + while (unlikely(d_mountpoint(nd->path.dentry))) { struct mount *mounted; mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry); + if (unlikely(read_seqretry(&mount_lock, nd->m_seq))) + return -ECHILD; if (!mounted) break; nd->path.mnt = &mounted->mnt; nd->path.dentry = mounted->mnt.mnt_root; inode = nd->path.dentry->d_inode; nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); - if (read_seqretry(&mount_lock, nd->m_seq)) - goto failed; } nd->inode = inode; return 0; - -failed: - nd->flags &= ~LOOKUP_RCU; - if (!(nd->flags & LOOKUP_ROOT)) - nd->root.mnt = NULL; - rcu_read_unlock(); - return -ECHILD; } /* @@ -1401,7 +1506,8 @@ static struct dentry *__lookup_hash(struct qstr *name, * It _is_ time-critical. */ static int lookup_fast(struct nameidata *nd, - struct path *path, struct inode **inode) + struct path *path, struct inode **inode, + unsigned *seqp) { struct vfsmount *mnt = nd->path.mnt; struct dentry *dentry, *parent = nd->path.dentry; @@ -1416,6 +1522,7 @@ static int lookup_fast(struct nameidata *nd, */ if (nd->flags & LOOKUP_RCU) { unsigned seq; + bool negative; dentry = __d_lookup_rcu(parent, &nd->last, &seq); if (!dentry) goto unlazy; @@ -1424,9 +1531,12 @@ static int lookup_fast(struct nameidata *nd, * This sequence count validates that the inode matches * the dentry name information from lookup. 
*/ - *inode = dentry->d_inode; + *inode = d_backing_inode(dentry); + negative = d_is_negative(dentry); if (read_seqcount_retry(&dentry->d_seq, seq)) return -ECHILD; + if (negative) + return -ENOENT; /* * This sequence count validates that the parent had no @@ -1437,8 +1547,8 @@ static int lookup_fast(struct nameidata *nd, */ if (__read_seqcount_retry(&parent->d_seq, nd->seq)) return -ECHILD; - nd->seq = seq; + *seqp = seq; if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) { status = d_revalidate(dentry, nd->flags); if (unlikely(status <= 0)) { @@ -1449,10 +1559,10 @@ static int lookup_fast(struct nameidata *nd, } path->mnt = mnt; path->dentry = dentry; - if (likely(__follow_mount_rcu(nd, path, inode))) + if (likely(__follow_mount_rcu(nd, path, inode, seqp))) return 0; unlazy: - if (unlazy_walk(nd, dentry)) + if (unlazy_walk(nd, dentry, seq)) return -ECHILD; } else { dentry = __d_lookup(parent, &nd->last); @@ -1473,17 +1583,16 @@ unlazy: goto need_lookup; } + if (unlikely(d_is_negative(dentry))) { + dput(dentry); + return -ENOENT; + } path->mnt = mnt; path->dentry = dentry; - err = follow_managed(path, nd->flags); - if (unlikely(err < 0)) { - path_put_conditional(path, nd); - return err; - } - if (err) - nd->flags |= LOOKUP_JUMPED; - *inode = path->dentry->d_inode; - return 0; + err = follow_managed(path, nd); + if (likely(!err)) + *inode = d_backing_inode(path->dentry); + return err; need_lookup: return 1; @@ -1493,7 +1602,6 @@ need_lookup: static int lookup_slow(struct nameidata *nd, struct path *path) { struct dentry *dentry, *parent; - int err; parent = nd->path.dentry; BUG_ON(nd->inode != parent->d_inode); @@ -1505,14 +1613,7 @@ static int lookup_slow(struct nameidata *nd, struct path *path) return PTR_ERR(dentry); path->mnt = nd->path.mnt; path->dentry = dentry; - err = follow_managed(path, nd->flags); - if (unlikely(err < 0)) { - path_put_conditional(path, nd); - return err; - } - if (err) - nd->flags |= LOOKUP_JUMPED; - return 0; + return follow_managed(path, nd); } static inline int may_lookup(struct nameidata *nd) @@ -1521,7 +1622,7 @@ static inline int may_lookup(struct nameidata *nd) int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK); if (err != -ECHILD) return err; - if (unlazy_walk(nd, NULL)) + if (unlazy_walk(nd, NULL, 0)) return -ECHILD; } return inode_permission(nd->inode, MAY_EXEC); @@ -1531,24 +1632,45 @@ static inline int handle_dots(struct nameidata *nd, int type) { if (type == LAST_DOTDOT) { if (nd->flags & LOOKUP_RCU) { - if (follow_dotdot_rcu(nd)) - return -ECHILD; + return follow_dotdot_rcu(nd); } else follow_dotdot(nd); } return 0; } -static void terminate_walk(struct nameidata *nd) +static int pick_link(struct nameidata *nd, struct path *link, + struct inode *inode, unsigned seq) { + int error; + struct saved *last; + if (unlikely(nd->total_link_count++ >= MAXSYMLINKS)) { + path_to_nameidata(link, nd); + return -ELOOP; + } if (!(nd->flags & LOOKUP_RCU)) { - path_put(&nd->path); - } else { - nd->flags &= ~LOOKUP_RCU; - if (!(nd->flags & LOOKUP_ROOT)) - nd->root.mnt = NULL; - rcu_read_unlock(); + if (link->mnt == nd->path.mnt) + mntget(link->mnt); } + error = nd_alloc_stack(nd); + if (unlikely(error)) { + if (error == -ECHILD) { + if (unlikely(unlazy_link(nd, link, seq))) + return -ECHILD; + error = nd_alloc_stack(nd); + } + if (error) { + path_put(link); + return error; + } + } + + last = nd->stack + nd->depth++; + last->link = *link; + last->cookie = NULL; + last->inode = inode; + last->seq = seq; + return 1; } /* @@ -1557,97 +1679,68 @@ static void 
terminate_walk(struct nameidata *nd) * so we keep a cache of "no, this doesn't need follow_link" * for the common case. */ -static inline int should_follow_link(struct dentry *dentry, int follow) +static inline int should_follow_link(struct nameidata *nd, struct path *link, + int follow, + struct inode *inode, unsigned seq) { - return unlikely(d_is_symlink(dentry)) ? follow : 0; + if (likely(!d_is_symlink(link->dentry))) + return 0; + if (!follow) + return 0; + return pick_link(nd, link, inode, seq); } -static inline int walk_component(struct nameidata *nd, struct path *path, - int follow) +enum {WALK_GET = 1, WALK_PUT = 2}; + +static int walk_component(struct nameidata *nd, int flags) { + struct path path; struct inode *inode; + unsigned seq; int err; /* * "." and ".." are special - ".." especially so because it has * to be able to know about the current root directory and * parent relationships. */ - if (unlikely(nd->last_type != LAST_NORM)) - return handle_dots(nd, nd->last_type); - err = lookup_fast(nd, path, &inode); + if (unlikely(nd->last_type != LAST_NORM)) { + err = handle_dots(nd, nd->last_type); + if (flags & WALK_PUT) + put_link(nd); + return err; + } + err = lookup_fast(nd, &path, &inode, &seq); if (unlikely(err)) { if (err < 0) - goto out_err; + return err; - err = lookup_slow(nd, path); + err = lookup_slow(nd, &path); if (err < 0) - goto out_err; + return err; - inode = path->dentry->d_inode; + inode = d_backing_inode(path.dentry); + seq = 0; /* we are already out of RCU mode */ + err = -ENOENT; + if (d_is_negative(path.dentry)) + goto out_path_put; } - err = -ENOENT; - if (!inode || d_is_negative(path->dentry)) - goto out_path_put; - if (should_follow_link(path->dentry, follow)) { - if (nd->flags & LOOKUP_RCU) { - if (unlikely(unlazy_walk(nd, path->dentry))) { - err = -ECHILD; - goto out_err; - } - } - BUG_ON(inode != path->dentry->d_inode); - return 1; - } - path_to_nameidata(path, nd); + if (flags & WALK_PUT) + put_link(nd); + err = should_follow_link(nd, &path, flags & WALK_GET, inode, seq); + if (unlikely(err)) + return err; + path_to_nameidata(&path, nd); nd->inode = inode; + nd->seq = seq; return 0; out_path_put: - path_to_nameidata(path, nd); -out_err: - terminate_walk(nd); + path_to_nameidata(&path, nd); return err; } /* - * This limits recursive symlink follows to 8, while - * limiting consecutive symlinks to 40. - * - * Without that kind of total limit, nasty chains of consecutive - * symlinks can cause almost arbitrarily long lookups. 
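
(Editor's aside, illustration only - not part of the patch. The removal below deletes nested_symlink() and with it the recursion that forced the old two-tier limit of 8 levels deep / 40 links total. Resolution becomes a flat loop with a single MAXSYMLINKS budget; the unfinished tail of each outer pathname is parked in nd->stack[].name and resumed later, which is what the reworked link_path_walk() further down does. A toy of the flat-loop idea; readlink_toy() and resolve() are invented names.)

#include <stdio.h>
#include <string.h>

#define MAXSYMLINKS 40

/* Toy namespace: a name either is a "symlink" to another name or a
 * plain file (NULL). */
static const char *readlink_toy(const char *name)
{
	if (!strcmp(name, "a"))    return "b";
	if (!strcmp(name, "b"))    return "c";
	if (!strcmp(name, "loop")) return "loop";
	return NULL;
}

/* One flat loop, one total budget: no recursion depth to worry about,
 * which is what lets the kernel side drop MAX_NESTED_LINKS. */
static const char *resolve(const char *name)
{
	const char *t;
	int links = 0;

	while ((t = readlink_toy(name)) != NULL) {
		if (++links > MAXSYMLINKS)
			return NULL;	/* the -ELOOP case */
		name = t;
	}
	return name;
}

int main(void)
{
	const char *r = resolve("loop");

	printf("a resolves to %s\n", resolve("a"));
	printf("loop resolves to %s\n", r ? r : "ELOOP");
	return 0;
}

(end of aside)
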
- */ -static inline int nested_symlink(struct path *path, struct nameidata *nd) -{ - int res; - - if (unlikely(current->link_count >= MAX_NESTED_LINKS)) { - path_put_conditional(path, nd); - path_put(&nd->path); - return -ELOOP; - } - BUG_ON(nd->depth >= MAX_NESTED_LINKS); - - nd->depth++; - current->link_count++; - - do { - struct path link = *path; - void *cookie; - - res = follow_link(&link, nd, &cookie); - if (res) - break; - res = walk_component(nd, path, LOOKUP_FOLLOW); - put_link(nd, &link, cookie); - } while (res > 0); - - current->link_count--; - nd->depth--; - return res; -} - -/* * We can do the critical dentry name comparison and hashing * operations one word at a time, but we are limited to: * @@ -1773,9 +1866,8 @@ static inline u64 hash_name(const char *name) */ static int link_path_walk(const char *name, struct nameidata *nd) { - struct path next; int err; - + while (*name=='/') name++; if (!*name) @@ -1788,7 +1880,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) err = may_lookup(nd); if (err) - break; + return err; hash_len = hash_name(name); @@ -1810,7 +1902,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) struct qstr this = { { .hash_len = hash_len }, .name = name }; err = parent->d_op->d_hash(parent, &this); if (err < 0) - break; + return err; hash_len = this.hash_len; name = this.name; } @@ -1822,7 +1914,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) name += hashlen_len(hash_len); if (!*name) - return 0; + goto OK; /* * If it wasn't NUL, we know it was '/'. Skip that * slash, and continue until no more slashes. @@ -1830,71 +1922,94 @@ static int link_path_walk(const char *name, struct nameidata *nd) do { name++; } while (unlikely(*name == '/')); - if (!*name) - return 0; - - err = walk_component(nd, &next, LOOKUP_FOLLOW); + if (unlikely(!*name)) { +OK: + /* pathname body, done */ + if (!nd->depth) + return 0; + name = nd->stack[nd->depth - 1].name; + /* trailing symlink, done */ + if (!name) + return 0; + /* last component of nested symlink */ + err = walk_component(nd, WALK_GET | WALK_PUT); + } else { + err = walk_component(nd, WALK_GET); + } if (err < 0) return err; if (err) { - err = nested_symlink(&next, nd); - if (err) - return err; + const char *s = get_link(nd); + + if (unlikely(IS_ERR(s))) + return PTR_ERR(s); + err = 0; + if (unlikely(!s)) { + /* jumped */ + put_link(nd); + } else { + nd->stack[nd->depth - 1].name = name; + name = s; + continue; + } } - if (!d_can_lookup(nd->path.dentry)) { - err = -ENOTDIR; - break; + if (unlikely(!d_can_lookup(nd->path.dentry))) { + if (nd->flags & LOOKUP_RCU) { + if (unlazy_walk(nd, NULL, 0)) + return -ECHILD; + } + return -ENOTDIR; } } - terminate_walk(nd); - return err; } -static int path_init(int dfd, const char *name, unsigned int flags, - struct nameidata *nd) +static const char *path_init(struct nameidata *nd, unsigned flags) { int retval = 0; + const char *s = nd->name->name; nd->last_type = LAST_ROOT; /* if there are only slashes... 
*/ nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT; nd->depth = 0; - nd->base = NULL; + nd->total_link_count = 0; if (flags & LOOKUP_ROOT) { struct dentry *root = nd->root.dentry; struct inode *inode = root->d_inode; - if (*name) { + if (*s) { if (!d_can_lookup(root)) - return -ENOTDIR; + return ERR_PTR(-ENOTDIR); retval = inode_permission(inode, MAY_EXEC); if (retval) - return retval; + return ERR_PTR(retval); } nd->path = nd->root; nd->inode = inode; if (flags & LOOKUP_RCU) { rcu_read_lock(); nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); + nd->root_seq = nd->seq; nd->m_seq = read_seqbegin(&mount_lock); } else { path_get(&nd->path); } - goto done; + return s; } nd->root.mnt = NULL; nd->m_seq = read_seqbegin(&mount_lock); - if (*name=='/') { + if (*s == '/') { if (flags & LOOKUP_RCU) { rcu_read_lock(); - nd->seq = set_root_rcu(nd); + set_root_rcu(nd); + nd->seq = nd->root_seq; } else { set_root(nd); path_get(&nd->root); } nd->path = nd->root; - } else if (dfd == AT_FDCWD) { + } else if (nd->dfd == AT_FDCWD) { if (flags & LOOKUP_RCU) { struct fs_struct *fs = current->fs; unsigned seq; @@ -1911,188 +2026,205 @@ static int path_init(int dfd, const char *name, unsigned int flags, } } else { /* Caller must check execute permissions on the starting path component */ - struct fd f = fdget_raw(dfd); + struct fd f = fdget_raw(nd->dfd); struct dentry *dentry; if (!f.file) - return -EBADF; + return ERR_PTR(-EBADF); dentry = f.file->f_path.dentry; - if (*name) { + if (*s) { if (!d_can_lookup(dentry)) { fdput(f); - return -ENOTDIR; + return ERR_PTR(-ENOTDIR); } } nd->path = f.file->f_path; if (flags & LOOKUP_RCU) { - if (f.flags & FDPUT_FPUT) - nd->base = f.file; - nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); rcu_read_lock(); + nd->inode = nd->path.dentry->d_inode; + nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); } else { path_get(&nd->path); - fdput(f); + nd->inode = nd->path.dentry->d_inode; } + fdput(f); + return s; } nd->inode = nd->path.dentry->d_inode; if (!(flags & LOOKUP_RCU)) - goto done; + return s; if (likely(!read_seqcount_retry(&nd->path.dentry->d_seq, nd->seq))) - goto done; + return s; if (!(nd->flags & LOOKUP_ROOT)) nd->root.mnt = NULL; rcu_read_unlock(); - return -ECHILD; -done: - current->total_link_count = 0; - return link_path_walk(name, nd); + return ERR_PTR(-ECHILD); } -static void path_cleanup(struct nameidata *nd) +static const char *trailing_symlink(struct nameidata *nd) { - if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { - path_put(&nd->root); - nd->root.mnt = NULL; - } - if (unlikely(nd->base)) - fput(nd->base); + const char *s; + int error = may_follow_link(nd); + if (unlikely(error)) + return ERR_PTR(error); + nd->flags |= LOOKUP_PARENT; + nd->stack[0].name = NULL; + s = get_link(nd); + return s ? s : ""; } -static inline int lookup_last(struct nameidata *nd, struct path *path) +static inline int lookup_last(struct nameidata *nd) { if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len]) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; nd->flags &= ~LOOKUP_PARENT; - return walk_component(nd, path, nd->flags & LOOKUP_FOLLOW); + return walk_component(nd, + nd->flags & LOOKUP_FOLLOW + ? nd->depth + ? WALK_PUT | WALK_GET + : WALK_GET + : 0); } /* Returns 0 and nd will be valid on success; Returns error, otherwise.
*/ -static int path_lookupat(int dfd, const char *name, - unsigned int flags, struct nameidata *nd) +static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path) { - struct path path; + const char *s = path_init(nd, flags); int err; - /* - * Path walking is largely split up into 2 different synchronisation - * schemes, rcu-walk and ref-walk (explained in - * Documentation/filesystems/path-lookup.txt). These share much of the - * path walk code, but some things particularly setup, cleanup, and - * following mounts are sufficiently divergent that functions are - * duplicated. Typically there is a function foo(), and its RCU - * analogue, foo_rcu(). - * - * -ECHILD is the error number of choice (just to avoid clashes) that - * is returned if some aspect of an rcu-walk fails. Such an error must - * be handled by restarting a traditional ref-walk (which will always - * be able to complete). - */ - err = path_init(dfd, name, flags, nd); - if (!err && !(flags & LOOKUP_PARENT)) { - err = lookup_last(nd, &path); - while (err > 0) { - void *cookie; - struct path link = path; - err = may_follow_link(&link, nd); - if (unlikely(err)) - break; - nd->flags |= LOOKUP_PARENT; - err = follow_link(&link, nd, &cookie); - if (err) - break; - err = lookup_last(nd, &path); - put_link(nd, &link, cookie); + if (IS_ERR(s)) + return PTR_ERR(s); + while (!(err = link_path_walk(s, nd)) + && ((err = lookup_last(nd)) > 0)) { + s = trailing_symlink(nd); + if (IS_ERR(s)) { + err = PTR_ERR(s); + break; } } - if (!err) err = complete_walk(nd); - if (!err && nd->flags & LOOKUP_DIRECTORY) { - if (!d_can_lookup(nd->path.dentry)) { - path_put(&nd->path); + if (!err && nd->flags & LOOKUP_DIRECTORY) + if (!d_can_lookup(nd->path.dentry)) err = -ENOTDIR; - } + if (!err) { + *path = nd->path; + nd->path.mnt = NULL; + nd->path.dentry = NULL; } - - path_cleanup(nd); + terminate_walk(nd); return err; } -static int filename_lookup(int dfd, struct filename *name, - unsigned int flags, struct nameidata *nd) +static int filename_lookup(int dfd, struct filename *name, unsigned flags, + struct path *path, struct path *root) { - int retval = path_lookupat(dfd, name->name, flags | LOOKUP_RCU, nd); + int retval; + struct nameidata nd; + if (IS_ERR(name)) + return PTR_ERR(name); + if (unlikely(root)) { + nd.root = *root; + flags |= LOOKUP_ROOT; + } + set_nameidata(&nd, dfd, name); + retval = path_lookupat(&nd, flags | LOOKUP_RCU, path); if (unlikely(retval == -ECHILD)) - retval = path_lookupat(dfd, name->name, flags, nd); + retval = path_lookupat(&nd, flags, path); if (unlikely(retval == -ESTALE)) - retval = path_lookupat(dfd, name->name, - flags | LOOKUP_REVAL, nd); + retval = path_lookupat(&nd, flags | LOOKUP_REVAL, path); if (likely(!retval)) - audit_inode(name, nd->path.dentry, flags & LOOKUP_PARENT); + audit_inode(name, path->dentry, flags & LOOKUP_PARENT); + restore_nameidata(); + putname(name); return retval; } -static int do_path_lookup(int dfd, const char *name, - unsigned int flags, struct nameidata *nd) +/* Returns 0 and nd will be valid on success; Returns error, otherwise.
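
(Editor's aside, illustration only - not part of the patch. filename_lookup() above shows the retry ladder this series settles on, and filename_parentat(), filename_mountpoint() and do_filp_open() below repeat it: try rcu-walk first, fall back to ref-walk on -ECHILD, and make a LOOKUP_REVAL pass if the result came back -ESTALE. A compilable sketch of that shape; try_walk() is an invented stand-in for path_lookupat() and friends.)

#include <stdio.h>
#include <errno.h>

enum { LOOKUP_RCU = 1, LOOKUP_REVAL = 2 };

static int attempts;

/* Hypothetical stand-in for path_lookupat(): pretend the lockless pass
 * loses a seqcount race once, then succeeds. */
static int try_walk(unsigned flags)
{
	attempts++;
	if (flags & LOOKUP_RCU)
		return -ECHILD;
	return 0;
}

/* Optimistic rcu-walk, ref-walk fallback, then a forced revalidation
 * pass for stale (NFS-style) results. */
static int lookup_ladder(unsigned flags)
{
	int err = try_walk(flags | LOOKUP_RCU);
	if (err == -ECHILD)
		err = try_walk(flags);
	if (err == -ESTALE)
		err = try_walk(flags | LOOKUP_REVAL);
	return err;
}

int main(void)
{
	int err = lookup_ladder(0);

	printf("result %d after %d attempt(s)\n", err, attempts);
	return 0;
}

(end of aside)
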
*/ +static int path_parentat(struct nameidata *nd, unsigned flags, + struct path *parent) { - struct filename *filename = getname_kernel(name); - int retval = PTR_ERR(filename); + const char *s = path_init(nd, flags); + int err; + if (IS_ERR(s)) + return PTR_ERR(s); + err = link_path_walk(s, nd); + if (!err) + err = complete_walk(nd); + if (!err) { + *parent = nd->path; + nd->path.mnt = NULL; + nd->path.dentry = NULL; + } + terminate_walk(nd); + return err; +} - if (!IS_ERR(filename)) { - retval = filename_lookup(dfd, filename, flags, nd); - putname(filename); +static struct filename *filename_parentat(int dfd, struct filename *name, + unsigned int flags, struct path *parent, + struct qstr *last, int *type) +{ + int retval; + struct nameidata nd; + + if (IS_ERR(name)) + return name; + set_nameidata(&nd, dfd, name); + retval = path_parentat(&nd, flags | LOOKUP_RCU, parent); + if (unlikely(retval == -ECHILD)) + retval = path_parentat(&nd, flags, parent); + if (unlikely(retval == -ESTALE)) + retval = path_parentat(&nd, flags | LOOKUP_REVAL, parent); + if (likely(!retval)) { + *last = nd.last; + *type = nd.last_type; + audit_inode(name, parent->dentry, LOOKUP_PARENT); + } else { + putname(name); + name = ERR_PTR(retval); } - return retval; + restore_nameidata(); + return name; } /* does lookup, returns the object with parent locked */ struct dentry *kern_path_locked(const char *name, struct path *path) { - struct filename *filename = getname_kernel(name); - struct nameidata nd; + struct filename *filename; struct dentry *d; - int err; + struct qstr last; + int type; + filename = filename_parentat(AT_FDCWD, getname_kernel(name), 0, path, + &last, &type); if (IS_ERR(filename)) return ERR_CAST(filename); - - err = filename_lookup(AT_FDCWD, filename, LOOKUP_PARENT, &nd); - if (err) { - d = ERR_PTR(err); - goto out; - } - if (nd.last_type != LAST_NORM) { - path_put(&nd.path); - d = ERR_PTR(-EINVAL); - goto out; + if (unlikely(type != LAST_NORM)) { + path_put(path); + putname(filename); + return ERR_PTR(-EINVAL); } - mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); - d = __lookup_hash(&nd.last, nd.path.dentry, 0); + mutex_lock_nested(&path->dentry->d_inode->i_mutex, I_MUTEX_PARENT); + d = __lookup_hash(&last, path->dentry, 0); if (IS_ERR(d)) { - mutex_unlock(&nd.path.dentry->d_inode->i_mutex); - path_put(&nd.path); - goto out; + mutex_unlock(&path->dentry->d_inode->i_mutex); + path_put(path); } - *path = nd.path; -out: putname(filename); return d; } int kern_path(const char *name, unsigned int flags, struct path *path) { - struct nameidata nd; - int res = do_path_lookup(AT_FDCWD, name, flags, &nd); - if (!res) - *path = nd.path; - return res; + return filename_lookup(AT_FDCWD, getname_kernel(name), + flags, path, NULL); } EXPORT_SYMBOL(kern_path); @@ -2108,29 +2240,13 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, const char *name, unsigned int flags, struct path *path) { - struct nameidata nd; - int err; - nd.root.dentry = dentry; - nd.root.mnt = mnt; - BUG_ON(flags & LOOKUP_PARENT); - /* the first argument of do_path_lookup() is ignored with LOOKUP_ROOT */ - err = do_path_lookup(AT_FDCWD, name, flags | LOOKUP_ROOT, &nd); - if (!err) - *path = nd.path; - return err; + struct path root = {.mnt = mnt, .dentry = dentry}; + /* the first argument of filename_lookup() is ignored with root */ + return filename_lookup(AT_FDCWD, getname_kernel(name), + flags , path, &root); } EXPORT_SYMBOL(vfs_path_lookup); -/* - * Restricted form of lookup. 
Doesn't follow links, single-component only, - * needs parent already locked. Doesn't follow mounts. - * SMP-safe. - */ -static struct dentry *lookup_hash(struct nameidata *nd) -{ - return __lookup_hash(&nd->last, nd->path.dentry, nd->flags); -} - /** * lookup_one_len - filesystem helper to lookup single pathname component * @name: pathname component to lookup @@ -2138,9 +2254,7 @@ static struct dentry *lookup_hash(struct nameidata *nd) * @len: maximum length @len should be interpreted to * * Note that this routine is purely a helper for filesystem usage and should - * not be called by generic code. Also note that by using this function the - * nameidata argument is passed to the filesystem methods and a filesystem - * using this helper needs to be prepared for that. + * not be called by generic code. */ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) { @@ -2187,27 +2301,10 @@ EXPORT_SYMBOL(lookup_one_len); int user_path_at_empty(int dfd, const char __user *name, unsigned flags, struct path *path, int *empty) { - struct nameidata nd; - struct filename *tmp = getname_flags(name, flags, empty); - int err = PTR_ERR(tmp); - if (!IS_ERR(tmp)) { - - BUG_ON(flags & LOOKUP_PARENT); - - err = filename_lookup(dfd, tmp, flags, &nd); - putname(tmp); - if (!err) - *path = nd.path; - } - return err; + return filename_lookup(dfd, getname_flags(name, flags, empty), + flags, path, NULL); } - -int user_path_at(int dfd, const char __user *name, unsigned flags, - struct path *path) -{ - return user_path_at_empty(dfd, name, flags, path, NULL); -} -EXPORT_SYMBOL(user_path_at); +EXPORT_SYMBOL(user_path_at_empty); /* * NB: most callers don't do anything directly with the reference to the @@ -2215,26 +2312,16 @@ EXPORT_SYMBOL(user_path_at); * allocated by getname. So we must hold the reference to it until all * path-walking is complete. 
*/ -static struct filename * -user_path_parent(int dfd, const char __user *path, struct nameidata *nd, +static inline struct filename * +user_path_parent(int dfd, const char __user *path, + struct path *parent, + struct qstr *last, + int *type, unsigned int flags) { - struct filename *s = getname(path); - int error; - /* only LOOKUP_REVAL is allowed in extra flags */ - flags &= LOOKUP_REVAL; - - if (IS_ERR(s)) - return s; - - error = filename_lookup(dfd, s, flags | LOOKUP_PARENT, nd); - if (error) { - putname(s); - return ERR_PTR(error); - } - - return s; + return filename_parentat(dfd, getname(path), flags & LOOKUP_REVAL, + parent, last, type); } /** @@ -2273,10 +2360,8 @@ mountpoint_last(struct nameidata *nd, struct path *path) /* If we're in rcuwalk, drop out of it to handle last component */ if (nd->flags & LOOKUP_RCU) { - if (unlazy_walk(nd, NULL)) { - error = -ECHILD; - goto out; - } + if (unlazy_walk(nd, NULL, 0)) + return -ECHILD; } nd->flags &= ~LOOKUP_PARENT; @@ -2284,7 +2369,7 @@ mountpoint_last(struct nameidata *nd, struct path *path) if (unlikely(nd->last_type != LAST_NORM)) { error = handle_dots(nd, nd->last_type); if (error) - goto out; + return error; dentry = dget(nd->path.dentry); goto done; } @@ -2299,91 +2384,81 @@ mountpoint_last(struct nameidata *nd, struct path *path) */ dentry = d_alloc(dir, &nd->last); if (!dentry) { - error = -ENOMEM; mutex_unlock(&dir->d_inode->i_mutex); - goto out; + return -ENOMEM; } dentry = lookup_real(dir->d_inode, dentry, nd->flags); - error = PTR_ERR(dentry); if (IS_ERR(dentry)) { mutex_unlock(&dir->d_inode->i_mutex); - goto out; + return PTR_ERR(dentry); } } mutex_unlock(&dir->d_inode->i_mutex); done: - if (!dentry->d_inode || d_is_negative(dentry)) { - error = -ENOENT; + if (d_is_negative(dentry)) { dput(dentry); - goto out; + return -ENOENT; } + if (nd->depth) + put_link(nd); path->dentry = dentry; path->mnt = nd->path.mnt; - if (should_follow_link(dentry, nd->flags & LOOKUP_FOLLOW)) - return 1; + error = should_follow_link(nd, path, nd->flags & LOOKUP_FOLLOW, + d_backing_inode(dentry), 0); + if (unlikely(error)) + return error; mntget(path->mnt); follow_mount(path); - error = 0; -out: - terminate_walk(nd); - return error; + return 0; } /** * path_mountpoint - look up a path to be umounted - * @dfd: directory file descriptor to start walk from - * @name: full pathname to walk - * @path: pointer to container for result + * @nameidata: lookup context * @flags: lookup flags + * @path: pointer to container for result * * Look up the given name, but don't attempt to revalidate the last component. * Returns 0 and "path" will be valid on success; Returns error otherwise. 
*/ static int -path_mountpoint(int dfd, const char *name, struct path *path, unsigned int flags) +path_mountpoint(struct nameidata *nd, unsigned flags, struct path *path) { - struct nameidata nd; + const char *s = path_init(nd, flags); int err; - - err = path_init(dfd, name, flags, &nd); - if (unlikely(err)) - goto out; - - err = mountpoint_last(&nd, path); - while (err > 0) { - void *cookie; - struct path link = *path; - err = may_follow_link(&link, &nd); - if (unlikely(err)) - break; - nd.flags |= LOOKUP_PARENT; - err = follow_link(&link, &nd, &cookie); - if (err) + if (IS_ERR(s)) + return PTR_ERR(s); + while (!(err = link_path_walk(s, nd)) && + (err = mountpoint_last(nd, path)) > 0) { + s = trailing_symlink(nd); + if (IS_ERR(s)) { + err = PTR_ERR(s); break; - err = mountpoint_last(&nd, path); - put_link(&nd, &link, cookie); + } } -out: - path_cleanup(&nd); + terminate_walk(nd); return err; } static int -filename_mountpoint(int dfd, struct filename *s, struct path *path, +filename_mountpoint(int dfd, struct filename *name, struct path *path, unsigned int flags) { + struct nameidata nd; int error; - if (IS_ERR(s)) - return PTR_ERR(s); - error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_RCU); + if (IS_ERR(name)) + return PTR_ERR(name); + set_nameidata(&nd, dfd, name); + error = path_mountpoint(&nd, flags | LOOKUP_RCU, path); if (unlikely(error == -ECHILD)) - error = path_mountpoint(dfd, s->name, path, flags); + error = path_mountpoint(&nd, flags, path); if (unlikely(error == -ESTALE)) - error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_REVAL); + error = path_mountpoint(&nd, flags | LOOKUP_REVAL, path); if (likely(!error)) - audit_inode(s, path->dentry, 0); - putname(s); + audit_inode(name, path->dentry, 0); + restore_nameidata(); + putname(name); return error; } @@ -2449,7 +2524,7 @@ EXPORT_SYMBOL(__check_sticky); */ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir) { - struct inode *inode = victim->d_inode; + struct inode *inode = d_backing_inode(victim); int error; if (d_is_negative(victim)) @@ -2915,18 +2990,19 @@ out_dput: /* * Handle the last step of open() */ -static int do_last(struct nameidata *nd, struct path *path, +static int do_last(struct nameidata *nd, struct file *file, const struct open_flags *op, - int *opened, struct filename *name) + int *opened) { struct dentry *dir = nd->path.dentry; int open_flag = op->open_flag; bool will_truncate = (open_flag & O_TRUNC) != 0; bool got_write = false; int acc_mode = op->acc_mode; + unsigned seq; struct inode *inode; - bool symlink_ok = false; struct path save_parent = { .dentry = NULL, .mnt = NULL }; + struct path path; bool retried = false; int error; @@ -2935,7 +3011,7 @@ static int do_last(struct nameidata *nd, struct path *path, if (nd->last_type != LAST_NORM) { error = handle_dots(nd, nd->last_type); - if (error) + if (unlikely(error)) return error; goto finish_open; } @@ -2943,15 +3019,13 @@ static int do_last(struct nameidata *nd, struct path *path, if (!(open_flag & O_CREAT)) { if (nd->last.name[nd->last.len]) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; - if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW)) - symlink_ok = true; /* we _can_ be in RCU mode here */ - error = lookup_fast(nd, path, &inode); + error = lookup_fast(nd, &path, &inode, &seq); if (likely(!error)) goto finish_lookup; if (error < 0) - goto out; + return error; BUG_ON(nd->inode != dir->d_inode); } else { @@ -2965,11 +3039,10 @@ static int do_last(struct nameidata *nd, struct path *path, if (error) return 
error; - audit_inode(name, dir, LOOKUP_PARENT); - error = -EISDIR; + audit_inode(nd->name, dir, LOOKUP_PARENT); /* trailing slashes? */ - if (nd->last.name[nd->last.len]) - goto out; + if (unlikely(nd->last.name[nd->last.len])) + return -EISDIR; } retry_lookup: @@ -2984,7 +3057,7 @@ retry_lookup: */ } mutex_lock(&dir->d_inode->i_mutex); - error = lookup_open(nd, path, file, op, got_write, opened); + error = lookup_open(nd, &path, file, op, got_write, opened); mutex_unlock(&dir->d_inode->i_mutex); if (error <= 0) { @@ -2995,7 +3068,7 @@ retry_lookup: !S_ISREG(file_inode(file)->i_mode)) will_truncate = false; - audit_inode(name, file->f_path.dentry, 0); + audit_inode(nd->name, file->f_path.dentry, 0); goto opened; } @@ -3004,15 +3077,15 @@ retry_lookup: open_flag &= ~O_TRUNC; will_truncate = false; acc_mode = MAY_OPEN; - path_to_nameidata(path, nd); + path_to_nameidata(&path, nd); goto finish_open_created; } /* * create/update audit record if it already exists. */ - if (d_is_positive(path->dentry)) - audit_inode(name, path->dentry, 0); + if (d_is_positive(path.dentry)) + audit_inode(nd->name, path.dentry, 0); /* * If atomic_open() acquired write access it is dropped now due to @@ -3024,47 +3097,45 @@ retry_lookup: got_write = false; } - error = -EEXIST; - if ((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT)) - goto exit_dput; - - error = follow_managed(path, nd->flags); - if (error < 0) - goto exit_dput; + if (unlikely((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))) { + path_to_nameidata(&path, nd); + return -EEXIST; + } - if (error) - nd->flags |= LOOKUP_JUMPED; + error = follow_managed(&path, nd); + if (unlikely(error < 0)) + return error; BUG_ON(nd->flags & LOOKUP_RCU); - inode = path->dentry->d_inode; -finish_lookup: - /* we _can_ be in RCU mode here */ - error = -ENOENT; - if (!inode || d_is_negative(path->dentry)) { - path_to_nameidata(path, nd); - goto out; + inode = d_backing_inode(path.dentry); + seq = 0; /* out of RCU mode, so the value doesn't matter */ + if (unlikely(d_is_negative(path.dentry))) { + path_to_nameidata(&path, nd); + return -ENOENT; } +finish_lookup: + if (nd->depth) + put_link(nd); + error = should_follow_link(nd, &path, nd->flags & LOOKUP_FOLLOW, + inode, seq); + if (unlikely(error)) + return error; - if (should_follow_link(path->dentry, !symlink_ok)) { - if (nd->flags & LOOKUP_RCU) { - if (unlikely(unlazy_walk(nd, path->dentry))) { - error = -ECHILD; - goto out; - } - } - BUG_ON(inode != path->dentry->d_inode); - return 1; + if (unlikely(d_is_symlink(path.dentry)) && !(open_flag & O_PATH)) { + path_to_nameidata(&path, nd); + return -ELOOP; } - if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path->mnt) { - path_to_nameidata(path, nd); + if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path.mnt) { + path_to_nameidata(&path, nd); } else { save_parent.dentry = nd->path.dentry; - save_parent.mnt = mntget(path->mnt); - nd->path.dentry = path->dentry; + save_parent.mnt = mntget(path.mnt); + nd->path.dentry = path.dentry; } nd->inode = inode; + nd->seq = seq; /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... 
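
(Editor's aside, illustration only - not part of the patch. With the old symlink_ok bool gone, do_last() above refuses a trailing symlink with -ELOOP unless O_PATH was requested, via the new "d_is_symlink(path.dentry) && !(open_flag & O_PATH)" check. That behaviour is visible from userspace; a small Linux-only demo that creates and removes demo-link in the current directory.)

#define _GNU_SOURCE	/* for O_PATH */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>

int main(void)
{
	int fd;

	if (symlink("some-target", "demo-link") && errno != EEXIST)
		return 1;

	/* Trailing symlink, O_NOFOLLOW, no O_PATH: open() fails -ELOOP. */
	fd = open("demo-link", O_RDONLY | O_NOFOLLOW);
	if (fd < 0)
		printf("O_NOFOLLOW: %s\n", strerror(errno));

	/* O_PATH|O_NOFOLLOW opens the symlink itself instead. */
	fd = open("demo-link", O_PATH | O_NOFOLLOW);
	if (fd >= 0) {
		printf("O_PATH|O_NOFOLLOW: got fd %d on the link\n", fd);
		close(fd);
	}
	unlink("demo-link");
	return 0;
}

(end of aside)
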
*/ finish_open: error = complete_walk(nd); @@ -3072,14 +3143,14 @@ finish_open: path_put(&save_parent); return error; } - audit_inode(name, nd->path.dentry, 0); + audit_inode(nd->name, nd->path.dentry, 0); error = -EISDIR; if ((open_flag & O_CREAT) && d_is_dir(nd->path.dentry)) goto out; error = -ENOTDIR; if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry)) goto out; - if (!S_ISREG(nd->inode->i_mode)) + if (!d_is_reg(nd->path.dentry)) will_truncate = false; if (will_truncate) { @@ -3119,12 +3190,8 @@ out: if (got_write) mnt_drop_write(nd->path.mnt); path_put(&save_parent); - terminate_walk(nd); return error; -exit_dput: - path_put_conditional(path, nd); - goto out; exit_fput: fput(file); goto out; @@ -3148,50 +3215,46 @@ stale_open: goto retry_lookup; } -static int do_tmpfile(int dfd, struct filename *pathname, - struct nameidata *nd, int flags, +static int do_tmpfile(struct nameidata *nd, unsigned flags, const struct open_flags *op, struct file *file, int *opened) { static const struct qstr name = QSTR_INIT("/", 1); - struct dentry *dentry, *child; + struct dentry *child; struct inode *dir; - int error = path_lookupat(dfd, pathname->name, - flags | LOOKUP_DIRECTORY, nd); + struct path path; + int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path); if (unlikely(error)) return error; - error = mnt_want_write(nd->path.mnt); + error = mnt_want_write(path.mnt); if (unlikely(error)) goto out; + dir = path.dentry->d_inode; /* we want directory to be writable */ - error = inode_permission(nd->inode, MAY_WRITE | MAY_EXEC); + error = inode_permission(dir, MAY_WRITE | MAY_EXEC); if (error) goto out2; - dentry = nd->path.dentry; - dir = dentry->d_inode; if (!dir->i_op->tmpfile) { error = -EOPNOTSUPP; goto out2; } - child = d_alloc(dentry, &name); + child = d_alloc(path.dentry, &name); if (unlikely(!child)) { error = -ENOMEM; goto out2; } - nd->flags &= ~LOOKUP_DIRECTORY; - nd->flags |= op->intent; - dput(nd->path.dentry); - nd->path.dentry = child; - error = dir->i_op->tmpfile(dir, nd->path.dentry, op->mode); + dput(path.dentry); + path.dentry = child; + error = dir->i_op->tmpfile(dir, child, op->mode); if (error) goto out2; - audit_inode(pathname, nd->path.dentry, 0); + audit_inode(nd->name, child, 0); /* Don't check for other permissions, the inode was just created */ - error = may_open(&nd->path, MAY_OPEN, op->open_flag); + error = may_open(&path, MAY_OPEN, op->open_flag); if (error) goto out2; - file->f_path.mnt = nd->path.mnt; - error = finish_open(file, nd->path.dentry, NULL, opened); + file->f_path.mnt = path.mnt; + error = finish_open(file, child, NULL, opened); if (error) goto out2; error = open_check_o_direct(file); @@ -3204,17 +3267,17 @@ static int do_tmpfile(int dfd, struct filename *pathname, spin_unlock(&inode->i_lock); } out2: - mnt_drop_write(nd->path.mnt); + mnt_drop_write(path.mnt); out: - path_put(&nd->path); + path_put(&path); return error; } -static struct file *path_openat(int dfd, struct filename *pathname, - struct nameidata *nd, const struct open_flags *op, int flags) +static struct file *path_openat(struct nameidata *nd, + const struct open_flags *op, unsigned flags) { + const char *s; struct file *file; - struct path path; int opened = 0; int error; @@ -3225,37 +3288,26 @@ static struct file *path_openat(int dfd, struct filename *pathname, file->f_flags = op->open_flag; if (unlikely(file->f_flags & __O_TMPFILE)) { - error = do_tmpfile(dfd, pathname, nd, flags, op, file, &opened); - goto out; + error = do_tmpfile(nd, flags, op, file, &opened); + 
goto out2; } - error = path_init(dfd, pathname->name, flags, nd); - if (unlikely(error)) - goto out; - - error = do_last(nd, &path, file, op, &opened, pathname); - while (unlikely(error > 0)) { /* trailing symlink */ - struct path link = path; - void *cookie; - if (!(nd->flags & LOOKUP_FOLLOW)) { - path_put_conditional(&path, nd); - path_put(&nd->path); - error = -ELOOP; - break; - } - error = may_follow_link(&link, nd); - if (unlikely(error)) - break; - nd->flags |= LOOKUP_PARENT; + s = path_init(nd, flags); + if (IS_ERR(s)) { + put_filp(file); + return ERR_CAST(s); + } + while (!(error = link_path_walk(s, nd)) && + (error = do_last(nd, file, op, &opened)) > 0) { nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); - error = follow_link(&link, nd, &cookie); - if (unlikely(error)) + s = trailing_symlink(nd); + if (IS_ERR(s)) { + error = PTR_ERR(s); break; - error = do_last(nd, &path, file, op, &opened, pathname); - put_link(nd, &link, cookie); + } } -out: - path_cleanup(nd); + terminate_walk(nd); +out2: if (!(opened & FILE_OPENED)) { BUG_ON(!error); put_filp(file); @@ -3279,11 +3331,13 @@ struct file *do_filp_open(int dfd, struct filename *pathname, int flags = op->lookup_flags; struct file *filp; - filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU); + set_nameidata(&nd, dfd, pathname); + filp = path_openat(&nd, op, flags | LOOKUP_RCU); if (unlikely(filp == ERR_PTR(-ECHILD))) - filp = path_openat(dfd, pathname, &nd, op, flags); + filp = path_openat(&nd, op, flags); if (unlikely(filp == ERR_PTR(-ESTALE))) - filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_REVAL); + filp = path_openat(&nd, op, flags | LOOKUP_REVAL); + restore_nameidata(); return filp; } @@ -3305,11 +3359,13 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt, if (unlikely(IS_ERR(filename))) return ERR_CAST(filename); - file = path_openat(-1, filename, &nd, op, flags | LOOKUP_RCU); + set_nameidata(&nd, -1, filename); + file = path_openat(&nd, op, flags | LOOKUP_RCU); if (unlikely(file == ERR_PTR(-ECHILD))) - file = path_openat(-1, filename, &nd, op, flags); + file = path_openat(&nd, op, flags); if (unlikely(file == ERR_PTR(-ESTALE))) - file = path_openat(-1, filename, &nd, op, flags | LOOKUP_REVAL); + file = path_openat(&nd, op, flags | LOOKUP_REVAL); + restore_nameidata(); putname(filename); return file; } @@ -3318,7 +3374,8 @@ static struct dentry *filename_create(int dfd, struct filename *name, struct path *path, unsigned int lookup_flags) { struct dentry *dentry = ERR_PTR(-EEXIST); - struct nameidata nd; + struct qstr last; + int type; int err2; int error; bool is_dir = (lookup_flags & LOOKUP_DIRECTORY); @@ -3329,26 +3386,25 @@ static struct dentry *filename_create(int dfd, struct filename *name, */ lookup_flags &= LOOKUP_REVAL; - error = filename_lookup(dfd, name, LOOKUP_PARENT|lookup_flags, &nd); - if (error) - return ERR_PTR(error); + name = filename_parentat(dfd, name, lookup_flags, path, &last, &type); + if (IS_ERR(name)) + return ERR_CAST(name); /* * Yucky last component or no last component at all? * (foo/., foo/.., /////) */ - if (nd.last_type != LAST_NORM) + if (unlikely(type != LAST_NORM)) goto out; - nd.flags &= ~LOOKUP_PARENT; - nd.flags |= LOOKUP_CREATE | LOOKUP_EXCL; /* don't fail immediately if it's r/o, at least try to report other errors */ - err2 = mnt_want_write(nd.path.mnt); + err2 = mnt_want_write(path->mnt); /* * Do the final lookup. 
*/ - mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); - dentry = lookup_hash(&nd); + lookup_flags |= LOOKUP_CREATE | LOOKUP_EXCL; + mutex_lock_nested(&path->dentry->d_inode->i_mutex, I_MUTEX_PARENT); + dentry = __lookup_hash(&last, path->dentry, lookup_flags); if (IS_ERR(dentry)) goto unlock; @@ -3362,7 +3418,7 @@ static struct dentry *filename_create(int dfd, struct filename *name, * all is fine. Let's be bastards - you had / on the end, you've * been asking for (non-existent) directory. -ENOENT for you. */ - if (unlikely(!is_dir && nd.last.name[nd.last.len])) { + if (unlikely(!is_dir && last.name[last.len])) { error = -ENOENT; goto fail; } @@ -3370,31 +3426,26 @@ static struct dentry *filename_create(int dfd, struct filename *name, error = err2; goto fail; } - *path = nd.path; + putname(name); return dentry; fail: dput(dentry); dentry = ERR_PTR(error); unlock: - mutex_unlock(&nd.path.dentry->d_inode->i_mutex); + mutex_unlock(&path->dentry->d_inode->i_mutex); if (!err2) - mnt_drop_write(nd.path.mnt); + mnt_drop_write(path->mnt); out: - path_put(&nd.path); + path_put(path); + putname(name); return dentry; } struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path, unsigned int lookup_flags) { - struct filename *filename = getname_kernel(pathname); - struct dentry *res; - - if (IS_ERR(filename)) - return ERR_CAST(filename); - res = filename_create(dfd, filename, path, lookup_flags); - putname(filename); - return res; + return filename_create(dfd, getname_kernel(pathname), + path, lookup_flags); } EXPORT_SYMBOL(kern_path_create); @@ -3407,16 +3458,10 @@ void done_path_create(struct path *path, struct dentry *dentry) } EXPORT_SYMBOL(done_path_create); -struct dentry *user_path_create(int dfd, const char __user *pathname, +inline struct dentry *user_path_create(int dfd, const char __user *pathname, struct path *path, unsigned int lookup_flags) { - struct filename *tmp = getname(pathname); - struct dentry *res; - if (IS_ERR(tmp)) - return ERR_CAST(tmp); - res = filename_create(dfd, tmp, path, lookup_flags); - putname(tmp); - return res; + return filename_create(dfd, getname(pathname), path, lookup_flags); } EXPORT_SYMBOL(user_path_create); @@ -3637,14 +3682,17 @@ static long do_rmdir(int dfd, const char __user *pathname) int error = 0; struct filename *name; struct dentry *dentry; - struct nameidata nd; + struct path path; + struct qstr last; + int type; unsigned int lookup_flags = 0; retry: - name = user_path_parent(dfd, pathname, &nd, lookup_flags); + name = user_path_parent(dfd, pathname, + &path, &last, &type, lookup_flags); if (IS_ERR(name)) return PTR_ERR(name); - switch(nd.last_type) { + switch (type) { case LAST_DOTDOT: error = -ENOTEMPTY; goto exit1; @@ -3656,13 +3704,12 @@ retry: goto exit1; } - nd.flags &= ~LOOKUP_PARENT; - error = mnt_want_write(nd.path.mnt); + error = mnt_want_write(path.mnt); if (error) goto exit1; - mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); - dentry = lookup_hash(&nd); + mutex_lock_nested(&path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); + dentry = __lookup_hash(&last, path.dentry, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto exit2; @@ -3670,17 +3717,17 @@ retry: error = -ENOENT; goto exit3; } - error = security_path_rmdir(&nd.path, dentry); + error = security_path_rmdir(&path, dentry); if (error) goto exit3; - error = vfs_rmdir(nd.path.dentry->d_inode, dentry); + error = vfs_rmdir(path.dentry->d_inode, dentry); exit3: dput(dentry); exit2: - 
mutex_unlock(&nd.path.dentry->d_inode->i_mutex); - mnt_drop_write(nd.path.mnt); + mutex_unlock(&path.dentry->d_inode->i_mutex); + mnt_drop_write(path.mnt); exit1: - path_put(&nd.path); + path_put(&path); putname(name); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; @@ -3763,43 +3810,45 @@ static long do_unlinkat(int dfd, const char __user *pathname) int error; struct filename *name; struct dentry *dentry; - struct nameidata nd; + struct path path; + struct qstr last; + int type; struct inode *inode = NULL; struct inode *delegated_inode = NULL; unsigned int lookup_flags = 0; retry: - name = user_path_parent(dfd, pathname, &nd, lookup_flags); + name = user_path_parent(dfd, pathname, + &path, &last, &type, lookup_flags); if (IS_ERR(name)) return PTR_ERR(name); error = -EISDIR; - if (nd.last_type != LAST_NORM) + if (type != LAST_NORM) goto exit1; - nd.flags &= ~LOOKUP_PARENT; - error = mnt_want_write(nd.path.mnt); + error = mnt_want_write(path.mnt); if (error) goto exit1; retry_deleg: - mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); - dentry = lookup_hash(&nd); + mutex_lock_nested(&path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); + dentry = __lookup_hash(&last, path.dentry, lookup_flags); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { /* Why not before? Because we want correct error value */ - if (nd.last.name[nd.last.len]) + if (last.name[last.len]) goto slashes; inode = dentry->d_inode; if (d_is_negative(dentry)) goto slashes; ihold(inode); - error = security_path_unlink(&nd.path, dentry); + error = security_path_unlink(&path, dentry); if (error) goto exit2; - error = vfs_unlink(nd.path.dentry->d_inode, dentry, &delegated_inode); + error = vfs_unlink(path.dentry->d_inode, dentry, &delegated_inode); exit2: dput(dentry); } - mutex_unlock(&nd.path.dentry->d_inode->i_mutex); + mutex_unlock(&path.dentry->d_inode->i_mutex); if (inode) iput(inode); /* truncate the inode here */ inode = NULL; @@ -3808,9 +3857,9 @@ exit2: if (!error) goto retry_deleg; } - mnt_drop_write(nd.path.mnt); + mnt_drop_write(path.mnt); exit1: - path_put(&nd.path); + path_put(&path); putname(name); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; @@ -4240,14 +4289,15 @@ EXPORT_SYMBOL(vfs_rename); SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, int, newdfd, const char __user *, newname, unsigned int, flags) { - struct dentry *old_dir, *new_dir; struct dentry *old_dentry, *new_dentry; struct dentry *trap; - struct nameidata oldnd, newnd; + struct path old_path, new_path; + struct qstr old_last, new_last; + int old_type, new_type; struct inode *delegated_inode = NULL; struct filename *from; struct filename *to; - unsigned int lookup_flags = 0; + unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET; bool should_retry = false; int error; @@ -4261,47 +4311,45 @@ SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, if ((flags & RENAME_WHITEOUT) && !capable(CAP_MKNOD)) return -EPERM; + if (flags & RENAME_EXCHANGE) + target_flags = 0; + retry: - from = user_path_parent(olddfd, oldname, &oldnd, lookup_flags); + from = user_path_parent(olddfd, oldname, + &old_path, &old_last, &old_type, lookup_flags); if (IS_ERR(from)) { error = PTR_ERR(from); goto exit; } - to = user_path_parent(newdfd, newname, &newnd, lookup_flags); + to = user_path_parent(newdfd, newname, + &new_path, &new_last, &new_type, lookup_flags); if (IS_ERR(to)) { error = PTR_ERR(to); goto exit1; } error = -EXDEV; - if (oldnd.path.mnt != newnd.path.mnt) + 
if (old_path.mnt != new_path.mnt) goto exit2; - old_dir = oldnd.path.dentry; error = -EBUSY; - if (oldnd.last_type != LAST_NORM) + if (old_type != LAST_NORM) goto exit2; - new_dir = newnd.path.dentry; if (flags & RENAME_NOREPLACE) error = -EEXIST; - if (newnd.last_type != LAST_NORM) + if (new_type != LAST_NORM) goto exit2; - error = mnt_want_write(oldnd.path.mnt); + error = mnt_want_write(old_path.mnt); if (error) goto exit2; - oldnd.flags &= ~LOOKUP_PARENT; - newnd.flags &= ~LOOKUP_PARENT; - if (!(flags & RENAME_EXCHANGE)) - newnd.flags |= LOOKUP_RENAME_TARGET; - retry_deleg: - trap = lock_rename(new_dir, old_dir); + trap = lock_rename(new_path.dentry, old_path.dentry); - old_dentry = lookup_hash(&oldnd); + old_dentry = __lookup_hash(&old_last, old_path.dentry, lookup_flags); error = PTR_ERR(old_dentry); if (IS_ERR(old_dentry)) goto exit3; @@ -4309,7 +4357,7 @@ retry_deleg: error = -ENOENT; if (d_is_negative(old_dentry)) goto exit4; - new_dentry = lookup_hash(&newnd); + new_dentry = __lookup_hash(&new_last, new_path.dentry, lookup_flags | target_flags); error = PTR_ERR(new_dentry); if (IS_ERR(new_dentry)) goto exit4; @@ -4323,16 +4371,16 @@ retry_deleg: if (!d_is_dir(new_dentry)) { error = -ENOTDIR; - if (newnd.last.name[newnd.last.len]) + if (new_last.name[new_last.len]) goto exit5; } } /* unless the source is a directory trailing slashes give -ENOTDIR */ if (!d_is_dir(old_dentry)) { error = -ENOTDIR; - if (oldnd.last.name[oldnd.last.len]) + if (old_last.name[old_last.len]) goto exit5; - if (!(flags & RENAME_EXCHANGE) && newnd.last.name[newnd.last.len]) + if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len]) goto exit5; } /* source should not be ancestor of target */ @@ -4345,32 +4393,32 @@ retry_deleg: if (new_dentry == trap) goto exit5; - error = security_path_rename(&oldnd.path, old_dentry, - &newnd.path, new_dentry, flags); + error = security_path_rename(&old_path, old_dentry, + &new_path, new_dentry, flags); if (error) goto exit5; - error = vfs_rename(old_dir->d_inode, old_dentry, - new_dir->d_inode, new_dentry, + error = vfs_rename(old_path.dentry->d_inode, old_dentry, + new_path.dentry->d_inode, new_dentry, &delegated_inode, flags); exit5: dput(new_dentry); exit4: dput(old_dentry); exit3: - unlock_rename(new_dir, old_dir); + unlock_rename(new_path.dentry, old_path.dentry); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } - mnt_drop_write(oldnd.path.mnt); + mnt_drop_write(old_path.mnt); exit2: if (retry_estale(error, lookup_flags)) should_retry = true; - path_put(&newnd.path); + path_put(&new_path); putname(to); exit1: - path_put(&oldnd.path); + path_put(&old_path); putname(from); if (should_retry) { should_retry = false; @@ -4429,18 +4477,19 @@ EXPORT_SYMBOL(readlink_copy); */ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen) { - struct nameidata nd; void *cookie; + struct inode *inode = d_inode(dentry); + const char *link = inode->i_link; int res; - nd.depth = 0; - cookie = dentry->d_inode->i_op->follow_link(dentry, &nd); - if (IS_ERR(cookie)) - return PTR_ERR(cookie); - - res = readlink_copy(buffer, buflen, nd_get_link(&nd)); - if (dentry->d_inode->i_op->put_link) - dentry->d_inode->i_op->put_link(dentry, &nd, cookie); + if (!link) { + link = inode->i_op->follow_link(dentry, &cookie); + if (IS_ERR(link)) + return PTR_ERR(link); + } + res = readlink_copy(buffer, buflen, link); + if (inode->i_op->put_link) + inode->i_op->put_link(inode, cookie); return res; } EXPORT_SYMBOL(generic_readlink); @@ 
-4472,22 +4521,21 @@ int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) } EXPORT_SYMBOL(page_readlink); -void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd) +const char *page_follow_link_light(struct dentry *dentry, void **cookie) { struct page *page = NULL; - nd_set_link(nd, page_getlink(dentry, &page)); - return page; + char *res = page_getlink(dentry, &page); + if (!IS_ERR(res)) + *cookie = page; + return res; } EXPORT_SYMBOL(page_follow_link_light); -void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) +void page_put_link(struct inode *unused, void *cookie) { struct page *page = cookie; - - if (page) { - kunmap(page); - page_cache_release(page); - } + kunmap(page); + page_cache_release(page); } EXPORT_SYMBOL(page_put_link); diff --git a/fs/namespace.c b/fs/namespace.c index 82ef1405260e..2b8aa15fd6df 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -590,24 +590,35 @@ static void delayed_free_vfsmnt(struct rcu_head *head) } /* call under rcu_read_lock */ -bool legitimize_mnt(struct vfsmount *bastard, unsigned seq) +int __legitimize_mnt(struct vfsmount *bastard, unsigned seq) { struct mount *mnt; if (read_seqretry(&mount_lock, seq)) - return false; + return 1; if (bastard == NULL) - return true; + return 0; mnt = real_mount(bastard); mnt_add_count(mnt, 1); if (likely(!read_seqretry(&mount_lock, seq))) - return true; + return 0; if (bastard->mnt_flags & MNT_SYNC_UMOUNT) { mnt_add_count(mnt, -1); - return false; + return 1; + } + return -1; +} + +/* call under rcu_read_lock */ +bool legitimize_mnt(struct vfsmount *bastard, unsigned seq) +{ + int res = __legitimize_mnt(bastard, seq); + if (likely(!res)) + return true; + if (unlikely(res < 0)) { + rcu_read_unlock(); + mntput(bastard); + rcu_read_lock(); } - rcu_read_unlock(); - mntput(bastard); - rcu_read_lock(); return false; } @@ -632,14 +643,17 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) */ struct mount *__lookup_mnt_last(struct vfsmount *mnt, struct dentry *dentry) { - struct mount *p, *res; - res = p = __lookup_mnt(mnt, dentry); + struct mount *p, *res = NULL; + p = __lookup_mnt(mnt, dentry); if (!p) goto out; + if (!(p->mnt.mnt_flags & MNT_UMOUNT)) + res = p; hlist_for_each_entry_continue(p, mnt_hash) { if (&p->mnt_parent->mnt != mnt || p->mnt_mountpoint != dentry) break; - res = p; + if (!(p->mnt.mnt_flags & MNT_UMOUNT)) + res = p; } out: return res; @@ -795,10 +809,8 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns) /* * vfsmount lock must be held for write */ -static void detach_mnt(struct mount *mnt, struct path *old_path) +static void unhash_mnt(struct mount *mnt) { - old_path->dentry = mnt->mnt_mountpoint; - old_path->mnt = &mnt->mnt_parent->mnt; mnt->mnt_parent = mnt; mnt->mnt_mountpoint = mnt->mnt.mnt_root; list_del_init(&mnt->mnt_child); @@ -811,6 +823,26 @@ static void detach_mnt(struct mount *mnt, struct path *old_path) /* * vfsmount lock must be held for write */ +static void detach_mnt(struct mount *mnt, struct path *old_path) +{ + old_path->dentry = mnt->mnt_mountpoint; + old_path->mnt = &mnt->mnt_parent->mnt; + unhash_mnt(mnt); +} + +/* + * vfsmount lock must be held for write + */ +static void umount_mnt(struct mount *mnt) +{ + /* old mountpoint will be dropped when we can do that */ + mnt->mnt_ex_mountpoint = mnt->mnt_mountpoint; + unhash_mnt(mnt); +} + +/* + * vfsmount lock must be held for write + */ void mnt_set_mountpoint(struct mount *mnt, struct mountpoint *mp, struct mount *child_mnt) @@ -1078,6 
+1110,13 @@ static void mntput_no_expire(struct mount *mnt) rcu_read_unlock(); list_del(&mnt->mnt_instance); + + if (unlikely(!list_empty(&mnt->mnt_mounts))) { + struct mount *p, *tmp; + list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) { + umount_mnt(p); + } + } unlock_mount_hash(); if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) { @@ -1187,7 +1226,7 @@ EXPORT_SYMBOL(replace_mount_options); /* iterator; we want it to have access to namespace_sem, thus here... */ static void *m_start(struct seq_file *m, loff_t *pos) { - struct proc_mounts *p = proc_mounts(m); + struct proc_mounts *p = m->private; down_read(&namespace_sem); if (p->cached_event == p->ns->event) { @@ -1208,7 +1247,7 @@ static void *m_start(struct seq_file *m, loff_t *pos) static void *m_next(struct seq_file *m, void *v, loff_t *pos) { - struct proc_mounts *p = proc_mounts(m); + struct proc_mounts *p = m->private; p->cached_mount = seq_list_next(v, &p->ns->list, pos); p->cached_index = *pos; @@ -1222,7 +1261,7 @@ static void m_stop(struct seq_file *m, void *v) static int m_show(struct seq_file *m, void *v) { - struct proc_mounts *p = proc_mounts(m); + struct proc_mounts *p = m->private; struct mount *r = list_entry(v, struct mount, mnt_list); return p->show(m, &r->mnt); } @@ -1298,17 +1337,15 @@ static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static void namespace_unlock(void) { - struct hlist_head head = unmounted; + struct hlist_head head; - if (likely(hlist_empty(&head))) { - up_write(&namespace_sem); - return; - } + hlist_move_list(&unmounted, &head); - head.first->pprev = &head.first; - INIT_HLIST_HEAD(&unmounted); up_write(&namespace_sem); + if (likely(hlist_empty(&head))) + return; + synchronize_rcu(); group_pin_kill(&head); @@ -1319,49 +1356,90 @@ static inline void namespace_lock(void) down_write(&namespace_sem); } +enum umount_tree_flags { + UMOUNT_SYNC = 1, + UMOUNT_PROPAGATE = 2, + UMOUNT_CONNECTED = 4, +}; + +static bool disconnect_mount(struct mount *mnt, enum umount_tree_flags how) +{ + /* Leaving mounts connected is only valid for lazy umounts */ + if (how & UMOUNT_SYNC) + return true; + + /* A mount without a parent has nothing to be connected to */ + if (!mnt_has_parent(mnt)) + return true; + + /* Because the reference counting rules change when mounts are + * unmounted and connected, umounted mounts may not be + * connected to mounted mounts. + */ + if (!(mnt->mnt_parent->mnt.mnt_flags & MNT_UMOUNT)) + return true; + + /* Has it been requested that the mount remain connected? */ + if (how & UMOUNT_CONNECTED) + return false; + + /* Is the mount locked such that it needs to remain connected? 
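namespace_unlock() above replaces the open-coded transfer of the unmounted hlist with hlist_move_list(), which both fixes up the first node's back-pointer and reinitializes the source head. A rough userspace sketch of that move on a simplified two-pointer hlist (illustrative only, not the kernel implementation):

#include <stdio.h>

struct hnode { struct hnode *next, **pprev; };
struct hhead { struct hnode *first; };

/* move all entries from src to dst and reinit src, like hlist_move_list() */
static void move_list(struct hhead *src, struct hhead *dst)
{
	dst->first = src->first;
	if (dst->first)
		dst->first->pprev = &dst->first;	/* re-point back-link */
	src->first = NULL;				/* source now empty */
}

int main(void)
{
	struct hnode a = { NULL, NULL };
	struct hhead src = { &a }, dst = { NULL };

	a.pprev = &src.first;
	move_list(&src, &dst);
	printf("dst has node: %s, src empty: %s\n",
	       dst.first == &a ? "yes" : "no",
	       src.first == NULL ? "yes" : "no");
	return 0;
}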
*/ + if (IS_MNT_LOCKED(mnt)) + return false; + + /* By default disconnect the mount */ + return true; +} + /* * mount_lock must be held * namespace_sem must be held for write - * how = 0 => just this tree, don't propagate - * how = 1 => propagate; we know that nobody else has reference to any victims - * how = 2 => lazy umount */ -void umount_tree(struct mount *mnt, int how) +static void umount_tree(struct mount *mnt, enum umount_tree_flags how) { - HLIST_HEAD(tmp_list); + LIST_HEAD(tmp_list); struct mount *p; + if (how & UMOUNT_PROPAGATE) + propagate_mount_unlock(mnt); + + /* Gather the mounts to umount */ for (p = mnt; p; p = next_mnt(p, mnt)) { - hlist_del_init_rcu(&p->mnt_hash); - hlist_add_head(&p->mnt_hash, &tmp_list); + p->mnt.mnt_flags |= MNT_UMOUNT; + list_move(&p->mnt_list, &tmp_list); } - hlist_for_each_entry(p, &tmp_list, mnt_hash) + /* Hide the mounts from mnt_mounts */ + list_for_each_entry(p, &tmp_list, mnt_list) { list_del_init(&p->mnt_child); + } - if (how) + /* Add propagated mounts to the tmp_list */ + if (how & UMOUNT_PROPAGATE) propagate_umount(&tmp_list); - while (!hlist_empty(&tmp_list)) { - p = hlist_entry(tmp_list.first, struct mount, mnt_hash); - hlist_del_init_rcu(&p->mnt_hash); + while (!list_empty(&tmp_list)) { + bool disconnect; + p = list_first_entry(&tmp_list, struct mount, mnt_list); list_del_init(&p->mnt_expire); list_del_init(&p->mnt_list); __touch_mnt_namespace(p->mnt_ns); p->mnt_ns = NULL; - if (how < 2) + if (how & UMOUNT_SYNC) p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; - pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt, &unmounted); + disconnect = disconnect_mount(p, how); + + pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt, + disconnect ? &unmounted : NULL); if (mnt_has_parent(p)) { - hlist_del_init(&p->mnt_mp_list); - put_mountpoint(p->mnt_mp); mnt_add_count(p->mnt_parent, -1); - /* old mountpoint will be dropped when we can do that */ - p->mnt_ex_mountpoint = p->mnt_mountpoint; - p->mnt_mountpoint = p->mnt.mnt_root; - p->mnt_parent = p; - p->mnt_mp = NULL; + if (!disconnect) { + /* Don't forget about p */ + list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts); + } else { + umount_mnt(p); + } } change_mnt_propagation(p, MS_PRIVATE); } @@ -1447,14 +1525,14 @@ static int do_umount(struct mount *mnt, int flags) if (flags & MNT_DETACH) { if (!list_empty(&mnt->mnt_list)) - umount_tree(mnt, 2); + umount_tree(mnt, UMOUNT_PROPAGATE); retval = 0; } else { shrink_submounts(mnt); retval = -EBUSY; if (!propagate_mount_busy(mnt, 2)) { if (!list_empty(&mnt->mnt_list)) - umount_tree(mnt, 1); + umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC); retval = 0; } } @@ -1480,13 +1558,17 @@ void __detach_mounts(struct dentry *dentry) namespace_lock(); mp = lookup_mountpoint(dentry); - if (!mp) + if (IS_ERR_OR_NULL(mp)) goto out_unlock; lock_mount_hash(); while (!hlist_empty(&mp->m_list)) { mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list); - umount_tree(mnt, 2); + if (mnt->mnt.mnt_flags & MNT_UMOUNT) { + hlist_add_head(&mnt->mnt_umount.s_list, &unmounted); + umount_mnt(mnt); + } + else umount_tree(mnt, UMOUNT_CONNECTED); } unlock_mount_hash(); put_mountpoint(mp); @@ -1648,7 +1730,7 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, out: if (res) { lock_mount_hash(); - umount_tree(res, 0); + umount_tree(res, UMOUNT_SYNC); unlock_mount_hash(); } return q; @@ -1660,8 +1742,11 @@ struct vfsmount *collect_mounts(struct path *path) { struct mount *tree; namespace_lock(); - tree = copy_tree(real_mount(path->mnt), path->dentry, - CL_COPY_ALL |
CL_PRIVATE); + if (!check_mnt(real_mount(path->mnt))) + tree = ERR_PTR(-EINVAL); + else + tree = copy_tree(real_mount(path->mnt), path->dentry, + CL_COPY_ALL | CL_PRIVATE); namespace_unlock(); if (IS_ERR(tree)) return ERR_CAST(tree); @@ -1672,7 +1757,7 @@ void drop_collected_mounts(struct vfsmount *mnt) { namespace_lock(); lock_mount_hash(); - umount_tree(real_mount(mnt), 0); + umount_tree(real_mount(mnt), UMOUNT_SYNC); unlock_mount_hash(); namespace_unlock(); } @@ -1855,7 +1940,7 @@ static int attach_recursive_mnt(struct mount *source_mnt, out_cleanup_ids: while (!hlist_empty(&tree_list)) { child = hlist_entry(tree_list.first, struct mount, mnt_hash); - umount_tree(child, 0); + umount_tree(child, UMOUNT_SYNC); } unlock_mount_hash(); cleanup_group_ids(source_mnt, NULL); @@ -2035,7 +2120,7 @@ static int do_loopback(struct path *path, const char *old_name, err = graft_tree(mnt, parent, mp); if (err) { lock_mount_hash(); - umount_tree(mnt, 0); + umount_tree(mnt, UMOUNT_SYNC); unlock_mount_hash(); } out2: @@ -2282,6 +2367,8 @@ unlock: return err; } +static bool fs_fully_visible(struct file_system_type *fs_type, int *new_mnt_flags); + /* * create a new mount for userspace and request it to be added into the * namespace's tree */ @@ -2313,6 +2400,10 @@ static int do_new_mount(struct path *path, const char *fstype, int flags, flags |= MS_NODEV; mnt_flags |= MNT_NODEV | MNT_LOCK_NODEV; } + if (type->fs_flags & FS_USERNS_VISIBLE) { + if (!fs_fully_visible(type, &mnt_flags)) + return -EPERM; + } } mnt = vfs_kern_mount(type, flags, name, data); @@ -2406,7 +2497,7 @@ void mark_mounts_for_expiry(struct list_head *mounts) while (!list_empty(&graveyard)) { mnt = list_first_entry(&graveyard, struct mount, mnt_expire); touch_mnt_namespace(mnt->mnt_ns); - umount_tree(mnt, 1); + umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC); } unlock_mount_hash(); namespace_unlock(); @@ -2477,7 +2568,7 @@ static void shrink_submounts(struct mount *mnt) m = list_first_entry(&graveyard, struct mount, mnt_expire); touch_mnt_namespace(m->mnt_ns); - umount_tree(m, 1); + umount_tree(m, UMOUNT_PROPAGATE|UMOUNT_SYNC); } } } @@ -3114,9 +3205,10 @@ bool current_chrooted(void) return chrooted; } -bool fs_fully_visible(struct file_system_type *type) +static bool fs_fully_visible(struct file_system_type *type, int *new_mnt_flags) { struct mnt_namespace *ns = current->nsproxy->mnt_ns; + int new_flags = *new_mnt_flags; struct mount *mnt; bool visible = false; @@ -3129,16 +3221,42 @@ bool fs_fully_visible(struct file_system_type *type) if (mnt->mnt.mnt_sb->s_type != type) continue; - /* This mount is not fully visible if there are any child mounts - * that cover anything except for empty directories. + /* This mount is not fully visible if its root directory * is not the root directory of the filesystem. + */ + if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root) + continue; + + /* Verify the mount flags are equal to or more permissive + * than the proposed new mount. + */ + if ((mnt->mnt.mnt_flags & MNT_LOCK_READONLY) && + !(new_flags & MNT_READONLY)) + continue; + if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) && + !(new_flags & MNT_NODEV)) + continue; + if ((mnt->mnt.mnt_flags & MNT_LOCK_ATIME) && + ((mnt->mnt.mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK))) + continue; + + /* This mount is not fully visible if there are any + * locked child mounts that cover anything except for + * empty directories.
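The fs_fully_visible() checks above only accept an existing mount if it is no more permissive than the mount being proposed: a MNT_LOCK_READONLY mount cannot satisfy a read-write request, and likewise for the nodev and atime lock flags. A sketch of that comparison with made-up flag values (the real MNT_* constants differ):

#include <stdbool.h>
#include <stdio.h>

#define MNT_READONLY		0x01
#define MNT_NODEV		0x02
#define MNT_LOCK_READONLY	0x10
#define MNT_LOCK_NODEV		0x20

/* existing mount is usable only if it is no more permissive than new_flags */
static bool flags_compatible(int existing, int new_flags)
{
	if ((existing & MNT_LOCK_READONLY) && !(new_flags & MNT_READONLY))
		return false;
	if ((existing & MNT_LOCK_NODEV) && !(new_flags & MNT_NODEV))
		return false;
	return true;
}

int main(void)
{
	int existing = MNT_LOCK_READONLY | MNT_READONLY;

	printf("rw mount over locked-ro: %s\n",
	       flags_compatible(existing, 0) ? "allowed" : "refused");
	printf("ro mount over locked-ro: %s\n",
	       flags_compatible(existing, MNT_READONLY) ? "allowed" : "refused");
	return 0;
}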
*/ list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { struct inode *inode = child->mnt_mountpoint->d_inode; - if (!S_ISDIR(inode->i_mode)) - goto next; - if (inode->i_nlink > 2) + /* Only worry about locked mounts */ + if (!(mnt->mnt.mnt_flags & MNT_LOCKED)) + continue; + /* Is the directory permanently empty? */ + if (!is_empty_dir_inode(inode)) goto next; } + /* Preserve the locked attributes */ + *new_mnt_flags |= mnt->mnt.mnt_flags & (MNT_LOCK_READONLY | \ + MNT_LOCK_NODEV | \ + MNT_LOCK_ATIME); visible = true; goto found; next: ; diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index e7ca827d7694..93575e91a7aa 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -127,7 +127,7 @@ static inline int ncp_case_sensitive(const struct inode *i) static int ncp_hash_dentry(const struct dentry *dentry, struct qstr *this) { - struct inode *inode = ACCESS_ONCE(dentry->d_inode); + struct inode *inode = d_inode_rcu(dentry); if (!inode) return 0; @@ -162,7 +162,7 @@ ncp_compare_dentry(const struct dentry *parent, const struct dentry *dentry, if (len != name->len) return 1; - pinode = ACCESS_ONCE(parent->d_inode); + pinode = d_inode_rcu(parent); if (!pinode) return 1; @@ -180,7 +180,7 @@ ncp_compare_dentry(const struct dentry *parent, const struct dentry *dentry, static int ncp_delete_dentry(const struct dentry * dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); if (inode) { if (is_bad_inode(inode)) @@ -224,7 +224,7 @@ ncp_force_unlink(struct inode *dir, struct dentry* dentry) memset(&info, 0, sizeof(info)); /* remove the Read-Only flag on the NW server */ - inode = dentry->d_inode; + inode = d_inode(dentry); old_nwattr = NCP_FINFO(inode)->nwattr; info.attributes = old_nwattr & ~(aRONLY|aDELETEINHIBIT|aRENAMEINHIBIT); @@ -254,7 +254,7 @@ ncp_force_rename(struct inode *old_dir, struct dentry* old_dentry, char *_old_na { struct nw_modify_dos_info info; int res=0x90,res2; - struct inode *old_inode = old_dentry->d_inode; + struct inode *old_inode = d_inode(old_dentry); __le32 old_nwattr = NCP_FINFO(old_inode)->nwattr; __le32 new_nwattr = 0; /* shut compiler warning */ int old_nwattr_changed = 0; @@ -268,8 +268,8 @@ ncp_force_rename(struct inode *old_dir, struct dentry* old_dentry, char *_old_na res2 = ncp_modify_file_or_subdir_dos_info_path(NCP_SERVER(old_inode), old_inode, NULL, DM_ATTRIBUTES, &info); if (!res2) old_nwattr_changed = 1; - if (new_dentry && new_dentry->d_inode) { - new_nwattr = NCP_FINFO(new_dentry->d_inode)->nwattr; + if (new_dentry && d_really_is_positive(new_dentry)) { + new_nwattr = NCP_FINFO(d_inode(new_dentry))->nwattr; info.attributes = new_nwattr & ~(aRONLY|aRENAMEINHIBIT|aDELETEINHIBIT); res2 = ncp_modify_file_or_subdir_dos_info_path(NCP_SERVER(new_dir), new_dir, _new_name, DM_ATTRIBUTES, &info); if (!res2) @@ -324,9 +324,9 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags) return -ECHILD; parent = dget_parent(dentry); - dir = parent->d_inode; + dir = d_inode(parent); - if (!dentry->d_inode) + if (d_really_is_negative(dentry)) goto finished; server = NCP_SERVER(dir); @@ -367,7 +367,7 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags) * what we remember, it's not valid any more.
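The ncpfs hunks above are mostly mechanical conversions from raw dentry->d_inode dereferences to the d_inode()/d_really_is_positive()/d_inode_rcu() accessors, concentrating the pointer fetch in one place; the RCU variant in particular must be a single load the compiler cannot re-issue. A userspace caricature of that accessor split, using C11 atomics in place of the kernel's ACCESS_ONCE() (types and semantics simplified):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct inode { int ino; };
struct dentry { _Atomic(struct inode *) d_inode; };

/* ordinary read: the caller holds a reference on the dentry */
static struct inode *d_inode(struct dentry *d)
{
	return atomic_load_explicit(&d->d_inode, memory_order_relaxed);
}

/* lockless (RCU-style) read: one single load, never re-read */
static struct inode *d_inode_rcu(struct dentry *d)
{
	return atomic_load_explicit(&d->d_inode, memory_order_acquire);
}

static bool d_really_is_negative(struct dentry *d)
{
	return d_inode(d) == NULL;
}

int main(void)
{
	struct inode i = { 42 };
	struct dentry d;

	atomic_init(&d.d_inode, &i);
	printf("negative=%d ino=%d\n",
	       d_really_is_negative(&d), d_inode_rcu(&d)->ino);
	return 0;
}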
*/ if (!res) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); mutex_lock(&inode->i_mutex); if (finfo.i.dirEntNum == NCP_FINFO(inode)->dirEntNum) { @@ -388,7 +388,7 @@ finished: static time_t ncp_obtain_mtime(struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ncp_server *server = NCP_SERVER(inode); struct nw_info_struct i; @@ -404,7 +404,7 @@ static time_t ncp_obtain_mtime(struct dentry *dentry) static inline void ncp_invalidate_dircache_entries(struct dentry *parent) { - struct ncp_server *server = NCP_SERVER(parent->d_inode); + struct ncp_server *server = NCP_SERVER(d_inode(parent)); struct dentry *dentry; spin_lock(&parent->d_lock); @@ -418,7 +418,7 @@ ncp_invalidate_dircache_entries(struct dentry *parent) static int ncp_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file->f_path.dentry; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct page *page = NULL; struct ncp_server *server = NCP_SERVER(inode); union ncp_dir_cache *cache = NULL; @@ -491,13 +491,13 @@ static int ncp_readdir(struct file *file, struct dir_context *ctx) goto invalid_cache; } spin_unlock(&dentry->d_lock); - if (!dent->d_inode) { + if (d_really_is_negative(dent)) { dput(dent); goto invalid_cache; } over = !dir_emit(ctx, dent->d_name.name, dent->d_name.len, - dent->d_inode->i_ino, DT_UNKNOWN); + d_inode(dent)->i_ino, DT_UNKNOWN); dput(dent); if (over) goto finished; @@ -571,7 +571,7 @@ static void ncp_d_prune(struct dentry *dentry) { if (!dentry->d_fsdata) /* not referenced from page cache */ return; - NCP_FINFO(dentry->d_parent->d_inode)->flags &= ~NCPI_DIR_CACHE; + NCP_FINFO(d_inode(dentry->d_parent))->flags &= ~NCPI_DIR_CACHE; } static int @@ -580,7 +580,7 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx, int inval_childs) { struct dentry *newdent, *dentry = file->f_path.dentry; - struct inode *dir = dentry->d_inode; + struct inode *dir = d_inode(dentry); struct ncp_cache_control ctl = *ctrl; struct qstr qname; int valid = 0; @@ -621,7 +621,7 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx, dentry_update_name_case(newdent, &qname); } - if (!newdent->d_inode) { + if (d_really_is_negative(newdent)) { struct inode *inode; entry->opened = 0; @@ -637,7 +637,7 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx, spin_unlock(&dentry->d_lock); } } else { - struct inode *inode = newdent->d_inode; + struct inode *inode = d_inode(newdent); mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); ncp_update_inode2(inode, entry); @@ -659,10 +659,10 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx, ctl.cache = kmap(ctl.page); } if (ctl.cache) { - if (newdent->d_inode) { + if (d_really_is_positive(newdent)) { newdent->d_fsdata = newdent; ctl.cache->dentry[ctl.idx] = newdent; - ino = newdent->d_inode->i_ino; + ino = d_inode(newdent)->i_ino; ncp_new_dentry(newdent); } valid = 1; @@ -807,7 +807,7 @@ int ncp_conn_logged_in(struct super_block *sb) } dent = sb->s_root; if (dent) { - struct inode* ino = dent->d_inode; + struct inode* ino = d_inode(dent); if (ino) { ncp_update_known_namespace(server, volNumber, NULL); NCP_FINFO(ino)->volNumber = volNumber; @@ -815,7 +815,7 @@ int ncp_conn_logged_in(struct super_block *sb) NCP_FINFO(ino)->DosDirNum = DosDirNum; result = 0; } else { - ncp_dbg(1, "sb->s_root->d_inode == NULL!\n"); + ncp_dbg(1, "d_inode(sb->s_root) == NULL!\n"); } } else { ncp_dbg(1, "sb->s_root == NULL!\n"); @@ -1055,7 
+1055,7 @@ out: static int ncp_unlink(struct inode *dir, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ncp_server *server; int error; @@ -1145,6 +1145,8 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry, case 0x00: ncp_dbg(1, "renamed %pd -> %pd\n", old_dentry, new_dentry); + ncp_d_prune(old_dentry); + ncp_d_prune(new_dentry); break; case 0x9E: error = -ENAMETOOLONG; diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c index 1dd7007f974d..011324ce9df2 100644 --- a/fs/ncpfs/file.c +++ b/fs/ncpfs/file.c @@ -98,30 +98,24 @@ out: } static ssize_t -ncp_file_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) +ncp_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { + struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); size_t already_read = 0; - off_t pos; + off_t pos = iocb->ki_pos; size_t bufsize; int error; - void* freepage; + void *freepage; size_t freelen; ncp_dbg(1, "enter %pD2\n", file); - pos = *ppos; - - if ((ssize_t) count < 0) { - return -EINVAL; - } - if (!count) + if (!iov_iter_count(to)) return 0; if (pos > inode->i_sb->s_maxbytes) return 0; - if (pos + count > inode->i_sb->s_maxbytes) { - count = inode->i_sb->s_maxbytes - pos; - } + iov_iter_truncate(to, inode->i_sb->s_maxbytes - pos); error = ncp_make_open(inode, O_RDONLY); if (error) { @@ -138,31 +132,29 @@ ncp_file_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) goto outrel; error = 0; /* First read in as much as possible for each bufsize. */ - while (already_read < count) { + while (iov_iter_count(to)) { int read_this_time; - size_t to_read = min_t(unsigned int, + size_t to_read = min_t(size_t, bufsize - (pos % bufsize), - count - already_read); + iov_iter_count(to)); error = ncp_read_bounce(NCP_SERVER(inode), NCP_FINFO(inode)->file_handle, - pos, to_read, buf, &read_this_time, + pos, to_read, to, &read_this_time, freepage, freelen); if (error) { error = -EIO; /* NW errno -> Linux errno */ break; } pos += read_this_time; - buf += read_this_time; already_read += read_this_time; - if (read_this_time != to_read) { + if (read_this_time != to_read) break; - } } vfree(freepage); - *ppos = pos; + iocb->ki_pos = pos; file_accessed(file); @@ -173,42 +165,21 @@ outrel: } static ssize_t -ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) +ncp_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { + struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); size_t already_written = 0; - off_t pos; size_t bufsize; int errno; - void* bouncebuffer; + void *bouncebuffer; + off_t pos; ncp_dbg(1, "enter %pD2\n", file); - if ((ssize_t) count < 0) - return -EINVAL; - pos = *ppos; - if (file->f_flags & O_APPEND) { - pos = i_size_read(inode); - } + errno = generic_write_checks(iocb, from); + if (errno <= 0) + return errno; - if (pos + count > MAX_NON_LFS && !(file->f_flags&O_LARGEFILE)) { - if (pos >= MAX_NON_LFS) { - return -EFBIG; - } - if (count > MAX_NON_LFS - (u32)pos) { - count = MAX_NON_LFS - (u32)pos; - } - } - if (pos >= inode->i_sb->s_maxbytes) { - if (count || pos > inode->i_sb->s_maxbytes) { - return -EFBIG; - } - } - if (pos + count > inode->i_sb->s_maxbytes) { - count = inode->i_sb->s_maxbytes - pos; - } - - if (!count) - return 0; errno = ncp_make_open(inode, O_WRONLY); if (errno) { ncp_dbg(1, "open failed, error=%d\n", errno); @@ -216,8 +187,6 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t * 
} bufsize = NCP_SERVER(inode)->buffer_size; - already_written = 0; - errno = file_update_time(file); if (errno) goto outrel; @@ -227,13 +196,14 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t * errno = -EIO; /* -ENOMEM */ goto outrel; } - while (already_written < count) { + pos = iocb->ki_pos; + while (iov_iter_count(from)) { int written_this_time; - size_t to_write = min_t(unsigned int, + size_t to_write = min_t(size_t, bufsize - (pos % bufsize), - count - already_written); + iov_iter_count(from)); - if (copy_from_user(bouncebuffer, buf, to_write)) { + if (copy_from_iter(bouncebuffer, to_write, from) != to_write) { errno = -EFAULT; break; } @@ -244,16 +214,14 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t * break; } pos += written_this_time; - buf += written_this_time; already_written += written_this_time; - if (written_this_time != to_write) { + if (written_this_time != to_write) break; - } } vfree(bouncebuffer); - *ppos = pos; + iocb->ki_pos = pos; if (pos > i_size_read(inode)) { mutex_lock(&inode->i_mutex); @@ -277,8 +245,8 @@ static int ncp_release(struct inode *inode, struct file *file) { const struct file_operations ncp_file_operations = { .llseek = generic_file_llseek, - .read = ncp_file_read, - .write = ncp_file_write, + .read_iter = ncp_file_read_iter, + .write_iter = ncp_file_write_iter, .unlocked_ioctl = ncp_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ncp_compat_ioctl, diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 01a9e16e9782..9605a2f63549 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -812,7 +812,7 @@ static int ncp_statfs(struct dentry *dentry, struct kstatfs *buf) if (!d) { goto dflt; } - i = d->d_inode; + i = d_inode(d); if (!i) { goto dflt; } @@ -865,7 +865,7 @@ dflt:; int ncp_notify_change(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int result = 0; __le32 info_mask; struct nw_modify_dos_info info; @@ -878,7 +878,7 @@ int ncp_notify_change(struct dentry *dentry, struct iattr *attr) goto out; result = -EPERM; - if (IS_DEADDIR(dentry->d_inode)) + if (IS_DEADDIR(d_inode(dentry))) goto out; /* ageing the dentry to force validation */ diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c index cf7e043a9447..79b113048eac 100644 --- a/fs/ncpfs/ioctl.c +++ b/fs/ncpfs/ioctl.c @@ -376,7 +376,7 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg struct dentry* dentry = inode->i_sb->s_root; if (dentry) { - struct inode* s_inode = dentry->d_inode; + struct inode* s_inode = d_inode(dentry); if (s_inode) { sr.volNumber = NCP_FINFO(s_inode)->volNumber; @@ -384,7 +384,7 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg sr.namespace = server->name_space[sr.volNumber]; result = 0; } else - ncp_dbg(1, "s_root->d_inode==NULL\n"); + ncp_dbg(1, "d_inode(s_root)==NULL\n"); } else ncp_dbg(1, "s_root==NULL\n"); } else { @@ -431,7 +431,7 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg if (result == 0) { dentry = inode->i_sb->s_root; if (dentry) { - struct inode* s_inode = dentry->d_inode; + struct inode* s_inode = d_inode(dentry); if (s_inode) { NCP_FINFO(s_inode)->volNumber = vnum; @@ -439,7 +439,7 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg NCP_FINFO(s_inode)->DosDirNum = dosde; server->root_setuped = 1; } else { - ncp_dbg(1, "s_root->d_inode==NULL\n"); + ncp_dbg(1, "d_inode(s_root)==NULL\n"); 
result = -EIO; } } else { diff --git a/fs/ncpfs/ncplib_kernel.c b/fs/ncpfs/ncplib_kernel.c index 482387532f54..88dbbc9fcf4d 100644 --- a/fs/ncpfs/ncplib_kernel.c +++ b/fs/ncpfs/ncplib_kernel.c @@ -727,7 +727,7 @@ int ncp_del_file_or_subdir2(struct ncp_server *server, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); __u8 volnum; __le32 dirent; @@ -1001,8 +1001,8 @@ out: */ int ncp_read_bounce(struct ncp_server *server, const char *file_id, - __u32 offset, __u16 to_read, char __user *target, int *bytes_read, - void* bounce, __u32 bufsize) + __u32 offset, __u16 to_read, struct iov_iter *to, + int *bytes_read, void *bounce, __u32 bufsize) { int result; @@ -1025,7 +1025,7 @@ ncp_read_bounce(struct ncp_server *server, const char *file_id, (offset & 1); *bytes_read = len; result = 0; - if (copy_to_user(target, source, len)) + if (copy_to_iter(source, len, to) != len) result = -EFAULT; } } diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h index 250e443a07f3..5233fbc1747a 100644 --- a/fs/ncpfs/ncplib_kernel.h +++ b/fs/ncpfs/ncplib_kernel.h @@ -53,7 +53,7 @@ static inline int ncp_read_bounce_size(__u32 size) { return sizeof(struct ncp_reply_header) + 2 + 2 + size + 8; }; int ncp_read_bounce(struct ncp_server *, const char *, __u32, __u16, - char __user *, int *, void* bounce, __u32 bouncelen); + struct iov_iter *, int *, void *bounce, __u32 bouncelen); int ncp_read_kernel(struct ncp_server *, const char *, __u32, __u16, char *, int *); int ncp_write_kernel(struct ncp_server *, const char *, __u32, __u16, diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c index 1a63bfdb4a65..421b6f91e8ec 100644 --- a/fs/ncpfs/symlink.c +++ b/fs/ncpfs/symlink.c @@ -156,7 +156,7 @@ int ncp_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { goto failfree; } - inode=dentry->d_inode; + inode=d_inode(dentry); if (ncp_make_open(inode, O_WRONLY)) goto failfree; diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index c7abc10279af..f31fd0dd92c6 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -1,6 +1,6 @@ config NFS_FS tristate "NFS client support" - depends on INET && FILE_LOCKING + depends on INET && FILE_LOCKING && MULTIUSER select LOCKD select SUNRPC select NFS_ACL_SUPPORT if NFS_V3_ACL diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 1e987acf20c9..8664417955a2 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -22,7 +22,7 @@ nfsv3-$(CONFIG_NFS_V3_ACL) += nfs3acl.o obj-$(CONFIG_NFS_V4) += nfsv4.o CFLAGS_nfs4trace.o += -I$(src) nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o \ - delegation.o idmap.o callback.o callback_xdr.o callback_proc.o \ + delegation.o nfs4idmap.o callback.o callback_xdr.o callback_proc.o \ nfs4namespace.o nfs4getroot.o nfs4client.o nfs4session.o \ dns_resolve.o nfs4trace.o nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 1cac3c175d18..d2554fe140a3 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -890,6 +890,7 @@ static struct pnfs_layoutdriver_type blocklayout_type = { .free_deviceid_node = bl_free_deviceid_node, .pg_read_ops = &bl_pg_read_ops, .pg_write_ops = &bl_pg_write_ops, + .sync = pnfs_generic_sync, }; static int __init nfs4blocklayout_init(void) diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index 5aed4f98df41..e535599a0719 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -33,7 +33,7 @@ 
bl_free_deviceid_node(struct nfs4_deviceid_node *d) container_of(d, struct pnfs_block_dev, node); bl_free_device(dev); - kfree(dev); + kfree_rcu(dev, node.rcu); } static int diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 351be9205bf8..682529c00996 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -128,7 +128,7 @@ nfs41_callback_svc(void *vrqstp) if (try_to_freeze()) continue; - prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_UNINTERRUPTIBLE); + prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); spin_lock_bh(&serv->sv_cb_lock); if (!list_empty(&serv->sv_cb_list)) { req = list_first_entry(&serv->sv_cb_list, @@ -142,10 +142,10 @@ nfs41_callback_svc(void *vrqstp) error); } else { spin_unlock_bh(&serv->sv_cb_lock); - /* schedule_timeout to game the hung task watchdog */ - schedule_timeout(60 * HZ); + schedule(); finish_wait(&serv->sv_cb_waitq, &wq); } + flush_signals(current); } return 0; } @@ -458,7 +458,7 @@ check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp) * pg_authenticate method for nfsv4 callback threads. * * The authflavor has been negotiated, so an incorrect flavor is a server - * bug. Drop packets with incorrect authflavor. + * bug. Deny packets with incorrect authflavor. * * All other checking done after NFS decoding where the nfs_client can be * found in nfs4_callback_compound @@ -468,12 +468,12 @@ static int nfs_callback_authenticate(struct svc_rqst *rqstp) switch (rqstp->rq_authop->flavour) { case RPC_AUTH_NULL: if (rqstp->rq_proc != CB_NULL) - return SVC_DROP; + return SVC_DENIED; break; case RPC_AUTH_GSS: /* No RPC_AUTH_GSS support yet in NFSv4.1 */ if (svc_is_backchannel(rqstp)) - return SVC_DROP; + return SVC_DENIED; } return SVC_OK; } diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 197806fb87ff..29e3c1b011b7 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -327,10 +327,8 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args) dprintk("%s slot table seqid: %u\n", __func__, slot->seq_nr); /* Normal */ - if (likely(args->csa_sequenceid == slot->seq_nr + 1)) { - slot->seq_nr++; + if (likely(args->csa_sequenceid == slot->seq_nr + 1)) goto out_ok; - } /* Replay */ if (args->csa_sequenceid == slot->seq_nr) { @@ -418,6 +416,7 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, struct cb_process_state *cps) { struct nfs4_slot_table *tbl; + struct nfs4_slot *slot; struct nfs_client *clp; int i; __be32 status = htonl(NFS4ERR_BADSESSION); @@ -429,25 +428,32 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, if (!(clp->cl_session->flags & SESSION4_BACK_CHAN)) goto out; + tbl = &clp->cl_session->bc_slot_table; + slot = tbl->slots + args->csa_slotid; spin_lock(&tbl->slot_tbl_lock); /* state manager is resetting the session */ if (test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) { - spin_unlock(&tbl->slot_tbl_lock); status = htonl(NFS4ERR_DELAY); /* Return NFS4ERR_BADSESSION if we're draining the session * in order to reset it. 
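The callback changes above rework CB_SEQUENCE so the slot's cached sequence number only advances once the whole call has succeeded, per RFC 5661 §20.9.3 (an error must leave the slot state untouched). The three-way comparison validate_seqid() performs can be modeled compactly; the names below are stand-ins for the kernel's slot bookkeeping:

#include <stdio.h>

enum seq_res { SEQ_OK, SEQ_REPLAY, SEQ_MISORDERED };

/* compare an incoming sequence id against the slot's cached one */
static enum seq_res check_seqid(unsigned int slot_seq, unsigned int seqid)
{
	if (seqid == slot_seq + 1)
		return SEQ_OK;		/* normal: next request in order */
	if (seqid == slot_seq)
		return SEQ_REPLAY;	/* retransmission: serve cached reply */
	return SEQ_MISORDERED;
}

int main(void)
{
	unsigned int slot_seq = 7;

	if (check_seqid(slot_seq, 8) == SEQ_OK)
		slot_seq++;	/* advance only after the call succeeded */
	printf("slot_seq is now %u\n", slot_seq);
	printf("retransmit detected: %d\n",
	       check_seqid(slot_seq, 8) == SEQ_REPLAY);
	return 0;
}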
*/ if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) status = htonl(NFS4ERR_BADSESSION); - goto out; + goto out_unlock; } - status = validate_seqid(&clp->cl_session->bc_slot_table, args); - spin_unlock(&tbl->slot_tbl_lock); + memcpy(&res->csr_sessionid, &args->csa_sessionid, + sizeof(res->csr_sessionid)); + res->csr_sequenceid = args->csa_sequenceid; + res->csr_slotid = args->csa_slotid; + res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; + res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; + + status = validate_seqid(tbl, args); if (status) - goto out; + goto out_unlock; cps->slotid = args->csa_slotid; @@ -458,15 +464,17 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, */ if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists)) { status = htonl(NFS4ERR_DELAY); - goto out; + goto out_unlock; } - memcpy(&res->csr_sessionid, &args->csa_sessionid, - sizeof(res->csr_sessionid)); - res->csr_sequenceid = args->csa_sequenceid; - res->csr_slotid = args->csa_slotid; - res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; - res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; + /* + * RFC5661 20.9.3 + * If CB_SEQUENCE returns an error, then the state of the slot + * (sequence ID, cached reply) MUST NOT change. + */ + slot->seq_nr++; +out_unlock: + spin_unlock(&tbl->slot_tbl_lock); out: cps->clp = clp; /* put in nfs4_callback_compound */ diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 19ca95cdfd9b..6b1697a01dde 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -909,7 +909,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r xdr_init_encode(&xdr_out, &rqstp->rq_res, p); status = decode_compound_hdr_arg(&xdr_in, &hdr_arg); - if (status == __constant_htonl(NFS4ERR_RESOURCE)) + if (status == htonl(NFS4ERR_RESOURCE)) return rpc_garbage_args; if (hdr_arg.minorversion == 0) { diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 19874151e95c..4a90c9bb3135 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -31,7 +31,6 @@ #include <linux/lockd/bind.h> #include <linux/seq_file.h> #include <linux/mount.h> -#include <linux/nfs_idmap.h> #include <linux/vfs.h> #include <linux/inet.h> #include <linux/in6.h> @@ -776,7 +775,7 @@ static int nfs_init_server(struct nfs_server *server, server->options = data->options; server->caps |= NFS_CAP_HARDLINKS|NFS_CAP_SYMLINKS|NFS_CAP_FILEID| NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|NFS_CAP_OWNER_GROUP| - NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME|NFS_CAP_CHANGE_ATTR; + NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME; if (data->rsize) server->rsize = nfs_block_size(data->rsize, NULL); @@ -826,7 +825,6 @@ error: * Load up the server record from information gained in an fsinfo record */ static void nfs_server_set_fsinfo(struct nfs_server *server, - struct nfs_fh *mntfh, struct nfs_fsinfo *fsinfo) { unsigned long max_rpc_payload; @@ -902,7 +900,7 @@ int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs if (error < 0) goto out_error; - nfs_server_set_fsinfo(server, mntfh, &fsinfo); + nfs_server_set_fsinfo(server, &fsinfo); /* Get some general file system info */ if (server->namelen == 0) { @@ -1194,8 +1192,6 @@ void nfs_clients_init(struct net *net) } #ifdef CONFIG_PROC_FS -static struct proc_dir_entry *proc_fs_nfs; - static int nfs_server_list_open(struct inode *inode, struct file *file); static void *nfs_server_list_start(struct seq_file *p, loff_t *pos); static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos); @@ -1365,27 
+1361,29 @@ static int nfs_volume_list_show(struct seq_file *m, void *v) { struct nfs_server *server; struct nfs_client *clp; - char dev[8], fsid[17]; + char dev[13]; // 8 for 2^24, 1 for ':', 3 for 2^8, 1 for '\0' + char fsid[34]; // 2 * 16 for %llx, 1 for ':', 1 for '\0' struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id); /* display header on line 1 */ if (v == &nn->nfs_volume_list) { - seq_puts(m, "NV SERVER PORT DEV FSID FSC\n"); + seq_puts(m, "NV SERVER PORT DEV FSID" + " FSC\n"); return 0; } /* display one transport per line on subsequent lines */ server = list_entry(v, struct nfs_server, master_link); clp = server->nfs_client; - snprintf(dev, 8, "%u:%u", + snprintf(dev, sizeof(dev), "%u:%u", MAJOR(server->s_dev), MINOR(server->s_dev)); - snprintf(fsid, 17, "%llx:%llx", + snprintf(fsid, sizeof(fsid), "%llx:%llx", (unsigned long long) server->fsid.major, (unsigned long long) server->fsid.minor); rcu_read_lock(); - seq_printf(m, "v%u %s %s %-7s %-17s %s\n", + seq_printf(m, "v%u %s %s %-12s %-33s %s\n", clp->rpc_ops->version, rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT), @@ -1435,27 +1433,20 @@ void nfs_fs_proc_net_exit(struct net *net) */ int __init nfs_fs_proc_init(void) { - struct proc_dir_entry *p; - - proc_fs_nfs = proc_mkdir("fs/nfsfs", NULL); - if (!proc_fs_nfs) + if (!proc_mkdir("fs/nfsfs", NULL)) goto error_0; /* a file of servers with which we're dealing */ - p = proc_symlink("servers", proc_fs_nfs, "../../net/nfsfs/servers"); - if (!p) + if (!proc_symlink("fs/nfsfs/servers", NULL, "../../net/nfsfs/servers")) goto error_1; /* a file of volumes that we have mounted */ - p = proc_symlink("volumes", proc_fs_nfs, "../../net/nfsfs/volumes"); - if (!p) - goto error_2; - return 0; + if (!proc_symlink("fs/nfsfs/volumes", NULL, "../../net/nfsfs/volumes")) + goto error_1; -error_2: - remove_proc_entry("servers", proc_fs_nfs); + return 0; error_1: - remove_proc_entry("fs/nfsfs", NULL); + remove_proc_subtree("fs/nfsfs", NULL); error_0: return -ENOMEM; } @@ -1465,9 +1456,7 @@ error_0: */ void nfs_fs_proc_exit(void) { - remove_proc_entry("volumes", proc_fs_nfs); - remove_proc_entry("servers", proc_fs_nfs); - remove_proc_entry("fs/nfsfs", NULL); + remove_proc_subtree("fs/nfsfs", NULL); } #endif /* CONFIG_PROC_FS */ diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index a6ad68865880..029d688a969f 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -378,7 +378,7 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct if (freeme == NULL) goto out; } - list_add_rcu(&delegation->super_list, &server->delegations); + list_add_tail_rcu(&delegation->super_list, &server->delegations); rcu_assign_pointer(nfsi->delegation, delegation); delegation = NULL; @@ -514,7 +514,7 @@ void nfs_inode_return_delegation_noreclaim(struct inode *inode) delegation = nfs_inode_detach_delegation(inode); if (delegation != NULL) - nfs_do_return_delegation(inode, delegation, 0); + nfs_do_return_delegation(inode, delegation, 1); } /** diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index c19e16f0b2d0..547308a5ec6f 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -416,15 +416,14 @@ int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry) { struct nfs_inode *nfsi; - if (dentry->d_inode == NULL) - goto different; + if (d_really_is_negative(dentry)) + return 0; - nfsi = NFS_I(dentry->d_inode); + nfsi = NFS_I(d_inode(dentry)); if (entry->fattr->fileid == nfsi->fileid) return 1; if 
(nfs_compare_fh(entry->fh, &nfsi->fh) == 0) return 1; -different: return 0; } @@ -473,7 +472,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) struct qstr filename = QSTR_INIT(entry->name, entry->len); struct dentry *dentry; struct dentry *alias; - struct inode *dir = parent->d_inode; + struct inode *dir = d_inode(parent); struct inode *inode; int status; @@ -497,9 +496,9 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) goto out; if (nfs_same_file(dentry, entry)) { nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); - status = nfs_refresh_inode(dentry->d_inode, entry->fattr); + status = nfs_refresh_inode(d_inode(dentry), entry->fattr); if (!status) - nfs_setsecurity(dentry->d_inode, entry->fattr, entry->label); + nfs_setsecurity(d_inode(dentry), entry->fattr, entry->label); goto out; } else { d_invalidate(dentry); @@ -544,6 +543,9 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en if (scratch == NULL) return -ENOMEM; + if (buflen == 0) + goto out_nopages; + xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen); xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); @@ -565,6 +567,7 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en break; } while (!entry->eof); +out_nopages: if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) { array = nfs_readdir_get_array(page); if (!IS_ERR(array)) { @@ -870,7 +873,7 @@ static bool nfs_dir_mapping_need_revalidate(struct inode *dir) static int nfs_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file->f_path.dentry; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); nfs_readdir_descriptor_t my_desc, *desc = &my_desc; struct nfs_open_dir_context *dir_ctx = file->private_data; @@ -1118,15 +1121,15 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) if (flags & LOOKUP_RCU) { parent = ACCESS_ONCE(dentry->d_parent); - dir = ACCESS_ONCE(parent->d_inode); + dir = d_inode_rcu(parent); if (!dir) return -ECHILD; } else { parent = dget_parent(dentry); - dir = parent->d_inode; + dir = d_inode(parent); } nfs_inc_stats(dir, NFSIOS_DENTRYREVALIDATE); - inode = dentry->d_inode; + inode = d_inode(dentry); if (!inode) { if (nfs_neg_need_reval(dir, dentry, flags)) { @@ -1242,7 +1245,7 @@ out_error: } /* - * A weaker form of d_revalidate for revalidating just the dentry->d_inode + * A weaker form of d_revalidate for revalidating just the d_inode(dentry) * when we don't really care about the dentry name. This is called when a * pathwalk ends on a dentry that was not found via a normal lookup in the * parent dir (e.g.: ".", "..", procfs symlinks or mountpoint traversals). 
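The revalidate paths in fs/nfs/dir.c above all share one shape: under LOOKUP_RCU they may only take lockless snapshots of the parent and must return -ECHILD instead of blocking, while the reference-walk branch pins the parent with dget_parent()/dput(). A schematic of that two-mode pattern with userspace stand-ins (no real locking, illustration only):

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct dentry { struct dentry *parent; int pinned; };

static int revalidate(struct dentry *d, bool rcu_walk)
{
	struct dentry *parent;

	if (rcu_walk) {
		/* lockless: one snapshot, no sleeping, no refcounts */
		parent = d->parent;
		if (!parent)
			return -ECHILD;	/* ask VFS to retry in ref-walk mode */
	} else {
		parent = d->parent;
		parent->pinned++;	/* dget_parent() analogue */
	}
	/* ... validation against 'parent' would go here ... */
	if (!rcu_walk)
		parent->pinned--;	/* dput() analogue */
	return 0;
}

int main(void)
{
	struct dentry p = { NULL, 0 }, d = { &p, 0 };

	printf("rcu: %d, ref: %d\n",
	       revalidate(&d, true), revalidate(&d, false));
	return 0;
}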
@@ -1253,7 +1256,7 @@ out_error: static int nfs_weak_revalidate(struct dentry *dentry, unsigned int flags) { int error; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); /* * I believe we can only get a negative dentry here in the case of a @@ -1287,7 +1290,7 @@ static int nfs_dentry_delete(const struct dentry *dentry) dentry, dentry->d_flags); /* Unhash any dentry with a stale inode */ - if (dentry->d_inode != NULL && NFS_STALE(dentry->d_inode)) + if (d_really_is_positive(dentry) && NFS_STALE(d_inode(dentry))) return 1; if (dentry->d_flags & DCACHE_NFSFS_RENAMED) { @@ -1467,9 +1470,6 @@ static int nfs_finish_open(struct nfs_open_context *ctx, { int err; - if ((open_flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) - *opened |= FILE_CREATED; - err = finish_open(file, dentry, do_open, opened); if (err) goto out; @@ -1491,7 +1491,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, int err; /* Expect a negative dentry */ - BUG_ON(dentry->d_inode); + BUG_ON(d_inode(dentry)); dfprintk(VFS, "NFS: atomic_open(%s/%lu), %pd\n", dir->i_sb->s_id, dir->i_ino, dentry); @@ -1587,7 +1587,7 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags) if (NFS_SB(dentry->d_sb)->caps & NFS_CAP_ATOMIC_OPEN_V1) goto no_open; - inode = dentry->d_inode; + inode = d_inode(dentry); /* We can't create new files in nfs_open_revalidate(), so we * optimize away revalidation of negative dentries. @@ -1598,12 +1598,12 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags) if (flags & LOOKUP_RCU) { parent = ACCESS_ONCE(dentry->d_parent); - dir = ACCESS_ONCE(parent->d_inode); + dir = d_inode_rcu(parent); if (!dir) return -ECHILD; } else { parent = dget_parent(dentry); - dir = parent->d_inode; + dir = d_inode(parent); } if (!nfs_neg_need_reval(dir, dentry, flags)) ret = 1; @@ -1643,14 +1643,14 @@ int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fhandle, struct nfs4_label *label) { struct dentry *parent = dget_parent(dentry); - struct inode *dir = parent->d_inode; + struct inode *dir = d_inode(parent); struct inode *inode; int error = -EACCES; d_drop(dentry); /* We may have been initialized further down */ - if (dentry->d_inode) + if (d_really_is_positive(dentry)) goto out; if (fhandle->size == 0) { error = NFS_PROTO(dir)->lookup(dir, &dentry->d_name, fhandle, fattr, NULL); @@ -1768,7 +1768,7 @@ EXPORT_SYMBOL_GPL(nfs_mkdir); static void nfs_dentry_handle_enoent(struct dentry *dentry) { - if (dentry->d_inode != NULL && !d_unhashed(dentry)) + if (simple_positive(dentry)) d_delete(dentry); } @@ -1780,13 +1780,13 @@ int nfs_rmdir(struct inode *dir, struct dentry *dentry) dir->i_sb->s_id, dir->i_ino, dentry); trace_nfs_rmdir_enter(dir, dentry); - if (dentry->d_inode) { + if (d_really_is_positive(dentry)) { nfs_wait_on_sillyrename(dentry); error = NFS_PROTO(dir)->rmdir(dir, &dentry->d_name); /* Ensure the VFS deletes this inode */ switch (error) { case 0: - clear_nlink(dentry->d_inode); + clear_nlink(d_inode(dentry)); break; case -ENOENT: nfs_dentry_handle_enoent(dentry); @@ -1808,8 +1808,8 @@ EXPORT_SYMBOL_GPL(nfs_rmdir); */ static int nfs_safe_remove(struct dentry *dentry) { - struct inode *dir = dentry->d_parent->d_inode; - struct inode *inode = dentry->d_inode; + struct inode *dir = d_inode(dentry->d_parent); + struct inode *inode = d_inode(dentry); int error = -EBUSY; dfprintk(VFS, "NFS: safe_remove(%pd2)\n", dentry); @@ -1853,7 +1853,7 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry) if (d_count(dentry) > 1) { 
spin_unlock(&dentry->d_lock); /* Start asynchronous writeout of the inode */ - write_inode_now(dentry->d_inode, 0); + write_inode_now(d_inode(dentry), 0); error = nfs_sillyrename(dir, dentry); goto out; } @@ -1931,7 +1931,7 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) * No big deal if we can't add this page to the page cache here. * READLINK will get the missing page from the server if needed. */ - if (!add_to_page_cache_lru(page, dentry->d_inode->i_mapping, 0, + if (!add_to_page_cache_lru(page, d_inode(dentry)->i_mapping, 0, GFP_KERNEL)) { SetPageUptodate(page); unlock_page(page); @@ -1950,7 +1950,7 @@ EXPORT_SYMBOL_GPL(nfs_symlink); int nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); int error; dfprintk(VFS, "NFS: link(%pd2 -> %pd2)\n", @@ -1997,8 +1997,8 @@ EXPORT_SYMBOL_GPL(nfs_link); int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { - struct inode *old_inode = old_dentry->d_inode; - struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); struct dentry *dentry = NULL, *rehash = NULL; struct rpc_task *task; int error = -EBUSY; diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index e907c8cf732e..38678d9a5cc4 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -129,22 +129,25 @@ nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr) int i; ssize_t count; - WARN_ON_ONCE(hdr->pgio_mirror_idx >= dreq->mirror_count); - - count = dreq->mirrors[hdr->pgio_mirror_idx].count; - if (count + dreq->io_start < hdr->io_start + hdr->good_bytes) { - count = hdr->io_start + hdr->good_bytes - dreq->io_start; - dreq->mirrors[hdr->pgio_mirror_idx].count = count; - } - - /* update the dreq->count by finding the minimum agreed count from all - * mirrors */ - count = dreq->mirrors[0].count; + if (dreq->mirror_count == 1) { + dreq->mirrors[hdr->pgio_mirror_idx].count += hdr->good_bytes; + dreq->count += hdr->good_bytes; + } else { + /* mirrored writes */ + count = dreq->mirrors[hdr->pgio_mirror_idx].count; + if (count + dreq->io_start < hdr->io_start + hdr->good_bytes) { + count = hdr->io_start + hdr->good_bytes - dreq->io_start; + dreq->mirrors[hdr->pgio_mirror_idx].count = count; + } + /* update the dreq->count by finding the minimum agreed count from all + * mirrors */ + count = dreq->mirrors[0].count; - for (i = 1; i < dreq->mirror_count; i++) - count = min(count, dreq->mirrors[i].count); + for (i = 1; i < dreq->mirror_count; i++) + count = min(count, dreq->mirrors[i].count); - dreq->count = count; + dreq->count = count; + } } /* @@ -240,7 +243,6 @@ static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq, /** * nfs_direct_IO - NFS address space operation for direct I/O - * @rw: direction (read or write) * @iocb: target I/O control block * @iov: array of vectors that define I/O buffer * @pos: offset in file to begin the operation @@ -251,7 +253,7 @@ static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq, * shunt off direct read and write requests before the VFS gets them, * so this method is only ever called for swap. 
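The nfs_direct_good_bytes() rework above splits the common single-destination case from mirrored writes, where dreq->count may only advance to the byte count every mirror has confirmed. A standalone sketch of that agreement rule, with hypothetical counts:

    /* Mirror counts {65536, 32768, 65536} => agreed dreq->count 32768:
     * data is only "good" once the slowest mirror has it. */
    static u64 agreed_count(const u64 *mirror_count, int n)
    {
            u64 count = mirror_count[0];
            int i;

            for (i = 1; i < n; i++)
                    count = min(count, mirror_count[i]);
            return count;
    }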
*/ -ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t pos) +ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t pos) { struct inode *inode = iocb->ki_filp->f_mapping->host; @@ -259,18 +261,11 @@ ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t if (!IS_SWAPFILE(inode)) return 0; -#ifndef CONFIG_NFS_SWAP - dprintk("NFS: nfs_direct_IO (%pD) off/no(%Ld/%lu) EINVAL\n", - iocb->ki_filp, (long long) pos, iter->nr_segs); + VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE); - return -EINVAL; -#else - VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE); - - if (rw == READ) + if (iov_iter_rw(iter) == READ) return nfs_file_direct_read(iocb, iter, pos); - return nfs_file_direct_write(iocb, iter, pos); -#endif /* CONFIG_NFS_SWAP */ + return nfs_file_direct_write(iocb, iter); } static void nfs_direct_release_pages(struct page **pages, unsigned int npages) @@ -387,13 +382,13 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write) if (write) nfs_zap_mapping(inode, inode->i_mapping); - inode_dio_done(inode); + inode_dio_end(inode); if (dreq->iocb) { long res = (long) dreq->error; if (!res) res = (long) dreq->count; - aio_complete(dreq->iocb, res, 0); + dreq->iocb->ki_complete(dreq->iocb, res, 0); } complete_all(&dreq->completion); @@ -404,8 +399,8 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq, bool write) static void nfs_direct_readpage_release(struct nfs_page *req) { dprintk("NFS: direct read done (%s/%llu %d@%lld)\n", - req->wb_context->dentry->d_inode->i_sb->s_id, - (unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode), + d_inode(req->wb_context->dentry)->i_sb->s_id, + (unsigned long long)NFS_FILEID(d_inode(req->wb_context->dentry)), req->wb_bytes, (long long)req_offset(req)); nfs_release_request(req); @@ -487,7 +482,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, &nfs_direct_read_completion_ops); get_dreq(dreq); desc.pg_dreq = dreq; - atomic_inc(&inode->i_dio_count); + inode_dio_begin(inode); while (iov_iter_count(iter)) { struct page **pagevec; @@ -539,7 +534,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, * generic layer handle the completion. */ if (requested_bytes == 0) { - inode_dio_done(inode); + inode_dio_end(inode); nfs_direct_req_release(dreq); return result < 0 ? result : -EIO; } @@ -873,7 +868,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, &nfs_direct_write_completion_ops); desc.pg_dreq = dreq; get_dreq(dreq); - atomic_inc(&inode->i_dio_count); + inode_dio_begin(inode); NFS_I(inode)->write_io += iov_iter_count(iter); while (iov_iter_count(iter)) { @@ -929,7 +924,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, * generic layer handle the completion. */ if (requested_bytes == 0) { - inode_dio_done(inode); + inode_dio_end(inode); nfs_direct_req_release(dreq); return result < 0 ? result : -EIO; } @@ -960,8 +955,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, * Note that O_APPEND is not supported for NFS direct writes, as there * is no atomic O_APPEND write facility in the NFS protocol. 
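Dropping the rw argument from nfs_direct_IO() works because an iov_iter now carries its own direction and iov_iter_rw() recovers it. A simplified rendering of the uio.h helper (the real macro adds a pointer type-check, but the semantics are just a mask):

    /* direction travels inside the iterator itself */
    #define iov_iter_rw(i)  ((i)->type & (READ | WRITE))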
*/ -ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter, - loff_t pos) +ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) { ssize_t result = -EINVAL; struct file *file = iocb->ki_filp; @@ -969,25 +963,16 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter, struct inode *inode = mapping->host; struct nfs_direct_req *dreq; struct nfs_lock_context *l_ctx; - loff_t end; - size_t count = iov_iter_count(iter); - end = (pos + count - 1) >> PAGE_CACHE_SHIFT; - - nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count); + loff_t pos, end; dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n", - file, count, (long long) pos); + file, iov_iter_count(iter), (long long) iocb->ki_pos); - result = generic_write_checks(file, &pos, &count, 0); - if (result) - goto out; + nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, + iov_iter_count(iter)); - result = -EINVAL; - if ((ssize_t) count < 0) - goto out; - result = 0; - if (!count) - goto out; + pos = iocb->ki_pos; + end = (pos + iov_iter_count(iter) - 1) >> PAGE_CACHE_SHIFT; mutex_lock(&inode->i_mutex); @@ -1002,7 +987,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter, goto out_unlock; } - task_io_account_write(count); + task_io_account_write(iov_iter_count(iter)); result = -ENOMEM; dreq = nfs_direct_req_alloc(); @@ -1010,7 +995,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter, goto out_unlock; dreq->inode = inode; - dreq->bytes_left = count; + dreq->bytes_left = iov_iter_count(iter); dreq->io_start = pos; dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); l_ctx = nfs_get_lock_context(dreq->ctx); @@ -1041,6 +1026,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter, if (i_size_read(inode) < iocb->ki_pos) i_size_write(inode, iocb->ki_pos); spin_unlock(&inode->i_lock); + generic_write_sync(file, pos, result); } } nfs_direct_req_release(dreq); @@ -1050,7 +1036,6 @@ out_release: nfs_direct_req_release(dreq); out_unlock: mutex_unlock(&inode->i_mutex); -out: return result; } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index e679d24c39d3..cc4fa1ed61fc 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -26,7 +26,6 @@ #include <linux/nfs_mount.h> #include <linux/mm.h> #include <linux/pagemap.h> -#include <linux/aio.h> #include <linux/gfp.h> #include <linux/swap.h> @@ -171,7 +170,7 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to) struct inode *inode = file_inode(iocb->ki_filp); ssize_t result; - if (iocb->ki_filp->f_flags & O_DIRECT) + if (iocb->ki_flags & IOCB_DIRECT) return nfs_file_direct_read(iocb, to, iocb->ki_pos); dprintk("NFS: read(%pD2, %zu@%lu)\n", @@ -281,6 +280,7 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) trace_nfs_fsync_enter(inode); + nfs_inode_dio_wait(inode); do { ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret != 0) @@ -555,31 +555,22 @@ static int nfs_launder_page(struct page *page) return nfs_wb_page(inode, page); } -#ifdef CONFIG_NFS_SWAP static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, sector_t *span) { - int ret; struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host); *span = sis->pages; - rcu_read_lock(); - ret = xs_swapper(rcu_dereference(clnt->cl_xprt), 1); - rcu_read_unlock(); - - return ret; + return rpc_clnt_swap_activate(clnt); } static void nfs_swap_deactivate(struct file *file) { struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host); - rcu_read_lock(); - 
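nfs_file_direct_write() can now trust iocb->ki_pos because generic_write_checks() runs against the iocb in the caller (see the nfs_file_write() hunk further down), and O_DIRECT/O_APPEND reach the iocb as IOCB_* flags. A sketch of how those flags are derived at iocb setup, assuming the fs.h helper of this cycle:

    static inline int iocb_flags(struct file *file)
    {
            int res = 0;

            if (file->f_flags & O_APPEND)
                    res |= IOCB_APPEND;
            if (io_is_direct(file))         /* O_DIRECT (or DAX) */
                    res |= IOCB_DIRECT;
            return res;
    }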
xs_swapper(rcu_dereference(clnt->cl_xprt), 0); - rcu_read_unlock(); + rpc_clnt_swap_deactivate(clnt); } -#endif const struct address_space_operations nfs_file_aops = { .readpage = nfs_readpage, @@ -596,10 +587,8 @@ const struct address_space_operations nfs_file_aops = { .launder_page = nfs_launder_page, .is_dirty_writeback = nfs_check_dirty_writeback, .error_remove_page = generic_error_remove_page, -#ifdef CONFIG_NFS_SWAP .swap_activate = nfs_swap_activate, .swap_deactivate = nfs_swap_deactivate, -#endif }; /* @@ -675,17 +664,20 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) unsigned long written = 0; ssize_t result; size_t count = iov_iter_count(from); - loff_t pos = iocb->ki_pos; result = nfs_key_timeout_notify(file, inode); if (result) return result; - if (file->f_flags & O_DIRECT) - return nfs_file_direct_write(iocb, from, pos); + if (iocb->ki_flags & IOCB_DIRECT) { + result = generic_write_checks(iocb, from); + if (result <= 0) + return result; + return nfs_file_direct_write(iocb, from); + } dprintk("NFS: write(%pD2, %zu@%Ld)\n", - file, count, (long long) pos); + file, count, (long long) iocb->ki_pos); result = -EBUSY; if (IS_SWAPFILE(inode)) @@ -693,7 +685,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) /* * O_APPEND implies that we must revalidate the file length. */ - if (file->f_flags & O_APPEND) { + if (iocb->ki_flags & IOCB_APPEND) { result = nfs_revalidate_file_size(inode, file); if (result) goto out; @@ -780,7 +772,7 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) * Flush all pending writes before doing anything * with locks.. */ - nfs_sync_mapping(filp->f_mapping); + vfs_fsync(filp, 0); l_ctx = nfs_get_lock_context(nfs_file_open_context(filp)); if (!IS_ERR(l_ctx)) { @@ -927,8 +919,6 @@ EXPORT_SYMBOL_GPL(nfs_flock); const struct file_operations nfs_file_operations = { .llseek = nfs_file_llseek, - .read = new_sync_read, - .write = new_sync_write, .read_iter = nfs_file_read, .write_iter = nfs_file_write, .mmap = nfs_file_mmap, diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 91e88a7ecef0..b34f2e228601 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -32,6 +32,7 @@ #include <linux/nfs_fs.h> #include <linux/nfs_page.h> #include <linux/module.h> +#include <linux/backing-dev.h> #include <linux/sunrpc/metrics.h> @@ -258,7 +259,8 @@ filelayout_set_layoutcommit(struct nfs_pgio_header *hdr) hdr->res.verf->committed != NFS_DATA_SYNC) return; - pnfs_set_layoutcommit(hdr); + pnfs_set_layoutcommit(hdr->inode, hdr->lseg, + hdr->mds_offset + hdr->res.count); dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); } @@ -373,7 +375,7 @@ static int filelayout_commit_done_cb(struct rpc_task *task, } if (data->verf.committed == NFS_UNSTABLE) - pnfs_commit_set_layoutcommit(data); + pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb); return 0; } @@ -1086,7 +1088,7 @@ filelayout_alloc_deviceid_node(struct nfs_server *server, } static void -filelayout_free_deveiceid_node(struct nfs4_deviceid_node *d) +filelayout_free_deviceid_node(struct nfs4_deviceid_node *d) { nfs4_fl_free_deviceid(container_of(d, struct nfs4_file_layout_dsaddr, id_node)); } @@ -1137,7 +1139,8 @@ static struct pnfs_layoutdriver_type filelayout_type = { .read_pagelist = filelayout_read_pagelist, .write_pagelist = filelayout_write_pagelist, .alloc_deviceid_node = filelayout_alloc_deviceid_node, - .free_deviceid_node = 
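nfs_file_operations drops .read/.write below and keeps only the iter variants; plain read()/write() syscalls still work because the VFS falls back through the iter method. A sketch of that dispatch, assuming the fs/read_write.c shape of this era:

    static ssize_t vfs_read_sketch(struct file *file, char __user *buf,
                                   size_t count, loff_t *pos)
    {
            if (file->f_op->read)
                    return file->f_op->read(file, buf, count, pos);
            if (file->f_op->read_iter)
                    return new_sync_read(file, buf, count, pos);
            return -EINVAL;
    }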
filelayout_free_deveiceid_node, + .free_deviceid_node = filelayout_free_deviceid_node, + .sync = pnfs_nfs_generic_sync, }; static int __init nfs4filelayout_init(void) diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c index 4f372e224603..4946ef40ba87 100644 --- a/fs/nfs/filelayout/filelayoutdev.c +++ b/fs/nfs/filelayout/filelayoutdev.c @@ -55,7 +55,7 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) nfs4_pnfs_ds_put(ds); } kfree(dsaddr->stripe_indices); - kfree(dsaddr); + kfree_rcu(dsaddr, id_node.rcu); } /* Decode opaque device data and return the result */ diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 315cc68945b9..b3289d701eea 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -11,15 +11,16 @@ #include <linux/module.h> #include <linux/sunrpc/metrics.h> -#include <linux/nfs_idmap.h> #include "flexfilelayout.h" #include "../nfs4session.h" +#include "../nfs4idmap.h" #include "../internal.h" #include "../delegation.h" #include "../nfs4trace.h" #include "../iostat.h" #include "../nfs.h" +#include "../nfs42.h" #define NFSDBG_FACILITY NFSDBG_PNFS_LD @@ -182,17 +183,14 @@ static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls) static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls) { - struct nfs4_ff_layout_mirror *tmp; int i, j; for (i = 0; i < fls->mirror_array_cnt - 1; i++) { for (j = i + 1; j < fls->mirror_array_cnt; j++) if (fls->mirror_array[i]->efficiency < - fls->mirror_array[j]->efficiency) { - tmp = fls->mirror_array[i]; - fls->mirror_array[i] = fls->mirror_array[j]; - fls->mirror_array[j] = tmp; - } + fls->mirror_array[j]->efficiency) + swap(fls->mirror_array[i], + fls->mirror_array[j]); } } @@ -274,6 +272,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, spin_lock_init(&fls->mirror_array[i]->lock); fls->mirror_array[i]->ds_count = ds_count; + fls->mirror_array[i]->lseg = &fls->generic_hdr; /* deviceid */ rc = decode_deviceid(&stream, &devid); @@ -344,6 +343,10 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, fls->mirror_array[i]->gid); } + p = xdr_inline_decode(&stream, 4); + if (p) + fls->flags = be32_to_cpup(p); + ff_layout_sort_mirrors(fls); rc = ff_layout_check_layout(lgr); if (rc) @@ -415,6 +418,146 @@ ff_layout_get_lseg_count(struct nfs4_ff_layout_segment *fls) return 1; } +static void +nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer) +{ + /* first IO request? 
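The deviceid-node teardown in filelayoutdev.c above (and its flexfiles twin later in this diff) moves from kfree() to kfree_rcu(): deviceid lookups walk the cache under rcu_read_lock(), so the memory must outlive any reader still traversing it. kfree_rcu() needs a struct rcu_head inside the object, and note it accepts a nested member path (id_node.rcu). Generic shape:

    struct foo {
            struct rcu_head rcu;
            int payload;
    };

    static void foo_release(struct foo *f)
    {
            /* queued on the RCU callback list; actually freed only
             * after all pre-existing readers leave their sections */
            kfree_rcu(f, rcu);
    }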
*/ + if (atomic_inc_return(&timer->n_ops) == 1) { + timer->start_time = ktime_get(); + } +} + +static ktime_t +nfs4_ff_end_busy_timer(struct nfs4_ff_busy_timer *timer) +{ + ktime_t start, now; + + if (atomic_dec_return(&timer->n_ops) < 0) + WARN_ON_ONCE(1); + + now = ktime_get(); + start = timer->start_time; + timer->start_time = now; + return ktime_sub(now, start); +} + +static ktime_t +nfs4_ff_layout_calc_completion_time(struct rpc_task *task) +{ + return ktime_sub(ktime_get(), task->tk_start); +} + +static bool +nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror, + struct nfs4_ff_layoutstat *layoutstat) +{ + static const ktime_t notime = {0}; + ktime_t now = ktime_get(); + + nfs4_ff_start_busy_timer(&layoutstat->busy_timer); + if (ktime_equal(mirror->start_time, notime)) + mirror->start_time = now; + if (ktime_equal(mirror->last_report_time, notime)) + mirror->last_report_time = now; + if (ktime_to_ms(ktime_sub(now, mirror->last_report_time)) >= + FF_LAYOUTSTATS_REPORT_INTERVAL) { + mirror->last_report_time = now; + return true; + } + + return false; +} + +static void +nfs4_ff_layout_stat_io_update_requested(struct nfs4_ff_layoutstat *layoutstat, + __u64 requested) +{ + struct nfs4_ff_io_stat *iostat = &layoutstat->io_stat; + + iostat->ops_requested++; + iostat->bytes_requested += requested; +} + +static void +nfs4_ff_layout_stat_io_update_completed(struct nfs4_ff_layoutstat *layoutstat, + __u64 requested, + __u64 completed, + ktime_t time_completed) +{ + struct nfs4_ff_io_stat *iostat = &layoutstat->io_stat; + ktime_t timer; + + iostat->ops_completed++; + iostat->bytes_completed += completed; + iostat->bytes_not_delivered += requested - completed; + + timer = nfs4_ff_end_busy_timer(&layoutstat->busy_timer); + iostat->total_busy_time = + ktime_add(iostat->total_busy_time, timer); + iostat->aggregate_completion_time = + ktime_add(iostat->aggregate_completion_time, time_completed); +} + +static void +nfs4_ff_layout_stat_io_start_read(struct nfs4_ff_layout_mirror *mirror, + __u64 requested) +{ + bool report; + + spin_lock(&mirror->lock); + report = nfs4_ff_layoutstat_start_io(mirror, &mirror->read_stat); + nfs4_ff_layout_stat_io_update_requested(&mirror->read_stat, requested); + spin_unlock(&mirror->lock); + + if (report) + pnfs_report_layoutstat(mirror->lseg->pls_layout->plh_inode); +} + +static void +nfs4_ff_layout_stat_io_end_read(struct rpc_task *task, + struct nfs4_ff_layout_mirror *mirror, + __u64 requested, + __u64 completed) +{ + spin_lock(&mirror->lock); + nfs4_ff_layout_stat_io_update_completed(&mirror->read_stat, + requested, completed, + nfs4_ff_layout_calc_completion_time(task)); + spin_unlock(&mirror->lock); +} + +static void +nfs4_ff_layout_stat_io_start_write(struct nfs4_ff_layout_mirror *mirror, + __u64 requested) +{ + bool report; + + spin_lock(&mirror->lock); + report = nfs4_ff_layoutstat_start_io(mirror , &mirror->write_stat); + nfs4_ff_layout_stat_io_update_requested(&mirror->write_stat, requested); + spin_unlock(&mirror->lock); + + if (report) + pnfs_report_layoutstat(mirror->lseg->pls_layout->plh_inode); +} + +static void +nfs4_ff_layout_stat_io_end_write(struct rpc_task *task, + struct nfs4_ff_layout_mirror *mirror, + __u64 requested, + __u64 completed, + enum nfs3_stable_how committed) +{ + if (committed == NFS_UNSTABLE) + requested = completed = 0; + + spin_lock(&mirror->lock); + nfs4_ff_layout_stat_io_update_completed(&mirror->write_stat, + requested, completed, + nfs4_ff_layout_calc_completion_time(task)); + spin_unlock(&mirror->lock); +} + static 
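The busy timer added above is a per-mirror "any I/O in flight" clock: n_ops going 0->1 starts it, and every completion harvests the elapsed slice and restarts it. A worked timeline with hypothetical millisecond stamps:

    /* t=0  op A starts: n_ops 0->1, start_time = 0
     * t=2  op B starts: n_ops 1->2, clock untouched
     * t=5  op A ends:   slice = 5 - 0 = 5, start_time = 5
     * t=7  op B ends:   slice = 7 - 5 = 2, start_time = 7
     * total_busy_time accumulates 5 + 2 = 7 ms, matching the
     * continuous 0..7 ms span during which n_ops was nonzero. */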
int ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg, struct nfs_commit_info *cinfo, @@ -631,7 +774,7 @@ static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs) nfs_direct_set_resched_writes(hdr->dreq); /* fake unstable write to let common nfs resend pages */ hdr->verf.committed = NFS_UNSTABLE; - hdr->good_bytes = 0; + hdr->good_bytes = hdr->args.count; } return; } @@ -879,6 +1022,12 @@ static int ff_layout_read_done_cb(struct rpc_task *task, return 0; } +static bool +ff_layout_need_layoutcommit(struct pnfs_layout_segment *lseg) +{ + return !(FF_LAYOUT_LSEG(lseg)->flags & FF_FLAGS_NO_LAYOUTCOMMIT); +} + /* * We reference the rpc_cred of the first WRITE that triggers the need for * a LAYOUTCOMMIT, and use it to send the layoutcommit compound. @@ -891,7 +1040,11 @@ static int ff_layout_read_done_cb(struct rpc_task *task, static void ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr) { - pnfs_set_layoutcommit(hdr); + if (!ff_layout_need_layoutcommit(hdr->lseg)) + return; + + pnfs_set_layoutcommit(hdr->inode, hdr->lseg, + hdr->mds_offset + hdr->res.count); dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); } @@ -908,6 +1061,10 @@ ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx) static int ff_layout_read_prepare_common(struct rpc_task *task, struct nfs_pgio_header *hdr) { + nfs4_ff_layout_stat_io_start_read( + FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), + hdr->args.count); + if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) { rpc_exit(task, -EIO); return -EIO; @@ -961,15 +1118,15 @@ static void ff_layout_read_prepare_v4(struct rpc_task *task, void *data) { struct nfs_pgio_header *hdr = data; - if (ff_layout_read_prepare_common(task, hdr)) - return; - if (ff_layout_setup_sequence(hdr->ds_clp, &hdr->args.seq_args, &hdr->res.seq_res, task)) return; + if (ff_layout_read_prepare_common(task, hdr)) + return; + if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context, hdr->args.lock_context, FMODE_READ) == -EIO) rpc_exit(task, -EIO); /* lost lock, terminate I/O */ @@ -981,6 +1138,10 @@ static void ff_layout_read_call_done(struct rpc_task *task, void *data) dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status); + nfs4_ff_layout_stat_io_end_read(task, + FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), + hdr->args.count, hdr->res.count); + if (test_bit(NFS_IOHDR_REDO, &hdr->flags) && task->tk_status == 0) { nfs4_sequence_done(task, &hdr->res.seq_res); @@ -1073,8 +1234,9 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, return -EAGAIN; } - if (data->verf.committed == NFS_UNSTABLE) - pnfs_commit_set_layoutcommit(data); + if (data->verf.committed == NFS_UNSTABLE + && ff_layout_need_layoutcommit(data->lseg)) + pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb); return 0; } @@ -1082,6 +1244,10 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, static int ff_layout_write_prepare_common(struct rpc_task *task, struct nfs_pgio_header *hdr) { + nfs4_ff_layout_stat_io_start_write( + FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), + hdr->args.count); + if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) { rpc_exit(task, -EIO); return -EIO; @@ -1115,15 +1281,15 @@ static void ff_layout_write_prepare_v4(struct rpc_task *task, void *data) { struct nfs_pgio_header *hdr = data; - if (ff_layout_write_prepare_common(task, hdr)) - return; - if (ff_layout_setup_sequence(hdr->ds_clp, &hdr->args.seq_args, 
&hdr->res.seq_res, task)) return; + if (ff_layout_write_prepare_common(task, hdr)) + return; + if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context, hdr->args.lock_context, FMODE_WRITE) == -EIO) rpc_exit(task, -EIO); /* lost lock, terminate I/O */ @@ -1133,6 +1299,11 @@ static void ff_layout_write_call_done(struct rpc_task *task, void *data) { struct nfs_pgio_header *hdr = data; + nfs4_ff_layout_stat_io_end_write(task, + FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), + hdr->args.count, hdr->res.count, + hdr->res.verf->committed); + if (test_bit(NFS_IOHDR_REDO, &hdr->flags) && task->tk_status == 0) { nfs4_sequence_done(task, &hdr->res.seq_res); @@ -1151,8 +1322,17 @@ static void ff_layout_write_count_stats(struct rpc_task *task, void *data) &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_WRITE]); } +static void ff_layout_commit_prepare_common(struct rpc_task *task, + struct nfs_commit_data *cdata) +{ + nfs4_ff_layout_stat_io_start_write( + FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index), + 0); +} + static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data) { + ff_layout_commit_prepare_common(task, data); rpc_call_start(task); } @@ -1160,10 +1340,30 @@ static void ff_layout_commit_prepare_v4(struct rpc_task *task, void *data) { struct nfs_commit_data *wdata = data; - ff_layout_setup_sequence(wdata->ds_clp, + if (ff_layout_setup_sequence(wdata->ds_clp, &wdata->args.seq_args, &wdata->res.seq_res, - task); + task)) + return; + ff_layout_commit_prepare_common(task, data); +} + +static void ff_layout_commit_done(struct rpc_task *task, void *data) +{ + struct nfs_commit_data *cdata = data; + struct nfs_page *req; + __u64 count = 0; + + if (task->tk_status == 0) { + list_for_each_entry(req, &cdata->pages, wb_list) + count += req->wb_bytes; + } + + nfs4_ff_layout_stat_io_end_write(task, + FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index), + count, count, NFS_FILE_SYNC); + + pnfs_generic_write_commit_done(task, data); } static void ff_layout_commit_count_stats(struct rpc_task *task, void *data) @@ -1204,14 +1404,14 @@ static const struct rpc_call_ops ff_layout_write_call_ops_v4 = { static const struct rpc_call_ops ff_layout_commit_call_ops_v3 = { .rpc_call_prepare = ff_layout_commit_prepare_v3, - .rpc_call_done = pnfs_generic_write_commit_done, + .rpc_call_done = ff_layout_commit_done, .rpc_count_stats = ff_layout_commit_count_stats, .rpc_release = pnfs_generic_commit_release, }; static const struct rpc_call_ops ff_layout_commit_call_ops_v4 = { .rpc_call_prepare = ff_layout_commit_prepare_v4, - .rpc_call_done = pnfs_generic_write_commit_done, + .rpc_call_done = ff_layout_commit_done, .rpc_count_stats = ff_layout_commit_count_stats, .rpc_release = pnfs_generic_commit_release, }; @@ -1255,7 +1455,6 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) fh = nfs4_ff_layout_select_ds_fh(lseg, idx); if (fh) hdr->args.fh = fh; - /* * Note that if we ever decide to split across DSes, * then we may need to handle dense-like offsets. @@ -1384,6 +1583,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) fh = select_ds_fh_from_commit(lseg, data->ds_commit_index); if (fh) data->args.fh = fh; + return nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops, vers == 3 ? 
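Note how the layoutstats hooks wired in above handle unstable writes: the WRITE completion reports nothing, and the bytes are only credited once ff_layout_commit_done() confirms them. In sketch form:

    /* WRITE done, verf.committed == NFS_UNSTABLE:
     *     requested = completed = 0            (not durable yet)
     * COMMIT done, tk_status == 0:
     *     count = sum of req->wb_bytes over cdata->pages
     *     end_write(count, count, NFS_FILE_SYNC)  -> credited now
     * so bytes only count as "completed" once they are durable. */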
&ff_layout_commit_call_ops_v3 : &ff_layout_commit_call_ops_v4, @@ -1414,7 +1614,7 @@ ff_layout_get_ds_info(struct inode *inode) } static void -ff_layout_free_deveiceid_node(struct nfs4_deviceid_node *d) +ff_layout_free_deviceid_node(struct nfs4_deviceid_node *d) { nfs4_ff_layout_free_deviceid(container_of(d, struct nfs4_ff_layout_ds, id_node)); @@ -1487,6 +1687,247 @@ out: dprintk("%s: Return\n", __func__); } +static int +ff_layout_ntop4(const struct sockaddr *sap, char *buf, const size_t buflen) +{ + const struct sockaddr_in *sin = (struct sockaddr_in *)sap; + + return snprintf(buf, buflen, "%pI4", &sin->sin_addr); +} + +static size_t +ff_layout_ntop6_noscopeid(const struct sockaddr *sap, char *buf, + const int buflen) +{ + const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; + const struct in6_addr *addr = &sin6->sin6_addr; + + /* + * RFC 4291, Section 2.2.2 + * + * Shorthanded ANY address + */ + if (ipv6_addr_any(addr)) + return snprintf(buf, buflen, "::"); + + /* + * RFC 4291, Section 2.2.2 + * + * Shorthanded loopback address + */ + if (ipv6_addr_loopback(addr)) + return snprintf(buf, buflen, "::1"); + + /* + * RFC 4291, Section 2.2.3 + * + * Special presentation address format for mapped v4 + * addresses. + */ + if (ipv6_addr_v4mapped(addr)) + return snprintf(buf, buflen, "::ffff:%pI4", + &addr->s6_addr32[3]); + + /* + * RFC 4291, Section 2.2.1 + */ + return snprintf(buf, buflen, "%pI6c", addr); +} + +/* Derived from rpc_sockaddr2uaddr */ +static void +ff_layout_encode_netaddr(struct xdr_stream *xdr, struct nfs4_pnfs_ds_addr *da) +{ + struct sockaddr *sap = (struct sockaddr *)&da->da_addr; + char portbuf[RPCBIND_MAXUADDRPLEN]; + char addrbuf[RPCBIND_MAXUADDRLEN]; + char *netid; + unsigned short port; + int len, netid_len; + __be32 *p; + + switch (sap->sa_family) { + case AF_INET: + if (ff_layout_ntop4(sap, addrbuf, sizeof(addrbuf)) == 0) + return; + port = ntohs(((struct sockaddr_in *)sap)->sin_port); + netid = "tcp"; + netid_len = 3; + break; + case AF_INET6: + if (ff_layout_ntop6_noscopeid(sap, addrbuf, sizeof(addrbuf)) == 0) + return; + port = ntohs(((struct sockaddr_in6 *)sap)->sin6_port); + netid = "tcp6"; + netid_len = 4; + break; + default: + /* we only support tcp and tcp6 */ + WARN_ON_ONCE(1); + return; + } + + snprintf(portbuf, sizeof(portbuf), ".%u.%u", port >> 8, port & 0xff); + len = strlcat(addrbuf, portbuf, sizeof(addrbuf)); + + p = xdr_reserve_space(xdr, 4 + netid_len); + xdr_encode_opaque(p, netid, netid_len); + + p = xdr_reserve_space(xdr, 4 + len); + xdr_encode_opaque(p, addrbuf, len); +} + +static void +ff_layout_encode_nfstime(struct xdr_stream *xdr, + ktime_t t) +{ + struct timespec64 ts; + __be32 *p; + + p = xdr_reserve_space(xdr, 12); + ts = ktime_to_timespec64(t); + p = xdr_encode_hyper(p, ts.tv_sec); + *p++ = cpu_to_be32(ts.tv_nsec); +} + +static void +ff_layout_encode_io_latency(struct xdr_stream *xdr, + struct nfs4_ff_io_stat *stat) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 5 * 8); + p = xdr_encode_hyper(p, stat->ops_requested); + p = xdr_encode_hyper(p, stat->bytes_requested); + p = xdr_encode_hyper(p, stat->ops_completed); + p = xdr_encode_hyper(p, stat->bytes_completed); + p = xdr_encode_hyper(p, stat->bytes_not_delivered); + ff_layout_encode_nfstime(xdr, stat->total_busy_time); + ff_layout_encode_nfstime(xdr, stat->aggregate_completion_time); +} + +static void +ff_layout_encode_layoutstats(struct xdr_stream *xdr, + struct nfs42_layoutstat_args *args, + struct nfs42_layoutstat_devinfo *devinfo) +{ + struct nfs4_ff_layout_mirror *mirror = 
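The netaddr4 encoder above emits an RPC universal address: the presentation IP followed by the port split into two decimal bytes. Worked example for NFS's port 2049 (0x0801); the IP shown is a documentation address, not from this diff:

    /* 2049 >> 8 = 8 and 2049 & 0xff = 1, so the suffix is ".8.1":
     * "192.0.2.5" + ".8.1" => uaddr "192.0.2.5.8.1", netid "tcp" */
    snprintf(portbuf, sizeof(portbuf), ".%u.%u", port >> 8, port & 0xff);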
devinfo->layout_private; + struct nfs4_pnfs_ds_addr *da; + struct nfs4_pnfs_ds *ds = mirror->mirror_ds->ds; + struct nfs_fh *fh = &mirror->fh_versions[0]; + __be32 *p, *start; + + da = list_first_entry(&ds->ds_addrs, struct nfs4_pnfs_ds_addr, da_node); + dprintk("%s: DS %s: encoding address %s\n", + __func__, ds->ds_remotestr, da->da_remotestr); + /* layoutupdate length */ + start = xdr_reserve_space(xdr, 4); + /* netaddr4 */ + ff_layout_encode_netaddr(xdr, da); + /* nfs_fh4 */ + p = xdr_reserve_space(xdr, 4 + fh->size); + xdr_encode_opaque(p, fh->data, fh->size); + /* ff_io_latency4 read */ + spin_lock(&mirror->lock); + ff_layout_encode_io_latency(xdr, &mirror->read_stat.io_stat); + /* ff_io_latency4 write */ + ff_layout_encode_io_latency(xdr, &mirror->write_stat.io_stat); + spin_unlock(&mirror->lock); + /* nfstime4 */ + ff_layout_encode_nfstime(xdr, ktime_sub(ktime_get(), mirror->start_time)); + /* bool */ + p = xdr_reserve_space(xdr, 4); + *p = cpu_to_be32(false); + + *start = cpu_to_be32((xdr->p - start - 1) * 4); +} + +static bool +ff_layout_mirror_prepare_stats(struct nfs42_layoutstat_args *args, + struct pnfs_layout_segment *pls, + int *dev_count, int dev_limit) +{ + struct nfs4_ff_layout_mirror *mirror; + struct nfs4_deviceid_node *dev; + struct nfs42_layoutstat_devinfo *devinfo; + int i; + + for (i = 0; i < FF_LAYOUT_MIRROR_COUNT(pls); i++) { + if (*dev_count >= dev_limit) + break; + mirror = FF_LAYOUT_COMP(pls, i); + if (!mirror || !mirror->mirror_ds) + continue; + dev = FF_LAYOUT_DEVID_NODE(pls, i); + devinfo = &args->devinfo[*dev_count]; + memcpy(&devinfo->dev_id, &dev->deviceid, NFS4_DEVICEID4_SIZE); + devinfo->offset = pls->pls_range.offset; + devinfo->length = pls->pls_range.length; + /* well, we don't really know if IO is continuous or not! 
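The layoutupdate body length in ff_layout_encode_layoutstats() is not known until the body has been written, hence the reserve-then-backfill idiom: reserve one 32-bit slot, encode the body, then compute the byte length by pointer arithmetic. Spelled out:

    /* start and xdr->p are __be32 *, so the subtraction counts words:
     *     words  = xdr->p - start - 1     (exclude the length slot)
     *     *start = cpu_to_be32(words * 4) (bytes)
     * e.g. a body of 11 XDR words backfills a length of 44. */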
*/ + devinfo->read_count = mirror->read_stat.io_stat.bytes_completed; + devinfo->read_bytes = mirror->read_stat.io_stat.bytes_completed; + devinfo->write_count = mirror->write_stat.io_stat.bytes_completed; + devinfo->write_bytes = mirror->write_stat.io_stat.bytes_completed; + devinfo->layout_type = LAYOUT_FLEX_FILES; + devinfo->layoutstats_encode = ff_layout_encode_layoutstats; + devinfo->layout_private = mirror; + /* lseg refcount put in cleanup_layoutstats */ + pnfs_get_lseg(pls); + + ++(*dev_count); + } + + return *dev_count < dev_limit; +} + +static int +ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args) +{ + struct pnfs_layout_segment *pls; + int dev_count = 0; + + spin_lock(&args->inode->i_lock); + list_for_each_entry(pls, &NFS_I(args->inode)->layout->plh_segs, pls_list) { + dev_count += FF_LAYOUT_MIRROR_COUNT(pls); + } + spin_unlock(&args->inode->i_lock); + /* For now, send at most PNFS_LAYOUTSTATS_MAXDEV statistics */ + if (dev_count > PNFS_LAYOUTSTATS_MAXDEV) { + dprintk("%s: truncating devinfo to limit (%d:%d)\n", + __func__, dev_count, PNFS_LAYOUTSTATS_MAXDEV); + dev_count = PNFS_LAYOUTSTATS_MAXDEV; + } + args->devinfo = kmalloc(dev_count * sizeof(*args->devinfo), GFP_KERNEL); + if (!args->devinfo) + return -ENOMEM; + + dev_count = 0; + spin_lock(&args->inode->i_lock); + list_for_each_entry(pls, &NFS_I(args->inode)->layout->plh_segs, pls_list) { + if (!ff_layout_mirror_prepare_stats(args, pls, &dev_count, + PNFS_LAYOUTSTATS_MAXDEV)) { + break; + } + } + spin_unlock(&args->inode->i_lock); + args->num_dev = dev_count; + + return 0; +} + +static void +ff_layout_cleanup_layoutstats(struct nfs42_layoutstat_data *data) +{ + struct nfs4_ff_layout_mirror *mirror; + int i; + + for (i = 0; i < data->args.num_dev; i++) { + mirror = data->args.devinfo[i].layout_private; + data->args.devinfo[i].layout_private = NULL; + pnfs_put_lseg(mirror->lseg); + } +} + static struct pnfs_layoutdriver_type flexfilelayout_type = { .id = LAYOUT_FLEX_FILES, .name = "LAYOUT_FLEX_FILES", @@ -1498,7 +1939,7 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = { .pg_read_ops = &ff_layout_pg_read_ops, .pg_write_ops = &ff_layout_pg_write_ops, .get_ds_info = ff_layout_get_ds_info, - .free_deviceid_node = ff_layout_free_deveiceid_node, + .free_deviceid_node = ff_layout_free_deviceid_node, .mark_request_commit = pnfs_layout_mark_request_commit, .clear_request_commit = pnfs_generic_clear_request_commit, .scan_commit_lists = pnfs_generic_scan_commit_lists, @@ -1508,6 +1949,9 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = { .write_pagelist = ff_layout_write_pagelist, .alloc_deviceid_node = ff_layout_alloc_deviceid_node, .encode_layoutreturn = ff_layout_encode_layoutreturn, + .sync = pnfs_nfs_generic_sync, + .prepare_layoutstats = ff_layout_prepare_layoutstats, + .cleanup_layoutstats = ff_layout_cleanup_layoutstats, }; static int __init nfs4flexfilelayout_init(void) diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h index 070f20445b2d..f92f9a0a856b 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.h +++ b/fs/nfs/flexfilelayout/flexfilelayout.h @@ -9,12 +9,17 @@ #ifndef FS_NFS_NFS4FLEXFILELAYOUT_H #define FS_NFS_NFS4FLEXFILELAYOUT_H +#define FF_FLAGS_NO_LAYOUTCOMMIT 1 + #include "../pnfs.h" /* XXX: Let's filter out insanely large mirror count for now to avoid oom * due to network error etc. 
*/ #define NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT 4096 +/* LAYOUTSTATS report interval in ms */ +#define FF_LAYOUTSTATS_REPORT_INTERVAL (60000L) + struct nfs4_ff_ds_version { u32 version; u32 minor_version; @@ -41,24 +46,48 @@ struct nfs4_ff_layout_ds_err { struct nfs4_deviceid deviceid; }; +struct nfs4_ff_io_stat { + __u64 ops_requested; + __u64 bytes_requested; + __u64 ops_completed; + __u64 bytes_completed; + __u64 bytes_not_delivered; + ktime_t total_busy_time; + ktime_t aggregate_completion_time; +}; + +struct nfs4_ff_busy_timer { + ktime_t start_time; + atomic_t n_ops; +}; + +struct nfs4_ff_layoutstat { + struct nfs4_ff_io_stat io_stat; + struct nfs4_ff_busy_timer busy_timer; +}; + struct nfs4_ff_layout_mirror { + struct pnfs_layout_segment *lseg; /* back pointer */ u32 ds_count; u32 efficiency; struct nfs4_ff_layout_ds *mirror_ds; u32 fh_versions_cnt; struct nfs_fh *fh_versions; nfs4_stateid stateid; - struct nfs4_string user_name; - struct nfs4_string group_name; u32 uid; u32 gid; struct rpc_cred *cred; spinlock_t lock; + struct nfs4_ff_layoutstat read_stat; + struct nfs4_ff_layoutstat write_stat; + ktime_t start_time; + ktime_t last_report_time; }; struct nfs4_ff_layout_segment { struct pnfs_layout_segment generic_hdr; u64 stripe_unit; + u32 flags; u32 mirror_array_cnt; struct nfs4_ff_layout_mirror **mirror_array; }; diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index e2c01f204a95..f13e1969eedd 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -30,7 +30,7 @@ void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds) { nfs4_print_deviceid(&mirror_ds->id_node.deviceid); nfs4_pnfs_ds_put(mirror_ds->ds); - kfree(mirror_ds); + kfree_rcu(mirror_ds, id_node.rcu); } /* Decode opaque device data and construct new_ds using it */ @@ -324,7 +324,8 @@ static int ff_layout_update_mirror_cred(struct nfs4_ff_layout_mirror *mirror, __func__, PTR_ERR(cred)); return PTR_ERR(cred); } else { - mirror->cred = cred; + if (cmpxchg(&mirror->cred, NULL, cred)) + put_rpccred(cred); } } return 0; @@ -386,7 +387,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, /* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */ smp_rmb(); if (ds->ds_clp) - goto out; + goto out_update_creds; flavor = nfs4_ff_layout_choose_authflavor(mirror); @@ -430,7 +431,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, } } } - +out_update_creds: if (ff_layout_update_mirror_cred(mirror, ds)) ds = NULL; out: diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index 9ac3846cb59e..a608ffd28acc 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -56,11 +56,11 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i * This again causes shrink_dcache_for_umount_subtree() to * Oops, since the test for IS_ROOT() will fail. 
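The mirror-cred fix in flexfilelayoutdev.c above is a lock-free once-only publish: cmpxchg() installs the new cred only if the slot is still NULL, and a racing loser drops its own reference instead of leaking the winner's. The shape of the idiom:

    /* First setter wins; cmpxchg returns the old value, so non-NULL
     * means another thread already published a cred. */
    if (cmpxchg(&mirror->cred, NULL, cred) != NULL)
            put_rpccred(cred);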
*/ - spin_lock(&sb->s_root->d_inode->i_lock); + spin_lock(&d_inode(sb->s_root)->i_lock); spin_lock(&sb->s_root->d_lock); hlist_del_init(&sb->s_root->d_u.d_alias); spin_unlock(&sb->s_root->d_lock); - spin_unlock(&sb->s_root->d_inode->i_lock); + spin_unlock(&d_inode(sb->s_root)->i_lock); } return 0; } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index d42dff6d5e98..0adc7d245b3d 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -133,6 +133,13 @@ void nfs_evict_inode(struct inode *inode) nfs_clear_inode(inode); } +int nfs_sync_inode(struct inode *inode) +{ + nfs_inode_dio_wait(inode); + return nfs_wb_all(inode); +} +EXPORT_SYMBOL_GPL(nfs_sync_inode); + /** * nfs_sync_mapping - helper to flush all mmapped dirty data to disk */ @@ -192,7 +199,6 @@ void nfs_zap_caches(struct inode *inode) nfs_zap_caches_locked(inode); spin_unlock(&inode->i_lock); } -EXPORT_SYMBOL_GPL(nfs_zap_caches); void nfs_zap_mapping(struct inode *inode, struct address_space *mapping) { @@ -436,8 +442,9 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); if (fattr->valid & NFS_ATTR_FATTR_CHANGE) inode->i_version = fattr->change_attr; - else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR)) - nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); + else + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR + | NFS_INO_REVAL_PAGECACHE); if (fattr->valid & NFS_ATTR_FATTR_SIZE) inode->i_size = nfs_size_to_loff_t(fattr->size); else @@ -495,7 +502,7 @@ EXPORT_SYMBOL_GPL(nfs_fhget); int nfs_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct nfs_fattr *fattr; int error = -ENOMEM; @@ -525,10 +532,8 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) trace_nfs_setattr_enter(inode); /* Write all dirty data */ - if (S_ISREG(inode->i_mode)) { - nfs_inode_dio_wait(inode); - nfs_wb_all(inode); - } + if (S_ISREG(inode->i_mode)) + nfs_sync_inode(inode); fattr = nfs_alloc_fattr(); if (fattr == NULL) @@ -621,7 +626,7 @@ static void nfs_request_parent_use_readdirplus(struct dentry *dentry) struct dentry *parent; parent = dget_parent(dentry); - nfs_force_use_readdirplus(parent->d_inode); + nfs_force_use_readdirplus(d_inode(parent)); dput(parent); } @@ -637,15 +642,16 @@ static bool nfs_need_revalidate_inode(struct inode *inode) int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int need_atime = NFS_I(inode)->cache_validity & NFS_INO_INVALID_ATIME; int err = 0; trace_nfs_getattr_enter(inode); /* Flush out writes to the server in order to update c/mtime. 
*/ if (S_ISREG(inode->i_mode)) { - nfs_inode_dio_wait(inode); - err = filemap_write_and_wait(inode->i_mapping); + mutex_lock(&inode->i_mutex); + err = nfs_sync_inode(inode); + mutex_unlock(&inode->i_mutex); if (err) goto out; } @@ -673,6 +679,8 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) if (!err) { generic_fillattr(inode, stat); stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); + if (S_ISDIR(inode->i_mode)) + stat->blksize = NFS_SERVER(inode)->dtsize; } out: trace_nfs_getattr_exit(inode, err); @@ -708,7 +716,7 @@ static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx) { struct nfs_lock_context *res, *new = NULL; - struct inode *inode = ctx->dentry->d_inode; + struct inode *inode = d_inode(ctx->dentry); spin_lock(&inode->i_lock); res = __nfs_find_lock_context(ctx); @@ -736,7 +744,7 @@ EXPORT_SYMBOL_GPL(nfs_get_lock_context); void nfs_put_lock_context(struct nfs_lock_context *l_ctx) { struct nfs_open_context *ctx = l_ctx->open_context; - struct inode *inode = ctx->dentry->d_inode; + struct inode *inode = d_inode(ctx->dentry); if (!atomic_dec_and_lock(&l_ctx->count, &inode->i_lock)) return; @@ -763,7 +771,7 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync) return; if (!is_sync) return; - inode = ctx->dentry->d_inode; + inode = d_inode(ctx->dentry); if (!list_empty(&NFS_I(inode)->open_files)) return; server = NFS_SERVER(inode); @@ -810,7 +818,7 @@ EXPORT_SYMBOL_GPL(get_nfs_open_context); static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) { - struct inode *inode = ctx->dentry->d_inode; + struct inode *inode = d_inode(ctx->dentry); struct super_block *sb = ctx->dentry->d_sb; if (!list_empty(&ctx->list)) { @@ -842,7 +850,7 @@ EXPORT_SYMBOL_GPL(put_nfs_open_context); */ void nfs_inode_attach_open_context(struct nfs_open_context *ctx) { - struct inode *inode = ctx->dentry->d_inode; + struct inode *inode = d_inode(ctx->dentry); struct nfs_inode *nfsi = NFS_I(inode); spin_lock(&inode->i_lock); @@ -885,7 +893,7 @@ static void nfs_file_clear_open_context(struct file *filp) struct nfs_open_context *ctx = nfs_file_open_context(filp); if (ctx) { - struct inode *inode = ctx->dentry->d_inode; + struct inode *inode = d_inode(ctx->dentry); filp->private_data = NULL; spin_lock(&inode->i_lock); @@ -1237,9 +1245,11 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat if (fattr->valid & NFS_ATTR_FATTR_SIZE) { cur_size = i_size_read(inode); new_isize = nfs_size_to_loff_t(fattr->size); - if (cur_size != new_isize && nfsi->nrequests == 0) + if (cur_size != new_isize) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; } + if (nfsi->nrequests != 0) + invalid &= ~NFS_INO_REVAL_PAGECACHE; /* Have any file permissions changed? 
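The nfs_check_inode_attributes() change above is subtle: a server-side size change now always marks the attributes suspect, but the page-cache revalidation bit is stripped while requests are outstanding, so the client neither ignores the size change entirely nor zaps the cache under its own dirty data. As a decision table:

    /* cur_size != new_isize  -> invalid |= INVALID_ATTR | REVAL_PAGECACHE
     * nfsi->nrequests != 0   -> invalid &= ~REVAL_PAGECACHE
     * net effect with writes in flight: refetch attributes later,
     * but keep the page cache intact. */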
*/ if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) @@ -1588,6 +1598,19 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa } EXPORT_SYMBOL_GPL(nfs_post_op_update_inode_force_wcc); + +static inline bool nfs_fileid_valid(struct nfs_inode *nfsi, + struct nfs_fattr *fattr) +{ + bool ret1 = true, ret2 = true; + + if (fattr->valid & NFS_ATTR_FATTR_FILEID) + ret1 = (nfsi->fileid == fattr->fileid); + if (fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) + ret2 = (nfsi->fileid == fattr->mounted_on_fileid); + return ret1 || ret2; +} + /* * Many nfs protocol calls return the new file attributes after * an operation. Here we update the inode to reflect the state @@ -1614,7 +1637,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfs_display_fhandle_hash(NFS_FH(inode)), atomic_read(&inode->i_count), fattr->valid); - if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid) { + if (!nfs_fileid_valid(nfsi, fattr)) { printk(KERN_ERR "NFS: server %s error: fileid changed\n" "fsid %s: expected fileid 0x%Lx, got 0x%Lx\n", NFS_SERVER(inode)->nfs_client->cl_hostname, @@ -1664,13 +1687,12 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_DATA | NFS_INO_INVALID_ACCESS - | NFS_INO_INVALID_ACL - | NFS_INO_REVAL_PAGECACHE; + | NFS_INO_INVALID_ACL; if (S_ISDIR(inode->i_mode)) nfs_force_lookup_revalidate(inode); inode->i_version = fattr->change_attr; } - } else if (server->caps & NFS_CAP_CHANGE_ATTR) + } else nfsi->cache_validity |= save_cache_validity; if (fattr->valid & NFS_ATTR_FATTR_MTIME) { @@ -1697,7 +1719,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if ((nfsi->nrequests == 0) || new_isize > cur_isize) { i_size_write(inode, new_isize); invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; - invalid &= ~NFS_INO_REVAL_PAGECACHE; } dprintk("NFS: isize change on server for file %s/%ld " "(%Ld to %Ld)\n", @@ -1819,7 +1840,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) struct inode *nfs_alloc_inode(struct super_block *sb) { struct nfs_inode *nfsi; - nfsi = (struct nfs_inode *)kmem_cache_alloc(nfs_inode_cachep, GFP_KERNEL); + nfsi = kmem_cache_alloc(nfs_inode_cachep, GFP_KERNEL); if (!nfsi) return NULL; nfsi->flags = 0UL; @@ -1990,17 +2011,15 @@ static int __init init_nfs_fs(void) if (err) goto out1; -#ifdef CONFIG_PROC_FS rpc_proc_register(&init_net, &nfs_rpcstat); -#endif - if ((err = register_nfs_fs()) != 0) + + err = register_nfs_fs(); + if (err) goto out0; return 0; out0: -#ifdef CONFIG_PROC_FS rpc_proc_unregister(&init_net, "nfs"); -#endif nfs_destroy_directcache(); out1: nfs_destroy_writepagecache(); @@ -2031,9 +2050,7 @@ static void __exit exit_nfs_fs(void) nfs_destroy_nfspagecache(); nfs_fscache_unregister(); unregister_pernet_subsys(&nfs_net_ops); -#ifdef CONFIG_PROC_FS rpc_proc_unregister(&init_net, "nfs"); -#endif unregister_nfs_fs(); nfs_fs_proc_exit(); nfsiod_stop(); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 9e6475bc5ba2..9b372b845f6a 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -296,6 +296,22 @@ extern struct rpc_procinfo nfs4_procedures[]; #ifdef CONFIG_NFS_V4_SECURITY_LABEL extern struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags); +static inline struct nfs4_label * +nfs4_label_copy(struct nfs4_label *dst, struct nfs4_label *src) +{ + if (!dst || !src) + return NULL; + + if 
(src->len > NFS4_MAXLABELLEN) + return NULL; + + dst->lfs = src->lfs; + dst->pi = src->pi; + dst->len = src->len; + memcpy(dst->label, src->label, src->len); + + return dst; +} static inline void nfs4_label_free(struct nfs4_label *label) { if (label) { @@ -316,6 +332,11 @@ static inline void nfs4_label_free(void *label) {} static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi) { } +static inline struct nfs4_label * +nfs4_label_copy(struct nfs4_label *dst, struct nfs4_label *src) +{ + return NULL; +} #endif /* CONFIG_NFS_V4_SECURITY_LABEL */ /* proc.c */ @@ -607,7 +628,7 @@ void nfs_mark_page_unstable(struct page *page) struct inode *inode = page_file_mapping(page)->host; inc_zone_page_state(page, NR_UNSTABLE_NFS); - inc_bdi_stat(inode_to_bdi(inode), BDI_RECLAIMABLE); + inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE); __mark_inode_dirty(inode, I_DIRTY_DATASYNC); } diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index b5a0afc3ee10..c8162c660c44 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -139,7 +139,7 @@ EXPORT_SYMBOL_GPL(nfs_path); struct vfsmount *nfs_d_automount(struct path *path) { struct vfsmount *mnt; - struct nfs_server *server = NFS_SERVER(path->dentry->d_inode); + struct nfs_server *server = NFS_SERVER(d_inode(path->dentry)); struct nfs_fh *fh = NULL; struct nfs_fattr *fattr = NULL; @@ -180,16 +180,16 @@ out_nofree: static int nfs_namespace_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - if (NFS_FH(dentry->d_inode)->size != 0) + if (NFS_FH(d_inode(dentry))->size != 0) return nfs_getattr(mnt, dentry, stat); - generic_fillattr(dentry->d_inode, stat); + generic_fillattr(d_inode(dentry), stat); return 0; } static int nfs_namespace_setattr(struct dentry *dentry, struct iattr *attr) { - if (NFS_FH(dentry->d_inode)->size != 0) + if (NFS_FH(d_inode(dentry))->size != 0) return nfs_setattr(dentry, attr); return -EACCES; } @@ -279,7 +279,7 @@ struct vfsmount *nfs_submount(struct nfs_server *server, struct dentry *dentry, struct dentry *parent = dget_parent(dentry); /* Look it up again to get its attributes */ - err = server->nfs_client->rpc_ops->lookup(parent->d_inode, &dentry->d_name, fh, fattr, NULL); + err = server->nfs_client->rpc_ops->lookup(d_inode(parent), &dentry->d_name, fh, fattr, NULL); dput(parent); if (err != 0) return ERR_PTR(err); diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 658e586ca438..1ebe2fc7cda2 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -279,7 +279,7 @@ nfs3_list_one_acl(struct inode *inode, int type, const char *name, void *data, ssize_t nfs3_listxattr(struct dentry *dentry, char *data, size_t size) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); ssize_t result = 0; int error; diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 1f11d2533ee4..cb28cceefebe 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -120,7 +120,7 @@ static int nfs3_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, struct iattr *sattr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct nfs3_sattrargs arg = { .fh = NFS_FH(inode), .sattr = sattr, @@ -386,13 +386,13 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, * not sure this buys us anything (and I'd have * to revamp the NFSv3 XDR code) */ status = nfs3_proc_setattr(dentry, data->res.fattr, sattr); - nfs_post_op_update_inode(dentry->d_inode, data->res.fattr); + nfs_post_op_update_inode(d_inode(dentry), data->res.fattr); 
dprintk("NFS reply setattr (post-create): %d\n", status); if (status != 0) goto out_release_acls; } - status = nfs3_proc_setacls(dentry->d_inode, acl, default_acl); + status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl); out_release_acls: posix_acl_release(acl); @@ -570,7 +570,7 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) if (status != 0) goto out_release_acls; - status = nfs3_proc_setacls(dentry->d_inode, acl, default_acl); + status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl); out_release_acls: posix_acl_release(acl); @@ -623,7 +623,7 @@ static int nfs3_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, u64 cookie, struct page **pages, unsigned int count, int plus) { - struct inode *dir = dentry->d_inode; + struct inode *dir = d_inode(dentry); __be32 *verf = NFS_I(dir)->cookieverf; struct nfs3_readdirargs arg = { .fh = NFS_FH(dir), @@ -715,7 +715,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, if (status != 0) goto out_release_acls; - status = nfs3_proc_setacls(dentry->d_inode, acl, default_acl); + status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl); out_release_acls: posix_acl_release(acl); diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 53852a4bd88b..9b04c2e6fffc 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -1342,7 +1342,7 @@ static void nfs3_xdr_enc_setacl3args(struct rpc_rqst *req, if (args->npages != 0) xdr_write_pages(xdr, args->pages, 0, args->len); else - xdr_reserve_space(xdr, NFS_ACL_INLINE_BUFSIZE); + xdr_reserve_space(xdr, args->len); error = nfsacl_encode(xdr->buf, base, args->inode, (args->mask & NFS_ACL) ? diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h index 7afb8947dfdf..ff66ae700b89 100644 --- a/fs/nfs/nfs42.h +++ b/fs/nfs/nfs42.h @@ -5,11 +5,18 @@ #ifndef __LINUX_FS_NFS_NFS4_2_H #define __LINUX_FS_NFS_NFS4_2_H +/* + * FIXME: four LAYOUTSTATS calls per compound at most! Do we need to support + * more? Need to consider not to pre-alloc too much for a compound. 
+ */ +#define PNFS_LAYOUTSTATS_MAXDEV (4) + /* nfs4.2proc.c */ int nfs42_proc_allocate(struct file *, loff_t, loff_t); int nfs42_proc_deallocate(struct file *, loff_t, loff_t); loff_t nfs42_proc_llseek(struct file *, loff_t, int); - +int nfs42_proc_layoutstats_generic(struct nfs_server *, + struct nfs42_layoutstat_data *); /* nfs4.2xdr.h */ extern struct rpc_procinfo nfs4_2_procedures[]; diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index cb170722769c..d731bbf974aa 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -10,6 +10,11 @@ #include <linux/nfs_fs.h> #include "nfs4_fs.h" #include "nfs42.h" +#include "iostat.h" +#include "pnfs.h" +#include "internal.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS static int nfs42_set_rw_stateid(nfs4_stateid *dst, struct file *file, fmode_t fmode) @@ -36,13 +41,16 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, loff_t offset, loff_t len) { struct inode *inode = file_inode(filep); + struct nfs_server *server = NFS_SERVER(inode); struct nfs42_falloc_args args = { .falloc_fh = NFS_FH(inode), .falloc_offset = offset, .falloc_length = len, + .falloc_bitmask = server->cache_consistency_bitmask, + }; + struct nfs42_falloc_res res = { + .falloc_server = server, }; - struct nfs42_falloc_res res; - struct nfs_server *server = NFS_SERVER(inode); int status; msg->rpc_argp = &args; @@ -52,8 +60,17 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, if (status) return status; - return nfs4_call_sync(server->client, server, msg, - &args.seq_args, &res.seq_res, 0); + res.falloc_fattr = nfs_alloc_fattr(); + if (!res.falloc_fattr) + return -ENOMEM; + + status = nfs4_call_sync(server->client, server, msg, + &args.seq_args, &res.seq_res, 0); + if (status == 0) + status = nfs_post_op_update_inode(inode, res.falloc_fattr); + + kfree(res.falloc_fattr); + return status; } static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, @@ -84,9 +101,13 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len) if (!nfs_server_capable(inode, NFS_CAP_ALLOCATE)) return -EOPNOTSUPP; + mutex_lock(&inode->i_mutex); + err = nfs42_proc_fallocate(&msg, filep, offset, len); if (err == -EOPNOTSUPP) NFS_SERVER(inode)->caps &= ~NFS_CAP_ALLOCATE; + + mutex_unlock(&inode->i_mutex); return err; } @@ -101,13 +122,20 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len) if (!nfs_server_capable(inode, NFS_CAP_DEALLOCATE)) return -EOPNOTSUPP; + nfs_wb_all(inode); + mutex_lock(&inode->i_mutex); + err = nfs42_proc_fallocate(&msg, filep, offset, len); + if (err == 0) + truncate_pagecache_range(inode, offset, (offset + len) -1); if (err == -EOPNOTSUPP) NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE; + + mutex_unlock(&inode->i_mutex); return err; } -loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence) +static loff_t _nfs42_proc_llseek(struct file *filep, loff_t offset, int whence) { struct inode *inode = file_inode(filep); struct nfs42_seek_args args = { @@ -142,3 +170,102 @@ loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence) return vfs_setpos(filep, res.sr_offset, inode->i_sb->s_maxbytes); } + +loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence) +{ + struct nfs_server *server = NFS_SERVER(file_inode(filep)); + struct nfs4_exception exception = { }; + int err; + + do { + err = _nfs42_proc_llseek(filep, offset, whence); + if (err == -ENOTSUPP) + return -EOPNOTSUPP; + err = nfs4_handle_exception(server, err, &exception); + } 
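The DEALLOCATE path above now behaves like a local hole punch: flush dirty pages, take i_mutex, punch on the server, then drop the page cache over the punched range. truncate_pagecache_range() takes an inclusive end offset, which is why the hunk passes (offset + len) - 1; for example:

    /* offset = 4096, len = 8192: the stale range is [4096, 12287] */
    truncate_pagecache_range(inode, 4096, 4096 + 8192 - 1);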
while (exception.retry); + + return err; +} + + +static void +nfs42_layoutstat_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs42_layoutstat_data *data = calldata; + struct nfs_server *server = NFS_SERVER(data->args.inode); + + nfs41_setup_sequence(nfs4_get_session(server), &data->args.seq_args, + &data->res.seq_res, task); +} + +static void +nfs42_layoutstat_done(struct rpc_task *task, void *calldata) +{ + struct nfs42_layoutstat_data *data = calldata; + + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return; + + switch (task->tk_status) { + case 0: + break; + case -ENOTSUPP: + case -EOPNOTSUPP: + NFS_SERVER(data->inode)->caps &= ~NFS_CAP_LAYOUTSTATS; + default: + dprintk("%s server returns %d\n", __func__, task->tk_status); + } +} + +static void +nfs42_layoutstat_release(void *calldata) +{ + struct nfs42_layoutstat_data *data = calldata; + struct nfs_server *nfss = NFS_SERVER(data->args.inode); + + if (nfss->pnfs_curr_ld->cleanup_layoutstats) + nfss->pnfs_curr_ld->cleanup_layoutstats(data); + + pnfs_put_layout_hdr(NFS_I(data->args.inode)->layout); + smp_mb__before_atomic(); + clear_bit(NFS_INO_LAYOUTSTATS, &NFS_I(data->args.inode)->flags); + smp_mb__after_atomic(); + nfs_iput_and_deactive(data->inode); + kfree(data->args.devinfo); + kfree(data); +} + +static const struct rpc_call_ops nfs42_layoutstat_ops = { + .rpc_call_prepare = nfs42_layoutstat_prepare, + .rpc_call_done = nfs42_layoutstat_done, + .rpc_release = nfs42_layoutstat_release, +}; + +int nfs42_proc_layoutstats_generic(struct nfs_server *server, + struct nfs42_layoutstat_data *data) +{ + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTSTATS], + .rpc_argp = &data->args, + .rpc_resp = &data->res, + }; + struct rpc_task_setup task_setup = { + .rpc_client = server->client, + .rpc_message = &msg, + .callback_ops = &nfs42_layoutstat_ops, + .callback_data = data, + .flags = RPC_TASK_ASYNC, + }; + struct rpc_task *task; + + data->inode = nfs_igrab_and_active(data->args.inode); + if (!data->inode) { + nfs42_layoutstat_release(data); + return -EAGAIN; + } + nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0); + task = rpc_run_task(&task_setup); + if (IS_ERR(task)) + return PTR_ERR(task); + return 0; +} diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 038a7e1521fa..a6bd27da6286 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -4,6 +4,8 @@ #ifndef __LINUX_FS_NFS_NFS4_2XDR_H #define __LINUX_FS_NFS_NFS4_2XDR_H +#include "nfs42.h" + #define encode_fallocate_maxsz (encode_stateid_maxsz + \ 2 /* offset */ + \ 2 /* length */) @@ -22,25 +24,47 @@ 1 /* whence */ + \ 2 /* offset */ + \ 2 /* length */) +#define encode_io_info_maxsz 4 +#define encode_layoutstats_maxsz (op_decode_hdr_maxsz + \ + 2 /* offset */ + \ + 2 /* length */ + \ + encode_stateid_maxsz + \ + encode_io_info_maxsz + \ + encode_io_info_maxsz + \ + 1 /* opaque devaddr4 length */ + \ + XDR_QUADLEN(PNFS_LAYOUTSTATS_MAXSIZE)) +#define decode_layoutstats_maxsz (op_decode_hdr_maxsz) #define NFS4_enc_allocate_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ - encode_allocate_maxsz) + encode_allocate_maxsz + \ + encode_getattr_maxsz) #define NFS4_dec_allocate_sz (compound_decode_hdr_maxsz + \ decode_putfh_maxsz + \ - decode_allocate_maxsz) + decode_allocate_maxsz + \ + decode_getattr_maxsz) #define NFS4_enc_deallocate_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ - encode_deallocate_maxsz) + encode_deallocate_maxsz + \ + encode_getattr_maxsz) #define NFS4_dec_deallocate_sz 
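LAYOUTSTATS is sent fire-and-forget: rpc_run_task() with RPC_TASK_ASYNC, with all teardown concentrated in the release callback, which the early -EAGAIN path also invokes by hand. The lifecycle as wired above:

    /* nfs42_proc_layoutstats_generic()
     *   -> rpc_run_task(RPC_TASK_ASYNC)
     *        .rpc_call_prepare: claim a session slot (sequence setup)
     *        .rpc_call_done:    sequence done; ENOTSUPP/EOPNOTSUPP
     *                           clears NFS_CAP_LAYOUTSTATS
     *        .rpc_release:      cleanup_layoutstats, put layout hdr,
     *                           clear NFS_INO_LAYOUTSTATS, iput the
     *                           inode, kfree devinfo + data
     *   igrab failure: call release directly, return -EAGAIN */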
(compound_decode_hdr_maxsz + \ decode_putfh_maxsz + \ - decode_deallocate_maxsz) + decode_deallocate_maxsz + \ + decode_getattr_maxsz) #define NFS4_enc_seek_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ encode_seek_maxsz) #define NFS4_dec_seek_sz (compound_decode_hdr_maxsz + \ decode_putfh_maxsz + \ decode_seek_maxsz) +#define NFS4_enc_layoutstats_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + PNFS_LAYOUTSTATS_MAXDEV * encode_layoutstats_maxsz) +#define NFS4_dec_layoutstats_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + PNFS_LAYOUTSTATS_MAXDEV * decode_layoutstats_maxsz) static void encode_fallocate(struct xdr_stream *xdr, @@ -77,6 +101,33 @@ static void encode_seek(struct xdr_stream *xdr, encode_uint32(xdr, args->sa_what); } +static void encode_layoutstats(struct xdr_stream *xdr, + struct nfs42_layoutstat_args *args, + struct nfs42_layoutstat_devinfo *devinfo, + struct compound_hdr *hdr) +{ + __be32 *p; + + encode_op_hdr(xdr, OP_LAYOUTSTATS, decode_layoutstats_maxsz, hdr); + p = reserve_space(xdr, 8 + 8); + p = xdr_encode_hyper(p, devinfo->offset); + p = xdr_encode_hyper(p, devinfo->length); + encode_nfs4_stateid(xdr, &args->stateid); + p = reserve_space(xdr, 4*8 + NFS4_DEVICEID4_SIZE + 4); + p = xdr_encode_hyper(p, devinfo->read_count); + p = xdr_encode_hyper(p, devinfo->read_bytes); + p = xdr_encode_hyper(p, devinfo->write_count); + p = xdr_encode_hyper(p, devinfo->write_bytes); + p = xdr_encode_opaque_fixed(p, devinfo->dev_id.data, + NFS4_DEVICEID4_SIZE); + /* Encode layoutupdate4 */ + *p++ = cpu_to_be32(devinfo->layout_type); + if (devinfo->layoutstats_encode != NULL) + devinfo->layoutstats_encode(xdr, args, devinfo); + else + encode_uint32(xdr, 0); +} + /* * Encode ALLOCATE request */ @@ -92,6 +143,7 @@ static void nfs4_xdr_enc_allocate(struct rpc_rqst *req, encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->falloc_fh, &hdr); encode_allocate(xdr, args, &hdr); + encode_getfattr(xdr, args->falloc_bitmask, &hdr); encode_nops(&hdr); } @@ -110,6 +162,7 @@ static void nfs4_xdr_enc_deallocate(struct rpc_rqst *req, encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->falloc_fh, &hdr); encode_deallocate(xdr, args, &hdr); + encode_getfattr(xdr, args->falloc_bitmask, &hdr); encode_nops(&hdr); } @@ -131,6 +184,28 @@ static void nfs4_xdr_enc_seek(struct rpc_rqst *req, encode_nops(&hdr); } +/* + * Encode LAYOUTSTATS request + */ +static void nfs4_xdr_enc_layoutstats(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs42_layoutstat_args *args) +{ + int i; + + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + WARN_ON(args->num_dev > PNFS_LAYOUTSTATS_MAXDEV); + for (i = 0; i < args->num_dev; i++) + encode_layoutstats(xdr, args, &args->devinfo[i], &hdr); + encode_nops(&hdr); +} + static int decode_allocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res) { return decode_op_hdr(xdr, OP_ALLOCATE); @@ -163,6 +238,12 @@ out_overflow: return -EIO; } +static int decode_layoutstats(struct xdr_stream *xdr, + struct nfs42_layoutstat_res *res) +{ + return decode_op_hdr(xdr, OP_LAYOUTSTATS); +} + /* * Decode ALLOCATE request */ @@ -183,6 +264,9 @@ static int nfs4_xdr_dec_allocate(struct rpc_rqst *rqstp, if (status) goto out; status = decode_allocate(xdr, res); + if (status) + goto out; + decode_getfattr(xdr, 
res->falloc_fattr, res->falloc_server); out: return status; } @@ -207,6 +291,9 @@ static int nfs4_xdr_dec_deallocate(struct rpc_rqst *rqstp, if (status) goto out; status = decode_deallocate(xdr, res); + if (status) + goto out; + decode_getfattr(xdr, res->falloc_fattr, res->falloc_server); out: return status; } @@ -234,4 +321,35 @@ static int nfs4_xdr_dec_seek(struct rpc_rqst *rqstp, out: return status; } + +/* + * Decode LAYOUTSTATS request + */ +static int nfs4_xdr_dec_layoutstats(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs42_layoutstat_res *res) +{ + struct compound_hdr hdr; + int status, i; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + WARN_ON(res->num_dev > PNFS_LAYOUTSTATS_MAXDEV); + for (i = 0; i < res->num_dev; i++) { + status = decode_layoutstats(xdr, res); + if (status) + goto out; + } +out: + res->rpc_status = status; + return status; +} + #endif /* __LINUX_FS_NFS_NFS4_2XDR_H */ diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index fdef424b0cd3..ea3bee919a76 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -233,6 +233,7 @@ extern int nfs4_handle_exception(struct nfs_server *, int, struct nfs4_exception extern int nfs4_call_sync(struct rpc_clnt *, struct nfs_server *, struct rpc_message *, struct nfs4_sequence_args *, struct nfs4_sequence_res *, int); +extern void nfs4_init_sequence(struct nfs4_sequence_args *, struct nfs4_sequence_res *, int); extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *); extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *); extern int nfs4_proc_get_rootfh(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *, bool); diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 86d6214ea022..3aa6a9ba5113 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -4,7 +4,6 @@ */ #include <linux/module.h> #include <linux/nfs_fs.h> -#include <linux/nfs_idmap.h> #include <linux/nfs_mount.h> #include <linux/sunrpc/addr.h> #include <linux/sunrpc/auth.h> @@ -15,6 +14,7 @@ #include "callback.h" #include "delegation.h" #include "nfs4session.h" +#include "nfs4idmap.h" #include "pnfs.h" #include "netns.h" @@ -676,7 +676,6 @@ found: break; } - /* No matching nfs_client found. 
*/ spin_unlock(&nn->nfs_client_lock); dprintk("NFS: <-- %s status = %d\n", __func__, status); nfs_put_client(prev); @@ -1130,7 +1129,7 @@ error: */ static int nfs_probe_destination(struct nfs_server *server) { - struct inode *inode = server->super->s_root->d_inode; + struct inode *inode = d_inode(server->super->s_root); struct nfs_fattr *fattr; int error; diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 8b46389c4c5b..dcd39d4e2efe 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -10,6 +10,8 @@ #include "fscache.h" #include "pnfs.h" +#include "nfstrace.h" + #ifdef CONFIG_NFS_V4_2 #include "nfs42.h" #endif @@ -39,6 +41,10 @@ nfs4_file_open(struct inode *inode, struct file *filp) dprintk("NFS: open file(%pd2)\n", dentry); + err = nfs_check_flags(openflags); + if (err) + return err; + if ((openflags & O_ACCMODE) == 3) openflags--; @@ -46,7 +52,7 @@ nfs4_file_open(struct inode *inode, struct file *filp) openflags &= ~(O_CREAT|O_EXCL); parent = dget_parent(dentry); - dir = parent->d_inode; + dir = d_inode(parent); ctx = alloc_nfs_open_context(filp->f_path.dentry, filp->f_mode); err = PTR_ERR(ctx); @@ -57,7 +63,7 @@ nfs4_file_open(struct inode *inode, struct file *filp) if (openflags & O_TRUNC) { attr.ia_valid |= ATTR_SIZE; attr.ia_size = 0; - nfs_wb_all(inode); + nfs_sync_inode(inode); } inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr, &opened); @@ -74,7 +80,7 @@ nfs4_file_open(struct inode *inode, struct file *filp) goto out_drop; } } - if (inode != dentry->d_inode) + if (inode != d_inode(dentry)) goto out_drop; nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); @@ -100,6 +106,9 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) int ret; struct inode *inode = file_inode(file); + trace_nfs_fsync_enter(inode); + + nfs_inode_dio_wait(inode); do { ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret != 0) @@ -107,7 +116,7 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) mutex_lock(&inode->i_mutex); ret = nfs_file_fsync_commit(file, start, end, datasync); if (!ret) - ret = pnfs_layoutcommit_inode(inode, true); + ret = pnfs_sync_inode(inode, !!datasync); mutex_unlock(&inode->i_mutex); /* * If nfs_file_fsync_commit detected a server reboot, then @@ -118,6 +127,7 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) end = LLONG_MAX; } while (ret == -EAGAIN); + trace_nfs_fsync_exit(inode, ret); return ret; } @@ -152,15 +162,9 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t if (ret < 0) return ret; - mutex_lock(&inode->i_mutex); if (mode & FALLOC_FL_PUNCH_HOLE) - ret = nfs42_proc_deallocate(filep, offset, len); - else - ret = nfs42_proc_allocate(filep, offset, len); - mutex_unlock(&inode->i_mutex); - - nfs_zap_caches(inode); - return ret; + return nfs42_proc_deallocate(filep, offset, len); + return nfs42_proc_allocate(filep, offset, len); } #endif /* CONFIG_NFS_V4_2 */ @@ -170,8 +174,6 @@ const struct file_operations nfs4_file_operations = { #else .llseek = nfs_file_llseek, #endif - .read = new_sync_read, - .write = new_sync_write, .read_iter = nfs_file_read, .write_iter = nfs_file_write, .mmap = nfs_file_mmap, diff --git a/fs/nfs/nfs4getroot.c b/fs/nfs/nfs4getroot.c index c0b3a16b4a00..039b3eb6d834 100644 --- a/fs/nfs/nfs4getroot.c +++ b/fs/nfs/nfs4getroot.c @@ -35,13 +35,6 @@ int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool auth_p goto out; } - if (fsinfo.fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) { - 
printk(KERN_ERR "nfs4_get_rootfh:" - " getroot obtained referral\n"); - ret = -EREMOTE; - goto out; - } - memcpy(&server->fsid, &fsinfo.fattr->fsid, sizeof(server->fsid)); out: nfs_free_fattr(fsinfo.fattr); diff --git a/fs/nfs/idmap.c b/fs/nfs/nfs4idmap.c index 857e2a99acc8..535dfc69c628 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -36,7 +36,6 @@ #include <linux/types.h> #include <linux/parser.h> #include <linux/fs.h> -#include <linux/nfs_idmap.h> #include <net/net_namespace.h> #include <linux/sunrpc/rpc_pipe_fs.h> #include <linux/nfs_fs.h> @@ -49,6 +48,7 @@ #include "internal.h" #include "netns.h" +#include "nfs4idmap.h" #include "nfs4trace.h" #define NFS_UINT_MAXLEN 11 @@ -494,12 +494,7 @@ nfs_idmap_delete(struct nfs_client *clp) int nfs_idmap_init(void) { - int ret; - ret = nfs_idmap_init_keyring(); - if (ret != 0) - goto out; -out: - return ret; + return nfs_idmap_init_keyring(); } void nfs_idmap_quit(void) diff --git a/fs/nfs/nfs4idmap.h b/fs/nfs/nfs4idmap.h new file mode 100644 index 000000000000..de44d7330ab3 --- /dev/null +++ b/fs/nfs/nfs4idmap.h @@ -0,0 +1,68 @@ +/* + * fs/nfs/nfs4idmap.h + * + * UID and GID to name mapping for clients. + * + * Copyright (c) 2002 The Regents of the University of Michigan. + * All rights reserved. + * + * Marius Aamodt Eriksen <marius@umich.edu> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +#ifndef NFS_IDMAP_H +#define NFS_IDMAP_H + +#include <linux/uidgid.h> +#include <uapi/linux/nfs_idmap.h> + + +/* Forward declaration to make this header independent of others */ +struct nfs_client; +struct nfs_server; +struct nfs_fattr; +struct nfs4_string; + +int nfs_idmap_init(void); +void nfs_idmap_quit(void); +int nfs_idmap_new(struct nfs_client *); +void nfs_idmap_delete(struct nfs_client *); + +void nfs_fattr_init_names(struct nfs_fattr *fattr, + struct nfs4_string *owner_name, + struct nfs4_string *group_name); +void nfs_fattr_free_names(struct nfs_fattr *); +void nfs_fattr_map_and_free_names(struct nfs_server *, struct nfs_fattr *); + +int nfs_map_name_to_uid(const struct nfs_server *, const char *, size_t, kuid_t *); +int nfs_map_group_to_gid(const struct nfs_server *, const char *, size_t, kgid_t *); +int nfs_map_uid_to_name(const struct nfs_server *, kuid_t, char *, size_t); +int nfs_map_gid_to_group(const struct nfs_server *, kgid_t, char *, size_t); + +int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res); + +extern unsigned int nfs_idmap_cache_timeout; +#endif /* NFS_IDMAP_H */ diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 3d83cb1fdc70..f592672373cb 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -375,7 +375,7 @@ static struct vfsmount *nfs_do_refmount(struct rpc_clnt *client, struct dentry * dprintk("%s: getting locations for %pd2\n", __func__, dentry); - err = nfs4_proc_fs_locations(client, parent->d_inode, &dentry->d_name, fs_locations, page); + err = nfs4_proc_fs_locations(client, d_inode(parent), &dentry->d_name, fs_locations, page); dput(parent); if (err != 0 || fs_locations->nlocations <= 0 || @@ -396,7 +396,7 @@ struct vfsmount *nfs4_submount(struct nfs_server *server, struct dentry *dentry, { rpc_authflavor_t flavor = server->client->cl_auth->au_flavor; struct dentry *parent = dget_parent(dentry); - struct inode *dir = parent->d_inode; + struct inode *dir = d_inode(parent); struct qstr *name = &dentry->d_name; struct rpc_clnt *client; struct vfsmount *mnt; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 627f37c44456..3acb1eb72930 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -38,6 +38,7 @@ #include <linux/mm.h> #include <linux/delay.h> #include <linux/errno.h> +#include <linux/file.h> #include <linux/string.h> #include <linux/ratelimit.h> #include <linux/printk.h> @@ -51,7 +52,6 @@ #include <linux/namei.h> #include <linux/mount.h> #include <linux/module.h> -#include <linux/nfs_idmap.h> #include <linux/xattr.h> #include <linux/utsname.h> #include <linux/freezer.h> @@ -63,6 +63,7 @@ #include "callback.h" #include "pnfs.h" #include "netns.h" +#include "nfs4idmap.h" #include "nfs4session.h" #include "fscache.h" @@ -185,7 +186,8 @@ const u32 nfs4_fattr_bitmap[3] = { | FATTR4_WORD1_SPACE_USED | FATTR4_WORD1_TIME_ACCESS | FATTR4_WORD1_TIME_METADATA - | FATTR4_WORD1_TIME_MODIFY, + | FATTR4_WORD1_TIME_MODIFY + | FATTR4_WORD1_MOUNTED_ON_FILEID, #ifdef CONFIG_NFS_V4_SECURITY_LABEL FATTR4_WORD2_SECURITY_LABEL #endif @@ -293,7 +295,7 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent *p++ = xdr_one; /* bitmap length */ *p++ = htonl(FATTR4_WORD0_FILEID); /* bitmap */ *p++ = htonl(8); /* attribute buffer length */ - p = xdr_encode_hyper(p, NFS_FILEID(dentry->d_inode)); + p = xdr_encode_hyper(p, NFS_FILEID(d_inode(dentry))); } *p++ = xdr_one; /* next */ @@ -305,7 +307,7 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent *p++ = 
xdr_one; /* bitmap length */ *p++ = htonl(FATTR4_WORD0_FILEID); /* bitmap */ *p++ = htonl(8); /* attribute buffer length */ - p = xdr_encode_hyper(p, NFS_FILEID(dentry->d_parent->d_inode)); + p = xdr_encode_hyper(p, NFS_FILEID(d_inode(dentry->d_parent))); readdir->pgbase = (char *)p - (char *)start; readdir->count -= readdir->pgbase; @@ -354,6 +356,9 @@ int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_ case 0: return 0; case -NFS4ERR_OPENMODE: + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_BAD_STATEID: if (inode && nfs4_have_delegation(inode, FMODE_READ)) { nfs4_inode_return_delegation(inode); exception->retry = 1; @@ -365,15 +370,6 @@ int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_ if (ret < 0) break; goto wait_on_recovery; - case -NFS4ERR_DELEG_REVOKED: - case -NFS4ERR_ADMIN_REVOKED: - case -NFS4ERR_BAD_STATEID: - if (state == NULL) - break; - ret = nfs4_schedule_stateid_recovery(server, state); - if (ret < 0) - break; - goto wait_on_recovery; case -NFS4ERR_EXPIRED: if (state != NULL) { ret = nfs4_schedule_stateid_recovery(server, state); @@ -471,7 +467,10 @@ static void do_renew_lease(struct nfs_client *clp, unsigned long timestamp) static void renew_lease(const struct nfs_server *server, unsigned long timestamp) { - do_renew_lease(server->nfs_client, timestamp); + struct nfs_client *clp = server->nfs_client; + + if (!nfs4_has_session(clp)) + do_renew_lease(clp, timestamp); } struct nfs4_call_sync_data { @@ -480,8 +479,8 @@ struct nfs4_call_sync_data { struct nfs4_sequence_res *seq_res; }; -static void nfs4_init_sequence(struct nfs4_sequence_args *args, - struct nfs4_sequence_res *res, int cache_reply) +void nfs4_init_sequence(struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, int cache_reply) { args->sa_slot = NULL; args->sa_cache_this = cache_reply; @@ -620,8 +619,7 @@ int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) clp = session->clp; do_renew_lease(clp, res->sr_timestamp); /* Check sequence flags */ - if (res->sr_status_flags != 0) - nfs4_schedule_lease_recovery(clp); + nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); nfs41_update_target_slotid(slot->table, slot, res); break; case 1: @@ -914,6 +912,7 @@ struct nfs4_opendata { struct nfs_open_confirmres c_res; struct nfs4_string owner_name; struct nfs4_string group_name; + struct nfs4_label *a_label; struct nfs_fattr f_attr; struct nfs4_label *f_label; struct dentry *dir; @@ -1004,7 +1003,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, gfp_t gfp_mask) { struct dentry *parent = dget_parent(dentry); - struct inode *dir = parent->d_inode; + struct inode *dir = d_inode(parent); struct nfs_server *server = NFS_SERVER(dir); struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t); struct nfs4_opendata *p; @@ -1017,6 +1016,10 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, if (IS_ERR(p->f_label)) goto err_free_p; + p->a_label = nfs4_label_alloc(server, gfp_mask); + if (IS_ERR(p->a_label)) + goto err_free_f; + alloc_seqid = server->nfs_client->cl_mvops->alloc_seqid; p->o_arg.seqid = alloc_seqid(&sp->so_seqid, gfp_mask); if (IS_ERR(p->o_arg.seqid)) @@ -1045,7 +1048,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, p->o_arg.server = server; p->o_arg.bitmask = nfs4_bitmask(server, label); p->o_arg.open_bitmap = &nfs4_fattr_bitmap[0]; - p->o_arg.label = label; + p->o_arg.label = 
nfs4_label_copy(p->a_label, label); p->o_arg.claim = nfs4_map_atomic_open_claim(server, claim); switch (p->o_arg.claim) { case NFS4_OPEN_CLAIM_NULL: @@ -1057,7 +1060,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, case NFS4_OPEN_CLAIM_FH: case NFS4_OPEN_CLAIM_DELEG_CUR_FH: case NFS4_OPEN_CLAIM_DELEG_PREV_FH: - p->o_arg.fh = NFS_FH(dentry->d_inode); + p->o_arg.fh = NFS_FH(d_inode(dentry)); } if (attrs != NULL && attrs->ia_valid != 0) { __u32 verf[2]; @@ -1078,6 +1081,8 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, return p; err_free_label: + nfs4_label_free(p->a_label); +err_free_f: nfs4_label_free(p->f_label); err_free_p: kfree(p); @@ -1097,6 +1102,7 @@ static void nfs4_opendata_free(struct kref *kref) nfs4_put_open_state(p->state); nfs4_put_state_owner(p->owner); + nfs4_label_free(p->a_label); nfs4_label_free(p->f_label); dput(p->dir); @@ -1202,12 +1208,15 @@ static bool nfs_need_update_open_stateid(struct nfs4_state *state, static void nfs_resync_open_stateid_locked(struct nfs4_state *state) { + if (!(state->n_wronly || state->n_rdonly || state->n_rdwr)) + return; if (state->n_wronly) set_bit(NFS_O_WRONLY_STATE, &state->flags); if (state->n_rdonly) set_bit(NFS_O_RDONLY_STATE, &state->flags); if (state->n_rdwr) set_bit(NFS_O_RDWR_STATE, &state->flags); + set_bit(NFS_OPEN_STATE, &state->flags); } static void nfs_clear_open_stateid_locked(struct nfs4_state *state, @@ -1551,6 +1560,13 @@ static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, fmode_t fmod struct nfs4_state *newstate; int ret; + if ((opendata->o_arg.claim == NFS4_OPEN_CLAIM_DELEGATE_CUR || + opendata->o_arg.claim == NFS4_OPEN_CLAIM_DELEG_CUR_FH) && + (opendata->o_arg.u.delegation_type & fmode) != fmode) + /* This mode can't have been delegated, so we must have + * a valid open_stateid to cover it - no need to reclaim. 
+ */ + return 0; opendata->o_arg.open_flags = 0; opendata->o_arg.fmode = fmode; opendata->o_arg.share_access = nfs4_map_atomic_open_share( @@ -1682,6 +1698,7 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct "%d.\n", __func__, err); case 0: case -ENOENT: + case -EAGAIN: case -ESTALE: break; case -NFS4ERR_BADSESSION: @@ -1794,7 +1811,7 @@ static const struct rpc_call_ops nfs4_open_confirm_ops = { */ static int _nfs4_proc_open_confirm(struct nfs4_opendata *data) { - struct nfs_server *server = NFS_SERVER(data->dir->d_inode); + struct nfs_server *server = NFS_SERVER(d_inode(data->dir)); struct rpc_task *task; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_CONFIRM], @@ -1951,7 +1968,7 @@ static const struct rpc_call_ops nfs4_open_ops = { static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover) { - struct inode *dir = data->dir->d_inode; + struct inode *dir = d_inode(data->dir); struct nfs_server *server = NFS_SERVER(dir); struct nfs_openargs *o_arg = &data->o_arg; struct nfs_openres *o_res = &data->o_res; @@ -1998,7 +2015,7 @@ static int nfs4_run_open_task(struct nfs4_opendata *data, int isrecover) static int _nfs4_recover_proc_open(struct nfs4_opendata *data) { - struct inode *dir = data->dir->d_inode; + struct inode *dir = d_inode(data->dir); struct nfs_openres *o_res = &data->o_res; int status; @@ -2067,7 +2084,7 @@ static int nfs4_opendata_access(struct rpc_cred *cred, */ static int _nfs4_proc_open(struct nfs4_opendata *data) { - struct inode *dir = data->dir->d_inode; + struct inode *dir = d_inode(data->dir); struct nfs_server *server = NFS_SERVER(dir); struct nfs_openargs *o_arg = &data->o_arg; struct nfs_openres *o_res = &data->o_res; @@ -2314,7 +2331,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); dentry = opendata->dentry; - if (dentry->d_inode == NULL) { + if (d_really_is_negative(dentry)) { /* FIXME: Is this d_drop() ever needed? 
*/ d_drop(dentry); dentry = d_add_unique(dentry, igrab(state->inode)); @@ -2325,7 +2342,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, ctx->dentry = dget(dentry); } nfs_set_verifier(dentry, - nfs_save_change_attribute(opendata->dir->d_inode)); + nfs_save_change_attribute(d_inode(opendata->dir))); } ret = nfs4_opendata_access(sp->so_cred, opendata, state, fmode, flags); @@ -2333,7 +2350,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, goto out; ctx->state = state; - if (dentry->d_inode == state->inode) { + if (d_inode(dentry) == state->inode) { nfs_inode_attach_open_context(ctx); if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) nfs4_schedule_stateid_recovery(server, state); @@ -2374,10 +2391,10 @@ static int _nfs4_do_open(struct inode *dir, status = nfs4_recover_expired_lease(server); if (status != 0) goto err_put_state_owner; - if (dentry->d_inode != NULL) - nfs4_return_incompatible_delegation(dentry->d_inode, fmode); + if (d_really_is_positive(dentry)) + nfs4_return_incompatible_delegation(d_inode(dentry), fmode); status = -ENOMEM; - if (dentry->d_inode) + if (d_really_is_positive(dentry)) claim = NFS4_OPEN_CLAIM_FH; opendata = nfs4_opendata_alloc(dentry, sp, fmode, flags, sattr, label, claim, GFP_KERNEL); @@ -2400,8 +2417,8 @@ static int _nfs4_do_open(struct inode *dir, } opendata->o_arg.open_bitmap = &nfs4_pnfs_open_bitmap[0]; } - if (dentry->d_inode != NULL) - opendata->state = nfs4_get_open_state(dentry->d_inode, sp); + if (d_really_is_positive(dentry)) + opendata->state = nfs4_get_open_state(d_inode(dentry), sp); status = _nfs4_open_and_get_state(opendata, fmode, flags, ctx); if (status != 0) @@ -3095,16 +3112,13 @@ int nfs4_proc_get_rootfh(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fsinfo *info, bool auth_probe) { - int status; + int status = 0; - switch (auth_probe) { - case false: + if (!auth_probe) status = nfs4_lookup_root(server, fhandle, info); - if (status != -NFS4ERR_WRONGSEC) - break; - default: + + if (auth_probe || status == -NFS4ERR_WRONGSEC) status = nfs4_do_find_root_sec(server, fhandle, info); - } if (status == 0) status = nfs4_server_capabilities(server, fhandle); @@ -3254,7 +3268,7 @@ static int nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, struct iattr *sattr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct rpc_cred *cred = NULL; struct nfs4_state *state = NULL; struct nfs4_label *label = NULL; @@ -3356,6 +3370,8 @@ static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir, goto out; case -NFS4ERR_MOVED: err = nfs4_get_referral(client, dir, name, fattr, fhandle); + if (err == -NFS4ERR_MOVED) + err = nfs4_handle_exception(NFS_SERVER(dir), err, &exception); goto out; case -NFS4ERR_WRONGSEC: err = -EPERM; @@ -3871,13 +3887,13 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, u64 cookie, struct page **pages, unsigned int count, int plus) { - struct inode *dir = dentry->d_inode; + struct inode *dir = d_inode(dentry); struct nfs4_readdir_arg args = { .fh = NFS_FH(dir), .pages = pages, .pgbase = 0, .count = count, - .bitmask = NFS_SERVER(dentry->d_inode)->attr_bitmask, + .bitmask = NFS_SERVER(d_inode(dentry))->attr_bitmask, .plus = plus, }; struct nfs4_readdir_res res; @@ -3914,8 +3930,8 @@ static int nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, do { err = _nfs4_proc_readdir(dentry, cred, cookie, pages, 
count, plus); - trace_nfs4_readdir(dentry->d_inode, err); - err = nfs4_handle_exception(NFS_SERVER(dentry->d_inode), err, + trace_nfs4_readdir(d_inode(dentry), err); + err = nfs4_handle_exception(NFS_SERVER(d_inode(dentry)), err, &exception); } while (exception.retry); return err; @@ -4830,7 +4846,7 @@ nfs4_set_security_label(struct dentry *dentry, const void *buf, size_t buflen) struct nfs4_label ilabel, *olabel = NULL; struct nfs_fattr fattr; struct rpc_cred *cred; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int status; if (!nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL)) @@ -4956,49 +4972,128 @@ static void nfs4_init_boot_verifier(const struct nfs_client *clp, memcpy(bootverf->data, verf, sizeof(bootverf->data)); } -static unsigned int -nfs4_init_nonuniform_client_string(struct nfs_client *clp, - char *buf, size_t len) +static int +nfs4_init_nonuniform_client_string(struct nfs_client *clp) { - unsigned int result; + int result; + size_t len; + char *str; + bool retried = false; if (clp->cl_owner_id != NULL) - return strlcpy(buf, clp->cl_owner_id, len); + return 0; +retry: + rcu_read_lock(); + len = 10 + strlen(clp->cl_ipaddr) + 1 + + strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)) + + 1 + + strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO)) + + 1; + rcu_read_unlock(); + + if (len > NFS4_OPAQUE_LIMIT + 1) + return -EINVAL; + + /* + * Since this string is allocated at mount time, and held until the + * nfs_client is destroyed, we can use GFP_KERNEL here w/o worrying + * about a memory-reclaim deadlock. + */ + str = kmalloc(len, GFP_KERNEL); + if (!str) + return -ENOMEM; rcu_read_lock(); - result = scnprintf(buf, len, "Linux NFSv4.0 %s/%s %s", - clp->cl_ipaddr, - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_ADDR), - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_PROTO)); + result = scnprintf(str, len, "Linux NFSv4.0 %s/%s %s", + clp->cl_ipaddr, + rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR), + rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO)); rcu_read_unlock(); - clp->cl_owner_id = kstrdup(buf, GFP_KERNEL); - return result; + + /* Did something change? */ + if (result >= len) { + kfree(str); + if (retried) + return -EINVAL; + retried = true; + goto retry; + } + clp->cl_owner_id = str; + return 0; } -static unsigned int -nfs4_init_uniform_client_string(struct nfs_client *clp, - char *buf, size_t len) +static int +nfs4_init_uniquifier_client_string(struct nfs_client *clp) { - const char *nodename = clp->cl_rpcclient->cl_nodename; - unsigned int result; + int result; + size_t len; + char *str; + + len = 10 + 10 + 1 + 10 + 1 + + strlen(nfs4_client_id_uniquifier) + 1 + + strlen(clp->cl_rpcclient->cl_nodename) + 1; + + if (len > NFS4_OPAQUE_LIMIT + 1) + return -EINVAL; + + /* + * Since this string is allocated at mount time, and held until the + * nfs_client is destroyed, we can use GFP_KERNEL here w/o worrying + * about a memory-reclaim deadlock. 
+ */ + str = kmalloc(len, GFP_KERNEL); + if (!str) + return -ENOMEM; + + result = scnprintf(str, len, "Linux NFSv%u.%u %s/%s", + clp->rpc_ops->version, clp->cl_minorversion, + nfs4_client_id_uniquifier, + clp->cl_rpcclient->cl_nodename); + if (result >= len) { + kfree(str); + return -EINVAL; + } + clp->cl_owner_id = str; + return 0; +} + +static int +nfs4_init_uniform_client_string(struct nfs_client *clp) +{ + int result; + size_t len; + char *str; if (clp->cl_owner_id != NULL) - return strlcpy(buf, clp->cl_owner_id, len); + return 0; if (nfs4_client_id_uniquifier[0] != '\0') - result = scnprintf(buf, len, "Linux NFSv%u.%u %s/%s", - clp->rpc_ops->version, - clp->cl_minorversion, - nfs4_client_id_uniquifier, - nodename); - else - result = scnprintf(buf, len, "Linux NFSv%u.%u %s", - clp->rpc_ops->version, clp->cl_minorversion, - nodename); - clp->cl_owner_id = kstrdup(buf, GFP_KERNEL); - return result; + return nfs4_init_uniquifier_client_string(clp); + + len = 10 + 10 + 1 + 10 + 1 + + strlen(clp->cl_rpcclient->cl_nodename) + 1; + + if (len > NFS4_OPAQUE_LIMIT + 1) + return -EINVAL; + + /* + * Since this string is allocated at mount time, and held until the + * nfs_client is destroyed, we can use GFP_KERNEL here w/o worrying + * about a memory-reclaim deadlock. + */ + str = kmalloc(len, GFP_KERNEL); + if (!str) + return -ENOMEM; + + result = scnprintf(str, len, "Linux NFSv%u.%u %s", + clp->rpc_ops->version, clp->cl_minorversion, + clp->cl_rpcclient->cl_nodename); + if (result >= len) { + kfree(str); + return -EINVAL; + } + clp->cl_owner_id = str; + return 0; } /* @@ -5045,7 +5140,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, struct nfs4_setclientid setclientid = { .sc_verifier = &sc_verifier, .sc_prog = program, - .sc_cb_ident = clp->cl_cb_ident, + .sc_clnt = clp, }; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID], @@ -5065,16 +5160,15 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, /* nfs_client_id4 */ nfs4_init_boot_verifier(clp, &sc_verifier); + if (test_bit(NFS_CS_MIGRATION, &clp->cl_flags)) - setclientid.sc_name_len = - nfs4_init_uniform_client_string(clp, - setclientid.sc_name, - sizeof(setclientid.sc_name)); + status = nfs4_init_uniform_client_string(clp); else - setclientid.sc_name_len = - nfs4_init_nonuniform_client_string(clp, - setclientid.sc_name, - sizeof(setclientid.sc_name)); + status = nfs4_init_nonuniform_client_string(clp); + + if (status) + goto out; + /* cb_client4 */ setclientid.sc_netid_len = nfs4_init_callback_netid(clp, @@ -5084,9 +5178,9 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, sizeof(setclientid.sc_uaddr), "%s.%u.%u", clp->cl_ipaddr, port >> 8, port & 255); - dprintk("NFS call setclientid auth=%s, '%.*s'\n", + dprintk("NFS call setclientid auth=%s, '%s'\n", clp->cl_rpcclient->cl_auth->au_ops->au_name, - setclientid.sc_name_len, setclientid.sc_name); + clp->cl_owner_id); task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) { status = PTR_ERR(task); @@ -5358,15 +5452,15 @@ static int nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock * return err; } -static int do_vfs_lock(struct file *file, struct file_lock *fl) +static int do_vfs_lock(struct inode *inode, struct file_lock *fl) { int res = 0; switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) { case FL_POSIX: - res = posix_lock_file_wait(file, fl); + res = posix_lock_inode_wait(inode, fl); break; case FL_FLOCK: - res = flock_lock_file_wait(file, fl); + res = flock_lock_inode_wait(inode, fl); break; 
default: BUG(); @@ -5426,7 +5520,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) switch (task->tk_status) { case 0: renew_lease(calldata->server, calldata->timestamp); - do_vfs_lock(calldata->fl.fl_file, &calldata->fl); + do_vfs_lock(calldata->lsp->ls_state->inode, &calldata->fl); if (nfs4_update_lock_stateid(calldata->lsp, &calldata->res.stateid)) break; @@ -5534,7 +5628,7 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock * mutex_lock(&sp->so_delegreturn_mutex); /* Exclude nfs4_reclaim_open_stateid() - note nesting! */ down_read(&nfsi->rwsem); - if (do_vfs_lock(request->fl_file, request) == -ENOENT) { + if (do_vfs_lock(inode, request) == -ENOENT) { up_read(&nfsi->rwsem); mutex_unlock(&sp->so_delegreturn_mutex); goto out; @@ -5606,6 +5700,7 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl, p->server = server; atomic_inc(&lsp->ls_count); p->ctx = get_nfs_open_context(ctx); + get_file(fl->fl_file); memcpy(&p->fl, fl, sizeof(p->fl)); return p; out_free_seqid: @@ -5670,11 +5765,11 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata) data->rpc_status = task->tk_status; switch (task->tk_status) { case 0: - renew_lease(NFS_SERVER(data->ctx->dentry->d_inode), + renew_lease(NFS_SERVER(d_inode(data->ctx->dentry)), data->timestamp); if (data->arg.new_lock) { data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS); - if (do_vfs_lock(data->fl.fl_file, &data->fl) < 0) { + if (do_vfs_lock(lsp->ls_state->inode, &data->fl) < 0) { rpc_restart_call_prepare(task); break; } @@ -5718,6 +5813,7 @@ static void nfs4_lock_release(void *calldata) nfs_free_seqid(data->arg.lock_seqid); nfs4_put_lock_state(data->lsp); put_nfs_open_context(data->ctx); + fput(data->fl.fl_file); kfree(data); dprintk("%s: done!\n", __func__); } @@ -5915,7 +6011,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock if (status != 0) goto out; request->fl_flags |= FL_ACCESS; - status = do_vfs_lock(request->fl_file, request); + status = do_vfs_lock(state->inode, request); if (status < 0) goto out; down_read(&nfsi->rwsem); @@ -5923,7 +6019,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock /* Yes: cache locks! */ /* ...but avoid races with delegation recall... 
*/ request->fl_flags = fl_flags & ~FL_SLEEP; - status = do_vfs_lock(request->fl_file, request); + status = do_vfs_lock(state->inode, request); up_read(&nfsi->rwsem); goto out; } @@ -6112,7 +6208,7 @@ static int nfs4_xattr_set_nfs4_acl(struct dentry *dentry, const char *key, if (strcmp(key, "") != 0) return -EINVAL; - return nfs4_proc_set_acl(dentry->d_inode, buf, buflen); + return nfs4_proc_set_acl(d_inode(dentry), buf, buflen); } static int nfs4_xattr_get_nfs4_acl(struct dentry *dentry, const char *key, @@ -6121,7 +6217,7 @@ static int nfs4_xattr_get_nfs4_acl(struct dentry *dentry, const char *key, if (strcmp(key, "") != 0) return -EINVAL; - return nfs4_proc_get_acl(dentry->d_inode, buf, buflen); + return nfs4_proc_get_acl(d_inode(dentry), buf, buflen); } static size_t nfs4_xattr_list_nfs4_acl(struct dentry *dentry, char *list, @@ -6130,7 +6226,7 @@ static size_t nfs4_xattr_list_nfs4_acl(struct dentry *dentry, char *list, { size_t len = sizeof(XATTR_NAME_NFSV4_ACL); - if (!nfs4_server_supports_acls(NFS_SERVER(dentry->d_inode))) + if (!nfs4_server_supports_acls(NFS_SERVER(d_inode(dentry)))) return 0; if (list && len <= list_len) @@ -6158,7 +6254,7 @@ static int nfs4_xattr_get_nfs4_label(struct dentry *dentry, const char *key, void *buf, size_t buflen, int type) { if (security_ismaclabel(key)) - return nfs4_get_security_label(dentry->d_inode, buf, buflen); + return nfs4_get_security_label(d_inode(dentry), buf, buflen); return -EOPNOTSUPP; } @@ -6168,10 +6264,10 @@ static size_t nfs4_xattr_list_nfs4_label(struct dentry *dentry, char *list, { size_t len = 0; - if (nfs_server_capable(dentry->d_inode, NFS_CAP_SECURITY_LABEL)) { - len = security_inode_listsecurity(dentry->d_inode, NULL, 0); + if (nfs_server_capable(d_inode(dentry), NFS_CAP_SECURITY_LABEL)) { + len = security_inode_listsecurity(d_inode(dentry), NULL, 0); if (list && len <= list_len) - security_inode_listsecurity(dentry->d_inode, list, len); + security_inode_listsecurity(d_inode(dentry), list, len); } return len; } @@ -6845,11 +6941,14 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, }; nfs4_init_boot_verifier(clp, &verifier); - args.id_len = nfs4_init_uniform_client_string(clp, args.id, - sizeof(args.id)); - dprintk("NFS call exchange_id auth=%s, '%.*s'\n", + + status = nfs4_init_uniform_client_string(clp); + if (status) + goto out; + + dprintk("NFS call exchange_id auth=%s, '%s'\n", clp->cl_rpcclient->cl_auth->au_ops->au_name, - args.id_len, args.id); + clp->cl_owner_id); res.server_owner = kzalloc(sizeof(struct nfs41_server_owner), GFP_NOFS); @@ -6884,7 +6983,7 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, /* unsupported! 
*/ WARN_ON_ONCE(1); status = -EINVAL; - goto out_server_scope; + goto out_impl_id; } status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); @@ -6912,6 +7011,7 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, /* use the most recent implementation id */ kfree(clp->cl_implid); clp->cl_implid = res.impl_id; + res.impl_id = NULL; if (clp->cl_serverscope != NULL && !nfs41_same_server_scope(clp->cl_serverscope, @@ -6925,15 +7025,16 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, if (clp->cl_serverscope == NULL) { clp->cl_serverscope = res.server_scope; - goto out; + res.server_scope = NULL; } - } else - kfree(res.impl_id); + } -out_server_owner: - kfree(res.server_owner); +out_impl_id: + kfree(res.impl_id); out_server_scope: kfree(res.server_scope); +out_server_owner: + kfree(res.server_owner); out: if (clp->cl_implid != NULL) dprintk("NFS reply exchange_id: Server Implementation ID: " @@ -7483,13 +7584,8 @@ static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) goto out; } ret = rpc_wait_for_completion_task(task); - if (!ret) { - struct nfs4_sequence_res *res = task->tk_msg.rpc_resp; - - if (task->tk_status == 0) - nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); + if (!ret) ret = task->tk_status; - } rpc_put_task(task); out: dprintk("<-- %s status=%d\n", __func__, ret); @@ -7877,16 +7973,17 @@ static void nfs4_layoutreturn_release(void *calldata) { struct nfs4_layoutreturn *lrp = calldata; struct pnfs_layout_hdr *lo = lrp->args.layout; + LIST_HEAD(freeme); dprintk("--> %s\n", __func__); spin_lock(&lo->plh_inode->i_lock); if (lrp->res.lrs_present) pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); + pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range); pnfs_clear_layoutreturn_waitbit(lo); - clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags); - rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq); lo->plh_block_lgets--; spin_unlock(&lo->plh_inode->i_lock); + pnfs_free_lseg_list(&freeme); pnfs_put_layout_hdr(lrp->args.layout); nfs_iput_and_deactive(lrp->inode); kfree(calldata); @@ -7944,6 +8041,8 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server, { struct nfs4_getdeviceinfo_args args = { .pdev = pdev, + .notify_types = NOTIFY_DEVICEID4_CHANGE | + NOTIFY_DEVICEID4_DELETE, }; struct nfs4_getdeviceinfo_res res = { .pdev = pdev, @@ -7958,6 +8057,11 @@ _nfs4_proc_getdeviceinfo(struct nfs_server *server, dprintk("--> %s\n", __func__); status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); + if (res.notification & ~args.notify_types) + dprintk("%s: unsupported notification\n", __func__); + if (res.notification != args.notify_types) + pdev->nocache = 1; + dprintk("<-- %s status=%d\n", __func__, status); return status; @@ -8053,9 +8157,8 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync) struct rpc_task *task; int status = 0; - dprintk("NFS: %4d initiating layoutcommit call. sync %d " - "lbw: %llu inode %lu\n", - data->task.tk_pid, sync, + dprintk("NFS: initiating layoutcommit call. 
sync %d " + "lbw: %llu inode %lu\n", sync, data->args.lastbytewritten, data->args.inode->i_ino); @@ -8494,7 +8597,6 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = { .minor_version = 0, .init_caps = NFS_CAP_READDIRPLUS | NFS_CAP_ATOMIC_OPEN - | NFS_CAP_CHANGE_ATTR | NFS_CAP_POSIX_LOCK, .init_client = nfs40_init_client, .shutdown_client = nfs40_shutdown_client, @@ -8520,7 +8622,6 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { .minor_version = 1, .init_caps = NFS_CAP_READDIRPLUS | NFS_CAP_ATOMIC_OPEN - | NFS_CAP_CHANGE_ATTR | NFS_CAP_POSIX_LOCK | NFS_CAP_STATEID_NFSV41 | NFS_CAP_ATOMIC_OPEN_V1, @@ -8543,13 +8644,13 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { .minor_version = 2, .init_caps = NFS_CAP_READDIRPLUS | NFS_CAP_ATOMIC_OPEN - | NFS_CAP_CHANGE_ATTR | NFS_CAP_POSIX_LOCK | NFS_CAP_STATEID_NFSV41 | NFS_CAP_ATOMIC_OPEN_V1 | NFS_CAP_ALLOCATE | NFS_CAP_DEALLOCATE - | NFS_CAP_SEEK, + | NFS_CAP_SEEK + | NFS_CAP_LAYOUTSTATS, .init_client = nfs41_init_client, .shutdown_client = nfs41_shutdown_client, .match_stateid = nfs41_match_stateid, diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index f95e3b58bbc3..f2e2ad894461 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -42,7 +42,6 @@ #include <linux/slab.h> #include <linux/fs.h> #include <linux/nfs_fs.h> -#include <linux/nfs_idmap.h> #include <linux/kthread.h> #include <linux/module.h> #include <linux/random.h> @@ -57,6 +56,7 @@ #include "callback.h" #include "delegation.h" #include "internal.h" +#include "nfs4idmap.h" #include "nfs4session.h" #include "pnfs.h" #include "netns.h" @@ -309,7 +309,6 @@ int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) if (test_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state)) goto do_confirm; - nfs4_begin_drain_session(clp); status = nfs4_proc_exchange_id(clp, cred); if (status != 0) goto out; @@ -1482,6 +1481,8 @@ restart: spin_unlock(&state->state_lock); } nfs4_put_open_state(state); + clear_bit(NFS4CLNT_RECLAIM_NOGRACE, + &state->flags); spin_lock(&sp->so_lock); goto restart; } @@ -1830,6 +1831,7 @@ static int nfs4_establish_lease(struct nfs_client *clp) clp->cl_mvops->reboot_recovery_ops; int status; + nfs4_begin_drain_session(clp); cred = nfs4_get_clid_cred(clp); if (cred == NULL) return -ENOENT; @@ -1902,7 +1904,7 @@ static int nfs4_try_migration(struct nfs_server *server, struct rpc_cred *cred) goto out; } - inode = server->super->s_root->d_inode; + inode = d_inode(server->super->s_root); result = nfs4_proc_get_locations(inode, locations, page, cred); if (result) { dprintk("<-- %s: failed to retrieve fs_locations: %d\n", @@ -2021,7 +2023,7 @@ restart: rcu_read_unlock(); - inode = server->super->s_root->d_inode; + inode = d_inode(server->super->s_root); status = nfs4_proc_fsid_present(inode, cred); if (status != -NFS4ERR_MOVED) goto restart; /* wasn't this one */ @@ -2189,25 +2191,35 @@ static void nfs41_handle_server_reboot(struct nfs_client *clp) } } -static void nfs41_handle_state_revoked(struct nfs_client *clp) +static void nfs41_handle_all_state_revoked(struct nfs_client *clp) { nfs4_reset_all_state(clp); dprintk("%s: state revoked on server %s\n", __func__, clp->cl_hostname); } +static void nfs41_handle_some_state_revoked(struct nfs_client *clp) +{ + nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce); + nfs4_schedule_state_manager(clp); + + dprintk("%s: state revoked on server %s\n", __func__, clp->cl_hostname); +} + static void nfs41_handle_recallable_state_revoked(struct nfs_client *clp) { 
- /* This will need to handle layouts too */ - nfs_expire_all_delegations(clp); + /* FIXME: For now, we destroy all layouts. */ + pnfs_destroy_all_layouts(clp); + /* FIXME: For now, we test all delegations+open state+locks. */ + nfs41_handle_some_state_revoked(clp); dprintk("%s: Recallable state revoked on server %s!\n", __func__, clp->cl_hostname); } static void nfs41_handle_backchannel_fault(struct nfs_client *clp) { - nfs_expire_all_delegations(clp); - if (test_and_set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) == 0) - nfs4_schedule_state_manager(clp); + set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); + nfs4_schedule_state_manager(clp); + dprintk("%s: server %s declared a backchannel fault\n", __func__, clp->cl_hostname); } @@ -2229,10 +2241,11 @@ void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags) if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED) nfs41_handle_server_reboot(clp); - if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED | - SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED | + if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED)) + nfs41_handle_all_state_revoked(clp); + if (flags & (SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED | SEQ4_STATUS_ADMIN_STATE_REVOKED)) - nfs41_handle_state_revoked(clp); + nfs41_handle_some_state_revoked(clp); if (flags & SEQ4_STATUS_LEASE_MOVED) nfs4_schedule_lease_moved_recovery(clp); if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED) diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c index 75090feeafad..6fb7cb6b3f4b 100644 --- a/fs/nfs/nfs4super.c +++ b/fs/nfs/nfs4super.c @@ -3,12 +3,12 @@ */ #include <linux/init.h> #include <linux/module.h> -#include <linux/nfs_idmap.h> #include <linux/nfs4_mount.h> #include <linux/nfs_fs.h> #include "delegation.h" #include "internal.h" #include "nfs4_fs.h" +#include "nfs4idmap.h" #include "dns_resolve.h" #include "pnfs.h" #include "nfs.h" @@ -91,10 +91,11 @@ static void nfs4_evict_inode(struct inode *inode) { truncate_inode_pages_final(&inode->i_data); clear_inode(inode); - pnfs_return_layout(inode); - pnfs_destroy_layout(NFS_I(inode)); /* If we are holding a delegation, return it! 
*/ nfs_inode_return_delegation_noreclaim(inode); + /* Note that the above delegreturn would trigger pnfs return-on-close */ + pnfs_return_layout(inode); + pnfs_destroy_layout(NFS_I(inode)); /* First call standard NFS clear_inode() code */ nfs_clear_inode(inode); } diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c index b6ebe7e445f6..0fbd3ab1be22 100644 --- a/fs/nfs/nfs4sysctl.c +++ b/fs/nfs/nfs4sysctl.c @@ -6,10 +6,10 @@ * Copyright (c) 2006 Trond Myklebust <Trond.Myklebust@netapp.com> */ #include <linux/sysctl.h> -#include <linux/nfs_idmap.h> #include <linux/nfs_fs.h> #include "nfs4_fs.h" +#include "nfs4idmap.h" #include "callback.h" static const int nfs_set_port_min = 0; diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 1c32adbe728d..470af1a78bec 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -418,7 +418,7 @@ DECLARE_EVENT_CLASS(nfs4_open_event, __entry->fileid = 0; __entry->fhandle = 0; } - __entry->dir = NFS_FILEID(ctx->dentry->d_parent->d_inode); + __entry->dir = NFS_FILEID(d_inode(ctx->dentry->d_parent)); __assign_str(name, ctx->dentry->d_name.name); ), @@ -1110,7 +1110,7 @@ TRACE_EVENT(nfs4_layoutget, ), TP_fast_assign( - const struct inode *inode = ctx->dentry->d_inode; + const struct inode *inode = d_inode(ctx->dentry); __entry->dev = inode->i_sb->s_dev; __entry->fileid = NFS_FILEID(inode); __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 5c399ec41079..558cd65dbdb7 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -52,10 +52,10 @@ #include <linux/nfs.h> #include <linux/nfs4.h> #include <linux/nfs_fs.h> -#include <linux/nfs_idmap.h> #include "nfs4_fs.h" #include "internal.h" +#include "nfs4idmap.h" #include "nfs4session.h" #include "pnfs.h" #include "netns.h" @@ -139,7 +139,8 @@ static int nfs4_stat_to_errno(int); #define encode_setclientid_maxsz \ (op_encode_hdr_maxsz + \ XDR_QUADLEN(NFS4_VERIFIER_SIZE) + \ - XDR_QUADLEN(NFS4_SETCLIENTID_NAMELEN) + \ + /* client name */ \ + 1 + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \ 1 /* sc_prog */ + \ 1 + XDR_QUADLEN(RPCBIND_MAXNETIDLEN) + \ 1 + XDR_QUADLEN(RPCBIND_MAXUADDRLEN) + \ @@ -288,7 +289,8 @@ static int nfs4_stat_to_errno(int); #define encode_exchange_id_maxsz (op_encode_hdr_maxsz + \ encode_verifier_maxsz + \ 1 /* co_ownerid.len */ + \ - XDR_QUADLEN(NFS4_EXCHANGE_ID_LEN) + \ + /* eia_clientowner */ \ + 1 + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \ 1 /* flags */ + \ 1 /* spa_how */ + \ /* max is SP4_MACH_CRED (for now) */ + \ @@ -1667,13 +1669,14 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie encode_op_hdr(xdr, OP_SETCLIENTID, decode_setclientid_maxsz, hdr); encode_nfs4_verifier(xdr, setclientid->sc_verifier); - encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name); + encode_string(xdr, strlen(setclientid->sc_clnt->cl_owner_id), + setclientid->sc_clnt->cl_owner_id); p = reserve_space(xdr, 4); *p = cpu_to_be32(setclientid->sc_prog); encode_string(xdr, setclientid->sc_netid_len, setclientid->sc_netid); encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr); p = reserve_space(xdr, 4); - *p = cpu_to_be32(setclientid->sc_cb_ident); + *p = cpu_to_be32(setclientid->sc_clnt->cl_cb_ident); } static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs4_setclientid_res *arg, struct compound_hdr *hdr) @@ -1747,7 +1750,8 @@ static void encode_exchange_id(struct xdr_stream *xdr, encode_op_hdr(xdr, OP_EXCHANGE_ID, decode_exchange_id_maxsz, hdr); encode_nfs4_verifier(xdr, args->verifier); - 
encode_string(xdr, args->id_len, args->id); + encode_string(xdr, strlen(args->client->cl_owner_id), + args->client->cl_owner_id); encode_uint32(xdr, args->flags); encode_uint32(xdr, args->state_protect.how); @@ -1920,7 +1924,7 @@ encode_getdeviceinfo(struct xdr_stream *xdr, p = reserve_space(xdr, 4 + 4); *p++ = cpu_to_be32(1); /* bitmap length */ - *p++ = cpu_to_be32(NOTIFY_DEVICEID4_CHANGE | NOTIFY_DEVICEID4_DELETE); + *p++ = cpu_to_be32(args->notify_types); } static void @@ -5753,8 +5757,9 @@ out_overflow: #if defined(CONFIG_NFS_V4_1) static int decode_getdeviceinfo(struct xdr_stream *xdr, - struct pnfs_device *pdev) + struct nfs4_getdeviceinfo_res *res) { + struct pnfs_device *pdev = res->pdev; __be32 *p; uint32_t len, type; int status; @@ -5802,12 +5807,7 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr, if (unlikely(!p)) goto out_overflow; - if (be32_to_cpup(p++) & - ~(NOTIFY_DEVICEID4_CHANGE | NOTIFY_DEVICEID4_DELETE)) { - dprintk("%s: unsupported notification\n", - __func__); - } - + res->notification = be32_to_cpup(p++); for (i = 1; i < len; i++) { if (be32_to_cpup(p++)) { dprintk("%s: unsupported notification\n", @@ -7061,7 +7061,7 @@ static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, status = decode_sequence(xdr, &res->seq_res, rqstp); if (status != 0) goto out; - status = decode_getdeviceinfo(xdr, res->pdev); + status = decode_getdeviceinfo(xdr, res); out: return status; } @@ -7365,6 +7365,11 @@ nfs4_stat_to_errno(int stat) .p_name = #proc, \ } +#define STUB(proc) \ +[NFSPROC4_CLNT_##proc] = { \ + .p_name = #proc, \ +} + struct rpc_procinfo nfs4_procedures[] = { PROC(READ, enc_read, dec_read), PROC(WRITE, enc_write, dec_write), @@ -7417,6 +7422,7 @@ struct rpc_procinfo nfs4_procedures[] = { PROC(SECINFO_NO_NAME, enc_secinfo_no_name, dec_secinfo_no_name), PROC(TEST_STATEID, enc_test_stateid, dec_test_stateid), PROC(FREE_STATEID, enc_free_stateid, dec_free_stateid), + STUB(GETDEVICELIST), PROC(BIND_CONN_TO_SESSION, enc_bind_conn_to_session, dec_bind_conn_to_session), PROC(DESTROY_CLIENTID, enc_destroy_clientid, dec_destroy_clientid), @@ -7425,6 +7431,7 @@ struct rpc_procinfo nfs4_procedures[] = { PROC(SEEK, enc_seek, dec_seek), PROC(ALLOCATE, enc_allocate, dec_allocate), PROC(DEALLOCATE, enc_deallocate, dec_deallocate), + PROC(LAYOUTSTATS, enc_layoutstats, dec_layoutstats), #endif /* CONFIG_NFS_V4_2 */ }; diff --git a/fs/nfs/nfstrace.c b/fs/nfs/nfstrace.c index 4eb0aead69b6..c74f7af23d77 100644 --- a/fs/nfs/nfstrace.c +++ b/fs/nfs/nfstrace.c @@ -7,3 +7,6 @@ #define CREATE_TRACE_POINTS #include "nfstrace.h" + +EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_fsync_enter); +EXPORT_TRACEPOINT_SYMBOL_GPL(nfs_fsync_exit); diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index 24e1d7403c0b..5aaed363556a 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -57,7 +57,7 @@ objio_free_deviceid_node(struct nfs4_deviceid_node *d) dprintk("%s: free od=%p\n", __func__, de->od.od); osduld_put_device(de->od.od); - kfree(de); + kfree_rcu(d, rcu); } struct objio_segment { @@ -637,6 +637,8 @@ static struct pnfs_layoutdriver_type objlayout_type = { .pg_read_ops = &objio_pg_read_ops, .pg_write_ops = &objio_pg_write_ops, + .sync = pnfs_generic_sync, + .free_deviceid_node = objio_free_deviceid_node, .encode_layoutcommit = objlayout_encode_layoutcommit, diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index d57190a0d533..4984bbe55ff1 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -636,9 +636,8 @@ int nfs_initiate_pgio(struct rpc_clnt 
*clnt, struct nfs_pgio_header *hdr, hdr->rw_ops->rw_initiate(hdr, &msg, rpc_ops, &task_setup_data, how); - dprintk("NFS: %5u initiated pgio call " + dprintk("NFS: initiated pgio call " "(req %s/%llu, %u bytes @ offset %llu)\n", - hdr->task.tk_pid, hdr->inode->i_sb->s_id, (unsigned long long)NFS_FILEID(hdr->inode), hdr->args.count, @@ -690,8 +689,6 @@ static int nfs_pgio_error(struct nfs_pageio_descriptor *desc, static void nfs_pgio_release(void *calldata) { struct nfs_pgio_header *hdr = calldata; - if (hdr->rw_ops->rw_release) - hdr->rw_ops->rw_release(hdr); nfs_pgio_data_destroy(hdr); hdr->completion_ops->completion(hdr); } @@ -711,7 +708,9 @@ static void nfs_pageio_mirror_init(struct nfs_pgio_mirror *mirror, * nfs_pageio_init - initialise a page io descriptor * @desc: pointer to descriptor * @inode: pointer to inode - * @doio: pointer to io function + * @pg_ops: pointer to pageio operations + * @compl_ops: pointer to pageio completion operations + * @rw_ops: pointer to nfs read/write operations * @bsize: io block size * @io_flags: extra parameters for the io function */ @@ -938,7 +937,7 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev, if (prev) { if (!nfs_match_open_context(req->wb_context, prev->wb_context)) return false; - flctx = req->wb_context->dentry->d_inode->i_flctx; + flctx = d_inode(req->wb_context->dentry)->i_flctx; if (flctx != NULL && !(list_empty_careful(&flctx->flc_posix) && list_empty_careful(&flctx->flc_flock)) && @@ -1101,8 +1100,6 @@ static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) mirror->pg_base = 0; mirror->pg_recoalesce = 0; - desc->pg_moreio = 0; - while (!list_empty(&head)) { struct nfs_page *req; @@ -1110,8 +1107,11 @@ static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) nfs_list_remove_request(req); if (__nfs_pageio_add_request(desc, req)) continue; - if (desc->pg_error < 0) + if (desc->pg_error < 0) { + list_splice_tail(&head, &mirror->pg_list); + mirror->pg_recoalesce = 1; return 0; + } break; } } while (mirror->pg_recoalesce); @@ -1186,6 +1186,7 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, * nfs_pageio_complete_mirror - Complete I/O on the current mirror of an * nfs_pageio_descriptor * @desc: pointer to io descriptor + * @mirror_idx: index of the mirror to complete */ static void nfs_pageio_complete_mirror(struct nfs_pageio_descriptor *desc, u32 mirror_idx) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 4f802b02fbb9..70bf706b1090 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -35,6 +35,7 @@ #include "iostat.h" #include "nfs4trace.h" #include "delegation.h" +#include "nfs42.h" #define NFSDBG_FACILITY NFSDBG_PNFS #define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ) @@ -351,7 +352,7 @@ pnfs_layout_need_return(struct pnfs_layout_hdr *lo, { struct pnfs_layout_segment *s; - if (!test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)) + if (!test_and_clear_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)) return false; list_for_each_entry(s, &lo->plh_segs, pls_list) @@ -361,6 +362,18 @@ pnfs_layout_need_return(struct pnfs_layout_hdr *lo, return true; } +static bool +pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo) +{ + if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) + return false; + lo->plh_return_iomode = 0; + lo->plh_block_lgets++; + pnfs_get_layout_hdr(lo); + clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags); + return true; +} + static void pnfs_layoutreturn_before_put_lseg(struct pnfs_layout_segment *lseg, struct pnfs_layout_hdr *lo, struct inode *inode) { @@ -371,17 +384,16 @@ static void 
pnfs_layoutreturn_before_put_lseg(struct pnfs_layout_segment *lseg, if (pnfs_layout_need_return(lo, lseg)) { nfs4_stateid stateid; enum pnfs_iomode iomode; + bool send; stateid = lo->plh_stateid; iomode = lo->plh_return_iomode; - /* decreased in pnfs_send_layoutreturn() */ - lo->plh_block_lgets++; - lo->plh_return_iomode = 0; + send = pnfs_prepare_layoutreturn(lo); spin_unlock(&inode->i_lock); - pnfs_get_layout_hdr(lo); - - /* Send an async layoutreturn so we dont deadlock */ - pnfs_send_layoutreturn(lo, stateid, iomode, false); + if (send) { + /* Send an async layoutreturn so we don't deadlock */ + pnfs_send_layoutreturn(lo, stateid, iomode, false); + } } else spin_unlock(&inode->i_lock); } @@ -410,6 +422,10 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg) pnfs_layoutreturn_before_put_lseg(lseg, lo, inode); if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) { + if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) { + spin_unlock(&inode->i_lock); + return; + } pnfs_get_layout_hdr(lo); pnfs_layout_remove_lseg(lo, lseg); spin_unlock(&inode->i_lock); @@ -450,6 +466,8 @@ pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg) test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); if (atomic_dec_and_test(&lseg->pls_refcount)) { struct pnfs_layout_hdr *lo = lseg->pls_layout; + if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) + return; pnfs_get_layout_hdr(lo); pnfs_layout_remove_lseg(lo, lseg); pnfs_free_lseg_async(lseg); @@ -923,6 +941,7 @@ void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo) clear_bit_unlock(NFS_LAYOUT_RETURN, &lo->plh_flags); smp_mb__after_atomic(); wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN); + rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq); } static int @@ -977,6 +996,7 @@ _pnfs_return_layout(struct inode *ino) LIST_HEAD(tmp_list); nfs4_stateid stateid; int status = 0, empty; + bool send; dprintk("NFS: %s for inode %lu\n", __func__, ino->i_ino); @@ -1006,17 +1026,18 @@ _pnfs_return_layout(struct inode *ino) /* Don't send a LAYOUTRETURN if list was initially empty */ if (empty) { spin_unlock(&ino->i_lock); - pnfs_put_layout_hdr(lo); dprintk("NFS: %s no layout segments to return\n", __func__); - goto out; + goto out_put_layout_hdr; } set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); - lo->plh_block_lgets++; + send = pnfs_prepare_layoutreturn(lo); spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&tmp_list); - - status = pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true); + if (send) + status = pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true); +out_put_layout_hdr: + pnfs_put_layout_hdr(lo); out: dprintk("<-- %s status: %d\n", __func__, status); return status; @@ -1090,22 +1111,21 @@ bool pnfs_roc(struct inode *ino) pnfs_get_layout_hdr(lo); /* matched in pnfs_roc_release */ spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&tmp_list); + pnfs_layoutcommit_inode(ino, true); return true; out_noroc: if (lo) { stateid = lo->plh_stateid; - layoutreturn = - test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, - &lo->plh_flags); - if (layoutreturn) { - lo->plh_block_lgets++; - pnfs_get_layout_hdr(lo); - } + if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, + &lo->plh_flags)) + layoutreturn = pnfs_prepare_layoutreturn(lo); } spin_unlock(&ino->i_lock); - if (layoutreturn) + if (layoutreturn) { + pnfs_layoutcommit_inode(ino, true); pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true); + } return false; } @@ -1142,15 +1162,18 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task) struct pnfs_layout_segment *lseg; nfs4_stateid stateid; u32
current_seqid; - bool found = false, layoutreturn = false; + bool layoutreturn = false; spin_lock(&ino->i_lock); - list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list) - if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { - rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL); - found = true; - goto out; - } + list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list) { + if (!test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) + continue; + if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) + continue; + rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL); + spin_unlock(&ino->i_lock); + return true; + } lo = nfsi->layout; current_seqid = be32_to_cpu(lo->plh_stateid.seqid); @@ -1158,23 +1181,19 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task) * a barrier, we choose the worst-case barrier. */ *barrier = current_seqid + atomic_read(&lo->plh_outstanding); -out: - if (!found) { - stateid = lo->plh_stateid; - layoutreturn = - test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, - &lo->plh_flags); - if (layoutreturn) { - lo->plh_block_lgets++; - pnfs_get_layout_hdr(lo); - } - } + stateid = lo->plh_stateid; + if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, + &lo->plh_flags)) + layoutreturn = pnfs_prepare_layoutreturn(lo); + if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) + rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL); + spin_unlock(&ino->i_lock); if (layoutreturn) { - rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL); pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, false); + return true; } - return found; + return false; } /* @@ -1691,7 +1710,6 @@ void pnfs_error_mark_layout_for_return(struct inode *inode, spin_lock(&inode->i_lock); /* set failure bit so that pnfs path will be retried later */ pnfs_layout_set_fail_bit(lo, iomode); - set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags); if (lo->plh_return_iomode == 0) lo->plh_return_iomode = range.iomode; else if (lo->plh_return_iomode != range.iomode) @@ -1818,6 +1836,7 @@ int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *hdr) /* Resend all requests through the MDS */ nfs_pageio_init_write(&pgio, hdr->inode, FLUSH_STABLE, true, hdr->completion_ops); + set_bit(NFS_CONTEXT_RESEND_WRITES, &hdr->args.context->flags); return nfs_pageio_resend(&pgio, hdr); } EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds); @@ -1841,7 +1860,8 @@ void pnfs_ld_write_done(struct nfs_pgio_header *hdr) { trace_nfs4_pnfs_write(hdr, hdr->pnfs_error); if (!hdr->pnfs_error) { - pnfs_set_layoutcommit(hdr); + pnfs_set_layoutcommit(hdr->inode, hdr->lseg, + hdr->mds_offset + hdr->res.count); hdr->mds_ops->rpc_call_done(&hdr->task, hdr); } else pnfs_ld_handle_write_error(hdr); @@ -1861,6 +1881,7 @@ pnfs_write_through_mds(struct nfs_pageio_descriptor *desc, mirror->pg_recoalesce = 1; } nfs_pgio_data_destroy(hdr); + hdr->release(hdr); } static enum pnfs_try_status @@ -1902,7 +1923,6 @@ static void pnfs_writehdr_free(struct nfs_pgio_header *hdr) pnfs_put_lseg(hdr->lseg); nfs_pgio_header_free(hdr); } -EXPORT_SYMBOL_GPL(pnfs_writehdr_free); int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) @@ -1976,6 +1996,7 @@ pnfs_read_through_mds(struct nfs_pageio_descriptor *desc, mirror->pg_recoalesce = 1; } nfs_pgio_data_destroy(hdr); + hdr->release(hdr); } /* @@ -2032,7 +2053,6 @@ static void pnfs_readhdr_free(struct nfs_pgio_header *hdr) pnfs_put_lseg(hdr->lseg); nfs_pgio_header_free(hdr); } -EXPORT_SYMBOL_GPL(pnfs_readhdr_free); int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) @@ -2099,64 +2119,34 @@ void 
pnfs_set_lo_fail(struct pnfs_layout_segment *lseg) EXPORT_SYMBOL_GPL(pnfs_set_lo_fail); void -pnfs_set_layoutcommit(struct nfs_pgio_header *hdr) +pnfs_set_layoutcommit(struct inode *inode, struct pnfs_layout_segment *lseg, + loff_t end_pos) { - struct inode *inode = hdr->inode; struct nfs_inode *nfsi = NFS_I(inode); - loff_t end_pos = hdr->mds_offset + hdr->res.count; bool mark_as_dirty = false; spin_lock(&inode->i_lock); if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { - mark_as_dirty = true; - dprintk("%s: Set layoutcommit for inode %lu ", - __func__, inode->i_ino); - } - if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &hdr->lseg->pls_flags)) { - /* references matched in nfs4_layoutcommit_release */ - pnfs_get_lseg(hdr->lseg); - } - if (end_pos > nfsi->layout->plh_lwb) nfsi->layout->plh_lwb = end_pos; - spin_unlock(&inode->i_lock); - dprintk("%s: lseg %p end_pos %llu\n", - __func__, hdr->lseg, nfsi->layout->plh_lwb); - - /* if pnfs_layoutcommit_inode() runs between inode locks, the next one - * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */ - if (mark_as_dirty) - mark_inode_dirty_sync(inode); -} -EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit); - -void pnfs_commit_set_layoutcommit(struct nfs_commit_data *data) -{ - struct inode *inode = data->inode; - struct nfs_inode *nfsi = NFS_I(inode); - bool mark_as_dirty = false; - - spin_lock(&inode->i_lock); - if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { mark_as_dirty = true; dprintk("%s: Set layoutcommit for inode %lu ", __func__, inode->i_ino); - } - if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &data->lseg->pls_flags)) { + } else if (end_pos > nfsi->layout->plh_lwb) + nfsi->layout->plh_lwb = end_pos; + if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) { /* references matched in nfs4_layoutcommit_release */ - pnfs_get_lseg(data->lseg); + pnfs_get_lseg(lseg); } - if (data->lwb > nfsi->layout->plh_lwb) - nfsi->layout->plh_lwb = data->lwb; spin_unlock(&inode->i_lock); dprintk("%s: lseg %p end_pos %llu\n", - __func__, data->lseg, nfsi->layout->plh_lwb); + __func__, lseg, nfsi->layout->plh_lwb); /* if pnfs_layoutcommit_inode() runs between inode locks, the next one * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */ if (mark_as_dirty) mark_inode_dirty_sync(inode); } -EXPORT_SYMBOL_GPL(pnfs_commit_set_layoutcommit); +EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit); void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data) { @@ -2216,7 +2206,6 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) pnfs_list_write_lseg(inode, &data->lseg_list); end_pos = nfsi->layout->plh_lwb; - nfsi->layout->plh_lwb = 0; nfs4_stateid_copy(&data->args.stateid, &nfsi->layout->plh_stateid); spin_unlock(&inode->i_lock); @@ -2232,13 +2221,12 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) if (ld->prepare_layoutcommit) { status = ld->prepare_layoutcommit(&data->args); if (status) { - spin_lock(&inode->i_lock); - if (end_pos < nfsi->layout->plh_lwb) - nfsi->layout->plh_lwb = end_pos; - spin_unlock(&inode->i_lock); put_rpccred(data->cred); + spin_lock(&inode->i_lock); set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags); - goto clear_layoutcommitting; + if (end_pos > nfsi->layout->plh_lwb) + nfsi->layout->plh_lwb = end_pos; + goto out_unlock; } } @@ -2258,6 +2246,13 @@ clear_layoutcommitting: } EXPORT_SYMBOL_GPL(pnfs_layoutcommit_inode); +int +pnfs_generic_sync(struct inode *inode, bool datasync) +{ + return pnfs_layoutcommit_inode(inode, true); +} +EXPORT_SYMBOL_GPL(pnfs_generic_sync); + struct 
nfs4_threshold *pnfs_mdsthreshold_alloc(void) { struct nfs4_threshold *thp; @@ -2269,3 +2264,63 @@ struct nfs4_threshold *pnfs_mdsthreshold_alloc(void) } return thp; } + +#if IS_ENABLED(CONFIG_NFS_V4_2) +int +pnfs_report_layoutstat(struct inode *inode) +{ + struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs42_layoutstat_data *data; + struct pnfs_layout_hdr *hdr; + int status = 0; + + if (!pnfs_enabled_sb(server) || !ld->prepare_layoutstats) + goto out; + + if (!nfs_server_capable(inode, NFS_CAP_LAYOUTSTATS)) + goto out; + + if (test_and_set_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags)) + goto out; + + spin_lock(&inode->i_lock); + if (!NFS_I(inode)->layout) { + spin_unlock(&inode->i_lock); + goto out; + } + hdr = NFS_I(inode)->layout; + pnfs_get_layout_hdr(hdr); + spin_unlock(&inode->i_lock); + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) { + status = -ENOMEM; + goto out_put; + } + + data->args.fh = NFS_FH(inode); + data->args.inode = inode; + nfs4_stateid_copy(&data->args.stateid, &hdr->plh_stateid); + status = ld->prepare_layoutstats(&data->args); + if (status) + goto out_free; + + status = nfs42_proc_layoutstats_generic(NFS_SERVER(inode), data); + +out: + dprintk("%s returns %d\n", __func__, status); + return status; + +out_free: + kfree(data); +out_put: + pnfs_put_layout_hdr(hdr); + smp_mb__before_atomic(); + clear_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags); + smp_mb__after_atomic(); + goto out; +} +EXPORT_SYMBOL_GPL(pnfs_report_layoutstat); +#endif diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 635f0865671c..3e6ab7bfbabd 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -155,6 +155,8 @@ struct pnfs_layoutdriver_type { int how, struct nfs_commit_info *cinfo); + int (*sync)(struct inode *inode, bool datasync); + /* * Return PNFS_ATTEMPTED to indicate the layout code has attempted * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS @@ -176,6 +178,8 @@ struct pnfs_layoutdriver_type { void (*encode_layoutcommit) (struct pnfs_layout_hdr *lo, struct xdr_stream *xdr, const struct nfs4_layoutcommit_args *args); + int (*prepare_layoutstats) (struct nfs42_layoutstat_args *args); + void (*cleanup_layoutstats) (struct nfs42_layoutstat_data *data); }; struct pnfs_layout_hdr { @@ -203,6 +207,7 @@ struct pnfs_device { struct page **pages; unsigned int pgbase; unsigned int pglen; /* reply buffer length */ + unsigned char nocache : 1;/* May not be cached */ }; #define NFS4_PNFS_GETDEVLIST_MAXNUM 16 @@ -263,10 +268,11 @@ bool pnfs_roc(struct inode *ino); void pnfs_roc_release(struct inode *ino); void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task); -void pnfs_set_layoutcommit(struct nfs_pgio_header *); -void pnfs_commit_set_layoutcommit(struct nfs_commit_data *data); +void pnfs_set_layoutcommit(struct inode *, struct pnfs_layout_segment *, loff_t); void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); int pnfs_layoutcommit_inode(struct inode *inode, bool sync); +int pnfs_generic_sync(struct inode *inode, bool datasync); +int pnfs_nfs_generic_sync(struct inode *inode, bool datasync); int _pnfs_return_layout(struct inode *); int pnfs_commit_and_return_layout(struct inode *); void pnfs_ld_write_done(struct nfs_pgio_header *); @@ -286,11 +292,11 @@ int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *); struct nfs4_threshold *pnfs_mdsthreshold_alloc(void); void 
pnfs_error_mark_layout_for_return(struct inode *inode, struct pnfs_layout_segment *lseg); - /* nfs4_deviceid_flags */ enum { NFS_DEVICEID_INVALID = 0, /* set when MDS clientid recalled */ NFS_DEVICEID_UNAVAILABLE, /* device temporarily unavailable */ + NFS_DEVICEID_NOCACHE, /* device may not be cached */ }; /* pnfs_dev.c */ @@ -302,6 +308,7 @@ struct nfs4_deviceid_node { unsigned long flags; unsigned long timestamp_unavailable; struct nfs4_deviceid deviceid; + struct rcu_head rcu; atomic_t ref; }; @@ -426,7 +433,7 @@ static inline bool pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, struct nfs_commit_info *cinfo, u32 ds_commit_idx) { - struct inode *inode = req->wb_context->dentry->d_inode; + struct inode *inode = d_inode(req->wb_context->dentry); struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; if (lseg == NULL || ld->mark_request_commit == NULL) @@ -438,7 +445,7 @@ pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, static inline bool pnfs_clear_request_commit(struct nfs_page *req, struct nfs_commit_info *cinfo) { - struct inode *inode = req->wb_context->dentry->d_inode; + struct inode *inode = d_inode(req->wb_context->dentry); struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; if (ld == NULL || ld->clear_request_commit == NULL) @@ -486,6 +493,14 @@ pnfs_ld_read_whole_page(struct inode *inode) return NFS_SERVER(inode)->pnfs_curr_ld->flags & PNFS_READ_WHOLE_PAGE; } +static inline int +pnfs_sync_inode(struct inode *inode, bool datasync) +{ + if (!pnfs_enabled_sb(NFS_SERVER(inode))) + return 0; + return NFS_SERVER(inode)->pnfs_curr_ld->sync(inode, datasync); +} + static inline bool pnfs_layoutcommit_outstanding(struct inode *inode) { @@ -568,6 +583,12 @@ pnfs_ld_read_whole_page(struct inode *inode) return false; } +static inline int +pnfs_sync_inode(struct inode *inode, bool datasync) +{ + return 0; +} + static inline bool pnfs_roc(struct inode *ino) { @@ -669,4 +690,14 @@ static inline void nfs4_pnfs_v3_ds_connect_unload(void) #endif /* CONFIG_NFS_V4_1 */ +#if IS_ENABLED(CONFIG_NFS_V4_2) +int pnfs_report_layoutstat(struct inode *inode); +#else +static inline int +pnfs_report_layoutstat(struct inode *inode) +{ + return 0; +} +#endif + #endif /* FS_NFS_PNFS_H */ diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c index aa2ec0015183..2961fcd7a2df 100644 --- a/fs/nfs/pnfs_dev.c +++ b/fs/nfs/pnfs_dev.c @@ -149,6 +149,8 @@ nfs4_get_device_info(struct nfs_server *server, */ d = server->pnfs_curr_ld->alloc_deviceid_node(server, pdev, gfp_flags); + if (d && pdev->nocache) + set_bit(NFS_DEVICEID_NOCACHE, &d->flags); out_free_pages: for (i = 0; i < max_pages; i++) @@ -175,8 +177,8 @@ __nfs4_find_get_deviceid(struct nfs_server *server, rcu_read_lock(); d = _lookup_deviceid(server->pnfs_curr_ld, server->nfs_client, id, hash); - if (d != NULL) - atomic_inc(&d->ref); + if (d != NULL && !atomic_inc_not_zero(&d->ref)) + d = NULL; rcu_read_unlock(); return d; } @@ -235,12 +237,11 @@ nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld, return; } hlist_del_init_rcu(&d->node); + clear_bit(NFS_DEVICEID_NOCACHE, &d->flags); spin_unlock(&nfs4_deviceid_lock); - synchronize_rcu(); /* balance the initial ref set in pnfs_insert_deviceid */ - if (atomic_dec_and_test(&d->ref)) - d->ld->free_deviceid_node(d); + nfs4_put_deviceid_node(d); } EXPORT_SYMBOL_GPL(nfs4_delete_deviceid); @@ -271,6 +272,11 @@ EXPORT_SYMBOL_GPL(nfs4_init_deviceid_node); bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *d) { + if 
(test_bit(NFS_DEVICEID_NOCACHE, &d->flags)) { + if (atomic_add_unless(&d->ref, -1, 2)) + return false; + nfs4_delete_deviceid(d->ld, d->nfs_client, &d->deviceid); + } if (!atomic_dec_and_test(&d->ref)) return false; d->ld->free_deviceid_node(d); @@ -314,6 +320,7 @@ _deviceid_purge_client(const struct nfs_client *clp, long hash) if (d->nfs_client == clp && atomic_read(&d->ref)) { hlist_del_init_rcu(&d->node); hlist_add_head(&d->tmpnode, &tmp); + clear_bit(NFS_DEVICEID_NOCACHE, &d->flags); } rcu_read_unlock(); spin_unlock(&nfs4_deviceid_lock); @@ -321,12 +328,10 @@ _deviceid_purge_client(const struct nfs_client *clp, long hash) if (hlist_empty(&tmp)) return; - synchronize_rcu(); while (!hlist_empty(&tmp)) { d = hlist_entry(tmp.first, struct nfs4_deviceid_node, tmpnode); hlist_del(&d->tmpnode); - if (atomic_dec_and_test(&d->ref)) - d->ld->free_deviceid_node(d); + nfs4_put_deviceid_node(d); } } diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 54e36b38fb5f..f37e25b6311c 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -561,7 +561,7 @@ static bool load_v3_ds_connect(void) return(get_v3_ds_connect != NULL); } -void __exit nfs4_pnfs_v3_ds_connect_unload(void) +void nfs4_pnfs_v3_ds_connect_unload(void) { if (get_v3_ds_connect) { symbol_put(nfs3_set_ds_client); @@ -868,3 +868,13 @@ pnfs_layout_mark_request_commit(struct nfs_page *req, nfs_request_add_commit_list(req, list, cinfo); } EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit); + +int +pnfs_nfs_generic_sync(struct inode *inode, bool datasync) +{ + if (datasync) + return 0; + return pnfs_layoutcommit_inode(inode, true); +} +EXPORT_SYMBOL_GPL(pnfs_nfs_generic_sync); + diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index c63189acd052..b417bbcd9704 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -118,7 +118,7 @@ static int nfs_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, struct iattr *sattr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct nfs_sattrargs arg = { .fh = NFS_FH(inode), .sattr = sattr @@ -487,7 +487,7 @@ static int nfs_proc_readdir(struct dentry *dentry, struct rpc_cred *cred, u64 cookie, struct page **pages, unsigned int count, int plus) { - struct inode *dir = dentry->d_inode; + struct inode *dir = d_inode(dentry); struct nfs_readdirargs arg = { .fh = NFS_FH(dir), .cookie = cookie, diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 568ecf0a880f..ae0ff7a11b40 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -117,15 +117,15 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, static void nfs_readpage_release(struct nfs_page *req) { - struct inode *d_inode = req->wb_context->dentry->d_inode; + struct inode *inode = d_inode(req->wb_context->dentry); - dprintk("NFS: read done (%s/%llu %d@%lld)\n", d_inode->i_sb->s_id, - (unsigned long long)NFS_FILEID(d_inode), req->wb_bytes, + dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb->s_id, + (unsigned long long)NFS_FILEID(inode), req->wb_bytes, (long long)req_offset(req)); if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) { if (PageUptodate(req->wb_page)) - nfs_readpage_to_fscache(d_inode, req->wb_page, 0); + nfs_readpage_to_fscache(inode, req->wb_page, 0); unlock_page(req->wb_page); } @@ -284,7 +284,7 @@ int nfs_readpage(struct file *file, struct page *page) dprintk("NFS: nfs_readpage (%p %ld@%lu)\n", page, PAGE_CACHE_SIZE, page_file_index(page)); nfs_inc_stats(inode, NFSIOS_VFSREADPAGE); - nfs_inc_stats(inode, NFSIOS_READPAGES); + nfs_add_stats(inode, NFSIOS_READPAGES, 1); /* * Try to 
flush any pending writes to the file.. diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 322b2de02988..aa62004f1706 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -43,7 +43,6 @@ #include <linux/seq_file.h> #include <linux/mount.h> #include <linux/namei.h> -#include <linux/nfs_idmap.h> #include <linux/vfs.h> #include <linux/inet.h> #include <linux/in6.h> @@ -433,7 +432,7 @@ int nfs_statfs(struct dentry *dentry, struct kstatfs *buf) struct nfs_server *server = NFS_SB(dentry->d_sb); unsigned char blockbits; unsigned long blockres; - struct nfs_fh *fh = NFS_FH(dentry->d_inode); + struct nfs_fh *fh = NFS_FH(d_inode(dentry)); struct nfs_fsstat res; int error = -ENOMEM; @@ -447,7 +446,7 @@ int nfs_statfs(struct dentry *dentry, struct kstatfs *buf) pd_dentry = dget_parent(dentry); if (pd_dentry != NULL) { - nfs_zap_caches(pd_dentry->d_inode); + nfs_zap_caches(d_inode(pd_dentry)); dput(pd_dentry); } } @@ -2193,7 +2192,7 @@ nfs_compare_remount_data(struct nfs_server *nfss, data->version != nfss->nfs_client->rpc_ops->version || data->minorversion != nfss->nfs_client->cl_minorversion || data->retrans != nfss->client->cl_timeout->to_retries || - data->selected_flavor != nfss->client->cl_auth->au_flavor || + !nfs_auth_info_match(&data->auth_info, nfss->client->cl_auth->au_flavor) || data->acregmin != nfss->acregmin / HZ || data->acregmax != nfss->acregmax / HZ || data->acdirmin != nfss->acdirmin / HZ || @@ -2241,7 +2240,6 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) data->wsize = nfss->wsize; data->retrans = nfss->client->cl_timeout->to_retries; data->selected_flavor = nfss->client->cl_auth->au_flavor; - data->auth_info = nfss->auth_info; data->acregmin = nfss->acregmin / HZ; data->acregmax = nfss->acregmax / HZ; data->acdirmin = nfss->acdirmin / HZ; @@ -2526,7 +2524,7 @@ int nfs_clone_sb_security(struct super_block *s, struct dentry *mntroot, struct nfs_mount_info *mount_info) { /* clone any lsm security options from the parent to the new sb */ - if (mntroot->d_inode->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops) + if (d_inode(mntroot)->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops) return -ESTALE; return security_sb_clone_mnt_opts(mount_info->cloned->sb, s); } @@ -2849,7 +2847,7 @@ static int param_set_portnr(const char *val, const struct kernel_param *kp) *((unsigned int *)kp->arg) = num; return 0; } -static struct kernel_param_ops param_ops_portnr = { +static const struct kernel_param_ops param_ops_portnr = { .set = param_set_portnr, .get = param_get_uint, }; diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c index 05c9e02f4153..b6de433da5db 100644 --- a/fs/nfs/symlink.c +++ b/fs/nfs/symlink.c @@ -20,7 +20,6 @@ #include <linux/stat.h> #include <linux/mm.h> #include <linux/string.h> -#include <linux/namei.h> /* Symlink caching in the page cache is even more simplistic * and straight-forward than readdir caching. 
@@ -43,27 +42,21 @@ error: return -EIO; } -static void *nfs_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *nfs_follow_link(struct dentry *dentry, void **cookie) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct page *page; void *err; err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping)); if (err) - goto read_failed; + return err; page = read_cache_page(&inode->i_data, 0, (filler_t *)nfs_symlink_filler, inode); - if (IS_ERR(page)) { - err = page; - goto read_failed; - } - nd_set_link(nd, kmap(page)); - return page; - -read_failed: - nd_set_link(nd, err); - return NULL; + if (IS_ERR(page)) + return ERR_CAST(page); + *cookie = page; + return kmap(page); } /* diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index de54129336c6..fa538b2ba251 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -143,7 +143,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n nfs_free_dname(data); ret = nfs_copy_dname(alias, data); spin_lock(&alias->d_lock); - if (ret == 0 && alias->d_inode != NULL && + if (ret == 0 && d_really_is_positive(alias) && !(alias->d_flags & DCACHE_NFSFS_RENAMED)) { devname_garbage = alias->d_fsdata; alias->d_fsdata = data; @@ -190,7 +190,7 @@ static int nfs_call_unlink(struct dentry *dentry, struct nfs_unlinkdata *data) parent = dget_parent(dentry); if (parent == NULL) goto out_free; - dir = parent->d_inode; + dir = d_inode(parent); /* Non-exclusive lock protects against concurrent lookup() calls */ spin_lock(&dir->i_lock); if (atomic_inc_not_zero(&NFS_I(dir)->silly_count) == 0) { @@ -210,21 +210,21 @@ out_free: void nfs_wait_on_sillyrename(struct dentry *dentry) { - struct nfs_inode *nfsi = NFS_I(dentry->d_inode); + struct nfs_inode *nfsi = NFS_I(d_inode(dentry)); wait_event(nfsi->waitqueue, atomic_read(&nfsi->silly_count) <= 1); } void nfs_block_sillyrename(struct dentry *dentry) { - struct nfs_inode *nfsi = NFS_I(dentry->d_inode); + struct nfs_inode *nfsi = NFS_I(d_inode(dentry)); wait_event(nfsi->waitqueue, atomic_cmpxchg(&nfsi->silly_count, 1, 0) == 1); } void nfs_unblock_sillyrename(struct dentry *dentry) { - struct inode *dir = dentry->d_inode; + struct inode *dir = d_inode(dentry); struct nfs_inode *nfsi = NFS_I(dir); struct nfs_unlinkdata *data; @@ -367,8 +367,8 @@ static void nfs_async_rename_release(void *calldata) struct nfs_renamedata *data = calldata; struct super_block *sb = data->old_dir->i_sb; - if (data->old_dentry->d_inode) - nfs_mark_for_revalidate(data->old_dentry->d_inode); + if (d_really_is_positive(data->old_dentry)) + nfs_mark_for_revalidate(d_inode(data->old_dentry)); dput(data->old_dentry); dput(data->new_dentry); @@ -529,10 +529,10 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry) if (dentry->d_flags & DCACHE_NFSFS_RENAMED) goto out; - fileid = NFS_FILEID(dentry->d_inode); + fileid = NFS_FILEID(d_inode(dentry)); /* Return delegation in anticipation of the rename */ - NFS_PROTO(dentry->d_inode)->return_delegation(dentry->d_inode); + NFS_PROTO(d_inode(dentry))->return_delegation(d_inode(dentry)); sdentry = NULL; do { @@ -554,7 +554,7 @@ nfs_sillyrename(struct inode *dir, struct dentry *dentry) */ if (IS_ERR(sdentry)) goto out; - } while (sdentry->d_inode != NULL); /* need negative lookup */ + } while (d_inode(sdentry) != NULL); /* need negative lookup */ /* queue unlink first. 
Can't do this from rpc_release as it * has to allocate memory diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 849ed784d6ac..75a35a1afa79 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -580,7 +580,7 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, st int ret; nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); - nfs_inc_stats(inode, NFSIOS_WRITEPAGES); + nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1); nfs_pageio_cond_complete(pgio, page_file_index(page)); ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE); @@ -702,7 +702,7 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) */ static void nfs_inode_remove_request(struct nfs_page *req) { - struct inode *inode = req->wb_context->dentry->d_inode; + struct inode *inode = d_inode(req->wb_context->dentry); struct nfs_inode *nfsi = NFS_I(inode); struct nfs_page *head; @@ -853,7 +853,8 @@ static void nfs_clear_page_commit(struct page *page) { dec_zone_page_state(page, NR_UNSTABLE_NFS); - dec_bdi_stat(inode_to_bdi(page_file_mapping(page)->host), BDI_RECLAIMABLE); + dec_wb_stat(&inode_to_bdi(page_file_mapping(page)->host)->wb, + WB_RECLAIMABLE); } /* Called holding inode (/cinfo) lock */ @@ -861,7 +862,7 @@ static void nfs_clear_request_commit(struct nfs_page *req) { if (test_bit(PG_CLEAN, &req->wb_flags)) { - struct inode *inode = req->wb_context->dentry->d_inode; + struct inode *inode = d_inode(req->wb_context->dentry); struct nfs_commit_info cinfo; nfs_init_cinfo_from_inode(&cinfo, inode); @@ -1289,6 +1290,7 @@ static void nfs_initiate_write(struct nfs_pgio_header *hdr, static void nfs_redirty_request(struct nfs_page *req) { nfs_mark_request_dirty(req); + set_bit(NFS_CONTEXT_RESEND_WRITES, &req->wb_context->flags); nfs_unlock_request(req); nfs_end_page_writeback(req); nfs_release_request(req); @@ -1347,11 +1349,6 @@ void nfs_commit_prepare(struct rpc_task *task, void *calldata) NFS_PROTO(data->inode)->commit_rpc_prepare(task, data); } -static void nfs_writeback_release_common(struct nfs_pgio_header *hdr) -{ - /* do nothing! */ -} - /* * Special version of should_remove_suid() that ignores capabilities. */ @@ -1382,24 +1379,27 @@ static void nfs_writeback_check_extend(struct nfs_pgio_header *hdr, { struct nfs_pgio_args *argp = &hdr->args; struct nfs_pgio_res *resp = &hdr->res; + u64 size = argp->offset + resp->count; if (!(fattr->valid & NFS_ATTR_FATTR_SIZE)) + fattr->size = size; + if (nfs_size_to_loff_t(fattr->size) < i_size_read(hdr->inode)) { + fattr->valid &= ~NFS_ATTR_FATTR_SIZE; return; - if (argp->offset + resp->count != fattr->size) - return; - if (nfs_size_to_loff_t(fattr->size) < i_size_read(hdr->inode)) + } + if (size != fattr->size) return; /* Set attribute barrier */ nfs_fattr_set_barrier(fattr); + /* ...and update size */ + fattr->valid |= NFS_ATTR_FATTR_SIZE; } void nfs_writeback_update_inode(struct nfs_pgio_header *hdr) { - struct nfs_fattr *fattr = hdr->res.fattr; + struct nfs_fattr *fattr = &hdr->fattr; struct inode *inode = hdr->inode; - if (fattr == NULL) - return; spin_lock(&inode->i_lock); nfs_writeback_check_extend(hdr, fattr); nfs_post_op_update_inode_force_wcc_locked(inode, fattr); @@ -1555,7 +1555,7 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data, /* Set up the initial task struct. 
*/ nfs_ops->commit_setup(data, &msg); - dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); + dprintk("NFS: initiated commit call\n"); nfs4_state_protect(NFS_SERVER(data->inode)->nfs_client, NFS_SP4_MACH_CRED_COMMIT, &task_setup_data.rpc_client, &msg); @@ -1591,7 +1591,7 @@ void nfs_init_commit(struct nfs_commit_data *data, struct nfs_commit_info *cinfo) { struct nfs_page *first = nfs_list_entry(head->next); - struct inode *inode = first->wb_context->dentry->d_inode; + struct inode *inode = d_inode(first->wb_context->dentry); /* Set up the RPC argument and reply structs * NB: take care not to mess about with data->commit et al. */ @@ -1690,7 +1690,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) dprintk("NFS: commit (%s/%llu %d@%lld)", req->wb_context->dentry->d_sb->s_id, - (unsigned long long)NFS_FILEID(req->wb_context->dentry->d_inode), + (unsigned long long)NFS_FILEID(d_inode(req->wb_context->dentry)), req->wb_bytes, (long long)req_offset(req)); if (status < 0) { @@ -1840,18 +1840,20 @@ EXPORT_SYMBOL_GPL(nfs_write_inode); */ int nfs_wb_all(struct inode *inode) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = LONG_MAX, - .range_start = 0, - .range_end = LLONG_MAX, - }; int ret; trace_nfs_writeback_inode_enter(inode); - ret = sync_inode(inode, &wbc); + ret = filemap_write_and_wait(inode->i_mapping); + if (ret) + goto out; + ret = nfs_commit_inode(inode, FLUSH_SYNC); + if (ret < 0) + goto out; + pnfs_sync_inode(inode, true); + ret = 0; +out: trace_nfs_writeback_inode_exit(inode, ret); return ret; } @@ -1876,11 +1878,6 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page) * request from the inode / page_private pointer and * release it */ nfs_inode_remove_request(req); - /* - * In case nfs_inode_remove_request has marked the - * page as being dirty - */ - cancel_dirty_page(page, PAGE_CACHE_SIZE); nfs_unlock_and_release_request(req); } @@ -2015,7 +2012,6 @@ static const struct nfs_rw_ops nfs_rw_write_ops = { .rw_mode = FMODE_WRITE, .rw_alloc_header = nfs_writehdr_alloc, .rw_free_header = nfs_writehdr_free, - .rw_release = nfs_writeback_release_common, .rw_done = nfs_writeback_done, .rw_result = nfs_writeback_result, .rw_initiate = nfs_initiate_write, diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 683bf718aead..a0b77fc1bd39 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -6,6 +6,7 @@ config NFSD select SUNRPC select EXPORTFS select NFS_ACL_SUPPORT if NFSD_V2_ACL + depends on MULTIUSER help Choose Y here if you want to allow other computers to access files residing on this system using Sun's Network File System @@ -107,7 +108,7 @@ config NFSD_V4_SECURITY_LABEL config NFSD_FAULT_INJECTION bool "NFS server manual fault injection" - depends on NFSD_V4 && DEBUG_KERNEL + depends on NFSD_V4 && DEBUG_KERNEL && DEBUG_FS help This option enables support for manually injecting faults into the NFS server. This is intended to be used for diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 03d647bf195d..cdefaa331a07 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -181,6 +181,17 @@ nfsd4_block_proc_layoutcommit(struct inode *inode, } const struct nfsd4_layout_ops bl_layout_ops = { + /* + * Pretend that we send notification to the client. This is a blatant + * lie to force recent Linux clients to cache our device IDs. + * We rarely ever change the device ID, so the harm of leaking deviceids + * for a while isn't too bad. 
Unfortunately RFC5661 is a complete mess + * in this regard, but I filed errata 4119 for this a while ago, and + * hopefully the Linux client will eventually start caching deviceids + * without this again. + */ + .notify_types = + NOTIFY_DEVICEID4_DELETE | NOTIFY_DEVICEID4_CHANGE, .proc_getdeviceinfo = nfsd4_block_proc_getdeviceinfo, .encode_getdeviceinfo = nfsd4_block_encode_getdeviceinfo, .proc_layoutget = nfsd4_block_proc_layoutget, diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index c3e3b6e55ae2..f79521a59747 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -599,7 +599,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) goto out4; } - err = check_export(exp.ex_path.dentry->d_inode, &exp.ex_flags, + err = check_export(d_inode(exp.ex_path.dentry), &exp.ex_flags, exp.ex_uuid); if (err) goto out4; @@ -691,8 +691,7 @@ static int svc_export_match(struct cache_head *a, struct cache_head *b) struct svc_export *orig = container_of(a, struct svc_export, h); struct svc_export *new = container_of(b, struct svc_export, h); return orig->ex_client == new->ex_client && - orig->ex_path.dentry == new->ex_path.dentry && - orig->ex_path.mnt == new->ex_path.mnt; + path_equal(&orig->ex_path, &new->ex_path); } static void svc_export_init(struct cache_head *cnew, struct cache_head *citem) @@ -891,7 +890,7 @@ exp_rootfh(struct net *net, struct auth_domain *clp, char *name, printk("nfsd: exp_rootfh path not found %s", name); return err; } - inode = path.dentry->d_inode; + inode = d_inode(path.dentry); dprintk("nfsd: exp_rootfh(%s [%p] %s:%s/%ld)\n", name, path.dentry, clp->name, @@ -1159,6 +1158,7 @@ static struct flags { { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}}, { NFSEXP_NOAUTHNLM, {"insecure_locks", ""}}, { NFSEXP_V4ROOT, {"v4root", ""}}, + { NFSEXP_PNFS, {"pnfs", ""}}, { 0, {"", ""}} }; diff --git a/fs/nfsd/nfs2acl.c b/fs/nfsd/nfs2acl.c index ac54ea60b3f6..d54701f6dc78 100644 --- a/fs/nfsd/nfs2acl.c +++ b/fs/nfsd/nfs2acl.c @@ -42,7 +42,7 @@ static __be32 nfsacld_proc_getacl(struct svc_rqst * rqstp, if (nfserr) RETURN_STATUS(nfserr); - inode = fh->fh_dentry->d_inode; + inode = d_inode(fh->fh_dentry); if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) RETURN_STATUS(nfserr_inval); @@ -103,7 +103,7 @@ static __be32 nfsacld_proc_setacl(struct svc_rqst * rqstp, if (nfserr) goto out; - inode = fh->fh_dentry->d_inode; + inode = d_inode(fh->fh_dentry); if (!IS_POSIXACL(inode) || !inode->i_op->set_acl) { error = -EOPNOTSUPP; goto out_errno; @@ -266,9 +266,9 @@ static int nfsaclsvc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p, * nfsd_dispatch actually ensures the following cannot happen. * However, it seems fragile to depend on that. 
*/ - if (dentry == NULL || dentry->d_inode == NULL) + if (dentry == NULL || d_really_is_negative(dentry)) return 0; - inode = dentry->d_inode; + inode = d_inode(dentry); p = nfs2svc_encode_fattr(rqstp, p, &resp->fh, &resp->stat); *p++ = htonl(resp->mask); diff --git a/fs/nfsd/nfs3acl.c b/fs/nfsd/nfs3acl.c index 34cbbab6abd7..882b1a14bc3e 100644 --- a/fs/nfsd/nfs3acl.c +++ b/fs/nfsd/nfs3acl.c @@ -39,7 +39,7 @@ static __be32 nfsd3_proc_getacl(struct svc_rqst * rqstp, if (nfserr) RETURN_STATUS(nfserr); - inode = fh->fh_dentry->d_inode; + inode = d_inode(fh->fh_dentry); if (argp->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) RETURN_STATUS(nfserr_inval); @@ -94,7 +94,7 @@ static __be32 nfsd3_proc_setacl(struct svc_rqst * rqstp, if (nfserr) goto out; - inode = fh->fh_dentry->d_inode; + inode = d_inode(fh->fh_dentry); if (!IS_POSIXACL(inode) || !inode->i_op->set_acl) { error = -EOPNOTSUPP; goto out_errno; @@ -174,8 +174,8 @@ static int nfs3svc_encode_getaclres(struct svc_rqst *rqstp, __be32 *p, struct dentry *dentry = resp->fh.fh_dentry; p = nfs3svc_encode_post_op_attr(rqstp, p, &resp->fh); - if (resp->status == 0 && dentry && dentry->d_inode) { - struct inode *inode = dentry->d_inode; + if (resp->status == 0 && dentry && d_really_is_positive(dentry)) { + struct inode *inode = d_inode(dentry); struct kvec *head = rqstp->rq_res.head; unsigned int base; int n; diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 12f2aab4f614..7b755b7f785c 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -166,7 +166,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp, rqstp->rq_vec, argp->vlen, &resp->count); if (nfserr == 0) { - struct inode *inode = resp->fh.fh_dentry->d_inode; + struct inode *inode = d_inode(resp->fh.fh_dentry); resp->eof = (argp->offset + resp->count) >= inode->i_size; } @@ -551,7 +551,7 @@ nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle *argp, * different read/write sizes for file systems known to have * problems with large blocks */ if (nfserr == 0) { - struct super_block *sb = argp->fh.fh_dentry->d_inode->i_sb; + struct super_block *sb = d_inode(argp->fh.fh_dentry)->i_sb; /* Note that we don't care for remote fs's here */ if (sb->s_magic == MSDOS_SUPER_MAGIC) { @@ -587,7 +587,7 @@ nfsd3_proc_pathconf(struct svc_rqst * rqstp, struct nfsd_fhandle *argp, nfserr = fh_verify(rqstp, &argp->fh, 0, NFSD_MAY_NOP); if (nfserr == 0) { - struct super_block *sb = argp->fh.fh_dentry->d_inode->i_sb; + struct super_block *sb = d_inode(argp->fh.fh_dentry)->i_sb; /* Note that we don't care for remote fs's here */ switch (sb->s_magic) { diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 39c5eb3ad33a..f6e7cbabac5a 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -146,7 +146,7 @@ static __be32 *encode_fsid(__be32 *p, struct svc_fh *fhp) default: case FSIDSOURCE_DEV: p = xdr_encode_hyper(p, (u64)huge_encode_dev - (fhp->fh_dentry->d_inode->i_sb->s_dev)); + (d_inode(fhp->fh_dentry)->i_sb->s_dev)); break; case FSIDSOURCE_FSID: p = xdr_encode_hyper(p, (u64) fhp->fh_export->ex_fsid); @@ -203,14 +203,14 @@ static __be32 * encode_post_op_attr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) { struct dentry *dentry = fhp->fh_dentry; - if (dentry && dentry->d_inode) { + if (dentry && d_really_is_positive(dentry)) { __be32 err; struct kstat stat; err = fh_getattr(fhp, &stat); if (!err) { *p++ = xdr_one; /* attributes follow */ - lease_get_mtime(dentry->d_inode, &stat.mtime); + lease_get_mtime(d_inode(dentry), &stat.mtime); return 
encode_fattr3(rqstp, p, fhp, &stat); } } @@ -233,7 +233,7 @@ encode_wcc_data(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp) { struct dentry *dentry = fhp->fh_dentry; - if (dentry && dentry->d_inode && fhp->fh_post_saved) { + if (dentry && d_really_is_positive(dentry) && fhp->fh_post_saved) { if (fhp->fh_pre_saved) { *p++ = xdr_one; p = xdr_encode_hyper(p, (u64) fhp->fh_pre_size); @@ -260,11 +260,11 @@ void fill_post_wcc(struct svc_fh *fhp) printk("nfsd: inode locked twice during operation.\n"); err = fh_getattr(fhp, &fhp->fh_post_attr); - fhp->fh_post_change = fhp->fh_dentry->d_inode->i_version; + fhp->fh_post_change = d_inode(fhp->fh_dentry)->i_version; if (err) { fhp->fh_post_saved = 0; /* Grab the ctime anyway - set_change_info might use it */ - fhp->fh_post_attr.ctime = fhp->fh_dentry->d_inode->i_ctime; + fhp->fh_post_attr.ctime = d_inode(fhp->fh_dentry)->i_ctime; } else fhp->fh_post_saved = 1; } @@ -628,7 +628,7 @@ nfs3svc_encode_attrstat(struct svc_rqst *rqstp, __be32 *p, struct nfsd3_attrstat *resp) { if (resp->status == 0) { - lease_get_mtime(resp->fh.fh_dentry->d_inode, + lease_get_mtime(d_inode(resp->fh.fh_dentry), &resp->stat.mtime); p = encode_fattr3(rqstp, p, &resp->fh, &resp->stat); } @@ -805,7 +805,7 @@ encode_entry_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, static __be32 compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp, - const char *name, int namlen) + const char *name, int namlen, u64 ino) { struct svc_export *exp; struct dentry *dparent, *dchild; @@ -828,7 +828,9 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp, return rv; if (d_mountpoint(dchild)) goto out; - if (!dchild->d_inode) + if (d_really_is_negative(dchild)) + goto out; + if (dchild->d_inode->i_ino != ino) goto out; rv = fh_compose(fhp, exp, dchild, &cd->fh); out: @@ -836,13 +838,13 @@ out: return rv; } -static __be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen) +static __be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen, u64 ino) { struct svc_fh *fh = &cd->scratch; __be32 err; fh_init(fh, NFS3_FHSIZE); - err = compose_entry_fh(cd, fh, name, namlen); + err = compose_entry_fh(cd, fh, name, namlen, ino); if (err) { *p++ = 0; *p++ = 0; @@ -927,7 +929,7 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen, p = encode_entry_baggage(cd, p, name, namlen, ino); if (plus) - p = encode_entryplus_baggage(cd, p, name, namlen); + p = encode_entryplus_baggage(cd, p, name, namlen, ino); num_entry_words = p - cd->buffer; } else if (*(page+1) != NULL) { /* temporarily encode entry into next page, then move back to @@ -941,7 +943,7 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen, p1 = encode_entry_baggage(cd, p1, name, namlen, ino); if (plus) - p1 = encode_entryplus_baggage(cd, p1, name, namlen); + p1 = encode_entryplus_baggage(cd, p1, name, namlen, ino); /* determine entry word length and lengths to go in pages */ num_entry_words = p1 - tmp; diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index 59fd76651781..eb5accf1b37f 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -52,10 +52,6 @@ #define NFS4_ANYONE_MODE (NFS4_ACE_READ_ATTRIBUTES | NFS4_ACE_READ_ACL | NFS4_ACE_SYNCHRONIZE) #define NFS4_OWNER_MODE (NFS4_ACE_WRITE_ATTRIBUTES | NFS4_ACE_WRITE_ACL) -/* We don't support these bits; insist they be neither allowed nor denied */ -#define NFS4_MASK_UNSUPP (NFS4_ACE_DELETE | NFS4_ACE_WRITE_OWNER \ - | 
NFS4_ACE_READ_NAMED_ATTRS | NFS4_ACE_WRITE_NAMED_ATTRS) - /* flags used to simulate posix default ACLs */ #define NFS4_INHERITANCE_FLAGS (NFS4_ACE_FILE_INHERIT_ACE \ | NFS4_ACE_DIRECTORY_INHERIT_ACE) @@ -64,9 +60,6 @@ | NFS4_ACE_INHERIT_ONLY_ACE \ | NFS4_ACE_IDENTIFIER_GROUP) -#define MASK_EQUAL(mask1, mask2) \ - ( ((mask1) & NFS4_ACE_MASK_ALL) == ((mask2) & NFS4_ACE_MASK_ALL) ) - static u32 mask_from_posix(unsigned short perm, unsigned int flags) { @@ -126,11 +119,6 @@ low_mode_from_nfs4(u32 perm, unsigned short *mode, unsigned int flags) *mode |= ACL_EXECUTE; } -struct ace_container { - struct nfs4_ace *ace; - struct list_head ace_l; -}; - static short ace2type(struct nfs4_ace *); static void _posix_to_nfsv4_one(struct posix_acl *, struct nfs4_acl *, unsigned int); @@ -139,7 +127,7 @@ int nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_acl **acl) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int error = 0; struct posix_acl *pacl = NULL, *dpacl = NULL; unsigned int flags = 0; @@ -384,7 +372,6 @@ pace_gt(struct posix_acl_entry *pace1, struct posix_acl_entry *pace2) static void sort_pacl_range(struct posix_acl *pacl, int start, int end) { int sorted = 0, i; - struct posix_acl_entry tmp; /* We just do a bubble sort; easy to do in place, and we're not * expecting acl's to be long enough to justify anything more. */ @@ -394,9 +381,8 @@ sort_pacl_range(struct posix_acl *pacl, int start, int end) { if (pace_gt(&pacl->a_entries[i], &pacl->a_entries[i+1])) { sorted = 0; - tmp = pacl->a_entries[i]; - pacl->a_entries[i] = pacl->a_entries[i+1]; - pacl->a_entries[i+1] = tmp; + swap(pacl->a_entries[i], + pacl->a_entries[i + 1]); } } } @@ -499,43 +485,13 @@ static inline void add_to_mask(struct posix_acl_state *state, struct posix_ace_s state->mask.allow |= astate->allow; } -/* - * Certain bits (SYNCHRONIZE, DELETE, WRITE_OWNER, READ/WRITE_NAMED_ATTRS, - * READ_ATTRIBUTES, READ_ACL) are currently unenforceable and don't translate - * to traditional read/write/execute permissions. - * - * It's problematic to reject acls that use certain mode bits, because it - * places the burden on users to learn the rules about which bits one - * particular server sets, without giving the user a lot of help--we return an - * error that could mean any number of different things. To make matters - * worse, the problematic bits might be introduced by some application that's - * automatically mapping from some other acl model. - * - * So wherever possible we accept anything, possibly erring on the side of - * denying more permissions than necessary. 
- * - * However we do reject *explicit* DENY's of a few bits representing - * permissions we could never deny: - */ - -static inline int check_deny(u32 mask, int isowner) -{ - if (mask & (NFS4_ACE_READ_ATTRIBUTES | NFS4_ACE_READ_ACL)) - return -EINVAL; - if (!isowner) - return 0; - if (mask & (NFS4_ACE_WRITE_ATTRIBUTES | NFS4_ACE_WRITE_ACL)) - return -EINVAL; - return 0; -} - static struct posix_acl * posix_state_to_acl(struct posix_acl_state *state, unsigned int flags) { struct posix_acl_entry *pace; struct posix_acl *pacl; int nace; - int i, error = 0; + int i; /* * ACLs with no ACEs are treated differently in the inheritable @@ -560,17 +516,11 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags) pace = pacl->a_entries; pace->e_tag = ACL_USER_OBJ; - error = check_deny(state->owner.deny, 1); - if (error) - goto out_err; low_mode_from_nfs4(state->owner.allow, &pace->e_perm, flags); for (i=0; i < state->users->n; i++) { pace++; pace->e_tag = ACL_USER; - error = check_deny(state->users->aces[i].perms.deny, 0); - if (error) - goto out_err; low_mode_from_nfs4(state->users->aces[i].perms.allow, &pace->e_perm, flags); pace->e_uid = state->users->aces[i].uid; @@ -579,18 +529,12 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags) pace++; pace->e_tag = ACL_GROUP_OBJ; - error = check_deny(state->group.deny, 0); - if (error) - goto out_err; low_mode_from_nfs4(state->group.allow, &pace->e_perm, flags); add_to_mask(state, &state->group); for (i=0; i < state->groups->n; i++) { pace++; pace->e_tag = ACL_GROUP; - error = check_deny(state->groups->aces[i].perms.deny, 0); - if (error) - goto out_err; low_mode_from_nfs4(state->groups->aces[i].perms.allow, &pace->e_perm, flags); pace->e_gid = state->groups->aces[i].gid; @@ -605,15 +549,9 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags) pace++; pace->e_tag = ACL_OTHER; - error = check_deny(state->other.deny, 0); - if (error) - goto out_err; low_mode_from_nfs4(state->other.allow, &pace->e_perm, flags); return pacl; -out_err: - posix_acl_release(pacl); - return ERR_PTR(error); } static inline void allow_bits(struct posix_ace_state *astate, u32 mask) @@ -828,7 +766,7 @@ nfsd4_set_nfs4_acl(struct svc_rqst *rqstp, struct svc_fh *fhp, return error; dentry = fhp->fh_dentry; - inode = dentry->d_inode; + inode = d_inode(dentry); if (!inode->i_op->set_acl || !IS_POSIXACL(inode)) return nfserr_attrnotsupp; diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 58277859a467..a49201835a97 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -224,7 +224,7 @@ static int nfs_cb_stat_to_errno(int status) } static int decode_cb_op_status(struct xdr_stream *xdr, enum nfs_opnum4 expected, - enum nfsstat4 *status) + int *status) { __be32 *p; u32 op; @@ -235,7 +235,7 @@ static int decode_cb_op_status(struct xdr_stream *xdr, enum nfs_opnum4 expected, op = be32_to_cpup(p++); if (unlikely(op != expected)) goto out_unexpected; - *status = be32_to_cpup(p); + *status = nfs_cb_stat_to_errno(be32_to_cpup(p)); return 0; out_overflow: print_overflow_msg(__func__, xdr); @@ -446,22 +446,17 @@ out_overflow: static int decode_cb_sequence4res(struct xdr_stream *xdr, struct nfsd4_callback *cb) { - enum nfsstat4 nfserr; int status; if (cb->cb_minorversion == 0) return 0; - status = decode_cb_op_status(xdr, OP_CB_SEQUENCE, &nfserr); - if (unlikely(status)) - goto out; - if (unlikely(nfserr != NFS4_OK)) - goto out_default; - status = decode_cb_sequence4resok(xdr, cb); -out: - return status; -out_default: - 
return nfs_cb_stat_to_errno(nfserr); + status = decode_cb_op_status(xdr, OP_CB_SEQUENCE, &cb->cb_status); + if (unlikely(status || cb->cb_status)) + return status; + + cb->cb_update_seq_nr = true; + return decode_cb_sequence4resok(xdr, cb); } /* @@ -524,26 +519,19 @@ static int nfs4_xdr_dec_cb_recall(struct rpc_rqst *rqstp, struct nfsd4_callback *cb) { struct nfs4_cb_compound_hdr hdr; - enum nfsstat4 nfserr; int status; status = decode_cb_compound4res(xdr, &hdr); if (unlikely(status)) - goto out; + return status; if (cb != NULL) { status = decode_cb_sequence4res(xdr, cb); - if (unlikely(status)) - goto out; + if (unlikely(status || cb->cb_status)) + return status; } - status = decode_cb_op_status(xdr, OP_CB_RECALL, &nfserr); - if (unlikely(status)) - goto out; - if (unlikely(nfserr != NFS4_OK)) - status = nfs_cb_stat_to_errno(nfserr); -out: - return status; + return decode_cb_op_status(xdr, OP_CB_RECALL, &cb->cb_status); } #ifdef CONFIG_NFSD_PNFS @@ -621,24 +609,18 @@ static int nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp, struct nfsd4_callback *cb) { struct nfs4_cb_compound_hdr hdr; - enum nfsstat4 nfserr; int status; status = decode_cb_compound4res(xdr, &hdr); if (unlikely(status)) - goto out; + return status; + if (cb) { status = decode_cb_sequence4res(xdr, cb); - if (unlikely(status)) - goto out; + if (unlikely(status || cb->cb_status)) + return status; } - status = decode_cb_op_status(xdr, OP_CB_LAYOUTRECALL, &nfserr); - if (unlikely(status)) - goto out; - if (unlikely(nfserr != NFS4_OK)) - status = nfs_cb_stat_to_errno(nfserr); -out: - return status; + return decode_cb_op_status(xdr, OP_CB_LAYOUTRECALL, &cb->cb_status); } #endif /* CONFIG_NFSD_PNFS */ @@ -894,17 +876,12 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) u32 minorversion = clp->cl_minorversion; cb->cb_minorversion = minorversion; + cb->cb_update_seq_nr = false; + cb->cb_status = 0; if (minorversion) { if (!nfsd41_cb_get_slot(clp, task)) return; } - spin_lock(&clp->cl_lock); - if (list_empty(&cb->cb_per_client)) { - /* This is the first call, not a restart */ - cb->cb_done = false; - list_add(&cb->cb_per_client, &clp->cl_callbacks); - } - spin_unlock(&clp->cl_lock); rpc_call_start(task); } @@ -917,23 +894,41 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata) clp->cl_minorversion); if (clp->cl_minorversion) { - /* No need for lock, access serialized in nfsd4_cb_prepare */ - ++clp->cl_cb_session->se_cb_seq_nr; + /* + * No need for lock, access serialized in nfsd4_cb_prepare + * + * RFC5661 20.9.3 + * If CB_SEQUENCE returns an error, then the state of the slot + * (sequence ID, cached reply) MUST NOT change. + */ + if (cb->cb_update_seq_nr) + ++clp->cl_cb_session->se_cb_seq_nr; + clear_bit(0, &clp->cl_cb_slot_busy); rpc_wake_up_next(&clp->cl_cb_waitq); dprintk("%s: freed slot, new seqid=%d\n", __func__, clp->cl_cb_session->se_cb_seq_nr); } - if (clp->cl_cb_client != task->tk_client) { - /* We're shutting down or changing cl_cb_client; leave - * it to nfsd4_process_cb_update to restart the call if - * necessary. */ + /* + * If the backchannel connection was shut down while this + * task was queued, we need to resubmit it after setting up + * a new backchannel connection. + * + * Note that if we lost our callback connection permanently + * the submission code will error out, so we don't need to + * handle that case here. 
+ */ + if (task->tk_flags & RPC_TASK_KILLED) { + task->tk_status = 0; + cb->cb_need_restart = true; return; } - if (cb->cb_done) - return; + if (cb->cb_status) { + WARN_ON_ONCE(task->tk_status); + task->tk_status = cb->cb_status; + } switch (cb->cb_ops->done(cb, task)) { case 0: @@ -949,21 +944,17 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata) default: BUG(); } - cb->cb_done = true; } static void nfsd4_cb_release(void *calldata) { struct nfsd4_callback *cb = calldata; - struct nfs4_client *clp = cb->cb_clp; - - if (cb->cb_done) { - spin_lock(&clp->cl_lock); - list_del(&cb->cb_per_client); - spin_unlock(&clp->cl_lock); + if (cb->cb_need_restart) + nfsd4_run_cb(cb); + else cb->cb_ops->release(cb); - } + } static const struct rpc_call_ops nfsd4_cb_ops = { @@ -1058,9 +1049,6 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb) nfsd4_mark_cb_down(clp, err); return; } - /* Yay, the callback channel's back! Restart any callbacks: */ - list_for_each_entry(cb, &clp->cl_callbacks, cb_per_client) - queue_work(callback_wq, &cb->cb_work); } static void @@ -1071,8 +1059,12 @@ nfsd4_run_cb_work(struct work_struct *work) struct nfs4_client *clp = cb->cb_clp; struct rpc_clnt *clnt; - if (cb->cb_ops && cb->cb_ops->prepare) - cb->cb_ops->prepare(cb); + if (cb->cb_need_restart) { + cb->cb_need_restart = false; + } else { + if (cb->cb_ops && cb->cb_ops->prepare) + cb->cb_ops->prepare(cb); + } if (clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK) nfsd4_process_cb_update(cb); @@ -1084,6 +1076,15 @@ nfsd4_run_cb_work(struct work_struct *work) cb->cb_ops->release(cb); return; } + + /* + * Don't send probe messages for 4.1 or later. + */ + if (!cb->cb_ops && clp->cl_minorversion) { + clp->cl_cb_state = NFSD4_CB_UP; + return; + } + cb->cb_msg.rpc_cred = clp->cl_cb_cred; rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN, cb->cb_ops ? &nfsd4_cb_ops : &nfsd4_cb_probe_ops, cb); @@ -1098,8 +1099,9 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, cb->cb_msg.rpc_resp = cb; cb->cb_ops = ops; INIT_WORK(&cb->cb_work, nfsd4_run_cb_work); - INIT_LIST_HEAD(&cb->cb_per_client); - cb->cb_done = true; + cb->cb_status = 0; + cb->cb_update_seq_nr = false; + cb->cb_need_restart = false; } void nfsd4_run_cb(struct nfsd4_callback *cb) diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index 6904213a4363..ebf90e487c75 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -212,6 +212,7 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, BUG_ON(!ls->ls_file); if (nfsd4_layout_setlease(ls)) { + fput(ls->ls_file); put_nfs4_file(fp); kmem_cache_free(nfs4_layout_stateid_cache, ls); return NULL; diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 92b9d97aff4f..90cfda75313c 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -52,7 +52,7 @@ static inline void nfsd4_security_inode_setsecctx(struct svc_fh *resfh, struct xdr_netobj *label, u32 *bmval) { - struct inode *inode = resfh->fh_dentry->d_inode; + struct inode *inode = d_inode(resfh->fh_dentry); int status; mutex_lock(&inode->i_mutex); @@ -110,7 +110,7 @@ check_attr_support(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * in current environment or not. 
*/ if (bmval[0] & FATTR4_WORD0_ACL) { - if (!IS_POSIXACL(dentry->d_inode)) + if (!IS_POSIXACL(d_inode(dentry))) return nfserr_attrnotsupp; } @@ -209,7 +209,7 @@ do_open_permission(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfs static __be32 nfsd_check_obj_isreg(struct svc_fh *fh) { - umode_t mode = fh->fh_dentry->d_inode->i_mode; + umode_t mode = d_inode(fh->fh_dentry)->i_mode; if (S_ISREG(mode)) return nfs_ok; @@ -470,7 +470,7 @@ out: fh_put(resfh); kfree(resfh); } - nfsd4_cleanup_open_state(cstate, open, status); + nfsd4_cleanup_open_state(cstate, open); nfsd4_bump_seqid(cstate, status); return status; } @@ -760,8 +760,6 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { __be32 status; - /* no need to check permission - this will be done in nfsd_read() */ - read->rd_filp = NULL; if (read->rd_offset >= OFFSET_MAX) return nfserr_inval; @@ -778,9 +776,9 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags); /* check stateid */ - if ((status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), - cstate, &read->rd_stateid, - RD_STATE, &read->rd_filp))) { + status = nfs4_preprocess_stateid_op(rqstp, cstate, &read->rd_stateid, + RD_STATE, &read->rd_filp, &read->rd_tmp_file); + if (status) { dprintk("NFSD: nfsd4_read: couldn't process stateid!\n"); goto out; } @@ -881,7 +879,7 @@ nfsd4_secinfo(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, &exp, &dentry); if (err) return err; - if (dentry->d_inode == NULL) { + if (d_really_is_negative(dentry)) { exp_put(exp); err = nfserr_noent; } else @@ -924,8 +922,8 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, int err; if (setattr->sa_iattr.ia_valid & ATTR_SIZE) { - status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), cstate, - &setattr->sa_stateid, WR_STATE, NULL); + status = nfs4_preprocess_stateid_op(rqstp, cstate, + &setattr->sa_stateid, WR_STATE, NULL, NULL); if (status) { dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n"); return status; @@ -986,13 +984,11 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, unsigned long cnt; int nvecs; - /* no need to check permission - this will be done in nfsd_write() */ - if (write->wr_offset >= OFFSET_MAX) return nfserr_inval; - status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), - cstate, stateid, WR_STATE, &filp); + status = nfs4_preprocess_stateid_op(rqstp, cstate, stateid, WR_STATE, + &filp, NULL); if (status) { dprintk("NFSD: nfsd4_write: couldn't process stateid!\n"); return status; @@ -1005,11 +1001,10 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nvecs = fill_in_write_vector(rqstp->rq_vec, write); WARN_ON_ONCE(nvecs > ARRAY_SIZE(rqstp->rq_vec)); - status = nfsd_write(rqstp, &cstate->current_fh, filp, - write->wr_offset, rqstp->rq_vec, nvecs, - &cnt, &write->wr_how_written); - if (filp) - fput(filp); + status = nfsd_vfs_write(rqstp, &cstate->current_fh, filp, + write->wr_offset, rqstp->rq_vec, nvecs, &cnt, + &write->wr_how_written); + fput(filp); write->wr_bytes_written = cnt; @@ -1023,9 +1018,9 @@ nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, __be32 status = nfserr_notsupp; struct file *file; - status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), cstate, + status = nfs4_preprocess_stateid_op(rqstp, cstate, &fallocate->falloc_stateid, - WR_STATE, &file); + WR_STATE, &file, NULL); if (status != nfs_ok) { dprintk("NFSD: nfsd4_fallocate: couldn't process stateid!\n"); return 
status; @@ -1062,9 +1057,9 @@ nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, __be32 status; struct file *file; - status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), cstate, + status = nfs4_preprocess_stateid_op(rqstp, cstate, &seek->seek_stateid, - RD_STATE, &file); + RD_STATE, &file, NULL); if (status) { dprintk("NFSD: nfsd4_seek: couldn't process stateid!\n"); return status; @@ -1308,7 +1303,7 @@ nfsd4_layoutget(struct svc_rqst *rqstp, if (atomic_read(&ls->ls_stid.sc_file->fi_lo_recalls)) goto out_put_stid; - nfserr = ops->proc_layoutget(current_fh->fh_dentry->d_inode, + nfserr = ops->proc_layoutget(d_inode(current_fh->fh_dentry), current_fh, lgp); if (nfserr) goto out_put_stid; @@ -1342,7 +1337,7 @@ nfsd4_layoutcommit(struct svc_rqst *rqstp, ops = nfsd4_layout_verify(current_fh->fh_export, lcp->lc_layout_type); if (!ops) goto out; - inode = current_fh->fh_dentry->d_inode; + inode = d_inode(current_fh->fh_dentry); nfserr = nfserr_inval; if (new_size <= seg->offset) { @@ -1728,10 +1723,6 @@ encode_op: be32_to_cpu(status)); nfsd4_cstate_clear_replay(cstate); - /* XXX Ugh, we need to get rid of this kind of special case: */ - if (op->opnum == OP_READ && op->u.read.rd_filp) - fput(op->u.read.rd_filp); - nfsd4_increment_op_stats(op->opnum); } @@ -1815,7 +1806,7 @@ static inline u32 nfsd4_getattr_rsize(struct svc_rqst *rqstp, bmap0 &= ~FATTR4_WORD0_FILEHANDLE; } if (bmap2 & FATTR4_WORD2_SECURITY_LABEL) { - ret += NFSD4_MAX_SEC_LABEL_LEN + 12; + ret += NFS4_MAXLABELLEN + 12; bmap2 &= ~FATTR4_WORD2_SECURITY_LABEL; } /* @@ -2282,13 +2273,13 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_func = (nfsd4op_func)nfsd4_allocate, .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, .op_name = "OP_ALLOCATE", - .op_rsize_bop = (nfsd4op_rsize)nfsd4_write_rsize, + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_DEALLOCATE] = { .op_func = (nfsd4op_func)nfsd4_deallocate, .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, .op_name = "OP_DEALLOCATE", - .op_rsize_bop = (nfsd4op_rsize)nfsd4_write_rsize, + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, [OP_SEEK] = { .op_func = (nfsd4op_func)nfsd4_seek, diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 1c307f02baa8..d88ea7b9a85c 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -192,14 +192,14 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) dir = nn->rec_file->f_path.dentry; /* lock the parent */ - mutex_lock(&dir->d_inode->i_mutex); + mutex_lock(&d_inode(dir)->i_mutex); dentry = lookup_one_len(dname, dir, HEXDIR_LEN-1); if (IS_ERR(dentry)) { status = PTR_ERR(dentry); goto out_unlock; } - if (dentry->d_inode) + if (d_really_is_positive(dentry)) /* * In the 4.1 case, where we're called from * reclaim_complete(), records from the previous reboot @@ -209,11 +209,11 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) * as well be forgiving and just succeed silently. 
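 *
 * (Editor's note) The i_mutex taken on the parent above is not only
 * for the atomicity of the lookup+mkdir pair: lookup_one_len()
 * requires its caller to hold the parent directory's i_mutex.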
*/ goto out_put; - status = vfs_mkdir(dir->d_inode, dentry, S_IRWXU); + status = vfs_mkdir(d_inode(dir), dentry, S_IRWXU); out_put: dput(dentry); out_unlock: - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); if (status == 0) { if (nn->in_grace) { crp = nfs4_client_to_reclaim(dname, nn); @@ -285,7 +285,7 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn) } status = iterate_dir(nn->rec_file, &ctx.ctx); - mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); while (!list_empty(&ctx.names)) { struct name_list *entry; entry = list_entry(ctx.names.next, struct name_list, list); @@ -302,7 +302,7 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn) list_del(&entry->list); kfree(entry); } - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); nfs4_reset_creds(original_cred); return status; } @@ -316,20 +316,20 @@ nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn) dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name); dir = nn->rec_file->f_path.dentry; - mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_PARENT); + mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); dentry = lookup_one_len(name, dir, namlen); if (IS_ERR(dentry)) { status = PTR_ERR(dentry); goto out_unlock; } status = -ENOENT; - if (!dentry->d_inode) + if (d_really_is_negative(dentry)) goto out; - status = vfs_rmdir(dir->d_inode, dentry); + status = vfs_rmdir(d_inode(dir), dentry); out: dput(dentry); out_unlock: - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); return status; } @@ -385,7 +385,7 @@ purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn) if (nfs4_has_reclaimed_state(child->d_name.name, nn)) return 0; - status = vfs_rmdir(parent->d_inode, child); + status = vfs_rmdir(d_inode(parent), child); if (status) printk("failed to remove client recovery directory %pd\n", child); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 8ba1d888f1e6..95202719a1fd 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -94,6 +94,7 @@ static struct kmem_cache *lockowner_slab; static struct kmem_cache *file_slab; static struct kmem_cache *stateid_slab; static struct kmem_cache *deleg_slab; +static struct kmem_cache *odstate_slab; static void free_session(struct nfsd4_session *); @@ -281,6 +282,7 @@ put_nfs4_file(struct nfs4_file *fi) if (atomic_dec_and_lock(&fi->fi_ref, &state_lock)) { hlist_del_rcu(&fi->fi_hash); spin_unlock(&state_lock); + WARN_ON_ONCE(!list_empty(&fi->fi_clnt_odstate)); WARN_ON_ONCE(!list_empty(&fi->fi_delegations)); call_rcu(&fi->fi_rcu, nfsd4_free_file_rcu); } @@ -471,6 +473,86 @@ static void nfs4_file_put_access(struct nfs4_file *fp, u32 access) __nfs4_file_put_access(fp, O_RDONLY); } +/* + * Allocate a new open/delegation state counter. This is needed for + * pNFS for proper return on close semantics. + * + * Note that we only allocate it for pNFS-enabled exports, otherwise + * all pointers to struct nfs4_clnt_odstate are always NULL. 
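+ *
+ * (Editor's sketch) Assumed lifecycle of an odstate entry, using the
+ * helpers added below:
+ *
+ *	co = alloc_clnt_odstate(clp);            co_odcount = 1
+ *	co = find_or_hash_clnt_odstate(fp, co);  reuses and gets any
+ *	                                         existing (clp, fp) entry
+ *	put_clnt_odstate(co);                    final put unhashes it
+ *	                                         under fp->fi_lock and
+ *	                                         calls nfsd4_return_all_file_layouts()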
+ */ +static struct nfs4_clnt_odstate * +alloc_clnt_odstate(struct nfs4_client *clp) +{ + struct nfs4_clnt_odstate *co; + + co = kmem_cache_zalloc(odstate_slab, GFP_KERNEL); + if (co) { + co->co_client = clp; + atomic_set(&co->co_odcount, 1); + } + return co; +} + +static void +hash_clnt_odstate_locked(struct nfs4_clnt_odstate *co) +{ + struct nfs4_file *fp = co->co_file; + + lockdep_assert_held(&fp->fi_lock); + list_add(&co->co_perfile, &fp->fi_clnt_odstate); +} + +static inline void +get_clnt_odstate(struct nfs4_clnt_odstate *co) +{ + if (co) + atomic_inc(&co->co_odcount); +} + +static void +put_clnt_odstate(struct nfs4_clnt_odstate *co) +{ + struct nfs4_file *fp; + + if (!co) + return; + + fp = co->co_file; + if (atomic_dec_and_lock(&co->co_odcount, &fp->fi_lock)) { + list_del(&co->co_perfile); + spin_unlock(&fp->fi_lock); + + nfsd4_return_all_file_layouts(co->co_client, fp); + kmem_cache_free(odstate_slab, co); + } +} + +static struct nfs4_clnt_odstate * +find_or_hash_clnt_odstate(struct nfs4_file *fp, struct nfs4_clnt_odstate *new) +{ + struct nfs4_clnt_odstate *co; + struct nfs4_client *cl; + + if (!new) + return NULL; + + cl = new->co_client; + + spin_lock(&fp->fi_lock); + list_for_each_entry(co, &fp->fi_clnt_odstate, co_perfile) { + if (co->co_client == cl) { + get_clnt_odstate(co); + goto out; + } + } + co = new; + co->co_file = fp; + hash_clnt_odstate_locked(new); +out: + spin_unlock(&fp->fi_lock); + return co; +} + struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *slab) { @@ -606,7 +688,8 @@ static void block_delegations(struct knfsd_fh *fh) } static struct nfs4_delegation * -alloc_init_deleg(struct nfs4_client *clp, struct svc_fh *current_fh) +alloc_init_deleg(struct nfs4_client *clp, struct svc_fh *current_fh, + struct nfs4_clnt_odstate *odstate) { struct nfs4_delegation *dp; long n; @@ -631,6 +714,8 @@ alloc_init_deleg(struct nfs4_client *clp, struct svc_fh *current_fh) INIT_LIST_HEAD(&dp->dl_perfile); INIT_LIST_HEAD(&dp->dl_perclnt); INIT_LIST_HEAD(&dp->dl_recall_lru); + dp->dl_clnt_odstate = odstate; + get_clnt_odstate(odstate); dp->dl_type = NFS4_OPEN_DELEGATE_READ; dp->dl_retries = 1; nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client, @@ -714,6 +799,7 @@ static void destroy_delegation(struct nfs4_delegation *dp) spin_lock(&state_lock); unhash_delegation_locked(dp); spin_unlock(&state_lock); + put_clnt_odstate(dp->dl_clnt_odstate); nfs4_put_deleg_lease(dp->dl_stid.sc_file); nfs4_put_stid(&dp->dl_stid); } @@ -724,6 +810,7 @@ static void revoke_delegation(struct nfs4_delegation *dp) WARN_ON(!list_empty(&dp->dl_recall_lru)); + put_clnt_odstate(dp->dl_clnt_odstate); nfs4_put_deleg_lease(dp->dl_stid.sc_file); if (clp->cl_minorversion == 0) @@ -933,6 +1020,7 @@ static void nfs4_free_ol_stateid(struct nfs4_stid *stid) { struct nfs4_ol_stateid *stp = openlockstateid(stid); + put_clnt_odstate(stp->st_clnt_odstate); release_all_access(stp); if (stp->st_stateowner) nfs4_put_stateowner(stp->st_stateowner); @@ -1139,7 +1227,7 @@ hash_sessionid(struct nfs4_sessionid *sessionid) return sid->sequence % SESSION_HASH_SIZE; } -#ifdef NFSD_DEBUG +#ifdef CONFIG_SUNRPC_DEBUG static inline void dump_sessionid(const char *fn, struct nfs4_sessionid *sessionid) { @@ -1538,7 +1626,6 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name) INIT_LIST_HEAD(&clp->cl_openowners); INIT_LIST_HEAD(&clp->cl_delegations); INIT_LIST_HEAD(&clp->cl_lru); - INIT_LIST_HEAD(&clp->cl_callbacks); INIT_LIST_HEAD(&clp->cl_revoked); #ifdef CONFIG_NFSD_PNFS 
INIT_LIST_HEAD(&clp->cl_lo_states); @@ -1634,6 +1721,7 @@ __destroy_client(struct nfs4_client *clp) while (!list_empty(&reaplist)) { dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru); list_del_init(&dp->dl_recall_lru); + put_clnt_odstate(dp->dl_clnt_odstate); nfs4_put_deleg_lease(dp->dl_stid.sc_file); nfs4_put_stid(&dp->dl_stid); } @@ -3057,6 +3145,7 @@ static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval, spin_lock_init(&fp->fi_lock); INIT_LIST_HEAD(&fp->fi_stateids); INIT_LIST_HEAD(&fp->fi_delegations); + INIT_LIST_HEAD(&fp->fi_clnt_odstate); fh_copy_shallow(&fp->fi_fhandle, fh); fp->fi_deleg_file = NULL; fp->fi_had_conflict = false; @@ -3073,6 +3162,7 @@ static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval, void nfsd4_free_slabs(void) { + kmem_cache_destroy(odstate_slab); kmem_cache_destroy(openowner_slab); kmem_cache_destroy(lockowner_slab); kmem_cache_destroy(file_slab); @@ -3103,8 +3193,14 @@ nfsd4_init_slabs(void) sizeof(struct nfs4_delegation), 0, 0, NULL); if (deleg_slab == NULL) goto out_free_stateid_slab; + odstate_slab = kmem_cache_create("nfsd4_odstate", + sizeof(struct nfs4_clnt_odstate), 0, 0, NULL); + if (odstate_slab == NULL) + goto out_free_deleg_slab; return 0; +out_free_deleg_slab: + kmem_cache_destroy(deleg_slab); out_free_stateid_slab: kmem_cache_destroy(stateid_slab); out_free_file_slab: @@ -3581,6 +3677,14 @@ alloc_stateid: open->op_stp = nfs4_alloc_open_stateid(clp); if (!open->op_stp) return nfserr_jukebox; + + if (nfsd4_has_session(cstate) && + (cstate->current_fh.fh_export->ex_flags & NFSEXP_PNFS)) { + open->op_odstate = alloc_clnt_odstate(clp); + if (!open->op_odstate) + return nfserr_jukebox; + } + return nfs_ok; } @@ -3757,7 +3861,7 @@ static __be32 nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, struct nfsd4_open *open) { __be32 status; - unsigned char old_deny_bmap; + unsigned char old_deny_bmap = stp->st_deny_bmap; if (!test_access(open->op_share_access, stp)) return nfs4_get_vfs_file(rqstp, fp, cur_fh, stp, open); @@ -3766,7 +3870,6 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *c spin_lock(&fp->fi_lock); status = nfs4_file_check_deny(fp, open->op_share_deny); if (status == nfs_ok) { - old_deny_bmap = stp->st_deny_bmap; set_deny(open->op_share_deny, stp); fp->fi_share_deny |= (open->op_share_deny & NFS4_SHARE_DENY_BOTH); @@ -3869,7 +3972,7 @@ out_fput: static struct nfs4_delegation * nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, - struct nfs4_file *fp) + struct nfs4_file *fp, struct nfs4_clnt_odstate *odstate) { int status; struct nfs4_delegation *dp; @@ -3877,7 +3980,7 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, if (fp->fi_had_conflict) return ERR_PTR(-EAGAIN); - dp = alloc_init_deleg(clp, fh); + dp = alloc_init_deleg(clp, fh, odstate); if (!dp) return ERR_PTR(-ENOMEM); @@ -3903,6 +4006,7 @@ out_unlock: spin_unlock(&state_lock); out: if (status) { + put_clnt_odstate(dp->dl_clnt_odstate); nfs4_put_stid(&dp->dl_stid); return ERR_PTR(status); } @@ -3980,7 +4084,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, default: goto out_no_deleg; } - dp = nfs4_set_delegation(clp, fh, stp->st_stid.sc_file); + dp = nfs4_set_delegation(clp, fh, stp->st_stid.sc_file, stp->st_clnt_odstate); if (IS_ERR(dp)) goto out_no_deleg; @@ -4049,7 +4153,6 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf status = nfserr_bad_stateid; if 
(nfsd4_is_deleg_cur(open)) goto out; - status = nfserr_jukebox; } /* @@ -4070,6 +4173,11 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf release_open_stateid(stp); goto out; } + + stp->st_clnt_odstate = find_or_hash_clnt_odstate(fp, + open->op_odstate); + if (stp->st_clnt_odstate == open->op_odstate) + open->op_odstate = NULL; } update_stateid(&stp->st_stid.sc_stateid); memcpy(&open->op_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); @@ -4118,7 +4226,7 @@ out: } void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate, - struct nfsd4_open *open, __be32 status) + struct nfsd4_open *open) { if (open->op_openowner) { struct nfs4_stateowner *so = &open->op_openowner->oo_owner; @@ -4130,6 +4238,8 @@ void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate, kmem_cache_free(file_slab, open->op_file); if (open->op_stp) nfs4_put_stid(&open->op_stp->st_stid); + if (open->op_odstate) + kmem_cache_free(odstate_slab, open->op_odstate); } __be32 @@ -4286,9 +4396,9 @@ laundromat_main(struct work_struct *laundry) queue_delayed_work(laundry_wq, &nn->laundromat_work, t*HZ); } -static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp) +static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stid *stp) { - if (!fh_match(&fhp->fh_handle, &stp->st_stid.sc_file->fi_fhandle)) + if (!fh_match(&fhp->fh_handle, &stp->sc_file->fi_fhandle)) return nfserr_bad_stateid; return nfs_ok; } @@ -4386,10 +4496,17 @@ static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_s return nfserr_old_stateid; } +static __be32 nfsd4_check_openowner_confirmed(struct nfs4_ol_stateid *ols) +{ + if (ols->st_stateowner->so_is_open_owner && + !(openowner(ols->st_stateowner)->oo_flags & NFS4_OO_CONFIRMED)) + return nfserr_bad_stateid; + return nfs_ok; +} + static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) { struct nfs4_stid *s; - struct nfs4_ol_stateid *ols; __be32 status = nfserr_bad_stateid; if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) @@ -4419,13 +4536,7 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) break; case NFS4_OPEN_STID: case NFS4_LOCK_STID: - ols = openlockstateid(s); - if (ols->st_stateowner->so_is_open_owner - && !(openowner(ols->st_stateowner)->oo_flags - & NFS4_OO_CONFIRMED)) - status = nfserr_bad_stateid; - else - status = nfs_ok; + status = nfsd4_check_openowner_confirmed(openlockstateid(s)); break; default: printk("unknown stateid type %x\n", s->sc_type); @@ -4462,85 +4573,130 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, return nfs_ok; } +static struct file * +nfs4_find_file(struct nfs4_stid *s, int flags) +{ + if (!s) + return NULL; + + switch (s->sc_type) { + case NFS4_DELEG_STID: + if (WARN_ON_ONCE(!s->sc_file->fi_deleg_file)) + return NULL; + return get_file(s->sc_file->fi_deleg_file); + case NFS4_OPEN_STID: + case NFS4_LOCK_STID: + if (flags & RD_STATE) + return find_readable_file(s->sc_file); + else + return find_writeable_file(s->sc_file); + break; + } + + return NULL; +} + +static __be32 +nfs4_check_olstateid(struct svc_fh *fhp, struct nfs4_ol_stateid *ols, int flags) +{ + __be32 status; + + status = nfsd4_check_openowner_confirmed(ols); + if (status) + return status; + return nfs4_check_openmode(ols, flags); +} + +static __be32 +nfs4_check_file(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfs4_stid *s, + struct file **filpp, bool *tmp_file, int flags) +{ + int acc = (flags & RD_STATE) ? 
NFSD_MAY_READ : NFSD_MAY_WRITE; + struct file *file; + __be32 status; + + file = nfs4_find_file(s, flags); + if (file) { + status = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, + acc | NFSD_MAY_OWNER_OVERRIDE); + if (status) { + fput(file); + return status; + } + + *filpp = file; + } else { + status = nfsd_open(rqstp, fhp, S_IFREG, acc, filpp); + if (status) + return status; + + if (tmp_file) + *tmp_file = true; + } + + return 0; +} + /* -* Checks for stateid operations -*/ + * Checks for stateid operations + */ __be32 -nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate, - stateid_t *stateid, int flags, struct file **filpp) +nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, stateid_t *stateid, + int flags, struct file **filpp, bool *tmp_file) { - struct nfs4_stid *s; - struct nfs4_ol_stateid *stp = NULL; - struct nfs4_delegation *dp = NULL; - struct svc_fh *current_fh = &cstate->current_fh; - struct inode *ino = current_fh->fh_dentry->d_inode; + struct svc_fh *fhp = &cstate->current_fh; + struct inode *ino = d_inode(fhp->fh_dentry); + struct net *net = SVC_NET(rqstp); struct nfsd_net *nn = net_generic(net, nfsd_net_id); - struct file *file = NULL; + struct nfs4_stid *s = NULL; __be32 status; if (filpp) *filpp = NULL; + if (tmp_file) + *tmp_file = false; if (grace_disallows_io(net, ino)) return nfserr_grace; - if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) - return check_special_stateids(net, current_fh, stateid, flags); + if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) { + status = check_special_stateids(net, fhp, stateid, flags); + goto done; + } status = nfsd4_lookup_stateid(cstate, stateid, NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, &s, nn); if (status) return status; - status = check_stateid_generation(stateid, &s->sc_stateid, nfsd4_has_session(cstate)); + status = check_stateid_generation(stateid, &s->sc_stateid, + nfsd4_has_session(cstate)); if (status) goto out; + switch (s->sc_type) { case NFS4_DELEG_STID: - dp = delegstateid(s); - status = nfs4_check_delegmode(dp, flags); - if (status) - goto out; - if (filpp) { - file = dp->dl_stid.sc_file->fi_deleg_file; - if (!file) { - WARN_ON_ONCE(1); - status = nfserr_serverfault; - goto out; - } - get_file(file); - } + status = nfs4_check_delegmode(delegstateid(s), flags); break; case NFS4_OPEN_STID: case NFS4_LOCK_STID: - stp = openlockstateid(s); - status = nfs4_check_fh(current_fh, stp); - if (status) - goto out; - if (stp->st_stateowner->so_is_open_owner - && !(openowner(stp->st_stateowner)->oo_flags & NFS4_OO_CONFIRMED)) - goto out; - status = nfs4_check_openmode(stp, flags); - if (status) - goto out; - if (filpp) { - struct nfs4_file *fp = stp->st_stid.sc_file; - - if (flags & RD_STATE) - file = find_readable_file(fp); - else - file = find_writeable_file(fp); - } + status = nfs4_check_olstateid(fhp, openlockstateid(s), flags); break; default: status = nfserr_bad_stateid; - goto out; + break; } - status = nfs_ok; - if (file) - *filpp = file; + if (status) + goto out; + status = nfs4_check_fh(fhp, s); + +done: + if (!status && filpp) + status = nfs4_check_file(rqstp, fhp, s, filpp, tmp_file, flags); out: - nfs4_put_stid(s); + if (s) + nfs4_put_stid(s); return status; } @@ -4642,7 +4798,7 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_ status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate)); if (status) return status; - return nfs4_check_fh(current_fh, stp); + return 
nfs4_check_fh(current_fh, &stp->st_stid); } /* @@ -4853,9 +5009,6 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, update_stateid(&stp->st_stid.sc_stateid); memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); - nfsd4_return_all_file_layouts(stp->st_stateowner->so_client, - stp->st_stid.sc_file); - nfsd4_close_open_stateid(stp); /* put reference from nfs4_preprocess_seqid_op */ @@ -4932,20 +5085,22 @@ nfs4_transform_lock_offset(struct file_lock *lock) lock->fl_end = OFFSET_MAX; } -static void nfsd4_fl_get_owner(struct file_lock *dst, struct file_lock *src) +static fl_owner_t +nfsd4_fl_get_owner(fl_owner_t owner) { - struct nfs4_lockowner *lo = (struct nfs4_lockowner *)src->fl_owner; - dst->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(&lo->lo_owner)); + struct nfs4_lockowner *lo = (struct nfs4_lockowner *)owner; + + nfs4_get_stateowner(&lo->lo_owner); + return owner; } -static void nfsd4_fl_put_owner(struct file_lock *fl) +static void +nfsd4_fl_put_owner(fl_owner_t owner) { - struct nfs4_lockowner *lo = (struct nfs4_lockowner *)fl->fl_owner; + struct nfs4_lockowner *lo = (struct nfs4_lockowner *)owner; - if (lo) { + if (lo) nfs4_put_stateowner(&lo->lo_owner); - fl->fl_owner = NULL; - } } static const struct lock_manager_operations nfsd_posix_mng_ops = { @@ -5169,7 +5324,7 @@ lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, struct nfs4_file *fi = ost->st_stid.sc_file; struct nfs4_openowner *oo = openowner(ost->st_stateowner); struct nfs4_client *cl = oo->oo_owner.so_client; - struct inode *inode = cstate->current_fh.fh_dentry->d_inode; + struct inode *inode = d_inode(cstate->current_fh.fh_dentry); struct nfs4_lockowner *lo; unsigned int strhashval; @@ -5394,7 +5549,7 @@ static __be32 nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct __be32 err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file); if (!err) { err = nfserrno(vfs_test_lock(file, lock)); - nfsd_close(file); + fput(file); } return err; } @@ -6487,6 +6642,7 @@ nfs4_state_shutdown_net(struct net *net) list_for_each_safe(pos, next, &reaplist) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); list_del_init(&dp->dl_recall_lru); + put_clnt_odstate(dp->dl_clnt_odstate); nfs4_put_deleg_lease(dp->dl_stid.sc_file); nfs4_put_stid(&dp->dl_stid); } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 5fb7e78169a6..75e0563c09d1 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -33,6 +33,7 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#include <linux/file.h> #include <linux/slab.h> #include <linux/namei.h> #include <linux/statfs.h> @@ -424,7 +425,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, len += 4; dummy32 = be32_to_cpup(p++); READ_BUF(dummy32); - if (dummy32 > NFSD4_MAX_SEC_LABEL_LEN) + if (dummy32 > NFS4_MAXLABELLEN) return nfserr_badlabel; len += (XDR_QUADLEN(dummy32) << 2); READMEM(buf, dummy32); @@ -2020,7 +2021,7 @@ static __be32 nfsd4_encode_path(struct xdr_stream *xdr, * dentries/path components in an array. 
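 *
 * (Editor's note) path_equal(), used below in place of the open-coded
 * pair of comparisons, is simply:
 *
 *	static inline int path_equal(const struct path *path1,
 *				     const struct path *path2)
 *	{
 *		return path1->mnt == path2->mnt &&
 *		       path1->dentry == path2->dentry;
 *	}
 *
 * so the walk still stops exactly when both the vfsmount and the
 * dentry match the export root.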
*/ for (;;) { - if (cur.dentry == root->dentry && cur.mnt == root->mnt) + if (path_equal(&cur, root)) break; if (cur.dentry == cur.mnt->mnt_root) { if (follow_up(&cur)) @@ -2142,6 +2143,7 @@ nfsd4_encode_aclname(struct xdr_stream *xdr, struct svc_rqst *rqstp, #define WORD0_ABSENT_FS_ATTRS (FATTR4_WORD0_FS_LOCATIONS | FATTR4_WORD0_FSID | \ FATTR4_WORD0_RDATTR_ERROR) #define WORD1_ABSENT_FS_ATTRS FATTR4_WORD1_MOUNTED_ON_FILEID +#define WORD2_ABSENT_FS_ATTRS 0 #ifdef CONFIG_NFSD_V4_SECURITY_LABEL static inline __be32 @@ -2170,7 +2172,7 @@ nfsd4_encode_security_label(struct xdr_stream *xdr, struct svc_rqst *rqstp, { return 0; } #endif -static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *rdattr_err) +static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *bmval2, u32 *rdattr_err) { /* As per referral draft: */ if (*bmval0 & ~WORD0_ABSENT_FS_ATTRS || @@ -2183,6 +2185,7 @@ static __be32 fattr_handle_absent_fs(u32 *bmval0, u32 *bmval1, u32 *rdattr_err) } *bmval0 &= WORD0_ABSENT_FS_ATTRS; *bmval1 &= WORD1_ABSENT_FS_ATTRS; + *bmval2 &= WORD2_ABSENT_FS_ATTRS; return 0; } @@ -2227,7 +2230,6 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, u32 rdattr_err = 0; __be32 status; int err; - int aclsupport = 0; struct nfs4_acl *acl = NULL; void *context = NULL; int contextlen; @@ -2246,8 +2248,7 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, BUG_ON(bmval2 & ~nfsd_suppattrs2(minorversion)); if (exp->ex_fslocs.migrated) { - BUG_ON(bmval[2]); - status = fattr_handle_absent_fs(&bmval0, &bmval1, &rdattr_err); + status = fattr_handle_absent_fs(&bmval0, &bmval1, &bmval2, &rdattr_err); if (status) goto out; } @@ -2274,25 +2275,21 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, goto out; fhp = tempfh; } - if (bmval0 & (FATTR4_WORD0_ACL | FATTR4_WORD0_ACLSUPPORT - | FATTR4_WORD0_SUPPORTED_ATTRS)) { + if (bmval0 & FATTR4_WORD0_ACL) { err = nfsd4_get_nfs4_acl(rqstp, dentry, &acl); - aclsupport = (err == 0); - if (bmval0 & FATTR4_WORD0_ACL) { - if (err == -EOPNOTSUPP) - bmval0 &= ~FATTR4_WORD0_ACL; - else if (err == -EINVAL) { - status = nfserr_attrnotsupp; - goto out; - } else if (err != 0) - goto out_nfserr; - } + if (err == -EOPNOTSUPP) + bmval0 &= ~FATTR4_WORD0_ACL; + else if (err == -EINVAL) { + status = nfserr_attrnotsupp; + goto out; + } else if (err != 0) + goto out_nfserr; } #ifdef CONFIG_NFSD_V4_SECURITY_LABEL - if ((bmval[2] & FATTR4_WORD2_SECURITY_LABEL) || - bmval[0] & FATTR4_WORD0_SUPPORTED_ATTRS) { - err = security_inode_getsecctx(dentry->d_inode, + if ((bmval2 & FATTR4_WORD2_SECURITY_LABEL) || + bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) { + err = security_inode_getsecctx(d_inode(dentry), &context, &contextlen); contextsupport = (err == 0); if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) { @@ -2338,7 +2335,7 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, u32 word1 = nfsd_suppattrs1(minorversion); u32 word2 = nfsd_suppattrs2(minorversion); - if (!aclsupport) + if (!IS_POSIXACL(dentry->d_inode)) word0 &= ~FATTR4_WORD0_ACL; if (!contextsupport) word2 &= ~FATTR4_WORD2_SECURITY_LABEL; @@ -2384,7 +2381,7 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, p = xdr_reserve_space(xdr, 8); if (!p) goto out_resource; - p = encode_change(p, &stat, dentry->d_inode); + p = encode_change(p, &stat, d_inode(dentry)); } if (bmval0 & FATTR4_WORD0_SIZE) { p = xdr_reserve_space(xdr, 8); @@ -2486,7 +2483,7 @@ out_acl: p = xdr_reserve_space(xdr, 4); if (!p) goto out_resource; - *p++ = cpu_to_be32(aclsupport ? 
+ *p++ = cpu_to_be32(IS_POSIXACL(dentry->d_inode) ? ACL4_SUPPORT_ALLOW_ACL|ACL4_SUPPORT_DENY_ACL : 0); } if (bmval0 & FATTR4_WORD0_CANSETTIME) { @@ -2807,7 +2804,7 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd, dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen); if (IS_ERR(dentry)) return nfserrno(PTR_ERR(dentry)); - if (!dentry->d_inode) { + if (d_really_is_negative(dentry)) { /* * nfsd_buffered_readdir drops the i_mutex between * readdir and calling this callback, leaving a window @@ -3324,7 +3321,7 @@ static __be32 nfsd4_encode_splice_read( } eof = (read->rd_offset + maxcount >= - read->rd_fhp->fh_dentry->d_inode->i_size); + d_inode(read->rd_fhp->fh_dentry)->i_size); *(p++) = htonl(eof); *(p++) = htonl(maxcount); @@ -3401,7 +3398,7 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp, xdr_truncate_encode(xdr, starting_len + 8 + ((maxcount+3)&~3)); eof = (read->rd_offset + maxcount >= - read->rd_fhp->fh_dentry->d_inode->i_size); + d_inode(read->rd_fhp->fh_dentry)->i_size); tmp = htonl(eof); write_bytes_to_xdr_buf(xdr->buf, starting_len , &tmp, 4); @@ -3423,47 +3420,50 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, struct xdr_stream *xdr = &resp->xdr; struct file *file = read->rd_filp; int starting_len = xdr->buf->len; - struct raparms *ra; + struct raparms *ra = NULL; __be32 *p; - __be32 err; if (nfserr) - return nfserr; + goto out; p = xdr_reserve_space(xdr, 8); /* eof flag and byte count */ if (!p) { WARN_ON_ONCE(test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)); - return nfserr_resource; + nfserr = nfserr_resource; + goto out; } - if (resp->xdr.buf->page_len && test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)) { + if (resp->xdr.buf->page_len && + test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)) { WARN_ON_ONCE(1); - return nfserr_resource; + nfserr = nfserr_resource; + goto out; } xdr_commit_encode(xdr); maxcount = svc_max_payload(resp->rqstp); - maxcount = min_t(unsigned long, maxcount, (xdr->buf->buflen - xdr->buf->len)); + maxcount = min_t(unsigned long, maxcount, + (xdr->buf->buflen - xdr->buf->len)); maxcount = min_t(unsigned long, maxcount, read->rd_length); - if (!read->rd_filp) { - err = nfsd_get_tmp_read_open(resp->rqstp, read->rd_fhp, - &file, &ra); - if (err) - goto err_truncate; - } + if (read->rd_tmp_file) + ra = nfsd_init_raparms(file); - if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)) - err = nfsd4_encode_splice_read(resp, read, file, maxcount); + if (file->f_op->splice_read && + test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)) + nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount); else - err = nfsd4_encode_readv(resp, read, file, maxcount); + nfserr = nfsd4_encode_readv(resp, read, file, maxcount); - if (!read->rd_filp) - nfsd_put_tmp_read_open(file, ra); + if (ra) + nfsd_put_raparams(file, ra); -err_truncate: - if (err) + if (nfserr) xdr_truncate_encode(xdr, starting_len); - return err; + +out: + if (file) + fput(file); + return nfserr; } static __be32 diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index aa47d75ddb26..9690cb4dd588 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1250,15 +1250,15 @@ static int __init init_nfsd(void) int retval; printk(KERN_INFO "Installing knfsd (copyright (C) 1996 okir@monad.swb.de).\n"); - retval = register_cld_notifier(); - if (retval) - return retval; retval = register_pernet_subsys(&nfsd_net_ops); if (retval < 0) - goto out_unregister_notifier; - retval = nfsd4_init_slabs(); + return retval; + retval = 
register_cld_notifier(); if (retval) goto out_unregister_pernet; + retval = nfsd4_init_slabs(); + if (retval) + goto out_unregister_notifier; retval = nfsd4_init_pnfs(); if (retval) goto out_free_slabs; @@ -1290,10 +1290,10 @@ out_exit_pnfs: nfsd4_exit_pnfs(); out_free_slabs: nfsd4_free_slabs(); -out_unregister_pernet: - unregister_pernet_subsys(&nfsd_net_ops); out_unregister_notifier: unregister_cld_notifier(); +out_unregister_pernet: + unregister_pernet_subsys(&nfsd_net_ops); return retval; } @@ -1308,8 +1308,8 @@ static void __exit exit_nfsd(void) nfsd4_exit_pnfs(); nfsd_fault_inject_cleanup(); unregister_filesystem(&nfsd_fs_type); - unregister_pernet_subsys(&nfsd_net_ops); unregister_cld_notifier(); + unregister_pernet_subsys(&nfsd_net_ops); } MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>"); diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 565c4da1a9eb..cf980523898b 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -24,7 +24,7 @@ #include "export.h" #undef ifdebug -#ifdef NFSD_DEBUG +#ifdef CONFIG_SUNRPC_DEBUG # define ifdebug(flag) if (nfsd_debug & NFSDDBG_##flag) #else # define ifdebug(flag) if (0) diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index e9fa966fc37f..350041a40fe5 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -38,7 +38,7 @@ static int nfsd_acceptable(void *expv, struct dentry *dentry) /* make sure parents give x permission to user */ int err; parent = dget_parent(tdentry); - err = inode_permission(parent->d_inode, MAY_EXEC); + err = inode_permission(d_inode(parent), MAY_EXEC); if (err < 0) { dput(parent); break; @@ -340,7 +340,7 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access) if (error) goto out; - error = nfsd_mode_check(rqstp, dentry->d_inode->i_mode, type); + error = nfsd_mode_check(rqstp, d_inode(dentry)->i_mode, type); if (error) goto out; @@ -412,8 +412,8 @@ static inline void _fh_update_old(struct dentry *dentry, struct svc_export *exp, struct knfsd_fh *fh) { - fh->ofh_ino = ino_t_to_u32(dentry->d_inode->i_ino); - fh->ofh_generation = dentry->d_inode->i_generation; + fh->ofh_ino = ino_t_to_u32(d_inode(dentry)->i_ino); + fh->ofh_generation = d_inode(dentry)->i_generation; if (d_is_dir(dentry) || (exp->ex_flags & NFSEXP_NOSUBTREECHECK)) fh->ofh_dirino = 0; @@ -426,7 +426,7 @@ static bool is_root_export(struct svc_export *exp) static struct super_block *exp_sb(struct svc_export *exp) { - return exp->ex_path.dentry->d_inode->i_sb; + return d_inode(exp->ex_path.dentry)->i_sb; } static bool fsid_type_ok_for_exp(u8 fsid_type, struct svc_export *exp) @@ -520,12 +520,12 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, * */ - struct inode * inode = dentry->d_inode; + struct inode * inode = d_inode(dentry); dev_t ex_dev = exp_sb(exp)->s_dev; dprintk("nfsd: fh_compose(exp %02x:%02x/%ld %pd2, ino=%ld)\n", MAJOR(ex_dev), MINOR(ex_dev), - (long) exp->ex_path.dentry->d_inode->i_ino, + (long) d_inode(exp->ex_path.dentry)->i_ino, dentry, (inode ? 
inode->i_ino : 0)); @@ -558,7 +558,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, fhp->fh_handle.ofh_dev = old_encode_dev(ex_dev); fhp->fh_handle.ofh_xdev = fhp->fh_handle.ofh_dev; fhp->fh_handle.ofh_xino = - ino_t_to_u32(exp->ex_path.dentry->d_inode->i_ino); + ino_t_to_u32(d_inode(exp->ex_path.dentry)->i_ino); fhp->fh_handle.ofh_dirino = ino_t_to_u32(parent_ino(dentry)); if (inode) _fh_update_old(dentry, exp, &fhp->fh_handle); @@ -570,7 +570,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, mk_fsid(fhp->fh_handle.fh_fsid_type, fhp->fh_handle.fh_fsid, ex_dev, - exp->ex_path.dentry->d_inode->i_ino, + d_inode(exp->ex_path.dentry)->i_ino, exp->ex_fsid, exp->ex_uuid); if (inode) @@ -597,7 +597,7 @@ fh_update(struct svc_fh *fhp) goto out_bad; dentry = fhp->fh_dentry; - if (!dentry->d_inode) + if (d_really_is_negative(dentry)) goto out_negative; if (fhp->fh_handle.fh_version != 1) { _fh_update_old(dentry, fhp->fh_export, &fhp->fh_handle); diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index f22920442172..1e90dad4926b 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -225,7 +225,7 @@ fill_pre_wcc(struct svc_fh *fhp) { struct inode *inode; - inode = fhp->fh_dentry->d_inode; + inode = d_inode(fhp->fh_dentry); if (!fhp->fh_pre_saved) { fhp->fh_pre_mtime = inode->i_mtime; fhp->fh_pre_ctime = inode->i_ctime; @@ -264,7 +264,7 @@ fh_lock_nested(struct svc_fh *fhp, unsigned int subclass) return; } - inode = dentry->d_inode; + inode = d_inode(dentry); mutex_lock_nested(&inode->i_mutex, subclass); fill_pre_wcc(fhp); fhp->fh_locked = 1; @@ -284,7 +284,7 @@ fh_unlock(struct svc_fh *fhp) { if (fhp->fh_locked) { fill_post_wcc(fhp); - mutex_unlock(&fhp->fh_dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(fhp->fh_dentry)->i_mutex); fhp->fh_locked = 0; } } diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index b8680738f588..4cd78ef4c95c 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -59,13 +59,61 @@ static __be32 nfsd_proc_setattr(struct svc_rqst *rqstp, struct nfsd_sattrargs *argp, struct nfsd_attrstat *resp) { + struct iattr *iap = &argp->attrs; + struct svc_fh *fhp; __be32 nfserr; + dprintk("nfsd: SETATTR %s, valid=%x, size=%ld\n", SVCFH_fmt(&argp->fh), argp->attrs.ia_valid, (long) argp->attrs.ia_size); - fh_copy(&resp->fh, &argp->fh); - nfserr = nfsd_setattr(rqstp, &resp->fh, &argp->attrs,0, (time_t)0); + fhp = fh_copy(&resp->fh, &argp->fh); + + /* + * NFSv2 does not differentiate between "set-[ac]time-to-now" + * which only requires access, and "set-[ac]time-to-X" which + * requires ownership. + * So if it looks like it might be "set both to the same time which + * is close to now", and if inode_change_ok fails, then we + * convert to "set to now" instead of "set to explicit time" + * + * We only call inode_change_ok as the last test as technically + * it is not an interface that we should be using. + */ +#define BOTH_TIME_SET (ATTR_ATIME_SET | ATTR_MTIME_SET) +#define MAX_TOUCH_TIME_ERROR (30*60) + if ((iap->ia_valid & BOTH_TIME_SET) == BOTH_TIME_SET && + iap->ia_mtime.tv_sec == iap->ia_atime.tv_sec) { + /* + * Looks probable. + * + * Now just make sure time is in the right ballpark. + * Solaris, at least, doesn't seem to care what the time + * request is. We require it be within 30 minutes of now. 
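+ *
+ * (Editor's worked example) A non-owner "touch" at 12:00:00 might
+ * send atime == mtime == 12:00:02: delta is 2 seconds, well inside
+ * 30*60, and inode_change_ok() rejects the explicit times because
+ * the caller is not the owner, so both ATTR_*TIME_SET bits are
+ * dropped and notify_change() stamps "now" instead, which needs
+ * only write access.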
+ */ + time_t delta = iap->ia_atime.tv_sec - get_seconds(); + struct inode *inode; + + nfserr = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP); + if (nfserr) + goto done; + inode = d_inode(fhp->fh_dentry); + + if (delta < 0) + delta = -delta; + if (delta < MAX_TOUCH_TIME_ERROR && + inode_change_ok(inode, iap) != 0) { + /* + * Turn off ATTR_[AM]TIME_SET but leave ATTR_[AM]TIME. + * This will cause notify_change to set these times + * to "now" + */ + iap->ia_valid &= ~BOTH_TIME_SET; + } + } + + nfserr = nfsd_setattr(rqstp, fhp, iap, 0, (time_t)0); +done: return nfsd_return_attrs(nfserr, resp); } @@ -223,7 +271,7 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp, } fh_init(newfhp, NFS_FHSIZE); nfserr = fh_compose(newfhp, dirfhp->fh_export, dchild, dirfhp); - if (!nfserr && !dchild->d_inode) + if (!nfserr && d_really_is_negative(dchild)) nfserr = nfserr_noent; dput(dchild); if (nfserr) { @@ -241,7 +289,7 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp, } } - inode = newfhp->fh_dentry->d_inode; + inode = d_inode(newfhp->fh_dentry); /* Unfudge the mode bits */ if (attr->ia_valid & ATTR_MODE) { diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 412d7061f9e5..79d964aa8079 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -187,7 +187,7 @@ encode_fattr(struct svc_rqst *rqstp, __be32 *p, struct svc_fh *fhp, *p++ = htonl((u32) stat->ino); *p++ = htonl((u32) stat->atime.tv_sec); *p++ = htonl(stat->atime.tv_nsec ? stat->atime.tv_nsec / 1000 : 0); - lease_get_mtime(dentry->d_inode, &time); + lease_get_mtime(d_inode(dentry), &time); *p++ = htonl((u32) time.tv_sec); *p++ = htonl(time.tv_nsec ? time.tv_nsec / 1000 : 0); *p++ = htonl((u32) stat->ctime.tv_sec); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 4f3bfeb11766..4874ce515fc1 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -63,12 +63,13 @@ typedef struct { struct nfsd4_callback { struct nfs4_client *cb_clp; - struct list_head cb_per_client; u32 cb_minorversion; struct rpc_message cb_msg; struct nfsd4_callback_ops *cb_ops; struct work_struct cb_work; - bool cb_done; + int cb_status; + bool cb_update_seq_nr; + bool cb_need_restart; }; struct nfsd4_callback_ops { @@ -126,6 +127,7 @@ struct nfs4_delegation { struct list_head dl_perfile; struct list_head dl_perclnt; struct list_head dl_recall_lru; /* delegation recalled */ + struct nfs4_clnt_odstate *dl_clnt_odstate; u32 dl_type; time_t dl_time; /* For recall: */ @@ -332,7 +334,6 @@ struct nfs4_client { int cl_cb_state; struct nfsd4_callback cl_cb_null; struct nfsd4_session *cl_cb_session; - struct list_head cl_callbacks; /* list of in-progress callbacks */ /* for all client information that callback code might need: */ spinlock_t cl_lock; @@ -465,6 +466,17 @@ static inline struct nfs4_lockowner * lockowner(struct nfs4_stateowner *so) } /* + * Per-client state indicating no. of opens and outstanding delegations + * on a file from a particular client.'od' stands for 'open & delegation' + */ +struct nfs4_clnt_odstate { + struct nfs4_client *co_client; + struct nfs4_file *co_file; + struct list_head co_perfile; + atomic_t co_odcount; +}; + +/* * nfs4_file: a file opened by some number of (open) nfs4_stateowners. * * These objects are global. 
nfsd keeps one instance of a nfs4_file per @@ -485,6 +497,7 @@ struct nfs4_file { struct list_head fi_delegations; struct rcu_head fi_rcu; }; + struct list_head fi_clnt_odstate; /* One each for O_RDONLY, O_WRONLY, O_RDWR: */ struct file * fi_fds[3]; /* @@ -526,6 +539,7 @@ struct nfs4_ol_stateid { struct list_head st_perstateowner; struct list_head st_locks; struct nfs4_stateowner * st_stateowner; + struct nfs4_clnt_odstate * st_clnt_odstate; unsigned char st_access_bmap; unsigned char st_deny_bmap; struct nfs4_ol_stateid * st_openstp; @@ -569,9 +583,9 @@ enum nfsd4_cb_op { struct nfsd4_compound_state; struct nfsd_net; -extern __be32 nfs4_preprocess_stateid_op(struct net *net, - struct nfsd4_compound_state *cstate, - stateid_t *stateid, int flags, struct file **filp); +extern __be32 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, stateid_t *stateid, + int flags, struct file **filp, bool *tmp_file); __be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, stateid_t *stateid, unsigned char typemask, struct nfs4_stid **s, struct nfsd_net *nn); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 368526582429..b5e077a6e7d4 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -174,7 +174,7 @@ int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp) return 1; if (!(exp->ex_flags & NFSEXP_V4ROOT)) return 0; - return dentry->d_inode != NULL; + return d_inode(dentry) != NULL; } __be32 @@ -270,7 +270,7 @@ nfsd_lookup(struct svc_rqst *rqstp, struct svc_fh *fhp, const char *name, * dentry may be negative, it may need to be updated. */ err = fh_compose(resfh, exp, dentry, fhp); - if (!err && !dentry->d_inode) + if (!err && d_really_is_negative(dentry)) err = nfserr_noent; out: dput(dentry); @@ -284,7 +284,7 @@ out: static int commit_metadata(struct svc_fh *fhp) { - struct inode *inode = fhp->fh_dentry->d_inode; + struct inode *inode = d_inode(fhp->fh_dentry); const struct export_operations *export_ops = inode->i_sb->s_export_op; if (!EX_ISSYNC(fhp->fh_export)) @@ -302,42 +302,6 @@ commit_metadata(struct svc_fh *fhp) static void nfsd_sanitize_attrs(struct inode *inode, struct iattr *iap) { - /* - * NFSv2 does not differentiate between "set-[ac]time-to-now" - * which only requires access, and "set-[ac]time-to-X" which - * requires ownership. - * So if it looks like it might be "set both to the same time which - * is close to now", and if inode_change_ok fails, then we - * convert to "set to now" instead of "set to explicit time" - * - * We only call inode_change_ok as the last test as technically - * it is not an interface that we should be using. - */ -#define BOTH_TIME_SET (ATTR_ATIME_SET | ATTR_MTIME_SET) -#define MAX_TOUCH_TIME_ERROR (30*60) - if ((iap->ia_valid & BOTH_TIME_SET) == BOTH_TIME_SET && - iap->ia_mtime.tv_sec == iap->ia_atime.tv_sec) { - /* - * Looks probable. - * - * Now just make sure time is in the right ballpark. - * Solaris, at least, doesn't seem to care what the time - * request is. We require it be within 30 minutes of now. - */ - time_t delta = iap->ia_atime.tv_sec - get_seconds(); - if (delta < 0) - delta = -delta; - if (delta < MAX_TOUCH_TIME_ERROR && - inode_change_ok(inode, iap) != 0) { - /* - * Turn off ATTR_[AM]TIME_SET but leave ATTR_[AM]TIME. 
- * This will cause notify_change to set these times - * to "now" - */ - iap->ia_valid &= ~BOTH_TIME_SET; - } - } - /* sanitize the mode change */ if (iap->ia_valid & ATTR_MODE) { iap->ia_mode &= S_IALLUGO; @@ -364,7 +328,7 @@ static __be32 nfsd_get_write_access(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap) { - struct inode *inode = fhp->fh_dentry->d_inode; + struct inode *inode = d_inode(fhp->fh_dentry); int host_err; if (iap->ia_size < inode->i_size) { @@ -426,7 +390,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, } dentry = fhp->fh_dentry; - inode = dentry->d_inode; + inode = d_inode(dentry); /* Ignore any mode updates on symlinks */ if (S_ISLNK(inode->i_mode)) @@ -495,7 +459,7 @@ out: */ int nfsd4_is_junction(struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); if (inode == NULL) return 0; @@ -521,9 +485,9 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp, dentry = fhp->fh_dentry; - mutex_lock(&dentry->d_inode->i_mutex); + mutex_lock(&d_inode(dentry)->i_mutex); host_error = security_inode_setsecctx(dentry, label->data, label->len); - mutex_unlock(&dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(dentry)->i_mutex); return nfserrno(host_error); } #else @@ -538,16 +502,11 @@ __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, loff_t len, int flags) { - __be32 err; int error; if (!S_ISREG(file_inode(file)->i_mode)) return nfserr_inval; - err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, NFSD_MAY_WRITE); - if (err) - return err; - error = vfs_fallocate(file, flags, offset, len); if (!error) error = commit_metadata(fhp); @@ -706,7 +665,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, path.mnt = fhp->fh_export->ex_path.mnt; path.dentry = fhp->fh_dentry; - inode = path.dentry->d_inode; + inode = d_inode(path.dentry); /* Disallow write access to files with the append-only bit set * or any access when mandatory locking enabled @@ -744,7 +703,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, host_err = ima_file_check(file, may_flags, 0); if (host_err) { - nfsd_close(file); + fput(file); goto out_nfserr; } @@ -761,23 +720,12 @@ out: return err; } -/* - * Close a file. - */ -void -nfsd_close(struct file *filp) -{ - fput(filp); -} - -/* - * Obtain the readahead parameters for the file - * specified by (dev, ino). 
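 *
 * (Editor's sketch; annotation, not patch content) The replacement
 * API pairs one init with one put around the I/O, caching readahead
 * state keyed by (s_dev, i_ino), as nfsd_read() does further down:
 *
 *	ra = nfsd_init_raparms(file);	/* may restore file->f_ra */
 *	err = nfsd_vfs_read(rqstp, file, offset, vec, vlen, count);
 *	if (ra)
 *		nfsd_put_raparams(file, ra);	/* writes f_ra back */
 *	fput(file);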
- */ - -static inline struct raparms * -nfsd_get_raparms(dev_t dev, ino_t ino) +struct raparms * +nfsd_init_raparms(struct file *file) { + struct inode *inode = file_inode(file); + dev_t dev = inode->i_sb->s_dev; + ino_t ino = inode->i_ino; struct raparms *ra, **rap, **frap = NULL; int depth = 0; unsigned int hash; @@ -814,9 +762,23 @@ found: ra->p_count++; nfsdstats.ra_depth[depth*10/nfsdstats.ra_size]++; spin_unlock(&rab->pb_lock); + + if (ra->p_set) + file->f_ra = ra->p_ra; return ra; } +void nfsd_put_raparams(struct file *file, struct raparms *ra) +{ + struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex]; + + spin_lock(&rab->pb_lock); + ra->p_ra = file->f_ra; + ra->p_set = 1; + ra->p_count--; + spin_unlock(&rab->pb_lock); +} + /* * Grab and keep cached pages associated with a file in the svc_rqst * so that they can be passed to the network sendmsg/sendpage routines @@ -945,7 +907,7 @@ static int wait_for_concurrent_writes(struct file *file) return err; } -static __be32 +__be32 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, struct kvec *vec, int vlen, unsigned long *cnt, int *stablep) @@ -1009,40 +971,6 @@ out_nfserr: return err; } -__be32 nfsd_get_tmp_read_open(struct svc_rqst *rqstp, struct svc_fh *fhp, - struct file **file, struct raparms **ra) -{ - struct inode *inode; - __be32 err; - - err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, file); - if (err) - return err; - - inode = file_inode(*file); - - /* Get readahead parameters */ - *ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino); - - if (*ra && (*ra)->p_set) - (*file)->f_ra = (*ra)->p_ra; - return nfs_ok; -} - -void nfsd_put_tmp_read_open(struct file *file, struct raparms *ra) -{ - /* Write back readahead params */ - if (ra) { - struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex]; - spin_lock(&rab->pb_lock); - ra->p_ra = file->f_ra; - ra->p_set = 1; - ra->p_count--; - spin_unlock(&rab->pb_lock); - } - nfsd_close(file); -} - /* * Read data from a file. count must contain the requested read count * on entry. On return, *count contains the number of bytes actually read. @@ -1055,13 +983,15 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct raparms *ra; __be32 err; - err = nfsd_get_tmp_read_open(rqstp, fhp, &file, &ra); + err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file); if (err) return err; + ra = nfsd_init_raparms(file); err = nfsd_vfs_read(rqstp, file, offset, vec, vlen, count); - - nfsd_put_tmp_read_open(file, ra); + if (ra) + nfsd_put_raparams(file, ra); + fput(file); return err; } @@ -1093,7 +1023,7 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, if (cnt) err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt, stablep); - nfsd_close(file); + fput(file); } out: return err; @@ -1138,7 +1068,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, err = nfserr_notsupp; } - nfsd_close(file); + fput(file); out: return err; } @@ -1211,7 +1141,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, goto out; dentry = fhp->fh_dentry; - dirp = dentry->d_inode; + dirp = d_inode(dentry); err = nfserr_notdir; if (!dirp->i_op->lookup) @@ -1250,7 +1180,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, * Make sure the child dentry is still negative ... 
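 *
 * (Editor's note) d_really_is_positive()/d_really_is_negative(), used
 * throughout this series, are the direct inode checks:
 *
 *	static inline bool d_really_is_negative(const struct dentry *d)
 *	{
 *		return d->d_inode == NULL;
 *	}
 *
 * as opposed to d_is_positive()/d_is_negative(), which consult the
 * cached dentry type flags and can differ on union/overlay mounts;
 * code sitting at the filesystem layer, like nfsd, wants the raw test.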
*/ err = nfserr_exist; - if (dchild->d_inode) { + if (d_really_is_positive(dchild)) { dprintk("nfsd_create: dentry %pd/%pd not negative!\n", dentry, dchild); goto out; @@ -1353,7 +1283,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, goto out; dentry = fhp->fh_dentry; - dirp = dentry->d_inode; + dirp = d_inode(dentry); /* Get all the sanity checks out of the way before * we lock the parent. */ @@ -1376,7 +1306,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, goto out_nfserr; /* If file doesn't exist, check for permissions to create one */ - if (!dchild->d_inode) { + if (d_really_is_negative(dchild)) { err = fh_verify(rqstp, fhp, S_IFDIR, NFSD_MAY_CREATE); if (err) goto out; @@ -1397,7 +1327,7 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, v_atime = verifier[1]&0x7fffffff; } - if (dchild->d_inode) { + if (d_really_is_positive(dchild)) { err = 0; switch (createmode) { @@ -1420,17 +1350,17 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, } break; case NFS3_CREATE_EXCLUSIVE: - if ( dchild->d_inode->i_mtime.tv_sec == v_mtime - && dchild->d_inode->i_atime.tv_sec == v_atime - && dchild->d_inode->i_size == 0 ) { + if ( d_inode(dchild)->i_mtime.tv_sec == v_mtime + && d_inode(dchild)->i_atime.tv_sec == v_atime + && d_inode(dchild)->i_size == 0 ) { if (created) *created = 1; break; } case NFS4_CREATE_EXCLUSIVE4_1: - if ( dchild->d_inode->i_mtime.tv_sec == v_mtime - && dchild->d_inode->i_atime.tv_sec == v_atime - && dchild->d_inode->i_size == 0 ) { + if ( d_inode(dchild)->i_mtime.tv_sec == v_mtime + && d_inode(dchild)->i_atime.tv_sec == v_atime + && d_inode(dchild)->i_size == 0 ) { if (created) *created = 1; goto set_attr; @@ -1513,7 +1443,7 @@ nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp) path.mnt = fhp->fh_export->ex_path.mnt; path.dentry = fhp->fh_dentry; - inode = path.dentry->d_inode; + inode = d_inode(path.dentry); err = nfserr_inval; if (!inode->i_op->readlink) @@ -1576,7 +1506,7 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, if (IS_ERR(dnew)) goto out_nfserr; - host_err = vfs_symlink(dentry->d_inode, dnew, path); + host_err = vfs_symlink(d_inode(dentry), dnew, path); err = nfserrno(host_err); if (!err) err = nfserrno(commit_metadata(fhp)); @@ -1632,7 +1562,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, fh_lock_nested(ffhp, I_MUTEX_PARENT); ddir = ffhp->fh_dentry; - dirp = ddir->d_inode; + dirp = d_inode(ddir); dnew = lookup_one_len(name, ddir, len); host_err = PTR_ERR(dnew); @@ -1642,7 +1572,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, dold = tfhp->fh_dentry; err = nfserr_noent; - if (!dold->d_inode) + if (d_really_is_negative(dold)) goto out_dput; host_err = vfs_link(dold, dirp, dnew, NULL); if (!host_err) { @@ -1689,10 +1619,10 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, goto out; fdentry = ffhp->fh_dentry; - fdir = fdentry->d_inode; + fdir = d_inode(fdentry); tdentry = tfhp->fh_dentry; - tdir = tdentry->d_inode; + tdir = d_inode(tdentry); err = nfserr_perm; if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen)) @@ -1717,7 +1647,7 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, goto out_nfserr; host_err = -ENOENT; - if (!odentry->d_inode) + if (d_really_is_negative(odentry)) goto out_dput_old; host_err = -EINVAL; if (odentry == trap) @@ -1790,21 +1720,21 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, fh_lock_nested(fhp, I_MUTEX_PARENT); dentry = 
fhp->fh_dentry; - dirp = dentry->d_inode; + dirp = d_inode(dentry); rdentry = lookup_one_len(fname, dentry, flen); host_err = PTR_ERR(rdentry); if (IS_ERR(rdentry)) goto out_nfserr; - if (!rdentry->d_inode) { + if (d_really_is_negative(rdentry)) { dput(rdentry); err = nfserr_noent; goto out; } if (!type) - type = rdentry->d_inode->i_mode & S_IFMT; + type = d_inode(rdentry)->i_mode & S_IFMT; if (type != S_IFDIR) host_err = vfs_unlink(dirp, rdentry, NULL); @@ -1977,7 +1907,7 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp, if (err == nfserr_eof || err == nfserr_toosmall) err = nfs_ok; /* can still be found in ->err */ out_close: - nfsd_close(file); + fput(file); out: return err; } @@ -2015,7 +1945,7 @@ __be32 nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, struct dentry *dentry, int acc) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int err; if ((acc & NFSD_MAY_MASK) == NFSD_MAY_NOP) diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 2050cb016998..5be875e3e638 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -71,11 +71,7 @@ __be32 nfsd_commit(struct svc_rqst *, struct svc_fh *, #endif /* CONFIG_NFSD_V3 */ __be32 nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t, int, struct file **); -void nfsd_close(struct file *); struct raparms; -__be32 nfsd_get_tmp_read_open(struct svc_rqst *, struct svc_fh *, - struct file **, struct raparms **); -void nfsd_put_tmp_read_open(struct file *, struct raparms *); __be32 nfsd_splice_read(struct svc_rqst *, struct file *, loff_t, unsigned long *); __be32 nfsd_readv(struct file *, loff_t, struct kvec *, int, @@ -84,6 +80,10 @@ __be32 nfsd_read(struct svc_rqst *, struct svc_fh *, loff_t, struct kvec *, int, unsigned long *); __be32 nfsd_write(struct svc_rqst *, struct svc_fh *,struct file *, loff_t, struct kvec *,int, unsigned long *, int *); +__be32 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct file *file, loff_t offset, + struct kvec *vec, int vlen, unsigned long *cnt, + int *stablep); __be32 nfsd_readlink(struct svc_rqst *, struct svc_fh *, char *, int *); __be32 nfsd_symlink(struct svc_rqst *, struct svc_fh *, @@ -104,6 +104,9 @@ __be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *, __be32 nfsd_permission(struct svc_rqst *, struct svc_export *, struct dentry *, int); +struct raparms *nfsd_init_raparms(struct file *file); +void nfsd_put_raparams(struct file *file, struct raparms *ra); + static inline int fh_want_write(struct svc_fh *fh) { int ret = mnt_want_write(fh->fh_export->ex_path.mnt); diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 0bda93e58e1b..9f991007a578 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -40,7 +40,6 @@ #include "state.h" #include "nfsd.h" -#define NFSD4_MAX_SEC_LABEL_LEN 2048 #define NFSD4_MAX_TAGLEN 128 #define XDR_LEN(n) (((n) + 3) & ~3) @@ -248,6 +247,7 @@ struct nfsd4_open { struct nfs4_openowner *op_openowner; /* used during processing */ struct nfs4_file *op_file; /* used during processing */ struct nfs4_ol_stateid *op_stp; /* used during processing */ + struct nfs4_clnt_odstate *op_odstate; /* used during processing */ struct nfs4_acl *op_acl; struct xdr_netobj op_label; }; @@ -273,6 +273,7 @@ struct nfsd4_read { u32 rd_length; /* request */ int rd_vlen; struct file *rd_filp; + bool rd_tmp_file; struct svc_rqst *rd_rqstp; /* response */ struct svc_fh * rd_fhp; /* response */ @@ -632,7 +633,7 @@ set_change_info(struct nfsd4_change_info *cinfo, struct svc_fh *fhp) { BUG_ON(!fhp->fh_pre_saved); cinfo->atomic = 
fhp->fh_post_saved; - cinfo->change_supported = IS_I_VERSION(fhp->fh_dentry->d_inode); + cinfo->change_supported = IS_I_VERSION(d_inode(fhp->fh_dentry)); cinfo->before_change = fhp->fh_pre_change; cinfo->after_change = fhp->fh_post_change; @@ -683,7 +684,7 @@ extern __be32 nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open); extern void nfsd4_cstate_clear_replay(struct nfsd4_compound_state *cstate); extern void nfsd4_cleanup_open_state(struct nfsd4_compound_state *cstate, - struct nfsd4_open *open, __be32 status); + struct nfsd4_open *open); extern __be32 nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *, struct nfsd4_open_confirm *oc); extern __be32 nfsd4_close(struct svc_rqst *rqstp, diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c index 741fd02e0444..8df0f3b7839b 100644 --- a/fs/nilfs2/alloc.c +++ b/fs/nilfs2/alloc.c @@ -405,13 +405,14 @@ nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode, static int nilfs_palloc_count_desc_blocks(struct inode *inode, unsigned long *desc_blocks) { - unsigned long blknum; + __u64 blknum; int ret; ret = nilfs_bmap_last_key(NILFS_I(inode)->i_bmap, &blknum); if (likely(!ret)) *desc_blocks = DIV_ROUND_UP( - blknum, NILFS_MDT(inode)->mi_blocks_per_desc_block); + (unsigned long)blknum, + NILFS_MDT(inode)->mi_blocks_per_desc_block); return ret; } diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c index aadbd0b5e3e8..27f75bcbeb30 100644 --- a/fs/nilfs2/bmap.c +++ b/fs/nilfs2/bmap.c @@ -152,9 +152,7 @@ static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) * * %-EEXIST - A record associated with @key already exist. */ -int nilfs_bmap_insert(struct nilfs_bmap *bmap, - unsigned long key, - unsigned long rec) +int nilfs_bmap_insert(struct nilfs_bmap *bmap, __u64 key, unsigned long rec) { int ret; @@ -191,19 +189,47 @@ static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, __u64 key) return bmap->b_ops->bop_delete(bmap, key); } -int nilfs_bmap_last_key(struct nilfs_bmap *bmap, unsigned long *key) +/** + * nilfs_bmap_seek_key - seek a valid entry and return its key + * @bmap: bmap struct + * @start: start key number + * @keyp: place to store valid key + * + * Description: nilfs_bmap_seek_key() seeks a valid key on @bmap + * starting from @start, and stores it to @keyp if found. + * + * Return Value: On success, 0 is returned. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - No valid entry was found + */ +int nilfs_bmap_seek_key(struct nilfs_bmap *bmap, __u64 start, __u64 *keyp) { - __u64 lastkey; int ret; down_read(&bmap->b_sem); - ret = bmap->b_ops->bop_last_key(bmap, &lastkey); + ret = bmap->b_ops->bop_seek_key(bmap, start, keyp); + up_read(&bmap->b_sem); + + if (ret < 0) + ret = nilfs_bmap_convert_error(bmap, __func__, ret); + return ret; +} + +int nilfs_bmap_last_key(struct nilfs_bmap *bmap, __u64 *keyp) +{ + int ret; + + down_read(&bmap->b_sem); + ret = bmap->b_ops->bop_last_key(bmap, keyp); up_read(&bmap->b_sem); if (ret < 0) ret = nilfs_bmap_convert_error(bmap, __func__, ret); - else - *key = lastkey; return ret; } @@ -224,7 +250,7 @@ int nilfs_bmap_last_key(struct nilfs_bmap *bmap, unsigned long *key) * * %-ENOENT - A record associated with @key does not exist. 
*/ -int nilfs_bmap_delete(struct nilfs_bmap *bmap, unsigned long key) +int nilfs_bmap_delete(struct nilfs_bmap *bmap, __u64 key) { int ret; @@ -235,7 +261,7 @@ int nilfs_bmap_delete(struct nilfs_bmap *bmap, unsigned long key) return nilfs_bmap_convert_error(bmap, __func__, ret); } -static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, unsigned long key) +static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, __u64 key) { __u64 lastkey; int ret; @@ -276,7 +302,7 @@ static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, unsigned long key) * * %-ENOMEM - Insufficient amount of memory available. */ -int nilfs_bmap_truncate(struct nilfs_bmap *bmap, unsigned long key) +int nilfs_bmap_truncate(struct nilfs_bmap *bmap, __u64 key) { int ret; diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h index b89e68076adc..bfa817ce40b3 100644 --- a/fs/nilfs2/bmap.h +++ b/fs/nilfs2/bmap.h @@ -76,8 +76,10 @@ struct nilfs_bmap_operations { union nilfs_binfo *); int (*bop_mark)(struct nilfs_bmap *, __u64, int); - /* The following functions are internal use only. */ + int (*bop_seek_key)(const struct nilfs_bmap *, __u64, __u64 *); int (*bop_last_key)(const struct nilfs_bmap *, __u64 *); + + /* The following functions are internal use only. */ int (*bop_check_insert)(const struct nilfs_bmap *, __u64); int (*bop_check_delete)(struct nilfs_bmap *, __u64); int (*bop_gather_data)(struct nilfs_bmap *, __u64 *, __u64 *, int); @@ -153,10 +155,11 @@ int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *); int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *); void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *); int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned); -int nilfs_bmap_insert(struct nilfs_bmap *, unsigned long, unsigned long); -int nilfs_bmap_delete(struct nilfs_bmap *, unsigned long); -int nilfs_bmap_last_key(struct nilfs_bmap *, unsigned long *); -int nilfs_bmap_truncate(struct nilfs_bmap *, unsigned long); +int nilfs_bmap_insert(struct nilfs_bmap *bmap, __u64 key, unsigned long rec); +int nilfs_bmap_delete(struct nilfs_bmap *bmap, __u64 key); +int nilfs_bmap_seek_key(struct nilfs_bmap *bmap, __u64 start, __u64 *keyp); +int nilfs_bmap_last_key(struct nilfs_bmap *bmap, __u64 *keyp); +int nilfs_bmap_truncate(struct nilfs_bmap *bmap, __u64 key); void nilfs_bmap_clear(struct nilfs_bmap *); int nilfs_bmap_propagate(struct nilfs_bmap *, struct buffer_head *); void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *, struct list_head *); diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index ecdbae19a766..919fd5bb14a8 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -388,7 +388,7 @@ static int nilfs_btree_root_broken(const struct nilfs_btree_node *node, nchildren = nilfs_btree_node_get_nchildren(node); if (unlikely(level < NILFS_BTREE_LEVEL_NODE_MIN || - level > NILFS_BTREE_LEVEL_MAX || + level >= NILFS_BTREE_LEVEL_MAX || nchildren < 0 || nchildren > NILFS_BTREE_ROOT_NCHILDREN_MAX)) { pr_crit("NILFS: bad btree root (inode number=%lu): level = %d, flags = 0x%x, nchildren = %d\n", @@ -633,6 +633,44 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_bmap *btree, return 0; } +/** + * nilfs_btree_get_next_key - get next valid key from btree path array + * @btree: bmap struct of btree + * @path: array of nilfs_btree_path struct + * @minlevel: start level + * @nextkey: place to store the next valid key + * + * Return Value: If a next key was found, 0 is returned. Otherwise, + * -ENOENT is returned. 
+ */ +static int nilfs_btree_get_next_key(const struct nilfs_bmap *btree, + const struct nilfs_btree_path *path, + int minlevel, __u64 *nextkey) +{ + struct nilfs_btree_node *node; + int maxlevel = nilfs_btree_height(btree) - 1; + int index, next_adj, level; + + /* Next index is already set to bp_index for leaf nodes. */ + next_adj = 0; + for (level = minlevel; level <= maxlevel; level++) { + if (level == maxlevel) + node = nilfs_btree_get_root(btree); + else + node = nilfs_btree_get_nonroot_node(path, level); + + index = path[level].bp_index + next_adj; + if (index < nilfs_btree_node_get_nchildren(node)) { + /* Next key is in this node */ + *nextkey = nilfs_btree_node_get_key(node, index); + return 0; + } + /* For non-leaf nodes, next index is stored at bp_index + 1. */ + next_adj = 1; + } + return -ENOENT; +} + static int nilfs_btree_lookup(const struct nilfs_bmap *btree, __u64 key, int level, __u64 *ptrp) { @@ -1563,6 +1601,27 @@ out: return ret; } +static int nilfs_btree_seek_key(const struct nilfs_bmap *btree, __u64 start, + __u64 *keyp) +{ + struct nilfs_btree_path *path; + const int minlevel = NILFS_BTREE_LEVEL_NODE_MIN; + int ret; + + path = nilfs_btree_alloc_path(); + if (!path) + return -ENOMEM; + + ret = nilfs_btree_do_lookup(btree, path, start, NULL, minlevel, 0); + if (!ret) + *keyp = start; + else if (ret == -ENOENT) + ret = nilfs_btree_get_next_key(btree, path, minlevel, keyp); + + nilfs_btree_free_path(path); + return ret; +} + static int nilfs_btree_last_key(const struct nilfs_bmap *btree, __u64 *keyp) { struct nilfs_btree_path *path; @@ -2298,7 +2357,9 @@ static const struct nilfs_bmap_operations nilfs_btree_ops = { .bop_assign = nilfs_btree_assign, .bop_mark = nilfs_btree_mark, + .bop_seek_key = nilfs_btree_seek_key, .bop_last_key = nilfs_btree_last_key, + .bop_check_insert = NULL, .bop_check_delete = nilfs_btree_check_delete, .bop_gather_data = nilfs_btree_gather_data, @@ -2318,7 +2379,9 @@ static const struct nilfs_bmap_operations nilfs_btree_ops_gc = { .bop_assign = nilfs_btree_assign_gc, .bop_mark = NULL, + .bop_seek_key = NULL, .bop_last_key = NULL, + .bop_check_insert = NULL, .bop_check_delete = NULL, .bop_gather_data = NULL, diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index 0d58075f34e2..b6596cab9e99 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -53,6 +53,13 @@ nilfs_cpfile_get_offset(const struct inode *cpfile, __u64 cno) return do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile)); } +static __u64 nilfs_cpfile_first_checkpoint_in_block(const struct inode *cpfile, + unsigned long blkoff) +{ + return (__u64)nilfs_cpfile_checkpoints_per_block(cpfile) * blkoff + + 1 - NILFS_MDT(cpfile)->mi_first_entry_offset; +} + static unsigned long nilfs_cpfile_checkpoints_in_block(const struct inode *cpfile, __u64 curr, @@ -146,6 +153,44 @@ static inline int nilfs_cpfile_get_checkpoint_block(struct inode *cpfile, create, nilfs_cpfile_block_init, bhp); } +/** + * nilfs_cpfile_find_checkpoint_block - find and get a buffer on cpfile + * @cpfile: inode of cpfile + * @start_cno: start checkpoint number (inclusive) + * @end_cno: end checkpoint number (inclusive) + * @cnop: place to store the next checkpoint number + * @bhp: place to store a pointer to buffer_head struct + * + * Return Value: On success, it returns 0. On error, the following negative + * error code is returned. + * + * %-ENOMEM - Insufficient memory available. + * + * %-EIO - I/O error + * + * %-ENOENT - no block exists in the range. 
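+ *
+ * Illustrative caller pattern (an editorial sketch added for clarity; it is
+ * not part of the original patch):
+ *
+ *	__u64 cno = start_cno;
+ *	struct buffer_head *bh;
+ *	int ret;
+ *
+ *	ret = nilfs_cpfile_find_checkpoint_block(cpfile, cno, end_cno,
+ *						 &cno, &bh);
+ *	if (!ret) {
+ *		(read checkpoints from bh starting at cno, then brelse(bh))
+ *	} else if (ret == -ENOENT) {
+ *		(no populated checkpoint block is left in [start_cno, end_cno])
+ *	}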
+ */ +static int nilfs_cpfile_find_checkpoint_block(struct inode *cpfile, + __u64 start_cno, __u64 end_cno, + __u64 *cnop, + struct buffer_head **bhp) +{ + unsigned long start, end, blkoff; + int ret; + + if (unlikely(start_cno > end_cno)) + return -ENOENT; + + start = nilfs_cpfile_get_blkoff(cpfile, start_cno); + end = nilfs_cpfile_get_blkoff(cpfile, end_cno); + + ret = nilfs_mdt_find_block(cpfile, start, end, &blkoff, bhp); + if (!ret) + *cnop = (blkoff == start) ? start_cno : + nilfs_cpfile_first_checkpoint_in_block(cpfile, blkoff); + return ret; +} + static inline int nilfs_cpfile_delete_checkpoint_block(struct inode *cpfile, __u64 cno) { @@ -403,14 +448,15 @@ static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop, return -ENOENT; /* checkpoint number 0 is invalid */ down_read(&NILFS_MDT(cpfile)->mi_sem); - for (n = 0; cno < cur_cno && n < nci; cno += ncps) { - ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, cur_cno); - ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh); + for (n = 0; n < nci; cno += ncps) { + ret = nilfs_cpfile_find_checkpoint_block( + cpfile, cno, cur_cno - 1, &cno, &bh); if (ret < 0) { - if (ret != -ENOENT) - goto out; - continue; /* skip hole */ + if (likely(ret == -ENOENT)) + break; + goto out; } + ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, cur_cno); kaddr = kmap_atomic(bh->b_page); cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr); diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 197a63e9d102..6b8b92b19cec 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -61,11 +61,6 @@ static inline void nilfs_put_page(struct page *page) page_cache_release(page); } -static inline unsigned long dir_pages(struct inode *inode) -{ - return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; -} - /* * Return the offset into page `page_nr' of the last valid * byte in that page, plus one. 
@@ -435,7 +430,7 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de, */ int nilfs_add_link(struct dentry *dentry, struct inode *inode) { - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); const unsigned char *name = dentry->d_name.name; int namelen = dentry->d_name.len; unsigned chunk_size = nilfs_chunk_size(dir); diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c index 82f4865e86dd..ebf89fd8ac1a 100644 --- a/fs/nilfs2/direct.c +++ b/fs/nilfs2/direct.c @@ -173,6 +173,21 @@ static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key) return ret; } +static int nilfs_direct_seek_key(const struct nilfs_bmap *direct, __u64 start, + __u64 *keyp) +{ + __u64 key; + + for (key = start; key <= NILFS_DIRECT_KEY_MAX; key++) { + if (nilfs_direct_get_ptr(direct, key) != + NILFS_BMAP_INVALID_PTR) { + *keyp = key; + return 0; + } + } + return -ENOENT; +} + static int nilfs_direct_last_key(const struct nilfs_bmap *direct, __u64 *keyp) { __u64 key, lastkey; @@ -355,7 +370,9 @@ static const struct nilfs_bmap_operations nilfs_direct_ops = { .bop_assign = nilfs_direct_assign, .bop_mark = NULL, + .bop_seek_key = nilfs_direct_seek_key, .bop_last_key = nilfs_direct_last_key, + .bop_check_insert = nilfs_direct_check_insert, .bop_check_delete = NULL, .bop_gather_data = nilfs_direct_gather_data, diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index a8c728acb7a8..54575e3cc1a2 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -143,8 +143,6 @@ static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma) */ const struct file_operations nilfs_file_operations = { .llseek = generic_file_llseek, - .read = new_sync_read, - .write = new_sync_write, .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, .unlocked_ioctl = nilfs_ioctl, diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 8b5969538f39..4a73d6dffabf 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -26,7 +26,7 @@ #include <linux/mpage.h> #include <linux/pagemap.h> #include <linux/writeback.h> -#include <linux/aio.h> +#include <linux/uio.h> #include "nilfs.h" #include "btnode.h" #include "segment.h" @@ -106,7 +106,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff, err = nilfs_transaction_begin(inode->i_sb, &ti, 1); if (unlikely(err)) goto out; - err = nilfs_bmap_insert(ii->i_bmap, (unsigned long)blkoff, + err = nilfs_bmap_insert(ii->i_bmap, blkoff, (unsigned long)bh_result); if (unlikely(err != 0)) { if (err == -EEXIST) { @@ -305,35 +305,15 @@ static int nilfs_write_end(struct file *file, struct address_space *mapping, } static ssize_t -nilfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, - loff_t offset) +nilfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) { - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = file->f_mapping->host; - size_t count = iov_iter_count(iter); - ssize_t size; + struct inode *inode = file_inode(iocb->ki_filp); - if (rw == WRITE) + if (iov_iter_rw(iter) == WRITE) return 0; /* Needs synchronization with the cleaner */ - size = blockdev_direct_IO(rw, iocb, inode, iter, offset, - nilfs_get_block); - - /* - * In case of error extending write may have instantiated a few - * blocks outside i_size. Trim these off again. 
- */ - if (unlikely((rw & WRITE) && size < 0)) { - loff_t isize = i_size_read(inode); - loff_t end = offset + count; - - if (end > isize) - nilfs_write_failed(mapping, end); - } - - return size; + return blockdev_direct_IO(iocb, inode, iter, offset, nilfs_get_block); } const struct address_space_operations nilfs_aops = { @@ -443,21 +423,20 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode) void nilfs_set_inode_flags(struct inode *inode) { unsigned int flags = NILFS_I(inode)->i_flags; + unsigned int new_fl = 0; - inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | - S_DIRSYNC); if (flags & FS_SYNC_FL) - inode->i_flags |= S_SYNC; + new_fl |= S_SYNC; if (flags & FS_APPEND_FL) - inode->i_flags |= S_APPEND; + new_fl |= S_APPEND; if (flags & FS_IMMUTABLE_FL) - inode->i_flags |= S_IMMUTABLE; + new_fl |= S_IMMUTABLE; if (flags & FS_NOATIME_FL) - inode->i_flags |= S_NOATIME; + new_fl |= S_NOATIME; if (flags & FS_DIRSYNC_FL) - inode->i_flags |= S_DIRSYNC; - mapping_set_gfp_mask(inode->i_mapping, - mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); + new_fl |= S_DIRSYNC; + inode_set_flags(inode, new_fl, S_SYNC | S_APPEND | S_IMMUTABLE | + S_NOATIME | S_DIRSYNC); } int nilfs_read_inode_common(struct inode *inode, @@ -542,6 +521,8 @@ static int __nilfs_read_inode(struct super_block *sb, brelse(bh); up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem); nilfs_set_inode_flags(inode); + mapping_set_gfp_mask(inode->i_mapping, + mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); return 0; failed_unmap: @@ -714,7 +695,7 @@ void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh, int flags) static void nilfs_truncate_bmap(struct nilfs_inode_info *ii, unsigned long from) { - unsigned long b; + __u64 b; int ret; if (!test_bit(NILFS_I_BMAP, &ii->i_state)) @@ -729,7 +710,7 @@ repeat: if (b < from) return; - b -= min_t(unsigned long, NILFS_MAX_TRUNCATE_BLOCKS, b - from); + b -= min_t(__u64, NILFS_MAX_TRUNCATE_BLOCKS, b - from); ret = nilfs_bmap_truncate(ii->i_bmap, b); nilfs_relax_pressure_in_lock(ii->vfs_inode.i_sb); if (!ret || (ret == -ENOMEM && @@ -836,7 +817,7 @@ void nilfs_evict_inode(struct inode *inode) int nilfs_setattr(struct dentry *dentry, struct iattr *iattr) { struct nilfs_transaction_info ti; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct super_block *sb = inode->i_sb; int err; diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 9a20e513d7eb..aba43811d6ef 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -1369,7 +1369,6 @@ long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case NILFS_IOCTL_SYNC: case NILFS_IOCTL_RESIZE: case NILFS_IOCTL_SET_ALLOC_RANGE: - case FITRIM: break; default: return -ENOIOCTLCMD; diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 892cf5ffdb8e..dee34d990281 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -261,6 +261,60 @@ int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create, } /** + * nilfs_mdt_find_block - find and get a buffer on meta data file. + * @inode: inode of the meta data file + * @start: start block offset (inclusive) + * @end: end block offset (inclusive) + * @blkoff: block offset + * @out_bh: place to store a pointer to buffer_head struct + * + * nilfs_mdt_find_block() looks up an existing block in range of + * [@start, @end] and stores pointer to a buffer head of the block to + * @out_bh, and block offset to @blkoff, respectively. @out_bh and + * @blkoff are substituted only when zero is returned. 
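+ *
+ * Illustrative use (an editorial sketch, not part of the original patch):
+ *
+ *	unsigned long blkoff;
+ *	struct buffer_head *bh;
+ *
+ *	if (!nilfs_mdt_find_block(inode, start, end, &blkoff, &bh)) {
+ *		(use bh; blkoff is the offset of the block actually found,
+ *		 which may be greater than start if the range has holes)
+ *		brelse(bh);
+ *	}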
+ * + * Return Value: On success, it returns 0. On error, the following negative + * error code is returned. + * + * %-ENOMEM - Insufficient memory available. + * + * %-EIO - I/O error + * + * %-ENOENT - no block was found in the range + */ +int nilfs_mdt_find_block(struct inode *inode, unsigned long start, + unsigned long end, unsigned long *blkoff, + struct buffer_head **out_bh) +{ + __u64 next; + int ret; + + if (unlikely(start > end)) + return -ENOENT; + + ret = nilfs_mdt_read_block(inode, start, true, out_bh); + if (!ret) { + *blkoff = start; + goto out; + } + if (unlikely(ret != -ENOENT || start == ULONG_MAX)) + goto out; + + ret = nilfs_bmap_seek_key(NILFS_I(inode)->i_bmap, start + 1, &next); + if (!ret) { + if (next <= end) { + ret = nilfs_mdt_read_block(inode, next, true, out_bh); + if (!ret) + *blkoff = next; + } else { + ret = -ENOENT; + } + } +out: + return ret; +} + +/** * nilfs_mdt_delete_block - make a hole on the meta data file. * @inode: inode of the meta data file * @block: block offset diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h index ab172e8549c5..fe529a87a208 100644 --- a/fs/nilfs2/mdt.h +++ b/fs/nilfs2/mdt.h @@ -78,6 +78,9 @@ int nilfs_mdt_get_block(struct inode *, unsigned long, int, void (*init_block)(struct inode *, struct buffer_head *, void *), struct buffer_head **); +int nilfs_mdt_find_block(struct inode *inode, unsigned long start, + unsigned long end, unsigned long *blkoff, + struct buffer_head **out_bh); int nilfs_mdt_delete_block(struct inode *, unsigned long); int nilfs_mdt_forget_block(struct inode *, unsigned long); int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long); @@ -111,7 +114,10 @@ static inline __u64 nilfs_mdt_cno(struct inode *inode) return ((struct the_nilfs *)inode->i_sb->s_fs_info)->ns_cno; } -#define nilfs_mdt_bgl_lock(inode, bg) \ - (&NILFS_MDT(inode)->mi_bgl->locks[(bg) & (NR_BG_LOCKS-1)].lock) +static inline spinlock_t * +nilfs_mdt_bgl_lock(struct inode *inode, unsigned int block_group) +{ + return bgl_lock_ptr(NILFS_MDT(inode)->mi_bgl, block_group); +} #endif /* _NILFS_MDT_H */ diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 0f84b257932c..37dd6b05b1b5 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -192,7 +192,7 @@ out_fail: static int nilfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); struct nilfs_transaction_info ti; int err; @@ -283,7 +283,7 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry) if (!de) goto out; - inode = dentry->d_inode; + inode = d_inode(dentry); err = -EIO; if (le64_to_cpu(de->inode) != inode->i_ino) goto out; @@ -318,7 +318,7 @@ static int nilfs_unlink(struct inode *dir, struct dentry *dentry) if (!err) { nilfs_mark_inode_dirty(dir); - nilfs_mark_inode_dirty(dentry->d_inode); + nilfs_mark_inode_dirty(d_inode(dentry)); err = nilfs_transaction_commit(dir->i_sb); } else nilfs_transaction_abort(dir->i_sb); @@ -328,7 +328,7 @@ static int nilfs_unlink(struct inode *dir, struct dentry *dentry) static int nilfs_rmdir(struct inode *dir, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct nilfs_transaction_info ti; int err; @@ -358,8 +358,8 @@ static int nilfs_rmdir(struct inode *dir, struct dentry *dentry) static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { - struct inode *old_inode = old_dentry->d_inode; - struct inode 
*new_inode = new_dentry->d_inode; + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); struct page *dir_page = NULL; struct nilfs_dir_entry *dir_de = NULL; struct page *old_page; @@ -453,13 +453,13 @@ static struct dentry *nilfs_get_parent(struct dentry *child) struct qstr dotdot = QSTR_INIT("..", 2); struct nilfs_root *root; - ino = nilfs_inode_by_name(child->d_inode, &dotdot); + ino = nilfs_inode_by_name(d_inode(child), &dotdot); if (!ino) return ERR_PTR(-ENOENT); - root = NILFS_I(child->d_inode)->i_root; + root = NILFS_I(d_inode(child))->i_root; - inode = nilfs_iget(child->d_inode->i_sb, root, ino); + inode = nilfs_iget(d_inode(child)->i_sb, root, ino); if (IS_ERR(inode)) return ERR_CAST(inode); @@ -496,8 +496,7 @@ static struct dentry *nilfs_fh_to_dentry(struct super_block *sb, struct fid *fh, { struct nilfs_fid *fid = (struct nilfs_fid *)fh; - if ((fh_len != NILFS_FID_SIZE_NON_CONNECTABLE && - fh_len != NILFS_FID_SIZE_CONNECTABLE) || + if (fh_len < NILFS_FID_SIZE_NON_CONNECTABLE || (fh_type != FILEID_NILFS_WITH_PARENT && fh_type != FILEID_NILFS_WITHOUT_PARENT)) return NULL; @@ -510,7 +509,7 @@ static struct dentry *nilfs_fh_to_parent(struct super_block *sb, struct fid *fh, { struct nilfs_fid *fid = (struct nilfs_fid *)fh; - if (fh_len != NILFS_FID_SIZE_CONNECTABLE || + if (fh_len < NILFS_FID_SIZE_CONNECTABLE || fh_type != FILEID_NILFS_WITH_PARENT) return NULL; diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 700ecbcca55d..45d650addd56 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -89,18 +89,16 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode, void nilfs_forget_buffer(struct buffer_head *bh) { struct page *page = bh->b_page; + const unsigned long clear_bits = + (1 << BH_Uptodate | 1 << BH_Dirty | 1 << BH_Mapped | + 1 << BH_Async_Write | 1 << BH_NILFS_Volatile | + 1 << BH_NILFS_Checked | 1 << BH_NILFS_Redirected); lock_buffer(bh); - clear_buffer_nilfs_volatile(bh); - clear_buffer_nilfs_checked(bh); - clear_buffer_nilfs_redirected(bh); - clear_buffer_async_write(bh); - clear_buffer_dirty(bh); + set_mask_bits(&bh->b_state, clear_bits, 0); if (nilfs_page_buffers_clean(page)) __nilfs_clear_page_dirty(page); - clear_buffer_uptodate(bh); - clear_buffer_mapped(bh); bh->b_blocknr = -1; ClearPageUptodate(page); ClearPageMappedToDisk(page); @@ -421,6 +419,10 @@ void nilfs_clear_dirty_page(struct page *page, bool silent) if (page_has_buffers(page)) { struct buffer_head *bh, *head; + const unsigned long clear_bits = + (1 << BH_Uptodate | 1 << BH_Dirty | 1 << BH_Mapped | + 1 << BH_Async_Write | 1 << BH_NILFS_Volatile | + 1 << BH_NILFS_Checked | 1 << BH_NILFS_Redirected); bh = head = page_buffers(page); do { @@ -430,13 +432,7 @@ void nilfs_clear_dirty_page(struct page *page, bool silent) "discard block %llu, size %zu", (u64)bh->b_blocknr, bh->b_size); } - clear_buffer_async_write(bh); - clear_buffer_dirty(bh); - clear_buffer_nilfs_volatile(bh); - clear_buffer_nilfs_checked(bh); - clear_buffer_nilfs_redirected(bh); - clear_buffer_uptodate(bh); - clear_buffer_mapped(bh); + set_mask_bits(&bh->b_state, clear_bits, 0); unlock_buffer(bh); } while (bh = bh->b_this_page, bh != head); } diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index dc3a9efdaab8..42468e5ab3e7 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -343,11 +343,6 @@ static void nilfs_end_bio_write(struct bio *bio, int err) const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct nilfs_segment_buffer *segbuf = bio->bi_private; - if (err == 
-EOPNOTSUPP) { - set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); - /* to be detected by nilfs_segbuf_submit_bio() */ - } - if (!uptodate) atomic_inc(&segbuf->sb_err); @@ -374,15 +369,8 @@ static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf, bio->bi_end_io = nilfs_end_bio_write; bio->bi_private = segbuf; - bio_get(bio); submit_bio(mode, bio); segbuf->sb_nbio++; - if (bio_flagged(bio, BIO_EOPNOTSUPP)) { - bio_put(bio); - err = -EOPNOTSUPP; - goto failed; - } - bio_put(bio); wi->bio = NULL; wi->rest_blocks -= wi->end - wi->start; diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 0c3f303baf32..c6abbad9b8e3 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -24,6 +24,7 @@ #include <linux/pagemap.h> #include <linux/buffer_head.h> #include <linux/writeback.h> +#include <linux/bitops.h> #include <linux/bio.h> #include <linux/completion.h> #include <linux/blkdev.h> @@ -1588,7 +1589,6 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci) list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) { - set_buffer_async_write(bh); if (bh->b_page != bd_page) { if (bd_page) { lock_page(bd_page); @@ -1688,7 +1688,6 @@ static void nilfs_abort_logs(struct list_head *logs, int err) list_for_each_entry(segbuf, logs, sb_list) { list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) { - clear_buffer_async_write(bh); if (bh->b_page != bd_page) { if (bd_page) end_page_writeback(bd_page); @@ -1768,7 +1767,6 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) b_assoc_buffers) { set_buffer_uptodate(bh); clear_buffer_dirty(bh); - clear_buffer_async_write(bh); if (bh->b_page != bd_page) { if (bd_page) end_page_writeback(bd_page); @@ -1788,12 +1786,13 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) */ list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) { - set_buffer_uptodate(bh); - clear_buffer_dirty(bh); - clear_buffer_async_write(bh); - clear_buffer_delay(bh); - clear_buffer_nilfs_volatile(bh); - clear_buffer_nilfs_redirected(bh); + const unsigned long set_bits = (1 << BH_Uptodate); + const unsigned long clear_bits = + (1 << BH_Dirty | 1 << BH_Async_Write | + 1 << BH_Delay | 1 << BH_NILFS_Volatile | + 1 << BH_NILFS_Redirected); + + set_mask_bits(&bh->b_state, clear_bits, set_bits); if (bh == segbuf->sb_super_root) { if (bh->b_page != bd_page) { end_page_writeback(bd_page); diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 5bc2a1cf73c3..f47585bfeb01 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -610,7 +610,7 @@ static int nilfs_unfreeze(struct super_block *sb) static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; - struct nilfs_root *root = NILFS_I(dentry->d_inode)->i_root; + struct nilfs_root *root = NILFS_I(d_inode(dentry))->i_root; struct the_nilfs *nilfs = root->nilfs; u64 id = huge_encode_dev(sb->s_bdev->bd_dev); unsigned long long blocks; @@ -681,7 +681,7 @@ static int nilfs_show_options(struct seq_file *seq, struct dentry *dentry) { struct super_block *sb = dentry->d_sb; struct the_nilfs *nilfs = sb->s_fs_info; - struct nilfs_root *root = NILFS_I(dentry->d_inode)->i_root; + struct nilfs_root *root = NILFS_I(d_inode(dentry))->i_root; if (!nilfs_test_opt(nilfs, BARRIER)) seq_puts(seq, ",nobarrier"); @@ -1020,7 +1020,7 @@ int nilfs_checkpoint_is_mounted(struct super_block *sb, __u64 cno) struct dentry *dentry; int ret; - if (cno < 0 || cno > nilfs->ns_cno) + if (cno > nilfs->ns_cno) return false; if (cno >= 
nilfs_last_cno(nilfs)) @@ -1190,7 +1190,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) sb->s_flags &= ~MS_RDONLY; - root = NILFS_I(sb->s_root->d_inode)->i_root; + root = NILFS_I(d_inode(sb->s_root))->i_root; err = nilfs_attach_log_writer(sb, root); if (err) goto restore_opts; diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 450648697433..5b1e2a497e51 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -26,7 +26,7 @@ #include <linux/fs.h> /* struct inode */ #include <linux/fsnotify_backend.h> #include <linux/idr.h> -#include <linux/init.h> /* module_init */ +#include <linux/init.h> /* fs_initcall */ #include <linux/inotify.h> #include <linux/kernel.h> /* roundup() */ #include <linux/namei.h> /* LOOKUP_FOLLOW */ @@ -812,4 +812,4 @@ static int __init inotify_user_setup(void) return 0; } -module_init(inotify_user_setup); +fs_initcall(inotify_user_setup); diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 92e48c70f0f0..39ddcaf0918f 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -412,16 +412,36 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, unsigned int flags) { struct fsnotify_mark *lmark, *mark; + LIST_HEAD(to_free); + /* + * We have to be really careful here. Anytime we drop mark_mutex, e.g. + * fsnotify_clear_marks_by_inode() can come and free marks. This can even + * happen to marks on our to_free list, so we must hold mark_mutex even + * when accessing that list. And freeing a mark requires us to drop + * mark_mutex. So we can reliably free only the first mark in the list. + * That's why we first move the marks to be freed onto the to_free list + * in one go and then free them one by one. + */ mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) { - if (mark->flags & flags) { - fsnotify_get_mark(mark); - fsnotify_destroy_mark_locked(mark, group); - fsnotify_put_mark(mark); - } + if (mark->flags & flags) + list_move(&mark->g_list, &to_free); } mutex_unlock(&group->mark_mutex); + + while (1) { + mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); + if (list_empty(&to_free)) { + mutex_unlock(&group->mark_mutex); + break; + } + mark = list_first_entry(&to_free, struct fsnotify_mark, g_list); + fsnotify_get_mark(mark); + fsnotify_destroy_mark_locked(mark, group); + mutex_unlock(&group->mark_mutex); + fsnotify_put_mark(mark); + } } /* diff --git a/fs/nsfs.c b/fs/nsfs.c index af1b24fa899d..99521e7c492b 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -13,7 +13,7 @@ static const struct file_operations ns_file_operations = { static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); const struct proc_ns_operations *ns_ops = dentry->d_fsdata; return dynamic_dname(dentry, buffer, buflen, "%s:[%lu]", @@ -22,7 +22,7 @@ static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) static void ns_prune_dentry(struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); if (inode) { struct ns_common *ns = inode->i_private; atomic_long_set(&ns->stashed, 0); diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile index 36ae529511c4..2ff263e6d363 100644 --- a/fs/ntfs/Makefile +++ b/fs/ntfs/Makefile @@ -8,7 +8,7 @@ ntfs-y := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \ ntfs-$(CONFIG_NTFS_RW) += bitmap.o lcnalloc.o logfile.o quota.o usnjrnl.o -ccflags-y :=
-DNTFS_VERSION=\"2.1.31\" +ccflags-y := -DNTFS_VERSION=\"2.1.32\" ccflags-$(CONFIG_NTFS_DEBUG) += -DDEBUG ccflags-$(CONFIG_NTFS_RW) += -DNTFS_RW diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 1da9b2d184dc..262561fea923 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -1,7 +1,7 @@ /* * file.c - NTFS kernel file operations. Part of the Linux-NTFS project. * - * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc. + * Copyright (c) 2001-2015 Anton Altaparmakov and Tuxera Inc. * * This program/include file is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as published @@ -28,7 +28,6 @@ #include <linux/swap.h> #include <linux/uio.h> #include <linux/writeback.h> -#include <linux/aio.h> #include <asm/page.h> #include <asm/uaccess.h> @@ -329,62 +328,166 @@ err_out: return err; } -/** - * ntfs_fault_in_pages_readable - - * - * Fault a number of userspace pages into pagetables. - * - * Unlike include/linux/pagemap.h::fault_in_pages_readable(), this one copes - * with more than two userspace pages as well as handling the single page case - * elegantly. - * - * If you find this difficult to understand, then think of the while loop being - * the following code, except that we do without the integer variable ret: - * - * do { - * ret = __get_user(c, uaddr); - * uaddr += PAGE_SIZE; - * } while (!ret && uaddr < end); - * - * Note, the final __get_user() may well run out-of-bounds of the user buffer, - * but _not_ out-of-bounds of the page the user buffer belongs to, and since - * this is only a read and not a write, and since it is still in the same page, - * it should not matter and this makes the code much simpler. - */ -static inline void ntfs_fault_in_pages_readable(const char __user *uaddr, - int bytes) +static ssize_t ntfs_prepare_file_for_write(struct kiocb *iocb, + struct iov_iter *from) { - const char __user *end; - volatile char c; - - /* Set @end to the first byte outside the last page we care about. */ - end = (const char __user*)PAGE_ALIGN((unsigned long)uaddr + bytes); - - while (!__get_user(c, uaddr) && (uaddr += PAGE_SIZE, uaddr < end)) - ; -} - -/** - * ntfs_fault_in_pages_readable_iovec - - * - * Same as ntfs_fault_in_pages_readable() but operates on an array of iovecs. - */ -static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov, - size_t iov_ofs, int bytes) -{ - do { - const char __user *buf; - unsigned len; + loff_t pos; + s64 end, ll; + ssize_t err; + unsigned long flags; + struct file *file = iocb->ki_filp; + struct inode *vi = file_inode(file); + ntfs_inode *base_ni, *ni = NTFS_I(vi); + ntfs_volume *vol = ni->vol; - buf = iov->iov_base + iov_ofs; - len = iov->iov_len - iov_ofs; - if (len > bytes) - len = bytes; - ntfs_fault_in_pages_readable(buf, len); - bytes -= len; - iov++; - iov_ofs = 0; - } while (bytes); + ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos " + "0x%llx, count 0x%zx.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), + (unsigned long long)iocb->ki_pos, + iov_iter_count(from)); + err = generic_write_checks(iocb, from); + if (unlikely(err <= 0)) + goto out; + /* + * All checks have passed. Before we start doing any writing we want + * to abort any totally illegal writes. + */ + BUG_ON(NInoMstProtected(ni)); + BUG_ON(ni->type != AT_DATA); + /* If file is encrypted, deny access, just like NT4. */ + if (NInoEncrypted(ni)) { + /* Only $DATA attributes can be encrypted. 
*/ + /* + * Reminder for later: Encrypted files are _always_ + * non-resident so that the content can always be encrypted. + */ + ntfs_debug("Denying write access to encrypted file."); + err = -EACCES; + goto out; + } + if (NInoCompressed(ni)) { + /* Only unnamed $DATA attribute can be compressed. */ + BUG_ON(ni->name_len); + /* + * Reminder for later: If resident, the data is not actually + * compressed. Only on the switch to non-resident does + * compression kick in. This is in contrast to encrypted files + * (see above). + */ + ntfs_error(vi->i_sb, "Writing to compressed files is not " + "implemented yet. Sorry."); + err = -EOPNOTSUPP; + goto out; + } + base_ni = ni; + if (NInoAttr(ni)) + base_ni = ni->ext.base_ntfs_ino; + err = file_remove_privs(file); + if (unlikely(err)) + goto out; + /* + * Our ->update_time method always succeeds thus file_update_time() + * cannot fail either so there is no need to check the return code. + */ + file_update_time(file); + pos = iocb->ki_pos; + /* The first byte after the last cluster being written to. */ + end = (pos + iov_iter_count(from) + vol->cluster_size_mask) & + ~(u64)vol->cluster_size_mask; + /* + * If the write goes beyond the allocated size, extend the allocation + * to cover the whole of the write, rounded up to the nearest cluster. + */ + read_lock_irqsave(&ni->size_lock, flags); + ll = ni->allocated_size; + read_unlock_irqrestore(&ni->size_lock, flags); + if (end > ll) { + /* + * Extend the allocation without changing the data size. + * + * Note we ensure the allocation is big enough to at least + * write some data but we do not require the allocation to be + * complete, i.e. it may be partial. + */ + ll = ntfs_attr_extend_allocation(ni, end, -1, pos); + if (likely(ll >= 0)) { + BUG_ON(pos >= ll); + /* If the extension was partial, truncate the write. */ + if (end > ll) { + ntfs_debug("Truncating write to inode 0x%lx, " + "attribute type 0x%x, because " + "the allocation was only " + "partially extended.", + vi->i_ino, (unsigned) + le32_to_cpu(ni->type)); + iov_iter_truncate(from, ll - pos); + } + } else { + err = ll; + read_lock_irqsave(&ni->size_lock, flags); + ll = ni->allocated_size; + read_unlock_irqrestore(&ni->size_lock, flags); + /* Perform a partial write if possible or fail. */ + if (pos < ll) { + ntfs_debug("Truncating write to inode 0x%lx, " + "attribute type 0x%x, because " + "extending the allocation " + "failed (error %d).", + vi->i_ino, (unsigned) + le32_to_cpu(ni->type), + (int)-err); + iov_iter_truncate(from, ll - pos); + } else { + if (err != -ENOSPC) + ntfs_error(vi->i_sb, "Cannot perform " + "write to inode " + "0x%lx, attribute " + "type 0x%x, because " + "extending the " + "allocation failed " + "(error %ld).", + vi->i_ino, (unsigned) + le32_to_cpu(ni->type), + (long)-err); + else + ntfs_debug("Cannot perform write to " + "inode 0x%lx, " + "attribute type 0x%x, " + "because there is no " + "space left.", + vi->i_ino, (unsigned) + le32_to_cpu(ni->type)); + goto out; + } + } + } + /* + * If the write starts beyond the initialized size, extend it up to the + * beginning of the write and initialize all non-sparse space between + * the old initialized size and the new one. This automatically also + * increments the vfs inode->i_size to keep it above or equal to the + * initialized_size. + */ + read_lock_irqsave(&ni->size_lock, flags); + ll = ni->initialized_size; + read_unlock_irqrestore(&ni->size_lock, flags); + if (pos > ll) { + /* + * Wait for ongoing direct i/o to complete before proceeding.
+ * New direct i/o cannot start as we hold i_mutex. + */ + inode_dio_wait(vi); + err = ntfs_attr_extend_initialized(ni, pos); + if (unlikely(err < 0)) + ntfs_error(vi->i_sb, "Cannot perform write to inode " + "0x%lx, attribute type 0x%x, because " + "extending the initialized size " + "failed (error %d).", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), + (int)-err); + } +out: + return err; } /** @@ -421,8 +524,9 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping, goto err_out; } } - err = add_to_page_cache_lru(*cached_page, mapping, index, - GFP_KERNEL); + err = add_to_page_cache_lru(*cached_page, mapping, + index, + GFP_KERNEL & mapping_gfp_mask(mapping)); if (unlikely(err)) { if (err == -EEXIST) continue; @@ -1268,180 +1372,6 @@ rl_not_mapped_enoent: return err; } -/* - * Copy as much as we can into the pages and return the number of bytes which - * were successfully copied. If a fault is encountered then clear the pages - * out to (ofs + bytes) and return the number of bytes which were copied. - */ -static inline size_t ntfs_copy_from_user(struct page **pages, - unsigned nr_pages, unsigned ofs, const char __user *buf, - size_t bytes) -{ - struct page **last_page = pages + nr_pages; - char *addr; - size_t total = 0; - unsigned len; - int left; - - do { - len = PAGE_CACHE_SIZE - ofs; - if (len > bytes) - len = bytes; - addr = kmap_atomic(*pages); - left = __copy_from_user_inatomic(addr + ofs, buf, len); - kunmap_atomic(addr); - if (unlikely(left)) { - /* Do it the slow way. */ - addr = kmap(*pages); - left = __copy_from_user(addr + ofs, buf, len); - kunmap(*pages); - if (unlikely(left)) - goto err_out; - } - total += len; - bytes -= len; - if (!bytes) - break; - buf += len; - ofs = 0; - } while (++pages < last_page); -out: - return total; -err_out: - total += len - left; - /* Zero the rest of the target like __copy_from_user(). */ - while (++pages < last_page) { - bytes -= len; - if (!bytes) - break; - len = PAGE_CACHE_SIZE; - if (len > bytes) - len = bytes; - zero_user(*pages, 0, len); - } - goto out; -} - -static size_t __ntfs_copy_from_user_iovec_inatomic(char *vaddr, - const struct iovec *iov, size_t iov_ofs, size_t bytes) -{ - size_t total = 0; - - while (1) { - const char __user *buf = iov->iov_base + iov_ofs; - unsigned len; - size_t left; - - len = iov->iov_len - iov_ofs; - if (len > bytes) - len = bytes; - left = __copy_from_user_inatomic(vaddr, buf, len); - total += len; - bytes -= len; - vaddr += len; - if (unlikely(left)) { - total -= left; - break; - } - if (!bytes) - break; - iov++; - iov_ofs = 0; - } - return total; -} - -static inline void ntfs_set_next_iovec(const struct iovec **iovp, - size_t *iov_ofsp, size_t bytes) -{ - const struct iovec *iov = *iovp; - size_t iov_ofs = *iov_ofsp; - - while (bytes) { - unsigned len; - - len = iov->iov_len - iov_ofs; - if (len > bytes) - len = bytes; - bytes -= len; - iov_ofs += len; - if (iov->iov_len == iov_ofs) { - iov++; - iov_ofs = 0; - } - } - *iovp = iov; - *iov_ofsp = iov_ofs; -} - -/* - * This has the same side-effects and return value as ntfs_copy_from_user(). - * The difference is that on a fault we need to memset the remainder of the - * pages (out to offset + bytes), to emulate ntfs_copy_from_user()'s - * single-segment behaviour. - * - * We call the same helper (__ntfs_copy_from_user_iovec_inatomic()) both when - * atomic and when not atomic. This is ok because it calls - * __copy_from_user_inatomic() and it is ok to call this when non-atomic. 
In - * fact, the only difference between __copy_from_user_inatomic() and - * __copy_from_user() is that the latter calls might_sleep() and the former - * should not zero the tail of the buffer on error. And on many architectures - * __copy_from_user_inatomic() is just defined to __copy_from_user() so it - * makes no difference at all on those architectures. - */ -static inline size_t ntfs_copy_from_user_iovec(struct page **pages, - unsigned nr_pages, unsigned ofs, const struct iovec **iov, - size_t *iov_ofs, size_t bytes) -{ - struct page **last_page = pages + nr_pages; - char *addr; - size_t copied, len, total = 0; - - do { - len = PAGE_CACHE_SIZE - ofs; - if (len > bytes) - len = bytes; - addr = kmap_atomic(*pages); - copied = __ntfs_copy_from_user_iovec_inatomic(addr + ofs, - *iov, *iov_ofs, len); - kunmap_atomic(addr); - if (unlikely(copied != len)) { - /* Do it the slow way. */ - addr = kmap(*pages); - copied = __ntfs_copy_from_user_iovec_inatomic(addr + - ofs, *iov, *iov_ofs, len); - if (unlikely(copied != len)) - goto err_out; - kunmap(*pages); - } - total += len; - ntfs_set_next_iovec(iov, iov_ofs, len); - bytes -= len; - if (!bytes) - break; - ofs = 0; - } while (++pages < last_page); -out: - return total; -err_out: - BUG_ON(copied > len); - /* Zero the rest of the target like __copy_from_user(). */ - memset(addr + ofs + copied, 0, len - copied); - kunmap(*pages); - total += copied; - ntfs_set_next_iovec(iov, iov_ofs, copied); - while (++pages < last_page) { - bytes -= len; - if (!bytes) - break; - len = PAGE_CACHE_SIZE; - if (len > bytes) - len = bytes; - zero_user(*pages, 0, len); - } - goto out; -} - static inline void ntfs_flush_dcache_pages(struct page **pages, unsigned nr_pages) { @@ -1762,86 +1692,83 @@ err_out: return err; } -static void ntfs_write_failed(struct address_space *mapping, loff_t to) +/* + * Copy as much as we can into the pages and return the number of bytes which + * were successfully copied. If a fault is encountered then clear the pages + * out to (ofs + bytes) and return the number of bytes which were copied. + */ +static size_t ntfs_copy_from_user_iter(struct page **pages, unsigned nr_pages, + unsigned ofs, struct iov_iter *i, size_t bytes) { - struct inode *inode = mapping->host; + struct page **last_page = pages + nr_pages; + size_t total = 0; + struct iov_iter data = *i; + unsigned len, copied; - if (to > inode->i_size) { - truncate_pagecache(inode, inode->i_size); - ntfs_truncate_vfs(inode); - } + do { + len = PAGE_CACHE_SIZE - ofs; + if (len > bytes) + len = bytes; + copied = iov_iter_copy_from_user_atomic(*pages, &data, ofs, + len); + total += copied; + bytes -= copied; + if (!bytes) + break; + iov_iter_advance(&data, copied); + if (copied < len) + goto err; + ofs = 0; + } while (++pages < last_page); +out: + return total; +err: + /* Zero the rest of the target like __copy_from_user(). */ + len = PAGE_CACHE_SIZE - copied; + do { + if (len > bytes) + len = bytes; + zero_user(*pages, copied, len); + bytes -= len; + copied = 0; + len = PAGE_CACHE_SIZE; + } while (++pages < last_page); + goto out; } /** - * ntfs_file_buffered_write - - * - * Locking: The vfs is holding ->i_mutex on the inode. 
+ * ntfs_perform_write - perform buffered write to a file + * @file: file to write to + * @i: iov_iter with data to write + * @pos: byte offset in file at which to begin writing to */ -static ssize_t ntfs_file_buffered_write(struct kiocb *iocb, - const struct iovec *iov, unsigned long nr_segs, - loff_t pos, loff_t *ppos, size_t count) +static ssize_t ntfs_perform_write(struct file *file, struct iov_iter *i, + loff_t pos) { - struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *vi = mapping->host; ntfs_inode *ni = NTFS_I(vi); ntfs_volume *vol = ni->vol; struct page *pages[NTFS_MAX_PAGES_PER_CLUSTER]; struct page *cached_page = NULL; - char __user *buf = NULL; - s64 end, ll; VCN last_vcn; LCN lcn; - unsigned long flags; - size_t bytes, iov_ofs = 0; /* Offset in the current iovec. */ - ssize_t status, written; + size_t bytes; + ssize_t status, written = 0; unsigned nr_pages; - int err; - ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, " - "pos 0x%llx, count 0x%lx.", - vi->i_ino, (unsigned)le32_to_cpu(ni->type), - (unsigned long long)pos, (unsigned long)count); - if (unlikely(!count)) - return 0; - BUG_ON(NInoMstProtected(ni)); - /* - * If the attribute is not an index root and it is encrypted or - * compressed, we cannot write to it yet. Note we need to check for - * AT_INDEX_ALLOCATION since this is the type of both directory and - * index inodes. - */ - if (ni->type != AT_INDEX_ALLOCATION) { - /* If file is encrypted, deny access, just like NT4. */ - if (NInoEncrypted(ni)) { - /* - * Reminder for later: Encrypted files are _always_ - * non-resident so that the content can always be - * encrypted. - */ - ntfs_debug("Denying write access to encrypted file."); - return -EACCES; - } - if (NInoCompressed(ni)) { - /* Only unnamed $DATA attribute can be compressed. */ - BUG_ON(ni->type != AT_DATA); - BUG_ON(ni->name_len); - /* - * Reminder for later: If resident, the data is not - * actually compressed. Only on the switch to non- - * resident does compression kick in. This is in - * contrast to encrypted files (see above). - */ - ntfs_error(vi->i_sb, "Writing to compressed files is " - "not implemented yet. Sorry."); - return -EOPNOTSUPP; - } - } + ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, pos " + "0x%llx, count 0x%lx.", vi->i_ino, + (unsigned)le32_to_cpu(ni->type), + (unsigned long long)pos, + (unsigned long)iov_iter_count(i)); /* * If a previous ntfs_truncate() failed, repeat it and abort if it * fails again. */ if (unlikely(NInoTruncateFailed(ni))) { + int err; + inode_dio_wait(vi); err = ntfs_truncate(vi); if (err || NInoTruncateFailed(ni)) { @@ -1855,81 +1782,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb, return err; } } - /* The first byte after the write. */ - end = pos + count; - /* - * If the write goes beyond the allocated size, extend the allocation - * to cover the whole of the write, rounded up to the nearest cluster. - */ - read_lock_irqsave(&ni->size_lock, flags); - ll = ni->allocated_size; - read_unlock_irqrestore(&ni->size_lock, flags); - if (end > ll) { - /* Extend the allocation without changing the data size. */ - ll = ntfs_attr_extend_allocation(ni, end, -1, pos); - if (likely(ll >= 0)) { - BUG_ON(pos >= ll); - /* If the extension was partial truncate the write. 
*/ - if (end > ll) { - ntfs_debug("Truncating write to inode 0x%lx, " - "attribute type 0x%x, because " - "the allocation was only " - "partially extended.", - vi->i_ino, (unsigned) - le32_to_cpu(ni->type)); - end = ll; - count = ll - pos; - } - } else { - err = ll; - read_lock_irqsave(&ni->size_lock, flags); - ll = ni->allocated_size; - read_unlock_irqrestore(&ni->size_lock, flags); - /* Perform a partial write if possible or fail. */ - if (pos < ll) { - ntfs_debug("Truncating write to inode 0x%lx, " - "attribute type 0x%x, because " - "extending the allocation " - "failed (error code %i).", - vi->i_ino, (unsigned) - le32_to_cpu(ni->type), err); - end = ll; - count = ll - pos; - } else { - ntfs_error(vol->sb, "Cannot perform write to " - "inode 0x%lx, attribute type " - "0x%x, because extending the " - "allocation failed (error " - "code %i).", vi->i_ino, - (unsigned) - le32_to_cpu(ni->type), err); - return err; - } - } - } - written = 0; - /* - * If the write starts beyond the initialized size, extend it up to the - * beginning of the write and initialize all non-sparse space between - * the old initialized size and the new one. This automatically also - * increments the vfs inode->i_size to keep it above or equal to the - * initialized_size. - */ - read_lock_irqsave(&ni->size_lock, flags); - ll = ni->initialized_size; - read_unlock_irqrestore(&ni->size_lock, flags); - if (pos > ll) { - err = ntfs_attr_extend_initialized(ni, pos); - if (err < 0) { - ntfs_error(vol->sb, "Cannot perform write to inode " - "0x%lx, attribute type 0x%x, because " - "extending the initialized size " - "failed (error code %i).", vi->i_ino, - (unsigned)le32_to_cpu(ni->type), err); - status = err; - goto err_out; - } - } /* * Determine the number of pages per cluster for non-resident * attributes. @@ -1937,10 +1789,7 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb, nr_pages = 1; if (vol->cluster_size > PAGE_CACHE_SIZE && NInoNonResident(ni)) nr_pages = vol->cluster_size >> PAGE_CACHE_SHIFT; - /* Finally, perform the actual write. */ last_vcn = -1; - if (likely(nr_segs == 1)) - buf = iov->iov_base; do { VCN vcn; pgoff_t idx, start_idx; @@ -1965,10 +1814,10 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb, vol->cluster_size_bits, false); up_read(&ni->runlist.lock); if (unlikely(lcn < LCN_HOLE)) { - status = -EIO; if (lcn == LCN_ENOMEM) status = -ENOMEM; - else + else { + status = -EIO; ntfs_error(vol->sb, "Cannot " "perform write to " "inode 0x%lx, " @@ -1977,6 +1826,7 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb, "is corrupt.", vi->i_ino, (unsigned) le32_to_cpu(ni->type)); + } break; } if (lcn == LCN_HOLE) { @@ -1989,8 +1839,9 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb, } } } - if (bytes > count) - bytes = count; + if (bytes > iov_iter_count(i)) + bytes = iov_iter_count(i); +again: /* * Bring in the user page(s) that we will copy from _first_. * Otherwise there is a nasty deadlock on copying from the same @@ -1999,10 +1850,10 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb, * pages being swapped out between us bringing them into memory * and doing the actual copying. */ - if (likely(nr_segs == 1)) - ntfs_fault_in_pages_readable(buf, bytes); - else - ntfs_fault_in_pages_readable_iovec(iov, iov_ofs, bytes); + if (unlikely(iov_iter_fault_in_multipages_readable(i, bytes))) { + status = -EFAULT; + break; + } /* Get and lock @do_pages starting at index @start_idx. 
*/ status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages, pages, &cached_page); @@ -2018,56 +1869,57 @@ status = ntfs_prepare_pages_for_non_resident_write( pages, do_pages, pos, bytes); if (unlikely(status)) { - loff_t i_size; - do { unlock_page(pages[--do_pages]); page_cache_release(pages[do_pages]); } while (do_pages); - /* - * The write preparation may have instantiated - * allocated space outside i_size. Trim this - * off again. We can ignore any errors in this - * case as we will just be wasting a bit of - * allocated space, which is not a disaster. - */ - i_size = i_size_read(vi); - if (pos + bytes > i_size) { - ntfs_write_failed(mapping, pos + bytes); - } break; } } u = (pos >> PAGE_CACHE_SHIFT) - pages[0]->index; - if (likely(nr_segs == 1)) { - copied = ntfs_copy_from_user(pages + u, do_pages - u, - ofs, buf, bytes); - buf += copied; - } else - copied = ntfs_copy_from_user_iovec(pages + u, - do_pages - u, ofs, &iov, &iov_ofs, - bytes); + copied = ntfs_copy_from_user_iter(pages + u, do_pages - u, ofs, + i, bytes); ntfs_flush_dcache_pages(pages + u, do_pages - u); - status = ntfs_commit_pages_after_write(pages, do_pages, pos, - bytes); - if (likely(!status)) { - written += copied; - count -= copied; - pos += copied; - if (unlikely(copied != bytes)) - status = -EFAULT; + status = 0; + if (likely(copied == bytes)) { + status = ntfs_commit_pages_after_write(pages, do_pages, + pos, bytes); + if (!status) + status = bytes; } do { unlock_page(pages[--do_pages]); page_cache_release(pages[do_pages]); } while (do_pages); - if (unlikely(status)) + if (unlikely(status < 0)) break; - balance_dirty_pages_ratelimited(mapping); + copied = status; cond_resched(); - } while (count); -err_out: - *ppos = pos; + if (unlikely(!copied)) { + size_t sc; + + /* + * We failed to copy anything. Fall back to single + * segment length write. + * + * This is needed to avoid possible livelock in the + * case that all segments in the iov cannot be copied + * at once without a pagefault. + */ + sc = iov_iter_single_seg_count(i); + if (bytes > sc) + bytes = sc; + goto again; + } + iov_iter_advance(i, copied); + pos += copied; + written += copied; + balance_dirty_pages_ratelimited(mapping); + if (fatal_signal_pending(current)) { + status = -EINTR; + break; + } + } while (iov_iter_count(i)); if (cached_page) page_cache_release(cached_page); ntfs_debug("Done. Returning %s (written 0x%lx, status %li).", @@ -2077,63 +1929,36 @@ err_out: } /** - * ntfs_file_aio_write_nolock - + * ntfs_file_write_iter - write data to a file + * @iocb: IO state structure + * @from: iov_iter with data to write + * + * Basically the same as generic_file_write_iter() except that it ends up + * calling ntfs_perform_write() instead of generic_perform_write() and that + * O_DIRECT is not implemented. */ -static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb, - const struct iovec *iov, unsigned long nr_segs, loff_t *ppos) +static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - loff_t pos; - size_t count; /* after file limit checks */ - ssize_t written, err; + struct inode *vi = file_inode(file); + ssize_t written = 0; + ssize_t err; - count = iov_length(iov, nr_segs); - pos = *ppos; + mutex_lock(&vi->i_mutex); /* We can write back this queue in page reclaim.
*/ - current->backing_dev_info = inode_to_bdi(inode); - written = 0; - err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); - if (err) - goto out; - if (!count) - goto out; - err = file_remove_suid(file); - if (err) - goto out; - err = file_update_time(file); - if (err) - goto out; - written = ntfs_file_buffered_write(iocb, iov, nr_segs, pos, ppos, - count); -out: + current->backing_dev_info = inode_to_bdi(vi); + err = ntfs_prepare_file_for_write(iocb, from); + if (iov_iter_count(from) && !err) + written = ntfs_perform_write(file, from, iocb->ki_pos); current->backing_dev_info = NULL; - return written ? written : err; -} - -/** - * ntfs_file_aio_write - - */ -static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) -{ - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - ssize_t ret; - - BUG_ON(iocb->ki_pos != pos); - - mutex_lock(&inode->i_mutex); - ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos); - mutex_unlock(&inode->i_mutex); - if (ret > 0) { - int err = generic_write_sync(file, iocb->ki_pos - ret, ret); + mutex_unlock(&vi->i_mutex); + if (likely(written > 0)) { + err = generic_write_sync(file, iocb->ki_pos, written); if (err < 0) - ret = err; + written = 0; } - return ret; + iocb->ki_pos += written; + return written ? written : err; } /** @@ -2197,37 +2022,15 @@ static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end, #endif /* NTFS_RW */ const struct file_operations ntfs_file_ops = { - .llseek = generic_file_llseek, /* Seek inside file. */ - .read = new_sync_read, /* Read from file. */ - .read_iter = generic_file_read_iter, /* Async read from file. */ + .llseek = generic_file_llseek, + .read_iter = generic_file_read_iter, #ifdef NTFS_RW - .write = do_sync_write, /* Write to file. */ - .aio_write = ntfs_file_aio_write, /* Async write to file. */ - /*.release = ,*/ /* Last file is closed. See - fs/ext2/file.c:: - ext2_release_file() for - how to use this to discard - preallocated space for - write opened files. */ - .fsync = ntfs_file_fsync, /* Sync a file to disk. */ - /*.aio_fsync = ,*/ /* Sync all outstanding async - i/o operations on a - kiocb. */ + .write_iter = ntfs_file_write_iter, + .fsync = ntfs_file_fsync, #endif /* NTFS_RW */ - /*.ioctl = ,*/ /* Perform function on the - mounted filesystem. */ - .mmap = generic_file_mmap, /* Mmap file. */ - .open = ntfs_file_open, /* Open file. */ - .splice_read = generic_file_splice_read /* Zero-copy data send with - the data source being on - the ntfs partition. We do - not need to care about the - data destination. */ - /*.sendpage = ,*/ /* Zero-copy data send with - the data destination being - on the ntfs partition. We - do not need to care about - the data source. 
*/ + .mmap = generic_file_mmap, + .open = ntfs_file_open, + .splice_read = generic_file_splice_read, }; const struct inode_operations ntfs_file_inode_ops = { diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 898b9949d363..d284f07eda77 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -28,7 +28,6 @@ #include <linux/quotaops.h> #include <linux/slab.h> #include <linux/log2.h> -#include <linux/aio.h> #include "aops.h" #include "attrib.h" @@ -2890,7 +2889,7 @@ void ntfs_truncate_vfs(struct inode *vi) { */ int ntfs_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *vi = dentry->d_inode; + struct inode *vi = d_inode(dentry); int err; unsigned int ia_valid = attr->ia_valid; diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h index 76b6cfb579d7..b3c3469de6cb 100644 --- a/fs/ntfs/inode.h +++ b/fs/ntfs/inode.h @@ -239,7 +239,7 @@ typedef struct { */ static inline ntfs_inode *NTFS_I(struct inode *inode) { - return (ntfs_inode *)list_entry(inode, big_ntfs_inode, vfs_inode); + return (ntfs_inode *)container_of(inode, big_ntfs_inode, vfs_inode); } static inline struct inode *VFS_I(ntfs_inode *ni) diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h index a44b14cbceeb..ab172e5f51d9 100644 --- a/fs/ntfs/malloc.h +++ b/fs/ntfs/malloc.h @@ -85,12 +85,7 @@ static inline void *ntfs_malloc_nofs_nofail(unsigned long size) static inline void ntfs_free(void *addr) { - if (!is_vmalloc_addr(addr)) { - kfree(addr); - /* free_page((unsigned long)addr); */ - return; - } - vfree(addr); + kvfree(addr); } #endif /* _LINUX_NTFS_MALLOC_H */ diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c index b3973c2fd190..443abecf01b7 100644 --- a/fs/ntfs/namei.c +++ b/fs/ntfs/namei.c @@ -35,7 +35,7 @@ * ntfs_lookup - find the inode represented by a dentry in a directory inode * @dir_ino: directory inode in which to look for the inode * @dent: dentry representing the inode to look for - * @nd: lookup nameidata + * @flags: lookup flags * * In short, ntfs_lookup() looks for the inode represented by the dentry @dent * in the directory inode @dir_ino and if found attaches the inode to the @@ -292,14 +292,14 @@ const struct inode_operations ntfs_dir_inode_ops = { * The code is based on the ext3 ->get_parent() implementation found in * fs/ext3/namei.c::ext3_get_parent(). * - * Note: ntfs_get_parent() is called with @child_dent->d_inode->i_mutex down. + * Note: ntfs_get_parent() is called with @d_inode(child_dent)->i_mutex down. * * Return the dentry of the parent directory on success or the error code on * error (IS_ERR() is true). 
*/ static struct dentry *ntfs_get_parent(struct dentry *child_dent) { - struct inode *vi = child_dent->d_inode; + struct inode *vi = d_inode(child_dent); ntfs_inode *ni = NTFS_I(vi); MFT_RECORD *mrec; ntfs_attr_search_ctx *ctx; diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 044158bd22be..5997c00a1515 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -2925,7 +2925,8 @@ static int __ocfs2_rotate_tree_left(handle_t *handle, struct ocfs2_path *right_path = NULL; struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); - BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0]))); + if (!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0]))) + return 0; *empty_extent_path = NULL; @@ -3370,7 +3371,7 @@ static int ocfs2_merge_rec_right(struct ocfs2_path *left_path, ret = ocfs2_get_right_path(et, left_path, &right_path); if (ret) { mlog_errno(ret); - goto out; + return ret; } right_el = path_leaf_el(right_path); @@ -3453,8 +3454,7 @@ static int ocfs2_merge_rec_right(struct ocfs2_path *left_path, subtree_index); } out: - if (right_path) - ocfs2_free_path(right_path); + ocfs2_free_path(right_path); return ret; } @@ -3536,7 +3536,7 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path, ret = ocfs2_get_left_path(et, right_path, &left_path); if (ret) { mlog_errno(ret); - goto out; + return ret; } left_el = path_leaf_el(left_path); @@ -3647,8 +3647,7 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path, right_path, subtree_index); } out: - if (left_path) - ocfs2_free_path(left_path); + ocfs2_free_path(left_path); return ret; } @@ -4313,13 +4312,13 @@ out: return ret; } -static enum ocfs2_contig_type -ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, +static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, struct ocfs2_path *path, struct ocfs2_extent_list *el, int index, - struct ocfs2_extent_rec *split_rec) + struct ocfs2_extent_rec *split_rec, + struct ocfs2_merge_ctxt *ctxt) { - int status; + int status = 0; enum ocfs2_contig_type ret = CONTIG_NONE; u32 left_cpos, right_cpos; struct ocfs2_extent_rec *rec = NULL; @@ -4334,17 +4333,20 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, } else if (path->p_tree_depth > 0) { status = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos); if (status) - goto out; + goto exit; if (left_cpos != 0) { left_path = ocfs2_new_path_from_path(path); - if (!left_path) - goto out; + if (!left_path) { + status = -ENOMEM; + mlog_errno(status); + goto exit; + } status = ocfs2_find_path(et->et_ci, left_path, left_cpos); if (status) - goto out; + goto free_left_path; new_el = path_leaf_el(left_path); @@ -4361,7 +4363,7 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, le16_to_cpu(new_el->l_next_free_rec), le16_to_cpu(new_el->l_count)); status = -EINVAL; - goto out; + goto free_left_path; } rec = &new_el->l_recs[ le16_to_cpu(new_el->l_next_free_rec) - 1]; @@ -4388,18 +4390,21 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, path->p_tree_depth > 0) { status = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos); if (status) - goto out; + goto free_left_path; if (right_cpos == 0) - goto out; + goto free_left_path; right_path = ocfs2_new_path_from_path(path); - if (!right_path) - goto out; + if (!right_path) { + status = -ENOMEM; + mlog_errno(status); + goto free_left_path; + } status = ocfs2_find_path(et->et_ci, right_path, right_cpos); if (status) - goto out; + goto free_right_path; new_el = path_leaf_el(right_path); rec = &new_el->l_recs[0]; @@ -4413,7 
+4418,7 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, (unsigned long long)le64_to_cpu(eb->h_blkno), le16_to_cpu(new_el->l_next_free_rec)); status = -EINVAL; - goto out; + goto free_right_path; } rec = &new_el->l_recs[1]; } @@ -4430,13 +4435,15 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, ret = contig_type; } -out: - if (left_path) - ocfs2_free_path(left_path); - if (right_path) - ocfs2_free_path(right_path); +free_right_path: + ocfs2_free_path(right_path); +free_left_path: + ocfs2_free_path(left_path); +exit: + if (status == 0) + ctxt->c_contig_type = ret; - return ret; + return status; } static void ocfs2_figure_contig_type(struct ocfs2_extent_tree *et, @@ -5042,9 +5049,14 @@ int ocfs2_split_extent(handle_t *handle, goto out; } - ctxt.c_contig_type = ocfs2_figure_merge_contig_type(et, path, el, - split_index, - split_rec); + ret = ocfs2_figure_merge_contig_type(et, path, el, + split_index, + split_rec, + &ctxt); + if (ret) { + mlog_errno(ret); + goto out; + } /* * The core merge / split code wants to know how much room is @@ -6858,13 +6870,13 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, if (pages == NULL) { ret = -ENOMEM; mlog_errno(ret); - goto out; + return ret; } ret = ocfs2_reserve_clusters(osb, 1, &data_ac); if (ret) { mlog_errno(ret); - goto out; + goto free_pages; } } @@ -6996,9 +7008,8 @@ out_commit: out: if (data_ac) ocfs2_free_alloc_context(data_ac); - if (pages) - kfree(pages); - +free_pages: + kfree(pages); return ret; } diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 44db1808cdb5..0f5fd9db8194 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -29,6 +29,7 @@ #include <linux/mpage.h> #include <linux/quotaops.h> #include <linux/blkdev.h> +#include <linux/uio.h> #include <cluster/masklog.h> @@ -522,7 +523,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; unsigned long len = bh_result->b_size; - unsigned int clusters_to_alloc = 0; + unsigned int clusters_to_alloc = 0, contig_clusters = 0; cpos = ocfs2_blocks_to_clusters(inode->i_sb, iblock); @@ -559,8 +560,10 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, /* fill hole, allocate blocks can't be larger than the size * of the hole */ clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len); - if (clusters_to_alloc > contig_blocks) - clusters_to_alloc = contig_blocks; + contig_clusters = ocfs2_clusters_for_blocks(inode->i_sb, + contig_blocks); + if (clusters_to_alloc > contig_clusters) + clusters_to_alloc = contig_clusters; /* allocate extent and insert them into the extent tree */ ret = ocfs2_extend_allocation(inode, cpos, @@ -618,9 +621,6 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, /* this io's submitter should not have unlocked this before we could */ BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); - if (ocfs2_iocb_is_sem_locked(iocb)) - ocfs2_iocb_clear_sem_locked(iocb); - if (ocfs2_iocb_is_unaligned_aio(iocb)) { ocfs2_iocb_clear_unaligned_aio(iocb); @@ -663,6 +663,117 @@ static int ocfs2_is_overwrite(struct ocfs2_super *osb, return 0; } +static int ocfs2_direct_IO_zero_extend(struct ocfs2_super *osb, + struct inode *inode, loff_t offset, + u64 zero_len, int cluster_align) +{ + u32 p_cpos = 0; + u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode)); + unsigned int num_clusters = 0; + unsigned int ext_flags = 0; + int ret = 0; + + if (offset <= 
i_size_read(inode) || cluster_align) + return 0; + + ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters, + &ext_flags); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) { + u64 s = i_size_read(inode); + sector_t sector = ((u64)p_cpos << (osb->s_clustersize_bits - 9)) + + (do_div(s, osb->s_clustersize) >> 9); + + ret = blkdev_issue_zeroout(osb->sb->s_bdev, sector, + zero_len >> 9, GFP_NOFS, false); + if (ret < 0) + mlog_errno(ret); + } + + return ret; +} + +static int ocfs2_direct_IO_extend_no_holes(struct ocfs2_super *osb, + struct inode *inode, loff_t offset) +{ + u64 zero_start, zero_len, total_zero_len; + u32 p_cpos = 0, clusters_to_add; + u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, i_size_read(inode)); + unsigned int num_clusters = 0; + unsigned int ext_flags = 0; + u32 size_div, offset_div; + int ret = 0; + + { + u64 o = offset; + u64 s = i_size_read(inode); + + offset_div = do_div(o, osb->s_clustersize); + size_div = do_div(s, osb->s_clustersize); + } + + if (offset <= i_size_read(inode)) + return 0; + + clusters_to_add = ocfs2_bytes_to_clusters(inode->i_sb, offset) - + ocfs2_bytes_to_clusters(inode->i_sb, i_size_read(inode)); + total_zero_len = offset - i_size_read(inode); + if (clusters_to_add) + total_zero_len -= offset_div; + + /* Allocate clusters to fill out holes, and this is only needed + * when we add more than one cluster. Otherwise the cluster will + * be allocated during direct IO */ + if (clusters_to_add > 1) { + ret = ocfs2_extend_allocation(inode, + OCFS2_I(inode)->ip_clusters, + clusters_to_add - 1, 0); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + while (total_zero_len) { + ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters, + &ext_flags); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + zero_start = ocfs2_clusters_to_bytes(osb->sb, p_cpos) + + size_div; + zero_len = ocfs2_clusters_to_bytes(osb->sb, num_clusters) - + size_div; + zero_len = min(total_zero_len, zero_len); + + if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN)) { + ret = blkdev_issue_zeroout(osb->sb->s_bdev, + zero_start >> 9, zero_len >> 9, + GFP_NOFS, false); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + total_zero_len -= zero_len; + v_cpos += ocfs2_bytes_to_clusters(osb->sb, zero_len + size_div); + + /* Only in the first iteration can the cluster be unaligned, + * so set size_div to 0 for the rest */ + size_div = 0; + } + +out: + return ret; +} +
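The zeroing arithmetic above leans entirely on do_div(), which divides a u64 in place and hands back the remainder; the remainders against the cluster size tell the code how far the old EOF and the write offset sit into their respective clusters. A minimal userspace rendering of that computation, assuming an invented do_div_u64() helper and made-up sizes (this is an illustration, not ocfs2 code):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Userspace stand-in for the kernel's do_div(): divide in place,
 * return the remainder. */
static uint64_t do_div_u64(uint64_t *n, uint32_t base)
{
	uint64_t rem = *n % base;

	*n /= base;
	return rem;
}

int main(void)
{
	uint32_t clustersize = 32768;	/* 32 KiB clusters */
	uint64_t i_size = 100000;	/* current end of file */
	uint64_t offset = 200000;	/* append write starts here */
	uint64_t o = offset, s = i_size;

	uint64_t offset_div = do_div_u64(&o, clustersize);
	uint64_t size_div = do_div_u64(&s, clustersize);
	/* Bytes between EOF and the write start that must read back as zero. */
	uint64_t total_zero_len = offset - i_size;

	printf("offset in cluster: %" PRIu64 ", EOF in cluster: %" PRIu64
	       ", bytes to zero: %" PRIu64 "\n",
	       offset_div, size_div, total_zero_len);
	return 0;
}

static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) @@ -677,8 +788,8 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, struct buffer_head *di_bh = NULL; size_t count = iter->count; journal_t *journal = osb->journal->j_journal; - u32 zero_len; - int cluster_align; + u64 zero_len_head, zero_len_tail; + int cluster_align_head, cluster_align_tail; loff_t final_size = offset + count; int append_write = offset >= i_size_read(inode) ?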
1 : 0; unsigned int num_clusters = 0; @@ -686,9 +797,16 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, { u64 o = offset; + u64 s = i_size_read(inode); - zero_len = do_div(o, 1 << osb->s_clustersize_bits); - cluster_align = !zero_len; + zero_len_head = do_div(o, 1 << osb->s_clustersize_bits); + cluster_align_head = !zero_len_head; + + zero_len_tail = osb->s_clustersize - + do_div(s, osb->s_clustersize); + if ((offset - i_size_read(inode)) < zero_len_tail) + zero_len_tail = offset - i_size_read(inode); + cluster_align_tail = !zero_len_tail; } /* @@ -706,21 +824,23 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, } if (append_write) { - ret = ocfs2_inode_lock(inode, &di_bh, 1); + ret = ocfs2_inode_lock(inode, NULL, 1); if (ret < 0) { mlog_errno(ret); goto clean_orphan; } + /* zero out the tail of the previously allocated cluster + * that was not zeroed */ if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) - ret = ocfs2_zero_extend(inode, di_bh, offset); + ret = ocfs2_direct_IO_zero_extend(osb, inode, offset, + zero_len_tail, cluster_align_tail); else - ret = ocfs2_extend_no_holes(inode, di_bh, offset, + ret = ocfs2_direct_IO_extend_no_holes(osb, inode, offset); if (ret < 0) { mlog_errno(ret); ocfs2_inode_unlock(inode, 1); - brelse(di_bh); goto clean_orphan; } @@ -728,19 +848,15 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, if (is_overwrite < 0) { mlog_errno(is_overwrite); ocfs2_inode_unlock(inode, 1); - brelse(di_bh); goto clean_orphan; } ocfs2_inode_unlock(inode, 1); - brelse(di_bh); - di_bh = NULL; } - written = __blockdev_direct_IO(WRITE, iocb, inode, inode->i_sb->s_bdev, - iter, offset, - ocfs2_direct_IO_get_blocks, - ocfs2_dio_end_io, NULL, 0); + written = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, + offset, ocfs2_direct_IO_get_blocks, + ocfs2_dio_end_io, NULL, 0); if (unlikely(written < 0)) { loff_t i_size = i_size_read(inode); @@ -771,25 +887,35 @@ static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb, if (ret < 0) mlog_errno(ret); } - } else if (written < 0 && append_write && !is_overwrite && - !cluster_align) { + } else if (written > 0 && append_write && !is_overwrite && + !cluster_align_head) { + /* zero out the allocated cluster head */ u32 p_cpos = 0; u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset); + ret = ocfs2_inode_lock(inode, NULL, 0); + if (ret < 0) { + mlog_errno(ret); + goto clean_orphan; + } + ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos, &num_clusters, &ext_flags); if (ret < 0) { + mlog_errno(ret); + ocfs2_inode_unlock(inode, 0); goto clean_orphan; } BUG_ON(!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN)); ret = blkdev_issue_zeroout(osb->sb->s_bdev, - p_cpos << (osb->s_clustersize_bits - 9), - zero_len >> 9, GFP_KERNEL, false); + (u64)p_cpos << (osb->s_clustersize_bits - 9), + zero_len_head >> 9, GFP_NOFS, false); + if (ret < 0) + mlog_errno(ret); + + ocfs2_inode_unlock(inode, 0); } clean_orphan:
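One detail in that last hunk is easy to miss: the new (u64) cast on p_cpos. Without it the sector computation is performed in 32-bit arithmetic and silently wraps once the physical cluster number is large enough. A self-contained demonstration of the difference, with a made-up cluster number and cluster size:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Physical cluster 2^27 on a volume with 32 KiB clusters:
	 * the sector shift is s_clustersize_bits - 9 = 15 - 9 = 6. */
	uint32_t p_cpos = 1u << 27;
	unsigned int shift = 15 - 9;

	uint64_t wrapped = p_cpos << shift;		/* 32-bit shift, wraps to 0 */
	uint64_t correct = (uint64_t)p_cpos << shift;	/* widened before shifting */

	printf("without cast: %" PRIu64 ", with cast: %" PRIu64 "\n",
	       wrapped, correct);
	return 0;
}

@@ -798,13 +924,23 @@ clean_orphan: int update_isize = written > 0 ? 1 : 0; loff_t end = update_isize ?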
offset + written : 0; - tmp_ret = ocfs2_del_inode_from_orphan(osb, inode, + tmp_ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (tmp_ret < 0) { + ret = tmp_ret; + mlog_errno(ret); + goto out; + } + + tmp_ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, update_isize, end); if (tmp_ret < 0) { ret = tmp_ret; + mlog_errno(ret); goto out; } + ocfs2_inode_unlock(inode, 1); + tmp_ret = jbd2_journal_force_commit(journal); if (tmp_ret < 0) { ret = tmp_ret; @@ -818,9 +954,7 @@ out: return ret; } -static ssize_t ocfs2_direct_IO(int rw, - struct kiocb *iocb, - struct iov_iter *iter, +static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; @@ -842,12 +976,11 @@ static ssize_t ocfs2_direct_IO(int rw, if (i_size_read(inode) <= offset && !full_coherency) return 0; - if (rw == READ) - return __blockdev_direct_IO(rw, iocb, inode, - inode->i_sb->s_bdev, - iter, offset, - ocfs2_direct_IO_get_blocks, - ocfs2_dio_end_io, NULL, 0); + if (iov_iter_rw(iter) == READ) + return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, + iter, offset, + ocfs2_direct_IO_get_blocks, + ocfs2_dio_end_io, NULL, 0); else return ocfs2_direct_IO_write(iocb, iter, offset); } diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index 6cae155d54df..24e496d6bdcd 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -22,7 +22,7 @@ #ifndef OCFS2_AOPS_H #define OCFS2_AOPS_H -#include <linux/aio.h> +#include <linux/fs.h> handle_t *ocfs2_start_walk_page_trans(struct inode *inode, struct page *page, @@ -79,7 +79,6 @@ static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level) enum ocfs2_iocb_lock_bits { OCFS2_IOCB_RW_LOCK = 0, OCFS2_IOCB_RW_LOCK_LEVEL, - OCFS2_IOCB_SEM, OCFS2_IOCB_UNALIGNED_IO, OCFS2_IOCB_NUM_LOCKS }; @@ -88,12 +87,6 @@ enum ocfs2_iocb_lock_bits { clear_bit(OCFS2_IOCB_RW_LOCK, (unsigned long *)&iocb->private) #define ocfs2_iocb_rw_locked_level(iocb) \ test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private) -#define ocfs2_iocb_set_sem_locked(iocb) \ - set_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) -#define ocfs2_iocb_clear_sem_locked(iocb) \ - clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) -#define ocfs2_iocb_is_sem_locked(iocb) \ - test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) #define ocfs2_iocb_set_unaligned_aio(iocb) \ set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private) diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c index af7598bff1b5..dfe162f5fd4c 100644 --- a/fs/ocfs2/cluster/masklog.c +++ b/fs/ocfs2/cluster/masklog.c @@ -64,6 +64,40 @@ static ssize_t mlog_mask_store(u64 mask, const char *buf, size_t count) return count; } +void __mlog_printk(const u64 *mask, const char *func, int line, + const char *fmt, ...) 
+{ + struct va_format vaf; + va_list args; + const char *level; + const char *prefix = ""; + + if (!__mlog_test_u64(*mask, mlog_and_bits) || + __mlog_test_u64(*mask, mlog_not_bits)) + return; + + if (*mask & ML_ERROR) { + level = KERN_ERR; + prefix = "ERROR: "; + } else if (*mask & ML_NOTICE) { + level = KERN_NOTICE; + } else { + level = KERN_INFO; + } + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + printk("%s(%s,%u,%u):%s:%d %s%pV", + level, current->comm, task_pid_nr(current), + raw_smp_processor_id(), func, line, prefix, &vaf); + + va_end(args); +} +EXPORT_SYMBOL_GPL(__mlog_printk); + struct mlog_attribute { struct attribute attr; u64 mask; diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index 2260fb9e6508..308ea0eb35fd 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -162,47 +162,30 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits; #endif -/* - * smp_processor_id() "helpfully" screams when called outside preemptible - * regions in current kernels. sles doesn't have the variants that don't - * scream. just do this instead of trying to guess which we're building - * against.. *sigh*. - */ -#define __mlog_cpu_guess ({ \ - unsigned long _cpu = get_cpu(); \ - put_cpu(); \ - _cpu; \ -}) +__printf(4, 5) +void __mlog_printk(const u64 *m, const char *func, int line, + const char *fmt, ...); -/* In the following two macros, the whitespace after the ',' just - * before ##args is intentional. Otherwise, gcc 2.95 will eat the - * previous token if args expands to nothing. +/* + * Testing before the __mlog_printk call lets the compiler eliminate the + * call completely when (m & ML_ALLOWED_BITS) is 0. */ -#define __mlog_printk(level, fmt, args...) \ - printk(level "(%s,%u,%lu):%s:%d " fmt, current->comm, \ - task_pid_nr(current), __mlog_cpu_guess, \ - __PRETTY_FUNCTION__, __LINE__ , ##args) - -#define mlog(mask, fmt, args...) do { \ - u64 __m = MLOG_MASK_PREFIX | (mask); \ - if ((__m & ML_ALLOWED_BITS) && \ - __mlog_test_u64(__m, mlog_and_bits) && \ - !__mlog_test_u64(__m, mlog_not_bits)) { \ - if (__m & ML_ERROR) \ - __mlog_printk(KERN_ERR, "ERROR: "fmt , ##args); \ - else if (__m & ML_NOTICE) \ - __mlog_printk(KERN_NOTICE, fmt , ##args); \ - else __mlog_printk(KERN_INFO, fmt , ##args); \ - } \ +#define mlog(mask, fmt, ...) \ +do { \ + u64 _m = MLOG_MASK_PREFIX | (mask); \ + if (_m & ML_ALLOWED_BITS) \ + __mlog_printk(&_m, __func__, __LINE__, fmt, \ + ##__VA_ARGS__); \ } while (0) -#define mlog_errno(st) do { \ +#define mlog_errno(st) ({ \ int _st = (st); \ if (_st != -ERESTARTSYS && _st != -EINTR && \ _st != AOP_TRUNCATED_PAGE && _st != -ENOSPC && \ _st != -EDQUOT) \ mlog(ML_ERROR, "status = %lld\n", (long long)_st); \ -} while (0) + _st; \ +}) #define mlog_bug_on_msg(cond, fmt, args...) 
do { \ if (cond) { \ diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 56c403a563bc..2d0acd6678fe 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -2204,7 +2204,7 @@ out: kfree(o2net_hand); kfree(o2net_keep_req); kfree(o2net_keep_resp); - + o2net_debugfs_exit(); o2quo_exit(); return -ENOMEM; } diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index 4fda7a5f3088..290373024d9d 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -42,8 +42,8 @@ void ocfs2_dentry_attach_gen(struct dentry *dentry) { unsigned long gen = - OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen; - BUG_ON(dentry->d_inode); + OCFS2_I(d_inode(dentry->d_parent))->ip_dir_lock_gen; + BUG_ON(d_inode(dentry)); dentry->d_fsdata = (void *)gen; } @@ -57,7 +57,7 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry, unsigned int flags) if (flags & LOOKUP_RCU) return -ECHILD; - inode = dentry->d_inode; + inode = d_inode(dentry); osb = OCFS2_SB(dentry->d_sb); trace_ocfs2_dentry_revalidate(dentry, dentry->d_name.len, @@ -71,7 +71,7 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry, unsigned int flags) unsigned long gen = (unsigned long) dentry->d_fsdata; unsigned long pgen; spin_lock(&dentry->d_lock); - pgen = OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen; + pgen = OCFS2_I(d_inode(dentry->d_parent))->ip_dir_lock_gen; spin_unlock(&dentry->d_lock); trace_ocfs2_dentry_revalidate_negative(dentry->d_name.len, dentry->d_name.name, @@ -146,7 +146,7 @@ static int ocfs2_match_dentry(struct dentry *dentry, if (skip_unhashed && d_unhashed(dentry)) return 0; - parent = dentry->d_parent->d_inode; + parent = d_inode(dentry->d_parent); /* Negative parent dentry? */ if (!parent) return 0; @@ -243,7 +243,7 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry, if (!inode) return 0; - if (!dentry->d_inode && dentry->d_fsdata) { + if (d_really_is_negative(dentry) && dentry->d_fsdata) { /* Converting a negative dentry to positive Clear dentry->d_fsdata */ dentry->d_fsdata = dl = NULL; @@ -446,7 +446,7 @@ void ocfs2_dentry_move(struct dentry *dentry, struct dentry *target, { int ret; struct ocfs2_super *osb = OCFS2_SB(old_dir->i_sb); - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); /* * Move within the same directory, so the actual lock info won't diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index b08050bd3f2e..02878a83f0b4 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -18,7 +18,7 @@ * * linux/fs/minix/dir.c * - * Copyright (C) 1991, 1992 Linux Torvalds + * Copyright (C) 1991, 1992 Linus Torvalds * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public @@ -1617,7 +1617,7 @@ int __ocfs2_add_entry(handle_t *handle, struct ocfs2_dir_entry *de, *de1; struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_fe_bh->b_data; struct super_block *sb = dir->i_sb; - int retval, status; + int retval; unsigned int size = sb->s_blocksize; struct buffer_head *insert_bh = lookup->dl_leaf_bh; char *data_start = insert_bh->b_data; @@ -1695,25 +1695,25 @@ int __ocfs2_add_entry(handle_t *handle, } if (insert_bh == parent_fe_bh) - status = ocfs2_journal_access_di(handle, + retval = ocfs2_journal_access_di(handle, INODE_CACHE(dir), insert_bh, OCFS2_JOURNAL_ACCESS_WRITE); else { - status = ocfs2_journal_access_db(handle, + retval = ocfs2_journal_access_db(handle, INODE_CACHE(dir), insert_bh, OCFS2_JOURNAL_ACCESS_WRITE); - if (ocfs2_dir_indexed(dir)) { - status = ocfs2_dx_dir_insert(dir, + if (!retval && 
ocfs2_dir_indexed(dir)) + retval = ocfs2_dx_dir_insert(dir, handle, lookup); - if (status) { - mlog_errno(status); - goto bail; - } - } + } + + if (retval) { + mlog_errno(retval); + goto bail; } /* By now the buffer is marked for journaling */ @@ -2047,22 +2047,19 @@ int ocfs2_check_dir_for_entry(struct inode *dir, const char *name, int namelen) { - int ret; + int ret = 0; struct ocfs2_dir_lookup_result lookup = { NULL, }; trace_ocfs2_check_dir_for_entry( (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name); - ret = -EEXIST; - if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0) - goto bail; + if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0) { + ret = -EEXIST; + mlog_errno(ret); + } - ret = 0; -bail: ocfs2_free_dir_lookup_result(&lookup); - if (ret) - mlog_errno(ret); return ret; } @@ -3546,13 +3543,10 @@ static void dx_leaf_sort_swap(void *a, void *b, int size) { struct ocfs2_dx_entry *entry1 = a; struct ocfs2_dx_entry *entry2 = b; - struct ocfs2_dx_entry tmp; BUG_ON(size != sizeof(*entry1)); - tmp = *entry1; - *entry1 = *entry2; - *entry2 = tmp; + swap(*entry1, *entry2); } static int ocfs2_dx_leaf_same_major(struct ocfs2_dx_leaf *dx_leaf) diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h index f0344b75b14d..3d8639f38973 100644 --- a/fs/ocfs2/dir.h +++ b/fs/ocfs2/dir.h @@ -72,7 +72,7 @@ static inline int ocfs2_add_entry(handle_t *handle, struct buffer_head *parent_fe_bh, struct ocfs2_dir_lookup_result *lookup) { - return __ocfs2_add_entry(handle, dentry->d_parent->d_inode, + return __ocfs2_add_entry(handle, d_inode(dentry->d_parent), dentry->d_name.name, dentry->d_name.len, inode, blkno, parent_fe_bh, lookup); } diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index fae17c640df3..e88ccf8c83ff 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -1014,7 +1014,6 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm, /* will exit holding res->spinlock, but may drop in function */ void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags); -void __dlm_wait_on_lockres_flags_set(struct dlm_lock_resource *res, int flags); /* will exit holding res->spinlock, but may drop in function */ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res) diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index a6944b25fd5b..fdf4b41d0609 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -757,6 +757,19 @@ lookup: if (tmpres) { spin_unlock(&dlm->spinlock); spin_lock(&tmpres->spinlock); + + /* + * Right after dlm spinlock was released, dlm_thread could have + * purged the lockres. Check if lockres got unhashed. If so + * start over. 
+ */ + if (hlist_unhashed(&tmpres->hash_node)) { + spin_unlock(&tmpres->spinlock); + dlm_lockres_put(tmpres); + tmpres = NULL; + goto lookup; + } + /* Wait on the thread that is mastering the resource */ if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { __dlm_wait_on_lockres(tmpres); diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 061ba6a91bf2..b5cf27dcb18a 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -208,7 +208,7 @@ static int dlmfs_file_release(struct inode *inode, static int dlmfs_file_setattr(struct dentry *dentry, struct iattr *attr) { int error; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); attr->ia_valid &= ~ATTR_SIZE; error = inode_change_ok(inode, attr); @@ -549,7 +549,7 @@ static int dlmfs_unlink(struct inode *dir, struct dentry *dentry) { int status; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); mlog(0, "unlink inode %lu\n", inode->i_ino); diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 11849a44dc5a..23157e40dd74 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -1391,6 +1391,11 @@ static int __ocfs2_cluster_lock(struct ocfs2_super *osb, int noqueue_attempted = 0; int dlm_locked = 0; + if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED)) { + mlog_errno(-EINVAL); + return -EINVAL; + } + ocfs2_init_mask_waiter(&mw); if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) @@ -4020,9 +4025,13 @@ static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb) osb->dc_work_sequence = osb->dc_wake_sequence; processed = osb->blocked_lock_count; - while (processed) { - BUG_ON(list_empty(&osb->blocked_lock_list)); - + /* + * blocked lock processing in this loop might call iput which can + * remove items off osb->blocked_lock_list. Downconvert up to + * 'processed' number of locks, but stop short if we had some + * removed in ocfs2_mark_lockres_freeing when downconverting. 
+ */ + while (processed && !list_empty(&osb->blocked_lock_list)) { lockres = list_entry(osb->blocked_lock_list.next, struct ocfs2_lock_res, l_blocked_list); list_del_init(&lockres->l_blocked_list); diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c index 29651167190d..827fc9809bc2 100644 --- a/fs/ocfs2/export.c +++ b/fs/ocfs2/export.c @@ -82,7 +82,6 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb, } status = ocfs2_test_inode_bit(osb, blkno, &set); - trace_ocfs2_get_dentry_test_bit(status, set); if (status < 0) { if (status == -EINVAL) { /* @@ -96,6 +95,7 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb, goto unlock_nfs_sync; } + trace_ocfs2_get_dentry_test_bit(status, set); /* If the inode allocator bit is clear, this inode must be stale */ if (!set) { status = -ESTALE; @@ -147,7 +147,7 @@ static struct dentry *ocfs2_get_parent(struct dentry *child) int status; u64 blkno; struct dentry *parent; - struct inode *dir = child->d_inode; + struct inode *dir = d_inode(child); trace_ocfs2_get_parent(child, child->d_name.len, child->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 46e0d4e857c7..719f7f4c7a37 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -37,6 +37,7 @@ #include <linux/falloc.h> #include <linux/quotaops.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <cluster/masklog.h> @@ -1126,7 +1127,7 @@ out: int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) { int status = 0, size_change; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct super_block *sb = inode->i_sb; struct ocfs2_super *osb = OCFS2_SB(sb); struct buffer_head *bh = NULL; @@ -1275,8 +1276,8 @@ int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - struct inode *inode = dentry->d_inode; - struct super_block *sb = dentry->d_inode->i_sb; + struct inode *inode = d_inode(dentry); + struct super_block *sb = d_inode(dentry)->i_sb; struct ocfs2_super *osb = sb->s_fs_info; int err; @@ -2106,7 +2107,7 @@ out: } static int ocfs2_prepare_inode_for_write(struct file *file, - loff_t *ppos, + loff_t pos, size_t count, int appending, int *direct_io, @@ -2114,8 +2115,8 @@ static int ocfs2_prepare_inode_for_write(struct file *file, { int ret = 0, meta_level = 0; struct dentry *dentry = file->f_path.dentry; - struct inode *inode = dentry->d_inode; - loff_t saved_pos = 0, end; + struct inode *inode = d_inode(dentry); + loff_t end; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); int full_coherency = !(osb->s_mount_opt & OCFS2_MOUNT_COHERENCY_BUFFERED); @@ -2155,23 +2156,16 @@ static int ocfs2_prepare_inode_for_write(struct file *file, } } - /* work on a copy of ppos until we're sure that we won't have - * to recalculate it due to relocking. */ - if (appending) - saved_pos = i_size_read(inode); - else - saved_pos = *ppos; - - end = saved_pos + count; + end = pos + count; - ret = ocfs2_check_range_for_refcount(inode, saved_pos, count); + ret = ocfs2_check_range_for_refcount(inode, pos, count); if (ret == 1) { ocfs2_inode_unlock(inode, meta_level); meta_level = -1; ret = ocfs2_prepare_inode_for_refcount(inode, file, - saved_pos, + pos, count, &meta_level); if (has_refcount) @@ -2227,7 +2221,7 @@ static int ocfs2_prepare_inode_for_write(struct file *file, * caller will have to retake some cluster * locks and initiate the io as buffered. 
*/ - ret = ocfs2_check_range_for_holes(inode, saved_pos, count); + ret = ocfs2_check_range_for_holes(inode, pos, count); if (ret == 1) { /* * Fallback to old way if the feature bit is not set. @@ -2242,12 +2236,9 @@ static int ocfs2_prepare_inode_for_write(struct file *file, break; } - if (appending) - *ppos = saved_pos; - out_unlock: trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno, - saved_pos, appending, count, + pos, appending, count, direct_io, has_refcount); if (meta_level >= 0) @@ -2260,19 +2251,20 @@ out: static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { - int ret, direct_io, appending, rw_level, have_alloc_sem = 0; + int direct_io, appending, rw_level; int can_do_direct, has_refcount = 0; ssize_t written = 0; - size_t count = iov_iter_count(from); - loff_t old_size, *ppos = &iocb->ki_pos; + ssize_t ret; + size_t count = iov_iter_count(from), orig_count; + loff_t old_size; u32 old_clusters; struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); - struct address_space *mapping = file->f_mapping; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); int full_coherency = !(osb->s_mount_opt & OCFS2_MOUNT_COHERENCY_BUFFERED); int unaligned_dio = 0; + int dropped_dio = 0; trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry, (unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -2280,24 +2272,15 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, file->f_path.dentry->d_name.name, (unsigned int)from->nr_segs); /* GRRRRR */ - if (iocb->ki_nbytes == 0) + if (count == 0) return 0; - appending = file->f_flags & O_APPEND ? 1 : 0; - direct_io = file->f_flags & O_DIRECT ? 1 : 0; + appending = iocb->ki_flags & IOCB_APPEND ? 1 : 0; + direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0; mutex_lock(&inode->i_mutex); - ocfs2_iocb_clear_sem_locked(iocb); - relock: - /* to match setattr's i_mutex -> rw_lock ordering */ - if (direct_io) { - have_alloc_sem = 1; - /* communicate with ocfs2_dio_end_io */ - ocfs2_iocb_set_sem_locked(iocb); - } - /* * Concurrent O_DIRECT writes are allowed with * mount_option "coherency=buffered". 
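The next hunks implement the fallback path behind the relock: label: when the cluster locks reveal that direct I/O cannot proceed (holes in the range, refcounted extents), the write drops its locks, clears IOCB_DIRECT, and retries as a buffered write. A minimal userspace sketch of that control flow, with every name (wctx, rw_lock, can_do_direct) invented for illustration rather than taken from ocfs2:

#include <stdbool.h>
#include <stdio.h>

struct wctx {
	bool direct;	/* caller asked for direct I/O */
	int rw_level;	/* -1 = unlocked, 0 = shared, 1 = exclusive */
};

static void rw_lock(struct wctx *c, int level)
{
	c->rw_level = level;
}

static void rw_unlock(struct wctx *c)
{
	c->rw_level = -1;
}

/* Pretend the target range has a hole, so direct I/O can never proceed. */
static bool can_do_direct(const struct wctx *c)
{
	(void)c;
	return false;
}

static int do_write(struct wctx *c)
{
relock:
	/* Direct writes may share the lock; buffered writes take it exclusive. */
	rw_lock(c, c->direct ? 0 : 1);

	if (c->direct && !can_do_direct(c)) {
		/* Undo the locking, downgrade to buffered I/O and retry. */
		rw_unlock(c);
		c->direct = false;
		goto relock;
	}

	printf("performing %s write at rw_level %d\n",
	       c->direct ? "direct" : "buffered", c->rw_level);
	rw_unlock(c);
	return 0;
}

int main(void)
{
	struct wctx c = { .direct = true, .rw_level = -1 };

	return do_write(&c);
}

The real function additionally calls iov_iter_reexpand() to undo the truncation done by generic_write_checks() and records the downgrade in dropped_dio, so the written range is still flushed to disk afterwards as O_DIRECT semantics require, as the hunks below show.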
@@ -2307,7 +2290,7 @@ relock: ret = ocfs2_rw_lock(inode, rw_level); if (ret < 0) { mlog_errno(ret); - goto out_sems; + goto out_mutex; } /* @@ -2329,9 +2312,17 @@ relock: ocfs2_inode_unlock(inode, 1); } + orig_count = iov_iter_count(from); + ret = generic_write_checks(iocb, from); + if (ret <= 0) { + if (ret) + mlog_errno(ret); + goto out; + } + count = ret; + can_do_direct = direct_io; - ret = ocfs2_prepare_inode_for_write(file, ppos, - iocb->ki_nbytes, appending, + ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count, appending, &can_do_direct, &has_refcount); if (ret < 0) { mlog_errno(ret); @@ -2339,8 +2330,7 @@ relock: } if (direct_io && !is_sync_kiocb(iocb)) - unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_nbytes, - *ppos); + unaligned_dio = ocfs2_is_io_unaligned(inode, count, iocb->ki_pos); /* * We can't complete the direct I/O as requested, fall back to @@ -2349,10 +2339,12 @@ relock: if (direct_io && !can_do_direct) { ocfs2_rw_unlock(inode, rw_level); - have_alloc_sem = 0; rw_level = -1; direct_io = 0; + iocb->ki_flags &= ~IOCB_DIRECT; + iov_iter_reexpand(from, orig_count); + dropped_dio = 1; goto relock; } @@ -2376,74 +2368,18 @@ relock: /* communicate with ocfs2_dio_end_io */ ocfs2_iocb_set_rw_locked(iocb, rw_level); - ret = generic_write_checks(file, ppos, &count, - S_ISBLK(inode->i_mode)); - if (ret) - goto out_dio; - - iov_iter_truncate(from, count); - if (direct_io) { - loff_t endbyte; - ssize_t written_buffered; - written = generic_file_direct_write(iocb, from, *ppos); - if (written < 0 || written == count) { - ret = written; - goto out_dio; - } - - /* - * for completing the rest of the request. - */ - *ppos += written; - count -= written; - written_buffered = generic_perform_write(file, from, *ppos); - /* - * If generic_file_buffered_write() returned a synchronous error - * then we want to return the number of bytes which were - * direct-written, or the error code if that was zero. Note - * that this differs from normal direct-io semantics, which - * will return -EFOO even if some bytes were written. - */ - if (written_buffered < 0) { - ret = written_buffered; - goto out_dio; - } - - iocb->ki_pos = *ppos + written_buffered; - /* We need to ensure that the page cache pages are written to - * disk and invalidated to preserve the expected O_DIRECT - * semantics. 
- */ - endbyte = *ppos + written_buffered - 1; - ret = filemap_write_and_wait_range(file->f_mapping, *ppos, - endbyte); - if (ret == 0) { - written += written_buffered; - invalidate_mapping_pages(mapping, - *ppos >> PAGE_CACHE_SHIFT, - endbyte >> PAGE_CACHE_SHIFT); - } else { - /* - * We don't know how much we wrote, so just return - * the number of bytes which were direct-written - */ - } - } else { - current->backing_dev_info = inode_to_bdi(inode); - written = generic_perform_write(file, from, *ppos); - if (likely(written >= 0)) - iocb->ki_pos = *ppos + written; - current->backing_dev_info = NULL; - } - -out_dio: + written = __generic_file_write_iter(iocb, from); /* buffered aio wouldn't have proper lock coverage today */ - BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); + BUG_ON(written == -EIOCBQUEUED && !(iocb->ki_flags & IOCB_DIRECT)); + + if (unlikely(written <= 0)) + goto no_sync; - if (((file->f_flags & O_DSYNC) && !direct_io) || IS_SYNC(inode) || - ((file->f_flags & O_DIRECT) && !direct_io)) { - ret = filemap_fdatawrite_range(file->f_mapping, *ppos, - *ppos + count - 1); + if (((file->f_flags & O_DSYNC) && !direct_io) || + IS_SYNC(inode) || dropped_dio) { + ret = filemap_fdatawrite_range(file->f_mapping, + iocb->ki_pos - written, + iocb->ki_pos - 1); if (ret < 0) written = ret; @@ -2454,10 +2390,12 @@ out_dio: } if (!ret) - ret = filemap_fdatawait_range(file->f_mapping, *ppos, - *ppos + count - 1); + ret = filemap_fdatawait_range(file->f_mapping, + iocb->ki_pos - written, + iocb->ki_pos - 1); } +no_sync: /* * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io * function pointer which is called when o_direct io completes so that @@ -2469,7 +2407,6 @@ out_dio: */ if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { rw_level = -1; - have_alloc_sem = 0; unaligned_dio = 0; } @@ -2482,10 +2419,7 @@ out: if (rw_level != -1) ocfs2_rw_unlock(inode, rw_level); -out_sems: - if (have_alloc_sem) - ocfs2_iocb_clear_sem_locked(iocb); - +out_mutex: mutex_unlock(&inode->i_mutex); if (written) @@ -2526,7 +2460,7 @@ bail: static ssize_t ocfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { - int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0; + int ret = 0, rw_level = -1, lock_level = 0; struct file *filp = iocb->ki_filp; struct inode *inode = file_inode(filp); @@ -2543,16 +2477,11 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb, goto bail; } - ocfs2_iocb_clear_sem_locked(iocb); - /* * buffered reads protect themselves in ->readpage(). O_DIRECT reads * need locks to protect pending reads from racing with truncate. 
*/ - if (filp->f_flags & O_DIRECT) { - have_alloc_sem = 1; - ocfs2_iocb_set_sem_locked(iocb); - + if (iocb->ki_flags & IOCB_DIRECT) { ret = ocfs2_rw_lock(inode, 0); if (ret < 0) { mlog_errno(ret); @@ -2583,18 +2512,14 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb, trace_generic_file_aio_read_ret(ret); /* buffered aio wouldn't have proper lock coverage today */ - BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); + BUG_ON(ret == -EIOCBQUEUED && !(iocb->ki_flags & IOCB_DIRECT)); /* see ocfs2_file_write_iter */ if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { rw_level = -1; - have_alloc_sem = 0; } bail: - if (have_alloc_sem) - ocfs2_iocb_clear_sem_locked(iocb); - if (rw_level != -1) ocfs2_rw_unlock(inode, rw_level); @@ -2678,8 +2603,6 @@ const struct inode_operations ocfs2_special_file_iops = { */ const struct file_operations ocfs2_fops = { .llseek = ocfs2_file_llseek, - .read = new_sync_read, - .write = new_sync_write, .mmap = ocfs2_mmap, .fsync = ocfs2_sync_file, .release = ocfs2_file_release, @@ -2726,8 +2649,6 @@ const struct file_operations ocfs2_dops = { */ const struct file_operations ocfs2_fops_no_plocks = { .llseek = ocfs2_file_llseek, - .read = new_sync_read, - .write = new_sync_write, .mmap = ocfs2_mmap, .fsync = ocfs2_sync_file, .release = ocfs2_file_release, diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 3025c0da6b8a..b254416dc8d9 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -624,7 +624,7 @@ static int ocfs2_remove_inode(struct inode *inode, ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE, le16_to_cpu(di->i_suballoc_slot)); if (!inode_alloc_inode) { - status = -EEXIST; + status = -ENOENT; mlog_errno(status); goto bail; } @@ -742,7 +742,7 @@ static int ocfs2_wipe_inode(struct inode *inode, ORPHAN_DIR_SYSTEM_INODE, orphaned_slot); if (!orphan_dir_inode) { - status = -EEXIST; + status = -ENOENT; mlog_errno(status); goto bail; } @@ -1209,7 +1209,7 @@ int ocfs2_drop_inode(struct inode *inode) */ int ocfs2_inode_revalidate(struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int status = 0; trace_ocfs2_inode_revalidate(inode, diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 53e6c40ed4c6..3cb097ccce60 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -980,7 +980,6 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) case OCFS2_IOC_GROUP_EXTEND: case OCFS2_IOC_GROUP_ADD: case OCFS2_IOC_GROUP_ADD64: - case FITRIM: break; case OCFS2_IOC_REFLINK: if (copy_from_user(&args, argp, sizeof(args))) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index ff531928269e..7c099f7032fd 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -108,7 +108,7 @@ struct ocfs2_replay_map { unsigned char rm_replay_slots[0]; }; -void ocfs2_replay_map_set_state(struct ocfs2_super *osb, int state) +static void ocfs2_replay_map_set_state(struct ocfs2_super *osb, int state) { if (!osb->replay_map) return; @@ -153,7 +153,7 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb) return 0; } -void ocfs2_queue_replay_slots(struct ocfs2_super *osb, +static void ocfs2_queue_replay_slots(struct ocfs2_super *osb, enum ocfs2_orphan_reco_type orphan_reco_type) { struct ocfs2_replay_map *replay_map = osb->replay_map; @@ -173,7 +173,7 @@ void ocfs2_queue_replay_slots(struct ocfs2_super *osb, replay_map->rm_state = REPLAY_DONE; } -void ocfs2_free_replay_slots(struct ocfs2_super *osb) +static void ocfs2_free_replay_slots(struct ocfs2_super *osb) { struct 
ocfs2_replay_map *replay_map = osb->replay_map; @@ -571,9 +571,7 @@ static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers, (unsigned long)bh, (unsigned long long)bh->b_blocknr); - /* We aren't guaranteed to have the superblock here - but if we - * don't, it'll just crash. */ - ocfs2_error(bh->b_assoc_map->host->i_sb, + ocfs2_error(bh->b_bdev->bd_super, "JBD2 has aborted our journal, ocfs2 cannot continue\n"); } @@ -775,7 +773,20 @@ void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh) trace_ocfs2_journal_dirty((unsigned long long)bh->b_blocknr); status = jbd2_journal_dirty_metadata(handle, bh); - BUG_ON(status); + if (status) { + mlog_errno(status); + if (!is_handle_aborted(handle)) { + journal_t *journal = handle->h_transaction->t_journal; + struct super_block *sb = bh->b_bdev->bd_super; + + mlog(ML_ERROR, "jbd2_journal_dirty_metadata failed. " + "Aborting transaction and journal.\n"); + handle->h_err = status; + jbd2_journal_abort_handle(handle); + jbd2_journal_abort(journal, status); + ocfs2_abort(sb, "Journal already aborted.\n"); + } + } } #define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE) @@ -1884,7 +1895,7 @@ static inline unsigned long ocfs2_orphan_scan_timeout(void) * hasn't happened. The node queues a scan and increments the * sequence number in the LVB. */ -void ocfs2_queue_orphan_scan(struct ocfs2_super *osb) +static void ocfs2_queue_orphan_scan(struct ocfs2_super *osb) { struct ocfs2_orphan_scan *os; int status, i; @@ -1933,7 +1944,7 @@ out: } /* Worker task that gets fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT millsec */ -void ocfs2_orphan_scan_work(struct work_struct *work) +static void ocfs2_orphan_scan_work(struct work_struct *work) { struct ocfs2_orphan_scan *os; struct ocfs2_super *osb; @@ -2137,6 +2148,8 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, struct inode *inode = NULL; struct inode *iter; struct ocfs2_inode_info *oi; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di = NULL; trace_ocfs2_recover_orphans(slot); @@ -2157,16 +2170,22 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, iter = oi->ip_next_orphan; oi->ip_next_orphan = NULL; + ret = ocfs2_rw_lock(inode, 1); + if (ret < 0) { + mlog_errno(ret); + goto next; + } /* * We need to take and drop the inode lock to * force read inode from disk. */ - ret = ocfs2_inode_lock(inode, NULL, 0); + ret = ocfs2_inode_lock(inode, &di_bh, 1); if (ret) { mlog_errno(ret); - goto next; + goto unlock_rw; } - ocfs2_inode_unlock(inode, 0); + + di = (struct ocfs2_dinode *)di_bh->b_data; if (inode->i_nlink == 0) { spin_lock(&oi->ip_lock); @@ -2174,43 +2193,30 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, * ocfs2_delete_inode. 
*/ oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; spin_unlock(&oi->ip_lock); - } else if (orphan_reco_type == ORPHAN_NEED_TRUNCATE) { - struct buffer_head *di_bh = NULL; - - ret = ocfs2_rw_lock(inode, 1); - if (ret) { - mlog_errno(ret); - goto next; - } - - ret = ocfs2_inode_lock(inode, &di_bh, 1); - if (ret < 0) { - ocfs2_rw_unlock(inode, 1); - mlog_errno(ret); - goto next; - } - + } else if ((orphan_reco_type == ORPHAN_NEED_TRUNCATE) && + (di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) { ret = ocfs2_truncate_file(inode, di_bh, i_size_read(inode)); - ocfs2_inode_unlock(inode, 1); - ocfs2_rw_unlock(inode, 1); - brelse(di_bh); if (ret < 0) { if (ret != -ENOSPC) mlog_errno(ret); - goto next; + goto unlock_inode; } - ret = ocfs2_del_inode_from_orphan(osb, inode, 0, 0); + ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0); if (ret) mlog_errno(ret); wake_up(&OCFS2_I(inode)->append_dio_wq); } /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */ - +unlock_inode: + ocfs2_inode_unlock(inode, 1); +unlock_rw: + ocfs2_rw_unlock(inode, 1); next: iput(inode); - + brelse(di_bh); + di_bh = NULL; inode = iter; } diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 044013455621..857bbbcd39f3 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -666,7 +666,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, if (le32_to_cpu(alloc->id1.bitmap1.i_used) != ocfs2_local_alloc_count_bits(alloc)) { ocfs2_error(osb->sb, "local alloc inode %llu says it has " - "%u free bits, but a count shows %u", + "%u used bits, but a count shows %u", (unsigned long long)le64_to_cpu(alloc->i_blkno), le32_to_cpu(alloc->id1.bitmap1.i_used), ocfs2_local_alloc_count_bits(alloc)); @@ -839,7 +839,7 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, u32 *numbits, struct ocfs2_alloc_reservation *resv) { - int numfound, bitoff, left, startoff, lastzero; + int numfound = 0, bitoff, left, startoff, lastzero; int local_resv = 0; struct ocfs2_alloc_reservation r; void *bitmap = NULL; diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index b5c3a5ea3ee6..6e6abb93fda5 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -689,8 +689,8 @@ static int ocfs2_link(struct dentry *old_dentry, struct dentry *dentry) { handle_t *handle; - struct inode *inode = old_dentry->d_inode; - struct inode *old_dir = old_dentry->d_parent->d_inode; + struct inode *inode = d_inode(old_dentry); + struct inode *old_dir = d_inode(old_dentry->d_parent); int err; struct buffer_head *fe_bh = NULL; struct buffer_head *old_dir_bh = NULL; @@ -879,7 +879,7 @@ static int ocfs2_unlink(struct inode *dir, int status; int child_locked = 0; bool is_unlinkable = false; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct inode *orphan_dir = NULL; struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); u64 blkno; @@ -898,7 +898,7 @@ static int ocfs2_unlink(struct inode *dir, dquot_initialize(dir); - BUG_ON(dentry->d_parent->d_inode != dir); + BUG_ON(d_inode(dentry->d_parent) != dir); if (inode == osb->root_inode) return -EPERM; @@ -1116,8 +1116,6 @@ static int ocfs2_double_lock(struct ocfs2_super *osb, int inode1_is_ancestor, inode2_is_ancestor; struct ocfs2_inode_info *oi1 = OCFS2_I(inode1); struct ocfs2_inode_info *oi2 = OCFS2_I(inode2); - struct buffer_head **tmpbh; - struct inode *tmpinode; trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno, (unsigned long long)oi2->ip_blkno); @@ -1148,13 +1146,8 @@ static int ocfs2_double_lock(struct ocfs2_super *osb, (oi1->ip_blkno < oi2->ip_blkno 
&& inode2_is_ancestor == 0)) { /* switch id1 and id2 around */ - tmpbh = bh2; - bh2 = bh1; - bh1 = tmpbh; - - tmpinode = inode2; - inode2 = inode1; - inode1 = tmpinode; + swap(bh2, bh1); + swap(inode2, inode1); } /* lock id2 */ status = ocfs2_inode_lock_nested(inode2, bh2, 1, @@ -1209,8 +1202,8 @@ static int ocfs2_rename(struct inode *old_dir, { int status = 0, rename_lock = 0, parents_locked = 0, target_exists = 0; int old_child_locked = 0, new_child_locked = 0, update_dot_dot = 0; - struct inode *old_inode = old_dentry->d_inode; - struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); struct inode *orphan_dir = NULL; struct ocfs2_dinode *newfe = NULL; char orphan_name[OCFS2_ORPHAN_NAMELEN + 1]; @@ -1454,7 +1447,7 @@ static int ocfs2_rename(struct inode *old_dir, should_add_orphan = true; } } else { - BUG_ON(new_dentry->d_parent->d_inode != new_dir); + BUG_ON(d_inode(new_dentry->d_parent) != new_dir); status = ocfs2_check_dir_for_entry(new_dir, new_dentry->d_name.name, @@ -2322,10 +2315,10 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, trace_ocfs2_orphan_del( (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno, - name, namelen); + name, strlen(name)); /* find it's spot in the orphan directory */ - status = ocfs2_find_entry(name, namelen, orphan_dir_inode, + status = ocfs2_find_entry(name, strlen(name), orphan_dir_inode, &lookup); if (status) { mlog_errno(status); @@ -2670,30 +2663,22 @@ bail: } int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, - struct inode *inode, int update_isize, - loff_t end) + struct inode *inode, struct buffer_head *di_bh, + int update_isize, loff_t end) { struct inode *orphan_dir_inode = NULL; struct buffer_head *orphan_dir_bh = NULL; - struct buffer_head *di_bh = NULL; - struct ocfs2_dinode *di = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; handle_t *handle = NULL; int status = 0; - status = ocfs2_inode_lock(inode, &di_bh, 1); - if (status < 0) { - mlog_errno(status); - goto bail; - } - di = (struct ocfs2_dinode *) di_bh->b_data; - orphan_dir_inode = ocfs2_get_system_file_inode(osb, ORPHAN_DIR_SYSTEM_INODE, le16_to_cpu(di->i_dio_orphaned_slot)); if (!orphan_dir_inode) { status = -ENOENT; mlog_errno(status); - goto bail_unlock_inode; + goto bail; } mutex_lock(&orphan_dir_inode->i_mutex); @@ -2702,7 +2687,7 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, mutex_unlock(&orphan_dir_inode->i_mutex); iput(orphan_dir_inode); mlog_errno(status); - goto bail_unlock_inode; + goto bail; } handle = ocfs2_start_trans(osb, @@ -2749,10 +2734,6 @@ bail_unlock_orphan: brelse(orphan_dir_bh); iput(orphan_dir_inode); -bail_unlock_inode: - ocfs2_inode_unlock(inode, 1); - brelse(di_bh); - bail: return status; } @@ -2808,7 +2789,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, ORPHAN_DIR_SYSTEM_INODE, osb->slot_num); if (!orphan_dir_inode) { - status = -EEXIST; + status = -ENOENT; mlog_errno(status); goto leave; } diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h index 5ddecce172fa..e173329eb830 100644 --- a/fs/ocfs2/namei.h +++ b/fs/ocfs2/namei.h @@ -42,8 +42,8 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, struct inode *inode); int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, - struct inode *inode, int update_isize, - loff_t end); + struct inode *inode, struct buffer_head *di_bh, + int update_isize, loff_t end); int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, 
struct inode *new_inode, struct dentry *new_dentry); diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 460c6c37e683..690ddc60189b 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -717,6 +717,16 @@ static inline u64 ocfs2_clusters_to_blocks(struct super_block *sb, return (u64)clusters << c_to_b_bits; } +static inline u32 ocfs2_clusters_for_blocks(struct super_block *sb, + u64 blocks) +{ + int b_to_c_bits = OCFS2_SB(sb)->s_clustersize_bits - + sb->s_blocksize_bits; + + blocks += (1 << b_to_c_bits) - 1; + return (u32)(blocks >> b_to_c_bits); +} + static inline u32 ocfs2_blocks_to_clusters(struct super_block *sb, u64 blocks) { diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index ee541f92dab4..b69dd14c0b9b 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -1406,11 +1406,9 @@ static int cmp_refcount_rec_by_cpos(const void *a, const void *b) static void swap_refcount_rec(void *a, void *b, int size) { - struct ocfs2_refcount_rec *l = a, *r = b, tmp; + struct ocfs2_refcount_rec *l = a, *r = b; - tmp = *l; - *l = *r; - *r = tmp; + swap(*l, *r); } /* @@ -4194,7 +4192,7 @@ static int __ocfs2_reflink(struct dentry *old_dentry, bool preserve) { int ret; - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); struct buffer_head *new_bh = NULL; if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) { @@ -4263,7 +4261,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, bool preserve) { int error; - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); struct buffer_head *old_bh = NULL; struct inode *new_orphan_inode = NULL; struct posix_acl *default_acl, *acl; @@ -4276,7 +4274,7 @@ static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir, error = posix_acl_create(dir, &mode, &default_acl, &acl); if (error) { mlog_errno(error); - goto out; + return error; } error = ocfs2_create_inode_in_orphan(dir, mode, @@ -4357,7 +4355,7 @@ out: /* copied from may_create in VFS. 
*/ static inline int ocfs2_may_create(struct inode *dir, struct dentry *child) { - if (child->d_inode) + if (d_really_is_positive(child)) return -EEXIST; if (IS_DEADDIR(dir)) return -ENOENT; @@ -4375,7 +4373,7 @@ static inline int ocfs2_may_create(struct inode *dir, struct dentry *child) static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, bool preserve) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); int error; if (!inode) @@ -4463,7 +4461,7 @@ int ocfs2_reflink_ioctl(struct inode *inode, } error = ocfs2_vfs_reflink(old_path.dentry, - new_path.dentry->d_inode, + d_inode(new_path.dentry), new_dentry, preserve); out_dput: done_path_create(&new_path, new_dentry); diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index d5493e361a38..e78a203d44c8 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -427,7 +427,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb) if (!si) { status = -ENOMEM; mlog_errno(status); - goto bail; + return status; } si->si_extended = ocfs2_uses_extended_slot_map(osb); @@ -452,7 +452,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb) osb->slot_info = (struct ocfs2_slot_info *)si; bail: - if (status < 0 && si) + if (status < 0) __ocfs2_free_slot_info(si); return status; diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index 1724d43d3da1..220cae7bbdbc 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c @@ -295,7 +295,7 @@ static int o2cb_cluster_check(void) set_bit(node_num, netmap); if (!memcmp(hbmap, netmap, sizeof(hbmap))) return 0; - if (i < O2CB_MAP_STABILIZE_COUNT) + if (i < O2CB_MAP_STABILIZE_COUNT - 1) msleep(1000); } diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 720aa389e0ea..2768eb1da2b8 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -1004,10 +1004,8 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn) BUG_ON(conn == NULL); lc = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL); - if (!lc) { - rc = -ENOMEM; - goto out; - } + if (!lc) + return -ENOMEM; init_waitqueue_head(&lc->oc_wait); init_completion(&lc->oc_sync_wait); @@ -1063,7 +1061,7 @@ static int user_cluster_connect(struct ocfs2_cluster_connection *conn) } out: - if (rc && lc) + if (rc) kfree(lc); return rc; } diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 0cb889a17ae1..4479029630bb 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -2499,6 +2499,8 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle, alloc_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); + ocfs2_block_group_set_bits(handle, alloc_inode, group, group_bh, + start_bit, count); goto bail; } diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 26675185b886..403c5660b306 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2069,6 +2069,8 @@ static int ocfs2_initialize_super(struct super_block *sb, cbits = le32_to_cpu(di->id2.i_super.s_clustersize_bits); bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits); sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits); + memcpy(sb->s_uuid, di->id2.i_super.s_uuid, + sizeof(di->id2.i_super.s_uuid)); osb->osb_dx_mask = (1 << (cbits - bbits)) - 1; @@ -2333,7 +2335,7 @@ static int ocfs2_initialize_super(struct super_block *sb, mlog_errno(status); goto bail; } - cleancache_init_shared_fs((char *)&di->id2.i_super.s_uuid, sb); + cleancache_init_shared_fs(sb); bail: return status; @@ -2563,22 +2565,22 @@ static void ocfs2_handle_error(struct super_block 
*sb) ocfs2_set_ro_flag(osb, 0); } -static char error_buf[1024]; - -void __ocfs2_error(struct super_block *sb, - const char *function, - const char *fmt, ...) +void __ocfs2_error(struct super_block *sb, const char *function, + const char *fmt, ...) { + struct va_format vaf; va_list args; va_start(args, fmt); - vsnprintf(error_buf, sizeof(error_buf), fmt, args); - va_end(args); + vaf.fmt = fmt; + vaf.va = &args; /* Not using mlog here because we want to show the actual * function the error came from. */ - printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %s\n", - sb->s_id, function, error_buf); + printk(KERN_CRIT "OCFS2: ERROR (device %s): %s: %pV\n", + sb->s_id, function, &vaf); + + va_end(args); ocfs2_handle_error(sb); } @@ -2586,18 +2588,21 @@ void __ocfs2_error(struct super_block *sb, /* Handle critical errors. This is intentionally more drastic than * ocfs2_handle_error, so we only use for things like journal errors, * etc. */ -void __ocfs2_abort(struct super_block* sb, - const char *function, +void __ocfs2_abort(struct super_block *sb, const char *function, const char *fmt, ...) { + struct va_format vaf; va_list args; va_start(args, fmt); - vsnprintf(error_buf, sizeof(error_buf), fmt, args); - va_end(args); - printk(KERN_CRIT "OCFS2: abort (device %s): %s: %s\n", - sb->s_id, function, error_buf); + vaf.fmt = fmt; + vaf.va = &args; + + printk(KERN_CRIT "OCFS2: abort (device %s): %s: %pV\n", + sb->s_id, function, &vaf); + + va_end(args); /* We don't have the cluster support yet to go straight to * hard readonly in here. Until then, we want to keep diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 85b190dc132f..889f3796a0d7 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -1020,7 +1020,7 @@ ssize_t ocfs2_listxattr(struct dentry *dentry, int ret = 0, i_ret = 0, b_ret = 0; struct buffer_head *di_bh = NULL; struct ocfs2_dinode *di = NULL; - struct ocfs2_inode_info *oi = OCFS2_I(dentry->d_inode); + struct ocfs2_inode_info *oi = OCFS2_I(d_inode(dentry)); if (!ocfs2_supports_xattr(OCFS2_SB(dentry->d_sb))) return -EOPNOTSUPP; @@ -1028,7 +1028,7 @@ ssize_t ocfs2_listxattr(struct dentry *dentry, if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) return ret; - ret = ocfs2_inode_lock(dentry->d_inode, &di_bh, 0); + ret = ocfs2_inode_lock(d_inode(dentry), &di_bh, 0); if (ret < 0) { mlog_errno(ret); return ret; @@ -1037,7 +1037,7 @@ ssize_t ocfs2_listxattr(struct dentry *dentry, di = (struct ocfs2_dinode *)di_bh->b_data; down_read(&oi->ip_xattr_sem); - i_ret = ocfs2_xattr_ibody_list(dentry->d_inode, di, buffer, size); + i_ret = ocfs2_xattr_ibody_list(d_inode(dentry), di, buffer, size); if (i_ret < 0) b_ret = 0; else { @@ -1045,13 +1045,13 @@ ssize_t ocfs2_listxattr(struct dentry *dentry, buffer += i_ret; size -= i_ret; } - b_ret = ocfs2_xattr_block_list(dentry->d_inode, di, + b_ret = ocfs2_xattr_block_list(d_inode(dentry), di, buffer, size); if (b_ret < 0) i_ret = 0; } up_read(&oi->ip_xattr_sem); - ocfs2_inode_unlock(dentry->d_inode, 0); + ocfs2_inode_unlock(d_inode(dentry), 0); brelse(di_bh); @@ -1238,6 +1238,10 @@ static int ocfs2_xattr_block_get(struct inode *inode, i, &block_off, &name_offset); + if (ret) { + mlog_errno(ret); + goto cleanup; + } xs->base = bucket_block(xs->bucket, block_off); } if (ocfs2_xattr_is_local(xs->here)) { @@ -5665,6 +5669,10 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode, ret = ocfs2_get_xattr_tree_value_root(inode->i_sb, bucket, i, &xv, NULL); + if (ret) { + mlog_errno(ret); + break; + } ret = ocfs2_lock_xattr_remove_allocators(inode, xv, 
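Dropping the shared static error_buf in favour of struct va_format and printk's %pV specifier fixes two problems at once: concurrent reporters no longer race on a single buffer, and messages are no longer silently truncated at 1024 bytes. The pattern generalizes; a minimal kernel-style sketch with a hypothetical my_error() wrapper (note that va_end() must come after the printk that walks the va_list):

#include <linux/kernel.h>       /* printk, struct va_format */

/* Hypothetical wrapper showing the %pV idiom: the va_list is consumed
 * by printk itself, so no intermediate buffer (and no lock) is needed. */
static void my_error(const char *prefix, const char *fmt, ...)
{
        struct va_format vaf;
        va_list args;

        va_start(args, fmt);
        vaf.fmt = fmt;
        vaf.va = &args;
        printk(KERN_ERR "%s: %pV\n", prefix, &vaf);
        va_end(args);
}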
args->ref_ci, @@ -7249,7 +7257,7 @@ static int ocfs2_xattr_security_get(struct dentry *dentry, const char *name, { if (strcmp(name, "") == 0) return -EINVAL; - return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_SECURITY, + return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_SECURITY, name, buffer, size); } @@ -7259,11 +7267,11 @@ static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name, if (strcmp(name, "") == 0) return -EINVAL; - return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_SECURITY, + return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_SECURITY, name, value, size, flags); } -int ocfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array, +static int ocfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array, void *fs_info) { const struct xattr *xattr; @@ -7339,7 +7347,7 @@ static int ocfs2_xattr_trusted_get(struct dentry *dentry, const char *name, { if (strcmp(name, "") == 0) return -EINVAL; - return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_TRUSTED, + return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_TRUSTED, name, buffer, size); } @@ -7349,7 +7357,7 @@ static int ocfs2_xattr_trusted_set(struct dentry *dentry, const char *name, if (strcmp(name, "") == 0) return -EINVAL; - return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_TRUSTED, + return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_TRUSTED, name, value, size, flags); } @@ -7391,7 +7399,7 @@ static int ocfs2_xattr_user_get(struct dentry *dentry, const char *name, return -EINVAL; if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) return -EOPNOTSUPP; - return ocfs2_xattr_get(dentry->d_inode, OCFS2_XATTR_INDEX_USER, name, + return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_USER, name, buffer, size); } @@ -7405,7 +7413,7 @@ static int ocfs2_xattr_user_set(struct dentry *dentry, const char *name, if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) return -EOPNOTSUPP; - return ocfs2_xattr_set(dentry->d_inode, OCFS2_XATTR_INDEX_USER, + return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_USER, name, value, size, flags); } diff --git a/fs/omfs/bitmap.c b/fs/omfs/bitmap.c index 082234581d05..83f4e76511c2 100644 --- a/fs/omfs/bitmap.c +++ b/fs/omfs/bitmap.c @@ -159,7 +159,7 @@ int omfs_allocate_range(struct super_block *sb, goto out; found: - *return_block = i * bits_per_entry + bit; + *return_block = (u64) i * bits_per_entry + bit; *return_size = run; ret = set_run(sb, i, bits_per_entry, bit, run, 1); diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c index 1b8e9e8405b2..f833bf8d5792 100644 --- a/fs/omfs/dir.c +++ b/fs/omfs/dir.c @@ -110,7 +110,7 @@ int omfs_make_empty(struct inode *inode, struct super_block *sb) static int omfs_add_link(struct dentry *dentry, struct inode *inode) { - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); const char *name = dentry->d_name.name; int namelen = dentry->d_name.len; struct omfs_inode *oi; @@ -155,7 +155,7 @@ out: static int omfs_delete_entry(struct dentry *dentry) { - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); struct inode *dirty; const char *name = dentry->d_name.name; int namelen = dentry->d_name.len; @@ -237,7 +237,7 @@ static int omfs_dir_is_empty(struct inode *inode) static int omfs_remove(struct inode *dir, struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int ret; @@ -373,8 +373,8 @@ static bool omfs_fill_chain(struct inode *dir, struct 
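The omfs bitmap change is a classic widening fix: i and bits_per_entry are 32-bit, so their product is computed in 32 bits and wraps before the assignment to the u64 *return_block; casting one operand first forces a 64-bit multiply. A self-contained demonstration (the values are chosen only to trigger the wrap):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        unsigned int i = 70000000, bits_per_entry = 4096;   /* illustrative */

        uint64_t wrong = i * bits_per_entry;            /* 32-bit multiply, wraps */
        uint64_t right = (uint64_t)i * bits_per_entry;  /* widened first */

        printf("wrong=%llu right=%llu\n",
               (unsigned long long)wrong, (unsigned long long)right);
        return 0;
}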
dir_context *ctx, static int omfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { - struct inode *new_inode = new_dentry->d_inode; - struct inode *old_inode = old_dentry->d_inode; + struct inode *new_inode = d_inode(new_dentry); + struct inode *old_inode = d_inode(old_dentry); int err; if (new_inode) { diff --git a/fs/omfs/file.c b/fs/omfs/file.c index 902e88527fce..d9e26cfbb793 100644 --- a/fs/omfs/file.c +++ b/fs/omfs/file.c @@ -337,8 +337,6 @@ static sector_t omfs_bmap(struct address_space *mapping, sector_t block) const struct file_operations omfs_file_operations = { .llseek = generic_file_llseek, - .read = new_sync_read, - .write = new_sync_write, .read_iter = generic_file_read_iter, .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, @@ -348,7 +346,7 @@ const struct file_operations omfs_file_operations = { static int omfs_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int error; error = inode_change_ok(inode, attr); diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index 138321b0c6c2..3d935c81789a 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -306,7 +306,8 @@ static const struct super_operations omfs_sops = { */ static int omfs_get_imap(struct super_block *sb) { - unsigned int bitmap_size, count, array_size; + unsigned int bitmap_size, array_size; + int count; struct omfs_sb_info *sbi = OMFS_SB(sb); struct buffer_head *bh; unsigned long **ptr; @@ -359,7 +360,7 @@ nomem: } enum { - Opt_uid, Opt_gid, Opt_umask, Opt_dmask, Opt_fmask + Opt_uid, Opt_gid, Opt_umask, Opt_dmask, Opt_fmask, Opt_err }; static const match_table_t tokens = { @@ -368,6 +369,7 @@ static const match_table_t tokens = { {Opt_umask, "umask=%o"}, {Opt_dmask, "dmask=%o"}, {Opt_fmask, "fmask=%o"}, + {Opt_err, NULL}, }; static int parse_options(char *options, struct omfs_sb_info *sbi) @@ -548,8 +550,10 @@ static int omfs_fill_super(struct super_block *sb, void *data, int silent) } sb->s_root = d_make_root(root); - if (!sb->s_root) + if (!sb->s_root) { + ret = -ENOMEM; goto out_brelse_bh2; + } printk(KERN_DEBUG "omfs: Mounted volume %s\n", omfs_rb->r_name); ret = 0; diff --git a/fs/open.c b/fs/open.c index 33f9cbf2610b..e33dab287fa0 100644 --- a/fs/open.c +++ b/fs/open.c @@ -51,8 +51,10 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, newattrs.ia_valid |= ATTR_FILE; } - /* Remove suid/sgid on truncate too */ - ret = should_remove_suid(dentry); + /* Remove suid, sgid, and file capabilities on truncate too */ + ret = dentry_needs_remove_privs(dentry); + if (ret < 0) + return ret; if (ret) newattrs.ia_valid |= ret | ATTR_FORCE; @@ -231,8 +233,7 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) return -EINVAL; /* Return error if mode is not supported */ - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | - FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE)) + if (mode & ~FALLOC_FL_SUPPORTED_MASK) return -EOPNOTSUPP; /* Punch hole and zero range are mutually exclusive */ @@ -250,6 +251,11 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) (mode & ~FALLOC_FL_COLLAPSE_RANGE)) return -EINVAL; + /* Insert range should only be used exclusively. 
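The omfs option-table fix above is easy to miss: match_token() scans a match_table_t until it finds an entry whose pattern is NULL, so a table without that sentinel sends unrecognized options walking off the end of the array. The required shape, reduced to one real option (the names are illustrative):

#include <linux/parser.h>

enum { Opt_uid, Opt_err };

/* match_token() stops at the first NULL pattern, so every match_table_t
 * must end with such a sentinel; unmatched strings then map to its
 * token (Opt_err here) instead of reading past the table. */
static const match_table_t tokens = {
        {Opt_uid, "uid=%u"},
        {Opt_err, NULL},
};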
*/ + if ((mode & FALLOC_FL_INSERT_RANGE) && + (mode & ~FALLOC_FL_INSERT_RANGE)) + return -EINVAL; + if (!(file->f_mode & FMODE_WRITE)) return -EBADF; @@ -363,7 +369,7 @@ retry: if (res) goto out; - inode = path.dentry->d_inode; + inode = d_backing_inode(path.dentry); if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) { /* @@ -570,6 +576,7 @@ static int chown_common(struct path *path, uid_t user, gid_t group) uid = make_kuid(current_user_ns(), user); gid = make_kgid(current_user_ns(), group); +retry_deleg: newattrs.ia_valid = ATTR_CTIME; if (user != (uid_t) -1) { if (!uid_valid(uid)) @@ -586,7 +593,6 @@ static int chown_common(struct path *path, uid_t user, gid_t group) if (!S_ISDIR(inode->i_mode)) newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV; -retry_deleg: mutex_lock(&inode->i_mutex); error = security_path_chown(path, uid, gid); if (!error) @@ -674,18 +680,18 @@ int open_check_o_direct(struct file *f) } static int do_dentry_open(struct file *f, + struct inode *inode, int (*open)(struct inode *, struct file *), const struct cred *cred) { static const struct file_operations empty_fops = {}; - struct inode *inode; int error; f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; path_get(&f->f_path); - inode = f->f_inode = f->f_path.dentry->d_inode; + f->f_inode = inode; f->f_mapping = inode->i_mapping; if (unlikely(f->f_flags & O_PATH)) { @@ -734,10 +740,10 @@ static int do_dentry_open(struct file *f, if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) i_readcount_inc(inode); if ((f->f_mode & FMODE_READ) && - likely(f->f_op->read || f->f_op->aio_read || f->f_op->read_iter)) + likely(f->f_op->read || f->f_op->read_iter)) f->f_mode |= FMODE_CAN_READ; if ((f->f_mode & FMODE_WRITE) && - likely(f->f_op->write || f->f_op->aio_write || f->f_op->write_iter)) + likely(f->f_op->write || f->f_op->write_iter)) f->f_mode |= FMODE_CAN_WRITE; f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); @@ -789,7 +795,8 @@ int finish_open(struct file *file, struct dentry *dentry, BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */ file->f_path.dentry = dentry; - error = do_dentry_open(file, open, current_cred()); + error = do_dentry_open(file, d_backing_inode(dentry), open, + current_cred()); if (!error) *opened |= FILE_OPENED; @@ -818,6 +825,34 @@ int finish_no_open(struct file *file, struct dentry *dentry) } EXPORT_SYMBOL(finish_no_open); +char *file_path(struct file *filp, char *buf, int buflen) +{ + return d_path(&filp->f_path, buf, buflen); +} +EXPORT_SYMBOL(file_path); + +/** + * vfs_open - open the file at the given path + * @path: path to open + * @file: newly allocated file with f_flag initialized + * @cred: credentials to use + */ +int vfs_open(const struct path *path, struct file *file, + const struct cred *cred) +{ + struct dentry *dentry = path->dentry; + struct inode *inode = dentry->d_inode; + + file->f_path = *path; + if (dentry->d_flags & DCACHE_OP_SELECT_INODE) { + inode = dentry->d_op->d_select_inode(dentry, file->f_flags); + if (IS_ERR(inode)) + return PTR_ERR(inode); + } + + return do_dentry_open(file, inode, NULL, cred); +} + struct file *dentry_open(const struct path *path, int flags, const struct cred *cred) { @@ -849,26 +884,6 @@ struct file *dentry_open(const struct path *path, int flags, } EXPORT_SYMBOL(dentry_open); -/** - * vfs_open - open the file at the given path - * @path: path to open - * @filp: newly allocated file with f_flag initialized - * @cred: credentials to use - */ -int vfs_open(const struct path 
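The insert-range check added above uses the standard "exclusive flag" bit test: the mode contains the flag and also contains something other than the flag. The same test in isolation (the flag values here are made up, not the real FALLOC_FL_* constants):

#include <stdio.h>

#define FL_KEEP_SIZE    0x01
#define FL_INSERT_RANGE 0x20

/* True when the flag is set together with any other bit. */
static int insert_used_with_others(int mode)
{
        return (mode & FL_INSERT_RANGE) && (mode & ~FL_INSERT_RANGE);
}

int main(void)
{
        printf("%d\n", insert_used_with_others(FL_INSERT_RANGE));                /* 0: ok */
        printf("%d\n", insert_used_with_others(FL_INSERT_RANGE | FL_KEEP_SIZE)); /* 1: rejected */
        return 0;
}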
*path, struct file *filp, - const struct cred *cred) -{ - struct inode *inode = path->dentry->d_inode; - - if (inode->i_op->dentry_open) - return inode->i_op->dentry_open(path->dentry, filp, cred); - else { - filp->f_path = *path; - return do_dentry_open(filp, NULL, cred); - } -} -EXPORT_SYMBOL(vfs_open); - static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op) { int lookup_flags = 0; @@ -988,9 +1003,6 @@ struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt, return ERR_PTR(err); if (flags & O_CREAT) return ERR_PTR(-EINVAL); - if (!filename && (flags & O_DIRECTORY)) - if (!dentry->d_inode->i_op->lookup) - return ERR_PTR(-ENOTDIR); return do_file_open_root(dentry, mnt, filename, &op); } EXPORT_SYMBOL(file_open_root); diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 24f640441bd9..84d693d37428 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -299,6 +299,9 @@ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, struct cred *override_cred; char *link = NULL; + if (WARN_ON(!workdir)) + return -EROFS; + ovl_path_upper(parent, &parentpath); upperdir = parentpath.dentry; diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index d139405d2bfa..692ceda3bc21 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -222,6 +222,9 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry, struct kstat stat; int err; + if (WARN_ON(!workdir)) + return ERR_PTR(-EROFS); + err = ovl_lock_rename_workdir(workdir, upperdir); if (err) goto out; @@ -322,6 +325,9 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, struct dentry *newdentry; int err; + if (WARN_ON(!workdir)) + return -EROFS; + err = ovl_lock_rename_workdir(workdir, upperdir); if (err) goto out; @@ -506,11 +512,28 @@ static int ovl_remove_and_whiteout(struct dentry *dentry, bool is_dir) struct dentry *opaquedir = NULL; int err; - if (is_dir && OVL_TYPE_MERGE_OR_LOWER(ovl_path_type(dentry))) { - opaquedir = ovl_check_empty_and_clear(dentry); - err = PTR_ERR(opaquedir); - if (IS_ERR(opaquedir)) - goto out; + if (WARN_ON(!workdir)) + return -EROFS; + + if (is_dir) { + if (OVL_TYPE_MERGE_OR_LOWER(ovl_path_type(dentry))) { + opaquedir = ovl_check_empty_and_clear(dentry); + err = PTR_ERR(opaquedir); + if (IS_ERR(opaquedir)) + goto out; + } else { + LIST_HEAD(list); + + /* + * When removing an empty opaque directory, then it + * makes no sense to replace it with an exact replica of + * itself. But emptiness still needs to be checked. 
+ */ + err = ovl_check_empty_dir(dentry, &list); + ovl_cache_free(&list); + if (err) + goto out; + } } err = ovl_lock_rename_workdir(workdir, upperdir); diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 04f124884687..d9da5a4e9382 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -140,11 +140,12 @@ struct ovl_link_data { void *cookie; }; -static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *ovl_follow_link(struct dentry *dentry, void **cookie) { - void *ret; struct dentry *realdentry; struct inode *realinode; + struct ovl_link_data *data = NULL; + const char *ret; realdentry = ovl_dentry_real(dentry); realinode = realdentry->d_inode; @@ -152,28 +153,28 @@ static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd) if (WARN_ON(!realinode->i_op->follow_link)) return ERR_PTR(-EPERM); - ret = realinode->i_op->follow_link(realdentry, nd); - if (IS_ERR(ret)) - return ret; - if (realinode->i_op->put_link) { - struct ovl_link_data *data; - data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL); - if (!data) { - realinode->i_op->put_link(realdentry, nd, ret); + if (!data) return ERR_PTR(-ENOMEM); - } data->realdentry = realdentry; - data->cookie = ret; + } - return data; - } else { - return NULL; + ret = realinode->i_op->follow_link(realdentry, cookie); + if (IS_ERR_OR_NULL(ret)) { + kfree(data); + return ret; } + + if (data) + data->cookie = *cookie; + + *cookie = data; + + return ret; } -static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c) +static void ovl_put_link(struct inode *unused, void *c) { struct inode *realinode; struct ovl_link_data *data = c; @@ -182,7 +183,7 @@ static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c) return; realinode = data->realdentry->d_inode; - realinode->i_op->put_link(data->realdentry, nd, data->cookie); + realinode->i_op->put_link(realinode, data->cookie); kfree(data); } @@ -336,37 +337,33 @@ static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type, return true; } -static int ovl_dentry_open(struct dentry *dentry, struct file *file, - const struct cred *cred) +struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags) { int err; struct path realpath; enum ovl_path_type type; - bool want_write = false; + + if (d_is_dir(dentry)) + return d_backing_inode(dentry); type = ovl_path_real(dentry, &realpath); - if (ovl_open_need_copy_up(file->f_flags, type, realpath.dentry)) { - want_write = true; + if (ovl_open_need_copy_up(file_flags, type, realpath.dentry)) { err = ovl_want_write(dentry); if (err) - goto out; + return ERR_PTR(err); - if (file->f_flags & O_TRUNC) + if (file_flags & O_TRUNC) err = ovl_copy_up_last(dentry, NULL, true); else err = ovl_copy_up(dentry); + ovl_drop_write(dentry); if (err) - goto out_drop_write; + return ERR_PTR(err); ovl_path_upper(dentry, &realpath); } - err = vfs_open(&realpath, file, cred); -out_drop_write: - if (want_write) - ovl_drop_write(dentry); -out: - return err; + return d_backing_inode(realpath.dentry); } static const struct inode_operations ovl_file_inode_operations = { @@ -377,7 +374,6 @@ static const struct inode_operations ovl_file_inode_operations = { .getxattr = ovl_getxattr, .listxattr = ovl_listxattr, .removexattr = ovl_removexattr, - .dentry_open = ovl_dentry_open, }; static const struct inode_operations ovl_symlink_inode_operations = { diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 17ac5afc9ffb..ea5a40b06e3a 100644 --- 
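ovl_follow_link() is converted here to the new symlink contract: ->follow_link(dentry, void **cookie) returns the link body (or NULL, or an ERR_PTR) and may park per-traversal state in *cookie, which ->put_link(inode, cookie) later receives. A minimal pair under the same contract, with hypothetical names and a fixed target string:

#include <linux/fs.h>
#include <linux/slab.h>

/* Hypothetical follow_link/put_link pair: return the target, stash the
 * allocation in *cookie so put_link can free it after traversal. */
static const char *demo_follow_link(struct dentry *dentry, void **cookie)
{
        char *target = kstrdup("/some/target", GFP_KERNEL);

        if (!target)
                return ERR_PTR(-ENOMEM);
        return *cookie = target;        /* VFS hands this back to put_link */
}

static void demo_put_link(struct inode *unused, void *cookie)
{
        kfree(cookie);
}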
a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -173,6 +173,7 @@ ssize_t ovl_getxattr(struct dentry *dentry, const char *name, void *value, size_t size); ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size); int ovl_removexattr(struct dentry *dentry, const char *name); +struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags); struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, struct ovl_entry *oe); diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 907870e81a72..70e9af551600 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -23,6 +23,7 @@ struct ovl_cache_entry { u64 ino; struct list_head l_node; struct rb_node node; + struct ovl_cache_entry *next_maybe_whiteout; bool is_whiteout; char name[]; }; @@ -39,7 +40,7 @@ struct ovl_readdir_data { struct rb_root root; struct list_head *list; struct list_head middle; - struct dentry *dir; + struct ovl_cache_entry *first_maybe_whiteout; int count; int err; }; @@ -79,7 +80,7 @@ static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root, return NULL; } -static struct ovl_cache_entry *ovl_cache_entry_new(struct dentry *dir, +static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd, const char *name, int len, u64 ino, unsigned int d_type) { @@ -98,29 +99,8 @@ static struct ovl_cache_entry *ovl_cache_entry_new(struct dentry *dir, p->is_whiteout = false; if (d_type == DT_CHR) { - struct dentry *dentry; - const struct cred *old_cred; - struct cred *override_cred; - - override_cred = prepare_creds(); - if (!override_cred) { - kfree(p); - return NULL; - } - - /* - * CAP_DAC_OVERRIDE for lookup - */ - cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); - old_cred = override_creds(override_cred); - - dentry = lookup_one_len(name, dir, len); - if (!IS_ERR(dentry)) { - p->is_whiteout = ovl_is_whiteout(dentry); - dput(dentry); - } - revert_creds(old_cred); - put_cred(override_cred); + p->next_maybe_whiteout = rdd->first_maybe_whiteout; + rdd->first_maybe_whiteout = p; } return p; } @@ -148,7 +128,7 @@ static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd, return 0; } - p = ovl_cache_entry_new(rdd->dir, name, len, ino, d_type); + p = ovl_cache_entry_new(rdd, name, len, ino, d_type); if (p == NULL) return -ENOMEM; @@ -169,7 +149,7 @@ static int ovl_fill_lower(struct ovl_readdir_data *rdd, if (p) { list_move_tail(&p->l_node, &rdd->middle); } else { - p = ovl_cache_entry_new(rdd->dir, name, namelen, ino, d_type); + p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type); if (p == NULL) rdd->err = -ENOMEM; else @@ -219,6 +199,43 @@ static int ovl_fill_merge(struct dir_context *ctx, const char *name, return ovl_fill_lower(rdd, name, namelen, offset, ino, d_type); } +static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd) +{ + int err; + struct ovl_cache_entry *p; + struct dentry *dentry; + const struct cred *old_cred; + struct cred *override_cred; + + override_cred = prepare_creds(); + if (!override_cred) + return -ENOMEM; + + /* + * CAP_DAC_OVERRIDE for lookup + */ + cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); + old_cred = override_creds(override_cred); + + err = mutex_lock_killable(&dir->d_inode->i_mutex); + if (!err) { + while (rdd->first_maybe_whiteout) { + p = rdd->first_maybe_whiteout; + rdd->first_maybe_whiteout = p->next_maybe_whiteout; + dentry = lookup_one_len(p->name, dir, p->len); + if (!IS_ERR(dentry)) { + p->is_whiteout = ovl_is_whiteout(dentry); + dput(dentry); + 
} + } + mutex_unlock(&dir->d_inode->i_mutex); + } + revert_creds(old_cred); + put_cred(override_cred); + + return err; +} + static inline int ovl_dir_read(struct path *realpath, struct ovl_readdir_data *rdd) { @@ -229,7 +246,7 @@ static inline int ovl_dir_read(struct path *realpath, if (IS_ERR(realfile)) return PTR_ERR(realfile); - rdd->dir = realpath->dentry; + rdd->first_maybe_whiteout = NULL; rdd->ctx.pos = 0; do { rdd->count = 0; @@ -238,6 +255,10 @@ static inline int ovl_dir_read(struct path *realpath, if (err >= 0) err = rdd->err; } while (!err && rdd->count); + + if (!err && rdd->first_maybe_whiteout) + err = ovl_check_whiteouts(realpath->dentry, rdd); + fput(realfile); return err; diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 5f0d1993e6e3..7466ff339c66 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -273,8 +273,56 @@ static void ovl_dentry_release(struct dentry *dentry) } } +static int ovl_dentry_revalidate(struct dentry *dentry, unsigned int flags) +{ + struct ovl_entry *oe = dentry->d_fsdata; + unsigned int i; + int ret = 1; + + for (i = 0; i < oe->numlower; i++) { + struct dentry *d = oe->lowerstack[i].dentry; + + if (d->d_flags & DCACHE_OP_REVALIDATE) { + ret = d->d_op->d_revalidate(d, flags); + if (ret < 0) + return ret; + if (!ret) { + if (!(flags & LOOKUP_RCU)) + d_invalidate(d); + return -ESTALE; + } + } + } + return 1; +} + +static int ovl_dentry_weak_revalidate(struct dentry *dentry, unsigned int flags) +{ + struct ovl_entry *oe = dentry->d_fsdata; + unsigned int i; + int ret = 1; + + for (i = 0; i < oe->numlower; i++) { + struct dentry *d = oe->lowerstack[i].dentry; + + if (d->d_flags & DCACHE_OP_WEAK_REVALIDATE) { + ret = d->d_op->d_weak_revalidate(d, flags); + if (ret <= 0) + break; + } + } + return ret; +} + static const struct dentry_operations ovl_dentry_operations = { .d_release = ovl_dentry_release, + .d_select_inode = ovl_d_select_inode, +}; + +static const struct dentry_operations ovl_reval_dentry_operations = { + .d_release = ovl_dentry_release, + .d_revalidate = ovl_dentry_revalidate, + .d_weak_revalidate = ovl_dentry_weak_revalidate, }; static struct ovl_entry *ovl_alloc_entry(unsigned int numlower) @@ -288,6 +336,20 @@ static struct ovl_entry *ovl_alloc_entry(unsigned int numlower) return oe; } +static bool ovl_dentry_remote(struct dentry *dentry) +{ + return dentry->d_flags & + (DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE); +} + +static bool ovl_dentry_weird(struct dentry *dentry) +{ + return dentry->d_flags & (DCACHE_NEED_AUTOMOUNT | + DCACHE_MANAGE_TRANSIT | + DCACHE_OP_HASH | + DCACHE_OP_COMPARE); +} + static inline struct dentry *ovl_lookup_real(struct dentry *dir, struct qstr *name) { @@ -303,6 +365,10 @@ static inline struct dentry *ovl_lookup_real(struct dentry *dir, } else if (!dentry->d_inode) { dput(dentry); dentry = NULL; + } else if (ovl_dentry_weird(dentry)) { + dput(dentry); + /* Don't support traversing automounts and other weirdness */ + dentry = ERR_PTR(-EREMOTE); } return dentry; } @@ -350,6 +416,11 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, goto out; if (this) { + if (unlikely(ovl_dentry_remote(this))) { + dput(this); + err = -EREMOTE; + goto out; + } if (ovl_is_whiteout(this)) { dput(this); this = NULL; @@ -529,7 +600,7 @@ static int ovl_remount(struct super_block *sb, int *flags, char *data) { struct ovl_fs *ufs = sb->s_fs_info; - if (!(*flags & MS_RDONLY) && !ufs->upper_mnt) + if (!(*flags & MS_RDONLY) && (!ufs->upper_mnt || !ufs->workdir)) return -EROFS; return 0; @@ 
-694,25 +765,6 @@ static void ovl_unescape(char *s) } } -static bool ovl_is_allowed_fs_type(struct dentry *root) -{ - const struct dentry_operations *dop = root->d_op; - - /* - * We don't support: - * - automount filesystems - * - filesystems with revalidate (FIXME for lower layer) - * - filesystems with case insensitive names - */ - if (dop && - (dop->d_manage || dop->d_automount || - dop->d_revalidate || dop->d_weak_revalidate || - dop->d_compare || dop->d_hash)) { - return false; - } - return true; -} - static int ovl_mount_dir_noesc(const char *name, struct path *path) { int err = -EINVAL; @@ -727,7 +779,7 @@ static int ovl_mount_dir_noesc(const char *name, struct path *path) goto out; } err = -EINVAL; - if (!ovl_is_allowed_fs_type(path->dentry)) { + if (ovl_dentry_weird(path->dentry)) { pr_err("overlayfs: filesystem on '%s' not supported\n", name); goto out_put; } @@ -751,13 +803,21 @@ static int ovl_mount_dir(const char *name, struct path *path) if (tmp) { ovl_unescape(tmp); err = ovl_mount_dir_noesc(tmp, path); + + if (!err) + if (ovl_dentry_remote(path->dentry)) { + pr_err("overlayfs: filesystem on '%s' not supported as upperdir\n", + tmp); + path_put(path); + err = -EINVAL; + } kfree(tmp); } return err; } static int ovl_lower_dir(const char *name, struct path *path, long *namelen, - int *stack_depth) + int *stack_depth, bool *remote) { int err; struct kstatfs statfs; @@ -774,6 +834,9 @@ static int ovl_lower_dir(const char *name, struct path *path, long *namelen, *namelen = max(*namelen, statfs.f_namelen); *stack_depth = max(*stack_depth, path->mnt->mnt_sb->s_stack_depth); + if (ovl_dentry_remote(path->dentry)) + *remote = true; + return 0; out_put: @@ -827,6 +890,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) unsigned int numlower; unsigned int stacklen = 0; unsigned int i; + bool remote = false; int err; err = -ENOMEM; @@ -900,7 +964,8 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) lower = lowertmp; for (numlower = 0; numlower < stacklen; numlower++) { err = ovl_lower_dir(lower, &stack[numlower], - &ufs->lower_namelen, &sb->s_stack_depth); + &ufs->lower_namelen, &sb->s_stack_depth, + &remote); if (err) goto out_put_lowerpath; @@ -925,9 +990,10 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) ufs->workdir = ovl_workdir_create(ufs->upper_mnt, workpath.dentry); err = PTR_ERR(ufs->workdir); if (IS_ERR(ufs->workdir)) { - pr_err("overlayfs: failed to create directory %s/%s\n", - ufs->config.workdir, OVL_WORKDIR_NAME); - goto out_put_upper_mnt; + pr_warn("overlayfs: failed to create directory %s/%s (errno: %i); mounting read-only\n", + ufs->config.workdir, OVL_WORKDIR_NAME, -err); + sb->s_flags |= MS_RDONLY; + ufs->workdir = NULL; } } @@ -957,7 +1023,10 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) if (!ufs->upper_mnt) sb->s_flags |= MS_RDONLY; - sb->s_d_op = &ovl_dentry_operations; + if (remote) + sb->s_d_op = &ovl_reval_dentry_operations; + else + sb->s_d_op = &ovl_dentry_operations; err = -ENOMEM; oe = ovl_alloc_entry(numlower); @@ -997,7 +1066,6 @@ out_put_lower_mnt: kfree(ufs->lower_mnt); out_put_workdir: dput(ufs->workdir); -out_put_upper_mnt: mntput(ufs->upper_mnt); out_put_lowerpath: for (i = 0; i < numlower; i++) diff --git a/fs/pipe.c b/fs/pipe.c index 21981e58e2a6..8865f7963700 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -21,7 +21,6 @@ #include <linux/audit.h> #include <linux/syscalls.h> #include <linux/fcntl.h> -#include <linux/aio.h> #include 
<asm/uaccess.h> #include <asm/ioctls.h> @@ -628,7 +627,7 @@ static struct vfsmount *pipe_mnt __read_mostly; static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen) { return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]", - dentry->d_inode->i_ino); + d_inode(dentry)->i_ino); } static const struct dentry_operations pipefs_dentry_operations = { @@ -947,9 +946,7 @@ err: const struct file_operations pipefifo_fops = { .open = fifo_open, .llseek = no_llseek, - .read = new_sync_read, .read_iter = pipe_read, - .write = new_sync_write, .write_iter = pipe_write, .poll = pipe_poll, .unlocked_ioctl = pipe_ioctl, diff --git a/fs/pnode.c b/fs/pnode.c index 260ac8f898a4..6367e1e435c6 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -362,6 +362,46 @@ int propagate_mount_busy(struct mount *mnt, int refcnt) } /* + * Clear MNT_LOCKED when it can be shown to be safe. + * + * mount_lock lock must be held for write + */ +void propagate_mount_unlock(struct mount *mnt) +{ + struct mount *parent = mnt->mnt_parent; + struct mount *m, *child; + + BUG_ON(parent == mnt); + + for (m = propagation_next(parent, parent); m; + m = propagation_next(m, parent)) { + child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint); + if (child) + child->mnt.mnt_flags &= ~MNT_LOCKED; + } +} + +/* + * Mark all mounts that the MNT_LOCKED logic will allow to be unmounted. + */ +static void mark_umount_candidates(struct mount *mnt) +{ + struct mount *parent = mnt->mnt_parent; + struct mount *m; + + BUG_ON(parent == mnt); + + for (m = propagation_next(parent, parent); m; + m = propagation_next(m, parent)) { + struct mount *child = __lookup_mnt_last(&m->mnt, + mnt->mnt_mountpoint); + if (child && (!IS_MNT_LOCKED(child) || IS_MNT_MARKED(m))) { + SET_MNT_MARK(child); + } + } +} + +/* * NOTE: unmounting 'mnt' naturally propagates to all other mounts its * parent propagates to. */ @@ -378,13 +418,16 @@ static void __propagate_umount(struct mount *mnt) struct mount *child = __lookup_mnt_last(&m->mnt, mnt->mnt_mountpoint); /* - * umount the child only if the child has no - * other children + * umount the child only if the child has no children + * and the child is marked safe to unmount. 
*/ - if (child && list_empty(&child->mnt_mounts)) { + if (!child || !IS_MNT_MARKED(child)) + continue; + CLEAR_MNT_MARK(child); + if (list_empty(&child->mnt_mounts)) { list_del_init(&child->mnt_child); - hlist_del_init_rcu(&child->mnt_hash); - hlist_add_before_rcu(&child->mnt_hash, &mnt->mnt_hash); + child->mnt.mnt_flags |= MNT_UMOUNT; + list_move_tail(&child->mnt_list, &mnt->mnt_list); } } } @@ -396,11 +439,14 @@ static void __propagate_umount(struct mount *mnt) * * vfsmount lock must be held for write */ -int propagate_umount(struct hlist_head *list) +int propagate_umount(struct list_head *list) { struct mount *mnt; - hlist_for_each_entry(mnt, list, mnt_hash) + list_for_each_entry_reverse(mnt, list, mnt_list) + mark_umount_candidates(mnt); + + list_for_each_entry(mnt, list, mnt_list) __propagate_umount(mnt); return 0; } diff --git a/fs/pnode.h b/fs/pnode.h index 4a246358b031..0fcdbe7ca648 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -19,6 +19,7 @@ #define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED) #define SET_MNT_MARK(m) ((m)->mnt.mnt_flags |= MNT_MARKED) #define CLEAR_MNT_MARK(m) ((m)->mnt.mnt_flags &= ~MNT_MARKED) +#define IS_MNT_LOCKED(m) ((m)->mnt.mnt_flags & MNT_LOCKED) #define CL_EXPIRE 0x01 #define CL_SLAVE 0x02 @@ -40,14 +41,14 @@ static inline void set_mnt_shared(struct mount *mnt) void change_mnt_propagation(struct mount *, int); int propagate_mnt(struct mount *, struct mountpoint *, struct mount *, struct hlist_head *); -int propagate_umount(struct hlist_head *); +int propagate_umount(struct list_head *); int propagate_mount_busy(struct mount *, int); +void propagate_mount_unlock(struct mount *); void mnt_release_group_id(struct mount *); int get_dominating_id(struct mount *mnt, const struct path *root); unsigned int mnt_get_count(struct mount *mnt); void mnt_set_mountpoint(struct mount *, struct mountpoint *, struct mount *); -void umount_tree(struct mount *, int); struct mount *copy_tree(struct mount *, struct dentry *, int); bool is_path_reachable(struct mount *, struct dentry *, const struct path *root); diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 3a48bb789c9f..4fb17ded7d47 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -547,51 +547,45 @@ posix_acl_create(struct inode *dir, umode_t *mode, struct posix_acl **default_acl, struct posix_acl **acl) { struct posix_acl *p; + struct posix_acl *clone; int ret; + *acl = NULL; + *default_acl = NULL; + if (S_ISLNK(*mode) || !IS_POSIXACL(dir)) - goto no_acl; + return 0; p = get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(p)) { - if (p == ERR_PTR(-EOPNOTSUPP)) - goto apply_umask; - return PTR_ERR(p); + if (!p || p == ERR_PTR(-EOPNOTSUPP)) { + *mode &= ~current_umask(); + return 0; } + if (IS_ERR(p)) + return PTR_ERR(p); - if (!p) - goto apply_umask; - - *acl = posix_acl_clone(p, GFP_NOFS); - if (!*acl) + clone = posix_acl_clone(p, GFP_NOFS); + if (!clone) goto no_mem; - ret = posix_acl_create_masq(*acl, mode); + ret = posix_acl_create_masq(clone, mode); if (ret < 0) goto no_mem_clone; - if (ret == 0) { - posix_acl_release(*acl); - *acl = NULL; - } + if (ret == 0) + posix_acl_release(clone); + else + *acl = clone; - if (!S_ISDIR(*mode)) { + if (!S_ISDIR(*mode)) posix_acl_release(p); - *default_acl = NULL; - } else { + else *default_acl = p; - } - return 0; -apply_umask: - *mode &= ~current_umask(); -no_acl: - *default_acl = NULL; - *acl = NULL; return 0; no_mem_clone: - posix_acl_release(*acl); + posix_acl_release(clone); no_mem: posix_acl_release(p); return -ENOMEM; @@ -774,12 +768,12 @@ posix_acl_xattr_get(struct dentry 
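The posix_acl_create() rework above follows a general out-parameter discipline: NULL both outputs immediately, build everything in locals, and store through the outputs only once nothing can fail, so the error paths unwind locals rather than half-initialized results. The same shape in plain C (a toy function, nothing ACL-specific about it):

#include <stdlib.h>
#include <string.h>

/* Work on locals; publish to the out-parameters only on success. */
static int make_copies(const char *src, char **out_a, char **out_b)
{
        char *a, *b;

        *out_a = *out_b = NULL;
        a = strdup(src);
        if (!a)
                return -1;
        b = strdup(src);
        if (!b) {
                free(a);        /* only locals to unwind */
                return -1;
        }
        *out_a = a;
        *out_b = b;
        return 0;
}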
*dentry, const char *name, struct posix_acl *acl; int error; - if (!IS_POSIXACL(dentry->d_inode)) + if (!IS_POSIXACL(d_backing_inode(dentry))) return -EOPNOTSUPP; if (d_is_symlink(dentry)) return -EOPNOTSUPP; - acl = get_acl(dentry->d_inode, type); + acl = get_acl(d_backing_inode(dentry), type); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl == NULL) @@ -795,7 +789,7 @@ static int posix_acl_xattr_set(struct dentry *dentry, const char *name, const void *value, size_t size, int flags, int type) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_backing_inode(dentry); struct posix_acl *acl = NULL; int ret; @@ -834,7 +828,7 @@ posix_acl_xattr_list(struct dentry *dentry, char *list, size_t list_size, const char *xname; size_t size; - if (!IS_POSIXACL(dentry->d_inode)) + if (!IS_POSIXACL(d_backing_inode(dentry))) return -EOPNOTSUPP; if (d_is_symlink(dentry)) return -EOPNOTSUPP; diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index 2183fcf41d59..1ade1206bb89 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -71,3 +71,13 @@ config PROC_PAGE_MONITOR /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap, /proc/kpagecount, and /proc/kpageflags. Disabling these interfaces will reduce the size of the kernel by approximately 4kb. + +config PROC_CHILDREN + bool "Include /proc/<pid>/task/<tid>/children file" + default n + help + Provides a fast way to retrieve first level children pids of a task. See + <file:Documentation/filesystems/proc.txt> for more information. + + Say Y if you are running any user-space software which takes benefit from + this interface. For example, rkt is such a piece of software. diff --git a/fs/proc/array.c b/fs/proc/array.c index 1295a00ca316..ce065cf3104f 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -99,8 +99,8 @@ static inline void task_name(struct seq_file *m, struct task_struct *p) buf = m->buf + m->count; /* Ignore error for now */ - string_escape_str(tcomm, &buf, m->size - m->count, - ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\"); + buf += string_escape_str(tcomm, buf, m->size - m->count, + ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\"); m->count = buf - m->buf; seq_putc(m, '\n'); @@ -126,6 +126,14 @@ static inline const char *get_task_state(struct task_struct *tsk) { unsigned int state = (tsk->state | tsk->exit_state) & TASK_REPORT; + /* + * Parked tasks do not run; they sit in __kthread_parkme(). + * Without this check, we would report them as running, which is + * clearly wrong, so we report them as sleeping instead. 
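task_name() adapts to string_escape_str() returning the number of characters written rather than updating a char ** in place; the caller now advances its own cursor by the return value, as the hunk above does with buf. A hedged sketch of the calling convention (append_escaped() is a hypothetical wrapper):

#include <linux/string_helpers.h>

/* Returns how many bytes of buf were consumed; the caller advances its
 * cursor by this amount, mirroring the task_name() fix above. */
static int append_escaped(char *buf, size_t size, const char *name)
{
        return string_escape_str(name, buf, size,
                                 ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
}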
+ */ + if (tsk->state == TASK_PARKED) + state = TASK_INTERRUPTIBLE; + BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array)-1); return task_state_array[fls(state)]; @@ -188,6 +196,24 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, from_kgid_munged(user_ns, GROUP_AT(group_info, g))); put_cred(cred); +#ifdef CONFIG_PID_NS + seq_puts(m, "\nNStgid:"); + for (g = ns->level; g <= pid->level; g++) + seq_printf(m, "\t%d", + task_tgid_nr_ns(p, pid->numbers[g].ns)); + seq_puts(m, "\nNSpid:"); + for (g = ns->level; g <= pid->level; g++) + seq_printf(m, "\t%d", + task_pid_nr_ns(p, pid->numbers[g].ns)); + seq_puts(m, "\nNSpgid:"); + for (g = ns->level; g <= pid->level; g++) + seq_printf(m, "\t%d", + task_pgrp_nr_ns(p, pid->numbers[g].ns)); + seq_puts(m, "\nNSsid:"); + for (g = ns->level; g <= pid->level; g++) + seq_printf(m, "\t%d", + task_session_nr_ns(p, pid->numbers[g].ns)); +#endif seq_putc(m, '\n'); } @@ -551,7 +577,7 @@ int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, return 0; } -#ifdef CONFIG_CHECKPOINT_RESTORE +#ifdef CONFIG_PROC_CHILDREN static struct pid * get_children_pid(struct inode *inode, struct pid *pid_prev, loff_t pos) { @@ -614,7 +640,9 @@ static int children_seq_show(struct seq_file *seq, void *v) pid_t pid; pid = pid_nr_ns(v, inode->i_sb->s_fs_info); - return seq_printf(seq, "%d ", pid); + seq_printf(seq, "%d ", pid); + + return 0; } static void *children_seq_start(struct seq_file *seq, loff_t *pos) @@ -672,4 +700,4 @@ const struct file_operations proc_tid_children_operations = { .llseek = seq_lseek, .release = children_seq_release, }; -#endif /* CONFIG_CHECKPOINT_RESTORE */ +#endif /* CONFIG_PROC_CHILDREN */ diff --git a/fs/proc/base.c b/fs/proc/base.c index 3f3d7aeb0712..aa50d1ac28fc 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -169,7 +169,7 @@ static int get_task_root(struct task_struct *task, struct path *root) static int proc_cwd_link(struct dentry *dentry, struct path *path) { - struct task_struct *task = get_proc_task(dentry->d_inode); + struct task_struct *task = get_proc_task(d_inode(dentry)); int result = -ENOENT; if (task) { @@ -186,7 +186,7 @@ static int proc_cwd_link(struct dentry *dentry, struct path *path) static int proc_root_link(struct dentry *dentry, struct path *path) { - struct task_struct *task = get_proc_task(dentry->d_inode); + struct task_struct *task = get_proc_task(d_inode(dentry)); int result = -ENOENT; if (task) { @@ -196,18 +196,210 @@ static int proc_root_link(struct dentry *dentry, struct path *path) return result; } -static int proc_pid_cmdline(struct seq_file *m, struct pid_namespace *ns, - struct pid *pid, struct task_struct *task) +static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, + size_t _count, loff_t *pos) { + struct task_struct *tsk; + struct mm_struct *mm; + char *page; + unsigned long count = _count; + unsigned long arg_start, arg_end, env_start, env_end; + unsigned long len1, len2, len; + unsigned long p; + char c; + ssize_t rv; + + BUG_ON(*pos < 0); + + tsk = get_proc_task(file_inode(file)); + if (!tsk) + return -ESRCH; + mm = get_task_mm(tsk); + put_task_struct(tsk); + if (!mm) + return 0; + /* Check if process spawned far enough to have cmdline. 
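proc_pid_cmdline_read() replaces the old single-buffer seq_file approach: rather than one get_cmdline() into a PAGE_SIZE buffer, it walks the target mm with access_remote_vm() a page at a time and honours *pos, so arbitrarily long command lines are fully readable. The core copy loop, simplified (no argv/envp boundary or NUL handling, unlike the real function):

#include <linux/mm.h>           /* access_remote_vm */
#include <linux/uaccess.h>      /* copy_to_user */

/* Copy at most a page per iteration from the remote mm, stopping on
 * error, short read, or a full user buffer. */
static ssize_t copy_range(struct mm_struct *mm, unsigned long p,
                          unsigned long len, char __user *buf,
                          size_t count, char *page)
{
        ssize_t rv = 0;

        while (count > 0 && len > 0) {
                unsigned int chunk = min3(count, (size_t)len, (size_t)PAGE_SIZE);
                int nr_read = access_remote_vm(mm, p, page, chunk, 0);

                if (nr_read <= 0)
                        return rv ? rv : nr_read;
                if (copy_to_user(buf, page, nr_read))
                        return -EFAULT;
                p += nr_read;
                len -= nr_read;
                buf += nr_read;
                count -= nr_read;
                rv += nr_read;
        }
        return rv;
}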
*/ + if (!mm->env_end) { + rv = 0; + goto out_mmput; + } + + page = (char *)__get_free_page(GFP_TEMPORARY); + if (!page) { + rv = -ENOMEM; + goto out_mmput; + } + + down_read(&mm->mmap_sem); + arg_start = mm->arg_start; + arg_end = mm->arg_end; + env_start = mm->env_start; + env_end = mm->env_end; + up_read(&mm->mmap_sem); + + BUG_ON(arg_start > arg_end); + BUG_ON(env_start > env_end); + + len1 = arg_end - arg_start; + len2 = env_end - env_start; + + /* Empty ARGV. */ + if (len1 == 0) { + rv = 0; + goto out_free_page; + } /* - * Rely on struct seq_operations::show() being called once - * per internal buffer allocation. See single_open(), traverse(). + * Inherently racy -- command line shares address space + * with code and data. */ - BUG_ON(m->size < PAGE_SIZE); - m->count += get_cmdline(task, m->buf, PAGE_SIZE); - return 0; + rv = access_remote_vm(mm, arg_end - 1, &c, 1, 0); + if (rv <= 0) + goto out_free_page; + + rv = 0; + + if (c == '\0') { + /* Command line (set of strings) occupies whole ARGV. */ + if (len1 <= *pos) + goto out_free_page; + + p = arg_start + *pos; + len = len1 - *pos; + while (count > 0 && len > 0) { + unsigned int _count; + int nr_read; + + _count = min3(count, len, PAGE_SIZE); + nr_read = access_remote_vm(mm, p, page, _count, 0); + if (nr_read < 0) + rv = nr_read; + if (nr_read <= 0) + goto out_free_page; + + if (copy_to_user(buf, page, nr_read)) { + rv = -EFAULT; + goto out_free_page; + } + + p += nr_read; + len -= nr_read; + buf += nr_read; + count -= nr_read; + rv += nr_read; + } + } else { + /* + * Command line (1 string) occupies ARGV and maybe + * extends into ENVP. + */ + if (len1 + len2 <= *pos) + goto skip_argv_envp; + if (len1 <= *pos) + goto skip_argv; + + p = arg_start + *pos; + len = len1 - *pos; + while (count > 0 && len > 0) { + unsigned int _count, l; + int nr_read; + bool final; + + _count = min3(count, len, PAGE_SIZE); + nr_read = access_remote_vm(mm, p, page, _count, 0); + if (nr_read < 0) + rv = nr_read; + if (nr_read <= 0) + goto out_free_page; + + /* + * Command line can be shorter than whole ARGV + * even if last "marker" byte says it is not. + */ + final = false; + l = strnlen(page, nr_read); + if (l < nr_read) { + nr_read = l; + final = true; + } + + if (copy_to_user(buf, page, nr_read)) { + rv = -EFAULT; + goto out_free_page; + } + + p += nr_read; + len -= nr_read; + buf += nr_read; + count -= nr_read; + rv += nr_read; + + if (final) + goto out_free_page; + } +skip_argv: + /* + * Command line (1 string) occupies ARGV and + * extends into ENVP. + */ + if (len1 <= *pos) { + p = env_start + *pos - len1; + len = len1 + len2 - *pos; + } else { + p = env_start; + len = len2; + } + while (count > 0 && len > 0) { + unsigned int _count, l; + int nr_read; + bool final; + + _count = min3(count, len, PAGE_SIZE); + nr_read = access_remote_vm(mm, p, page, _count, 0); + if (nr_read < 0) + rv = nr_read; + if (nr_read <= 0) + goto out_free_page; + + /* Find EOS. 
*/ + final = false; + l = strnlen(page, nr_read); + if (l < nr_read) { + nr_read = l; + final = true; + } + + if (copy_to_user(buf, page, nr_read)) { + rv = -EFAULT; + goto out_free_page; + } + + p += nr_read; + len -= nr_read; + buf += nr_read; + count -= nr_read; + rv += nr_read; + + if (final) + goto out_free_page; + } +skip_argv_envp: + ; + } + +out_free_page: + free_page((unsigned long)page); +out_mmput: + mmput(mm); + if (rv > 0) + *pos += rv; + return rv; } +static const struct file_operations proc_pid_cmdline_ops = { + .read = proc_pid_cmdline_read, + .llseek = generic_file_llseek, +}; + static int proc_pid_auxv(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { @@ -238,13 +430,15 @@ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns, wchan = get_wchan(task); - if (lookup_symbol_name(wchan, symname) < 0) + if (lookup_symbol_name(wchan, symname) < 0) { if (!ptrace_may_access(task, PTRACE_MODE_READ)) return 0; - else - return seq_printf(m, "%lu", wchan); - else - return seq_printf(m, "%s", symname); + seq_printf(m, "%lu", wchan); + } else { + seq_printf(m, "%s", symname); + } + + return 0; } #endif /* CONFIG_KALLSYMS */ @@ -302,17 +496,22 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, } #endif -#ifdef CONFIG_SCHEDSTATS +#ifdef CONFIG_SCHED_INFO /* * Provides /proc/PID/schedstat */ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { - return seq_printf(m, "%llu %llu %lu\n", - (unsigned long long)task->se.sum_exec_runtime, - (unsigned long long)task->sched_info.run_delay, - task->sched_info.pcount); + if (unlikely(!sched_info_on())) + seq_printf(m, "0 0 0\n"); + else + seq_printf(m, "%llu %llu %lu\n", + (unsigned long long)task->se.sum_exec_runtime, + (unsigned long long)task->sched_info.run_delay, + task->sched_info.pcount); + + return 0; } #endif @@ -387,7 +586,9 @@ static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns, points = oom_badness(task, NULL, NULL, totalpages) * 1000 / totalpages; read_unlock(&tasklist_lock); - return seq_printf(m, "%lu\n", points); + seq_printf(m, "%lu\n", points); + + return 0; } struct limit_names { @@ -432,15 +633,15 @@ static int proc_pid_limits(struct seq_file *m, struct pid_namespace *ns, * print the file header */ seq_printf(m, "%-25s %-20s %-20s %-10s\n", - "Limit", "Soft Limit", "Hard Limit", "Units"); + "Limit", "Soft Limit", "Hard Limit", "Units"); for (i = 0; i < RLIM_NLIMITS; i++) { if (rlim[i].rlim_cur == RLIM_INFINITY) seq_printf(m, "%-25s %-20s ", - lnames[i].name, "unlimited"); + lnames[i].name, "unlimited"); else seq_printf(m, "%-25s %-20lu ", - lnames[i].name, rlim[i].rlim_cur); + lnames[i].name, rlim[i].rlim_cur); if (rlim[i].rlim_max == RLIM_INFINITY) seq_printf(m, "%-20s ", "unlimited"); @@ -462,7 +663,9 @@ static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns, { long nr; unsigned long args[6], sp, pc; - int res = lock_trace(task); + int res; + + res = lock_trace(task); if (res) return res; @@ -477,7 +680,8 @@ static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns, args[0], args[1], args[2], args[3], args[4], args[5], sp, pc); unlock_trace(task); - return res; + + return 0; } #endif /* CONFIG_HAVE_ARCH_TRACEHOOK */ @@ -505,7 +709,7 @@ static int proc_fd_access_allowed(struct inode *inode) int proc_setattr(struct dentry *dentry, struct iattr *attr) { int error; - struct inode *inode = dentry->d_inode; + struct inode *inode = 
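The wchan, schedstat, oom_score and limits conversions in this hunk are all one change: seq_printf() no longer returns a meaningful value, so show-style helpers print and then return 0 explicitly. Minimal post-conversion shape (demo_show() is hypothetical):

#include <linux/seq_file.h>

/* Print, return 0: the seq_file core detects overflow itself and
 * retries with a larger buffer, so no return value is needed here. */
static int demo_show(struct seq_file *m, void *v)
{
        seq_printf(m, "value: %d\n", 42);
        return 0;
}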
d_inode(dentry); if (attr->ia_valid & ATTR_MODE) return -EPERM; @@ -1353,7 +1557,7 @@ static int proc_exe_link(struct dentry *dentry, struct path *exe_path) struct mm_struct *mm; struct file *exe_file; - task = get_proc_task(dentry->d_inode); + task = get_proc_task(d_inode(dentry)); if (!task) return -ENOENT; mm = get_task_mm(task); @@ -1371,9 +1575,9 @@ static int proc_exe_link(struct dentry *dentry, struct path *exe_path) return -ENOENT; } -static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *proc_pid_follow_link(struct dentry *dentry, void **cookie) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct path path; int error = -EACCES; @@ -1385,7 +1589,7 @@ static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) if (error) goto out; - nd_jump_link(nd, &path); + nd_jump_link(&path); return NULL; out: return ERR_PTR(error); @@ -1418,7 +1622,7 @@ static int do_proc_readlink(struct path *path, char __user *buffer, int buflen) static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int buflen) { int error = -EACCES; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct path path; /* Are we allowed to snoop on the tasks file descriptors? */ @@ -1488,7 +1692,7 @@ out_unlock: int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct task_struct *task; const struct cred *cred; struct pid_namespace *pid = dentry->d_sb->s_fs_info; @@ -1545,7 +1749,7 @@ int pid_revalidate(struct dentry *dentry, unsigned int flags) if (flags & LOOKUP_RCU) return -ECHILD; - inode = dentry->d_inode; + inode = d_inode(dentry); task = get_proc_task(inode); if (task) { @@ -1579,7 +1783,7 @@ int pid_delete_dentry(const struct dentry *dentry) * If so, then don't put the dentry on the lru list, * kill it immediately. */ - return proc_inode_is_dead(dentry->d_inode); + return proc_inode_is_dead(d_inode(dentry)); } const struct dentry_operations pid_dentry_operations = @@ -1617,12 +1821,12 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx, child = d_alloc(dir, &qname); if (!child) goto end_instantiate; - if (instantiate(dir->d_inode, child, task, ptr) < 0) { + if (instantiate(d_inode(dir), child, task, ptr) < 0) { dput(child); goto end_instantiate; } } - inode = child->d_inode; + inode = d_inode(child); ino = inode->i_ino; type = inode->i_mode >> 12; dput(child); @@ -1665,7 +1869,7 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags) goto out_notask; } - inode = dentry->d_inode; + inode = d_inode(dentry); task = get_proc_task(inode); if (!task) goto out_notask; @@ -1718,7 +1922,7 @@ static int proc_map_files_get_link(struct dentry *dentry, struct path *path) int rc; rc = -ENOENT; - task = get_proc_task(dentry->d_inode); + task = get_proc_task(d_inode(dentry)); if (!task) goto out; @@ -2002,12 +2206,13 @@ static int show_timer(struct seq_file *m, void *v) notify = timer->it_sigev_notify; seq_printf(m, "ID: %d\n", timer->it_id); - seq_printf(m, "signal: %d/%p\n", timer->sigq->info.si_signo, - timer->sigq->info.si_value.sival_ptr); + seq_printf(m, "signal: %d/%p\n", + timer->sigq->info.si_signo, + timer->sigq->info.si_value.sival_ptr); seq_printf(m, "notify: %s/%s.%d\n", - nstr[notify & ~SIGEV_THREAD_ID], - (notify & SIGEV_THREAD_ID) ? 
"tid" : "pid", - pid_nr_ns(timer->it_pid, tp->ns)); + nstr[notify & ~SIGEV_THREAD_ID], + (notify & SIGEV_THREAD_ID) ? "tid" : "pid", + pid_nr_ns(timer->it_pid, tp->ns)); seq_printf(m, "ClockID: %d\n", timer->it_clock); return 0; @@ -2352,21 +2557,23 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh unlock_task_sighand(task, &flags); } - result = seq_printf(m, - "rchar: %llu\n" - "wchar: %llu\n" - "syscr: %llu\n" - "syscw: %llu\n" - "read_bytes: %llu\n" - "write_bytes: %llu\n" - "cancelled_write_bytes: %llu\n", - (unsigned long long)acct.rchar, - (unsigned long long)acct.wchar, - (unsigned long long)acct.syscr, - (unsigned long long)acct.syscw, - (unsigned long long)acct.read_bytes, - (unsigned long long)acct.write_bytes, - (unsigned long long)acct.cancelled_write_bytes); + seq_printf(m, + "rchar: %llu\n" + "wchar: %llu\n" + "syscr: %llu\n" + "syscw: %llu\n" + "read_bytes: %llu\n" + "write_bytes: %llu\n" + "cancelled_write_bytes: %llu\n", + (unsigned long long)acct.rchar, + (unsigned long long)acct.wchar, + (unsigned long long)acct.syscr, + (unsigned long long)acct.syscw, + (unsigned long long)acct.read_bytes, + (unsigned long long)acct.write_bytes, + (unsigned long long)acct.cancelled_write_bytes); + result = 0; + out_unlock: mutex_unlock(&task->signal->cred_guard_mutex); return result; @@ -2560,7 +2767,7 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_HAVE_ARCH_TRACEHOOK ONE("syscall", S_IRUSR, proc_pid_syscall), #endif - ONE("cmdline", S_IRUGO, proc_pid_cmdline), + REG("cmdline", S_IRUGO, proc_pid_cmdline_ops), ONE("stat", S_IRUGO, proc_tgid_stat), ONE("statm", S_IRUGO, proc_pid_statm), REG("maps", S_IRUGO, proc_pid_maps_operations), @@ -2588,7 +2795,7 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_STACKTRACE ONE("stack", S_IRUSR, proc_pid_stack), #endif -#ifdef CONFIG_SCHEDSTATS +#ifdef CONFIG_SCHED_INFO ONE("schedstat", S_IRUGO, proc_pid_schedstat), #endif #ifdef CONFIG_LATENCYTOP @@ -2851,13 +3058,13 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx) return 0; if (pos == TGID_OFFSET - 2) { - struct inode *inode = ns->proc_self->d_inode; + struct inode *inode = d_inode(ns->proc_self); if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK)) return 0; ctx->pos = pos = pos + 1; } if (pos == TGID_OFFSET - 1) { - struct inode *inode = ns->proc_thread_self->d_inode; + struct inode *inode = d_inode(ns->proc_thread_self); if (!dir_emit(ctx, "thread-self", 11, inode->i_ino, DT_LNK)) return 0; ctx->pos = pos = pos + 1; @@ -2906,11 +3113,11 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_HAVE_ARCH_TRACEHOOK ONE("syscall", S_IRUSR, proc_pid_syscall), #endif - ONE("cmdline", S_IRUGO, proc_pid_cmdline), + REG("cmdline", S_IRUGO, proc_pid_cmdline_ops), ONE("stat", S_IRUGO, proc_tid_stat), ONE("statm", S_IRUGO, proc_pid_statm), REG("maps", S_IRUGO, proc_tid_maps_operations), -#ifdef CONFIG_CHECKPOINT_RESTORE +#ifdef CONFIG_PROC_CHILDREN REG("children", S_IRUGO, proc_tid_children_operations), #endif #ifdef CONFIG_NUMA @@ -2936,7 +3143,7 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_STACKTRACE ONE("stack", S_IRUSR, proc_pid_stack), #endif -#ifdef CONFIG_SCHEDSTATS +#ifdef CONFIG_SCHED_INFO ONE("schedstat", S_IRUGO, proc_pid_schedstat), #endif #ifdef CONFIG_LATENCYTOP @@ -3176,7 +3383,7 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx) static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - struct inode 
*inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct task_struct *p = get_proc_task(inode); generic_fillattr(inode, stat); diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 8e5ad83b629a..6e5fcd00733e 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -8,6 +8,7 @@ #include <linux/security.h> #include <linux/file.h> #include <linux/seq_file.h> +#include <linux/fs.h> #include <linux/proc_fs.h> @@ -48,17 +49,23 @@ static int seq_show(struct seq_file *m, void *v) put_files_struct(files); } - if (!ret) { - seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\n", - (long long)file->f_pos, f_flags, - real_mount(file->f_path.mnt)->mnt_id); - if (file->f_op->show_fdinfo) - file->f_op->show_fdinfo(m, file); - ret = seq_has_overflowed(m); - fput(file); - } + if (ret) + return ret; - return ret; + seq_printf(m, "pos:\t%lli\nflags:\t0%o\nmnt_id:\t%i\n", + (long long)file->f_pos, f_flags, + real_mount(file->f_path.mnt)->mnt_id); + + show_fd_locks(m, file, files); + if (seq_has_overflowed(m)) + goto out; + + if (file->f_op->show_fdinfo) + file->f_op->show_fdinfo(m, file); + +out: + fput(file); + return 0; } static int seq_fdinfo_open(struct inode *inode, struct file *file) @@ -84,7 +91,7 @@ static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags) if (flags & LOOKUP_RCU) return -ECHILD; - inode = dentry->d_inode; + inode = d_inode(dentry); task = get_proc_task(inode); fd = proc_fd(inode); @@ -144,14 +151,14 @@ static int proc_fd_link(struct dentry *dentry, struct path *path) struct task_struct *task; int ret = -ENOENT; - task = get_proc_task(dentry->d_inode); + task = get_proc_task(d_inode(dentry)); if (task) { files = get_files_struct(task); put_task_struct(task); } if (files) { - int fd = proc_fd(dentry->d_inode); + int fd = proc_fd(d_inode(dentry)); struct file *fd_file; spin_lock(&files->file_lock); diff --git a/fs/proc/generic.c b/fs/proc/generic.c index be65b2082135..e5dee5c3188e 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -101,7 +101,7 @@ static bool pde_subdir_insert(struct proc_dir_entry *dir, static int proc_notify_change(struct dentry *dentry, struct iattr *iattr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct proc_dir_entry *de = PDE(inode); int error; @@ -120,7 +120,7 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr) static int proc_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct proc_dir_entry *de = PDE(inode); if (de && de->nlink) set_nlink(inode, de->nlink); @@ -373,6 +373,10 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, WARN(1, "create '/proc/%s' by hand\n", qstr.name); return NULL; } + if (is_empty_pde(*parent)) { + WARN(1, "attempt to add to permanently empty directory"); + return NULL; + } ent = kzalloc(sizeof(struct proc_dir_entry) + qstr.len + 1, GFP_KERNEL); if (!ent) @@ -455,6 +459,25 @@ struct proc_dir_entry *proc_mkdir(const char *name, } EXPORT_SYMBOL(proc_mkdir); +struct proc_dir_entry *proc_create_mount_point(const char *name) +{ + umode_t mode = S_IFDIR | S_IRUGO | S_IXUGO; + struct proc_dir_entry *ent, *parent = NULL; + + ent = __proc_create(&parent, name, mode, 2); + if (ent) { + ent->data = NULL; + ent->proc_fops = NULL; + ent->proc_iops = NULL; + if (proc_register(parent, ent) < 0) { + kfree(ent); + parent->nlink--; + ent = NULL; + } + } + return ent; +} + struct proc_dir_entry *proc_create_data(const char *name, 
umode_t mode, struct proc_dir_entry *parent, const struct file_operations *proc_fops, diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 7697b6621cfd..bd95b9fdebb0 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -23,7 +23,6 @@ #include <linux/slab.h> #include <linux/mount.h> #include <linux/magic.h> -#include <linux/namei.h> #include <asm/uaccess.h> @@ -394,16 +393,16 @@ static const struct file_operations proc_reg_file_ops_no_compat = { }; #endif -static void *proc_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *proc_follow_link(struct dentry *dentry, void **cookie) { - struct proc_dir_entry *pde = PDE(dentry->d_inode); + struct proc_dir_entry *pde = PDE(d_inode(dentry)); if (unlikely(!use_pde(pde))) return ERR_PTR(-EINVAL); - nd_set_link(nd, pde->data); - return pde; + *cookie = pde; + return pde->data; } -static void proc_put_link(struct dentry *dentry, struct nameidata *nd, void *p) +static void proc_put_link(struct inode *unused, void *p) { unuse_pde(p); } @@ -423,6 +422,10 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; PROC_I(inode)->pde = de; + if (is_empty_pde(de)) { + make_empty_dir_inode(inode); + return inode; + } if (de->mode) { inode->i_mode = de->mode; inode->i_uid = de->uid; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index c835b94c0cd3..aa2781095bd1 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -191,6 +191,12 @@ static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde) } extern void pde_put(struct proc_dir_entry *); +static inline bool is_empty_pde(const struct proc_dir_entry *pde) +{ + return S_ISDIR(pde->mode) && !pde->proc_iops; +} +struct proc_dir_entry *proc_create_mount_point(const char *name); + /* * inode.c */ diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 91a4e6426321..92e6726f6e37 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -92,7 +92,7 @@ static size_t get_kcore_size(int *nphdr, size_t *elf_buflen) roundup(sizeof(CORE_STR), 4)) + roundup(sizeof(struct elf_prstatus), 4) + roundup(sizeof(struct elf_prpsinfo), 4) + - roundup(sizeof(struct task_struct), 4); + roundup(arch_task_struct_size, 4); *elf_buflen = PAGE_ALIGN(*elf_buflen); return size + *elf_buflen; } @@ -415,7 +415,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff) /* set up the task structure */ notes[2].name = CORE_STR; notes[2].type = NT_TASKSTRUCT; - notes[2].datasz = sizeof(struct task_struct); + notes[2].datasz = arch_task_struct_size; notes[2].data = current; nhdr->p_filesz += notesize(&notes[2]); diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index c9eac4563fa8..f6e8354b8cea 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -30,9 +30,9 @@ static const struct proc_ns_operations *ns_entries[] = { &mntns_operations, }; -static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *proc_ns_follow_link(struct dentry *dentry, void **cookie) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops; struct task_struct *task; struct path ns_path; int error = -EACCES; @@ -45,7 +45,7 @@ static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd) if (ptrace_may_access(task, PTRACE_MODE_READ)) { error = ns_get_path(&ns_path, task, ns_ops); if (!error) - nd_jump_link(nd, &ns_path); + nd_jump_link(&ns_path); } put_task_struct(task); return error; @@ -53,7 +53,7
@@ static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd) static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int buflen) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops; struct task_struct *task; char name[50]; diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index d4a35746cab9..f8595e8b5cd0 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -64,7 +64,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region) if (file) { seq_pad(m, ' '); - seq_path(m, &file->f_path, ""); + seq_file_path(m, file, ""); } seq_putc(m, '\n'); diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 1bde894bc624..350984a19c83 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -142,7 +142,7 @@ static struct dentry *proc_tgid_net_lookup(struct inode *dir, static int proc_tgid_net_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct net *net; net = get_proc_task_net(inode); diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index f92d5dd578a4..fdda62e6115e 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -19,6 +19,28 @@ static const struct inode_operations proc_sys_inode_operations; static const struct file_operations proc_sys_dir_file_operations; static const struct inode_operations proc_sys_dir_operations; +/* Support for permanently empty directories */ + +struct ctl_table sysctl_mount_point[] = { + { } +}; + +static bool is_empty_dir(struct ctl_table_header *head) +{ + return head->ctl_table[0].child == sysctl_mount_point; +} + +static void set_empty_dir(struct ctl_dir *dir) +{ + dir->header.ctl_table[0].child = sysctl_mount_point; +} + +static void clear_empty_dir(struct ctl_dir *dir) + +{ + dir->header.ctl_table[0].child = NULL; +} + void proc_sys_poll_notify(struct ctl_table_poll *poll) { if (!poll) @@ -187,6 +209,17 @@ static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header) struct ctl_table *entry; int err; + /* Is this a permanently empty directory? */ + if (is_empty_dir(&dir->header)) + return -EROFS; + + /* Am I creating a permanently empty directory? 
*/ + if (header->ctl_table == sysctl_mount_point) { + if (!RB_EMPTY_ROOT(&dir->root)) + return -EINVAL; + set_empty_dir(dir); + } + dir->header.nreg++; header->parent = dir; err = insert_links(header); @@ -202,6 +235,8 @@ fail: erase_header(header); put_links(header); fail_links: + if (header->ctl_table == sysctl_mount_point) + clear_empty_dir(dir); header->parent = NULL; drop_sysctl_table(&dir->header); return err; @@ -419,6 +454,8 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, inode->i_mode |= S_IFDIR; inode->i_op = &proc_sys_dir_operations; inode->i_fop = &proc_sys_dir_file_operations; + if (is_empty_dir(head)) + make_empty_dir_inode(inode); } out: return inode; @@ -604,7 +641,7 @@ static bool proc_sys_fill_cache(struct file *file, return false; } } - inode = child->d_inode; + inode = d_inode(child); ino = inode->i_ino; type = inode->i_mode >> 12; dput(child); @@ -710,7 +747,7 @@ static int proc_sys_permission(struct inode *inode, int mask) static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int error; if (attr->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)) @@ -727,7 +764,7 @@ static int proc_sys_setattr(struct dentry *dentry, struct iattr *attr) static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ctl_table_header *head = grab_header(inode); struct ctl_table *table = PROC_I(inode)->sysctl_entry; @@ -773,12 +810,12 @@ static int proc_sys_revalidate(struct dentry *dentry, unsigned int flags) { if (flags & LOOKUP_RCU) return -ECHILD; - return !PROC_I(dentry->d_inode)->sysctl->unregistering; + return !PROC_I(d_inode(dentry))->sysctl->unregistering; } static int proc_sys_delete(const struct dentry *dentry) { - return !!PROC_I(dentry->d_inode)->sysctl->unregistering; + return !!PROC_I(d_inode(dentry))->sysctl->unregistering; } static int sysctl_is_seen(struct ctl_table_header *p) @@ -805,7 +842,7 @@ static int proc_sys_compare(const struct dentry *parent, const struct dentry *de /* Although proc doesn't have negative dentries, rcu-walk means * that inode here can be NULL */ /* AV: can it, indeed? */ - inode = ACCESS_ONCE(dentry->d_inode); + inode = d_inode_rcu(dentry); if (!inode) return 1; if (name->len != len) diff --git a/fs/proc/root.c b/fs/proc/root.c index e74ac9f1a2c0..68feb0f70e63 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -112,9 +112,6 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, ns = task_active_pid_ns(current); options = data; - if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type)) - return ERR_PTR(-EPERM); - /* Does the mounter have privilege over the pid namespace? 
*/ if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); @@ -159,7 +156,7 @@ static struct file_system_type proc_fs_type = { .name = "proc", .mount = proc_mount, .kill_sb = proc_kill_sb, - .fs_flags = FS_USERNS_MOUNT, + .fs_flags = FS_USERNS_VISIBLE | FS_USERNS_MOUNT, }; void __init proc_root_init(void) @@ -182,10 +179,10 @@ void __init proc_root_init(void) #endif proc_mkdir("fs", NULL); proc_mkdir("driver", NULL); - proc_mkdir("fs/nfsd", NULL); /* somewhere for the nfsd filesystem to be mounted */ + proc_create_mount_point("fs/nfsd"); /* somewhere for the nfsd filesystem to be mounted */ #if defined(CONFIG_SUN_OPENPROMFS) || defined(CONFIG_SUN_OPENPROMFS_MODULE) /* just give it a mountpoint */ - proc_mkdir("openprom", NULL); + proc_create_mount_point("openprom"); #endif proc_tty_init(); proc_mkdir("bus", NULL); @@ -195,7 +192,7 @@ void __init proc_root_init(void) static int proc_root_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat ) { - generic_fillattr(dentry->d_inode, stat); + generic_fillattr(d_inode(dentry), stat); stat->nlink = proc_root.nlink + nr_processes(); return 0; } diff --git a/fs/proc/self.c b/fs/proc/self.c index 4348bb8907c2..113b8d061fc0 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -1,5 +1,4 @@ #include <linux/sched.h> -#include <linux/namei.h> #include <linux/slab.h> #include <linux/pid_namespace.h> #include "internal.h" @@ -19,21 +18,20 @@ static int proc_self_readlink(struct dentry *dentry, char __user *buffer, return readlink_copy(buffer, buflen, tmp); } -static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *proc_self_follow_link(struct dentry *dentry, void **cookie) { struct pid_namespace *ns = dentry->d_sb->s_fs_info; pid_t tgid = task_tgid_nr_ns(current, ns); - char *name = ERR_PTR(-ENOENT); - if (tgid) { - /* 11 for max length of signed int in decimal + NULL term */ - name = kmalloc(12, GFP_KERNEL); - if (!name) - name = ERR_PTR(-ENOMEM); - else - sprintf(name, "%d", tgid); - } - nd_set_link(nd, name); - return NULL; + char *name; + + if (!tgid) + return ERR_PTR(-ENOENT); + /* 11 for max length of signed int in decimal + NULL term */ + name = kmalloc(12, GFP_KERNEL); + if (!name) + return ERR_PTR(-ENOMEM); + sprintf(name, "%d", tgid); + return *cookie = name; } static const struct inode_operations proc_self_inode_operations = { @@ -46,7 +44,7 @@ static unsigned self_inum; int proc_setup_self(struct super_block *s) { - struct inode *root_inode = s->s_root->d_inode; + struct inode *root_inode = d_inode(s->s_root); struct pid_namespace *ns = s->s_fs_info; struct dentry *self; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 6dee68d013ff..ca1e091881d4 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -310,7 +310,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) */ if (file) { seq_pad(m, ' '); - seq_path(m, &file->f_path, "\n"); + seq_file_path(m, file, "\n"); goto done; } @@ -1509,7 +1509,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) if (file) { seq_puts(m, " file="); - seq_path(m, &file->f_path, "\n\t= "); + seq_file_path(m, file, "\n\t= "); } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { seq_puts(m, " heap"); } else { diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 599ec2e20104..e0d64c92e4f6 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -180,7 +180,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma, if (file) { seq_pad(m, 
' '); - seq_path(m, &file->f_path, ""); + seq_file_path(m, file, ""); } else if (mm) { pid_t tid = pid_of_stack(priv, vma, is_pid); diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c index 59075b509df3..947b0f4fd0a1 100644 --- a/fs/proc/thread_self.c +++ b/fs/proc/thread_self.c @@ -1,5 +1,4 @@ #include <linux/sched.h> -#include <linux/namei.h> #include <linux/slab.h> #include <linux/pid_namespace.h> #include "internal.h" @@ -20,21 +19,20 @@ static int proc_thread_self_readlink(struct dentry *dentry, char __user *buffer, return readlink_copy(buffer, buflen, tmp); } -static void *proc_thread_self_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *proc_thread_self_follow_link(struct dentry *dentry, void **cookie) { struct pid_namespace *ns = dentry->d_sb->s_fs_info; pid_t tgid = task_tgid_nr_ns(current, ns); pid_t pid = task_pid_nr_ns(current, ns); - char *name = ERR_PTR(-ENOENT); - if (pid) { - name = kmalloc(PROC_NUMBUF + 6 + PROC_NUMBUF, GFP_KERNEL); - if (!name) - name = ERR_PTR(-ENOMEM); - else - sprintf(name, "%d/task/%d", tgid, pid); - } - nd_set_link(nd, name); - return NULL; + char *name; + + if (!pid) + return ERR_PTR(-ENOENT); + name = kmalloc(PROC_NUMBUF + 6 + PROC_NUMBUF, GFP_KERNEL); + if (!name) + return ERR_PTR(-ENOMEM); + sprintf(name, "%d/task/%d", tgid, pid); + return *cookie = name; } static const struct inode_operations proc_thread_self_inode_operations = { @@ -47,7 +45,7 @@ static unsigned thread_self_inum; int proc_setup_thread_self(struct super_block *s) { - struct inode *root_inode = s->s_root->d_inode; + struct inode *root_inode = d_inode(s->s_root); struct pid_namespace *ns = s->s_fs_info; struct dentry *thread_self; diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 8db932da4009..8ebd9a334085 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -17,7 +17,8 @@ static unsigned mounts_poll(struct file *file, poll_table *wait) { - struct proc_mounts *p = proc_mounts(file->private_data); + struct seq_file *m = file->private_data; + struct proc_mounts *p = m->private; struct mnt_namespace *ns = p->ns; unsigned res = POLLIN | POLLRDNORM; int event; @@ -25,8 +26,8 @@ static unsigned mounts_poll(struct file *file, poll_table *wait) poll_wait(file, &p->ns->poll, wait); event = ACCESS_ONCE(ns->event); - if (p->m.poll_event != event) { - p->m.poll_event = event; + if (m->poll_event != event) { + m->poll_event = event; res |= POLLERR | POLLPRI; } @@ -92,7 +93,7 @@ static void show_type(struct seq_file *m, struct super_block *sb) static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt) { - struct proc_mounts *p = proc_mounts(m); + struct proc_mounts *p = m->private; struct mount *r = real_mount(mnt); int err = 0; struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; @@ -126,7 +127,7 @@ out: static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt) { - struct proc_mounts *p = proc_mounts(m); + struct proc_mounts *p = m->private; struct mount *r = real_mount(mnt); struct super_block *sb = mnt->mnt_sb; struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; @@ -186,7 +187,7 @@ out: static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt) { - struct proc_mounts *p = proc_mounts(m); + struct proc_mounts *p = m->private; struct mount *r = real_mount(mnt); struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; struct super_block *sb = mnt_path.dentry->d_sb; @@ -236,6 +237,7 @@ static int mounts_open_common(struct inode *inode, struct file *file, struct mnt_namespace *ns = NULL; 
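/*
 * Annotation, not part of the diff: the mounts_open_common() hunks below
 * drop the open-coded kmalloc() + seq_open() pair, in which struct
 * proc_mounts embedded its own seq_file, in favour of seq_open_private(),
 * which allocates the private blob and parks it in m->private; the
 * matching teardown in mounts_release() becomes seq_release_private().
 * A minimal sketch of the idiom, with hypothetical names:
 *
 *	static int example_open(struct inode *inode, struct file *file)
 *	{
 *		struct example_priv *p;
 *		int ret;
 *
 *		ret = seq_open_private(file, &example_seq_ops, sizeof(*p));
 *		if (ret)
 *			return ret;
 *		p = ((struct seq_file *)file->private_data)->private;
 *		p->inode = inode;
 *		return 0;
 *	}
 */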
struct path root; struct proc_mounts *p; + struct seq_file *m; int ret = -EINVAL; if (!task) @@ -260,26 +262,21 @@ static int mounts_open_common(struct inode *inode, struct file *file, task_unlock(task); put_task_struct(task); - ret = -ENOMEM; - p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL); - if (!p) + ret = seq_open_private(file, &mounts_op, sizeof(struct proc_mounts)); + if (ret) goto err_put_path; - file->private_data = &p->m; - ret = seq_open(file, &mounts_op); - if (ret) - goto err_free; + m = file->private_data; + m->poll_event = ns->event; + p = m->private; p->ns = ns; p->root = root; - p->m.poll_event = ns->event; p->show = show; p->cached_event = ~0ULL; return 0; - err_free: - kfree(p); err_put_path: path_put(&root); err_put_ns: @@ -290,10 +287,11 @@ static int mounts_open_common(struct inode *inode, struct file *file, static int mounts_release(struct inode *inode, struct file *file) { - struct proc_mounts *p = proc_mounts(file->private_data); + struct seq_file *m = file->private_data; + struct proc_mounts *p = m->private; path_put(&p->root); put_mnt_ns(p->ns); - return seq_release(inode, file); + return seq_release_private(inode, file); } static int mounts_open(struct inode *inode, struct file *file) diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index b32ce53d24ee..3adcc4669fac 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -190,7 +190,7 @@ static const struct file_operations pstore_file_operations = { */ static int pstore_unlink(struct inode *dir, struct dentry *dentry) { - struct pstore_private *p = dentry->d_inode->i_private; + struct pstore_private *p = d_inode(dentry)->i_private; int err; err = pstore_check_syslog_permissions(p); @@ -199,7 +199,7 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry) if (p->psi->erase) p->psi->erase(p->type, p->id, p->count, - dentry->d_inode->i_ctime, p->psi); + d_inode(dentry)->i_ctime, p->psi); else return -EPERM; @@ -364,6 +364,9 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, case PSTORE_TYPE_PMSG: scnprintf(name, sizeof(name), "pmsg-%s-%lld", psname, id); break; + case PSTORE_TYPE_PPC_OPAL: + sprintf(name, "powerpc-opal-%s-%lld", psname, id); + break; case PSTORE_TYPE_UNKNOWN: scnprintf(name, sizeof(name), "unknown-%s-%lld", psname, id); break; @@ -373,7 +376,7 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, break; } - mutex_lock(&root->d_inode->i_mutex); + mutex_lock(&d_inode(root)->i_mutex); dentry = d_alloc_name(root, name); if (!dentry) @@ -393,12 +396,12 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, list_add(&private->list, &allpstore); spin_unlock_irqrestore(&allpstore_lock, flags); - mutex_unlock(&root->d_inode->i_mutex); + mutex_unlock(&d_inode(root)->i_mutex); return 0; fail_lockedalloc: - mutex_unlock(&root->d_inode->i_mutex); + mutex_unlock(&d_inode(root)->i_mutex); kfree(private); fail_alloc: iput(inode); @@ -458,22 +461,18 @@ static struct file_system_type pstore_fs_type = { .kill_sb = pstore_kill_sb, }; -static struct kobject *pstore_kobj; - static int __init init_pstore_fs(void) { - int err = 0; + int err; /* Create a convenient mount point for people to access pstore */ - pstore_kobj = kobject_create_and_add("pstore", fs_kobj); - if (!pstore_kobj) { - err = -ENOMEM; + err = sysfs_create_mount_point(fs_kobj, "pstore"); + if (err) goto out; - } err = register_filesystem(&pstore_fs_type); if (err < 0) - kobject_put(pstore_kobj); + sysfs_remove_mount_point(fs_kobj, "pstore"); out: return 
err; diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index c4c9a10c5760..791743deedf1 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -299,7 +299,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, bool compressed; size_t total_len; - if (big_oops_buf) { + if (big_oops_buf && is_locked) { dst = big_oops_buf; hsize = sprintf(dst, "%s#%d Part%u\n", why, oopscount, part); @@ -456,6 +456,12 @@ int pstore_register(struct pstore_info *psi) add_timer(&pstore_timer); } + /* + * Update the module parameter backend, so it is visible + * through /sys/module/pstore/parameters/backend + */ + backend = psi->name; + pr_info("Registered %s as persistent store backend\n", psi->name); return 0; diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 39d1373128e9..6c26c4daaec9 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -186,12 +186,34 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, ssize_t size; ssize_t ecc_notice_size; struct ramoops_context *cxt = psi->data; - struct persistent_ram_zone *prz; - int header_length; + struct persistent_ram_zone *prz = NULL; + int header_length = 0; + + /* Ramoops headers provide time stamps for PSTORE_TYPE_DMESG, but + * PSTORE_TYPE_CONSOLE and PSTORE_TYPE_FTRACE don't currently have + * valid time stamps, so it is initialized to zero. + */ + time->tv_sec = 0; + time->tv_nsec = 0; + *compressed = false; + + /* Find the next valid persistent_ram_zone for DMESG */ + while (cxt->dump_read_cnt < cxt->max_dump_cnt && !prz) { + prz = ramoops_get_next_prz(cxt->przs, &cxt->dump_read_cnt, + cxt->max_dump_cnt, id, type, + PSTORE_TYPE_DMESG, 1); + if (!prz_ok(prz)) + continue; + header_length = ramoops_read_kmsg_hdr(persistent_ram_old(prz), + time, compressed); + /* Clear and skip this DMESG record if it has no valid header */ + if (!header_length) { + persistent_ram_free_old(prz); + persistent_ram_zap(prz); + prz = NULL; + } + } - prz = ramoops_get_next_prz(cxt->przs, &cxt->dump_read_cnt, - cxt->max_dump_cnt, id, type, - PSTORE_TYPE_DMESG, 1); if (!prz_ok(prz)) prz = ramoops_get_next_prz(&cxt->cprz, &cxt->console_read_cnt, 1, id, type, PSTORE_TYPE_CONSOLE, 0); @@ -204,13 +226,7 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, if (!prz_ok(prz)) return 0; - if (!persistent_ram_old(prz)) - return 0; - - size = persistent_ram_old_size(prz); - header_length = ramoops_read_kmsg_hdr(persistent_ram_old(prz), time, - compressed); - size -= header_length; + size = persistent_ram_old_size(prz) - header_length; /* ECC correction notice */ ecc_notice_size = persistent_ram_ecc_string(prz, NULL, 0); @@ -394,18 +410,16 @@ static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt, } for (i = 0; i < cxt->max_dump_cnt; i++) { - size_t sz = cxt->record_size; - - cxt->przs[i] = persistent_ram_new(*paddr, sz, 0, + cxt->przs[i] = persistent_ram_new(*paddr, cxt->record_size, 0, &cxt->ecc_info, cxt->memtype); if (IS_ERR(cxt->przs[i])) { err = PTR_ERR(cxt->przs[i]); dev_err(dev, "failed to request mem region (0x%zx@0x%llx): %d\n", - sz, (unsigned long long)*paddr, err); + cxt->record_size, (unsigned long long)*paddr, err); goto fail_prz; } - *paddr += sz; + *paddr += cxt->record_size; } return 0; @@ -539,6 +553,9 @@ static int ramoops_probe(struct platform_device *pdev) mem_address = pdata->mem_address; record_size = pdata->record_size; dump_oops = pdata->dump_oops; + ramoops_console_size = pdata->console_size; + ramoops_pmsg_size = pdata->pmsg_size; + ramoops_ftrace_size = pdata->ftrace_size; 
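/*
 * Annotation, not part of the diff: ramoops_probe() now copies the
 * platform-supplied sizes back into the module-level variables.  Those
 * variables back the ramoops module parameters and also feed
 * ramoops_register_dummy() below, so /sys/module/ramoops/parameters/
 * (e.g. console_size, assuming the usual parameter names) reports the
 * configuration actually in use rather than the boot-time defaults; the
 * dummy-device path is likewise fixed to honor the mem_type parameter
 * instead of hard-coding 0.
 */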
pr_info("attached 0x%lx@0x%llx, ecc: %d/%d\n", cxt->size, (unsigned long long)cxt->phys_addr, @@ -605,7 +622,7 @@ static void ramoops_register_dummy(void) dummy_data->mem_size = mem_size; dummy_data->mem_address = mem_address; - dummy_data->mem_type = 0; + dummy_data->mem_type = mem_type; dummy_data->record_size = record_size; dummy_data->console_size = ramoops_console_size; dummy_data->ftrace_size = ramoops_ftrace_size; diff --git a/fs/qnx6/dir.c b/fs/qnx6/dir.c index 8d64bb5366bf..e1f37278cf97 100644 --- a/fs/qnx6/dir.c +++ b/fs/qnx6/dir.c @@ -32,11 +32,6 @@ static struct page *qnx6_get_page(struct inode *dir, unsigned long n) return page; } -static inline unsigned long dir_pages(struct inode *inode) -{ - return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; -} - static unsigned last_entry(struct inode *inode, unsigned long page_nr) { unsigned long last_byte = inode->i_size; diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 44e73923670d..32d2e1a9774c 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -182,7 +182,7 @@ static const char *qnx6_checkroot(struct super_block *s) static char match_root[2][3] = {".\0\0", "..\0"}; int i, error = 0; struct qnx6_dir_entry *dir_entry; - struct inode *root = s->s_root->d_inode; + struct inode *root = d_inode(s->s_root); struct address_space *mapping = root->i_mapping; struct page *page = read_mapping_page(mapping, 0, NULL); if (IS_ERR(page)) diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 0ccd4ba3a246..20d1f74561cf 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -900,14 +900,17 @@ static inline struct dquot **i_dquot(struct inode *inode) static int dqinit_needed(struct inode *inode, int type) { + struct dquot * const *dquots; int cnt; if (IS_NOQUOTA(inode)) return 0; + + dquots = i_dquot(inode); if (type != -1) - return !i_dquot(inode)[type]; + return !dquots[type]; for (cnt = 0; cnt < MAXQUOTAS; cnt++) - if (!i_dquot(inode)[cnt]) + if (!dquots[cnt]) return 1; return 0; } @@ -970,12 +973,13 @@ static void add_dquot_ref(struct super_block *sb, int type) static void remove_inode_dquot_ref(struct inode *inode, int type, struct list_head *tofree_head) { - struct dquot *dquot = i_dquot(inode)[type]; + struct dquot **dquots = i_dquot(inode); + struct dquot *dquot = dquots[type]; - i_dquot(inode)[type] = NULL; if (!dquot) return; + dquots[type] = NULL; if (list_empty(&dquot->dq_free)) { /* * The inode still has reference to dquot so it can't be in the @@ -1159,8 +1163,8 @@ static int need_print_warning(struct dquot_warn *warn) return uid_eq(current_fsuid(), warn->w_dq_id.uid); case GRPQUOTA: return in_group_p(warn->w_dq_id.gid); - case PRJQUOTA: /* Never taken... Just make gcc happy */ - return 0; + case PRJQUOTA: + return 1; } return 0; } @@ -1389,16 +1393,21 @@ static int dquot_active(const struct inode *inode) static void __dquot_initialize(struct inode *inode, int type) { int cnt, init_needed = 0; - struct dquot *got[MAXQUOTAS]; + struct dquot **dquots, *got[MAXQUOTAS]; struct super_block *sb = inode->i_sb; qsize_t rsv; if (!dquot_active(inode)) return; + dquots = i_dquot(inode); + /* First get references to structures we might need. */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { struct kqid qid; + kprojid_t projid; + int rc; + got[cnt] = NULL; if (type != -1 && cnt != type) continue; @@ -1407,8 +1416,12 @@ static void __dquot_initialize(struct inode *inode, int type) * we check it without locking here to avoid unnecessary * dqget()/dqput() calls. 
*/ - if (i_dquot(inode)[cnt]) + if (dquots[cnt]) + continue; + + if (!sb_has_quota_active(sb, cnt)) continue; + init_needed = 1; switch (cnt) { @@ -1418,6 +1431,12 @@ static void __dquot_initialize(struct inode *inode, int type) case GRPQUOTA: qid = make_kqid_gid(inode->i_gid); break; + case PRJQUOTA: + rc = inode->i_sb->dq_op->get_projid(inode, &projid); + if (rc) + continue; + qid = make_kqid_projid(projid); + break; } got[cnt] = dqget(sb, qid); } @@ -1438,8 +1457,8 @@ static void __dquot_initialize(struct inode *inode, int type) /* We could race with quotaon or dqget() could have failed */ if (!got[cnt]) continue; - if (!i_dquot(inode)[cnt]) { - i_dquot(inode)[cnt] = got[cnt]; + if (!dquots[cnt]) { + dquots[cnt] = got[cnt]; got[cnt] = NULL; /* * Make quota reservation system happy if someone @@ -1447,7 +1466,7 @@ static void __dquot_initialize(struct inode *inode, int type) */ rsv = inode_get_rsv_space(inode); if (unlikely(rsv)) - dquot_resv_space(i_dquot(inode)[cnt], rsv); + dquot_resv_space(dquots[cnt], rsv); } } out_err: @@ -1473,12 +1492,13 @@ EXPORT_SYMBOL(dquot_initialize); static void __dquot_drop(struct inode *inode) { int cnt; + struct dquot **dquots = i_dquot(inode); struct dquot *put[MAXQUOTAS]; spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - put[cnt] = i_dquot(inode)[cnt]; - i_dquot(inode)[cnt] = NULL; + put[cnt] = dquots[cnt]; + dquots[cnt] = NULL; } spin_unlock(&dq_data_lock); dqput_all(put); @@ -1486,6 +1506,7 @@ static void __dquot_drop(struct inode *inode) void dquot_drop(struct inode *inode) { + struct dquot * const *dquots; int cnt; if (IS_NOQUOTA(inode)) @@ -1498,8 +1519,9 @@ void dquot_drop(struct inode *inode) * must assure that nobody can come after the DQUOT_DROP and * add quota pointers back anyway. */ + dquots = i_dquot(inode); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (i_dquot(inode)[cnt]) + if (dquots[cnt]) break; } @@ -1600,8 +1622,8 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) { int cnt, ret = 0, index; struct dquot_warn warn[MAXQUOTAS]; - struct dquot **dquots = i_dquot(inode); int reserve = flags & DQUOT_SPACE_RESERVE; + struct dquot **dquots; if (!dquot_active(inode)) { inode_incr_space(inode, number, reserve); @@ -1611,6 +1633,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) for (cnt = 0; cnt < MAXQUOTAS; cnt++) warn[cnt].w_type = QUOTA_NL_NOWARN; + dquots = i_dquot(inode); index = srcu_read_lock(&dquot_srcu); spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { @@ -1652,13 +1675,14 @@ int dquot_alloc_inode(struct inode *inode) { int cnt, ret = 0, index; struct dquot_warn warn[MAXQUOTAS]; - struct dquot * const *dquots = i_dquot(inode); + struct dquot * const *dquots; if (!dquot_active(inode)) return 0; for (cnt = 0; cnt < MAXQUOTAS; cnt++) warn[cnt].w_type = QUOTA_NL_NOWARN; + dquots = i_dquot(inode); index = srcu_read_lock(&dquot_srcu); spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { @@ -1690,6 +1714,7 @@ EXPORT_SYMBOL(dquot_alloc_inode); */ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) { + struct dquot **dquots; int cnt, index; if (!dquot_active(inode)) { @@ -1697,18 +1722,18 @@ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) return 0; } + dquots = i_dquot(inode); index = srcu_read_lock(&dquot_srcu); spin_lock(&dq_data_lock); /* Claim reserved quotas to allocated quotas */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (i_dquot(inode)[cnt]) - dquot_claim_reserved_space(i_dquot(inode)[cnt], - number); + 
if (dquots[cnt]) + dquot_claim_reserved_space(dquots[cnt], number); } /* Update inode bytes */ inode_claim_rsv_space(inode, number); spin_unlock(&dq_data_lock); - mark_all_dquot_dirty(i_dquot(inode)); + mark_all_dquot_dirty(dquots); srcu_read_unlock(&dquot_srcu, index); return 0; } @@ -1719,6 +1744,7 @@ EXPORT_SYMBOL(dquot_claim_space_nodirty); */ void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number) { + struct dquot **dquots; int cnt, index; if (!dquot_active(inode)) { @@ -1726,18 +1752,18 @@ void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number) return; } + dquots = i_dquot(inode); index = srcu_read_lock(&dquot_srcu); spin_lock(&dq_data_lock); /* Claim reserved quotas to allocated quotas */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (i_dquot(inode)[cnt]) - dquot_reclaim_reserved_space(i_dquot(inode)[cnt], - number); + if (dquots[cnt]) + dquot_reclaim_reserved_space(dquots[cnt], number); } /* Update inode bytes */ inode_reclaim_rsv_space(inode, number); spin_unlock(&dq_data_lock); - mark_all_dquot_dirty(i_dquot(inode)); + mark_all_dquot_dirty(dquots); srcu_read_unlock(&dquot_srcu, index); return; } @@ -1750,7 +1776,7 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags) { unsigned int cnt; struct dquot_warn warn[MAXQUOTAS]; - struct dquot **dquots = i_dquot(inode); + struct dquot **dquots; int reserve = flags & DQUOT_SPACE_RESERVE, index; if (!dquot_active(inode)) { @@ -1758,6 +1784,7 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags) return; } + dquots = i_dquot(inode); index = srcu_read_lock(&dquot_srcu); spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { @@ -1793,12 +1820,13 @@ void dquot_free_inode(struct inode *inode) { unsigned int cnt; struct dquot_warn warn[MAXQUOTAS]; - struct dquot * const *dquots = i_dquot(inode); + struct dquot * const *dquots; int index; if (!dquot_active(inode)) return; + dquots = i_dquot(inode); index = srcu_read_lock(&dquot_srcu); spin_lock(&dq_data_lock); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { @@ -2161,7 +2189,8 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, error = -EROFS; goto out_fmt; } - if (!sb->s_op->quota_write || !sb->s_op->quota_read) { + if (!sb->s_op->quota_write || !sb->s_op->quota_read || + (type == PRJQUOTA && sb->dq_op->get_projid == NULL)) { error = -EINVAL; goto out_fmt; } @@ -2299,7 +2328,7 @@ int dquot_quota_on(struct super_block *sb, int type, int format_id, if (path->dentry->d_sb != sb) error = -EXDEV; else - error = vfs_load_quota_inode(path->dentry->d_inode, type, + error = vfs_load_quota_inode(d_inode(path->dentry), type, format_id, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); return error; @@ -2363,20 +2392,20 @@ int dquot_quota_on_mount(struct super_block *sb, char *qf_name, struct dentry *dentry; int error; - mutex_lock(&sb->s_root->d_inode->i_mutex); + mutex_lock(&d_inode(sb->s_root)->i_mutex); dentry = lookup_one_len(qf_name, sb->s_root, strlen(qf_name)); - mutex_unlock(&sb->s_root->d_inode->i_mutex); + mutex_unlock(&d_inode(sb->s_root)->i_mutex); if (IS_ERR(dentry)) return PTR_ERR(dentry); - if (!dentry->d_inode) { + if (d_really_is_negative(dentry)) { error = -ENOENT; goto out; } error = security_quota_on(dentry); if (!error) - error = vfs_load_quota_inode(dentry->d_inode, type, format_id, + error = vfs_load_quota_inode(d_inode(dentry), type, format_id, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); out: @@ -2614,55 +2643,73 @@ out: EXPORT_SYMBOL(dquot_set_dqblk); /* Generic routine for getting common 
part of quota file information */ -int dquot_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) +int dquot_get_state(struct super_block *sb, struct qc_state *state) { struct mem_dqinfo *mi; + struct qc_type_state *tstate; + struct quota_info *dqopt = sb_dqopt(sb); + int type; mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); - if (!sb_has_quota_active(sb, type)) { - mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); - return -ESRCH; + memset(state, 0, sizeof(*state)); + for (type = 0; type < MAXQUOTAS; type++) { + if (!sb_has_quota_active(sb, type)) + continue; + tstate = state->s_state + type; + mi = sb_dqopt(sb)->info + type; + tstate->flags = QCI_ACCT_ENABLED; + spin_lock(&dq_data_lock); + if (mi->dqi_flags & DQF_SYS_FILE) + tstate->flags |= QCI_SYSFILE; + if (mi->dqi_flags & DQF_ROOT_SQUASH) + tstate->flags |= QCI_ROOT_SQUASH; + if (sb_has_quota_limits_enabled(sb, type)) + tstate->flags |= QCI_LIMITS_ENFORCED; + tstate->spc_timelimit = mi->dqi_bgrace; + tstate->ino_timelimit = mi->dqi_igrace; + tstate->ino = dqopt->files[type]->i_ino; + tstate->blocks = dqopt->files[type]->i_blocks; + tstate->nextents = 1; /* We don't know... */ + spin_unlock(&dq_data_lock); } - mi = sb_dqopt(sb)->info + type; - spin_lock(&dq_data_lock); - ii->dqi_bgrace = mi->dqi_bgrace; - ii->dqi_igrace = mi->dqi_igrace; - ii->dqi_flags = mi->dqi_flags & DQF_GETINFO_MASK; - ii->dqi_valid = IIF_ALL; - spin_unlock(&dq_data_lock); mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); return 0; } -EXPORT_SYMBOL(dquot_get_dqinfo); +EXPORT_SYMBOL(dquot_get_state); /* Generic routine for setting common part of quota file information */ -int dquot_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) +int dquot_set_dqinfo(struct super_block *sb, int type, struct qc_info *ii) { struct mem_dqinfo *mi; int err = 0; + if ((ii->i_fieldmask & QC_WARNS_MASK) || + (ii->i_fieldmask & QC_RT_SPC_TIMER)) + return -EINVAL; mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); if (!sb_has_quota_active(sb, type)) { err = -ESRCH; goto out; } mi = sb_dqopt(sb)->info + type; - if (ii->dqi_valid & IIF_FLAGS) { - if (ii->dqi_flags & ~DQF_SETINFO_MASK || - (ii->dqi_flags & DQF_ROOT_SQUASH && + if (ii->i_fieldmask & QC_FLAGS) { + if ((ii->i_flags & QCI_ROOT_SQUASH && mi->dqi_format->qf_fmt_id != QFMT_VFS_OLD)) { err = -EINVAL; goto out; } } spin_lock(&dq_data_lock); - if (ii->dqi_valid & IIF_BGRACE) - mi->dqi_bgrace = ii->dqi_bgrace; - if (ii->dqi_valid & IIF_IGRACE) - mi->dqi_igrace = ii->dqi_igrace; - if (ii->dqi_valid & IIF_FLAGS) - mi->dqi_flags = (mi->dqi_flags & ~DQF_SETINFO_MASK) | - (ii->dqi_flags & DQF_SETINFO_MASK); + if (ii->i_fieldmask & QC_SPC_TIMER) + mi->dqi_bgrace = ii->i_spc_timelimit; + if (ii->i_fieldmask & QC_INO_TIMER) + mi->dqi_igrace = ii->i_ino_timelimit; + if (ii->i_fieldmask & QC_FLAGS) { + if (ii->i_flags & QCI_ROOT_SQUASH) + mi->dqi_flags |= DQF_ROOT_SQUASH; + else + mi->dqi_flags &= ~DQF_ROOT_SQUASH; + } spin_unlock(&dq_data_lock); mark_info_dirty(sb, type); /* Force write to disk */ @@ -2677,7 +2724,7 @@ const struct quotactl_ops dquot_quotactl_ops = { .quota_on = dquot_quota_on, .quota_off = dquot_quota_off, .quota_sync = dquot_quota_sync, - .get_info = dquot_get_dqinfo, + .get_state = dquot_get_state, .set_info = dquot_set_dqinfo, .get_dqblk = dquot_get_dqblk, .set_dqblk = dquot_set_dqblk @@ -2688,7 +2735,7 @@ const struct quotactl_ops dquot_quotactl_sysfile_ops = { .quota_enable = dquot_quota_enable, .quota_disable = dquot_quota_disable, .quota_sync = dquot_quota_sync, - .get_info = dquot_get_dqinfo, + .get_state = 
dquot_get_state, .set_info = dquot_set_dqinfo, .get_dqblk = dquot_get_dqblk, .set_dqblk = dquot_set_dqblk diff --git a/fs/quota/quota.c b/fs/quota/quota.c index d14a799c7785..86ded7375c21 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -118,13 +118,30 @@ static int quota_getfmt(struct super_block *sb, int type, void __user *addr) static int quota_getinfo(struct super_block *sb, int type, void __user *addr) { - struct if_dqinfo info; + struct qc_state state; + struct qc_type_state *tstate; + struct if_dqinfo uinfo; int ret; - if (!sb->s_qcop->get_info) + /* This checks whether qc_state has enough entries... */ + BUILD_BUG_ON(MAXQUOTAS > XQM_MAXQUOTAS); + if (!sb->s_qcop->get_state) return -ENOSYS; - ret = sb->s_qcop->get_info(sb, type, &info); - if (!ret && copy_to_user(addr, &info, sizeof(info))) + ret = sb->s_qcop->get_state(sb, &state); + if (ret) + return ret; + tstate = state.s_state + type; + if (!(tstate->flags & QCI_ACCT_ENABLED)) + return -ESRCH; + memset(&uinfo, 0, sizeof(uinfo)); + uinfo.dqi_bgrace = tstate->spc_timelimit; + uinfo.dqi_igrace = tstate->ino_timelimit; + if (tstate->flags & QCI_SYSFILE) + uinfo.dqi_flags |= DQF_SYS_FILE; + if (tstate->flags & QCI_ROOT_SQUASH) + uinfo.dqi_flags |= DQF_ROOT_SQUASH; + uinfo.dqi_valid = IIF_ALL; + if (!ret && copy_to_user(addr, &uinfo, sizeof(uinfo))) return -EFAULT; return ret; } @@ -132,12 +149,31 @@ static int quota_getinfo(struct super_block *sb, int type, void __user *addr) static int quota_setinfo(struct super_block *sb, int type, void __user *addr) { struct if_dqinfo info; + struct qc_info qinfo; if (copy_from_user(&info, addr, sizeof(info))) return -EFAULT; if (!sb->s_qcop->set_info) return -ENOSYS; - return sb->s_qcop->set_info(sb, type, &info); + if (info.dqi_valid & ~(IIF_FLAGS | IIF_BGRACE | IIF_IGRACE)) + return -EINVAL; + memset(&qinfo, 0, sizeof(qinfo)); + if (info.dqi_valid & IIF_FLAGS) { + if (info.dqi_flags & ~DQF_SETINFO_MASK) + return -EINVAL; + if (info.dqi_flags & DQF_ROOT_SQUASH) + qinfo.i_flags |= QCI_ROOT_SQUASH; + qinfo.i_fieldmask |= QC_FLAGS; + } + if (info.dqi_valid & IIF_BGRACE) { + qinfo.i_spc_timelimit = info.dqi_bgrace; + qinfo.i_fieldmask |= QC_SPC_TIMER; + } + if (info.dqi_valid & IIF_IGRACE) { + qinfo.i_ino_timelimit = info.dqi_igrace; + qinfo.i_fieldmask |= QC_INO_TIMER; + } + return sb->s_qcop->set_info(sb, type, &qinfo); } static inline qsize_t qbtos(qsize_t blocks) @@ -252,25 +288,149 @@ static int quota_disable(struct super_block *sb, void __user *addr) return sb->s_qcop->quota_disable(sb, flags); } +static int quota_state_to_flags(struct qc_state *state) +{ + int flags = 0; + + if (state->s_state[USRQUOTA].flags & QCI_ACCT_ENABLED) + flags |= FS_QUOTA_UDQ_ACCT; + if (state->s_state[USRQUOTA].flags & QCI_LIMITS_ENFORCED) + flags |= FS_QUOTA_UDQ_ENFD; + if (state->s_state[GRPQUOTA].flags & QCI_ACCT_ENABLED) + flags |= FS_QUOTA_GDQ_ACCT; + if (state->s_state[GRPQUOTA].flags & QCI_LIMITS_ENFORCED) + flags |= FS_QUOTA_GDQ_ENFD; + if (state->s_state[PRJQUOTA].flags & QCI_ACCT_ENABLED) + flags |= FS_QUOTA_PDQ_ACCT; + if (state->s_state[PRJQUOTA].flags & QCI_LIMITS_ENFORCED) + flags |= FS_QUOTA_PDQ_ENFD; + return flags; +} + +static int quota_getstate(struct super_block *sb, struct fs_quota_stat *fqs) +{ + int type; + struct qc_state state; + int ret; + + ret = sb->s_qcop->get_state(sb, &state); + if (ret < 0) + return ret; + + memset(fqs, 0, sizeof(*fqs)); + fqs->qs_version = FS_QSTAT_VERSION; + fqs->qs_flags = quota_state_to_flags(&state); + /* No quota enabled? 
*/ + if (!fqs->qs_flags) + return -ENOSYS; + fqs->qs_incoredqs = state.s_incoredqs; + /* + * GETXSTATE quotactl has space for just one set of time limits so + * report them for the first enabled quota type + */ + for (type = 0; type < XQM_MAXQUOTAS; type++) + if (state.s_state[type].flags & QCI_ACCT_ENABLED) + break; + BUG_ON(type == XQM_MAXQUOTAS); + fqs->qs_btimelimit = state.s_state[type].spc_timelimit; + fqs->qs_itimelimit = state.s_state[type].ino_timelimit; + fqs->qs_rtbtimelimit = state.s_state[type].rt_spc_timelimit; + fqs->qs_bwarnlimit = state.s_state[type].spc_warnlimit; + fqs->qs_iwarnlimit = state.s_state[type].ino_warnlimit; + if (state.s_state[USRQUOTA].flags & QCI_ACCT_ENABLED) { + fqs->qs_uquota.qfs_ino = state.s_state[USRQUOTA].ino; + fqs->qs_uquota.qfs_nblks = state.s_state[USRQUOTA].blocks; + fqs->qs_uquota.qfs_nextents = state.s_state[USRQUOTA].nextents; + } + if (state.s_state[GRPQUOTA].flags & QCI_ACCT_ENABLED) { + fqs->qs_gquota.qfs_ino = state.s_state[GRPQUOTA].ino; + fqs->qs_gquota.qfs_nblks = state.s_state[GRPQUOTA].blocks; + fqs->qs_gquota.qfs_nextents = state.s_state[GRPQUOTA].nextents; + } + if (state.s_state[PRJQUOTA].flags & QCI_ACCT_ENABLED) { + /* + * Q_XGETQSTAT doesn't have room for both group and project + * quotas. So, allow the project quota values to be copied out + * only if there is no group quota information available. + */ + if (!(state.s_state[GRPQUOTA].flags & QCI_ACCT_ENABLED)) { + fqs->qs_gquota.qfs_ino = state.s_state[PRJQUOTA].ino; + fqs->qs_gquota.qfs_nblks = + state.s_state[PRJQUOTA].blocks; + fqs->qs_gquota.qfs_nextents = + state.s_state[PRJQUOTA].nextents; + } + } + return 0; +} + static int quota_getxstate(struct super_block *sb, void __user *addr) { struct fs_quota_stat fqs; int ret; - if (!sb->s_qcop->get_xstate) + if (!sb->s_qcop->get_state) return -ENOSYS; - ret = sb->s_qcop->get_xstate(sb, &fqs); + ret = quota_getstate(sb, &fqs); if (!ret && copy_to_user(addr, &fqs, sizeof(fqs))) return -EFAULT; return ret; } +static int quota_getstatev(struct super_block *sb, struct fs_quota_statv *fqs) +{ + int type; + struct qc_state state; + int ret; + + ret = sb->s_qcop->get_state(sb, &state); + if (ret < 0) + return ret; + + memset(fqs, 0, sizeof(*fqs)); + fqs->qs_version = FS_QSTAT_VERSION; + fqs->qs_flags = quota_state_to_flags(&state); + /* No quota enabled? 
*/ + if (!fqs->qs_flags) + return -ENOSYS; + fqs->qs_incoredqs = state.s_incoredqs; + /* + * GETXSTATV quotactl has space for just one set of time limits so + * report them for the first enabled quota type + */ + for (type = 0; type < XQM_MAXQUOTAS; type++) + if (state.s_state[type].flags & QCI_ACCT_ENABLED) + break; + BUG_ON(type == XQM_MAXQUOTAS); + fqs->qs_btimelimit = state.s_state[type].spc_timelimit; + fqs->qs_itimelimit = state.s_state[type].ino_timelimit; + fqs->qs_rtbtimelimit = state.s_state[type].rt_spc_timelimit; + fqs->qs_bwarnlimit = state.s_state[type].spc_warnlimit; + fqs->qs_iwarnlimit = state.s_state[type].ino_warnlimit; + if (state.s_state[USRQUOTA].flags & QCI_ACCT_ENABLED) { + fqs->qs_uquota.qfs_ino = state.s_state[USRQUOTA].ino; + fqs->qs_uquota.qfs_nblks = state.s_state[USRQUOTA].blocks; + fqs->qs_uquota.qfs_nextents = state.s_state[USRQUOTA].nextents; + } + if (state.s_state[GRPQUOTA].flags & QCI_ACCT_ENABLED) { + fqs->qs_gquota.qfs_ino = state.s_state[GRPQUOTA].ino; + fqs->qs_gquota.qfs_nblks = state.s_state[GRPQUOTA].blocks; + fqs->qs_gquota.qfs_nextents = state.s_state[GRPQUOTA].nextents; + } + if (state.s_state[PRJQUOTA].flags & QCI_ACCT_ENABLED) { + fqs->qs_pquota.qfs_ino = state.s_state[PRJQUOTA].ino; + fqs->qs_pquota.qfs_nblks = state.s_state[PRJQUOTA].blocks; + fqs->qs_pquota.qfs_nextents = state.s_state[PRJQUOTA].nextents; + } + return 0; +} + static int quota_getxstatev(struct super_block *sb, void __user *addr) { struct fs_quota_statv fqs; int ret; - if (!sb->s_qcop->get_xstatev) + if (!sb->s_qcop->get_state) return -ENOSYS; memset(&fqs, 0, sizeof(fqs)); @@ -284,7 +444,7 @@ static int quota_getxstatev(struct super_block *sb, void __user *addr) default: return -EINVAL; } - ret = sb->s_qcop->get_xstatev(sb, &fqs); + ret = quota_getstatev(sb, &fqs); if (!ret && copy_to_user(addr, &fqs, sizeof(fqs))) return -EFAULT; return ret; @@ -357,6 +517,30 @@ static void copy_from_xfs_dqblk(struct qc_dqblk *dst, struct fs_disk_quota *src) dst->d_fieldmask |= QC_RT_SPACE; } +static void copy_qcinfo_from_xfs_dqblk(struct qc_info *dst, + struct fs_disk_quota *src) +{ + memset(dst, 0, sizeof(*dst)); + dst->i_spc_timelimit = src->d_btimer; + dst->i_ino_timelimit = src->d_itimer; + dst->i_rt_spc_timelimit = src->d_rtbtimer; + dst->i_ino_warnlimit = src->d_iwarns; + dst->i_spc_warnlimit = src->d_bwarns; + dst->i_rt_spc_warnlimit = src->d_rtbwarns; + if (src->d_fieldmask & FS_DQ_BWARNS) + dst->i_fieldmask |= QC_SPC_WARNS; + if (src->d_fieldmask & FS_DQ_IWARNS) + dst->i_fieldmask |= QC_INO_WARNS; + if (src->d_fieldmask & FS_DQ_RTBWARNS) + dst->i_fieldmask |= QC_RT_SPC_WARNS; + if (src->d_fieldmask & FS_DQ_BTIMER) + dst->i_fieldmask |= QC_SPC_TIMER; + if (src->d_fieldmask & FS_DQ_ITIMER) + dst->i_fieldmask |= QC_INO_TIMER; + if (src->d_fieldmask & FS_DQ_RTBTIMER) + dst->i_fieldmask |= QC_RT_SPC_TIMER; +} + static int quota_setxquota(struct super_block *sb, int type, qid_t id, void __user *addr) { @@ -371,6 +555,21 @@ static int quota_setxquota(struct super_block *sb, int type, qid_t id, qid = make_kqid(current_user_ns(), type, id); if (!qid_valid(qid)) return -EINVAL; + /* Are we actually setting timer / warning limits for all users? 
*/ + if (from_kqid(&init_user_ns, qid) == 0 && + fdq.d_fieldmask & (FS_DQ_WARNS_MASK | FS_DQ_TIMER_MASK)) { + struct qc_info qinfo; + int ret; + + if (!sb->s_qcop->set_info) + return -EINVAL; + copy_qcinfo_from_xfs_dqblk(&qinfo, &fdq); + ret = sb->s_qcop->set_info(sb, type, &qinfo); + if (ret) + return ret; + /* These are already done */ + fdq.d_fieldmask &= ~(FS_DQ_WARNS_MASK | FS_DQ_TIMER_MASK); + } copy_from_xfs_dqblk(&qdq, &fdq); return sb->s_qcop->set_dqblk(sb, qid, &qdq); } diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c index d65877fbe8f4..58efb83dec1c 100644 --- a/fs/quota/quota_tree.c +++ b/fs/quota/quota_tree.c @@ -349,6 +349,13 @@ static inline int dq_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot) { int tmp = QT_TREEOFF; + +#ifdef __QUOTA_QT_PARANOIA + if (info->dqi_blocks <= QT_TREEOFF) { + quota_error(dquot->dq_sb, "Quota tree root isn't allocated!"); + return -EIO; + } +#endif return do_insert_tree(info, dquot, &tmp, 0); } diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c index 9cb10d7197f7..2aa012a68e90 100644 --- a/fs/quota/quota_v2.c +++ b/fs/quota/quota_v2.c @@ -117,12 +117,16 @@ static int v2_read_file_info(struct super_block *sb, int type) qinfo = info->dqi_priv; if (version == 0) { /* limits are stored as unsigned 32-bit data */ - info->dqi_max_spc_limit = 0xffffffffULL << QUOTABLOCK_BITS; + info->dqi_max_spc_limit = 0xffffffffLL << QUOTABLOCK_BITS; info->dqi_max_ino_limit = 0xffffffff; } else { - /* used space is stored as unsigned 64-bit value in bytes */ - info->dqi_max_spc_limit = 0xffffffffffffffffULL; /* 2^64-1 */ - info->dqi_max_ino_limit = 0xffffffffffffffffULL; + /* + * Used space is stored as unsigned 64-bit value in bytes but + * quota core supports only signed 64-bit values so use that + * as a limit + */ + info->dqi_max_spc_limit = 0x7fffffffffffffffLL; /* 2^63-1 */ + info->dqi_max_ino_limit = 0x7fffffffffffffffLL; } info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace); info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace); diff --git a/fs/quota/quotaio_v2.h b/fs/quota/quotaio_v2.h index f1966b42c2fd..4e95430093d9 100644 --- a/fs/quota/quotaio_v2.h +++ b/fs/quota/quotaio_v2.h @@ -13,12 +13,14 @@ */ #define V2_INITQMAGICS {\ 0xd9c01f11, /* USRQUOTA */\ - 0xd9c01927 /* GRPQUOTA */\ + 0xd9c01927, /* GRPQUOTA */\ + 0xd9c03f14, /* PRJQUOTA */\ } #define V2_INITQVERSIONS {\ 1, /* USRQUOTA */\ - 1 /* GRPQUOTA */\ + 1, /* GRPQUOTA */\ + 1, /* PRJQUOTA */\ } /* First generic header */ diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c index 4f56de822d2f..183a212694bf 100644 --- a/fs/ramfs/file-mmu.c +++ b/fs/ramfs/file-mmu.c @@ -31,9 +31,7 @@ #include "internal.h" const struct file_operations ramfs_file_operations = { - .read = new_sync_read, .read_iter = generic_file_read_iter, - .write = new_sync_write, .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .fsync = noop_fsync, diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index f6ab41b39612..ba1323a94924 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -44,9 +44,7 @@ const struct file_operations ramfs_file_operations = { .mmap_capabilities = ramfs_mmap_capabilities, .mmap = ramfs_nommu_mmap, .get_unmapped_area = ramfs_nommu_get_unmapped_area, - .read = new_sync_read, .read_iter = generic_file_read_iter, - .write = new_sync_write, .write_iter = generic_file_write_iter, .fsync = noop_fsync, .splice_read = generic_file_splice_read, @@ -165,7 +163,7 @@ static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size) */ static int 
ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); unsigned int old_ia_valid = ia->ia_valid; int ret = 0; diff --git a/fs/read_write.c b/fs/read_write.c index 8e1b68786d66..819ef3faf1bb 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -9,7 +9,6 @@ #include <linux/fcntl.h> #include <linux/file.h> #include <linux/uio.h> -#include <linux/aio.h> #include <linux/fsnotify.h> #include <linux/security.h> #include <linux/export.h> @@ -23,13 +22,10 @@ #include <asm/unistd.h> typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *); -typedef ssize_t (*iov_fn_t)(struct kiocb *, const struct iovec *, - unsigned long, loff_t); typedef ssize_t (*iter_fn_t)(struct kiocb *, struct iov_iter *); const struct file_operations generic_ro_fops = { .llseek = generic_file_llseek, - .read = new_sync_read, .read_iter = generic_file_read_iter, .mmap = generic_file_readonly_mmap, .splice_read = generic_file_splice_read, @@ -343,13 +339,10 @@ ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos) init_sync_kiocb(&kiocb, file); kiocb.ki_pos = *ppos; - kiocb.ki_nbytes = iov_iter_count(iter); iter->type |= READ; ret = file->f_op->read_iter(&kiocb, iter); - if (ret == -EIOCBQUEUED) - ret = wait_on_sync_kiocb(&kiocb); - + BUG_ON(ret == -EIOCBQUEUED); if (ret > 0) *ppos = kiocb.ki_pos; return ret; @@ -366,13 +359,10 @@ ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos) init_sync_kiocb(&kiocb, file); kiocb.ki_pos = *ppos; - kiocb.ki_nbytes = iov_iter_count(iter); iter->type |= WRITE; ret = file->f_op->write_iter(&kiocb, iter); - if (ret == -EIOCBQUEUED) - ret = wait_on_sync_kiocb(&kiocb); - + BUG_ON(ret == -EIOCBQUEUED); if (ret > 0) *ppos = kiocb.ki_pos; return ret; @@ -418,26 +408,7 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t return count > MAX_RW_COUNT ? 
MAX_RW_COUNT : count; } -ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) -{ - struct iovec iov = { .iov_base = buf, .iov_len = len }; - struct kiocb kiocb; - ssize_t ret; - - init_sync_kiocb(&kiocb, filp); - kiocb.ki_pos = *ppos; - kiocb.ki_nbytes = len; - - ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); - if (-EIOCBQUEUED == ret) - ret = wait_on_sync_kiocb(&kiocb); - *ppos = kiocb.ki_pos; - return ret; -} - -EXPORT_SYMBOL(do_sync_read); - -ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) +static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) { struct iovec iov = { .iov_base = buf, .iov_len = len }; struct kiocb kiocb; @@ -446,34 +417,25 @@ ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *p init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = *ppos; - kiocb.ki_nbytes = len; iov_iter_init(&iter, READ, &iov, 1, len); ret = filp->f_op->read_iter(&kiocb, &iter); - if (-EIOCBQUEUED == ret) - ret = wait_on_sync_kiocb(&kiocb); + BUG_ON(ret == -EIOCBQUEUED); *ppos = kiocb.ki_pos; return ret; } -EXPORT_SYMBOL(new_sync_read); - ssize_t __vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { - ssize_t ret; - if (file->f_op->read) - ret = file->f_op->read(file, buf, count, pos); - else if (file->f_op->aio_read) - ret = do_sync_read(file, buf, count, pos); + return file->f_op->read(file, buf, count, pos); else if (file->f_op->read_iter) - ret = new_sync_read(file, buf, count, pos); + return new_sync_read(file, buf, count, pos); else - ret = -EINVAL; - - return ret; + return -EINVAL; } +EXPORT_SYMBOL(__vfs_read); ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) { @@ -502,26 +464,7 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos) EXPORT_SYMBOL(vfs_read); -ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos) -{ - struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len }; - struct kiocb kiocb; - ssize_t ret; - - init_sync_kiocb(&kiocb, filp); - kiocb.ki_pos = *ppos; - kiocb.ki_nbytes = len; - - ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos); - if (-EIOCBQUEUED == ret) - ret = wait_on_sync_kiocb(&kiocb); - *ppos = kiocb.ki_pos; - return ret; -} - -EXPORT_SYMBOL(do_sync_write); - -ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos) +static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos) { struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len }; struct kiocb kiocb; @@ -530,17 +473,26 @@ ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, lo init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = *ppos; - kiocb.ki_nbytes = len; iov_iter_init(&iter, WRITE, &iov, 1, len); ret = filp->f_op->write_iter(&kiocb, &iter); - if (-EIOCBQUEUED == ret) - ret = wait_on_sync_kiocb(&kiocb); - *ppos = kiocb.ki_pos; + BUG_ON(ret == -EIOCBQUEUED); + if (ret > 0) + *ppos = kiocb.ki_pos; return ret; } -EXPORT_SYMBOL(new_sync_write); +ssize_t __vfs_write(struct file *file, const char __user *p, size_t count, + loff_t *pos) +{ + if (file->f_op->write) + return file->f_op->write(file, p, count, pos); + else if (file->f_op->write_iter) + return new_sync_write(file, p, count, pos); + else + return -EINVAL; +} +EXPORT_SYMBOL(__vfs_write); ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos) { @@ 
-556,12 +508,7 @@ ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t p = (__force const char __user *)buf; if (count > MAX_RW_COUNT) count = MAX_RW_COUNT; - if (file->f_op->write) - ret = file->f_op->write(file, p, count, pos); - else if (file->f_op->aio_write) - ret = do_sync_write(file, p, count, pos); - else - ret = new_sync_write(file, p, count, pos); + ret = __vfs_write(file, p, count, pos); set_fs(old_fs); if (ret > 0) { fsnotify_modify(file); @@ -588,12 +535,7 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_ if (ret >= 0) { count = ret; file_start_write(file); - if (file->f_op->write) - ret = file->f_op->write(file, buf, count, pos); - else if (file->f_op->aio_write) - ret = do_sync_write(file, buf, count, pos); - else - ret = new_sync_write(file, buf, count, pos); + ret = __vfs_write(file, buf, count, pos); if (ret > 0) { fsnotify_modify(file); add_wchar(current, ret); @@ -710,60 +652,32 @@ unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to) } EXPORT_SYMBOL(iov_shorten); -static ssize_t do_iter_readv_writev(struct file *filp, int rw, const struct iovec *iov, - unsigned long nr_segs, size_t len, loff_t *ppos, iter_fn_t fn) -{ - struct kiocb kiocb; - struct iov_iter iter; - ssize_t ret; - - init_sync_kiocb(&kiocb, filp); - kiocb.ki_pos = *ppos; - kiocb.ki_nbytes = len; - - iov_iter_init(&iter, rw, iov, nr_segs, len); - ret = fn(&kiocb, &iter); - if (ret == -EIOCBQUEUED) - ret = wait_on_sync_kiocb(&kiocb); - *ppos = kiocb.ki_pos; - return ret; -} - -static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, - unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn) +static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, + loff_t *ppos, iter_fn_t fn) { struct kiocb kiocb; ssize_t ret; init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = *ppos; - kiocb.ki_nbytes = len; - ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos); - if (ret == -EIOCBQUEUED) - ret = wait_on_sync_kiocb(&kiocb); + ret = fn(&kiocb, iter); + BUG_ON(ret == -EIOCBQUEUED); *ppos = kiocb.ki_pos; return ret; } /* Do it by hand, with file-ops */ -static ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov, - unsigned long nr_segs, loff_t *ppos, io_fn_t fn) +static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter, + loff_t *ppos, io_fn_t fn) { - struct iovec *vector = iov; ssize_t ret = 0; - while (nr_segs > 0) { - void __user *base; - size_t len; + while (iov_iter_count(iter)) { + struct iovec iovec = iov_iter_iovec(iter); ssize_t nr; - base = vector->iov_base; - len = vector->iov_len; - vector++; - nr_segs--; - - nr = fn(filp, base, len, ppos); + nr = fn(filp, iovec.iov_base, iovec.iov_len, ppos); if (nr < 0) { if (!ret) @@ -771,8 +685,9 @@ static ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov, break; } ret += nr; - if (nr != len) + if (nr != iovec.iov_len) break; + iov_iter_advance(iter, nr); } return ret; @@ -863,48 +778,42 @@ static ssize_t do_readv_writev(int type, struct file *file, size_t tot_len; struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; + struct iov_iter iter; ssize_t ret; io_fn_t fn; - iov_fn_t fnv; iter_fn_t iter_fn; - ret = rw_copy_check_uvector(type, uvector, nr_segs, - ARRAY_SIZE(iovstack), iovstack, &iov); - if (ret <= 0) - goto out; + ret = import_iovec(type, uvector, nr_segs, + ARRAY_SIZE(iovstack), &iov, &iter); + if (ret < 0) + return ret; - tot_len = ret; + tot_len = iov_iter_count(&iter); + if 
(!tot_len) + goto out; ret = rw_verify_area(type, file, pos, tot_len); if (ret < 0) goto out; - fnv = NULL; if (type == READ) { fn = file->f_op->read; - fnv = file->f_op->aio_read; iter_fn = file->f_op->read_iter; } else { fn = (io_fn_t)file->f_op->write; - fnv = file->f_op->aio_write; iter_fn = file->f_op->write_iter; file_start_write(file); } if (iter_fn) - ret = do_iter_readv_writev(file, type, iov, nr_segs, tot_len, - pos, iter_fn); - else if (fnv) - ret = do_sync_readv_writev(file, iov, nr_segs, tot_len, - pos, fnv); + ret = do_iter_readv_writev(file, &iter, pos, iter_fn); else - ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn); + ret = do_loop_readv_writev(file, &iter, pos, fn); if (type != READ) file_end_write(file); out: - if (iov != iovstack) - kfree(iov); + kfree(iov); if ((ret + (type == READ)) > 0) { if (type == READ) fsnotify_access(file); @@ -1043,48 +952,42 @@ static ssize_t compat_do_readv_writev(int type, struct file *file, compat_ssize_t tot_len; struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; + struct iov_iter iter; ssize_t ret; io_fn_t fn; - iov_fn_t fnv; iter_fn_t iter_fn; - ret = compat_rw_copy_check_uvector(type, uvector, nr_segs, - UIO_FASTIOV, iovstack, &iov); - if (ret <= 0) - goto out; + ret = compat_import_iovec(type, uvector, nr_segs, + UIO_FASTIOV, &iov, &iter); + if (ret < 0) + return ret; - tot_len = ret; + tot_len = iov_iter_count(&iter); + if (!tot_len) + goto out; ret = rw_verify_area(type, file, pos, tot_len); if (ret < 0) goto out; - fnv = NULL; if (type == READ) { fn = file->f_op->read; - fnv = file->f_op->aio_read; iter_fn = file->f_op->read_iter; } else { fn = (io_fn_t)file->f_op->write; - fnv = file->f_op->aio_write; iter_fn = file->f_op->write_iter; file_start_write(file); } if (iter_fn) - ret = do_iter_readv_writev(file, type, iov, nr_segs, tot_len, - pos, iter_fn); - else if (fnv) - ret = do_sync_readv_writev(file, iov, nr_segs, tot_len, - pos, fnv); + ret = do_iter_readv_writev(file, &iter, pos, iter_fn); else - ret = do_loop_readv_writev(file, iov, nr_segs, pos, fn); + ret = do_loop_readv_writev(file, &iter, pos, fn); if (type != READ) file_end_write(file); out: - if (iov != iovstack) - kfree(iov); + kfree(iov); if ((ret + (type == READ)) > 0) { if (type == READ) fsnotify_access(file); diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c index 0a7dc941aaf4..4a024e2ceb9f 100644 --- a/fs/reiserfs/dir.c +++ b/fs/reiserfs/dir.c @@ -53,8 +53,8 @@ static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end, static inline bool is_privroot_deh(struct inode *dir, struct reiserfs_de_head *deh) { struct dentry *privroot = REISERFS_SB(dir->i_sb)->priv_root; - return (privroot->d_inode && - deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid); + return (d_really_is_positive(privroot) && + deh->deh_objectid == INODE_PKEY(d_inode(privroot))->k_objectid); } int reiserfs_readdir_inode(struct inode *inode, struct dir_context *ctx) diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 751dd3f4346b..96a1bcf33db4 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -243,8 +243,6 @@ drop_write_lock: } const struct file_operations reiserfs_file_operations = { - .read = new_sync_read, - .write = new_sync_write, .unlocked_ioctl = reiserfs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = reiserfs_compat_ioctl, diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index e72401e1f995..f6f2fbad9777 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -18,7 +18,7 @@ #include <linux/writeback.h> #include 
<linux/quotaops.h> #include <linux/swap.h> -#include <linux/aio.h> +#include <linux/uio.h> int reiserfs_commit_write(struct file *f, struct page *page, unsigned from, unsigned to); @@ -3278,22 +3278,22 @@ static int reiserfs_releasepage(struct page *page, gfp_t unused_gfp_flags) * We thank Mingming Cao for helping us understand in great detail what * to do in this section of the code. */ -static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb, - struct iov_iter *iter, loff_t offset) +static ssize_t reiserfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, + loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; size_t count = iov_iter_count(iter); ssize_t ret; - ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, + ret = blockdev_direct_IO(iocb, inode, iter, offset, reiserfs_get_blocks_direct_io); /* * In case of error extending write may have instantiated a few * blocks outside i_size. Trim these off again. */ - if (unlikely((rw & WRITE) && ret < 0)) { + if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) { loff_t isize = i_size_read(inode); loff_t end = offset + count; @@ -3308,7 +3308,7 @@ static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb, int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); unsigned int ia_valid; int error; diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index cd11358b10c7..b55a074653d7 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -400,7 +400,7 @@ struct dentry *reiserfs_get_parent(struct dentry *child) struct inode *inode = NULL; struct reiserfs_dir_entry de; INITIALIZE_PATH(path_to_entry); - struct inode *dir = child->d_inode; + struct inode *dir = d_inode(child); if (dir->i_nlink == 0) { return ERR_PTR(-ENOENT); @@ -917,7 +917,7 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry) goto end_rmdir; } - inode = dentry->d_inode; + inode = d_inode(dentry); reiserfs_update_inode_transaction(inode); reiserfs_update_inode_transaction(dir); @@ -987,7 +987,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) dquot_initialize(dir); - inode = dentry->d_inode; + inode = d_inode(dentry); /* * in this transaction we can be doing at max two balancings and @@ -1174,7 +1174,7 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { int retval; - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); struct reiserfs_transaction_handle th; /* * We need blocks for transaction + update of quotas for @@ -1311,8 +1311,8 @@ static int reiserfs_rename(struct inode *old_dir, struct dentry *old_dentry, dquot_initialize(old_dir); dquot_initialize(new_dir); - old_inode = old_dentry->d_inode; - new_dentry_inode = new_dentry->d_inode; + old_inode = d_inode(old_dentry); + new_dentry_inode = d_inode(new_dentry); /* * make sure that oldname still exists and points to an object we diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index bb79cddf0a1f..2adcde137c3f 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -910,7 +910,6 @@ do { \ if (!(cond)) \ reiserfs_panic(NULL, "assertion failure", "(" #cond ") at " \ __FILE__ ":%i:%s: " format "\n", \ - in_interrupt() ? 
-1 : task_pid_nr(current), \ __LINE__, __func__ , ##args); \ } while (0) diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 71fbbe3e2dab..0e4cf728126f 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -21,6 +21,7 @@ #include "xattr.h" #include <linux/init.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/buffer_head.h> #include <linux/exportfs.h> #include <linux/quotaops.h> @@ -588,8 +589,7 @@ static struct kmem_cache *reiserfs_inode_cachep; static struct inode *reiserfs_alloc_inode(struct super_block *sb) { struct reiserfs_inode_info *ei; - ei = (struct reiserfs_inode_info *) - kmem_cache_alloc(reiserfs_inode_cachep, GFP_KERNEL); + ei = kmem_cache_alloc(reiserfs_inode_cachep, GFP_KERNEL); if (!ei) return NULL; atomic_set(&ei->openers, 0); @@ -805,7 +805,7 @@ static const struct quotactl_ops reiserfs_qctl_operations = { .quota_on = reiserfs_quota_on, .quota_off = dquot_quota_off, .quota_sync = dquot_quota_sync, - .get_info = dquot_get_dqinfo, + .get_state = dquot_get_state, .set_info = dquot_set_dqinfo, .get_dqblk = dquot_get_dqblk, .set_dqblk = dquot_set_dqblk, @@ -1687,7 +1687,7 @@ static __u32 find_hash_out(struct super_block *s) __u32 hash = DEFAULT_HASH; __u32 deh_hashval, teahash, r5hash, yurahash; - inode = s->s_root->d_inode; + inode = d_inode(s->s_root); make_cpu_key(&key, inode, ~0, TYPE_DIRENTRY, 3); retval = search_by_entry_key(s, &key, &path, &de); @@ -2347,7 +2347,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, err = -EXDEV; goto out; } - inode = path->dentry->d_inode; + inode = d_inode(path->dentry); /* * We must not pack tails for quota files on reiserfs for quota * IO to work diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 4e781e697c90..e87f9b52bf06 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -87,9 +87,9 @@ static int xattr_unlink(struct inode *dir, struct dentry *dentry) BUG_ON(!mutex_is_locked(&dir->i_mutex)); - mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); + mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD); error = dir->i_op->unlink(dir, dentry); - mutex_unlock(&dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(dentry)->i_mutex); if (!error) d_delete(dentry); @@ -102,11 +102,11 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry) BUG_ON(!mutex_is_locked(&dir->i_mutex)); - mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); + mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD); error = dir->i_op->rmdir(dir, dentry); if (!error) - dentry->d_inode->i_flags |= S_DEAD; - mutex_unlock(&dentry->d_inode->i_mutex); + d_inode(dentry)->i_flags |= S_DEAD; + mutex_unlock(&d_inode(dentry)->i_mutex); if (!error) d_delete(dentry); @@ -120,26 +120,26 @@ static struct dentry *open_xa_root(struct super_block *sb, int flags) struct dentry *privroot = REISERFS_SB(sb)->priv_root; struct dentry *xaroot; - if (!privroot->d_inode) + if (d_really_is_negative(privroot)) return ERR_PTR(-ENODATA); - mutex_lock_nested(&privroot->d_inode->i_mutex, I_MUTEX_XATTR); + mutex_lock_nested(&d_inode(privroot)->i_mutex, I_MUTEX_XATTR); xaroot = dget(REISERFS_SB(sb)->xattr_root); if (!xaroot) xaroot = ERR_PTR(-ENODATA); - else if (!xaroot->d_inode) { + else if (d_really_is_negative(xaroot)) { int err = -ENODATA; if (xattr_may_create(flags)) - err = xattr_mkdir(privroot->d_inode, xaroot, 0700); + err = xattr_mkdir(d_inode(privroot), xaroot, 0700); if (err) { dput(xaroot); xaroot = ERR_PTR(err); } } - 
mutex_unlock(&privroot->d_inode->i_mutex); + mutex_unlock(&d_inode(privroot)->i_mutex); return xaroot; } @@ -156,21 +156,21 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags) le32_to_cpu(INODE_PKEY(inode)->k_objectid), inode->i_generation); - mutex_lock_nested(&xaroot->d_inode->i_mutex, I_MUTEX_XATTR); + mutex_lock_nested(&d_inode(xaroot)->i_mutex, I_MUTEX_XATTR); xadir = lookup_one_len(namebuf, xaroot, strlen(namebuf)); - if (!IS_ERR(xadir) && !xadir->d_inode) { + if (!IS_ERR(xadir) && d_really_is_negative(xadir)) { int err = -ENODATA; if (xattr_may_create(flags)) - err = xattr_mkdir(xaroot->d_inode, xadir, 0700); + err = xattr_mkdir(d_inode(xaroot), xadir, 0700); if (err) { dput(xadir); xadir = ERR_PTR(err); } } - mutex_unlock(&xaroot->d_inode->i_mutex); + mutex_unlock(&d_inode(xaroot)->i_mutex); dput(xaroot); return xadir; } @@ -195,7 +195,7 @@ fill_with_dentries(struct dir_context *ctx, const char *name, int namelen, container_of(ctx, struct reiserfs_dentry_buf, ctx); struct dentry *dentry; - WARN_ON_ONCE(!mutex_is_locked(&dbuf->xadir->d_inode->i_mutex)); + WARN_ON_ONCE(!mutex_is_locked(&d_inode(dbuf->xadir)->i_mutex)); if (dbuf->count == ARRAY_SIZE(dbuf->dentries)) return -ENOSPC; @@ -207,7 +207,7 @@ fill_with_dentries(struct dir_context *ctx, const char *name, int namelen, dentry = lookup_one_len(name, dbuf->xadir, namelen); if (IS_ERR(dentry)) { return PTR_ERR(dentry); - } else if (!dentry->d_inode) { + } else if (d_really_is_negative(dentry)) { /* A directory entry exists, but no file? */ reiserfs_error(dentry->d_sb, "xattr-20003", "Corrupted directory: xattr %pd listed but " @@ -249,16 +249,16 @@ static int reiserfs_for_each_xattr(struct inode *inode, if (IS_ERR(dir)) { err = PTR_ERR(dir); goto out; - } else if (!dir->d_inode) { + } else if (d_really_is_negative(dir)) { err = 0; goto out_dir; } - mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR); + mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_XATTR); buf.xadir = dir; while (1) { - err = reiserfs_readdir_inode(dir->d_inode, &buf.ctx); + err = reiserfs_readdir_inode(d_inode(dir), &buf.ctx); if (err) break; if (!buf.count) @@ -276,7 +276,7 @@ static int reiserfs_for_each_xattr(struct inode *inode, break; buf.count = 0; } - mutex_unlock(&dir->d_inode->i_mutex); + mutex_unlock(&d_inode(dir)->i_mutex); cleanup_dentry_buf(&buf); @@ -298,13 +298,13 @@ static int reiserfs_for_each_xattr(struct inode *inode, if (!err) { int jerror; - mutex_lock_nested(&dir->d_parent->d_inode->i_mutex, + mutex_lock_nested(&d_inode(dir->d_parent)->i_mutex, I_MUTEX_XATTR); err = action(dir, data); reiserfs_write_lock(inode->i_sb); jerror = journal_end(&th); reiserfs_write_unlock(inode->i_sb); - mutex_unlock(&dir->d_parent->d_inode->i_mutex); + mutex_unlock(&d_inode(dir->d_parent)->i_mutex); err = jerror ?: err; } } @@ -319,7 +319,7 @@ out: static int delete_one_xattr(struct dentry *dentry, void *data) { - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); /* This is the xattr dir, handle specially. 
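The wholesale dentry->d_inode conversions in this file follow the accessor pattern applied across this series: call sites go through trivial inline helpers so that positive/negative dentry checks are spelled out and can be audited later for union/overlay work (d_backing_inode(), used in the fs/stat.c hunk further below, is the variant for places where a backing file may interpose). Roughly what include/linux/dcache.h provides at this point:

static inline struct inode *d_inode(const struct dentry *dentry)
{
	return dentry->d_inode;
}

/* name the intent instead of open-coding a NULL test */
static inline bool d_really_is_negative(const struct dentry *dentry)
{
	return dentry->d_inode == NULL;
}

static inline bool d_really_is_positive(const struct dentry *dentry)
{
	return dentry->d_inode != NULL;
}
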
*/ if (d_is_dir(dentry)) @@ -384,27 +384,27 @@ static struct dentry *xattr_lookup(struct inode *inode, const char *name, if (IS_ERR(xadir)) return ERR_CAST(xadir); - mutex_lock_nested(&xadir->d_inode->i_mutex, I_MUTEX_XATTR); + mutex_lock_nested(&d_inode(xadir)->i_mutex, I_MUTEX_XATTR); xafile = lookup_one_len(name, xadir, strlen(name)); if (IS_ERR(xafile)) { err = PTR_ERR(xafile); goto out; } - if (xafile->d_inode && (flags & XATTR_CREATE)) + if (d_really_is_positive(xafile) && (flags & XATTR_CREATE)) err = -EEXIST; - if (!xafile->d_inode) { + if (d_really_is_negative(xafile)) { err = -ENODATA; if (xattr_may_create(flags)) - err = xattr_create(xadir->d_inode, xafile, + err = xattr_create(d_inode(xadir), xafile, 0700|S_IFREG); } if (err) dput(xafile); out: - mutex_unlock(&xadir->d_inode->i_mutex); + mutex_unlock(&d_inode(xadir)->i_mutex); dput(xadir); if (err) return ERR_PTR(err); @@ -469,21 +469,21 @@ static int lookup_and_delete_xattr(struct inode *inode, const char *name) if (IS_ERR(xadir)) return PTR_ERR(xadir); - mutex_lock_nested(&xadir->d_inode->i_mutex, I_MUTEX_XATTR); + mutex_lock_nested(&d_inode(xadir)->i_mutex, I_MUTEX_XATTR); dentry = lookup_one_len(name, xadir, strlen(name)); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); goto out_dput; } - if (dentry->d_inode) { - err = xattr_unlink(xadir->d_inode, dentry); + if (d_really_is_positive(dentry)) { + err = xattr_unlink(d_inode(xadir), dentry); update_ctime(inode); } dput(dentry); out_dput: - mutex_unlock(&xadir->d_inode->i_mutex); + mutex_unlock(&d_inode(xadir)->i_mutex); dput(xadir); return err; } @@ -533,7 +533,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, else chunk = buffer_size - buffer_pos; - page = reiserfs_get_page(dentry->d_inode, file_pos); + page = reiserfs_get_page(d_inode(dentry), file_pos); if (IS_ERR(page)) { err = PTR_ERR(page); goto out_unlock; @@ -573,18 +573,18 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, } new_size = buffer_size + sizeof(struct reiserfs_xattr_header); - if (!err && new_size < i_size_read(dentry->d_inode)) { + if (!err && new_size < i_size_read(d_inode(dentry))) { struct iattr newattrs = { .ia_ctime = current_fs_time(inode->i_sb), .ia_size = new_size, .ia_valid = ATTR_SIZE | ATTR_CTIME, }; - mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_XATTR); - inode_dio_wait(dentry->d_inode); + mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_XATTR); + inode_dio_wait(d_inode(dentry)); err = reiserfs_setattr(dentry, &newattrs); - mutex_unlock(&dentry->d_inode->i_mutex); + mutex_unlock(&d_inode(dentry)->i_mutex); } else update_ctime(inode); out_unlock: @@ -657,7 +657,7 @@ reiserfs_xattr_get(struct inode *inode, const char *name, void *buffer, down_read(&REISERFS_I(inode)->i_xattr_sem); - isize = i_size_read(dentry->d_inode); + isize = i_size_read(d_inode(dentry)); /* Just return the size needed */ if (buffer == NULL) { @@ -680,7 +680,7 @@ reiserfs_xattr_get(struct inode *inode, const char *name, void *buffer, else chunk = isize - file_pos; - page = reiserfs_get_page(dentry->d_inode, file_pos); + page = reiserfs_get_page(d_inode(dentry), file_pos); if (IS_ERR(page)) { err = PTR_ERR(page); goto out_unlock; @@ -775,7 +775,7 @@ reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer, handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); - if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) + if (!handler || get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1) return -EOPNOTSUPP; return handler->get(dentry, 
name, buffer, size, handler->flags); @@ -784,7 +784,7 @@ reiserfs_getxattr(struct dentry * dentry, const char *name, void *buffer, /* * Inode operation setxattr() * - * dentry->d_inode->i_mutex down + * d_inode(dentry)->i_mutex down */ int reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, @@ -794,7 +794,7 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); - if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) + if (!handler || get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1) return -EOPNOTSUPP; return handler->set(dentry, name, value, size, flags, handler->flags); @@ -803,7 +803,7 @@ reiserfs_setxattr(struct dentry *dentry, const char *name, const void *value, /* * Inode operation removexattr() * - * dentry->d_inode->i_mutex down + * d_inode(dentry)->i_mutex down */ int reiserfs_removexattr(struct dentry *dentry, const char *name) { @@ -811,7 +811,7 @@ int reiserfs_removexattr(struct dentry *dentry, const char *name) handler = find_xattr_handler_prefix(dentry->d_sb->s_xattr, name); - if (!handler || get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) + if (!handler || get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1) return -EOPNOTSUPP; return handler->set(dentry, name, NULL, 0, XATTR_REPLACE, handler->flags); @@ -875,14 +875,14 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size) .size = buffer ? size : 0, }; - if (!dentry->d_inode) + if (d_really_is_negative(dentry)) return -EINVAL; if (!dentry->d_sb->s_xattr || - get_inode_sd_version(dentry->d_inode) == STAT_DATA_V1) + get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1) return -EOPNOTSUPP; - dir = open_xa_dir(dentry->d_inode, XATTR_REPLACE); + dir = open_xa_dir(d_inode(dentry), XATTR_REPLACE); if (IS_ERR(dir)) { err = PTR_ERR(dir); if (err == -ENODATA) @@ -890,9 +890,9 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size) goto out; } - mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR); - err = reiserfs_readdir_inode(dir->d_inode, &buf.ctx); - mutex_unlock(&dir->d_inode->i_mutex); + mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_XATTR); + err = reiserfs_readdir_inode(d_inode(dir), &buf.ctx); + mutex_unlock(&d_inode(dir)->i_mutex); if (!err) err = buf.pos; @@ -905,12 +905,12 @@ out: static int create_privroot(struct dentry *dentry) { int err; - struct inode *inode = dentry->d_parent->d_inode; + struct inode *inode = d_inode(dentry->d_parent); WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex)); err = xattr_mkdir(inode, dentry, 0700); - if (err || !dentry->d_inode) { + if (err || d_really_is_negative(dentry)) { reiserfs_warning(dentry->d_sb, "jdm-20006", "xattrs/ACLs enabled and couldn't " "find/create .reiserfs_priv. 
" @@ -918,7 +918,7 @@ static int create_privroot(struct dentry *dentry) return -EOPNOTSUPP; } - dentry->d_inode->i_flags |= S_PRIVATE; + d_inode(dentry)->i_flags |= S_PRIVATE; reiserfs_info(dentry->d_sb, "Created %s - reserved for xattr " "storage.\n", PRIVROOT_NAME); @@ -997,17 +997,17 @@ int reiserfs_lookup_privroot(struct super_block *s) int err = 0; /* If we don't have the privroot located yet - go find it */ - mutex_lock(&s->s_root->d_inode->i_mutex); + mutex_lock(&d_inode(s->s_root)->i_mutex); dentry = lookup_one_len(PRIVROOT_NAME, s->s_root, strlen(PRIVROOT_NAME)); if (!IS_ERR(dentry)) { REISERFS_SB(s)->priv_root = dentry; d_set_d_op(dentry, &xattr_lookup_poison_ops); - if (dentry->d_inode) - dentry->d_inode->i_flags |= S_PRIVATE; + if (d_really_is_positive(dentry)) + d_inode(dentry)->i_flags |= S_PRIVATE; } else err = PTR_ERR(dentry); - mutex_unlock(&s->s_root->d_inode->i_mutex); + mutex_unlock(&d_inode(s->s_root)->i_mutex); return err; } @@ -1026,15 +1026,15 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags) if (err) goto error; - if (!privroot->d_inode && !(mount_flags & MS_RDONLY)) { - mutex_lock(&s->s_root->d_inode->i_mutex); + if (d_really_is_negative(privroot) && !(mount_flags & MS_RDONLY)) { + mutex_lock(&d_inode(s->s_root)->i_mutex); err = create_privroot(REISERFS_SB(s)->priv_root); - mutex_unlock(&s->s_root->d_inode->i_mutex); + mutex_unlock(&d_inode(s->s_root)->i_mutex); } - if (privroot->d_inode) { + if (d_really_is_positive(privroot)) { s->s_xattr = reiserfs_xattr_handlers; - mutex_lock(&privroot->d_inode->i_mutex); + mutex_lock(&d_inode(privroot)->i_mutex); if (!REISERFS_SB(s)->xattr_root) { struct dentry *dentry; @@ -1045,7 +1045,7 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags) else err = PTR_ERR(dentry); } - mutex_unlock(&privroot->d_inode->i_mutex); + mutex_unlock(&d_inode(privroot)->i_mutex); } error: diff --git a/fs/reiserfs/xattr.h b/fs/reiserfs/xattr.h index f620e9678dd5..15dde6262c00 100644 --- a/fs/reiserfs/xattr.h +++ b/fs/reiserfs/xattr.h @@ -78,7 +78,7 @@ static inline size_t reiserfs_xattr_jcreate_nblocks(struct inode *inode) if ((REISERFS_I(inode)->i_flags & i_has_xattr_dir) == 0) { nblocks += JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb); - if (!REISERFS_SB(inode->i_sb)->xattr_root->d_inode) + if (d_really_is_negative(REISERFS_SB(inode->i_sb)->xattr_root)) nblocks += JOURNAL_BLOCKS_PER_OBJECT(inode->i_sb); } diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index e7f8939a4cb5..9a3b0616f283 100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c @@ -15,10 +15,10 @@ security_get(struct dentry *dentry, const char *name, void *buffer, size_t size, if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) return -EINVAL; - if (IS_PRIVATE(dentry->d_inode)) + if (IS_PRIVATE(d_inode(dentry))) return -EPERM; - return reiserfs_xattr_get(dentry->d_inode, name, buffer, size); + return reiserfs_xattr_get(d_inode(dentry), name, buffer, size); } static int @@ -28,10 +28,10 @@ security_set(struct dentry *dentry, const char *name, const void *buffer, if (strlen(name) < sizeof(XATTR_SECURITY_PREFIX)) return -EINVAL; - if (IS_PRIVATE(dentry->d_inode)) + if (IS_PRIVATE(d_inode(dentry))) return -EPERM; - return reiserfs_xattr_set(dentry->d_inode, name, buffer, size, flags); + return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags); } static size_t security_list(struct dentry *dentry, char *list, size_t list_len, @@ -39,7 +39,7 @@ static size_t security_list(struct dentry *dentry, char *list, 
size_t list_len, { const size_t len = namelen + 1; - if (IS_PRIVATE(dentry->d_inode)) + if (IS_PRIVATE(d_inode(dentry))) return 0; if (list && len <= list_len) { diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c index 5eeb0c48ba46..e4f1343714e0 100644 --- a/fs/reiserfs/xattr_trusted.c +++ b/fs/reiserfs/xattr_trusted.c @@ -14,10 +14,10 @@ trusted_get(struct dentry *dentry, const char *name, void *buffer, size_t size, if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) return -EINVAL; - if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(dentry->d_inode)) + if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(d_inode(dentry))) return -EPERM; - return reiserfs_xattr_get(dentry->d_inode, name, buffer, size); + return reiserfs_xattr_get(d_inode(dentry), name, buffer, size); } static int @@ -27,10 +27,10 @@ trusted_set(struct dentry *dentry, const char *name, const void *buffer, if (strlen(name) < sizeof(XATTR_TRUSTED_PREFIX)) return -EINVAL; - if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(dentry->d_inode)) + if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(d_inode(dentry))) return -EPERM; - return reiserfs_xattr_set(dentry->d_inode, name, buffer, size, flags); + return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags); } static size_t trusted_list(struct dentry *dentry, char *list, size_t list_size, @@ -38,7 +38,7 @@ static size_t trusted_list(struct dentry *dentry, char *list, size_t list_size, { const size_t len = name_len + 1; - if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(dentry->d_inode)) + if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(d_inode(dentry))) return 0; if (list && len <= list_size) { diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c index e50eab046471..d0b08d3e5689 100644 --- a/fs/reiserfs/xattr_user.c +++ b/fs/reiserfs/xattr_user.c @@ -15,7 +15,7 @@ user_get(struct dentry *dentry, const char *name, void *buffer, size_t size, return -EINVAL; if (!reiserfs_xattrs_user(dentry->d_sb)) return -EOPNOTSUPP; - return reiserfs_xattr_get(dentry->d_inode, name, buffer, size); + return reiserfs_xattr_get(d_inode(dentry), name, buffer, size); } static int @@ -27,7 +27,7 @@ user_set(struct dentry *dentry, const char *name, const void *buffer, if (!reiserfs_xattrs_user(dentry->d_sb)) return -EOPNOTSUPP; - return reiserfs_xattr_set(dentry->d_inode, name, buffer, size, flags); + return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags); } static size_t user_list(struct dentry *dentry, char *list, size_t list_size, diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c index 7da9e2153953..1118a0dc6b45 100644 --- a/fs/romfs/mmap-nommu.c +++ b/fs/romfs/mmap-nommu.c @@ -81,7 +81,6 @@ static unsigned romfs_mmap_capabilities(struct file *file) const struct file_operations romfs_ro_fops = { .llseek = generic_file_llseek, - .read = new_sync_read, .read_iter = generic_file_read_iter, .splice_read = generic_file_splice_read, .mmap = romfs_mmap, diff --git a/fs/select.c b/fs/select.c index f684c750e08a..015547330e88 100644 --- a/fs/select.c +++ b/fs/select.c @@ -189,7 +189,7 @@ static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) * doesn't imply write barrier and the users expect write * barrier semantics on wakeup functions. The following * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up() - * and is paired with set_mb() in poll_schedule_timeout. + * and is paired with smp_store_mb() in poll_schedule_timeout. 
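The set_mb() to smp_store_mb() rename in these fs/select.c comments tracks the barrier API cleanup happening elsewhere this cycle; the primitive's contract is a plain store followed by a full SMP barrier. A sketch of the intended semantics (the per-architecture definitions in asm/barrier.h may fuse the two operations):

#define smp_store_mb(var, value)		\
	do {					\
		WRITE_ONCE(var, value);		\
		smp_mb();	/* order the store against later accesses */ \
	} while (0)
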
*/ smp_wmb(); pwq->triggered = 1; @@ -244,7 +244,7 @@ int poll_schedule_timeout(struct poll_wqueues *pwq, int state, /* * Prepare for the next iteration. * - * The following set_mb() serves two purposes. First, it's + * The following smp_store_mb() serves two purposes. First, it's * the counterpart rmb of the wmb in pollwake() such that data * written before wake up is always visible after wake up. * Second, the full barrier guarantees that triggered clearing @@ -252,7 +252,7 @@ int poll_schedule_timeout(struct poll_wqueues *pwq, int state, * this problem doesn't exist for the first iteration as * add_wait_queue() has full barrier semantics. */ - set_mb(pwq->triggered, 0); + smp_store_mb(pwq->triggered, 0); return rc; } diff --git a/fs/seq_file.c b/fs/seq_file.c index 555f82155be8..ce9e39fd5daf 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -48,18 +48,21 @@ static void *seq_buf_alloc(unsigned long size) * ERR_PTR(error). In the end of sequence they return %NULL. ->show() * returns 0 in case of success and negative number in case of error. * Returning SEQ_SKIP means "discard this element and move on". + * Note: seq_open() will allocate a struct seq_file and store its + * pointer in @file->private_data. This pointer should not be modified. */ int seq_open(struct file *file, const struct seq_operations *op) { - struct seq_file *p = file->private_data; + struct seq_file *p; + + WARN_ON(file->private_data); + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return -ENOMEM; + + file->private_data = p; - if (!p) { - p = kmalloc(sizeof(*p), GFP_KERNEL); - if (!p) - return -ENOMEM; - file->private_data = p; - } - memset(p, 0, sizeof(*p)); mutex_init(&p->lock); p->op = op; #ifdef CONFIG_USER_NS @@ -487,6 +490,20 @@ int seq_path(struct seq_file *m, const struct path *path, const char *esc) } EXPORT_SYMBOL(seq_path); +/** + * seq_file_path - seq_file interface to print a pathname of a file + * @m: the seq_file handle + * @file: the struct file to print + * @esc: set of characters to escape in the output + * + * return the absolute path to the file. + */ +int seq_file_path(struct seq_file *m, struct file *file, const char *esc) +{ + return seq_path(m, &file->f_path, esc); +} +EXPORT_SYMBOL(seq_file_path); + /* * Same as seq_path, but relative to supplied root. */ @@ -538,6 +555,7 @@ int seq_dentry(struct seq_file *m, struct dentry *dentry, const char *esc) return res; } +EXPORT_SYMBOL(seq_dentry); static void *single_start(struct seq_file *p, loff_t *pos) { diff --git a/fs/signalfd.c b/fs/signalfd.c index 7e412ad74836..270221fcef42 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -121,8 +121,9 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo, * Other callers might not initialize the si_lsb field, * so check explicitly for the right codes here. 
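The extra si_signo test added below matters because si_code values are only meaningful relative to their signal: BUS_MCEERR_AR and BUS_MCEERR_AO are small integers that other signals reuse for unrelated codes, so matching on si_code alone could copy a bogus ssi_addr_lsb for a non-SIGBUS siginfo. The predicate, pulled out here as a hypothetical helper for clarity:

/* only a SIGBUS machine-check siginfo carries a valid si_addr_lsb */
static inline bool siginfo_has_addr_lsb(const struct siginfo *kinfo)
{
	return kinfo->si_signo == SIGBUS &&
	       (kinfo->si_code == BUS_MCEERR_AR ||
		kinfo->si_code == BUS_MCEERR_AO);
}
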
*/ - if (kinfo->si_code == BUS_MCEERR_AR || - kinfo->si_code == BUS_MCEERR_AO) + if (kinfo->si_signo == SIGBUS && + (kinfo->si_code == BUS_MCEERR_AR || + kinfo->si_code == BUS_MCEERR_AO)) err |= __put_user((short) kinfo->si_addr_lsb, &uinfo->ssi_addr_lsb); #endif diff --git a/fs/splice.c b/fs/splice.c index 7968da96bebb..5fc1e50a7f30 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -32,7 +32,6 @@ #include <linux/gfp.h> #include <linux/socket.h> #include <linux/compat.h> -#include <linux/aio.h> #include "internal.h" /* @@ -262,6 +261,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, return ret; } +EXPORT_SYMBOL_GPL(splice_to_pipe); void spd_release_page(struct splice_pipe_desc *spd, unsigned int i) { @@ -360,7 +360,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, break; error = add_to_page_cache_lru(page, mapping, index, - GFP_KERNEL); + GFP_KERNEL & mapping_gfp_mask(mapping)); if (unlikely(error)) { page_cache_release(page); if (error == -EEXIST) @@ -524,6 +524,9 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, loff_t isize, left; int ret; + if (IS_DAX(in->f_mapping->host)) + return default_file_splice_read(in, ppos, pipe, len, flags); + isize = i_size_read(in->f_mapping->host); if (unlikely(*ppos >= isize)) return 0; @@ -1159,7 +1162,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, long ret, bytes; umode_t i_mode; size_t len; - int i, flags; + int i, flags, more; /* * We require the input being a regular file, as we don't want to @@ -1202,6 +1205,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, * Don't block on output, we have to drain the direct pipe. */ sd->flags &= ~SPLICE_F_NONBLOCK; + more = sd->flags & SPLICE_F_MORE; while (len) { size_t read_len; @@ -1215,6 +1219,15 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, sd->total_len = read_len; /* + * If more data is pending, set SPLICE_F_MORE. + * If this is the last data and SPLICE_F_MORE was not set + * initially, clear it. + */ + if (read_len < len) + sd->flags |= SPLICE_F_MORE; + else if (!more) + sd->flags &= ~SPLICE_F_MORE; + /* + * NOTE: nonblocking mode only applies to the input.
We * must not do the output in nonblocking mode as then we * could get stuck data in the internal pipe: @@ -1534,34 +1547,29 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov, struct iovec iovstack[UIO_FASTIOV]; struct iovec *iov = iovstack; struct iov_iter iter; - ssize_t count; pipe = get_pipe_info(file); if (!pipe) return -EBADF; - ret = rw_copy_check_uvector(READ, uiov, nr_segs, - ARRAY_SIZE(iovstack), iovstack, &iov); - if (ret <= 0) - goto out; - - count = ret; - iov_iter_init(&iter, READ, iov, nr_segs, count); + ret = import_iovec(READ, uiov, nr_segs, + ARRAY_SIZE(iovstack), &iov, &iter); + if (ret < 0) + return ret; + sd.total_len = iov_iter_count(&iter); sd.len = 0; - sd.total_len = count; sd.flags = flags; sd.u.data = &iter; sd.pos = 0; - pipe_lock(pipe); - ret = __splice_from_pipe(pipe, &sd, pipe_to_user); - pipe_unlock(pipe); - -out: - if (iov != iovstack) - kfree(iov); + if (sd.total_len) { + pipe_lock(pipe); + ret = __splice_from_pipe(pipe, &sd, pipe_to_user); + pipe_unlock(pipe); + } + kfree(iov); return ret; } diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c index 5e1101ff276f..8073b6532cf0 100644 --- a/fs/squashfs/export.c +++ b/fs/squashfs/export.c @@ -110,7 +110,7 @@ static struct dentry *squashfs_fh_to_parent(struct super_block *sb, static struct dentry *squashfs_get_parent(struct dentry *child) { - struct inode *inode = child->d_inode; + struct inode *inode = d_inode(child); unsigned int parent_ino = squashfs_i(inode)->parent; return squashfs_export_iget(inode->i_sb, parent_ino); diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h index 73588e7700ed..d09fcd6fb85d 100644 --- a/fs/squashfs/squashfs_fs_i.h +++ b/fs/squashfs/squashfs_fs_i.h @@ -49,6 +49,6 @@ struct squashfs_inode_info { static inline struct squashfs_inode_info *squashfs_i(struct inode *inode) { - return list_entry(inode, struct squashfs_inode_info, vfs_inode); + return container_of(inode, struct squashfs_inode_info, vfs_inode); } #endif diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c index 92fcde7b4d61..e5e0ddf5b143 100644 --- a/fs/squashfs/xattr.c +++ b/fs/squashfs/xattr.c @@ -39,7 +39,7 @@ static const struct xattr_handler *squashfs_xattr_handler(int); ssize_t squashfs_listxattr(struct dentry *d, char *buffer, size_t buffer_size) { - struct inode *inode = d->d_inode; + struct inode *inode = d_inode(d); struct super_block *sb = inode->i_sb; struct squashfs_sb_info *msblk = sb->s_fs_info; u64 start = SQUASHFS_XATTR_BLK(squashfs_i(inode)->xattr) @@ -229,7 +229,7 @@ static int squashfs_user_get(struct dentry *d, const char *name, void *buffer, if (name[0] == '\0') return -EINVAL; - return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_USER, name, + return squashfs_xattr_get(d_inode(d), SQUASHFS_XATTR_USER, name, buffer, size); } @@ -259,7 +259,7 @@ static int squashfs_trusted_get(struct dentry *d, const char *name, if (name[0] == '\0') return -EINVAL; - return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_TRUSTED, name, + return squashfs_xattr_get(d_inode(d), SQUASHFS_XATTR_TRUSTED, name, buffer, size); } @@ -286,7 +286,7 @@ static int squashfs_security_get(struct dentry *d, const char *name, if (name[0] == '\0') return -EINVAL; - return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_SECURITY, name, + return squashfs_xattr_get(d_inode(d), SQUASHFS_XATTR_SECURITY, name, buffer, size); } diff --git a/fs/stat.c b/fs/stat.c index ae0c3cef9927..cccc1aab9a8b 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -51,7 +51,7 @@ EXPORT_SYMBOL(generic_fillattr); */ 
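The vmsplice_to_user() rework above is the same conversion applied to do_readv_writev() and its compat twin earlier in this file set: import_iovec() folds rw_copy_check_uvector() plus iov_iter_init() into one call, and (as of this series) it stores NULL through the iovec pointer when the on-stack array sufficed, which is why callers can drop the old "if (iov != iovstack)" guard and call kfree() unconditionally. The resulting caller pattern, roughly:

struct iovec iovstack[UIO_FASTIOV];
struct iovec *iov = iovstack;
struct iov_iter iter;
ssize_t ret;

ret = import_iovec(READ, uvector, nr_segs, ARRAY_SIZE(iovstack),
		   &iov, &iter);
if (ret < 0)
	return ret;	/* import_iovec() already freed anything it allocated */

/* ... consume the iterator: iov_iter_count(&iter), copy_to_iter(), ... */

kfree(iov);		/* NULL when iovstack was used, so always safe */
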
int vfs_getattr_nosec(struct path *path, struct kstat *stat) { - struct inode *inode = path->dentry->d_inode; + struct inode *inode = d_backing_inode(path->dentry); if (inode->i_op->getattr) return inode->i_op->getattr(path->mnt, path->dentry, stat); @@ -66,7 +66,7 @@ int vfs_getattr(struct path *path, struct kstat *stat) { int retval; - retval = security_inode_getattr(path->mnt, path->dentry); + retval = security_inode_getattr(path); if (retval) return retval; return vfs_getattr_nosec(path, stat); @@ -326,7 +326,7 @@ SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname, retry: error = user_path_at_empty(dfd, pathname, lookup_flags, &path, &empty); if (!error) { - struct inode *inode = path.dentry->d_inode; + struct inode *inode = d_backing_inode(path.dentry); error = empty ? -ENOENT : -EINVAL; if (inode->i_op->readlink) { diff --git a/fs/super.c b/fs/super.c index 2b7dc90ccdbb..b61372354f2b 100644 --- a/fs/super.c +++ b/fs/super.c @@ -224,7 +224,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) s->s_maxbytes = MAX_NON_LFS; s->s_op = &default_op; s->s_time_gran = 1000000000; - s->cleancache_poolid = -1; + s->cleancache_poolid = CLEANCACHE_NO_POOL; s->s_shrink.seeks = DEFAULT_SEEKS; s->s_shrink.scan_objects = super_cache_scan; @@ -842,7 +842,7 @@ int get_anon_bdev(dev_t *p) else if (error) return -EAGAIN; - if (dev == (1 << MINORBITS)) { + if (dev >= (1 << MINORBITS)) { spin_lock(&unnamed_dev_lock); ida_remove(&unnamed_dev_ida, dev); if (unnamed_dev_start > dev) diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 0b45ff42f374..94374e435025 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -121,3 +121,37 @@ int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, return kernfs_rename_ns(kn, new_parent, kn->name, new_ns); } + +/** + * sysfs_create_mount_point - create an always empty directory + * @parent_kobj: kobject that will contain this always empty directory + * @name: The name of the always empty directory to add + */ +int sysfs_create_mount_point(struct kobject *parent_kobj, const char *name) +{ + struct kernfs_node *kn, *parent = parent_kobj->sd; + + kn = kernfs_create_empty_dir(parent, name); + if (IS_ERR(kn)) { + if (PTR_ERR(kn) == -EEXIST) + sysfs_warn_dup(parent, name); + return PTR_ERR(kn); + } + + return 0; +} +EXPORT_SYMBOL_GPL(sysfs_create_mount_point); + +/** + * sysfs_remove_mount_point - remove an always empty directory. 
+ * @parent_kobj: kobject that will contain this always empty directory + * @name: The name of the always empty directory to remove + * + */ +void sysfs_remove_mount_point(struct kobject *parent_kobj, const char *name) +{ + struct kernfs_node *parent = parent_kobj->sd; + + kernfs_remove_by_name_ns(parent, name, NULL); +} +EXPORT_SYMBOL_GPL(sysfs_remove_mount_point); diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 7c2867b44141..6c95628ea377 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -90,7 +90,7 @@ static ssize_t sysfs_kf_bin_read(struct kernfs_open_file *of, char *buf, return 0; if (size) { - if (pos > size) + if (pos >= size) return 0; if (pos + count > size) count = size - pos; diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index 2554d8835b48..39a019936768 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -41,7 +41,7 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj, if (grp->attrs) { for (i = 0, attr = grp->attrs; *attr && !error; i++, attr++) { - umode_t mode = 0; + umode_t mode = (*attr)->mode; /* * In update mode, we're changing the permissions or @@ -55,9 +55,14 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj, if (!mode) continue; } + + WARN(mode & ~(SYSFS_PREALLOC | 0664), + "Attribute %s: Invalid permissions 0%o\n", + (*attr)->name, mode); + + mode &= SYSFS_PREALLOC | 0664; error = sysfs_add_file_mode_ns(parent, *attr, false, - (*attr)->mode | mode, - NULL); + mode, NULL); if (unlikely(error)) break; } @@ -130,7 +135,7 @@ static int internal_create_group(struct kobject *kobj, int update, * This function creates a group for the first time. It will explicitly * warn and error if any of the attribute files being created already exist. * - * Returns 0 on success or error. + * Returns 0 on success or error code on failure. */ int sysfs_create_group(struct kobject *kobj, const struct attribute_group *grp) @@ -150,7 +155,7 @@ EXPORT_SYMBOL_GPL(sysfs_create_group); * It will explicitly warn and error if any of the attribute files being * created already exist. * - * Returns 0 on success or error code from sysfs_create_group on error. + * Returns 0 on success or error code from sysfs_create_group on failure. */ int sysfs_create_groups(struct kobject *kobj, const struct attribute_group **groups) @@ -188,7 +193,7 @@ EXPORT_SYMBOL_GPL(sysfs_create_groups); * The primary use for this function is to call it after making a change * that affects group visibility. * - * Returns 0 on success or error. + * Returns 0 on success or error code on failure. 
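sysfs_create_mount_point() above gives subsystems a way to publish a permanently empty directory for another filesystem to mount over; because kernfs_create_empty_dir() refuses all file creation beneath it, such a directory stays trustworthy as a mount point. A plausible caller, sketched against the exported /sys/fs kobject (fs_kobj); the "example" name is illustrative only:

/* illustrative: reserve /sys/fs/example as a mount point */
static int example_reserve_mount_point(void)
{
	int ret;

	ret = sysfs_create_mount_point(fs_kobj, "example");
	if (ret)
		return ret;

	/* the teardown path would call:
	 * sysfs_remove_mount_point(fs_kobj, "example");
	 */
	return 0;
}
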
*/ int sysfs_update_group(struct kobject *kobj, const struct attribute_group *grp) diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 8a49486bf30c..1c6ac6fcee9f 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -31,9 +31,6 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, bool new_sb; if (!(flags & MS_KERNMOUNT)) { - if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type)) - return ERR_PTR(-EPERM); - if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET)) return ERR_PTR(-EPERM); } @@ -58,7 +55,7 @@ static struct file_system_type sysfs_fs_type = { .name = "sysfs", .mount = sysfs_mount, .kill_sb = sysfs_kill_sb, - .fs_flags = FS_USERNS_MOUNT, + .fs_flags = FS_USERNS_VISIBLE | FS_USERNS_MOUNT, }; int __init sysfs_init(void) diff --git a/fs/sysv/Makefile b/fs/sysv/Makefile index 3591f9d7a48a..7a75e70a4b61 100644 --- a/fs/sysv/Makefile +++ b/fs/sysv/Makefile @@ -5,4 +5,4 @@ obj-$(CONFIG_SYSV_FS) += sysv.o sysv-objs := ialloc.o balloc.o inode.o itree.o file.o dir.o \ - namei.o super.o symlink.o + namei.o super.o diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index d42291d08215..63c1bcb224ee 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -33,11 +33,6 @@ static inline void dir_put_page(struct page *page) page_cache_release(page); } -static inline unsigned long dir_pages(struct inode *inode) -{ - return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; -} - static int dir_commit_chunk(struct page *page, loff_t pos, unsigned len) { struct address_space *mapping = page->mapping; @@ -132,7 +127,7 @@ struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct page **res_ { const char * name = dentry->d_name.name; int namelen = dentry->d_name.len; - struct inode * dir = dentry->d_parent->d_inode; + struct inode * dir = d_inode(dentry->d_parent); unsigned long start, n; unsigned long npages = dir_pages(dir); struct page *page = NULL; @@ -176,7 +171,7 @@ found: int sysv_add_link(struct dentry *dentry, struct inode *inode) { - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); const char * name = dentry->d_name.name; int namelen = dentry->d_name.len; struct page *page = NULL; diff --git a/fs/sysv/file.c b/fs/sysv/file.c index b00811c75b24..82ddc09061e2 100644 --- a/fs/sysv/file.c +++ b/fs/sysv/file.c @@ -21,9 +21,7 @@ */ const struct file_operations sysv_file_operations = { .llseek = generic_file_llseek, - .read = new_sync_read, .read_iter = generic_file_read_iter, - .write = new_sync_write, .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .fsync = generic_file_fsync, @@ -32,7 +30,7 @@ const struct file_operations sysv_file_operations = { static int sysv_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int error; error = inode_change_ok(inode, attr); diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 88956309cc86..590ad9206e3f 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -166,8 +166,9 @@ void sysv_set_inode(struct inode *inode, dev_t rdev) inode->i_op = &sysv_symlink_inode_operations; inode->i_mapping->a_ops = &sysv_aops; } else { - inode->i_op = &sysv_fast_symlink_inode_operations; - nd_terminate_link(SYSV_I(inode)->i_data, inode->i_size, + inode->i_op = &simple_symlink_inode_operations; + inode->i_link = (char *)SYSV_I(inode)->i_data; + nd_terminate_link(inode->i_link, inode->i_size, sizeof(SYSV_I(inode)->i_data) - 1); } } else diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c index 66bc316927e8..2fde40acf024 
100644 --- a/fs/sysv/itree.c +++ b/fs/sysv/itree.c @@ -443,7 +443,7 @@ static unsigned sysv_nblocks(struct super_block *s, loff_t size) int sysv_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct super_block *s = dentry->d_sb; - generic_fillattr(dentry->d_inode, stat); + generic_fillattr(d_inode(dentry), stat); stat->blocks = (s->s_blocksize / 512) * sysv_nblocks(s, stat->size); stat->blksize = s->s_blocksize; return 0; diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c index 731b2bbcaab3..11e83ed0b4bf 100644 --- a/fs/sysv/namei.c +++ b/fs/sysv/namei.c @@ -118,7 +118,7 @@ out_fail: static int sysv_link(struct dentry * old_dentry, struct inode * dir, struct dentry * dentry) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); inode->i_ctime = CURRENT_TIME_SEC; inode_inc_link_count(inode); @@ -166,7 +166,7 @@ out_dir: static int sysv_unlink(struct inode * dir, struct dentry * dentry) { - struct inode * inode = dentry->d_inode; + struct inode * inode = d_inode(dentry); struct page * page; struct sysv_dir_entry * de; int err = -ENOENT; @@ -187,7 +187,7 @@ out: static int sysv_rmdir(struct inode * dir, struct dentry * dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int err = -ENOTEMPTY; if (sysv_empty_dir(inode)) { @@ -208,8 +208,8 @@ static int sysv_rmdir(struct inode * dir, struct dentry * dentry) static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry, struct inode * new_dir, struct dentry * new_dentry) { - struct inode * old_inode = old_dentry->d_inode; - struct inode * new_inode = new_dentry->d_inode; + struct inode * old_inode = d_inode(old_dentry); + struct inode * new_inode = d_inode(new_dentry); struct page * dir_page = NULL; struct sysv_dir_entry * dir_de = NULL; struct page * old_page; diff --git a/fs/sysv/symlink.c b/fs/sysv/symlink.c deleted file mode 100644 index 00d2f8a43e4e..000000000000 --- a/fs/sysv/symlink.c +++ /dev/null @@ -1,20 +0,0 @@ -/* - * linux/fs/sysv/symlink.c - * - * Handling of System V filesystem fast symlinks extensions. 
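The fs/sysv/symlink.c removal just below is enabled by the fs/sysv/inode.c hunk above: the fast-symlink body is now cached in the new inode->i_link field, so the generic helper can follow it without calling back into the filesystem. Roughly what fs/libfs.c supplies as simple_symlink_inode_operations at this point:

static void *simple_follow_link(struct dentry *dentry, struct nameidata *nd)
{
	nd_set_link(nd, d_inode(dentry)->i_link);	/* body lives in the inode */
	return NULL;
}

const struct inode_operations simple_symlink_inode_operations = {
	.follow_link	= simple_follow_link,
	.readlink	= generic_readlink,
};
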
- * Aug 2001, Christoph Hellwig (hch@infradead.org) - */ - -#include "sysv.h" -#include <linux/namei.h> - -static void *sysv_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - nd_set_link(nd, (char *)SYSV_I(dentry->d_inode)->i_data); - return NULL; -} - -const struct inode_operations sysv_fast_symlink_inode_operations = { - .readlink = generic_readlink, - .follow_link = sysv_follow_link, -}; diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h index 69d488986cce..6c212288adcb 100644 --- a/fs/sysv/sysv.h +++ b/fs/sysv/sysv.h @@ -73,7 +73,7 @@ struct sysv_inode_info { static inline struct sysv_inode_info *SYSV_I(struct inode *inode) { - return list_entry(inode, struct sysv_inode_info, vfs_inode); + return container_of(inode, struct sysv_inode_info, vfs_inode); } static inline struct sysv_sb_info *SYSV_SB(struct super_block *sb) @@ -161,7 +161,6 @@ extern ino_t sysv_inode_by_name(struct dentry *); extern const struct inode_operations sysv_file_inode_operations; extern const struct inode_operations sysv_dir_inode_operations; -extern const struct inode_operations sysv_fast_symlink_inode_operations; extern const struct file_operations sysv_file_operations; extern const struct file_operations sysv_dir_operations; extern const struct address_space_operations sysv_aops; diff --git a/fs/tracefs/Makefile b/fs/tracefs/Makefile new file mode 100644 index 000000000000..82fa35b656c4 --- /dev/null +++ b/fs/tracefs/Makefile @@ -0,0 +1,4 @@ +tracefs-objs := inode.o + +obj-$(CONFIG_TRACING) += tracefs.o + diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c new file mode 100644 index 000000000000..cbc8d5d2755a --- /dev/null +++ b/fs/tracefs/inode.c @@ -0,0 +1,643 @@ +/* + * inode.c - part of tracefs, a pseudo file system for activating tracing + * + * Based on debugfs by: Greg Kroah-Hartman <greg@kroah.com> + * + * Copyright (C) 2014 Red Hat Inc, author: Steven Rostedt <srostedt@redhat.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * tracefs is the file system that is used by the tracing infrastructure. 
+ * + */ + +#include <linux/module.h> +#include <linux/fs.h> +#include <linux/mount.h> +#include <linux/kobject.h> +#include <linux/namei.h> +#include <linux/tracefs.h> +#include <linux/fsnotify.h> +#include <linux/seq_file.h> +#include <linux/parser.h> +#include <linux/magic.h> +#include <linux/slab.h> + +#define TRACEFS_DEFAULT_MODE 0700 + +static struct vfsmount *tracefs_mount; +static int tracefs_mount_count; +static bool tracefs_registered; + +static ssize_t default_read_file(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return 0; +} + +static ssize_t default_write_file(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + return count; +} + +static const struct file_operations tracefs_file_operations = { + .read = default_read_file, + .write = default_write_file, + .open = simple_open, + .llseek = noop_llseek, +}; + +static struct tracefs_dir_ops { + int (*mkdir)(const char *name); + int (*rmdir)(const char *name); +} tracefs_ops; + +static char *get_dname(struct dentry *dentry) +{ + const char *dname; + char *name; + int len = dentry->d_name.len; + + dname = dentry->d_name.name; + name = kmalloc(len + 1, GFP_KERNEL); + if (!name) + return NULL; + memcpy(name, dname, len); + name[len] = 0; + return name; +} + +static int tracefs_syscall_mkdir(struct inode *inode, struct dentry *dentry, umode_t mode) +{ + char *name; + int ret; + + name = get_dname(dentry); + if (!name) + return -ENOMEM; + + /* + * The mkdir call can call the generic functions that create + * the files within the tracefs system. It is up to the individual + * mkdir routine to handle races. + */ + mutex_unlock(&inode->i_mutex); + ret = tracefs_ops.mkdir(name); + mutex_lock(&inode->i_mutex); + + kfree(name); + + return ret; +} + +static int tracefs_syscall_rmdir(struct inode *inode, struct dentry *dentry) +{ + char *name; + int ret; + + name = get_dname(dentry); + if (!name) + return -ENOMEM; + + /* + * The rmdir call can call the generic functions that create + * the files within the tracefs system. It is up to the individual + * rmdir routine to handle races. + * This time we need to unlock not only the parent (inode) but + * also the directory that is being deleted. 
+ */ + mutex_unlock(&inode->i_mutex); + mutex_unlock(&dentry->d_inode->i_mutex); + + ret = tracefs_ops.rmdir(name); + + mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); + mutex_lock(&dentry->d_inode->i_mutex); + + kfree(name); + + return ret; +} + +static const struct inode_operations tracefs_dir_inode_operations = { + .lookup = simple_lookup, + .mkdir = tracefs_syscall_mkdir, + .rmdir = tracefs_syscall_rmdir, +}; + +static struct inode *tracefs_get_inode(struct super_block *sb) +{ + struct inode *inode = new_inode(sb); + if (inode) { + inode->i_ino = get_next_ino(); + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + } + return inode; +} + +struct tracefs_mount_opts { + kuid_t uid; + kgid_t gid; + umode_t mode; +}; + +enum { + Opt_uid, + Opt_gid, + Opt_mode, + Opt_err +}; + +static const match_table_t tokens = { + {Opt_uid, "uid=%u"}, + {Opt_gid, "gid=%u"}, + {Opt_mode, "mode=%o"}, + {Opt_err, NULL} +}; + +struct tracefs_fs_info { + struct tracefs_mount_opts mount_opts; +}; + +static int tracefs_parse_options(char *data, struct tracefs_mount_opts *opts) +{ + substring_t args[MAX_OPT_ARGS]; + int option; + int token; + kuid_t uid; + kgid_t gid; + char *p; + + opts->mode = TRACEFS_DEFAULT_MODE; + + while ((p = strsep(&data, ",")) != NULL) { + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_uid: + if (match_int(&args[0], &option)) + return -EINVAL; + uid = make_kuid(current_user_ns(), option); + if (!uid_valid(uid)) + return -EINVAL; + opts->uid = uid; + break; + case Opt_gid: + if (match_int(&args[0], &option)) + return -EINVAL; + gid = make_kgid(current_user_ns(), option); + if (!gid_valid(gid)) + return -EINVAL; + opts->gid = gid; + break; + case Opt_mode: + if (match_octal(&args[0], &option)) + return -EINVAL; + opts->mode = option & S_IALLUGO; + break; + /* + * We might like to report bad mount options here; + * but traditionally tracefs has ignored all mount options + */ + } + } + + return 0; +} + +static int tracefs_apply_options(struct super_block *sb) +{ + struct tracefs_fs_info *fsi = sb->s_fs_info; + struct inode *inode = sb->s_root->d_inode; + struct tracefs_mount_opts *opts = &fsi->mount_opts; + + inode->i_mode &= ~S_IALLUGO; + inode->i_mode |= opts->mode; + + inode->i_uid = opts->uid; + inode->i_gid = opts->gid; + + return 0; +} + +static int tracefs_remount(struct super_block *sb, int *flags, char *data) +{ + int err; + struct tracefs_fs_info *fsi = sb->s_fs_info; + + sync_filesystem(sb); + err = tracefs_parse_options(data, &fsi->mount_opts); + if (err) + goto fail; + + tracefs_apply_options(sb); + +fail: + return err; +} + +static int tracefs_show_options(struct seq_file *m, struct dentry *root) +{ + struct tracefs_fs_info *fsi = root->d_sb->s_fs_info; + struct tracefs_mount_opts *opts = &fsi->mount_opts; + + if (!uid_eq(opts->uid, GLOBAL_ROOT_UID)) + seq_printf(m, ",uid=%u", + from_kuid_munged(&init_user_ns, opts->uid)); + if (!gid_eq(opts->gid, GLOBAL_ROOT_GID)) + seq_printf(m, ",gid=%u", + from_kgid_munged(&init_user_ns, opts->gid)); + if (opts->mode != TRACEFS_DEFAULT_MODE) + seq_printf(m, ",mode=%o", opts->mode); + + return 0; +} + +static const struct super_operations tracefs_super_operations = { + .statfs = simple_statfs, + .remount_fs = tracefs_remount, + .show_options = tracefs_show_options, +}; + +static int trace_fill_super(struct super_block *sb, void *data, int silent) +{ + static struct tree_descr trace_files[] = {{""}}; + struct tracefs_fs_info *fsi; + int err; + + save_mount_options(sb, data); + 
+ fsi = kzalloc(sizeof(struct tracefs_fs_info), GFP_KERNEL); + sb->s_fs_info = fsi; + if (!fsi) { + err = -ENOMEM; + goto fail; + } + + err = tracefs_parse_options(data, &fsi->mount_opts); + if (err) + goto fail; + + err = simple_fill_super(sb, TRACEFS_MAGIC, trace_files); + if (err) + goto fail; + + sb->s_op = &tracefs_super_operations; + + tracefs_apply_options(sb); + + return 0; + +fail: + kfree(fsi); + sb->s_fs_info = NULL; + return err; +} + +static struct dentry *trace_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data) +{ + return mount_single(fs_type, flags, data, trace_fill_super); +} + +static struct file_system_type trace_fs_type = { + .owner = THIS_MODULE, + .name = "tracefs", + .mount = trace_mount, + .kill_sb = kill_litter_super, +}; +MODULE_ALIAS_FS("tracefs"); + +static struct dentry *start_creating(const char *name, struct dentry *parent) +{ + struct dentry *dentry; + int error; + + pr_debug("tracefs: creating file '%s'\n",name); + + error = simple_pin_fs(&trace_fs_type, &tracefs_mount, + &tracefs_mount_count); + if (error) + return ERR_PTR(error); + + /* If the parent is not specified, we create it in the root. + * We need the root dentry to do this, which is in the super + * block. A pointer to that is in the struct vfsmount that we + * have around. + */ + if (!parent) + parent = tracefs_mount->mnt_root; + + mutex_lock(&parent->d_inode->i_mutex); + dentry = lookup_one_len(name, parent, strlen(name)); + if (!IS_ERR(dentry) && dentry->d_inode) { + dput(dentry); + dentry = ERR_PTR(-EEXIST); + } + if (IS_ERR(dentry)) + mutex_unlock(&parent->d_inode->i_mutex); + return dentry; +} + +static struct dentry *failed_creating(struct dentry *dentry) +{ + mutex_unlock(&dentry->d_parent->d_inode->i_mutex); + dput(dentry); + simple_release_fs(&tracefs_mount, &tracefs_mount_count); + return NULL; +} + +static struct dentry *end_creating(struct dentry *dentry) +{ + mutex_unlock(&dentry->d_parent->d_inode->i_mutex); + return dentry; +} + +/** + * tracefs_create_file - create a file in the tracefs filesystem + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have. + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is NULL, then the + * file will be created in the root of the tracefs filesystem. + * @data: a pointer to something that the caller will want to get to later + * on. The inode.i_private pointer will point to this value on + * the open() call. + * @fops: a pointer to a struct file_operations that should be used for + * this file. + * + * This is the basic "create a file" function for tracefs. It allows for a + * wide range of flexibility in creating a file, or a directory (if you want + * to create a directory, the tracefs_create_dir() function is + * recommended to be used instead.) + * + * This function will return a pointer to a dentry if it succeeds. This + * pointer must be passed to the tracefs_remove() function when the file is + * to be removed (no automatic cleanup happens if your module is unloaded, + * you are responsible here.) If an error occurs, %NULL will be returned. + * + * If tracefs is not enabled in the kernel, the value -%ENODEV will be + * returned. 
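+ *
+ * A minimal usage sketch (the names here are hypothetical, not part of
+ * this patch):
+ *
+ *	static const struct file_operations my_fops = {
+ *		.read = my_read,
+ *	};
+ *
+ *	dentry = tracefs_create_file("stats", 0444, my_dir, my_data,
+ *				     &my_fops);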
+ */
+struct dentry *tracefs_create_file(const char *name, umode_t mode,
+				   struct dentry *parent, void *data,
+				   const struct file_operations *fops)
+{
+	struct dentry *dentry;
+	struct inode *inode;
+
+	if (!(mode & S_IFMT))
+		mode |= S_IFREG;
+	BUG_ON(!S_ISREG(mode));
+	dentry = start_creating(name, parent);
+
+	if (IS_ERR(dentry))
+		return NULL;
+
+	inode = tracefs_get_inode(dentry->d_sb);
+	if (unlikely(!inode))
+		return failed_creating(dentry);
+
+	inode->i_mode = mode;
+	inode->i_fop = fops ? fops : &tracefs_file_operations;
+	inode->i_private = data;
+	d_instantiate(dentry, inode);
+	fsnotify_create(dentry->d_parent->d_inode, dentry);
+	return end_creating(dentry);
+}
+
+static struct dentry *__create_dir(const char *name, struct dentry *parent,
+				   const struct inode_operations *ops)
+{
+	struct dentry *dentry = start_creating(name, parent);
+	struct inode *inode;
+
+	if (IS_ERR(dentry))
+		return NULL;
+
+	inode = tracefs_get_inode(dentry->d_sb);
+	if (unlikely(!inode))
+		return failed_creating(dentry);
+
+	inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
+	inode->i_op = ops;
+	inode->i_fop = &simple_dir_operations;
+
+	/* directory inodes start off with i_nlink == 2 (for "." entry) */
+	inc_nlink(inode);
+	d_instantiate(dentry, inode);
+	inc_nlink(dentry->d_parent->d_inode);
+	fsnotify_mkdir(dentry->d_parent->d_inode, dentry);
+	return end_creating(dentry);
+}
+
+/**
+ * tracefs_create_dir - create a directory in the tracefs filesystem
+ * @name: a pointer to a string containing the name of the directory to
+ *        create.
+ * @parent: a pointer to the parent dentry for this file. This should be a
+ *          directory dentry if set. If this parameter is NULL, then the
+ *          directory will be created in the root of the tracefs filesystem.
+ *
+ * This function creates a directory in tracefs with the given name.
+ *
+ * This function will return a pointer to a dentry if it succeeds. This
+ * pointer must be passed to the tracefs_remove() function when the file is
+ * to be removed. If an error occurs, %NULL will be returned.
+ *
+ * If tracing is not enabled in the kernel, the value -%ENODEV will be
+ * returned.
+ */
+struct dentry *tracefs_create_dir(const char *name, struct dentry *parent)
+{
+	return __create_dir(name, parent, &simple_dir_inode_operations);
+}
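+
+/*
+ * An illustrative pairing of the two helpers above (hypothetical caller,
+ * not part of this patch):
+ *
+ *	struct dentry *dir, *file;
+ *
+ *	dir = tracefs_create_dir("my_tracer", NULL);
+ *	file = tracefs_create_file("enable", 0644, dir, my_data, &my_fops);
+ *	...
+ *	tracefs_remove(file);
+ *	tracefs_remove(dir);
+ *
+ * As the kerneldoc above notes, the returned dentries must be handed back
+ * to tracefs_remove() (or tracefs_remove_recursive()); nothing is cleaned
+ * up automatically on module unload.
+ */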
+
+/**
+ * tracefs_create_instance_dir - create the tracing instances directory
+ * @name: The name of the instances directory to create
+ * @parent: The parent directory that the instances directory will exist in
+ * @mkdir: The function to call when a mkdir is performed.
+ * @rmdir: The function to call when a rmdir is performed.
+ *
+ * Only one instances directory is allowed.
+ *
+ * The instances directory is special as it allows for mkdir and rmdir
+ * to be done by userspace. When a mkdir or rmdir is performed, the inode
+ * locks are released and the methods passed in (@mkdir and @rmdir) are
+ * called without locks and with the name of the directory being created
+ * within the instances directory.
+ *
+ * Returns the dentry of the instances directory.
+ */
+struct dentry *tracefs_create_instance_dir(const char *name, struct dentry *parent,
+					   int (*mkdir)(const char *name),
+					   int (*rmdir)(const char *name))
+{
+	struct dentry *dentry;
+
+	/* Only allow one instance of the instances directory. */
+	if (WARN_ON(tracefs_ops.mkdir || tracefs_ops.rmdir))
+		return NULL;
+
+	dentry = __create_dir(name, parent, &tracefs_dir_inode_operations);
+	if (!dentry)
+		return NULL;
+
+	tracefs_ops.mkdir = mkdir;
+	tracefs_ops.rmdir = rmdir;
+
+	return dentry;
+}
+
+static int __tracefs_remove(struct dentry *dentry, struct dentry *parent)
+{
+	int ret = 0;
+
+	if (simple_positive(dentry)) {
+		if (dentry->d_inode) {
+			dget(dentry);
+			switch (dentry->d_inode->i_mode & S_IFMT) {
+			case S_IFDIR:
+				ret = simple_rmdir(parent->d_inode, dentry);
+				break;
+			default:
+				simple_unlink(parent->d_inode, dentry);
+				break;
+			}
+			if (!ret)
+				d_delete(dentry);
+			dput(dentry);
+		}
+	}
+	return ret;
+}
+
+/**
+ * tracefs_remove - removes a file or directory from the tracefs filesystem
+ * @dentry: a pointer to the dentry of the file or directory to be
+ *          removed.
+ *
+ * This function removes a file or directory in tracefs that was previously
+ * created with a call to another tracefs function (like
+ * tracefs_create_file() or variants thereof.)
+ */
+void tracefs_remove(struct dentry *dentry)
+{
+	struct dentry *parent;
+	int ret;
+
+	if (IS_ERR_OR_NULL(dentry))
+		return;
+
+	parent = dentry->d_parent;
+	if (!parent || !parent->d_inode)
+		return;
+
+	mutex_lock(&parent->d_inode->i_mutex);
+	ret = __tracefs_remove(dentry, parent);
+	mutex_unlock(&parent->d_inode->i_mutex);
+	if (!ret)
+		simple_release_fs(&tracefs_mount, &tracefs_mount_count);
+}
+
+/**
+ * tracefs_remove_recursive - recursively removes a directory
+ * @dentry: a pointer to the dentry of the directory to be removed.
+ *
+ * This function recursively removes a directory tree in tracefs that
+ * was previously created with a call to another tracefs function
+ * (like tracefs_create_file() or variants thereof.)
+ */
+void tracefs_remove_recursive(struct dentry *dentry)
+{
+	struct dentry *child, *parent;
+
+	if (IS_ERR_OR_NULL(dentry))
+		return;
+
+	parent = dentry->d_parent;
+	if (!parent || !parent->d_inode)
+		return;
+
+	parent = dentry;
+ down:
+	mutex_lock(&parent->d_inode->i_mutex);
+ loop:
+	/*
+	 * The parent->d_subdirs is protected by the d_lock. Outside that
+	 * lock, the child can be unlinked and set to be freed which can
+	 * use the d_u.d_child as the rcu head and corrupt this list.
+	 */
+	spin_lock(&parent->d_lock);
+	list_for_each_entry(child, &parent->d_subdirs, d_child) {
+		if (!simple_positive(child))
+			continue;
+
+		/* perhaps simple_empty(child) makes more sense */
+		if (!list_empty(&child->d_subdirs)) {
+			spin_unlock(&parent->d_lock);
+			mutex_unlock(&parent->d_inode->i_mutex);
+			parent = child;
+			goto down;
+		}
+
+		spin_unlock(&parent->d_lock);
+
+		if (!__tracefs_remove(child, parent))
+			simple_release_fs(&tracefs_mount, &tracefs_mount_count);
+
+		/*
+		 * The parent->d_lock protects the child against being
+		 * unlinked from d_subdirs. When releasing the parent->d_lock
+		 * we can no longer trust that the next pointer is valid.
+		 * Restart the loop. We'll skip this one with the
+		 * simple_positive() check.
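+		 * (Progress is guaranteed here: __tracefs_remove() ends in
+		 * d_delete(), which turns the removed child negative, so the
+		 * restarted walk skips it via that same check.)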
+ */ + goto loop; + } + spin_unlock(&parent->d_lock); + + mutex_unlock(&parent->d_inode->i_mutex); + child = parent; + parent = parent->d_parent; + mutex_lock(&parent->d_inode->i_mutex); + + if (child != dentry) + /* go up */ + goto loop; + + if (!__tracefs_remove(child, parent)) + simple_release_fs(&tracefs_mount, &tracefs_mount_count); + mutex_unlock(&parent->d_inode->i_mutex); +} + +/** + * tracefs_initialized - Tells whether tracefs has been registered + */ +bool tracefs_initialized(void) +{ + return tracefs_registered; +} + +static int __init tracefs_init(void) +{ + int retval; + + retval = sysfs_create_mount_point(kernel_kobj, "tracing"); + if (retval) + return -EINVAL; + + retval = register_filesystem(&trace_fs_type); + if (!retval) + tracefs_registered = true; + + return retval; +} +core_initcall(tracefs_init); diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index eb997e9c4ab0..11a11b32a2a9 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -509,7 +509,7 @@ again: c->bi.nospace_rp = 1; smp_wmb(); } else - ubifs_err("cannot budget space, error %d", err); + ubifs_err(c, "cannot budget space, error %d", err); return err; } diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c index 26b69b2d4a45..63f56619991d 100644 --- a/fs/ubifs/commit.c +++ b/fs/ubifs/commit.c @@ -225,7 +225,7 @@ out_cancel: out_up: up_write(&c->commit_sem); out: - ubifs_err("commit failed, error %d", err); + ubifs_err(c, "commit failed, error %d", err); spin_lock(&c->cs_lock); c->cmt_state = COMMIT_BROKEN; wake_up(&c->cmt_wq); @@ -289,7 +289,7 @@ int ubifs_bg_thread(void *info) int err; struct ubifs_info *c = info; - ubifs_msg("background thread \"%s\" started, PID %d", + ubifs_msg(c, "background thread \"%s\" started, PID %d", c->bgt_name, current->pid); set_freezable(); @@ -324,7 +324,7 @@ int ubifs_bg_thread(void *info) cond_resched(); } - ubifs_msg("background thread \"%s\" stops", c->bgt_name); + ubifs_msg(c, "background thread \"%s\" stops", c->bgt_name); return 0; } @@ -712,13 +712,13 @@ out: return 0; out_dump: - ubifs_err("dumping index node (iip=%d)", i->iip); + ubifs_err(c, "dumping index node (iip=%d)", i->iip); ubifs_dump_node(c, idx); list_del(&i->list); kfree(i); if (!list_empty(&list)) { i = list_entry(list.prev, struct idx_node, list); - ubifs_err("dumping parent index node"); + ubifs_err(c, "dumping parent index node"); ubifs_dump_node(c, &i->idx); } out_free: @@ -727,7 +727,7 @@ out_free: list_del(&i->list); kfree(i); } - ubifs_err("failed, error %d", err); + ubifs_err(c, "failed, error %d", err); if (err > 0) err = -EINVAL; return err; diff --git a/fs/ubifs/compress.c b/fs/ubifs/compress.c index 2bfa0953335d..565cb56d7225 100644 --- a/fs/ubifs/compress.c +++ b/fs/ubifs/compress.c @@ -92,8 +92,8 @@ struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT]; * Note, if the input buffer was not compressed, it is copied to the output * buffer and %UBIFS_COMPR_NONE is returned in @compr_type. 
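 *
 * A usage sketch mirroring the journal-layer caller (names as in
 * ubifs_jnl_write_data(), shown here purely for illustration):
 *
 *	int compr_type = ui->compr_type;
 *	int out_len = dlen - UBIFS_DATA_NODE_SZ;
 *
 *	ubifs_compress(c, buf, len, &data->data, &out_len, &compr_type);
 *
 * On return @out_len holds the resulting length and @compr_type may have
 * been downgraded to %UBIFS_COMPR_NONE.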
*/ -void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len, - int *compr_type) +void ubifs_compress(const struct ubifs_info *c, const void *in_buf, + int in_len, void *out_buf, int *out_len, int *compr_type) { int err; struct ubifs_compressor *compr = ubifs_compressors[*compr_type]; @@ -112,9 +112,9 @@ void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len, if (compr->comp_mutex) mutex_unlock(compr->comp_mutex); if (unlikely(err)) { - ubifs_warn("cannot compress %d bytes, compressor %s, error %d, leave data uncompressed", + ubifs_warn(c, "cannot compress %d bytes, compressor %s, error %d, leave data uncompressed", in_len, compr->name, err); - goto no_compr; + goto no_compr; } /* @@ -144,21 +144,21 @@ no_compr: * The length of the uncompressed data is returned in @out_len. This functions * returns %0 on success or a negative error code on failure. */ -int ubifs_decompress(const void *in_buf, int in_len, void *out_buf, - int *out_len, int compr_type) +int ubifs_decompress(const struct ubifs_info *c, const void *in_buf, + int in_len, void *out_buf, int *out_len, int compr_type) { int err; struct ubifs_compressor *compr; if (unlikely(compr_type < 0 || compr_type >= UBIFS_COMPR_TYPES_CNT)) { - ubifs_err("invalid compression type %d", compr_type); + ubifs_err(c, "invalid compression type %d", compr_type); return -EINVAL; } compr = ubifs_compressors[compr_type]; if (unlikely(!compr->capi_name)) { - ubifs_err("%s compression is not compiled in", compr->name); + ubifs_err(c, "%s compression is not compiled in", compr->name); return -EINVAL; } @@ -175,7 +175,7 @@ int ubifs_decompress(const void *in_buf, int in_len, void *out_buf, if (compr->decomp_mutex) mutex_unlock(compr->decomp_mutex); if (err) - ubifs_err("cannot decompress %d bytes, compressor %s, error %d", + ubifs_err(c, "cannot decompress %d bytes, compressor %s, error %d", in_len, compr->name, err); return err; @@ -193,8 +193,8 @@ static int __init compr_init(struct ubifs_compressor *compr) if (compr->capi_name) { compr->cc = crypto_alloc_comp(compr->capi_name, 0, 0); if (IS_ERR(compr->cc)) { - ubifs_err("cannot initialize compressor %s, error %ld", - compr->name, PTR_ERR(compr->cc)); + pr_err("UBIFS error (pid %d): cannot initialize compressor %s, error %ld", + current->pid, compr->name, PTR_ERR(compr->cc)); return PTR_ERR(compr->cc); } } diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 4cfb3e82c56f..4c46a9865fa7 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -746,7 +746,7 @@ void ubifs_dump_lprops(struct ubifs_info *c) for (lnum = c->main_first; lnum < c->leb_cnt; lnum++) { err = ubifs_read_one_lp(c, lnum, &lp); if (err) { - ubifs_err("cannot read lprops for LEB %d", lnum); + ubifs_err(c, "cannot read lprops for LEB %d", lnum); continue; } @@ -819,13 +819,13 @@ void ubifs_dump_leb(const struct ubifs_info *c, int lnum) buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); if (!buf) { - ubifs_err("cannot allocate memory for dumping LEB %d", lnum); + ubifs_err(c, "cannot allocate memory for dumping LEB %d", lnum); return; } sleb = ubifs_scan(c, lnum, 0, buf, 0); if (IS_ERR(sleb)) { - ubifs_err("scan error %d", (int)PTR_ERR(sleb)); + ubifs_err(c, "scan error %d", (int)PTR_ERR(sleb)); goto out; } @@ -1032,7 +1032,7 @@ int dbg_check_space_info(struct ubifs_info *c) spin_unlock(&c->space_lock); if (free != d->saved_free) { - ubifs_err("free space changed from %lld to %lld", + ubifs_err(c, "free space changed from %lld to %lld", d->saved_free, free); goto out; } @@ -1040,15 +1040,15 
@@ int dbg_check_space_info(struct ubifs_info *c) return 0; out: - ubifs_msg("saved lprops statistics dump"); + ubifs_msg(c, "saved lprops statistics dump"); ubifs_dump_lstats(&d->saved_lst); - ubifs_msg("saved budgeting info dump"); + ubifs_msg(c, "saved budgeting info dump"); ubifs_dump_budg(c, &d->saved_bi); - ubifs_msg("saved idx_gc_cnt %d", d->saved_idx_gc_cnt); - ubifs_msg("current lprops statistics dump"); + ubifs_msg(c, "saved idx_gc_cnt %d", d->saved_idx_gc_cnt); + ubifs_msg(c, "current lprops statistics dump"); ubifs_get_lp_stats(c, &lst); ubifs_dump_lstats(&lst); - ubifs_msg("current budgeting info dump"); + ubifs_msg(c, "current budgeting info dump"); ubifs_dump_budg(c, &c->bi); dump_stack(); return -EINVAL; @@ -1077,9 +1077,9 @@ int dbg_check_synced_i_size(const struct ubifs_info *c, struct inode *inode) mutex_lock(&ui->ui_mutex); spin_lock(&ui->ui_lock); if (ui->ui_size != ui->synced_i_size && !ui->dirty) { - ubifs_err("ui_size is %lld, synced_i_size is %lld, but inode is clean", + ubifs_err(c, "ui_size is %lld, synced_i_size is %lld, but inode is clean", ui->ui_size, ui->synced_i_size); - ubifs_err("i_ino %lu, i_mode %#x, i_size %lld", inode->i_ino, + ubifs_err(c, "i_ino %lu, i_mode %#x, i_size %lld", inode->i_ino, inode->i_mode, i_size_read(inode)); dump_stack(); err = -EINVAL; @@ -1140,7 +1140,7 @@ int dbg_check_dir(struct ubifs_info *c, const struct inode *dir) kfree(pdent); if (i_size_read(dir) != size) { - ubifs_err("directory inode %lu has size %llu, but calculated size is %llu", + ubifs_err(c, "directory inode %lu has size %llu, but calculated size is %llu", dir->i_ino, (unsigned long long)i_size_read(dir), (unsigned long long)size); ubifs_dump_inode(c, dir); @@ -1148,7 +1148,7 @@ int dbg_check_dir(struct ubifs_info *c, const struct inode *dir) return -EINVAL; } if (dir->i_nlink != nlink) { - ubifs_err("directory inode %lu has nlink %u, but calculated nlink is %u", + ubifs_err(c, "directory inode %lu has nlink %u, but calculated nlink is %u", dir->i_ino, dir->i_nlink, nlink); ubifs_dump_inode(c, dir); dump_stack(); @@ -1207,10 +1207,10 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1, err = 1; key_read(c, &dent1->key, &key); if (keys_cmp(c, &zbr1->key, &key)) { - ubifs_err("1st entry at %d:%d has key %s", zbr1->lnum, + ubifs_err(c, "1st entry at %d:%d has key %s", zbr1->lnum, zbr1->offs, dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN)); - ubifs_err("but it should have key %s according to tnc", + ubifs_err(c, "but it should have key %s according to tnc", dbg_snprintf_key(c, &zbr1->key, key_buf, DBG_KEY_BUF_LEN)); ubifs_dump_node(c, dent1); @@ -1219,10 +1219,10 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1, key_read(c, &dent2->key, &key); if (keys_cmp(c, &zbr2->key, &key)) { - ubifs_err("2nd entry at %d:%d has key %s", zbr1->lnum, + ubifs_err(c, "2nd entry at %d:%d has key %s", zbr1->lnum, zbr1->offs, dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN)); - ubifs_err("but it should have key %s according to tnc", + ubifs_err(c, "but it should have key %s according to tnc", dbg_snprintf_key(c, &zbr2->key, key_buf, DBG_KEY_BUF_LEN)); ubifs_dump_node(c, dent2); @@ -1238,14 +1238,14 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1, goto out_free; } if (cmp == 0 && nlen1 == nlen2) - ubifs_err("2 xent/dent nodes with the same name"); + ubifs_err(c, "2 xent/dent nodes with the same name"); else - ubifs_err("bad order of colliding key %s", + ubifs_err(c, "bad order of 
colliding key %s", dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN)); - ubifs_msg("first node at %d:%d\n", zbr1->lnum, zbr1->offs); + ubifs_msg(c, "first node at %d:%d\n", zbr1->lnum, zbr1->offs); ubifs_dump_node(c, dent1); - ubifs_msg("second node at %d:%d\n", zbr2->lnum, zbr2->offs); + ubifs_msg(c, "second node at %d:%d\n", zbr2->lnum, zbr2->offs); ubifs_dump_node(c, dent2); out_free: @@ -1447,11 +1447,11 @@ static int dbg_check_znode(struct ubifs_info *c, struct ubifs_zbranch *zbr) return 0; out: - ubifs_err("failed, error %d", err); - ubifs_msg("dump of the znode"); + ubifs_err(c, "failed, error %d", err); + ubifs_msg(c, "dump of the znode"); ubifs_dump_znode(c, znode); if (zp) { - ubifs_msg("dump of the parent znode"); + ubifs_msg(c, "dump of the parent znode"); ubifs_dump_znode(c, zp); } dump_stack(); @@ -1518,9 +1518,9 @@ int dbg_check_tnc(struct ubifs_info *c, int extra) if (err < 0) return err; if (err) { - ubifs_msg("first znode"); + ubifs_msg(c, "first znode"); ubifs_dump_znode(c, prev); - ubifs_msg("second znode"); + ubifs_msg(c, "second znode"); ubifs_dump_znode(c, znode); return -EINVAL; } @@ -1529,13 +1529,13 @@ int dbg_check_tnc(struct ubifs_info *c, int extra) if (extra) { if (clean_cnt != atomic_long_read(&c->clean_zn_cnt)) { - ubifs_err("incorrect clean_zn_cnt %ld, calculated %ld", + ubifs_err(c, "incorrect clean_zn_cnt %ld, calculated %ld", atomic_long_read(&c->clean_zn_cnt), clean_cnt); return -EINVAL; } if (dirty_cnt != atomic_long_read(&c->dirty_zn_cnt)) { - ubifs_err("incorrect dirty_zn_cnt %ld, calculated %ld", + ubifs_err(c, "incorrect dirty_zn_cnt %ld, calculated %ld", atomic_long_read(&c->dirty_zn_cnt), dirty_cnt); return -EINVAL; @@ -1608,7 +1608,7 @@ int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb, if (znode_cb) { err = znode_cb(c, znode, priv); if (err) { - ubifs_err("znode checking function returned error %d", + ubifs_err(c, "znode checking function returned error %d", err); ubifs_dump_znode(c, znode); goto out_dump; @@ -1619,7 +1619,7 @@ int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb, zbr = &znode->zbranch[idx]; err = leaf_cb(c, zbr, priv); if (err) { - ubifs_err("leaf checking function returned error %d, for leaf at LEB %d:%d", + ubifs_err(c, "leaf checking function returned error %d, for leaf at LEB %d:%d", err, zbr->lnum, zbr->offs); goto out_dump; } @@ -1675,7 +1675,7 @@ out_dump: zbr = &znode->parent->zbranch[znode->iip]; else zbr = &c->zroot; - ubifs_msg("dump of znode at LEB %d:%d", zbr->lnum, zbr->offs); + ubifs_msg(c, "dump of znode at LEB %d:%d", zbr->lnum, zbr->offs); ubifs_dump_znode(c, znode); out_unlock: mutex_unlock(&c->tnc_mutex); @@ -1722,12 +1722,12 @@ int dbg_check_idx_size(struct ubifs_info *c, long long idx_size) err = dbg_walk_index(c, NULL, add_size, &calc); if (err) { - ubifs_err("error %d while walking the index", err); + ubifs_err(c, "error %d while walking the index", err); return err; } if (calc != idx_size) { - ubifs_err("index size check failed: calculated size is %lld, should be %lld", + ubifs_err(c, "index size check failed: calculated size is %lld, should be %lld", calc, idx_size); dump_stack(); return -EINVAL; @@ -1814,7 +1814,7 @@ static struct fsck_inode *add_inode(struct ubifs_info *c, } if (inum > c->highest_inum) { - ubifs_err("too high inode number, max. is %lu", + ubifs_err(c, "too high inode number, max. 
is %lu", (unsigned long)c->highest_inum); return ERR_PTR(-EINVAL); } @@ -1921,17 +1921,17 @@ static struct fsck_inode *read_add_inode(struct ubifs_info *c, ino_key_init(c, &key, inum); err = ubifs_lookup_level0(c, &key, &znode, &n); if (!err) { - ubifs_err("inode %lu not found in index", (unsigned long)inum); + ubifs_err(c, "inode %lu not found in index", (unsigned long)inum); return ERR_PTR(-ENOENT); } else if (err < 0) { - ubifs_err("error %d while looking up inode %lu", + ubifs_err(c, "error %d while looking up inode %lu", err, (unsigned long)inum); return ERR_PTR(err); } zbr = &znode->zbranch[n]; if (zbr->len < UBIFS_INO_NODE_SZ) { - ubifs_err("bad node %lu node length %d", + ubifs_err(c, "bad node %lu node length %d", (unsigned long)inum, zbr->len); return ERR_PTR(-EINVAL); } @@ -1942,7 +1942,7 @@ static struct fsck_inode *read_add_inode(struct ubifs_info *c, err = ubifs_tnc_read_node(c, zbr, ino); if (err) { - ubifs_err("cannot read inode node at LEB %d:%d, error %d", + ubifs_err(c, "cannot read inode node at LEB %d:%d, error %d", zbr->lnum, zbr->offs, err); kfree(ino); return ERR_PTR(err); @@ -1951,7 +1951,7 @@ static struct fsck_inode *read_add_inode(struct ubifs_info *c, fscki = add_inode(c, fsckd, ino); kfree(ino); if (IS_ERR(fscki)) { - ubifs_err("error %ld while adding inode %lu node", + ubifs_err(c, "error %ld while adding inode %lu node", PTR_ERR(fscki), (unsigned long)inum); return fscki; } @@ -1985,7 +1985,7 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr, struct fsck_inode *fscki; if (zbr->len < UBIFS_CH_SZ) { - ubifs_err("bad leaf length %d (LEB %d:%d)", + ubifs_err(c, "bad leaf length %d (LEB %d:%d)", zbr->len, zbr->lnum, zbr->offs); return -EINVAL; } @@ -1996,7 +1996,7 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr, err = ubifs_tnc_read_node(c, zbr, node); if (err) { - ubifs_err("cannot read leaf node at LEB %d:%d, error %d", + ubifs_err(c, "cannot read leaf node at LEB %d:%d, error %d", zbr->lnum, zbr->offs, err); goto out_free; } @@ -2006,7 +2006,7 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr, fscki = add_inode(c, priv, node); if (IS_ERR(fscki)) { err = PTR_ERR(fscki); - ubifs_err("error %d while adding inode node", err); + ubifs_err(c, "error %d while adding inode node", err); goto out_dump; } goto out; @@ -2014,7 +2014,7 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr, if (type != UBIFS_DENT_KEY && type != UBIFS_XENT_KEY && type != UBIFS_DATA_KEY) { - ubifs_err("unexpected node type %d at LEB %d:%d", + ubifs_err(c, "unexpected node type %d at LEB %d:%d", type, zbr->lnum, zbr->offs); err = -EINVAL; goto out_free; @@ -2022,7 +2022,7 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr, ch = node; if (le64_to_cpu(ch->sqnum) > c->max_sqnum) { - ubifs_err("too high sequence number, max. is %llu", + ubifs_err(c, "too high sequence number, max. 
is %llu", c->max_sqnum); err = -EINVAL; goto out_dump; @@ -2042,7 +2042,7 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr, fscki = read_add_inode(c, priv, inum); if (IS_ERR(fscki)) { err = PTR_ERR(fscki); - ubifs_err("error %d while processing data node and trying to find inode node %lu", + ubifs_err(c, "error %d while processing data node and trying to find inode node %lu", err, (unsigned long)inum); goto out_dump; } @@ -2052,7 +2052,7 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr, blk_offs <<= UBIFS_BLOCK_SHIFT; blk_offs += le32_to_cpu(dn->size); if (blk_offs > fscki->size) { - ubifs_err("data node at LEB %d:%d is not within inode size %lld", + ubifs_err(c, "data node at LEB %d:%d is not within inode size %lld", zbr->lnum, zbr->offs, fscki->size); err = -EINVAL; goto out_dump; @@ -2076,7 +2076,7 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr, fscki = read_add_inode(c, priv, inum); if (IS_ERR(fscki)) { err = PTR_ERR(fscki); - ubifs_err("error %d while processing entry node and trying to find inode node %lu", + ubifs_err(c, "error %d while processing entry node and trying to find inode node %lu", err, (unsigned long)inum); goto out_dump; } @@ -2088,7 +2088,7 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr, fscki1 = read_add_inode(c, priv, inum); if (IS_ERR(fscki1)) { err = PTR_ERR(fscki1); - ubifs_err("error %d while processing entry node and trying to find parent inode node %lu", + ubifs_err(c, "error %d while processing entry node and trying to find parent inode node %lu", err, (unsigned long)inum); goto out_dump; } @@ -2111,7 +2111,7 @@ out: return 0; out_dump: - ubifs_msg("dump of node at LEB %d:%d", zbr->lnum, zbr->offs); + ubifs_msg(c, "dump of node at LEB %d:%d", zbr->lnum, zbr->offs); ubifs_dump_node(c, node); out_free: kfree(node); @@ -2162,52 +2162,52 @@ static int check_inodes(struct ubifs_info *c, struct fsck_data *fsckd) */ if (fscki->inum != UBIFS_ROOT_INO && fscki->references != 1) { - ubifs_err("directory inode %lu has %d direntries which refer it, but should be 1", + ubifs_err(c, "directory inode %lu has %d direntries which refer it, but should be 1", (unsigned long)fscki->inum, fscki->references); goto out_dump; } if (fscki->inum == UBIFS_ROOT_INO && fscki->references != 0) { - ubifs_err("root inode %lu has non-zero (%d) direntries which refer it", + ubifs_err(c, "root inode %lu has non-zero (%d) direntries which refer it", (unsigned long)fscki->inum, fscki->references); goto out_dump; } if (fscki->calc_sz != fscki->size) { - ubifs_err("directory inode %lu size is %lld, but calculated size is %lld", + ubifs_err(c, "directory inode %lu size is %lld, but calculated size is %lld", (unsigned long)fscki->inum, fscki->size, fscki->calc_sz); goto out_dump; } if (fscki->calc_cnt != fscki->nlink) { - ubifs_err("directory inode %lu nlink is %d, but calculated nlink is %d", + ubifs_err(c, "directory inode %lu nlink is %d, but calculated nlink is %d", (unsigned long)fscki->inum, fscki->nlink, fscki->calc_cnt); goto out_dump; } } else { if (fscki->references != fscki->nlink) { - ubifs_err("inode %lu nlink is %d, but calculated nlink is %d", + ubifs_err(c, "inode %lu nlink is %d, but calculated nlink is %d", (unsigned long)fscki->inum, fscki->nlink, fscki->references); goto out_dump; } } if (fscki->xattr_sz != fscki->calc_xsz) { - ubifs_err("inode %lu has xattr size %u, but calculated size is %lld", + ubifs_err(c, "inode %lu has xattr size %u, but calculated size is %lld", (unsigned 
long)fscki->inum, fscki->xattr_sz, fscki->calc_xsz); goto out_dump; } if (fscki->xattr_cnt != fscki->calc_xcnt) { - ubifs_err("inode %lu has %u xattrs, but calculated count is %lld", + ubifs_err(c, "inode %lu has %u xattrs, but calculated count is %lld", (unsigned long)fscki->inum, fscki->xattr_cnt, fscki->calc_xcnt); goto out_dump; } if (fscki->xattr_nms != fscki->calc_xnms) { - ubifs_err("inode %lu has xattr names' size %u, but calculated names' size is %lld", + ubifs_err(c, "inode %lu has xattr names' size %u, but calculated names' size is %lld", (unsigned long)fscki->inum, fscki->xattr_nms, fscki->calc_xnms); goto out_dump; @@ -2221,11 +2221,11 @@ out_dump: ino_key_init(c, &key, fscki->inum); err = ubifs_lookup_level0(c, &key, &znode, &n); if (!err) { - ubifs_err("inode %lu not found in index", + ubifs_err(c, "inode %lu not found in index", (unsigned long)fscki->inum); return -ENOENT; } else if (err < 0) { - ubifs_err("error %d while looking up inode %lu", + ubifs_err(c, "error %d while looking up inode %lu", err, (unsigned long)fscki->inum); return err; } @@ -2237,13 +2237,13 @@ out_dump: err = ubifs_tnc_read_node(c, zbr, ino); if (err) { - ubifs_err("cannot read inode node at LEB %d:%d, error %d", + ubifs_err(c, "cannot read inode node at LEB %d:%d, error %d", zbr->lnum, zbr->offs, err); kfree(ino); return err; } - ubifs_msg("dump of the inode %lu sitting in LEB %d:%d", + ubifs_msg(c, "dump of the inode %lu sitting in LEB %d:%d", (unsigned long)fscki->inum, zbr->lnum, zbr->offs); ubifs_dump_node(c, ino); kfree(ino); @@ -2284,7 +2284,7 @@ int dbg_check_filesystem(struct ubifs_info *c) return 0; out_free: - ubifs_err("file-system check failed with error %d", err); + ubifs_err(c, "file-system check failed with error %d", err); dump_stack(); free_inodes(&fsckd); return err; @@ -2315,12 +2315,12 @@ int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head) sb = container_of(cur->next, struct ubifs_scan_node, list); if (sa->type != UBIFS_DATA_NODE) { - ubifs_err("bad node type %d", sa->type); + ubifs_err(c, "bad node type %d", sa->type); ubifs_dump_node(c, sa->node); return -EINVAL; } if (sb->type != UBIFS_DATA_NODE) { - ubifs_err("bad node type %d", sb->type); + ubifs_err(c, "bad node type %d", sb->type); ubifs_dump_node(c, sb->node); return -EINVAL; } @@ -2331,7 +2331,7 @@ int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head) if (inuma < inumb) continue; if (inuma > inumb) { - ubifs_err("larger inum %lu goes before inum %lu", + ubifs_err(c, "larger inum %lu goes before inum %lu", (unsigned long)inuma, (unsigned long)inumb); goto error_dump; } @@ -2340,11 +2340,11 @@ int dbg_check_data_nodes_order(struct ubifs_info *c, struct list_head *head) blkb = key_block(c, &sb->key); if (blka > blkb) { - ubifs_err("larger block %u goes before %u", blka, blkb); + ubifs_err(c, "larger block %u goes before %u", blka, blkb); goto error_dump; } if (blka == blkb) { - ubifs_err("two data nodes for the same block"); + ubifs_err(c, "two data nodes for the same block"); goto error_dump; } } @@ -2383,19 +2383,19 @@ int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head) if (sa->type != UBIFS_INO_NODE && sa->type != UBIFS_DENT_NODE && sa->type != UBIFS_XENT_NODE) { - ubifs_err("bad node type %d", sa->type); + ubifs_err(c, "bad node type %d", sa->type); ubifs_dump_node(c, sa->node); return -EINVAL; } if (sa->type != UBIFS_INO_NODE && sa->type != UBIFS_DENT_NODE && sa->type != UBIFS_XENT_NODE) { - ubifs_err("bad node type %d", sb->type); + 
ubifs_err(c, "bad node type %d", sb->type); ubifs_dump_node(c, sb->node); return -EINVAL; } if (sa->type != UBIFS_INO_NODE && sb->type == UBIFS_INO_NODE) { - ubifs_err("non-inode node goes before inode node"); + ubifs_err(c, "non-inode node goes before inode node"); goto error_dump; } @@ -2405,7 +2405,7 @@ int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head) if (sa->type == UBIFS_INO_NODE && sb->type == UBIFS_INO_NODE) { /* Inode nodes are sorted in descending size order */ if (sa->len < sb->len) { - ubifs_err("smaller inode node goes first"); + ubifs_err(c, "smaller inode node goes first"); goto error_dump; } continue; @@ -2421,7 +2421,7 @@ int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head) if (inuma < inumb) continue; if (inuma > inumb) { - ubifs_err("larger inum %lu goes before inum %lu", + ubifs_err(c, "larger inum %lu goes before inum %lu", (unsigned long)inuma, (unsigned long)inumb); goto error_dump; } @@ -2430,7 +2430,7 @@ int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head) hashb = key_block(c, &sb->key); if (hasha > hashb) { - ubifs_err("larger hash %u goes before %u", + ubifs_err(c, "larger hash %u goes before %u", hasha, hashb); goto error_dump; } @@ -2439,9 +2439,9 @@ int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head) return 0; error_dump: - ubifs_msg("dumping first node"); + ubifs_msg(c, "dumping first node"); ubifs_dump_node(c, sa->node); - ubifs_msg("dumping second node"); + ubifs_msg(c, "dumping second node"); ubifs_dump_node(c, sb->node); return -EINVAL; return 0; @@ -2470,13 +2470,13 @@ static int power_cut_emulated(struct ubifs_info *c, int lnum, int write) delay = prandom_u32() % 60000; d->pc_timeout = jiffies; d->pc_timeout += msecs_to_jiffies(delay); - ubifs_warn("failing after %lums", delay); + ubifs_warn(c, "failing after %lums", delay); } else { d->pc_delay = 2; delay = prandom_u32() % 10000; /* Fail within 10000 operations */ d->pc_cnt_max = delay; - ubifs_warn("failing after %lu calls", delay); + ubifs_warn(c, "failing after %lu calls", delay); } } @@ -2494,55 +2494,55 @@ static int power_cut_emulated(struct ubifs_info *c, int lnum, int write) return 0; if (chance(19, 20)) return 0; - ubifs_warn("failing in super block LEB %d", lnum); + ubifs_warn(c, "failing in super block LEB %d", lnum); } else if (lnum == UBIFS_MST_LNUM || lnum == UBIFS_MST_LNUM + 1) { if (chance(19, 20)) return 0; - ubifs_warn("failing in master LEB %d", lnum); + ubifs_warn(c, "failing in master LEB %d", lnum); } else if (lnum >= UBIFS_LOG_LNUM && lnum <= c->log_last) { if (write && chance(99, 100)) return 0; if (chance(399, 400)) return 0; - ubifs_warn("failing in log LEB %d", lnum); + ubifs_warn(c, "failing in log LEB %d", lnum); } else if (lnum >= c->lpt_first && lnum <= c->lpt_last) { if (write && chance(7, 8)) return 0; if (chance(19, 20)) return 0; - ubifs_warn("failing in LPT LEB %d", lnum); + ubifs_warn(c, "failing in LPT LEB %d", lnum); } else if (lnum >= c->orph_first && lnum <= c->orph_last) { if (write && chance(1, 2)) return 0; if (chance(9, 10)) return 0; - ubifs_warn("failing in orphan LEB %d", lnum); + ubifs_warn(c, "failing in orphan LEB %d", lnum); } else if (lnum == c->ihead_lnum) { if (chance(99, 100)) return 0; - ubifs_warn("failing in index head LEB %d", lnum); + ubifs_warn(c, "failing in index head LEB %d", lnum); } else if (c->jheads && lnum == c->jheads[GCHD].wbuf.lnum) { if (chance(9, 10)) return 0; - ubifs_warn("failing in GC head LEB %d", lnum); + 
ubifs_warn(c, "failing in GC head LEB %d", lnum); } else if (write && !RB_EMPTY_ROOT(&c->buds) && !ubifs_search_bud(c, lnum)) { if (chance(19, 20)) return 0; - ubifs_warn("failing in non-bud LEB %d", lnum); + ubifs_warn(c, "failing in non-bud LEB %d", lnum); } else if (c->cmt_state == COMMIT_RUNNING_BACKGROUND || c->cmt_state == COMMIT_RUNNING_REQUIRED) { if (chance(999, 1000)) return 0; - ubifs_warn("failing in bud LEB %d commit running", lnum); + ubifs_warn(c, "failing in bud LEB %d commit running", lnum); } else { if (chance(9999, 10000)) return 0; - ubifs_warn("failing in bud LEB %d commit not running", lnum); + ubifs_warn(c, "failing in bud LEB %d commit not running", lnum); } d->pc_happened = 1; - ubifs_warn("========== Power cut emulated =========="); + ubifs_warn(c, "========== Power cut emulated =========="); dump_stack(); return 1; } @@ -2557,7 +2557,7 @@ static int corrupt_data(const struct ubifs_info *c, const void *buf, /* Corruption span max to end of write unit */ to = min(len, ALIGN(from + 1, c->max_write_size)); - ubifs_warn("filled bytes %u-%u with %s", from, to - 1, + ubifs_warn(c, "filled bytes %u-%u with %s", from, to - 1, ffs ? "0xFFs" : "random data"); if (ffs) @@ -2579,7 +2579,7 @@ int dbg_leb_write(struct ubifs_info *c, int lnum, const void *buf, failing = power_cut_emulated(c, lnum, 1); if (failing) { len = corrupt_data(c, buf, len); - ubifs_warn("actually write %d bytes to LEB %d:%d (the buffer was corrupted)", + ubifs_warn(c, "actually write %d bytes to LEB %d:%d (the buffer was corrupted)", len, lnum, offs); } err = ubi_leb_write(c->ubi, lnum, buf, offs, len); @@ -2909,7 +2909,7 @@ out_remove: debugfs_remove_recursive(d->dfs_dir); out: err = dent ? PTR_ERR(dent) : -ENODEV; - ubifs_err("cannot create \"%s\" debugfs file or directory, error %d\n", + ubifs_err(c, "cannot create \"%s\" debugfs file or directory, error %d\n", fname, err); return err; } @@ -3063,8 +3063,8 @@ out_remove: debugfs_remove_recursive(dfs_rootdir); out: err = dent ? PTR_ERR(dent) : -ENODEV; - ubifs_err("cannot create \"%s\" debugfs file or directory, error %d\n", - fname, err); + pr_err("UBIFS error (pid %d): cannot create \"%s\" debugfs file or directory, error %d\n", + current->pid, fname, err); return err; } diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 0fa6c803992e..5c27c66c224a 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -146,12 +146,12 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, if (c->highest_inum >= INUM_WARN_WATERMARK) { if (c->highest_inum >= INUM_WATERMARK) { spin_unlock(&c->cnt_lock); - ubifs_err("out of inode numbers"); + ubifs_err(c, "out of inode numbers"); make_bad_inode(inode); iput(inode); return ERR_PTR(-EINVAL); } - ubifs_warn("running out of inode numbers (current %lu, max %d)", + ubifs_warn(c, "running out of inode numbers (current %lu, max %u)", (unsigned long)c->highest_inum, INUM_WATERMARK); } @@ -222,7 +222,7 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, * checking. 
*/ err = PTR_ERR(inode); - ubifs_err("dead directory entry '%pd', error %d", + ubifs_err(c, "dead directory entry '%pd', error %d", dentry, err); ubifs_ro_mode(c, err); goto out; @@ -272,7 +272,7 @@ static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode, err = ubifs_init_security(dir, inode, &dentry->d_name); if (err) - goto out_cancel; + goto out_inode; mutex_lock(&dir_ui->ui_mutex); dir->i_size += sz_change; @@ -292,11 +292,12 @@ out_cancel: dir->i_size -= sz_change; dir_ui->ui_size = dir->i_size; mutex_unlock(&dir_ui->ui_mutex); +out_inode: make_bad_inode(inode); iput(inode); out_budg: ubifs_release_budget(c, &req); - ubifs_err("cannot create regular file, error %d", err); + ubifs_err(c, "cannot create regular file, error %d", err); return err; } @@ -449,7 +450,7 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx) out: if (err != -ENOENT) { - ubifs_err("cannot find next direntry, error %d", err); + ubifs_err(c, "cannot find next direntry, error %d", err); return err; } @@ -498,7 +499,7 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct ubifs_info *c = dir->i_sb->s_fs_info; - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); struct ubifs_inode *ui = ubifs_inode(inode); struct ubifs_inode *dir_ui = ubifs_inode(dir); int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); @@ -553,7 +554,7 @@ out_cancel: static int ubifs_unlink(struct inode *dir, struct dentry *dentry) { struct ubifs_info *c = dir->i_sb->s_fs_info; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ubifs_inode *dir_ui = ubifs_inode(dir); int sz_change = CALC_DENT_SIZE(dentry->d_name.len); int err, budgeted = 1; @@ -645,7 +646,7 @@ static int check_dir_empty(struct ubifs_info *c, struct inode *dir) static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) { struct ubifs_info *c = dir->i_sb->s_fs_info; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int sz_change = CALC_DENT_SIZE(dentry->d_name.len); int err, budgeted = 1; struct ubifs_inode *dir_ui = ubifs_inode(dir); @@ -661,7 +662,7 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) inode->i_ino, dir->i_ino); ubifs_assert(mutex_is_locked(&dir->i_mutex)); ubifs_assert(mutex_is_locked(&inode->i_mutex)); - err = check_dir_empty(c, dentry->d_inode); + err = check_dir_empty(c, d_inode(dentry)); if (err) return err; @@ -732,7 +733,7 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) err = ubifs_init_security(dir, inode, &dentry->d_name); if (err) - goto out_cancel; + goto out_inode; mutex_lock(&dir_ui->ui_mutex); insert_inode_hash(inode); @@ -743,7 +744,7 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) dir->i_mtime = dir->i_ctime = inode->i_ctime; err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); if (err) { - ubifs_err("cannot create directory, error %d", err); + ubifs_err(c, "cannot create directory, error %d", err); goto out_cancel; } mutex_unlock(&dir_ui->ui_mutex); @@ -757,6 +758,7 @@ out_cancel: dir_ui->ui_size = dir->i_size; drop_nlink(dir); mutex_unlock(&dir_ui->ui_mutex); +out_inode: make_bad_inode(inode); iput(inode); out_budg: @@ -816,7 +818,7 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry, err = ubifs_init_security(dir, inode, &dentry->d_name); if (err) - goto out_cancel; + goto out_inode; mutex_lock(&dir_ui->ui_mutex); dir->i_size += sz_change; 
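
The error-path rework in these fs/ubifs/dir.c hunks follows the usual kernel unwind pattern: each label undoes only what was set up before the failing step, so a failure in ubifs_init_security() must skip the directory-size rollback. A condensed sketch of the shape ubifs_create(), ubifs_mkdir() and ubifs_mknod() now share (illustrative, not the literal UBIFS code):

	err = ubifs_init_security(dir, inode, &dentry->d_name);
	if (err)
		goto out_inode;		/* directory not yet touched */
	mutex_lock(&dir_ui->ui_mutex);
	/* ... update dir size, write the journal ... */
out_cancel:
	dir->i_size -= sz_change;
	dir_ui->ui_size = dir->i_size;
	mutex_unlock(&dir_ui->ui_mutex);
out_inode:
	make_bad_inode(inode);
	iput(inode);
out_budg:
	ubifs_release_budget(c, &req);
	return err;
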
@@ -836,6 +838,7 @@ out_cancel: dir->i_size -= sz_change; dir_ui->ui_size = dir->i_size; mutex_unlock(&dir_ui->ui_mutex); +out_inode: make_bad_inode(inode); iput(inode); out_budg: @@ -886,6 +889,7 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry, memcpy(ui->data, symname, len); ((char *)ui->data)[len] = '\0'; + inode->i_link = ui->data; /* * The terminating zero byte is not written to the flash media and it * is put just to make later in-memory string processing simpler. Thus, @@ -896,7 +900,7 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry, err = ubifs_init_security(dir, inode, &dentry->d_name); if (err) - goto out_cancel; + goto out_inode; mutex_lock(&dir_ui->ui_mutex); dir->i_size += sz_change; @@ -967,8 +971,8 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { struct ubifs_info *c = old_dir->i_sb->s_fs_info; - struct inode *old_inode = old_dentry->d_inode; - struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); struct ubifs_inode *old_inode_ui = ubifs_inode(old_inode); int err, release, sync = 0, move = (new_dir != old_dir); int is_dir = S_ISDIR(old_inode->i_mode); @@ -1133,7 +1137,7 @@ int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { loff_t size; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ubifs_inode *ui = ubifs_inode(inode); mutex_lock(&ui->ui_mutex); diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index e627c0acf626..a3dfe2ae79f2 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -50,9 +50,7 @@ */ #include "ubifs.h" -#include <linux/aio.h> #include <linux/mount.h> -#include <linux/namei.h> #include <linux/slab.h> static int read_block(struct inode *inode, void *addr, unsigned int block, @@ -80,7 +78,7 @@ static int read_block(struct inode *inode, void *addr, unsigned int block, dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; out_len = UBIFS_BLOCK_SIZE; - err = ubifs_decompress(&dn->data, dlen, addr, &out_len, + err = ubifs_decompress(c, &dn->data, dlen, addr, &out_len, le16_to_cpu(dn->compr_type)); if (err || len != out_len) goto dump; @@ -96,7 +94,7 @@ static int read_block(struct inode *inode, void *addr, unsigned int block, return 0; dump: - ubifs_err("bad data node (block %u, inode %lu)", + ubifs_err(c, "bad data node (block %u, inode %lu)", block, inode->i_ino); ubifs_dump_node(c, dn); return -EINVAL; @@ -161,13 +159,14 @@ static int do_readpage(struct page *page) addr += UBIFS_BLOCK_SIZE; } if (err) { + struct ubifs_info *c = inode->i_sb->s_fs_info; if (err == -ENOENT) { /* Not found, so it must be a hole */ SetPageChecked(page); dbg_gen("hole"); goto out_free; } - ubifs_err("cannot read page %lu of inode %lu, error %d", + ubifs_err(c, "cannot read page %lu of inode %lu, error %d", page->index, inode->i_ino, err); goto error; } @@ -650,7 +649,7 @@ static int populate_page(struct ubifs_info *c, struct page *page, dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; out_len = UBIFS_BLOCK_SIZE; - err = ubifs_decompress(&dn->data, dlen, addr, &out_len, + err = ubifs_decompress(c, &dn->data, dlen, addr, &out_len, le16_to_cpu(dn->compr_type)); if (err || len != out_len) goto out_err; @@ -698,7 +697,7 @@ out_err: SetPageError(page); flush_dcache_page(page); kunmap(page); - ubifs_err("bad data node (block %u, inode %lu)", + ubifs_err(c, "bad data node (block %u, inode %lu)", page_block, 
inode->i_ino); return -EINVAL; } @@ -802,7 +801,7 @@ out_free: return ret; out_warn: - ubifs_warn("ignoring error %d and skipping bulk-read", err); + ubifs_warn(c, "ignoring error %d and skipping bulk-read", err); goto out_free; out_bu_off: @@ -930,7 +929,7 @@ static int do_writepage(struct page *page, int len) } if (err) { SetPageError(page); - ubifs_err("cannot write page %lu of inode %lu, error %d", + ubifs_err(c, "cannot write page %lu of inode %lu, error %d", page->index, inode->i_ino, err); ubifs_ro_mode(c, err); } @@ -1257,7 +1256,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode, int ubifs_setattr(struct dentry *dentry, struct iattr *attr) { int err; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct ubifs_info *c = inode->i_sb->s_fs_info; dbg_gen("ino %lu, mode %#x, ia_valid %#x", @@ -1300,14 +1299,6 @@ static void ubifs_invalidatepage(struct page *page, unsigned int offset, ClearPageChecked(page); } -static void *ubifs_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - struct ubifs_inode *ui = ubifs_inode(dentry->d_inode); - - nd_set_link(nd, ui->data); - return NULL; -} - int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; @@ -1485,7 +1476,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, err = ubifs_budget_space(c, &req); if (unlikely(err)) { if (err == -ENOSPC) - ubifs_warn("out of space for mmapped file (inode number %lu)", + ubifs_warn(c, "out of space for mmapped file (inode number %lu)", inode->i_ino); return VM_FAULT_SIGBUS; } @@ -1570,7 +1561,7 @@ const struct inode_operations ubifs_file_inode_operations = { const struct inode_operations ubifs_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = ubifs_follow_link, + .follow_link = simple_follow_link, .setattr = ubifs_setattr, .getattr = ubifs_getattr, .setxattr = ubifs_setxattr, @@ -1581,8 +1572,6 @@ const struct inode_operations ubifs_symlink_inode_operations = { const struct file_operations ubifs_file_operations = { .llseek = generic_file_llseek, - .read = new_sync_read, - .write = new_sync_write, .read_iter = generic_file_read_iter, .write_iter = ubifs_write_iter, .mmap = ubifs_file_mmap, diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index fb08b0c514b6..97be41215332 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -85,7 +85,7 @@ void ubifs_ro_mode(struct ubifs_info *c, int err) c->ro_error = 1; c->no_chk_data_crc = 0; c->vfs_sb->s_flags |= MS_RDONLY; - ubifs_warn("switched to read-only mode, error %d", err); + ubifs_warn(c, "switched to read-only mode, error %d", err); dump_stack(); } } @@ -107,7 +107,7 @@ int ubifs_leb_read(const struct ubifs_info *c, int lnum, void *buf, int offs, * @even_ebadmsg is true. 
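	 * (A %-EBADMSG return from UBI means an uncorrectable ECC error
	 * was seen but the data was still copied out; recovery code reads
	 * such LEBs on purpose, hence the message is normally suppressed.)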
*/ if (err && (err != -EBADMSG || even_ebadmsg)) { - ubifs_err("reading %d bytes from LEB %d:%d failed, error %d", + ubifs_err(c, "reading %d bytes from LEB %d:%d failed, error %d", len, lnum, offs, err); dump_stack(); } @@ -127,7 +127,7 @@ int ubifs_leb_write(struct ubifs_info *c, int lnum, const void *buf, int offs, else err = dbg_leb_write(c, lnum, buf, offs, len); if (err) { - ubifs_err("writing %d bytes to LEB %d:%d failed, error %d", + ubifs_err(c, "writing %d bytes to LEB %d:%d failed, error %d", len, lnum, offs, err); ubifs_ro_mode(c, err); dump_stack(); @@ -147,7 +147,7 @@ int ubifs_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len) else err = dbg_leb_change(c, lnum, buf, len); if (err) { - ubifs_err("changing %d bytes in LEB %d failed, error %d", + ubifs_err(c, "changing %d bytes in LEB %d failed, error %d", len, lnum, err); ubifs_ro_mode(c, err); dump_stack(); @@ -167,7 +167,7 @@ int ubifs_leb_unmap(struct ubifs_info *c, int lnum) else err = dbg_leb_unmap(c, lnum); if (err) { - ubifs_err("unmap LEB %d failed, error %d", lnum, err); + ubifs_err(c, "unmap LEB %d failed, error %d", lnum, err); ubifs_ro_mode(c, err); dump_stack(); } @@ -186,7 +186,7 @@ int ubifs_leb_map(struct ubifs_info *c, int lnum) else err = dbg_leb_map(c, lnum); if (err) { - ubifs_err("mapping LEB %d failed, error %d", lnum, err); + ubifs_err(c, "mapping LEB %d failed, error %d", lnum, err); ubifs_ro_mode(c, err); dump_stack(); } @@ -199,7 +199,7 @@ int ubifs_is_mapped(const struct ubifs_info *c, int lnum) err = ubi_is_mapped(c->ubi, lnum); if (err < 0) { - ubifs_err("ubi_is_mapped failed for LEB %d, error %d", + ubifs_err(c, "ubi_is_mapped failed for LEB %d, error %d", lnum, err); dump_stack(); } @@ -247,7 +247,7 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, magic = le32_to_cpu(ch->magic); if (magic != UBIFS_NODE_MAGIC) { if (!quiet) - ubifs_err("bad magic %#08x, expected %#08x", + ubifs_err(c, "bad magic %#08x, expected %#08x", magic, UBIFS_NODE_MAGIC); err = -EUCLEAN; goto out; @@ -256,7 +256,7 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, type = ch->node_type; if (type < 0 || type >= UBIFS_NODE_TYPES_CNT) { if (!quiet) - ubifs_err("bad node type %d", type); + ubifs_err(c, "bad node type %d", type); goto out; } @@ -279,7 +279,7 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, node_crc = le32_to_cpu(ch->crc); if (crc != node_crc) { if (!quiet) - ubifs_err("bad CRC: calculated %#08x, read %#08x", + ubifs_err(c, "bad CRC: calculated %#08x, read %#08x", crc, node_crc); err = -EUCLEAN; goto out; @@ -289,10 +289,10 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum, out_len: if (!quiet) - ubifs_err("bad node length %d", node_len); + ubifs_err(c, "bad node length %d", node_len); out: if (!quiet) { - ubifs_err("bad node at LEB %d:%d", lnum, offs); + ubifs_err(c, "bad node at LEB %d:%d", lnum, offs); ubifs_dump_node(c, buf); dump_stack(); } @@ -355,11 +355,11 @@ static unsigned long long next_sqnum(struct ubifs_info *c) if (unlikely(sqnum >= SQNUM_WARN_WATERMARK)) { if (sqnum >= SQNUM_WATERMARK) { - ubifs_err("sequence number overflow %llu, end of life", + ubifs_err(c, "sequence number overflow %llu, end of life", sqnum); ubifs_ro_mode(c, -EINVAL); } - ubifs_warn("running out of sequence numbers, end of life soon"); + ubifs_warn(c, "running out of sequence numbers, end of life soon"); } return sqnum; @@ -636,7 +636,7 @@ int ubifs_bg_wbufs_sync(struct ubifs_info *c) err = 
ubifs_wbuf_sync_nolock(wbuf); mutex_unlock(&wbuf->io_mutex); if (err) { - ubifs_err("cannot sync write-buffer, error %d", err); + ubifs_err(c, "cannot sync write-buffer, error %d", err); ubifs_ro_mode(c, err); goto out_timers; } @@ -833,7 +833,7 @@ exit: return 0; out: - ubifs_err("cannot write %d bytes to LEB %d:%d, error %d", + ubifs_err(c, "cannot write %d bytes to LEB %d:%d, error %d", len, wbuf->lnum, wbuf->offs, err); ubifs_dump_node(c, buf); dump_stack(); @@ -932,27 +932,27 @@ int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len, } if (type != ch->node_type) { - ubifs_err("bad node type (%d but expected %d)", + ubifs_err(c, "bad node type (%d but expected %d)", ch->node_type, type); goto out; } err = ubifs_check_node(c, buf, lnum, offs, 0, 0); if (err) { - ubifs_err("expected node type %d", type); + ubifs_err(c, "expected node type %d", type); return err; } rlen = le32_to_cpu(ch->len); if (rlen != len) { - ubifs_err("bad node length %d, expected %d", rlen, len); + ubifs_err(c, "bad node length %d, expected %d", rlen, len); goto out; } return 0; out: - ubifs_err("bad node at LEB %d:%d", lnum, offs); + ubifs_err(c, "bad node at LEB %d:%d", lnum, offs); ubifs_dump_node(c, buf); dump_stack(); return -EINVAL; diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index 648b143606cc..3c7b29de0ca7 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -138,7 +138,7 @@ static int setflags(struct inode *inode, int flags) return err; out_unlock: - ubifs_err("can't modify inode %lu attributes", inode->i_ino); + ubifs_err(c, "can't modify inode %lu attributes", inode->i_ino); mutex_unlock(&ui->ui_mutex); ubifs_release_budget(c, &req); return err; diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index f6ac3f29323c..0b9da5b6e0f9 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -363,11 +363,11 @@ again: * This should not happen unless the journal size limitations * are too tough. 
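	 * (Each earlier -EAGAIN round ran a commit to reclaim journal
	 * space; past 32 rounds the code below only warns, and here it
	 * finally gives up with %-ENOSPC.)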
*/ - ubifs_err("stuck in space allocation"); + ubifs_err(c, "stuck in space allocation"); err = -ENOSPC; goto out; } else if (cmt_retries > 32) - ubifs_warn("too many space allocation re-tries (%d)", + ubifs_warn(c, "too many space allocation re-tries (%d)", cmt_retries); dbg_jnl("-EAGAIN, commit and retry (retried %d times)", @@ -380,7 +380,7 @@ again: goto again; out: - ubifs_err("cannot reserve %d bytes in jhead %d, error %d", + ubifs_err(c, "cannot reserve %d bytes in jhead %d, error %d", len, jhead, err); if (err == -ENOSPC) { /* This are some budgeting problems, print useful information */ @@ -731,7 +731,7 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, compr_type = ui->compr_type; out_len = dlen - UBIFS_DATA_NODE_SZ; - ubifs_compress(buf, len, &data->data, &out_len, &compr_type); + ubifs_compress(c, buf, len, &data->data, &out_len, &compr_type); ubifs_assert(out_len <= UBIFS_BLOCK_SIZE); dlen = UBIFS_DATA_NODE_SZ + out_len; @@ -930,8 +930,8 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, union ubifs_key key; struct ubifs_dent_node *dent, *dent2; int err, dlen1, dlen2, ilen, lnum, offs, len; - const struct inode *old_inode = old_dentry->d_inode; - const struct inode *new_inode = new_dentry->d_inode; + const struct inode *old_inode = d_inode(old_dentry); + const struct inode *new_inode = d_inode(new_dentry); int aligned_dlen1, aligned_dlen2, plen = UBIFS_INO_NODE_SZ; int last_reference = !!(new_inode && new_inode->i_nlink == 0); int move = (old_dir != new_dir); @@ -1100,7 +1100,8 @@ out_free: * This function is used when an inode is truncated and the last data node of * the inode has to be re-compressed and re-written. */ -static int recomp_data_node(struct ubifs_data_node *dn, int *new_len) +static int recomp_data_node(const struct ubifs_info *c, + struct ubifs_data_node *dn, int *new_len) { void *buf; int err, len, compr_type, out_len; @@ -1112,11 +1113,11 @@ static int recomp_data_node(struct ubifs_data_node *dn, int *new_len) len = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; compr_type = le16_to_cpu(dn->compr_type); - err = ubifs_decompress(&dn->data, len, buf, &out_len, compr_type); + err = ubifs_decompress(c, &dn->data, len, buf, &out_len, compr_type); if (err) goto out; - ubifs_compress(buf, *new_len, &dn->data, &out_len, &compr_type); + ubifs_compress(c, buf, *new_len, &dn->data, &out_len, &compr_type); ubifs_assert(out_len <= UBIFS_BLOCK_SIZE); dn->compr_type = cpu_to_le16(compr_type); dn->size = cpu_to_le32(*new_len); @@ -1191,7 +1192,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, int compr_type = le16_to_cpu(dn->compr_type); if (compr_type != UBIFS_COMPR_NONE) { - err = recomp_data_node(dn, &dlen); + err = recomp_data_node(c, dn, &dlen); if (err) goto out_free; } else { diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c index c14628fbeee2..8c795e6392b1 100644 --- a/fs/ubifs/log.c +++ b/fs/ubifs/log.c @@ -696,7 +696,7 @@ int ubifs_consolidate_log(struct ubifs_info *c) destroy_done_tree(&done_tree); vfree(buf); if (write_lnum == c->lhead_lnum) { - ubifs_err("log is too full"); + ubifs_err(c, "log is too full"); return -EINVAL; } /* Unmap remaining LEBs */ @@ -743,7 +743,7 @@ static int dbg_check_bud_bytes(struct ubifs_info *c) bud_bytes += c->leb_size - bud->start; if (c->bud_bytes != bud_bytes) { - ubifs_err("bad bud_bytes %lld, calculated %lld", + ubifs_err(c, "bad bud_bytes %lld, calculated %lld", c->bud_bytes, bud_bytes); err = -EINVAL; } diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c 
index 46190a7c42a6..a0011aa3a779 100644 --- a/fs/ubifs/lprops.c +++ b/fs/ubifs/lprops.c @@ -682,7 +682,7 @@ int ubifs_change_one_lp(struct ubifs_info *c, int lnum, int free, int dirty, out: ubifs_release_lprops(c); if (err) - ubifs_err("cannot change properties of LEB %d, error %d", + ubifs_err(c, "cannot change properties of LEB %d, error %d", lnum, err); return err; } @@ -721,7 +721,7 @@ int ubifs_update_one_lp(struct ubifs_info *c, int lnum, int free, int dirty, out: ubifs_release_lprops(c); if (err) - ubifs_err("cannot update properties of LEB %d, error %d", + ubifs_err(c, "cannot update properties of LEB %d, error %d", lnum, err); return err; } @@ -746,7 +746,7 @@ int ubifs_read_one_lp(struct ubifs_info *c, int lnum, struct ubifs_lprops *lp) lpp = ubifs_lpt_lookup(c, lnum); if (IS_ERR(lpp)) { err = PTR_ERR(lpp); - ubifs_err("cannot read properties of LEB %d, error %d", + ubifs_err(c, "cannot read properties of LEB %d, error %d", lnum, err); goto out; } @@ -873,13 +873,13 @@ int dbg_check_cats(struct ubifs_info *c) list_for_each_entry(lprops, &c->empty_list, list) { if (lprops->free != c->leb_size) { - ubifs_err("non-empty LEB %d on empty list (free %d dirty %d flags %d)", + ubifs_err(c, "non-empty LEB %d on empty list (free %d dirty %d flags %d)", lprops->lnum, lprops->free, lprops->dirty, lprops->flags); return -EINVAL; } if (lprops->flags & LPROPS_TAKEN) { - ubifs_err("taken LEB %d on empty list (free %d dirty %d flags %d)", + ubifs_err(c, "taken LEB %d on empty list (free %d dirty %d flags %d)", lprops->lnum, lprops->free, lprops->dirty, lprops->flags); return -EINVAL; @@ -889,13 +889,13 @@ int dbg_check_cats(struct ubifs_info *c) i = 0; list_for_each_entry(lprops, &c->freeable_list, list) { if (lprops->free + lprops->dirty != c->leb_size) { - ubifs_err("non-freeable LEB %d on freeable list (free %d dirty %d flags %d)", + ubifs_err(c, "non-freeable LEB %d on freeable list (free %d dirty %d flags %d)", lprops->lnum, lprops->free, lprops->dirty, lprops->flags); return -EINVAL; } if (lprops->flags & LPROPS_TAKEN) { - ubifs_err("taken LEB %d on freeable list (free %d dirty %d flags %d)", + ubifs_err(c, "taken LEB %d on freeable list (free %d dirty %d flags %d)", lprops->lnum, lprops->free, lprops->dirty, lprops->flags); return -EINVAL; @@ -903,7 +903,7 @@ int dbg_check_cats(struct ubifs_info *c) i += 1; } if (i != c->freeable_cnt) { - ubifs_err("freeable list count %d expected %d", i, + ubifs_err(c, "freeable list count %d expected %d", i, c->freeable_cnt); return -EINVAL; } @@ -912,26 +912,26 @@ int dbg_check_cats(struct ubifs_info *c) list_for_each(pos, &c->idx_gc) i += 1; if (i != c->idx_gc_cnt) { - ubifs_err("idx_gc list count %d expected %d", i, + ubifs_err(c, "idx_gc list count %d expected %d", i, c->idx_gc_cnt); return -EINVAL; } list_for_each_entry(lprops, &c->frdi_idx_list, list) { if (lprops->free + lprops->dirty != c->leb_size) { - ubifs_err("non-freeable LEB %d on frdi_idx list (free %d dirty %d flags %d)", + ubifs_err(c, "non-freeable LEB %d on frdi_idx list (free %d dirty %d flags %d)", lprops->lnum, lprops->free, lprops->dirty, lprops->flags); return -EINVAL; } if (lprops->flags & LPROPS_TAKEN) { - ubifs_err("taken LEB %d on frdi_idx list (free %d dirty %d flags %d)", + ubifs_err(c, "taken LEB %d on frdi_idx list (free %d dirty %d flags %d)", lprops->lnum, lprops->free, lprops->dirty, lprops->flags); return -EINVAL; } if (!(lprops->flags & LPROPS_INDEX)) { - ubifs_err("non-index LEB %d on frdi_idx list (free %d dirty %d flags %d)", + ubifs_err(c, "non-index LEB %d on 
frdi_idx list (free %d dirty %d flags %d)", lprops->lnum, lprops->free, lprops->dirty, lprops->flags); return -EINVAL; @@ -944,15 +944,15 @@ int dbg_check_cats(struct ubifs_info *c) for (i = 0; i < heap->cnt; i++) { lprops = heap->arr[i]; if (!lprops) { - ubifs_err("null ptr in LPT heap cat %d", cat); + ubifs_err(c, "null ptr in LPT heap cat %d", cat); return -EINVAL; } if (lprops->hpos != i) { - ubifs_err("bad ptr in LPT heap cat %d", cat); + ubifs_err(c, "bad ptr in LPT heap cat %d", cat); return -EINVAL; } if (lprops->flags & LPROPS_TAKEN) { - ubifs_err("taken LEB in LPT heap cat %d", cat); + ubifs_err(c, "taken LEB in LPT heap cat %d", cat); return -EINVAL; } } @@ -988,7 +988,7 @@ void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat, goto out; } if (lprops != lp) { - ubifs_err("lprops %zx lp %zx lprops->lnum %d lp->lnum %d", + ubifs_err(c, "lprops %zx lp %zx lprops->lnum %d lp->lnum %d", (size_t)lprops, (size_t)lp, lprops->lnum, lp->lnum); err = 4; @@ -1008,7 +1008,7 @@ void dbg_check_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat, } out: if (err) { - ubifs_err("failed cat %d hpos %d err %d", cat, i, err); + ubifs_err(c, "failed cat %d hpos %d err %d", cat, i, err); dump_stack(); ubifs_dump_heap(c, heap, cat); } @@ -1039,7 +1039,7 @@ static int scan_check_cb(struct ubifs_info *c, if (cat != LPROPS_UNCAT) { cat = ubifs_categorize_lprops(c, lp); if (cat != (lp->flags & LPROPS_CAT_MASK)) { - ubifs_err("bad LEB category %d expected %d", + ubifs_err(c, "bad LEB category %d expected %d", (lp->flags & LPROPS_CAT_MASK), cat); return -EINVAL; } @@ -1074,7 +1074,7 @@ static int scan_check_cb(struct ubifs_info *c, } } if (!found) { - ubifs_err("bad LPT list (category %d)", cat); + ubifs_err(c, "bad LPT list (category %d)", cat); return -EINVAL; } } @@ -1086,7 +1086,7 @@ static int scan_check_cb(struct ubifs_info *c, if ((lp->hpos != -1 && heap->arr[lp->hpos]->lnum != lnum) || lp != heap->arr[lp->hpos]) { - ubifs_err("bad LPT heap (category %d)", cat); + ubifs_err(c, "bad LPT heap (category %d)", cat); return -EINVAL; } } @@ -1133,7 +1133,7 @@ static int scan_check_cb(struct ubifs_info *c, is_idx = (snod->type == UBIFS_IDX_NODE) ? 
1 : 0; if (is_idx && snod->type != UBIFS_IDX_NODE) { - ubifs_err("indexing node in data LEB %d:%d", + ubifs_err(c, "indexing node in data LEB %d:%d", lnum, snod->offs); goto out_destroy; } @@ -1159,7 +1159,7 @@ static int scan_check_cb(struct ubifs_info *c, if (free > c->leb_size || free < 0 || dirty > c->leb_size || dirty < 0) { - ubifs_err("bad calculated accounting for LEB %d: free %d, dirty %d", + ubifs_err(c, "bad calculated accounting for LEB %d: free %d, dirty %d", lnum, free, dirty); goto out_destroy; } @@ -1206,13 +1206,13 @@ static int scan_check_cb(struct ubifs_info *c, /* Free but not unmapped LEB, it's fine */ is_idx = 0; else { - ubifs_err("indexing node without indexing flag"); + ubifs_err(c, "indexing node without indexing flag"); goto out_print; } } if (!is_idx && (lp->flags & LPROPS_INDEX)) { - ubifs_err("data node with indexing flag"); + ubifs_err(c, "data node with indexing flag"); goto out_print; } @@ -1241,7 +1241,7 @@ static int scan_check_cb(struct ubifs_info *c, return LPT_SCAN_CONTINUE; out_print: - ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, should be free %d, dirty %d", + ubifs_err(c, "bad accounting of LEB %d: free %d, dirty %d flags %#x, should be free %d, dirty %d", lnum, lp->free, lp->dirty, lp->flags, free, dirty); ubifs_dump_leb(c, lnum); out_destroy: @@ -1293,11 +1293,11 @@ int dbg_check_lprops(struct ubifs_info *c) lst.total_free != c->lst.total_free || lst.total_dirty != c->lst.total_dirty || lst.total_used != c->lst.total_used) { - ubifs_err("bad overall accounting"); - ubifs_err("calculated: empty_lebs %d, idx_lebs %d, total_free %lld, total_dirty %lld, total_used %lld", + ubifs_err(c, "bad overall accounting"); + ubifs_err(c, "calculated: empty_lebs %d, idx_lebs %d, total_free %lld, total_dirty %lld, total_used %lld", lst.empty_lebs, lst.idx_lebs, lst.total_free, lst.total_dirty, lst.total_used); - ubifs_err("read from lprops: empty_lebs %d, idx_lebs %d, total_free %lld, total_dirty %lld, total_used %lld", + ubifs_err(c, "read from lprops: empty_lebs %d, idx_lebs %d, total_free %lld, total_dirty %lld, total_used %lld", c->lst.empty_lebs, c->lst.idx_lebs, c->lst.total_free, c->lst.total_dirty, c->lst.total_used); err = -EINVAL; @@ -1306,10 +1306,10 @@ int dbg_check_lprops(struct ubifs_info *c) if (lst.total_dead != c->lst.total_dead || lst.total_dark != c->lst.total_dark) { - ubifs_err("bad dead/dark space accounting"); - ubifs_err("calculated: total_dead %lld, total_dark %lld", + ubifs_err(c, "bad dead/dark space accounting"); + ubifs_err(c, "calculated: total_dead %lld, total_dark %lld", lst.total_dead, lst.total_dark); - ubifs_err("read from lprops: total_dead %lld, total_dark %lld", + ubifs_err(c, "read from lprops: total_dead %lld, total_dark %lld", c->lst.total_dead, c->lst.total_dark); err = -EINVAL; goto out; diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c index 421bd0a80424..dc9f27e9d61b 100644 --- a/fs/ubifs/lpt.c +++ b/fs/ubifs/lpt.c @@ -145,13 +145,13 @@ int ubifs_calc_lpt_geom(struct ubifs_info *c) sz = c->lpt_sz * 2; /* Must have at least 2 times the size */ lebs_needed = div_u64(sz + c->leb_size - 1, c->leb_size); if (lebs_needed > c->lpt_lebs) { - ubifs_err("too few LPT LEBs"); + ubifs_err(c, "too few LPT LEBs"); return -EINVAL; } /* Verify that ltab fits in a single LEB (since ltab is a single node */ if (c->ltab_sz > c->leb_size) { - ubifs_err("LPT ltab too big"); + ubifs_err(c, "LPT ltab too big"); return -EINVAL; } @@ -213,7 +213,7 @@ static int calc_dflt_lpt_geom(struct ubifs_info *c, int *main_lebs, continue; } 
if (c->ltab_sz > c->leb_size) { - ubifs_err("LPT ltab too big"); + ubifs_err(c, "LPT ltab too big"); return -EINVAL; } *main_lebs = c->main_lebs; @@ -911,7 +911,7 @@ static void replace_cats(struct ubifs_info *c, struct ubifs_pnode *old_pnode, * * This function returns %0 on success and a negative error code on failure. */ -static int check_lpt_crc(void *buf, int len) +static int check_lpt_crc(const struct ubifs_info *c, void *buf, int len) { int pos = 0; uint8_t *addr = buf; @@ -921,8 +921,8 @@ static int check_lpt_crc(void *buf, int len) calc_crc = crc16(-1, buf + UBIFS_LPT_CRC_BYTES, len - UBIFS_LPT_CRC_BYTES); if (crc != calc_crc) { - ubifs_err("invalid crc in LPT node: crc %hx calc %hx", crc, - calc_crc); + ubifs_err(c, "invalid crc in LPT node: crc %hx calc %hx", + crc, calc_crc); dump_stack(); return -EINVAL; } @@ -938,14 +938,15 @@ static int check_lpt_crc(void *buf, int len) * * This function returns %0 on success and a negative error code on failure. */ -static int check_lpt_type(uint8_t **addr, int *pos, int type) +static int check_lpt_type(const struct ubifs_info *c, uint8_t **addr, + int *pos, int type) { int node_type; node_type = ubifs_unpack_bits(addr, pos, UBIFS_LPT_TYPE_BITS); if (node_type != type) { - ubifs_err("invalid type (%d) in LPT node type %d", node_type, - type); + ubifs_err(c, "invalid type (%d) in LPT node type %d", + node_type, type); dump_stack(); return -EINVAL; } @@ -966,7 +967,7 @@ static int unpack_pnode(const struct ubifs_info *c, void *buf, uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; int i, pos = 0, err; - err = check_lpt_type(&addr, &pos, UBIFS_LPT_PNODE); + err = check_lpt_type(c, &addr, &pos, UBIFS_LPT_PNODE); if (err) return err; if (c->big_lpt) @@ -985,7 +986,7 @@ static int unpack_pnode(const struct ubifs_info *c, void *buf, lprops->flags = 0; lprops->flags |= ubifs_categorize_lprops(c, lprops); } - err = check_lpt_crc(buf, c->pnode_sz); + err = check_lpt_crc(c, buf, c->pnode_sz); return err; } @@ -1003,7 +1004,7 @@ int ubifs_unpack_nnode(const struct ubifs_info *c, void *buf, uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; int i, pos = 0, err; - err = check_lpt_type(&addr, &pos, UBIFS_LPT_NNODE); + err = check_lpt_type(c, &addr, &pos, UBIFS_LPT_NNODE); if (err) return err; if (c->big_lpt) @@ -1019,7 +1020,7 @@ int ubifs_unpack_nnode(const struct ubifs_info *c, void *buf, nnode->nbranch[i].offs = ubifs_unpack_bits(&addr, &pos, c->lpt_offs_bits); } - err = check_lpt_crc(buf, c->nnode_sz); + err = check_lpt_crc(c, buf, c->nnode_sz); return err; } @@ -1035,7 +1036,7 @@ static int unpack_ltab(const struct ubifs_info *c, void *buf) uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; int i, pos = 0, err; - err = check_lpt_type(&addr, &pos, UBIFS_LPT_LTAB); + err = check_lpt_type(c, &addr, &pos, UBIFS_LPT_LTAB); if (err) return err; for (i = 0; i < c->lpt_lebs; i++) { @@ -1051,7 +1052,7 @@ static int unpack_ltab(const struct ubifs_info *c, void *buf) c->ltab[i].tgc = 0; c->ltab[i].cmt = 0; } - err = check_lpt_crc(buf, c->ltab_sz); + err = check_lpt_crc(c, buf, c->ltab_sz); return err; } @@ -1067,7 +1068,7 @@ static int unpack_lsave(const struct ubifs_info *c, void *buf) uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES; int i, pos = 0, err; - err = check_lpt_type(&addr, &pos, UBIFS_LPT_LSAVE); + err = check_lpt_type(c, &addr, &pos, UBIFS_LPT_LSAVE); if (err) return err; for (i = 0; i < c->lsave_cnt; i++) { @@ -1077,7 +1078,7 @@ static int unpack_lsave(const struct ubifs_info *c, void *buf) return -EINVAL; c->lsave[i] = lnum; } - err = check_lpt_crc(buf, c->lsave_sz); + err = 
check_lpt_crc(c, buf, c->lsave_sz); return err; } @@ -1243,7 +1244,7 @@ int ubifs_read_nnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip) return 0; out: - ubifs_err("error %d reading nnode at %d:%d", err, lnum, offs); + ubifs_err(c, "error %d reading nnode at %d:%d", err, lnum, offs); dump_stack(); kfree(nnode); return err; @@ -1308,10 +1309,10 @@ static int read_pnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip) return 0; out: - ubifs_err("error %d reading pnode at %d:%d", err, lnum, offs); + ubifs_err(c, "error %d reading pnode at %d:%d", err, lnum, offs); ubifs_dump_pnode(c, pnode, parent, iip); dump_stack(); - ubifs_err("calc num: %d", calc_pnode_num_from_parent(c, parent, iip)); + ubifs_err(c, "calc num: %d", calc_pnode_num_from_parent(c, parent, iip)); kfree(pnode); return err; } @@ -2095,7 +2096,7 @@ static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, int i; if (pnode->num != col) { - ubifs_err("pnode num %d expected %d parent num %d iip %d", + ubifs_err(c, "pnode num %d expected %d parent num %d iip %d", pnode->num, col, pnode->parent->num, pnode->iip); return -EINVAL; } @@ -2110,13 +2111,13 @@ static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, if (lnum >= c->leb_cnt) continue; if (lprops->lnum != lnum) { - ubifs_err("bad LEB number %d expected %d", + ubifs_err(c, "bad LEB number %d expected %d", lprops->lnum, lnum); return -EINVAL; } if (lprops->flags & LPROPS_TAKEN) { if (cat != LPROPS_UNCAT) { - ubifs_err("LEB %d taken but not uncat %d", + ubifs_err(c, "LEB %d taken but not uncat %d", lprops->lnum, cat); return -EINVAL; } @@ -2129,7 +2130,7 @@ static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, case LPROPS_FRDI_IDX: break; default: - ubifs_err("LEB %d index but cat %d", + ubifs_err(c, "LEB %d index but cat %d", lprops->lnum, cat); return -EINVAL; } @@ -2142,7 +2143,7 @@ static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, case LPROPS_FREEABLE: break; default: - ubifs_err("LEB %d not index but cat %d", + ubifs_err(c, "LEB %d not index but cat %d", lprops->lnum, cat); return -EINVAL; } @@ -2183,14 +2184,14 @@ static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, break; } if (!found) { - ubifs_err("LEB %d cat %d not found in cat heap/list", + ubifs_err(c, "LEB %d cat %d not found in cat heap/list", lprops->lnum, cat); return -EINVAL; } switch (cat) { case LPROPS_EMPTY: if (lprops->free != c->leb_size) { - ubifs_err("LEB %d cat %d free %d dirty %d", + ubifs_err(c, "LEB %d cat %d free %d dirty %d", lprops->lnum, cat, lprops->free, lprops->dirty); return -EINVAL; @@ -2199,7 +2200,7 @@ static int dbg_chk_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode, case LPROPS_FREEABLE: case LPROPS_FRDI_IDX: if (lprops->free + lprops->dirty != c->leb_size) { - ubifs_err("LEB %d cat %d free %d dirty %d", + ubifs_err(c, "LEB %d cat %d free %d dirty %d", lprops->lnum, cat, lprops->free, lprops->dirty); return -EINVAL; @@ -2236,7 +2237,7 @@ int dbg_check_lpt_nodes(struct ubifs_info *c, struct ubifs_cnode *cnode, /* cnode is a nnode */ num = calc_nnode_num(row, col); if (cnode->num != num) { - ubifs_err("nnode num %d expected %d parent num %d iip %d", + ubifs_err(c, "nnode num %d expected %d parent num %d iip %d", cnode->num, num, (nnode ? 
nnode->num : 0), cnode->iip); return -EINVAL; diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c index d9c02928e992..ce89bdc3eb02 100644 --- a/fs/ubifs/lpt_commit.c +++ b/fs/ubifs/lpt_commit.c @@ -319,7 +319,7 @@ static int layout_cnodes(struct ubifs_info *c) return 0; no_space: - ubifs_err("LPT out of space at LEB %d:%d needing %d, done_ltab %d, done_lsave %d", + ubifs_err(c, "LPT out of space at LEB %d:%d needing %d, done_ltab %d, done_lsave %d", lnum, offs, len, done_ltab, done_lsave); ubifs_dump_lpt_info(c); ubifs_dump_lpt_lebs(c); @@ -543,7 +543,7 @@ static int write_cnodes(struct ubifs_info *c) return 0; no_space: - ubifs_err("LPT out of space mismatch at LEB %d:%d needing %d, done_ltab %d, done_lsave %d", + ubifs_err(c, "LPT out of space mismatch at LEB %d:%d needing %d, done_ltab %d, done_lsave %d", lnum, offs, len, done_ltab, done_lsave); ubifs_dump_lpt_info(c); ubifs_dump_lpt_lebs(c); @@ -1638,7 +1638,7 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum) buf = p = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); if (!buf) { - ubifs_err("cannot allocate memory for ltab checking"); + ubifs_err(c, "cannot allocate memory for ltab checking"); return 0; } @@ -1660,18 +1660,18 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum) continue; } if (!dbg_is_all_ff(p, len)) { - ubifs_err("invalid empty space in LEB %d at %d", + ubifs_err(c, "invalid empty space in LEB %d at %d", lnum, c->leb_size - len); err = -EINVAL; } i = lnum - c->lpt_first; if (len != c->ltab[i].free) { - ubifs_err("invalid free space in LEB %d (free %d, expected %d)", + ubifs_err(c, "invalid free space in LEB %d (free %d, expected %d)", lnum, len, c->ltab[i].free); err = -EINVAL; } if (dirty != c->ltab[i].dirty) { - ubifs_err("invalid dirty space in LEB %d (dirty %d, expected %d)", + ubifs_err(c, "invalid dirty space in LEB %d (dirty %d, expected %d)", lnum, dirty, c->ltab[i].dirty); err = -EINVAL; } @@ -1725,7 +1725,7 @@ int dbg_check_ltab(struct ubifs_info *c) for (lnum = c->lpt_first; lnum <= c->lpt_last; lnum++) { err = dbg_check_ltab_lnum(c, lnum); if (err) { - ubifs_err("failed at LEB %d", lnum); + ubifs_err(c, "failed at LEB %d", lnum); return err; } } @@ -1757,7 +1757,7 @@ int dbg_chk_lpt_free_spc(struct ubifs_info *c) free += c->leb_size; } if (free < c->lpt_sz) { - ubifs_err("LPT space error: free %lld lpt_sz %lld", + ubifs_err(c, "LPT space error: free %lld lpt_sz %lld", free, c->lpt_sz); ubifs_dump_lpt_info(c); ubifs_dump_lpt_lebs(c); @@ -1797,12 +1797,12 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len) d->chk_lpt_lebs = 0; d->chk_lpt_wastage = 0; if (c->dirty_pn_cnt > c->pnode_cnt) { - ubifs_err("dirty pnodes %d exceed max %d", + ubifs_err(c, "dirty pnodes %d exceed max %d", c->dirty_pn_cnt, c->pnode_cnt); err = -EINVAL; } if (c->dirty_nn_cnt > c->nnode_cnt) { - ubifs_err("dirty nnodes %d exceed max %d", + ubifs_err(c, "dirty nnodes %d exceed max %d", c->dirty_nn_cnt, c->nnode_cnt); err = -EINVAL; } @@ -1820,22 +1820,22 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len) chk_lpt_sz *= d->chk_lpt_lebs; chk_lpt_sz += len - c->nhead_offs; if (d->chk_lpt_sz != chk_lpt_sz) { - ubifs_err("LPT wrote %lld but space used was %lld", + ubifs_err(c, "LPT wrote %lld but space used was %lld", d->chk_lpt_sz, chk_lpt_sz); err = -EINVAL; } if (d->chk_lpt_sz > c->lpt_sz) { - ubifs_err("LPT wrote %lld but lpt_sz is %lld", + ubifs_err(c, "LPT wrote %lld but lpt_sz is %lld", d->chk_lpt_sz, c->lpt_sz); err = -EINVAL; } if (d->chk_lpt_sz2 && d->chk_lpt_sz != 
d->chk_lpt_sz2) { - ubifs_err("LPT layout size %lld but wrote %lld", + ubifs_err(c, "LPT layout size %lld but wrote %lld", d->chk_lpt_sz, d->chk_lpt_sz2); err = -EINVAL; } if (d->chk_lpt_sz2 && d->new_nhead_offs != len) { - ubifs_err("LPT new nhead offs: expected %d was %d", + ubifs_err(c, "LPT new nhead offs: expected %d was %d", d->new_nhead_offs, len); err = -EINVAL; } @@ -1845,7 +1845,7 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len) if (c->big_lpt) lpt_sz += c->lsave_sz; if (d->chk_lpt_sz - d->chk_lpt_wastage > lpt_sz) { - ubifs_err("LPT chk_lpt_sz %lld + waste %lld exceeds %lld", + ubifs_err(c, "LPT chk_lpt_sz %lld + waste %lld exceeds %lld", d->chk_lpt_sz, d->chk_lpt_wastage, lpt_sz); err = -EINVAL; } @@ -1887,7 +1887,7 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum) pr_err("(pid %d) start dumping LEB %d\n", current->pid, lnum); buf = p = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); if (!buf) { - ubifs_err("cannot allocate memory to dump LPT"); + ubifs_err(c, "cannot allocate memory to dump LPT"); return; } @@ -1962,7 +1962,7 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum) pr_err("LEB %d:%d, lsave len\n", lnum, offs); break; default: - ubifs_err("LPT node type %d not recognized", node_type); + ubifs_err(c, "LPT node type %d not recognized", node_type); goto out; } diff --git a/fs/ubifs/master.c b/fs/ubifs/master.c index 1a4bb9e8b3b8..c6a5e39e2ba5 100644 --- a/fs/ubifs/master.c +++ b/fs/ubifs/master.c @@ -82,7 +82,7 @@ out: return -EUCLEAN; out_dump: - ubifs_err("unexpected node type %d master LEB %d:%d", + ubifs_err(c, "unexpected node type %d master LEB %d:%d", snod->type, lnum, snod->offs); ubifs_scan_destroy(sleb); return -EINVAL; @@ -240,7 +240,7 @@ static int validate_master(const struct ubifs_info *c) return 0; out: - ubifs_err("bad master node at offset %d error %d", c->mst_offs, err); + ubifs_err(c, "bad master node at offset %d error %d", c->mst_offs, err); ubifs_dump_node(c, c->mst_node); return -EINVAL; } @@ -316,7 +316,7 @@ int ubifs_read_master(struct ubifs_info *c) if (c->leb_cnt < old_leb_cnt || c->leb_cnt < UBIFS_MIN_LEB_CNT) { - ubifs_err("bad leb_cnt on master node"); + ubifs_err(c, "bad leb_cnt on master node"); ubifs_dump_node(c, c->mst_node); return -EINVAL; } diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c index 4409f486ecef..caf2d123e9ee 100644 --- a/fs/ubifs/orphan.c +++ b/fs/ubifs/orphan.c @@ -88,7 +88,7 @@ int ubifs_add_orphan(struct ubifs_info *c, ino_t inum) else if (inum > o->inum) p = &(*p)->rb_right; else { - ubifs_err("orphaned twice"); + ubifs_err(c, "orphaned twice"); spin_unlock(&c->orphan_lock); kfree(orphan); return 0; @@ -155,7 +155,7 @@ void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum) } } spin_unlock(&c->orphan_lock); - ubifs_err("missing orphan ino %lu", (unsigned long)inum); + ubifs_err(c, "missing orphan ino %lu", (unsigned long)inum); dump_stack(); } @@ -287,7 +287,7 @@ static int write_orph_node(struct ubifs_info *c, int atomic) * We limit the number of orphans so that this should * never happen. */ - ubifs_err("out of space in orphan area"); + ubifs_err(c, "out of space in orphan area"); return -EINVAL; } } @@ -397,7 +397,7 @@ static int consolidate(struct ubifs_info *c) * We limit the number of orphans so that this should * never happen. 
*/ - ubifs_err("out of space in orphan area"); + ubifs_err(c, "out of space in orphan area"); err = -EINVAL; } spin_unlock(&c->orphan_lock); @@ -569,7 +569,7 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb, list_for_each_entry(snod, &sleb->nodes, list) { if (snod->type != UBIFS_ORPH_NODE) { - ubifs_err("invalid node type %d in orphan area at %d:%d", + ubifs_err(c, "invalid node type %d in orphan area at %d:%d", snod->type, sleb->lnum, snod->offs); ubifs_dump_node(c, snod->node); return -EINVAL; @@ -596,7 +596,7 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb, * number. That makes this orphan node, out of date. */ if (!first) { - ubifs_err("out of order commit number %llu in orphan node at %d:%d", + ubifs_err(c, "out of order commit number %llu in orphan node at %d:%d", cmt_no, sleb->lnum, snod->offs); ubifs_dump_node(c, snod->node); return -EINVAL; @@ -831,20 +831,20 @@ static int dbg_orphan_check(struct ubifs_info *c, struct ubifs_zbranch *zbr, if (inum != ci->last_ino) { /* Lowest node type is the inode node, so it comes first */ if (key_type(c, &zbr->key) != UBIFS_INO_KEY) - ubifs_err("found orphan node ino %lu, type %d", + ubifs_err(c, "found orphan node ino %lu, type %d", (unsigned long)inum, key_type(c, &zbr->key)); ci->last_ino = inum; ci->tot_inos += 1; err = ubifs_tnc_read_node(c, zbr, ci->node); if (err) { - ubifs_err("node read failed, error %d", err); + ubifs_err(c, "node read failed, error %d", err); return err; } if (ci->node->nlink == 0) /* Must be recorded as an orphan */ if (!dbg_find_check_orphan(&ci->root, inum) && !dbg_find_orphan(c, inum)) { - ubifs_err("missing orphan, ino %lu", + ubifs_err(c, "missing orphan, ino %lu", (unsigned long)inum); ci->missing += 1; } @@ -887,7 +887,7 @@ static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci) buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL); if (!buf) { - ubifs_err("cannot allocate memory to check orphans"); + ubifs_err(c, "cannot allocate memory to check orphans"); return 0; } @@ -925,7 +925,7 @@ static int dbg_check_orphans(struct ubifs_info *c) ci.root = RB_ROOT; ci.node = kmalloc(UBIFS_MAX_INO_NODE_SZ, GFP_NOFS); if (!ci.node) { - ubifs_err("out of memory"); + ubifs_err(c, "out of memory"); return -ENOMEM; } @@ -935,12 +935,12 @@ static int dbg_check_orphans(struct ubifs_info *c) err = dbg_walk_index(c, &dbg_orphan_check, NULL, &ci); if (err) { - ubifs_err("cannot scan TNC, error %d", err); + ubifs_err(c, "cannot scan TNC, error %d", err); goto out; } if (ci.missing) { - ubifs_err("%lu missing orphan(s)", ci.missing); + ubifs_err(c, "%lu missing orphan(s)", ci.missing); err = -EINVAL; goto out; } diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index c640938f62f0..695fc71d5244 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -305,7 +305,7 @@ int ubifs_recover_master_node(struct ubifs_info *c) mst = mst2; } - ubifs_msg("recovered master node from LEB %d", + ubifs_msg(c, "recovered master node from LEB %d", (mst == mst1 ? 
UBIFS_MST_LNUM : UBIFS_MST_LNUM + 1)); memcpy(c->mst_node, mst, UBIFS_MST_NODE_SZ); @@ -360,13 +360,13 @@ int ubifs_recover_master_node(struct ubifs_info *c) out_err: err = -EINVAL; out_free: - ubifs_err("failed to recover master node"); + ubifs_err(c, "failed to recover master node"); if (mst1) { - ubifs_err("dumping first master node"); + ubifs_err(c, "dumping first master node"); ubifs_dump_node(c, mst1); } if (mst2) { - ubifs_err("dumping second master node"); + ubifs_err(c, "dumping second master node"); ubifs_dump_node(c, mst2); } vfree(buf2); @@ -682,7 +682,7 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, ret, lnum, offs); break; } else { - ubifs_err("unexpected return value %d", ret); + ubifs_err(c, "unexpected return value %d", ret); err = -EINVAL; goto error; } @@ -702,7 +702,7 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, * See header comment for this file for more * explanations about the reasons we have this check. */ - ubifs_err("corrupt empty space LEB %d:%d, corruption starts at %d", + ubifs_err(c, "corrupt empty space LEB %d:%d, corruption starts at %d", lnum, offs, corruption); /* Make sure we dump interesting non-0xFF data */ offs += corruption; @@ -788,13 +788,13 @@ struct ubifs_scan_leb *ubifs_recover_leb(struct ubifs_info *c, int lnum, corrupted_rescan: /* Re-scan the corrupted data with verbose messages */ - ubifs_err("corruption %d", ret); + ubifs_err(c, "corruption %d", ret); ubifs_scan_a_node(c, buf, len, lnum, offs, 1); corrupted: ubifs_scanned_corruption(c, lnum, offs, buf); err = -EUCLEAN; error: - ubifs_err("LEB %d scanning failed", lnum); + ubifs_err(c, "LEB %d scanning failed", lnum); ubifs_scan_destroy(sleb); return ERR_PTR(err); } @@ -826,15 +826,15 @@ static int get_cs_sqnum(struct ubifs_info *c, int lnum, int offs, goto out_free; ret = ubifs_scan_a_node(c, cs_node, UBIFS_CS_NODE_SZ, lnum, offs, 0); if (ret != SCANNED_A_NODE) { - ubifs_err("Not a valid node"); + ubifs_err(c, "Not a valid node"); goto out_err; } if (cs_node->ch.node_type != UBIFS_CS_NODE) { - ubifs_err("Not a CS node, type is %d", cs_node->ch.node_type); + ubifs_err(c, "Not a CS node, type is %d", cs_node->ch.node_type); goto out_err; } if (le64_to_cpu(cs_node->cmt_no) != c->cmt_no) { - ubifs_err("CS node cmt_no %llu != current cmt_no %llu", + ubifs_err(c, "CS node cmt_no %llu != current cmt_no %llu", (unsigned long long)le64_to_cpu(cs_node->cmt_no), c->cmt_no); goto out_err; @@ -847,7 +847,7 @@ static int get_cs_sqnum(struct ubifs_info *c, int lnum, int offs, out_err: err = -EINVAL; out_free: - ubifs_err("failed to get CS sqnum"); + ubifs_err(c, "failed to get CS sqnum"); kfree(cs_node); return err; } @@ -899,7 +899,7 @@ struct ubifs_scan_leb *ubifs_recover_log_leb(struct ubifs_info *c, int lnum, } } if (snod->sqnum > cs_sqnum) { - ubifs_err("unrecoverable log corruption in LEB %d", + ubifs_err(c, "unrecoverable log corruption in LEB %d", lnum); ubifs_scan_destroy(sleb); return ERR_PTR(-EUCLEAN); @@ -975,11 +975,8 @@ int ubifs_recover_inl_heads(struct ubifs_info *c, void *sbuf) return err; dbg_rcvry("checking LPT head at %d:%d", c->nhead_lnum, c->nhead_offs); - err = recover_head(c, c->nhead_lnum, c->nhead_offs, sbuf); - if (err) - return err; - return 0; + return recover_head(c, c->nhead_lnum, c->nhead_offs, sbuf); } /** @@ -1004,10 +1001,7 @@ static int clean_an_unclean_leb(struct ubifs_info *c, if (len == 0) { /* Nothing to read, just unmap it */ - err = ubifs_leb_unmap(c, lnum); - if (err) - return err; - return 0; + return
ubifs_leb_unmap(c, lnum); } err = ubifs_leb_read(c, lnum, buf, offs, len, 0); @@ -1043,7 +1037,7 @@ static int clean_an_unclean_leb(struct ubifs_info *c, } if (ret == SCANNED_EMPTY_SPACE) { - ubifs_err("unexpected empty space at %d:%d", + ubifs_err(c, "unexpected empty space at %d:%d", lnum, offs); return -EUCLEAN; } @@ -1137,7 +1131,7 @@ static int grab_empty_leb(struct ubifs_info *c) */ lnum = ubifs_find_free_leb_for_idx(c); if (lnum < 0) { - ubifs_err("could not find an empty LEB"); + ubifs_err(c, "could not find an empty LEB"); ubifs_dump_lprops(c); ubifs_dump_budg(c, &c->bi); return lnum; @@ -1217,7 +1211,7 @@ int ubifs_rcvry_gc_commit(struct ubifs_info *c) } mutex_unlock(&wbuf->io_mutex); if (err < 0) { - ubifs_err("GC failed, error %d", err); + ubifs_err(c, "GC failed, error %d", err); if (err == -EAGAIN) err = -EINVAL; return err; @@ -1464,7 +1458,7 @@ static int fix_size_in_place(struct ubifs_info *c, struct size_entry *e) return 0; out: - ubifs_warn("inode %lu failed to fix size %lld -> %lld error %d", + ubifs_warn(c, "inode %lu failed to fix size %lld -> %lld error %d", (unsigned long)e->inum, e->i_size, e->d_size, err); return err; } diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index 9b40a1c5e160..3ca4540130b5 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -458,13 +458,13 @@ int ubifs_validate_entry(struct ubifs_info *c, nlen > UBIFS_MAX_NLEN || dent->name[nlen] != 0 || strnlen(dent->name, nlen) != nlen || le64_to_cpu(dent->inum) > MAX_INUM) { - ubifs_err("bad %s node", key_type == UBIFS_DENT_KEY ? + ubifs_err(c, "bad %s node", key_type == UBIFS_DENT_KEY ? "directory entry" : "extended attribute entry"); return -EINVAL; } if (key_type != UBIFS_DENT_KEY && key_type != UBIFS_XENT_KEY) { - ubifs_err("bad key type %d", key_type); + ubifs_err(c, "bad key type %d", key_type); return -EINVAL; } @@ -589,7 +589,7 @@ static int replay_bud(struct ubifs_info *c, struct bud_entry *b) cond_resched(); if (snod->sqnum >= SQNUM_WATERMARK) { - ubifs_err("file system's life ended"); + ubifs_err(c, "file system's life ended"); goto out_dump; } @@ -647,7 +647,7 @@ static int replay_bud(struct ubifs_info *c, struct bud_entry *b) if (old_size < 0 || old_size > c->max_inode_sz || new_size < 0 || new_size > c->max_inode_sz || old_size <= new_size) { - ubifs_err("bad truncation node"); + ubifs_err(c, "bad truncation node"); goto out_dump; } @@ -662,7 +662,7 @@ static int replay_bud(struct ubifs_info *c, struct bud_entry *b) break; } default: - ubifs_err("unexpected node type %d in bud LEB %d:%d", + ubifs_err(c, "unexpected node type %d in bud LEB %d:%d", snod->type, lnum, snod->offs); err = -EINVAL; goto out_dump; @@ -685,7 +685,7 @@ out: return err; out_dump: - ubifs_err("bad node is at LEB %d:%d", lnum, snod->offs); + ubifs_err(c, "bad node is at LEB %d:%d", lnum, snod->offs); ubifs_dump_node(c, snod->node); ubifs_scan_destroy(sleb); return -EINVAL; @@ -805,7 +805,7 @@ static int validate_ref(struct ubifs_info *c, const struct ubifs_ref_node *ref) if (bud) { if (bud->jhead == jhead && bud->start <= offs) return 1; - ubifs_err("bud at LEB %d:%d was already referred", lnum, offs); + ubifs_err(c, "bud at LEB %d:%d was already referred", lnum, offs); return -EINVAL; } @@ -861,12 +861,12 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf) * numbers. 
*/ if (snod->type != UBIFS_CS_NODE) { - ubifs_err("first log node at LEB %d:%d is not CS node", + ubifs_err(c, "first log node at LEB %d:%d is not CS node", lnum, offs); goto out_dump; } if (le64_to_cpu(node->cmt_no) != c->cmt_no) { - ubifs_err("first CS node at LEB %d:%d has wrong commit number %llu expected %llu", + ubifs_err(c, "first CS node at LEB %d:%d has wrong commit number %llu expected %llu", lnum, offs, (unsigned long long)le64_to_cpu(node->cmt_no), c->cmt_no); @@ -891,7 +891,7 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf) /* Make sure the first node sits at offset zero of the LEB */ if (snod->offs != 0) { - ubifs_err("first node is not at zero offset"); + ubifs_err(c, "first node is not at zero offset"); goto out_dump; } @@ -899,12 +899,12 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf) cond_resched(); if (snod->sqnum >= SQNUM_WATERMARK) { - ubifs_err("file system's life ended"); + ubifs_err(c, "file system's life ended"); goto out_dump; } if (snod->sqnum < c->cs_sqnum) { - ubifs_err("bad sqnum %llu, commit sqnum %llu", + ubifs_err(c, "bad sqnum %llu, commit sqnum %llu", snod->sqnum, c->cs_sqnum); goto out_dump; } @@ -934,12 +934,12 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf) case UBIFS_CS_NODE: /* Make sure it sits at the beginning of LEB */ if (snod->offs != 0) { - ubifs_err("unexpected node in log"); + ubifs_err(c, "unexpected node in log"); goto out_dump; } break; default: - ubifs_err("unexpected node in log"); + ubifs_err(c, "unexpected node in log"); goto out_dump; } } @@ -955,7 +955,7 @@ out: return err; out_dump: - ubifs_err("log error detected while replaying the log at LEB %d:%d", + ubifs_err(c, "log error detected while replaying the log at LEB %d:%d", lnum, offs + snod->offs); ubifs_dump_node(c, snod->node); ubifs_scan_destroy(sleb); @@ -1017,7 +1017,7 @@ int ubifs_replay_journal(struct ubifs_info *c) return free; /* Error code */ if (c->ihead_offs != c->leb_size - free) { - ubifs_err("bad index head LEB %d:%d", c->ihead_lnum, + ubifs_err(c, "bad index head LEB %d:%d", c->ihead_lnum, c->ihead_offs); return -EINVAL; } @@ -1040,7 +1040,7 @@ int ubifs_replay_journal(struct ubifs_info *c) * something went wrong and we cannot proceed mounting * the file-system. */ - ubifs_err("no UBIFS nodes found at the log head LEB %d:%d, possibly corrupted", + ubifs_err(c, "no UBIFS nodes found at the log head LEB %d:%d, possibly corrupted", lnum, 0); err = -EINVAL; } diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index 79c6dbbc0e04..f4fbc7b6b794 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -335,7 +335,7 @@ static int create_default_filesystem(struct ubifs_info *c) if (err) return err; - ubifs_msg("default file-system created"); + ubifs_msg(c, "default file-system created"); return 0; } @@ -365,13 +365,13 @@ static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup) } if (le32_to_cpu(sup->min_io_size) != c->min_io_size) { - ubifs_err("min. I/O unit mismatch: %d in superblock, %d real", + ubifs_err(c, "min.
I/O unit mismatch: %d in superblock, %d real", le32_to_cpu(sup->min_io_size), c->min_io_size); goto failed; } if (le32_to_cpu(sup->leb_size) != c->leb_size) { - ubifs_err("LEB size mismatch: %d in superblock, %d real", + ubifs_err(c, "LEB size mismatch: %d in superblock, %d real", le32_to_cpu(sup->leb_size), c->leb_size); goto failed; } @@ -393,33 +393,33 @@ static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup) min_leb_cnt += c->lpt_lebs + c->orph_lebs + c->jhead_cnt + 6; if (c->leb_cnt < min_leb_cnt || c->leb_cnt > c->vi.size) { - ubifs_err("bad LEB count: %d in superblock, %d on UBI volume, %d minimum required", + ubifs_err(c, "bad LEB count: %d in superblock, %d on UBI volume, %d minimum required", c->leb_cnt, c->vi.size, min_leb_cnt); goto failed; } if (c->max_leb_cnt < c->leb_cnt) { - ubifs_err("max. LEB count %d less than LEB count %d", + ubifs_err(c, "max. LEB count %d less than LEB count %d", c->max_leb_cnt, c->leb_cnt); goto failed; } if (c->main_lebs < UBIFS_MIN_MAIN_LEBS) { - ubifs_err("too few main LEBs count %d, must be at least %d", + ubifs_err(c, "too few main LEBs count %d, must be at least %d", c->main_lebs, UBIFS_MIN_MAIN_LEBS); goto failed; } max_bytes = (long long)c->leb_size * UBIFS_MIN_BUD_LEBS; if (c->max_bud_bytes < max_bytes) { - ubifs_err("too small journal (%lld bytes), must be at least %lld bytes", + ubifs_err(c, "too small journal (%lld bytes), must be at least %lld bytes", c->max_bud_bytes, max_bytes); goto failed; } max_bytes = (long long)c->leb_size * c->main_lebs; if (c->max_bud_bytes > max_bytes) { - ubifs_err("too large journal size (%lld bytes), only %lld bytes available in the main area", + ubifs_err(c, "too large journal size (%lld bytes), only %lld bytes available in the main area", c->max_bud_bytes, max_bytes); goto failed; } @@ -468,7 +468,7 @@ static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup) return 0; failed: - ubifs_err("bad superblock, error %d", err); + ubifs_err(c, "bad superblock, error %d", err); ubifs_dump_node(c, sup); return -EINVAL; } @@ -549,12 +549,12 @@ int ubifs_read_superblock(struct ubifs_info *c) ubifs_assert(!c->ro_media || c->ro_mount); if (!c->ro_mount || c->ro_compat_version > UBIFS_RO_COMPAT_VERSION) { - ubifs_err("on-flash format version is w%d/r%d, but software only supports up to version w%d/r%d", + ubifs_err(c, "on-flash format version is w%d/r%d, but software only supports up to version w%d/r%d", c->fmt_version, c->ro_compat_version, UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION); if (c->ro_compat_version <= UBIFS_RO_COMPAT_VERSION) { - ubifs_msg("only R/O mounting is possible"); + ubifs_msg(c, "only R/O mounting is possible"); err = -EROFS; } else err = -EINVAL; @@ -570,7 +570,7 @@ int ubifs_read_superblock(struct ubifs_info *c) } if (c->fmt_version < 3) { - ubifs_err("on-flash format version %d is not supported", + ubifs_err(c, "on-flash format version %d is not supported", c->fmt_version); err = -EINVAL; goto out; @@ -595,7 +595,7 @@ int ubifs_read_superblock(struct ubifs_info *c) c->key_len = UBIFS_SK_LEN; break; default: - ubifs_err("unsupported key format"); + ubifs_err(c, "unsupported key format"); err = -EINVAL; goto out; } @@ -785,7 +785,7 @@ int ubifs_fixup_free_space(struct ubifs_info *c) ubifs_assert(c->space_fixup); ubifs_assert(!c->ro_mount); - ubifs_msg("start fixing up free space"); + ubifs_msg(c, "start fixing up free space"); err = fixup_free_space(c); if (err) @@ -804,6 +804,6 @@ int ubifs_fixup_free_space(struct ubifs_info *c) if (err) return err; - 
ubifs_msg("free space fixup complete"); + ubifs_msg(c, "free space fixup complete"); return err; } diff --git a/fs/ubifs/scan.c b/fs/ubifs/scan.c index 89adbc4d08ac..aab87340d3de 100644 --- a/fs/ubifs/scan.c +++ b/fs/ubifs/scan.c @@ -100,7 +100,7 @@ int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum, if (pad_len < 0 || offs + node_len + pad_len > c->leb_size) { if (!quiet) { - ubifs_err("bad pad node at LEB %d:%d", + ubifs_err(c, "bad pad node at LEB %d:%d", lnum, offs); ubifs_dump_node(c, pad); } @@ -110,7 +110,7 @@ int ubifs_scan_a_node(const struct ubifs_info *c, void *buf, int len, int lnum, /* Make the node pads to 8-byte boundary */ if ((node_len + pad_len) & 7) { if (!quiet) - ubifs_err("bad padding length %d - %d", + ubifs_err(c, "bad padding length %d - %d", offs, offs + node_len + pad_len); return SCANNED_A_BAD_PAD_NODE; } @@ -152,7 +152,7 @@ struct ubifs_scan_leb *ubifs_start_scan(const struct ubifs_info *c, int lnum, err = ubifs_leb_read(c, lnum, sbuf + offs, offs, c->leb_size - offs, 0); if (err && err != -EBADMSG) { - ubifs_err("cannot read %d bytes from LEB %d:%d, error %d", + ubifs_err(c, "cannot read %d bytes from LEB %d:%d, error %d", c->leb_size - offs, lnum, offs, err); kfree(sleb); return ERR_PTR(err); @@ -240,11 +240,11 @@ void ubifs_scanned_corruption(const struct ubifs_info *c, int lnum, int offs, { int len; - ubifs_err("corruption at LEB %d:%d", lnum, offs); + ubifs_err(c, "corruption at LEB %d:%d", lnum, offs); len = c->leb_size - offs; if (len > 8192) len = 8192; - ubifs_err("first %d bytes from LEB %d:%d", len, lnum, offs); + ubifs_err(c, "first %d bytes from LEB %d:%d", len, lnum, offs); print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 4, buf, len, 1); } @@ -299,16 +299,16 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum, switch (ret) { case SCANNED_GARBAGE: - ubifs_err("garbage"); + ubifs_err(c, "garbage"); goto corrupted; case SCANNED_A_NODE: break; case SCANNED_A_CORRUPT_NODE: case SCANNED_A_BAD_PAD_NODE: - ubifs_err("bad node"); + ubifs_err(c, "bad node"); goto corrupted; default: - ubifs_err("unknown"); + ubifs_err(c, "unknown"); err = -EINVAL; goto error; } @@ -325,7 +325,7 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum, if (offs % c->min_io_size) { if (!quiet) - ubifs_err("empty space starts at non-aligned offset %d", + ubifs_err(c, "empty space starts at non-aligned offset %d", offs); goto corrupted; } @@ -338,7 +338,7 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum, for (; len; offs++, buf++, len--) if (*(uint8_t *)buf != 0xff) { if (!quiet) - ubifs_err("corrupt empty space at LEB %d:%d", + ubifs_err(c, "corrupt empty space at LEB %d:%d", lnum, offs); goto corrupted; } @@ -348,14 +348,14 @@ struct ubifs_scan_leb *ubifs_scan(const struct ubifs_info *c, int lnum, corrupted: if (!quiet) { ubifs_scanned_corruption(c, lnum, offs, buf); - ubifs_err("LEB %d scanning failed", lnum); + ubifs_err(c, "LEB %d scanning failed", lnum); } err = -EUCLEAN; ubifs_scan_destroy(sleb); return ERR_PTR(err); error: - ubifs_err("LEB %d scanning failed, error %d", lnum, err); + ubifs_err(c, "LEB %d scanning failed, error %d", lnum, err); ubifs_scan_destroy(sleb); return ERR_PTR(err); } diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 93e946561c5c..9547a27868ad 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -70,13 +70,13 @@ static int validate_inode(struct ubifs_info *c, const struct inode *inode) const struct ubifs_inode *ui = ubifs_inode(inode); if 
(inode->i_size > c->max_inode_sz) { - ubifs_err("inode is too large (%lld)", + ubifs_err(c, "inode is too large (%lld)", (long long)inode->i_size); return 1; } if (ui->compr_type >= UBIFS_COMPR_TYPES_CNT) { - ubifs_err("unknown compression type %d", ui->compr_type); + ubifs_err(c, "unknown compression type %d", ui->compr_type); return 2; } @@ -90,7 +90,7 @@ static int validate_inode(struct ubifs_info *c, const struct inode *inode) return 5; if (!ubifs_compr_present(ui->compr_type)) { - ubifs_warn("inode %lu uses '%s' compression, but it was not compiled in", + ubifs_warn(c, "inode %lu uses '%s' compression, but it was not compiled in", inode->i_ino, ubifs_compr_name(ui->compr_type)); } @@ -195,6 +195,7 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum) } memcpy(ui->data, ino->data, ui->data_len); ((char *)ui->data)[ui->data_len] = '\0'; + inode->i_link = ui->data; break; case S_IFBLK: case S_IFCHR: @@ -242,14 +243,14 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum) return inode; out_invalid: - ubifs_err("inode %lu validation failed, error %d", inode->i_ino, err); + ubifs_err(c, "inode %lu validation failed, error %d", inode->i_ino, err); ubifs_dump_node(c, ino); ubifs_dump_inode(c, inode); err = -EINVAL; out_ino: kfree(ino); out: - ubifs_err("failed to read inode %lu, error %d", inode->i_ino, err); + ubifs_err(c, "failed to read inode %lu, error %d", inode->i_ino, err); iget_failed(inode); return ERR_PTR(err); } @@ -319,7 +320,7 @@ static int ubifs_write_inode(struct inode *inode, struct writeback_control *wbc) if (inode->i_nlink) { err = ubifs_jnl_write_inode(c, inode); if (err) - ubifs_err("can't write inode %lu, error %d", + ubifs_err(c, "can't write inode %lu, error %d", inode->i_ino, err); else err = dbg_check_inode_size(c, inode, ui->ui_size); @@ -363,7 +364,7 @@ static void ubifs_evict_inode(struct inode *inode) * Worst case we have a lost orphan inode wasting space, so a * simple error message is OK here. */ - ubifs_err("can't delete inode %lu, error %d", + ubifs_err(c, "can't delete inode %lu, error %d", inode->i_ino, err); out: @@ -492,17 +493,17 @@ static int ubifs_sync_fs(struct super_block *sb, int wait) static int init_constants_early(struct ubifs_info *c) { if (c->vi.corrupted) { - ubifs_warn("UBI volume is corrupted - read-only mode"); + ubifs_warn(c, "UBI volume is corrupted - read-only mode"); c->ro_media = 1; } if (c->di.ro_mode) { - ubifs_msg("read-only UBI device"); + ubifs_msg(c, "read-only UBI device"); c->ro_media = 1; } if (c->vi.vol_type == UBI_STATIC_VOLUME) { - ubifs_msg("static UBI volume - read-only mode"); + ubifs_msg(c, "static UBI volume - read-only mode"); c->ro_media = 1; } @@ -516,19 +517,19 @@ static int init_constants_early(struct ubifs_info *c) c->max_write_shift = fls(c->max_write_size) - 1; if (c->leb_size < UBIFS_MIN_LEB_SZ) { - ubifs_err("too small LEBs (%d bytes), min. is %d bytes", + ubifs_err(c, "too small LEBs (%d bytes), min. is %d bytes", c->leb_size, UBIFS_MIN_LEB_SZ); return -EINVAL; } if (c->leb_cnt < UBIFS_MIN_LEB_CNT) { - ubifs_err("too few LEBs (%d), min. is %d", + ubifs_err(c, "too few LEBs (%d), min. is %d", c->leb_cnt, UBIFS_MIN_LEB_CNT); return -EINVAL; } if (!is_power_of_2(c->min_io_size)) { - ubifs_err("bad min. I/O size %d", c->min_io_size); + ubifs_err(c, "bad min. 
I/O size %d", c->min_io_size); return -EINVAL; } @@ -539,7 +540,7 @@ static int init_constants_early(struct ubifs_info *c) if (c->max_write_size < c->min_io_size || c->max_write_size % c->min_io_size || !is_power_of_2(c->max_write_size)) { - ubifs_err("bad write buffer size %d for %d min. I/O unit", + ubifs_err(c, "bad write buffer size %d for %d min. I/O unit", c->max_write_size, c->min_io_size); return -EINVAL; } @@ -665,7 +666,7 @@ static int init_constants_sb(struct ubifs_info *c) tmp = UBIFS_CS_NODE_SZ + UBIFS_REF_NODE_SZ * c->jhead_cnt; tmp = ALIGN(tmp, c->min_io_size); if (tmp > c->leb_size) { - ubifs_err("too small LEB size %d, at least %d needed", + ubifs_err(c, "too small LEB size %d, at least %d needed", c->leb_size, tmp); return -EINVAL; } @@ -680,7 +681,7 @@ static int init_constants_sb(struct ubifs_info *c) tmp /= c->leb_size; tmp += 1; if (c->log_lebs < tmp) { - ubifs_err("too small log %d LEBs, required min. %d LEBs", + ubifs_err(c, "too small log %d LEBs, required min. %d LEBs", c->log_lebs, tmp); return -EINVAL; } @@ -772,7 +773,7 @@ static int take_gc_lnum(struct ubifs_info *c) int err; if (c->gc_lnum == -1) { - ubifs_err("no LEB for GC"); + ubifs_err(c, "no LEB for GC"); return -EINVAL; } @@ -857,7 +858,7 @@ static void free_orphans(struct ubifs_info *c) orph = list_entry(c->orph_list.next, struct ubifs_orphan, list); list_del(&orph->list); kfree(orph); - ubifs_err("orphan list not empty at unmount"); + ubifs_err(c, "orphan list not empty at unmount"); } vfree(c->orph_buf); @@ -954,7 +955,8 @@ static const match_table_t tokens = { */ static int parse_standard_option(const char *option) { - ubifs_msg("parse %s", option); + + pr_notice("UBIFS: parse %s\n", option); if (!strcmp(option, "sync")) return MS_SYNCHRONOUS; return 0; @@ -1026,7 +1028,7 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options, else if (!strcmp(name, "zlib")) c->mount_opts.compr_type = UBIFS_COMPR_ZLIB; else { - ubifs_err("unknown compressor \"%s\"", name); + ubifs_err(c, "unknown compressor \"%s\"", name); //FIXME: is c ready? kfree(name); return -EINVAL; } @@ -1042,7 +1044,7 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options, flag = parse_standard_option(p); if (!flag) { - ubifs_err("unrecognized mount option \"%s\" or missing value", + ubifs_err(c, "unrecognized mount option \"%s\" or missing value", p); return -EINVAL; } @@ -1105,7 +1107,7 @@ again: } /* Just disable bulk-read */ - ubifs_warn("cannot allocate %d bytes of memory for bulk-read, disabling it", + ubifs_warn(c, "cannot allocate %d bytes of memory for bulk-read, disabling it", c->max_bu_buf_len); c->mount_opts.bulk_read = 1; c->bulk_read = 0; @@ -1124,7 +1126,7 @@ static int check_free_space(struct ubifs_info *c) { ubifs_assert(c->dark_wm > 0); if (c->lst.total_free + c->lst.total_dirty < c->dark_wm) { - ubifs_err("insufficient free space to mount in R/W mode"); + ubifs_err(c, "insufficient free space to mount in R/W mode"); ubifs_dump_budg(c, &c->bi); ubifs_dump_lprops(c); return -ENOSPC; @@ -1166,14 +1168,14 @@ static int mount_ubifs(struct ubifs_info *c) * This UBI volume is empty, and read-only, or the file system * is mounted read-only - we cannot format it. */ - ubifs_err("can't format empty UBI volume: read-only %s", + ubifs_err(c, "can't format empty UBI volume: read-only %s", c->ro_media ? 
"UBI volume" : "mount"); err = -EROFS; goto out_free; } if (c->ro_media && !c->ro_mount) { - ubifs_err("cannot mount read-write - read-only media"); + ubifs_err(c, "cannot mount read-write - read-only media"); err = -EROFS; goto out_free; } @@ -1221,7 +1223,7 @@ static int mount_ubifs(struct ubifs_info *c) * or overridden by mount options is actually compiled in. */ if (!ubifs_compr_present(c->default_compr)) { - ubifs_err("'compressor \"%s\" is not compiled in", + ubifs_err(c, "'compressor \"%s\" is not compiled in", ubifs_compr_name(c->default_compr)); err = -ENOTSUPP; goto out_free; @@ -1250,7 +1252,7 @@ static int mount_ubifs(struct ubifs_info *c) if (IS_ERR(c->bgt)) { err = PTR_ERR(c->bgt); c->bgt = NULL; - ubifs_err("cannot spawn \"%s\", error %d", + ubifs_err(c, "cannot spawn \"%s\", error %d", c->bgt_name, err); goto out_wbufs; } @@ -1264,7 +1266,7 @@ static int mount_ubifs(struct ubifs_info *c) init_constants_master(c); if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) { - ubifs_msg("recovery needed"); + ubifs_msg(c, "recovery needed"); c->need_recovery = 1; } @@ -1284,7 +1286,7 @@ static int mount_ubifs(struct ubifs_info *c) goto out_lpt; } - if (!c->ro_mount) { + if (!c->ro_mount && !c->need_recovery) { /* * Set the "dirty" flag so that if we reboot uncleanly we * will notice this immediately on the next mount. @@ -1373,10 +1375,10 @@ static int mount_ubifs(struct ubifs_info *c) if (c->need_recovery) { if (c->ro_mount) - ubifs_msg("recovery deferred"); + ubifs_msg(c, "recovery deferred"); else { c->need_recovery = 0; - ubifs_msg("recovery completed"); + ubifs_msg(c, "recovery completed"); /* * GC LEB has to be empty and taken at this point. But * the journal head LEBs may also be accounted as @@ -1397,20 +1399,20 @@ static int mount_ubifs(struct ubifs_info *c) c->mounting = 0; - ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"%s", + ubifs_msg(c, "UBIFS: mounted UBI device %d, volume %d, name \"%s\"%s", c->vi.ubi_num, c->vi.vol_id, c->vi.name, c->ro_mount ? ", R/O mode" : ""); x = (long long)c->main_lebs * c->leb_size; y = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes; - ubifs_msg("LEB size: %d bytes (%d KiB), min./max. I/O unit sizes: %d bytes/%d bytes", + ubifs_msg(c, "LEB size: %d bytes (%d KiB), min./max. I/O unit sizes: %d bytes/%d bytes", c->leb_size, c->leb_size >> 10, c->min_io_size, c->max_write_size); - ubifs_msg("FS size: %lld bytes (%lld MiB, %d LEBs), journal size %lld bytes (%lld MiB, %d LEBs)", + ubifs_msg(c, "FS size: %lld bytes (%lld MiB, %d LEBs), journal size %lld bytes (%lld MiB, %d LEBs)", x, x >> 20, c->main_lebs, y, y >> 20, c->log_lebs + c->max_bud_cnt); - ubifs_msg("reserved for root: %llu bytes (%llu KiB)", + ubifs_msg(c, "reserved for root: %llu bytes (%llu KiB)", c->report_rp_size, c->report_rp_size >> 10); - ubifs_msg("media format: w%d/r%d (latest is w%d/r%d), UUID %pUB%s", + ubifs_msg(c, "media format: w%d/r%d (latest is w%d/r%d), UUID %pUB%s", c->fmt_version, c->ro_compat_version, UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION, c->uuid, c->big_lpt ? 
", big LPT model" : ", small LPT model"); @@ -1543,8 +1545,8 @@ static int ubifs_remount_rw(struct ubifs_info *c) int err, lnum; if (c->rw_incompat) { - ubifs_err("the file-system is not R/W-compatible"); - ubifs_msg("on-flash format version is w%d/r%d, but software only supports up to version w%d/r%d", + ubifs_err(c, "the file-system is not R/W-compatible"); + ubifs_msg(c, "on-flash format version is w%d/r%d, but software only supports up to version w%d/r%d", c->fmt_version, c->ro_compat_version, UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION); return -EROFS; @@ -1581,7 +1583,7 @@ static int ubifs_remount_rw(struct ubifs_info *c) } if (c->need_recovery) { - ubifs_msg("completing deferred recovery"); + ubifs_msg(c, "completing deferred recovery"); err = ubifs_write_rcvrd_mst_node(c); if (err) goto out; @@ -1630,7 +1632,7 @@ static int ubifs_remount_rw(struct ubifs_info *c) if (IS_ERR(c->bgt)) { err = PTR_ERR(c->bgt); c->bgt = NULL; - ubifs_err("cannot spawn \"%s\", error %d", + ubifs_err(c, "cannot spawn \"%s\", error %d", c->bgt_name, err); goto out; } @@ -1664,7 +1666,7 @@ static int ubifs_remount_rw(struct ubifs_info *c) if (c->need_recovery) { c->need_recovery = 0; - ubifs_msg("deferred recovery completed"); + ubifs_msg(c, "deferred recovery completed"); } else { /* * Do not run the debugging space check if the were doing @@ -1752,8 +1754,7 @@ static void ubifs_put_super(struct super_block *sb) int i; struct ubifs_info *c = sb->s_fs_info; - ubifs_msg("un-mount UBI device %d, volume %d", c->vi.ubi_num, - c->vi.vol_id); + ubifs_msg(c, "un-mount UBI device %d", c->vi.ubi_num); /* * The following asserts are only valid if there has not been a failure @@ -1809,7 +1810,7 @@ static void ubifs_put_super(struct super_block *sb) * next mount, so we just print a message and * continue to unmount normally. */ - ubifs_err("failed to write master node, error %d", + ubifs_err(c, "failed to write master node, error %d", err); } else { for (i = 0; i < c->jhead_cnt; i++) @@ -1834,17 +1835,17 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) err = ubifs_parse_options(c, data, 1); if (err) { - ubifs_err("invalid or unknown remount parameter"); + ubifs_err(c, "invalid or unknown remount parameter"); return err; } if (c->ro_mount && !(*flags & MS_RDONLY)) { if (c->ro_error) { - ubifs_msg("cannot re-mount R/W due to prior errors"); + ubifs_msg(c, "cannot re-mount R/W due to prior errors"); return -EROFS; } if (c->ro_media) { - ubifs_msg("cannot re-mount R/W - UBI volume is R/O"); + ubifs_msg(c, "cannot re-mount R/W - UBI volume is R/O"); return -EROFS; } err = ubifs_remount_rw(c); @@ -1852,7 +1853,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) return err; } else if (!c->ro_mount && (*flags & MS_RDONLY)) { if (c->ro_error) { - ubifs_msg("cannot re-mount R/O due to prior errors"); + ubifs_msg(c, "cannot re-mount R/O due to prior errors"); return -EROFS; } ubifs_remount_ro(c); @@ -2104,8 +2105,8 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, */ ubi = open_ubi(name, UBI_READONLY); if (IS_ERR(ubi)) { - ubifs_err("cannot open \"%s\", error %d", - name, (int)PTR_ERR(ubi)); + pr_err("UBIFS error (pid: %d): cannot open \"%s\", error %d", + current->pid, name, (int)PTR_ERR(ubi)); return ERR_CAST(ubi); } @@ -2233,8 +2234,8 @@ static int __init ubifs_init(void) * UBIFS_BLOCK_SIZE. It is assumed that both are powers of 2. 
*/ if (PAGE_CACHE_SIZE < UBIFS_BLOCK_SIZE) { - ubifs_err("VFS page cache size is %u bytes, but UBIFS requires at least 4096 bytes", - (unsigned int)PAGE_CACHE_SIZE); + pr_err("UBIFS error (pid %d): VFS page cache size is %u bytes, but UBIFS requires at least 4096 bytes", + current->pid, (unsigned int)PAGE_CACHE_SIZE); return -EINVAL; } @@ -2245,7 +2246,9 @@ static int __init ubifs_init(void) if (!ubifs_inode_slab) return -ENOMEM; - register_shrinker(&ubifs_shrinker_info); + err = register_shrinker(&ubifs_shrinker_info); + if (err) + goto out_slab; err = ubifs_compressors_init(); if (err) @@ -2257,7 +2260,8 @@ static int __init ubifs_init(void) err = register_filesystem(&ubifs_fs_type); if (err) { - ubifs_err("cannot register file system, error %d", err); + pr_err("UBIFS error (pid %d): cannot register file system, error %d", + current->pid, err); goto out_dbg; } return 0; @@ -2268,6 +2272,7 @@ out_compr: ubifs_compressors_exit(); out_shrinker: unregister_shrinker(&ubifs_shrinker_info); +out_slab: kmem_cache_destroy(ubifs_inode_slab); return err; } diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index 6793db0754f6..957f5757f374 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -98,7 +98,7 @@ static int insert_old_idx(struct ubifs_info *c, int lnum, int offs) else if (offs > o->offs) p = &(*p)->rb_right; else { - ubifs_err("old idx added twice!"); + ubifs_err(c, "old idx added twice!"); kfree(old_idx); return 0; } @@ -447,7 +447,7 @@ static int try_read_node(const struct ubifs_info *c, void *buf, int type, err = ubifs_leb_read(c, lnum, buf, offs, len, 1); if (err) { - ubifs_err("cannot read node type %d from LEB %d:%d, error %d", + ubifs_err(c, "cannot read node type %d from LEB %d:%d, error %d", type, lnum, offs, err); return err; } @@ -1684,27 +1684,27 @@ static int validate_data_node(struct ubifs_info *c, void *buf, int err, len; if (ch->node_type != UBIFS_DATA_NODE) { - ubifs_err("bad node type (%d but expected %d)", + ubifs_err(c, "bad node type (%d but expected %d)", ch->node_type, UBIFS_DATA_NODE); goto out_err; } err = ubifs_check_node(c, buf, zbr->lnum, zbr->offs, 0, 0); if (err) { - ubifs_err("expected node type %d", UBIFS_DATA_NODE); + ubifs_err(c, "expected node type %d", UBIFS_DATA_NODE); goto out; } len = le32_to_cpu(ch->len); if (len != zbr->len) { - ubifs_err("bad node length %d, expected %d", len, zbr->len); + ubifs_err(c, "bad node length %d, expected %d", len, zbr->len); goto out_err; } /* Make sure the key of the read node is correct */ key_read(c, buf + UBIFS_KEY_OFFSET, &key1); if (!keys_eq(c, &zbr->key, &key1)) { - ubifs_err("bad key in node at LEB %d:%d", + ubifs_err(c, "bad key in node at LEB %d:%d", zbr->lnum, zbr->offs); dbg_tnck(&zbr->key, "looked for key "); dbg_tnck(&key1, "found node's key "); @@ -1716,7 +1716,7 @@ static int validate_data_node(struct ubifs_info *c, void *buf, out_err: err = -EINVAL; out: - ubifs_err("bad node at LEB %d:%d", zbr->lnum, zbr->offs); + ubifs_err(c, "bad node at LEB %d:%d", zbr->lnum, zbr->offs); ubifs_dump_node(c, buf); dump_stack(); return err; @@ -1741,7 +1741,7 @@ int ubifs_tnc_bulk_read(struct ubifs_info *c, struct bu_info *bu) len = bu->zbranch[bu->cnt - 1].offs; len += bu->zbranch[bu->cnt - 1].len - offs; if (len > bu->buf_len) { - ubifs_err("buffer too small %d vs %d", bu->buf_len, len); + ubifs_err(c, "buffer too small %d vs %d", bu->buf_len, len); return -EINVAL; } @@ -1757,7 +1757,7 @@ int ubifs_tnc_bulk_read(struct ubifs_info *c, struct bu_info *bu) return -EAGAIN; if (err && err != -EBADMSG) { - ubifs_err("failed to 
read from LEB %d:%d, error %d", + ubifs_err(c, "failed to read from LEB %d:%d, error %d", lnum, offs, err); dump_stack(); dbg_tnck(&bu->key, "key "); @@ -3313,7 +3313,7 @@ int dbg_check_inode_size(struct ubifs_info *c, const struct inode *inode, out_dump: block = key_block(c, key); - ubifs_err("inode %lu has size %lld, but there are data at offset %lld", + ubifs_err(c, "inode %lu has size %lld, but there are data at offset %lld", (unsigned long)inode->i_ino, size, ((loff_t)block) << UBIFS_BLOCK_SHIFT); mutex_unlock(&c->tnc_mutex); diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c index 7a205e046776..b45345d701e7 100644 --- a/fs/ubifs/tnc_commit.c +++ b/fs/ubifs/tnc_commit.c @@ -53,7 +53,7 @@ static int make_idx_node(struct ubifs_info *c, struct ubifs_idx_node *idx, br->offs = cpu_to_le32(zbr->offs); br->len = cpu_to_le32(zbr->len); if (!zbr->lnum || !zbr->len) { - ubifs_err("bad ref in znode"); + ubifs_err(c, "bad ref in znode"); ubifs_dump_znode(c, znode); if (zbr->znode) ubifs_dump_znode(c, zbr->znode); @@ -384,7 +384,7 @@ static int layout_in_gaps(struct ubifs_info *c, int cnt) * Do not print scary warnings if the debugging * option which forces in-the-gaps is enabled. */ - ubifs_warn("out of space"); + ubifs_warn(c, "out of space"); ubifs_dump_budg(c, &c->bi); ubifs_dump_lprops(c); } @@ -441,7 +441,7 @@ static int layout_in_empty_space(struct ubifs_info *c) /* Determine the index node position */ if (lnum == -1) { if (c->ileb_nxt >= c->ileb_cnt) { - ubifs_err("out of space"); + ubifs_err(c, "out of space"); return -ENOSPC; } lnum = c->ilebs[c->ileb_nxt++]; @@ -855,7 +855,7 @@ static int write_index(struct ubifs_info *c) br->offs = cpu_to_le32(zbr->offs); br->len = cpu_to_le32(zbr->len); if (!zbr->lnum || !zbr->len) { - ubifs_err("bad ref in znode"); + ubifs_err(c, "bad ref in znode"); ubifs_dump_znode(c, znode); if (zbr->znode) ubifs_dump_znode(c, zbr->znode); @@ -875,7 +875,7 @@ static int write_index(struct ubifs_info *c) if (lnum != znode->lnum || offs != znode->offs || len != znode->len) { - ubifs_err("inconsistent znode posn"); + ubifs_err(c, "inconsistent znode posn"); return -EINVAL; } @@ -973,7 +973,7 @@ static int write_index(struct ubifs_info *c) if (lnum != c->dbg->new_ihead_lnum || buf_offs != c->dbg->new_ihead_offs) { - ubifs_err("inconsistent ihead"); + ubifs_err(c, "inconsistent ihead"); return -EINVAL; } diff --git a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c index f6bf8995c7b1..93f5b7859e6f 100644 --- a/fs/ubifs/tnc_misc.c +++ b/fs/ubifs/tnc_misc.c @@ -293,9 +293,9 @@ static int read_znode(struct ubifs_info *c, int lnum, int offs, int len, lnum, offs, znode->level, znode->child_cnt); if (znode->child_cnt > c->fanout || znode->level > UBIFS_MAX_LEVELS) { - ubifs_err("current fanout %d, branch count %d", + ubifs_err(c, "current fanout %d, branch count %d", c->fanout, znode->child_cnt); - ubifs_err("max levels %d, znode level %d", + ubifs_err(c, "max levels %d, znode level %d", UBIFS_MAX_LEVELS, znode->level); err = 1; goto out_dump; @@ -316,7 +316,7 @@ static int read_znode(struct ubifs_info *c, int lnum, int offs, int len, if (zbr->lnum < c->main_first || zbr->lnum >= c->leb_cnt || zbr->offs < 0 || zbr->offs + zbr->len > c->leb_size || zbr->offs & 7) { - ubifs_err("bad branch %d", i); + ubifs_err(c, "bad branch %d", i); err = 2; goto out_dump; } @@ -328,7 +328,7 @@ static int read_znode(struct ubifs_info *c, int lnum, int offs, int len, case UBIFS_XENT_KEY: break; default: - ubifs_err("bad key type at slot %d: %d", + ubifs_err(c, "bad key type at slot %d: %d", 
i, key_type(c, &zbr->key)); err = 3; goto out_dump; @@ -340,17 +340,17 @@ static int read_znode(struct ubifs_info *c, int lnum, int offs, int len, type = key_type(c, &zbr->key); if (c->ranges[type].max_len == 0) { if (zbr->len != c->ranges[type].len) { - ubifs_err("bad target node (type %d) length (%d)", + ubifs_err(c, "bad target node (type %d) length (%d)", type, zbr->len); - ubifs_err("have to be %d", c->ranges[type].len); + ubifs_err(c, "have to be %d", c->ranges[type].len); err = 4; goto out_dump; } } else if (zbr->len < c->ranges[type].min_len || zbr->len > c->ranges[type].max_len) { - ubifs_err("bad target node (type %d) length (%d)", + ubifs_err(c, "bad target node (type %d) length (%d)", type, zbr->len); - ubifs_err("have to be in range of %d-%d", + ubifs_err(c, "have to be in range of %d-%d", c->ranges[type].min_len, c->ranges[type].max_len); err = 5; @@ -370,12 +370,12 @@ static int read_znode(struct ubifs_info *c, int lnum, int offs, int len, cmp = keys_cmp(c, key1, key2); if (cmp > 0) { - ubifs_err("bad key order (keys %d and %d)", i, i + 1); + ubifs_err(c, "bad key order (keys %d and %d)", i, i + 1); err = 6; goto out_dump; } else if (cmp == 0 && !is_hash_key(c, key1)) { /* These can only be keys with colliding hash */ - ubifs_err("keys %d and %d are not hashed but equivalent", + ubifs_err(c, "keys %d and %d are not hashed but equivalent", i, i + 1); err = 7; goto out_dump; @@ -386,7 +386,7 @@ static int read_znode(struct ubifs_info *c, int lnum, int offs, int len, return 0; out_dump: - ubifs_err("bad indexing node at LEB %d:%d, error %d", lnum, offs, err); + ubifs_err(c, "bad indexing node at LEB %d:%d, error %d", lnum, offs, err); ubifs_dump_node(c, idx); kfree(idx); return -EINVAL; @@ -482,7 +482,7 @@ int ubifs_tnc_read_node(struct ubifs_info *c, struct ubifs_zbranch *zbr, /* Make sure the key of the read node is correct */ key_read(c, node + UBIFS_KEY_OFFSET, &key1); if (!keys_eq(c, key, &key1)) { - ubifs_err("bad key in node at LEB %d:%d", + ubifs_err(c, "bad key in node at LEB %d:%d", zbr->lnum, zbr->offs); dbg_tnck(key, "looked for key "); dbg_tnck(&key1, "but found node's key "); diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index bc04b9c69891..de759022f3d6 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -43,15 +43,19 @@ #define UBIFS_VERSION 1 /* Normal UBIFS messages */ -#define ubifs_msg(fmt, ...) pr_notice("UBIFS: " fmt "\n", ##__VA_ARGS__) +#define ubifs_msg(c, fmt, ...) \ + pr_notice("UBIFS (ubi%d:%d): " fmt "\n", \ + (c)->vi.ubi_num, (c)->vi.vol_id, ##__VA_ARGS__) /* UBIFS error messages */ -#define ubifs_err(fmt, ...) \ - pr_err("UBIFS error (pid %d): %s: " fmt "\n", current->pid, \ +#define ubifs_err(c, fmt, ...) \ + pr_err("UBIFS error (ubi%d:%d pid %d): %s: " fmt "\n", \ + (c)->vi.ubi_num, (c)->vi.vol_id, current->pid, \ __func__, ##__VA_ARGS__) /* UBIFS warning messages */ -#define ubifs_warn(fmt, ...) \ - pr_warn("UBIFS warning (pid %d): %s: " fmt "\n", \ - current->pid, __func__, ##__VA_ARGS__) +#define ubifs_warn(c, fmt, ...) \ + pr_warn("UBIFS warning (ubi%d:%d pid %d): %s: " fmt "\n", \ + (c)->vi.ubi_num, (c)->vi.vol_id, current->pid, \ + __func__, ##__VA_ARGS__) /* * A variant of 'ubifs_err()' which takes the UBIFS file-sytem description * object as an argument. @@ -59,7 +63,7 @@ #define ubifs_errc(c, fmt, ...) 
\ do { \ if (!(c)->probing) \ - ubifs_err(fmt, ##__VA_ARGS__); \ + ubifs_err(c, fmt, ##__VA_ARGS__); \ } while (0) /* UBIFS file system VFS magic number */ @@ -158,7 +162,7 @@ #define WORST_COMPR_FACTOR 2 /* - * How much memory is needed for a buffer where we comress a data node. + * How much memory is needed for a buffer where we compress a data node. */ #define COMPRESSED_DATA_NODE_BUF_SZ \ (UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE * WORST_COMPR_FACTOR) @@ -664,7 +668,7 @@ typedef int (*ubifs_lpt_scan_callback)(struct ubifs_info *c, * @lock: serializes @buf, @lnum, @offs, @avail, @used, @next_ino and @inodes * fields * @softlimit: soft write-buffer timeout interval - * @delta: hard and soft timeouts delta (the timer expire inteval is @softlimit + * @delta: hard and soft timeouts delta (the timer expire interval is @softlimit * and @softlimit + @delta) * @timer: write-buffer timer * @no_timer: non-zero if this write-buffer does not have a timer @@ -930,9 +934,9 @@ struct ubifs_orphan { /** * struct ubifs_mount_opts - UBIFS-specific mount options information. * @unmount_mode: selected unmount mode (%0 default, %1 normal, %2 fast) - * @bulk_read: enable/disable bulk-reads (%0 default, %1 disabe, %2 enable) + * @bulk_read: enable/disable bulk-reads (%0 default, %1 disable, %2 enable) * @chk_data_crc: enable/disable CRC data checking when reading data nodes - * (%0 default, %1 disabe, %2 enable) + * (%0 default, %1 disable, %2 enable) * @override_compr: override default compressor (%0 - do not override and use * superblock compressor, %1 - override and use compressor * specified in @compr_type) @@ -962,9 +966,9 @@ struct ubifs_mount_opts { * optimization) * @nospace_rp: the same as @nospace, but additionally means that even reserved * pool is full - * @page_budget: budget for a page (constant, nenver changed after mount) - * @inode_budget: budget for an inode (constant, nenver changed after mount) - * @dent_budget: budget for a directory entry (constant, nenver changed after + * @page_budget: budget for a page (constant, never changed after mount) + * @inode_budget: budget for an inode (constant, never changed after mount) + * @dent_budget: budget for a directory entry (constant, never changed after * mount) */ struct ubifs_budg_info { @@ -1787,10 +1791,10 @@ long ubifs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); /* compressor.c */ int __init ubifs_compressors_init(void); void ubifs_compressors_exit(void); -void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len, - int *compr_type); -int ubifs_decompress(const void *buf, int len, void *out, int *out_len, - int compr_type); +void ubifs_compress(const struct ubifs_info *c, const void *in_buf, int in_len, + void *out_buf, int *out_len, int *compr_type); +int ubifs_decompress(const struct ubifs_info *c, const void *buf, int len, + void *out, int *out_len, int compr_type); #include "debug.h" #include "misc.h" diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index a92be244a6fb..96f3448b6eb4 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -108,7 +108,7 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, .dirtied_ino_d = ALIGN(host_ui->data_len, 8) }; if (host_ui->xattr_cnt >= MAX_XATTRS_PER_INODE) { - ubifs_err("inode %lu already has too many xattrs (%d), cannot create more", + ubifs_err(c, "inode %lu already has too many xattrs (%d), cannot create more", host->i_ino, host_ui->xattr_cnt); return -ENOSPC; } @@ -120,7 +120,7 @@ static int create_xattr(struct ubifs_info *c, struct 
inode *host, */ names_len = host_ui->xattr_names + host_ui->xattr_cnt + nm->len + 1; if (names_len > XATTR_LIST_MAX) { - ubifs_err("cannot add one more xattr name to inode %lu, total names length would become %d, max. is %d", + ubifs_err(c, "cannot add one more xattr name to inode %lu, total names length would become %d, max. is %d", host->i_ino, names_len, XATTR_LIST_MAX); return -ENOSPC; } @@ -288,13 +288,13 @@ static struct inode *iget_xattr(struct ubifs_info *c, ino_t inum) inode = ubifs_iget(c->vfs_sb, inum); if (IS_ERR(inode)) { - ubifs_err("dead extended attribute entry, error %d", + ubifs_err(c, "dead extended attribute entry, error %d", (int)PTR_ERR(inode)); return inode; } if (ubifs_inode(inode)->xattr) return inode; - ubifs_err("corrupt extended attribute entry"); + ubifs_err(c, "corrupt extended attribute entry"); iput(inode); return ERR_PTR(-EINVAL); } @@ -364,15 +364,15 @@ int ubifs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { dbg_gen("xattr '%s', host ino %lu ('%pd'), size %zd", - name, dentry->d_inode->i_ino, dentry, size); + name, d_inode(dentry)->i_ino, dentry, size); - return setxattr(dentry->d_inode, name, value, size, flags); + return setxattr(d_inode(dentry), name, value, size, flags); } ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf, size_t size) { - struct inode *inode, *host = dentry->d_inode; + struct inode *inode, *host = d_inode(dentry); struct ubifs_info *c = host->i_sb->s_fs_info; struct qstr nm = QSTR_INIT(name, strlen(name)); struct ubifs_inode *ui; @@ -412,7 +412,7 @@ ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf, if (buf) { /* If @buf is %NULL we are supposed to return the length */ if (ui->data_len > size) { - ubifs_err("buffer size %zd, xattr len %d", + ubifs_err(c, "buffer size %zd, xattr len %d", size, ui->data_len); err = -ERANGE; goto out_iput; @@ -432,7 +432,7 @@ out_unlock: ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size) { union ubifs_key key; - struct inode *host = dentry->d_inode; + struct inode *host = d_inode(dentry); struct ubifs_info *c = host->i_sb->s_fs_info; struct ubifs_inode *host_ui = ubifs_inode(host); struct ubifs_dent_node *xent, *pxent = NULL; @@ -485,7 +485,7 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size) kfree(pxent); if (err != -ENOENT) { - ubifs_err("cannot find next direntry, error %d", err); + ubifs_err(c, "cannot find next direntry, error %d", err); return err; } @@ -535,7 +535,7 @@ out_cancel: int ubifs_removexattr(struct dentry *dentry, const char *name) { - struct inode *inode, *host = dentry->d_inode; + struct inode *inode, *host = d_inode(dentry); struct ubifs_info *c = host->i_sb->s_fs_info; struct qstr nm = QSTR_INIT(name, strlen(name)); struct ubifs_dent_node *xent; @@ -657,8 +657,10 @@ int ubifs_init_security(struct inode *dentry, struct inode *inode, &init_xattrs, 0); mutex_unlock(&inode->i_mutex); - if (err) - ubifs_err("cannot initialize security for inode %lu, error %d", + if (err) { + struct ubifs_info *c = dentry->i_sb->s_fs_info; + ubifs_err(c, "cannot initialize security for inode %lu, error %d", inode->i_ino, err); + } return err; } diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c index 1ba2baaf4367..6d6a96b4e73f 100644 --- a/fs/udf/balloc.c +++ b/fs/udf/balloc.c @@ -21,7 +21,6 @@ #include "udfdecl.h" -#include <linux/buffer_head.h> #include <linux/bitops.h> #include "udf_i.h" @@ -63,15 +62,14 @@ static int __load_block_bitmap(struct super_block *sb, 
block_group, nr_groups); } - if (bitmap->s_block_bitmap[block_group]) { + if (bitmap->s_block_bitmap[block_group]) return block_group; - } else { - retval = read_block_bitmap(sb, bitmap, block_group, - block_group); - if (retval < 0) - return retval; - return block_group; - } + + retval = read_block_bitmap(sb, bitmap, block_group, block_group); + if (retval < 0) + return retval; + + return block_group; } static inline int load_block_bitmap(struct super_block *sb, @@ -358,7 +356,6 @@ static void udf_table_free_blocks(struct super_block *sb, struct kernel_lb_addr eloc; struct extent_position oepos, epos; int8_t etype; - int i; struct udf_inode_info *iinfo; mutex_lock(&sbi->s_alloc_mutex); @@ -425,7 +422,6 @@ static void udf_table_free_blocks(struct super_block *sb, } if (epos.bh != oepos.bh) { - i = -1; oepos.block = epos.block; brelse(oepos.bh); get_bh(epos.bh); @@ -762,7 +758,7 @@ inline int udf_prealloc_blocks(struct super_block *sb, uint32_t block_count) { struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition]; - sector_t allocated; + int allocated; if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) allocated = udf_bitmap_prealloc_blocks(sb, diff --git a/fs/udf/dir.c b/fs/udf/dir.c index 05e90edd1992..541d9c65014d 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c @@ -30,7 +30,6 @@ #include <linux/errno.h> #include <linux/mm.h> #include <linux/slab.h> -#include <linux/buffer_head.h> #include "udf_i.h" #include "udf_sb.h" @@ -169,7 +168,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) } flen = udf_get_filename(sb, nameptr, lfi, fname, UDF_NAME_LEN); - if (!flen) + if (flen < 0) continue; tloc = lelb_to_cpu(cfi.icb.extLocation); diff --git a/fs/udf/directory.c b/fs/udf/directory.c index 3e44f575fb9c..c763fda257bf 100644 --- a/fs/udf/directory.c +++ b/fs/udf/directory.c @@ -16,7 +16,6 @@ #include <linux/fs.h> #include <linux/string.h> -#include <linux/buffer_head.h> struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos, struct udf_fileident_bh *fibh, diff --git a/fs/udf/file.c b/fs/udf/file.c index 08f3555fbeac..bddf3d071dae 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -33,8 +33,7 @@ #include <linux/capability.h> #include <linux/errno.h> #include <linux/pagemap.h> -#include <linux/buffer_head.h> -#include <linux/aio.h> +#include <linux/uio.h> #include "udf_i.h" #include "udf_sb.h" @@ -100,8 +99,7 @@ static int udf_adinicb_write_begin(struct file *file, return 0; } -static ssize_t udf_adinicb_direct_IO(int rw, struct kiocb *iocb, - struct iov_iter *iter, +static ssize_t udf_adinicb_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) { /* Fallback to buffered I/O. 
*/ @@ -121,21 +119,21 @@ static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ssize_t retval; struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); - int err, pos; - size_t count = iocb->ki_nbytes; struct udf_inode_info *iinfo = UDF_I(inode); + int err; mutex_lock(&inode->i_mutex); + + retval = generic_write_checks(iocb, from); + if (retval <= 0) + goto out; + down_write(&iinfo->i_data_sem); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { - if (file->f_flags & O_APPEND) - pos = inode->i_size; - else - pos = iocb->ki_pos; + loff_t end = iocb->ki_pos + iov_iter_count(from); if (inode->i_sb->s_blocksize < - (udf_file_entry_alloc_offset(inode) + - pos + count)) { + (udf_file_entry_alloc_offset(inode) + end)) { err = udf_expand_file_adinicb(inode); if (err) { mutex_unlock(&inode->i_mutex); @@ -143,21 +141,17 @@ static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from) return err; } } else { - if (pos + count > inode->i_size) - iinfo->i_lenAlloc = pos + count; - else - iinfo->i_lenAlloc = inode->i_size; + iinfo->i_lenAlloc = max(end, inode->i_size); up_write(&iinfo->i_data_sem); } } else up_write(&iinfo->i_data_sem); retval = __generic_file_write_iter(iocb, from); +out: mutex_unlock(&inode->i_mutex); if (retval > 0) { - ssize_t err; - mark_inode_dirty(inode); err = generic_write_sync(file, iocb->ki_pos - retval, retval); if (err < 0) @@ -240,12 +234,10 @@ static int udf_release_file(struct inode *inode, struct file *filp) } const struct file_operations udf_file_operations = { - .read = new_sync_read, .read_iter = generic_file_read_iter, .unlocked_ioctl = udf_ioctl, .open = generic_file_open, .mmap = generic_file_mmap, - .write = new_sync_write, .write_iter = udf_file_write_iter, .release = udf_release_file, .fsync = generic_file_fsync, @@ -255,7 +247,7 @@ const struct file_operations udf_file_operations = { static int udf_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int error; error = inode_change_ok(inode, attr); diff --git a/fs/udf/inode.c b/fs/udf/inode.c index a445d599098d..8d0b3ade0ff0 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -33,12 +33,11 @@ #include <linux/mm.h> #include <linux/module.h> #include <linux/pagemap.h> -#include <linux/buffer_head.h> #include <linux/writeback.h> #include <linux/slab.h> #include <linux/crc-itu-t.h> #include <linux/mpage.h> -#include <linux/aio.h> +#include <linux/uio.h> #include "udf_i.h" #include "udf_sb.h" @@ -215,8 +214,7 @@ static int udf_write_begin(struct file *file, struct address_space *mapping, return ret; } -static ssize_t udf_direct_IO(int rw, struct kiocb *iocb, - struct iov_iter *iter, +static ssize_t udf_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; @@ -225,8 +223,8 @@ static ssize_t udf_direct_IO(int rw, struct kiocb *iocb, size_t count = iov_iter_count(iter); ssize_t ret; - ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, udf_get_block); - if (unlikely(ret < 0 && (rw & WRITE))) + ret = blockdev_direct_IO(iocb, inode, iter, offset, udf_get_block); + if (unlikely(ret < 0 && iov_iter_rw(iter) == WRITE)) udf_write_failed(mapping, offset + count); return ret; } @@ -1637,7 +1635,7 @@ static int udf_update_inode(struct inode *inode, int do_sync) udf_get_lb_pblock(inode->i_sb, &iinfo->i_location, 0)); if (!bh) { udf_debug("getblk failure\n"); - return -ENOMEM; + return -EIO; } lock_buffer(bh); @@ -1654,17 +1652,9 @@ 
static int udf_update_inode(struct inode *inode, int do_sync) iinfo->i_ext.i_data, inode->i_sb->s_blocksize - sizeof(struct unallocSpaceEntry)); use->descTag.tagIdent = cpu_to_le16(TAG_IDENT_USE); - use->descTag.tagLocation = - cpu_to_le32(iinfo->i_location.logicalBlockNum); - crclen = sizeof(struct unallocSpaceEntry) + - iinfo->i_lenAlloc - sizeof(struct tag); - use->descTag.descCRCLength = cpu_to_le16(crclen); - use->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)use + - sizeof(struct tag), - crclen)); - use->descTag.tagChecksum = udf_tag_checksum(&use->descTag); + crclen = sizeof(struct unallocSpaceEntry); - goto out; + goto finish; } if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_FORGET)) @@ -1784,6 +1774,8 @@ static int udf_update_inode(struct inode *inode, int do_sync) efe->descTag.tagIdent = cpu_to_le16(TAG_IDENT_EFE); crclen = sizeof(struct extendedFileEntry); } + +finish: if (iinfo->i_strat4096) { fe->icbTag.strategyType = cpu_to_le16(4096); fe->icbTag.strategyParameter = cpu_to_le16(1); @@ -1793,7 +1785,9 @@ static int udf_update_inode(struct inode *inode, int do_sync) fe->icbTag.numEntries = cpu_to_le16(1); } - if (S_ISDIR(inode->i_mode)) + if (iinfo->i_use) + fe->icbTag.fileType = ICBTAG_FILE_TYPE_USE; + else if (S_ISDIR(inode->i_mode)) fe->icbTag.fileType = ICBTAG_FILE_TYPE_DIRECTORY; else if (S_ISREG(inode->i_mode)) fe->icbTag.fileType = ICBTAG_FILE_TYPE_REGULAR; @@ -1830,7 +1824,6 @@ static int udf_update_inode(struct inode *inode, int do_sync) crclen)); fe->descTag.tagChecksum = udf_tag_checksum(&fe->descTag); -out: set_buffer_uptodate(bh); unlock_buffer(bh); diff --git a/fs/udf/misc.c b/fs/udf/misc.c index c175b4dabc14..71d1c25f360d 100644 --- a/fs/udf/misc.c +++ b/fs/udf/misc.c @@ -23,7 +23,6 @@ #include <linux/fs.h> #include <linux/string.h> -#include <linux/buffer_head.h> #include <linux/crc-itu-t.h> #include "udf_i.h" diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 33b246b82c98..c97b5a8d1e24 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -27,7 +27,6 @@ #include <linux/errno.h> #include <linux/mm.h> #include <linux/slab.h> -#include <linux/buffer_head.h> #include <linux/sched.h> #include <linux/crc-itu-t.h> #include <linux/exportfs.h> @@ -139,6 +138,25 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi, return 0; } +/** + * udf_find_entry - find entry in given directory. + * + * @dir: directory inode to search in + * @child: qstr of the name + * @fibh: buffer head / inode with file identifier descriptor we found + * @cfi: found file identifier descriptor with given name + * + * This function searches in the directory @dir for a file name @child. When + * found, @fibh points to the buffer head(s) (bh is NULL for in ICB + * directories) containing the file identifier descriptor (FID). In that case + * the function returns pointer to the FID in the buffer or inode - but note + * that FID may be split among two buffers (blocks) so accessing it via that + * pointer isn't easily possible. This pointer can be used only as an iterator + * for other directory manipulation functions. For inspection of the FID @cfi + * can be used - the found FID is copied there. + * + * Returns pointer to FID, NULL when nothing found, or error code. 
+ */ static struct fileIdentDesc *udf_find_entry(struct inode *dir, const struct qstr *child, struct udf_fileident_bh *fibh, @@ -168,8 +186,11 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir, fibh->soffset = fibh->eoffset = f_pos & (sb->s_blocksize - 1); if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { if (inode_bmap(dir, f_pos >> sb->s_blocksize_bits, &epos, - &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) + &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) { + fi = ERR_PTR(-EIO); goto out_err; + } + block = udf_get_lb_pblock(sb, &eloc, offset); if ((++offset << sb->s_blocksize_bits) < elen) { if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) @@ -180,19 +201,25 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir, offset = 0; fibh->sbh = fibh->ebh = udf_tread(sb, block); - if (!fibh->sbh) + if (!fibh->sbh) { + fi = ERR_PTR(-EIO); goto out_err; + } } fname = kmalloc(UDF_NAME_LEN, GFP_NOFS); - if (!fname) + if (!fname) { + fi = ERR_PTR(-ENOMEM); goto out_err; + } while (f_pos < size) { fi = udf_fileident_read(dir, &f_pos, fibh, cfi, &epos, &eloc, &elen, &offset); - if (!fi) + if (!fi) { + fi = ERR_PTR(-EIO); goto out_err; + } liu = le16_to_cpu(cfi->lengthOfImpUse); lfi = cfi->lengthFileIdent; @@ -235,12 +262,17 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir, continue; flen = udf_get_filename(sb, nameptr, lfi, fname, UDF_NAME_LEN); - if (flen && udf_match(flen, fname, child->len, child->name)) + if (flen < 0) { + fi = ERR_PTR(flen); + goto out_err; + } + + if (udf_match(flen, fname, child->len, child->name)) goto out_ok; } -out_err: fi = NULL; +out_err: if (fibh->sbh != fibh->ebh) brelse(fibh->ebh); brelse(fibh->sbh); @@ -257,6 +289,7 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry, struct inode *inode = NULL; struct fileIdentDesc cfi; struct udf_fileident_bh fibh; + struct fileIdentDesc *fi; if (dentry->d_name.len > UDF_NAME_LEN - 2) return ERR_PTR(-ENAMETOOLONG); @@ -276,7 +309,11 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry, } else #endif /* UDF_RECOVERY */ - if (udf_find_entry(dir, &dentry->d_name, &fibh, &cfi)) { + fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); + if (IS_ERR(fi)) + return ERR_CAST(fi); + + if (fi) { struct kernel_lb_addr loc; if (fibh.sbh != fibh.ebh) @@ -552,7 +589,7 @@ static int udf_delete_entry(struct inode *inode, struct fileIdentDesc *fi, static int udf_add_nondir(struct dentry *dentry, struct inode *inode) { struct udf_inode_info *iinfo = UDF_I(inode); - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); struct udf_fileident_bh fibh; struct fileIdentDesc cfi, *fi; int err; @@ -569,8 +606,8 @@ static int udf_add_nondir(struct dentry *dentry, struct inode *inode) *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = cpu_to_le32(iinfo->i_unique & 0x00000000FFFFFFFFUL); udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); - if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) - mark_inode_dirty(dir); + dir->i_ctime = dir->i_mtime = current_fs_time(dir->i_sb); + mark_inode_dirty(dir); if (fibh.sbh != fibh.ebh) brelse(fibh.ebh); brelse(fibh.sbh); @@ -683,6 +720,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) cfi.fileCharacteristics |= FID_FILE_CHAR_DIRECTORY; udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); inc_nlink(dir); + dir->i_ctime = dir->i_mtime = current_fs_time(dir->i_sb); mark_inode_dirty(dir); unlock_new_inode(inode); d_instantiate(dentry, 
inode); @@ -767,15 +805,18 @@ static int empty_dir(struct inode *dir) static int udf_rmdir(struct inode *dir, struct dentry *dentry) { int retval; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct udf_fileident_bh fibh; struct fileIdentDesc *fi, cfi; struct kernel_lb_addr tloc; retval = -ENOENT; fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); - if (!fi) + if (IS_ERR_OR_NULL(fi)) { + if (fi) + retval = PTR_ERR(fi); goto out; + } retval = -EIO; tloc = lelb_to_cpu(cfi.icb.extLocation); @@ -809,7 +850,7 @@ out: static int udf_unlink(struct inode *dir, struct dentry *dentry) { int retval; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct udf_fileident_bh fibh; struct fileIdentDesc *fi; struct fileIdentDesc cfi; @@ -817,8 +858,12 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry) retval = -ENOENT; fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); - if (!fi) + + if (IS_ERR_OR_NULL(fi)) { + if (fi) + retval = PTR_ERR(fi); goto out; + } retval = -EIO; tloc = lelb_to_cpu(cfi.icb.extLocation); @@ -999,7 +1044,7 @@ out_no_entry: static int udf_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); struct udf_fileident_bh fibh; struct fileIdentDesc cfi, *fi; int err; @@ -1024,6 +1069,8 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir, inc_nlink(inode); inode->i_ctime = current_fs_time(inode->i_sb); mark_inode_dirty(inode); + dir->i_ctime = dir->i_mtime = current_fs_time(dir->i_sb); + mark_inode_dirty(dir); ihold(inode); d_instantiate(dentry, inode); @@ -1036,8 +1083,8 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir, static int udf_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { - struct inode *old_inode = old_dentry->d_inode; - struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); struct udf_fileident_bh ofibh, nfibh; struct fileIdentDesc *ofi = NULL, *nfi = NULL, *dir_fi = NULL; struct fileIdentDesc ocfi, ncfi; @@ -1047,24 +1094,30 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry, struct udf_inode_info *old_iinfo = UDF_I(old_inode); ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); - if (ofi) { - if (ofibh.sbh != ofibh.ebh) - brelse(ofibh.ebh); - brelse(ofibh.sbh); + if (IS_ERR(ofi)) { + retval = PTR_ERR(ofi); + goto end_rename; } + + if (ofibh.sbh != ofibh.ebh) + brelse(ofibh.ebh); + + brelse(ofibh.sbh); tloc = lelb_to_cpu(ocfi.icb.extLocation); if (!ofi || udf_get_lb_pblock(old_dir->i_sb, &tloc, 0) != old_inode->i_ino) goto end_rename; nfi = udf_find_entry(new_dir, &new_dentry->d_name, &nfibh, &ncfi); - if (nfi) { - if (!new_inode) { - if (nfibh.sbh != nfibh.ebh) - brelse(nfibh.ebh); - brelse(nfibh.sbh); - nfi = NULL; - } + if (IS_ERR(nfi)) { + retval = PTR_ERR(nfi); + goto end_rename; + } + if (nfi && !new_inode) { + if (nfibh.sbh != nfibh.ebh) + brelse(nfibh.ebh); + brelse(nfibh.sbh); + nfi = NULL; } if (S_ISDIR(old_inode->i_mode)) { int offset = udf_ext0_offset(old_inode); @@ -1127,7 +1180,9 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry, inode_dec_link_count(new_inode); } old_dir->i_ctime = old_dir->i_mtime = current_fs_time(old_dir->i_sb); + new_dir->i_ctime = new_dir->i_mtime = current_fs_time(new_dir->i_sb); 
mark_inode_dirty(old_dir); + mark_inode_dirty(new_dir); if (dir_fi) { dir_fi->icb.extLocation = cpu_to_lelb(UDF_I(new_dir)->i_location); @@ -1175,7 +1230,7 @@ static struct dentry *udf_get_parent(struct dentry *child) struct fileIdentDesc cfi; struct udf_fileident_bh fibh; - if (!udf_find_entry(child->d_inode, &dotdot, &fibh, &cfi)) + if (!udf_find_entry(d_inode(child), &dotdot, &fibh, &cfi)) return ERR_PTR(-EACCES); if (fibh.sbh != fibh.ebh) @@ -1183,7 +1238,7 @@ static struct dentry *udf_get_parent(struct dentry *child) brelse(fibh.sbh); tloc = lelb_to_cpu(cfi.icb.extLocation); - inode = udf_iget(child->d_inode->i_sb, &tloc); + inode = udf_iget(d_inode(child)->i_sb, &tloc); if (IS_ERR(inode)) return ERR_CAST(inode); @@ -1217,7 +1272,7 @@ static struct dentry *udf_nfs_get_inode(struct super_block *sb, u32 block, static struct dentry *udf_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { - if ((fh_len != 3 && fh_len != 5) || + if (fh_len < 3 || (fh_type != FILEID_UDF_WITH_PARENT && fh_type != FILEID_UDF_WITHOUT_PARENT)) return NULL; @@ -1229,7 +1284,7 @@ static struct dentry *udf_fh_to_dentry(struct super_block *sb, static struct dentry *udf_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { - if (fh_len != 5 || fh_type != FILEID_UDF_WITH_PARENT) + if (fh_len < 5 || fh_type != FILEID_UDF_WITH_PARENT) return NULL; return udf_nfs_get_inode(sb, fid->udf.parent_block, diff --git a/fs/udf/partition.c b/fs/udf/partition.c index d6caf01a2097..5f861ed287c3 100644 --- a/fs/udf/partition.c +++ b/fs/udf/partition.c @@ -24,7 +24,6 @@ #include <linux/fs.h> #include <linux/string.h> -#include <linux/buffer_head.h> #include <linux/mutex.h> uint32_t udf_get_pblock(struct super_block *sb, uint32_t block, diff --git a/fs/udf/super.c b/fs/udf/super.c index f169411c4ea0..b96f190bc567 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -48,7 +48,6 @@ #include <linux/stat.h> #include <linux/cdrom.h> #include <linux/nls.h> -#include <linux/buffer_head.h> #include <linux/vfs.h> #include <linux/vmalloc.h> #include <linux/errno.h> @@ -928,17 +927,23 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block) #endif } - if (!udf_build_ustr(instr, pvoldesc->volIdent, 32)) - if (udf_CS0toUTF8(outstr, instr)) { - strncpy(UDF_SB(sb)->s_volume_ident, outstr->u_name, - outstr->u_len > 31 ? 31 : outstr->u_len); - udf_debug("volIdent[] = '%s'\n", - UDF_SB(sb)->s_volume_ident); - } + if (!udf_build_ustr(instr, pvoldesc->volIdent, 32)) { + ret = udf_CS0toUTF8(outstr, instr); + if (ret < 0) + goto out_bh; + + strncpy(UDF_SB(sb)->s_volume_ident, outstr->u_name, + outstr->u_len > 31 ? 
31 : outstr->u_len); + udf_debug("volIdent[] = '%s'\n", UDF_SB(sb)->s_volume_ident); + } - if (!udf_build_ustr(instr, pvoldesc->volSetIdent, 128)) - if (udf_CS0toUTF8(outstr, instr)) - udf_debug("volSetIdent[] = '%s'\n", outstr->u_name); + if (!udf_build_ustr(instr, pvoldesc->volSetIdent, 128)) { + ret = udf_CS0toUTF8(outstr, instr); + if (ret < 0) + goto out_bh; + + udf_debug("volSetIdent[] = '%s'\n", outstr->u_name); + } ret = 0; out_bh: diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c index ac10ca939f26..862535b3ba58 100644 --- a/fs/udf/symlink.c +++ b/fs/udf/symlink.c @@ -27,7 +27,6 @@ #include <linux/mm.h> #include <linux/stat.h> #include <linux/pagemap.h> -#include <linux/buffer_head.h> #include "udf_i.h" static int udf_pc_to_char(struct super_block *sb, unsigned char *from, @@ -83,6 +82,9 @@ static int udf_pc_to_char(struct super_block *sb, unsigned char *from, comp_len = udf_get_filename(sb, pc->componentIdent, pc->lengthComponentIdent, p, tolen); + if (comp_len < 0) + return comp_len; + p += comp_len; tolen -= comp_len; if (tolen == 0) diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c index 8a9657d7f7c6..42b8c57795cb 100644 --- a/fs/udf/truncate.c +++ b/fs/udf/truncate.c @@ -22,7 +22,6 @@ #include "udfdecl.h" #include <linux/fs.h> #include <linux/mm.h> -#include <linux/buffer_head.h> #include "udf_i.h" #include "udf_sb.h" diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h index b5cd8ed2aa12..b1b9a63d8cf3 100644 --- a/fs/udf/udf_i.h +++ b/fs/udf/udf_i.h @@ -56,7 +56,7 @@ struct udf_inode_info { static inline struct udf_inode_info *UDF_I(struct inode *inode) { - return list_entry(inode, struct udf_inode_info, vfs_inode); + return container_of(inode, struct udf_inode_info, vfs_inode); } #endif /* _UDF_I_H) */ diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c index b84fee372734..ab478e62baae 100644 --- a/fs/udf/unicode.c +++ b/fs/udf/unicode.c @@ -68,21 +68,16 @@ int udf_build_ustr(struct ustr *dest, dstring *ptr, int size) /* * udf_build_ustr_exact */ -static int udf_build_ustr_exact(struct ustr *dest, dstring *ptr, int exactsize) +static void udf_build_ustr_exact(struct ustr *dest, dstring *ptr, int exactsize) { - if ((!dest) || (!ptr) || (!exactsize)) - return -1; - memset(dest, 0, sizeof(struct ustr)); dest->u_cmpID = ptr[0]; dest->u_len = exactsize - 1; memcpy(dest->u_name, ptr + 1, exactsize - 1); - - return 0; } /* - * udf_ocu_to_utf8 + * udf_CS0toUTF8 * * PURPOSE * Convert OSTA Compressed Unicode to the UTF-8 equivalent. @@ -94,7 +89,7 @@ static int udf_build_ustr_exact(struct ustr *dest, dstring *ptr, int exactsize) * both of type "struct ustr *" * * POST-CONDITIONS - * <return> Zero on success. + * <return> >= 0 on success. * * HISTORY * November 12, 1997 - Andrew E. Mileski @@ -117,7 +112,7 @@ int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i) memset(utf_o, 0, sizeof(struct ustr)); pr_err("unknown compression code (%d) stri=%s\n", cmp_id, ocu_i->u_name); - return 0; + return -EINVAL; } ocu = ocu_i->u_name; @@ -154,7 +149,7 @@ int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i) /* * - * udf_utf8_to_ocu + * udf_UTF8toCS0 * * PURPOSE * Convert UTF-8 to the OSTA Compressed Unicode equivalent. 
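[Editorial note] The unicode.c hunks on either side of this point all converge on one calling convention: udf_CS0toUTF8(), udf_CS0toNLS() and udf_get_filename() now report failure as a negative errno value instead of returning 0, so a zero-length result can no longer masquerade as an error. That is what the udf_readdir() and udf_find_entry() callers earlier in this diff rely on when they switch from "if (!flen)" to "if (flen < 0)". A minimal userspace sketch of the convention — decode_name() and its 8/16 "compression id" check are illustrative stand-ins, not the UDF code:

#include <errno.h>
#include <stdio.h>
#include <string.h>

/*
 * Hypothetical decoder in the new style: returns the decoded length
 * (>= 0) on success, or a negative errno value on failure. Errors
 * and "empty name" no longer share the return value 0.
 */
static int decode_name(const char *in, char *out, size_t outlen)
{
	if (in[0] != 8 && in[0] != 16)	/* unknown compression id */
		return -EINVAL;

	size_t n = strlen(in + 1);

	if (n + 1 > outlen)
		return -ERANGE;
	memcpy(out, in + 1, n + 1);
	return (int)n;
}

int main(void)
{
	char buf[32];
	int len = decode_name("\010hello", buf, sizeof(buf));

	if (len < 0)	/* the new-style check: failures are negative */
		fprintf(stderr, "decode failed: %d\n", len);
	else
		printf("decoded %d bytes: %s\n", len, buf);
	return 0;
}

With the old "0 means failure" convention, the caller's continue-on-zero test silently skipped real errors such as -ENOMEM; with this one they propagate.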
@@ -270,7 +265,7 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o, memset(utf_o, 0, sizeof(struct ustr)); pr_err("unknown compression code (%d) stri=%s\n", cmp_id, ocu_i->u_name); - return 0; + return -EINVAL; } ocu = ocu_i->u_name; @@ -338,43 +333,51 @@ int udf_get_filename(struct super_block *sb, uint8_t *sname, int slen, uint8_t *dname, int dlen) { struct ustr *filename, *unifilename; - int len = 0; + int ret; + + if (!slen) + return -EIO; filename = kmalloc(sizeof(struct ustr), GFP_NOFS); if (!filename) - return 0; + return -ENOMEM; unifilename = kmalloc(sizeof(struct ustr), GFP_NOFS); - if (!unifilename) + if (!unifilename) { + ret = -ENOMEM; goto out1; + } - if (udf_build_ustr_exact(unifilename, sname, slen)) - goto out2; - + udf_build_ustr_exact(unifilename, sname, slen); if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) { - if (!udf_CS0toUTF8(filename, unifilename)) { + ret = udf_CS0toUTF8(filename, unifilename); + if (ret < 0) { udf_debug("Failed in udf_get_filename: sname = %s\n", sname); goto out2; } } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) { - if (!udf_CS0toNLS(UDF_SB(sb)->s_nls_map, filename, - unifilename)) { + ret = udf_CS0toNLS(UDF_SB(sb)->s_nls_map, filename, + unifilename); + if (ret < 0) { udf_debug("Failed in udf_get_filename: sname = %s\n", sname); goto out2; } } else - goto out2; + BUG(); - len = udf_translate_to_linux(dname, dlen, + ret = udf_translate_to_linux(dname, dlen, filename->u_name, filename->u_len, unifilename->u_name, unifilename->u_len); + /* Zero length filename isn't valid... */ + if (ret == 0) + ret = -EINVAL; out2: kfree(unifilename); out1: kfree(filename); - return len; + return ret; } int udf_put_filename(struct super_block *sb, const uint8_t *sname, diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index 2c1036080d52..a7106eda5024 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -51,8 +51,8 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) if (ufs_fragnum(fragment) + count > uspi->s_fpg) ufs_error (sb, "ufs_free_fragments", "internal error"); - - lock_ufs(sb); + + mutex_lock(&UFS_SB(sb)->s_lock); cgno = ufs_dtog(uspi, fragment); bit = ufs_dtogd(uspi, fragment); @@ -115,13 +115,13 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) if (sb->s_flags & MS_SYNCHRONOUS) ubh_sync_block(UCPI_UBH(ucpi)); ufs_mark_sb_dirty(sb); - - unlock_ufs(sb); + + mutex_unlock(&UFS_SB(sb)->s_lock); UFSD("EXIT\n"); return; failed: - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); UFSD("EXIT (FAILED)\n"); return; } @@ -151,7 +151,7 @@ void ufs_free_blocks(struct inode *inode, u64 fragment, unsigned count) goto failed; } - lock_ufs(sb); + mutex_lock(&UFS_SB(sb)->s_lock); do_more: overflow = 0; @@ -211,12 +211,12 @@ do_more: } ufs_mark_sb_dirty(sb); - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); UFSD("EXIT\n"); return; failed_unlock: - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); failed: UFSD("EXIT (FAILED)\n"); return; @@ -357,7 +357,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, usb1 = ubh_get_usb_first(uspi); *err = -ENOSPC; - lock_ufs(sb); + mutex_lock(&UFS_SB(sb)->s_lock); tmp = ufs_data_ptr_to_cpu(sb, p); if (count + ufs_fragnum(fragment) > uspi->s_fpb) { @@ -378,19 +378,19 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, "fragment %llu, tmp %llu\n", (unsigned long long)fragment, (unsigned long long)tmp); - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); return INVBLOCK; } if (fragment < UFS_I(inode)->i_lastfrag) { UFSD("EXIT 
(ALREADY ALLOCATED)\n"); - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); return 0; } } else { if (tmp) { UFSD("EXIT (ALREADY ALLOCATED)\n"); - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); return 0; } } @@ -399,7 +399,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, * There is not enough space for user on the device */ if (!capable(CAP_SYS_RESOURCE) && ufs_freespace(uspi, UFS_MINFREE) <= 0) { - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); UFSD("EXIT (FAILED)\n"); return 0; } @@ -424,7 +424,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, ufs_clear_frags(inode, result + oldcount, newcount - oldcount, locked_page != NULL); } - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); UFSD("EXIT, result %llu\n", (unsigned long long)result); return result; } @@ -439,7 +439,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, fragment + count); ufs_clear_frags(inode, result + oldcount, newcount - oldcount, locked_page != NULL); - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); UFSD("EXIT, result %llu\n", (unsigned long long)result); return result; } @@ -477,7 +477,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, *err = 0; UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag, fragment + count); - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); if (newcount < request) ufs_free_fragments (inode, result + newcount, request - newcount); ufs_free_fragments (inode, tmp, oldcount); @@ -485,7 +485,7 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, return result; } - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); UFSD("EXIT (FAILED)\n"); return 0; } diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 0ecc2cebed8f..74f2e80288bf 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -65,11 +65,6 @@ static inline void ufs_put_page(struct page *page) page_cache_release(page); } -static inline unsigned long ufs_dir_pages(struct inode *inode) -{ - return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; -} - ino_t ufs_inode_by_name(struct inode *dir, const struct qstr *qstr) { ino_t res = 0; @@ -87,7 +82,8 @@ ino_t ufs_inode_by_name(struct inode *dir, const struct qstr *qstr) /* Releases the page */ void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, - struct page *page, struct inode *inode) + struct page *page, struct inode *inode, + bool update_times) { loff_t pos = page_offset(page) + (char *) de - (char *) page_address(page); @@ -103,7 +99,8 @@ void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, err = ufs_commit_chunk(page, pos, len); ufs_put_page(page); - dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + if (update_times) + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(dir); } @@ -256,7 +253,7 @@ struct ufs_dir_entry *ufs_find_entry(struct inode *dir, const struct qstr *qstr, int namelen = qstr->len; unsigned reclen = UFS_DIR_REC_LEN(namelen); unsigned long start, n; - unsigned long npages = ufs_dir_pages(dir); + unsigned long npages = dir_pages(dir); struct page *page = NULL; struct ufs_inode_info *ui = UFS_I(dir); struct ufs_dir_entry *de; @@ -311,7 +308,7 @@ found: */ int ufs_add_link(struct dentry *dentry, struct inode *inode) { - struct inode *dir = dentry->d_parent->d_inode; + struct inode *dir = d_inode(dentry->d_parent); const unsigned char *name = dentry->d_name.name; int namelen = dentry->d_name.len; struct super_block *sb = dir->i_sb; @@ -320,7 +317,7 @@ int ufs_add_link(struct dentry *dentry, struct inode *inode) unsigned 
short rec_len, name_len; struct page *page = NULL; struct ufs_dir_entry *de; - unsigned long npages = ufs_dir_pages(dir); + unsigned long npages = dir_pages(dir); unsigned long n; char *kaddr; loff_t pos; @@ -437,7 +434,7 @@ ufs_readdir(struct file *file, struct dir_context *ctx) struct super_block *sb = inode->i_sb; unsigned int offset = pos & ~PAGE_CACHE_MASK; unsigned long n = pos >> PAGE_CACHE_SHIFT; - unsigned long npages = ufs_dir_pages(inode); + unsigned long npages = dir_pages(inode); unsigned chunk_mask = ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1); int need_revalidate = file->f_version != inode->i_version; unsigned flags = UFS_SB(sb)->s_flags; @@ -608,7 +605,7 @@ int ufs_empty_dir(struct inode * inode) { struct super_block *sb = inode->i_sb; struct page *page = NULL; - unsigned long i, npages = ufs_dir_pages(inode); + unsigned long i, npages = dir_pages(inode); for (i = 0; i < npages; i++) { char *kaddr; diff --git a/fs/ufs/file.c b/fs/ufs/file.c index c84ec010a676..042ddbf110cc 100644 --- a/fs/ufs/file.c +++ b/fs/ufs/file.c @@ -35,9 +35,7 @@ const struct file_operations ufs_file_operations = { .llseek = generic_file_llseek, - .read = new_sync_read, .read_iter = generic_file_read_iter, - .write = new_sync_write, .write_iter = generic_file_write_iter, .mmap = generic_file_mmap, .open = generic_file_open, diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index 7caa01652888..fd0203ce1f7f 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -69,11 +69,11 @@ void ufs_free_inode (struct inode * inode) ino = inode->i_ino; - lock_ufs(sb); + mutex_lock(&UFS_SB(sb)->s_lock); if (!((ino > 1) && (ino < (uspi->s_ncg * uspi->s_ipg )))) { ufs_warning(sb, "ufs_free_inode", "reserved inode or nonexistent inode %u\n", ino); - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); return; } @@ -81,7 +81,7 @@ void ufs_free_inode (struct inode * inode) bit = ufs_inotocgoff (ino); ucpi = ufs_load_cylinder (sb, cg); if (!ucpi) { - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); return; } ucg = ubh_get_ucg(UCPI_UBH(ucpi)); @@ -115,7 +115,7 @@ void ufs_free_inode (struct inode * inode) ubh_sync_block(UCPI_UBH(ucpi)); ufs_mark_sb_dirty(sb); - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); UFSD("EXIT\n"); } @@ -193,7 +193,7 @@ struct inode *ufs_new_inode(struct inode *dir, umode_t mode) sbi = UFS_SB(sb); uspi = sbi->s_uspi; - lock_ufs(sb); + mutex_lock(&sbi->s_lock); /* * Try to place the inode in its parent directory @@ -331,21 +331,21 @@ cg_found: sync_dirty_buffer(bh); brelse(bh); } - unlock_ufs(sb); + mutex_unlock(&sbi->s_lock); UFSD("allocating inode %lu\n", inode->i_ino); UFSD("EXIT\n"); return inode; fail_remove_inode: - unlock_ufs(sb); + mutex_unlock(&sbi->s_lock); clear_nlink(inode); unlock_new_inode(inode); iput(inode); UFSD("EXIT (FAILED): err %d\n", err); return ERR_PTR(err); failed: - unlock_ufs(sb); + mutex_unlock(&sbi->s_lock); make_bad_inode(inode); iput (inode); UFSD("EXIT (FAILED): err %d\n", err); diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index be7d42c7d938..f913a6924b23 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -572,9 +572,10 @@ static void ufs_set_inode_ops(struct inode *inode) inode->i_fop = &ufs_dir_operations; inode->i_mapping->a_ops = &ufs_aops; } else if (S_ISLNK(inode->i_mode)) { - if (!inode->i_blocks) + if (!inode->i_blocks) { inode->i_op = &ufs_fast_symlink_inode_operations; - else { + inode->i_link = (char *)UFS_I(inode)->i_u1.i_symlink; + } else { inode->i_op = &ufs_symlink_inode_operations; inode->i_mapping->a_ops = &ufs_aops; } @@ -902,6 +903,9 @@ void 
ufs_evict_inode(struct inode * inode) invalidate_inode_buffers(inode); clear_inode(inode); - if (want_delete) + if (want_delete) { + lock_ufs(inode->i_sb); ufs_free_inode(inode); + unlock_ufs(inode->i_sb); + } } diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index fd65deb4b5f0..47966554317c 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -56,11 +56,9 @@ static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, unsi if (dentry->d_name.len > UFS_MAXNAMLEN) return ERR_PTR(-ENAMETOOLONG); - lock_ufs(dir->i_sb); ino = ufs_inode_by_name(dir, &dentry->d_name); if (ino) inode = ufs_iget(dir->i_sb, ino); - unlock_ufs(dir->i_sb); return d_splice_alias(inode, dentry); } @@ -76,24 +74,16 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, umode_t mode, bool excl) { struct inode *inode; - int err; - - UFSD("BEGIN\n"); inode = ufs_new_inode(dir, mode); - err = PTR_ERR(inode); + if (IS_ERR(inode)) + return PTR_ERR(inode); - if (!IS_ERR(inode)) { - inode->i_op = &ufs_file_inode_operations; - inode->i_fop = &ufs_file_operations; - inode->i_mapping->a_ops = &ufs_aops; - mark_inode_dirty(inode); - lock_ufs(dir->i_sb); - err = ufs_add_nondir(dentry, inode); - unlock_ufs(dir->i_sb); - } - UFSD("END: err=%d\n", err); - return err; + inode->i_op = &ufs_file_inode_operations; + inode->i_fop = &ufs_file_operations; + inode->i_mapping->a_ops = &ufs_aops; + mark_inode_dirty(inode); + return ufs_add_nondir(dentry, inode); } static int ufs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) @@ -110,9 +100,7 @@ static int ufs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev init_special_inode(inode, mode, rdev); ufs_set_inode_dev(inode->i_sb, UFS_I(inode), rdev); mark_inode_dirty(inode); - lock_ufs(dir->i_sb); err = ufs_add_nondir(dentry, inode); - unlock_ufs(dir->i_sb); } return err; } @@ -121,19 +109,18 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry, const char * symname) { struct super_block * sb = dir->i_sb; - int err = -ENAMETOOLONG; + int err; unsigned l = strlen(symname)+1; struct inode * inode; if (l > sb->s_blocksize) - goto out_notlocked; + return -ENAMETOOLONG; inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO); err = PTR_ERR(inode); if (IS_ERR(inode)) - goto out_notlocked; + return err; - lock_ufs(dir->i_sb); if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) { /* slow symlink */ inode->i_op = &ufs_symlink_inode_operations; @@ -144,38 +131,37 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry, } else { /* fast symlink */ inode->i_op = &ufs_fast_symlink_inode_operations; - memcpy(UFS_I(inode)->i_u1.i_symlink, symname, l); + inode->i_link = (char *)UFS_I(inode)->i_u1.i_symlink; + memcpy(inode->i_link, symname, l); inode->i_size = l-1; } mark_inode_dirty(inode); - err = ufs_add_nondir(dentry, inode); -out: - unlock_ufs(dir->i_sb); -out_notlocked: - return err; + return ufs_add_nondir(dentry, inode); out_fail: inode_dec_link_count(inode); unlock_new_inode(inode); iput(inode); - goto out; + return err; } static int ufs_link (struct dentry * old_dentry, struct inode * dir, struct dentry *dentry) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); int error; - lock_ufs(dir->i_sb); - inode->i_ctime = CURRENT_TIME_SEC; inode_inc_link_count(inode); ihold(inode); - error = ufs_add_nondir(dentry, inode); - unlock_ufs(dir->i_sb); + error = ufs_add_link(dentry, inode); + if (error) { + inode_dec_link_count(inode); + iput(inode); + } else + d_instantiate(dentry, inode); 
return error; } @@ -184,9 +170,12 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) struct inode * inode; int err; + inode_inc_link_count(dir); + inode = ufs_new_inode(dir, S_IFDIR|mode); + err = PTR_ERR(inode); if (IS_ERR(inode)) - return PTR_ERR(inode); + goto out_dir; inode->i_op = &ufs_dir_inode_operations; inode->i_fop = &ufs_dir_operations; @@ -194,9 +183,6 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) inode_inc_link_count(inode); - lock_ufs(dir->i_sb); - inode_inc_link_count(dir); - err = ufs_make_empty(inode, dir); if (err) goto out_fail; @@ -204,25 +190,24 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) err = ufs_add_link(dentry, inode); if (err) goto out_fail; - unlock_ufs(dir->i_sb); + unlock_new_inode(inode); d_instantiate(dentry, inode); -out: - return err; + return 0; out_fail: inode_dec_link_count(inode); inode_dec_link_count(inode); unlock_new_inode(inode); iput (inode); +out_dir: inode_dec_link_count(dir); - unlock_ufs(dir->i_sb); - goto out; + return err; } static int ufs_unlink(struct inode *dir, struct dentry *dentry) { - struct inode * inode = dentry->d_inode; + struct inode * inode = d_inode(dentry); struct ufs_dir_entry *de; struct page *page; int err = -ENOENT; @@ -244,10 +229,9 @@ out: static int ufs_rmdir (struct inode * dir, struct dentry *dentry) { - struct inode * inode = dentry->d_inode; + struct inode * inode = d_inode(dentry); int err= -ENOTEMPTY; - lock_ufs(dir->i_sb); if (ufs_empty_dir (inode)) { err = ufs_unlink(dir, dentry); if (!err) { @@ -256,15 +240,14 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry) inode_dec_link_count(dir); } } - unlock_ufs(dir->i_sb); return err; } static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { - struct inode *old_inode = old_dentry->d_inode; - struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = d_inode(old_dentry); + struct inode *new_inode = d_inode(new_dentry); struct page *dir_page = NULL; struct ufs_dir_entry * dir_de = NULL; struct page *old_page; @@ -294,7 +277,7 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry, new_de = ufs_find_entry(new_dir, &new_dentry->d_name, &new_page); if (!new_de) goto out_dir; - ufs_set_link(new_dir, new_de, new_page, old_inode); + ufs_set_link(new_dir, new_de, new_page, old_inode, 1); new_inode->i_ctime = CURRENT_TIME_SEC; if (dir_de) drop_nlink(new_inode); @@ -317,7 +300,12 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry, mark_inode_dirty(old_inode); if (dir_de) { - ufs_set_link(old_inode, dir_de, dir_page, new_dir); + if (old_dir != new_dir) + ufs_set_link(old_inode, dir_de, dir_page, new_dir, 0); + else { + kunmap(dir_page); + page_cache_release(dir_page); + } inode_dec_link_count(old_dir); } return 0; diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 8092d3759a5e..250579a80d90 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -80,6 +80,7 @@ #include <linux/stat.h> #include <linux/string.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/init.h> #include <linux/parser.h> #include <linux/buffer_head.h> @@ -144,10 +145,10 @@ static struct dentry *ufs_get_parent(struct dentry *child) struct qstr dot_dot = QSTR_INIT("..", 2); ino_t ino; - ino = ufs_inode_by_name(child->d_inode, &dot_dot); + ino = ufs_inode_by_name(d_inode(child), &dot_dot); if (!ino) return ERR_PTR(-ENOENT); - return 
d_obtain_alias(ufs_iget(child->d_inode->i_sb, ino)); + return d_obtain_alias(ufs_iget(d_inode(child)->i_sb, ino)); } static const struct export_operations ufs_export_ops = { @@ -694,6 +695,7 @@ static int ufs_sync_fs(struct super_block *sb, int wait) unsigned flags; lock_ufs(sb); + mutex_lock(&UFS_SB(sb)->s_lock); UFSD("ENTER\n"); @@ -711,6 +713,7 @@ static int ufs_sync_fs(struct super_block *sb, int wait) ufs_put_cstotal(sb); UFSD("EXIT\n"); + mutex_unlock(&UFS_SB(sb)->s_lock); unlock_ufs(sb); return 0; @@ -799,6 +802,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) UFSD("flag %u\n", (int)(sb->s_flags & MS_RDONLY)); mutex_init(&sbi->mutex); + mutex_init(&sbi->s_lock); spin_lock_init(&sbi->work_lock); INIT_DELAYED_WORK(&sbi->sync_work, delayed_sync_fs); /* @@ -1277,6 +1281,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) sync_filesystem(sb); lock_ufs(sb); + mutex_lock(&UFS_SB(sb)->s_lock); uspi = UFS_SB(sb)->s_uspi; flags = UFS_SB(sb)->s_flags; usb1 = ubh_get_usb_first(uspi); @@ -1290,6 +1295,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) new_mount_opt = 0; ufs_set_opt (new_mount_opt, ONERROR_LOCK); if (!ufs_parse_options (data, &new_mount_opt)) { + mutex_unlock(&UFS_SB(sb)->s_lock); unlock_ufs(sb); return -EINVAL; } @@ -1297,12 +1303,14 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) new_mount_opt |= ufstype; } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) { pr_err("ufstype can't be changed during remount\n"); + mutex_unlock(&UFS_SB(sb)->s_lock); unlock_ufs(sb); return -EINVAL; } if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { UFS_SB(sb)->s_mount_opt = new_mount_opt; + mutex_unlock(&UFS_SB(sb)->s_lock); unlock_ufs(sb); return 0; } @@ -1326,6 +1334,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) */ #ifndef CONFIG_UFS_FS_WRITE pr_err("ufs was compiled with read-only support, can't be mounted as read-write\n"); + mutex_unlock(&UFS_SB(sb)->s_lock); unlock_ufs(sb); return -EINVAL; #else @@ -1335,11 +1344,13 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) ufstype != UFS_MOUNT_UFSTYPE_SUNx86 && ufstype != UFS_MOUNT_UFSTYPE_UFS2) { pr_err("this ufstype is read-only supported\n"); + mutex_unlock(&UFS_SB(sb)->s_lock); unlock_ufs(sb); return -EINVAL; } if (!ufs_read_cylinder_structures(sb)) { pr_err("failed during remounting\n"); + mutex_unlock(&UFS_SB(sb)->s_lock); unlock_ufs(sb); return -EPERM; } @@ -1347,6 +1358,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) #endif } UFS_SB(sb)->s_mount_opt = new_mount_opt; + mutex_unlock(&UFS_SB(sb)->s_lock); unlock_ufs(sb); return 0; } diff --git a/fs/ufs/symlink.c b/fs/ufs/symlink.c index d283628b4778..874480bb43e9 100644 --- a/fs/ufs/symlink.c +++ b/fs/ufs/symlink.c @@ -25,23 +25,12 @@ * ext2 symlink handling code */ -#include <linux/fs.h> -#include <linux/namei.h> - #include "ufs_fs.h" #include "ufs.h" - -static void *ufs_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - struct ufs_inode_info *p = UFS_I(dentry->d_inode); - nd_set_link(nd, (char*)p->i_u1.i_symlink); - return NULL; -} - const struct inode_operations ufs_fast_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = ufs_follow_link, + .follow_link = simple_follow_link, .setattr = ufs_setattr, }; diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c index f04f89fbd4d9..21154704c168 100644 --- 
a/fs/ufs/truncate.c +++ b/fs/ufs/truncate.c @@ -492,7 +492,7 @@ out: int ufs_setattr(struct dentry *dentry, struct iattr *attr) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); unsigned int ia_valid = attr->ia_valid; int error; diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index 2a07396d5f9e..2e31ea2e35a3 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -30,6 +30,7 @@ struct ufs_sb_info { int work_queued; /* non-zero if the delayed work is queued */ struct delayed_work sync_work; /* FS sync delayed work */ spinlock_t work_lock; /* protects sync_work and work_queued */ + struct mutex s_lock; }; struct ufs_inode_info { @@ -105,7 +106,7 @@ extern int ufs_delete_entry(struct inode *, struct ufs_dir_entry *, struct page extern int ufs_empty_dir (struct inode *); extern struct ufs_dir_entry *ufs_dotdot(struct inode *, struct page **); extern void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, - struct page *page, struct inode *inode); + struct page *page, struct inode *inode, bool update_times); /* file.c */ extern const struct inode_operations ufs_file_inode_operations; diff --git a/fs/xattr.c b/fs/xattr.c index 4ef698549e31..072fee1258dd 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -298,18 +298,18 @@ vfs_removexattr(struct dentry *dentry, const char *name) mutex_lock(&inode->i_mutex); error = security_inode_removexattr(dentry, name); - if (error) { - mutex_unlock(&inode->i_mutex); - return error; - } + if (error) + goto out; error = inode->i_op->removexattr(dentry, name); - mutex_unlock(&inode->i_mutex); if (!error) { fsnotify_xattr(dentry); evm_inode_post_removexattr(dentry, name); } + +out: + mutex_unlock(&inode->i_mutex); return error; } EXPORT_SYMBOL_GPL(vfs_removexattr); diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index a6fbf4472017..f9e9ffe6fb46 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -149,13 +149,27 @@ xfs_alloc_compute_aligned( { xfs_agblock_t bno; xfs_extlen_t len; + xfs_extlen_t diff; /* Trim busy sections out of found extent */ xfs_extent_busy_trim(args, foundbno, foundlen, &bno, &len); + /* + * If we have a largish extent that happens to start before min_agbno, + * see if we can shift it into range... + */ + if (bno < args->min_agbno && bno + len > args->min_agbno) { + diff = args->min_agbno - bno; + if (len > diff) { + bno += diff; + len -= diff; + } + } + if (args->alignment > 1 && len >= args->minlen) { xfs_agblock_t aligned_bno = roundup(bno, args->alignment); - xfs_extlen_t diff = aligned_bno - bno; + + diff = aligned_bno - bno; *resbno = aligned_bno; *reslen = diff >= len ? 0 : len - diff; @@ -260,6 +274,7 @@ xfs_alloc_fix_len( rlen = rlen - (k - args->mod); else rlen = rlen - args->prod + (args->mod - k); + /* casts to (int) catch length underflows */ if ((int)rlen < (int)args->minlen) return; ASSERT(rlen >= args->minlen && rlen <= args->maxlen); @@ -286,7 +301,8 @@ xfs_alloc_fix_minleft( if (diff >= 0) return 1; args->len += diff; /* shrink the allocated space */ - if (args->len >= args->minlen) + /* casts to (int) catch length underflows */ + if ((int)args->len >= (int)args->minlen) return 1; args->agbno = NULLAGBLOCK; return 0; @@ -315,6 +331,9 @@ xfs_alloc_fixup_trees( xfs_agblock_t nfbno2; /* second new free startblock */ xfs_extlen_t nflen1=0; /* first new free length */ xfs_extlen_t nflen2=0; /* second new free length */ + struct xfs_mount *mp; + + mp = cnt_cur->bc_mp; /* * Look up the record in the by-size tree if necessary. 
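The xfs_alloc_compute_aligned() hunk above trims a candidate free extent that straddles the new min_agbno limit before the usual alignment round-up is applied. Below is a minimal userspace model of that trimming arithmetic; the type and function names are illustrative, not the kernel's:

#include <assert.h>
#include <stdint.h>

typedef uint32_t agblock_t;     /* stands in for xfs_agblock_t */
typedef uint32_t extlen_t;      /* stands in for xfs_extlen_t */

/*
 * If a free extent starts below min_agbno but crosses into the allowed
 * range, shift its start up to min_agbno and shorten it by the same
 * amount, as the hunk above does before the alignment round-up.
 */
static void trim_to_min_agbno(agblock_t min_agbno, agblock_t *bno, extlen_t *len)
{
        if (*bno < min_agbno && *bno + *len > min_agbno) {
                extlen_t diff = min_agbno - *bno;

                if (*len > diff) {
                        *bno += diff;
                        *len -= diff;
                }
        }
}

int main(void)
{
        agblock_t bno = 90;     /* candidate extent covers blocks [90, 130) */
        extlen_t len = 40;

        trim_to_min_agbno(100, &bno, &len);
        assert(bno == 100 && len == 30);        /* now [100, 130) */
        return 0;
}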
@@ -323,13 +342,13 @@ xfs_alloc_fixup_trees( #ifdef DEBUG if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i))) return error; - XFS_WANT_CORRUPTED_RETURN( + XFS_WANT_CORRUPTED_RETURN(mp, i == 1 && nfbno1 == fbno && nflen1 == flen); #endif } else { if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); } /* * Look up the record in the by-block tree if necessary. @@ -338,13 +357,13 @@ xfs_alloc_fixup_trees( #ifdef DEBUG if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i))) return error; - XFS_WANT_CORRUPTED_RETURN( + XFS_WANT_CORRUPTED_RETURN(mp, i == 1 && nfbno1 == fbno && nflen1 == flen); #endif } else { if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); } #ifdef DEBUG @@ -355,7 +374,7 @@ xfs_alloc_fixup_trees( bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]); cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]); - XFS_WANT_CORRUPTED_RETURN( + XFS_WANT_CORRUPTED_RETURN(mp, bnoblock->bb_numrecs == cntblock->bb_numrecs); } #endif @@ -386,25 +405,25 @@ xfs_alloc_fixup_trees( */ if ((error = xfs_btree_delete(cnt_cur, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); /* * Add new by-size btree entry(s). */ if (nfbno1 != NULLAGBLOCK) { if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(i == 0); + XFS_WANT_CORRUPTED_RETURN(mp, i == 0); if ((error = xfs_btree_insert(cnt_cur, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); } if (nfbno2 != NULLAGBLOCK) { if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(i == 0); + XFS_WANT_CORRUPTED_RETURN(mp, i == 0); if ((error = xfs_btree_insert(cnt_cur, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); } /* * Fix up the by-block btree entry(s). @@ -415,7 +434,7 @@ xfs_alloc_fixup_trees( */ if ((error = xfs_btree_delete(bno_cur, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); } else { /* * Update the by-block entry to start later|be shorter. @@ -429,10 +448,10 @@ xfs_alloc_fixup_trees( */ if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(i == 0); + XFS_WANT_CORRUPTED_RETURN(mp, i == 0); if ((error = xfs_btree_insert(bno_cur, &i))) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); } return 0; } @@ -682,7 +701,7 @@ xfs_alloc_ag_vextent_exact( error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); ASSERT(fbno <= args->agbno); /* @@ -783,16 +802,20 @@ xfs_alloc_find_best_extent( error = xfs_alloc_get_rec(*scur, sbno, slen, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); xfs_alloc_compute_aligned(args, *sbno, *slen, sbnoa, slena); /* * The good extent is closer than this one. 
*/ if (!dir) { + if (*sbnoa > args->max_agbno) + goto out_use_good; if (*sbnoa >= args->agbno + gdiff) goto out_use_good; } else { + if (*sbnoa < args->min_agbno) + goto out_use_good; if (*sbnoa <= args->agbno - gdiff) goto out_use_good; } @@ -879,6 +902,17 @@ xfs_alloc_ag_vextent_near( dofirst = prandom_u32() & 1; #endif + /* handle uninitialized agbno range so caller doesn't have to */ + if (!args->min_agbno && !args->max_agbno) + args->max_agbno = args->mp->m_sb.sb_agblocks - 1; + ASSERT(args->min_agbno <= args->max_agbno); + + /* clamp agbno to the range if it's outside */ + if (args->agbno < args->min_agbno) + args->agbno = args->min_agbno; + if (args->agbno > args->max_agbno) + args->agbno = args->max_agbno; + restart: bno_cur_lt = NULL; bno_cur_gt = NULL; @@ -946,7 +980,7 @@ restart: if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); if (ltlen >= args->minlen) break; if ((error = xfs_btree_increment(cnt_cur, 0, &i))) goto error0; @@ -966,11 +1000,13 @@ restart: */ if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); xfs_alloc_compute_aligned(args, ltbno, ltlen, &ltbnoa, &ltlena); if (ltlena < args->minlen) continue; + if (ltbnoa < args->min_agbno || ltbnoa > args->max_agbno) + continue; args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); xfs_alloc_fix_len(args); ASSERT(args->len >= args->minlen); @@ -999,7 +1035,7 @@ restart: cnt_cur->bc_ptrs[0] = besti; if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); args->len = blen; if (!xfs_alloc_fix_minleft(args)) { @@ -1088,14 +1124,14 @@ restart: if (bno_cur_lt) { if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); xfs_alloc_compute_aligned(args, ltbno, ltlen, &ltbnoa, &ltlena); - if (ltlena >= args->minlen) + if (ltlena >= args->minlen && ltbnoa >= args->min_agbno) break; if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i))) goto error0; - if (!i) { + if (!i || ltbnoa < args->min_agbno) { xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR); bno_cur_lt = NULL; @@ -1104,14 +1140,14 @@ restart: if (bno_cur_gt) { if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); xfs_alloc_compute_aligned(args, gtbno, gtlen, &gtbnoa, &gtlena); - if (gtlena >= args->minlen) + if (gtlena >= args->minlen && gtbnoa <= args->max_agbno) break; if ((error = xfs_btree_increment(bno_cur_gt, 0, &i))) goto error0; - if (!i) { + if (!i || gtbnoa > args->max_agbno) { xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_NOERROR); bno_cur_gt = NULL; @@ -1211,6 +1247,7 @@ restart: ASSERT(ltnew >= ltbno); ASSERT(ltnew + rlen <= ltbnoa + ltlena); ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); + ASSERT(ltnew >= args->min_agbno && ltnew <= args->max_agbno); args->agbno = ltnew; if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen, @@ -1303,7 +1340,7 @@ restart: error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, 
error0); xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen); @@ -1342,7 +1379,7 @@ restart: * This can't happen in the second case above. */ rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); - XFS_WANT_CORRUPTED_GOTO(rlen == 0 || + XFS_WANT_CORRUPTED_GOTO(args->mp, rlen == 0 || (rlen <= flen && rbno + rlen <= fbno + flen), error0); if (rlen < args->maxlen) { xfs_agblock_t bestfbno; @@ -1362,13 +1399,13 @@ restart: if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); if (flen < bestrlen) break; xfs_alloc_compute_aligned(args, fbno, flen, &rbno, &rlen); rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); - XFS_WANT_CORRUPTED_GOTO(rlen == 0 || + XFS_WANT_CORRUPTED_GOTO(args->mp, rlen == 0 || (rlen <= flen && rbno + rlen <= fbno + flen), error0); if (rlen > bestrlen) { @@ -1383,7 +1420,7 @@ restart: if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); rlen = bestrlen; rbno = bestrbno; flen = bestflen; @@ -1408,7 +1445,7 @@ restart: if (!xfs_alloc_fix_minleft(args)) goto out_nominleft; rlen = args->len; - XFS_WANT_CORRUPTED_GOTO(rlen <= flen, error0); + XFS_WANT_CORRUPTED_GOTO(args->mp, rlen <= flen, error0); /* * Allocate and initialize a cursor for the by-block tree. */ @@ -1422,7 +1459,7 @@ restart: cnt_cur = bno_cur = NULL; args->len = rlen; args->agbno = rbno; - XFS_WANT_CORRUPTED_GOTO( + XFS_WANT_CORRUPTED_GOTO(args->mp, args->agbno + args->len <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length), error0); @@ -1467,7 +1504,7 @@ xfs_alloc_ag_vextent_small( if (i) { if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); } /* * Nothing in the btree, try the freelist. Make sure @@ -1493,7 +1530,7 @@ xfs_alloc_ag_vextent_small( } args->len = 1; args->agbno = fbno; - XFS_WANT_CORRUPTED_GOTO( + XFS_WANT_CORRUPTED_GOTO(args->mp, args->agbno + args->len <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length), error0); @@ -1579,7 +1616,7 @@ xfs_free_ag_extent( */ if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); /* * It's not contiguous, though. */ @@ -1591,7 +1628,8 @@ xfs_free_ag_extent( * space was invalid, it's (partly) already free. * Very bad. */ - XFS_WANT_CORRUPTED_GOTO(ltbno + ltlen <= bno, error0); + XFS_WANT_CORRUPTED_GOTO(mp, + ltbno + ltlen <= bno, error0); } } /* @@ -1606,7 +1644,7 @@ xfs_free_ag_extent( */ if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); /* * It's not contiguous, though. */ @@ -1618,7 +1656,7 @@ xfs_free_ag_extent( * space was invalid, it's (partly) already free. * Very bad. */ - XFS_WANT_CORRUPTED_GOTO(gtbno >= bno + len, error0); + XFS_WANT_CORRUPTED_GOTO(mp, gtbno >= bno + len, error0); } } /* @@ -1635,31 +1673,31 @@ xfs_free_ag_extent( */ if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); if ((error = xfs_btree_delete(cnt_cur, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); /* * Delete the old by-size entry on the right. 
*/ if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); if ((error = xfs_btree_delete(cnt_cur, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); /* * Delete the old by-block entry for the right block. */ if ((error = xfs_btree_delete(bno_cur, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); /* * Move the by-block cursor back to the left neighbor. */ if ((error = xfs_btree_decrement(bno_cur, 0, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); #ifdef DEBUG /* * Check that this is the right record: delete didn't @@ -1672,7 +1710,7 @@ xfs_free_ag_extent( if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO( + XFS_WANT_CORRUPTED_GOTO(mp, i == 1 && xxbno == ltbno && xxlen == ltlen, error0); } @@ -1695,17 +1733,17 @@ xfs_free_ag_extent( */ if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); if ((error = xfs_btree_delete(cnt_cur, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); /* * Back up the by-block cursor to the left neighbor, and * update its length. */ if ((error = xfs_btree_decrement(bno_cur, 0, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); nbno = ltbno; nlen = len + ltlen; if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) @@ -1721,10 +1759,10 @@ xfs_free_ag_extent( */ if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); if ((error = xfs_btree_delete(cnt_cur, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); /* * Update the starting block and length of the right * neighbor in the by-block tree. 
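The sweep above threads the struct xfs_mount into XFS_WANT_CORRUPTED_GOTO() and XFS_WANT_CORRUPTED_RETURN(), so a failed btree sanity check can be reported against the filesystem that tripped it rather than anonymously. Here is a self-contained sketch of that macro pattern; the mount structure, report helper, and errno value are mocked, and the real macros in fs/xfs/xfs_error.h may differ in detail:

#include <stdio.h>

#define EFSCORRUPTED 117        /* EUCLEAN on Linux; illustrative here */

struct xfs_mount { const char *m_fsname; };     /* mock mount structure */

/* mock report helper: the kernel logs through its own error-report path */
static void error_report(const char *tag, struct xfs_mount *mp)
{
        fprintf(stderr, "XFS (%s): %s: metadata corruption detected\n",
                mp ? mp->m_fsname : "?", tag);
}

/*
 * Shape of the pattern: evaluate a check and, on failure, report against
 * the mount, set -EFSCORRUPTED in the local 'error', and jump to a label.
 */
#define WANT_CORRUPTED_GOTO(mp, x, l)                   \
        do {                                            \
                if (!(x)) {                             \
                        error_report(__func__, (mp));   \
                        error = -EFSCORRUPTED;          \
                        goto l;                         \
                }                                       \
        } while (0)

static int lookup_record(struct xfs_mount *mp, int rec_found)
{
        int error = 0;

        /* a by-size btree lookup is expected to find exactly one record */
        WANT_CORRUPTED_GOTO(mp, rec_found == 1, error0);
        return 0;
error0:
        return error;
}

int main(void)
{
        struct xfs_mount mp = { .m_fsname = "dm-3" };

        return lookup_record(&mp, 0) == -EFSCORRUPTED ? 0 : 1;
}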
@@ -1743,7 +1781,7 @@ xfs_free_ag_extent( nlen = len; if ((error = xfs_btree_insert(bno_cur, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); } xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); bno_cur = NULL; /* @@ -1752,10 +1790,10 @@ */ if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 0, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, error0); if ((error = xfs_btree_insert(cnt_cur, &i))) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); cnt_cur = NULL; @@ -1819,11 +1857,11 @@ xfs_alloc_compute_maxlevels( xfs_extlen_t xfs_alloc_longest_free_extent( struct xfs_mount *mp, - struct xfs_perag *pag) + struct xfs_perag *pag, + xfs_extlen_t need) { - xfs_extlen_t need, delta = 0; + xfs_extlen_t delta = 0; - need = XFS_MIN_FREELIST_PAG(pag, mp); if (need > pag->pagf_flcount) delta = need - pag->pagf_flcount; @@ -1832,131 +1870,150 @@ xfs_alloc_longest_free_extent( return pag->pagf_flcount > 0 || pag->pagf_longest > 0; } +unsigned int +xfs_alloc_min_freelist( + struct xfs_mount *mp, + struct xfs_perag *pag) +{ + unsigned int min_free; + + /* space needed by-bno freespace btree */ + min_free = min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_BNOi] + 1, + mp->m_ag_maxlevels); + /* space needed by-size freespace btree */ + min_free += min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_CNTi] + 1, + mp->m_ag_maxlevels); + + return min_free; +} + +/* + * Check if the operation we are fixing up the freelist for should go ahead or + * not. If we are freeing blocks, we always allow it, otherwise the allocation + * is dependent on whether the size and shape of free space available will + * permit the requested allocation to take place. + */ +static bool +xfs_alloc_space_available( + struct xfs_alloc_arg *args, + xfs_extlen_t min_free, + int flags) +{ + struct xfs_perag *pag = args->pag; + xfs_extlen_t longest; + int available; + + if (flags & XFS_ALLOC_FLAG_FREEING) + return true; + + /* do we have enough contiguous free space for the allocation? */ + longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free); + if ((args->minlen + args->alignment + args->minalignslop - 1) > longest) + return false; + + /* do we have enough free space remaining for the allocation? */ + available = (int)(pag->pagf_freeblks + pag->pagf_flcount - + min_free - args->total); + if (available < (int)args->minleft) + return false; + + return true; +} + /* * Decide whether to use this allocation group for this allocation. * If so, fix up the btree freelist's size. */ STATIC int /* error */ xfs_alloc_fix_freelist( - xfs_alloc_arg_t *args, /* allocation argument structure */ - int flags) /* XFS_ALLOC_FLAG_... */ + struct xfs_alloc_arg *args, /* allocation argument structure */ + int flags) /* XFS_ALLOC_FLAG_... */ { - xfs_buf_t *agbp; /* agf buffer pointer */ - xfs_agf_t *agf; /* a.g. 
freespace structure pointer */ - xfs_buf_t *agflbp;/* agfl buffer pointer */ - xfs_agblock_t bno; /* freelist block */ - xfs_extlen_t delta; /* new blocks needed in freelist */ - int error; /* error result code */ - xfs_extlen_t longest;/* longest extent in allocation group */ - xfs_mount_t *mp; /* file system mount point structure */ - xfs_extlen_t need; /* total blocks needed in freelist */ - xfs_perag_t *pag; /* per-ag information structure */ - xfs_alloc_arg_t targs; /* local allocation arguments */ - xfs_trans_t *tp; /* transaction pointer */ - - mp = args->mp; + struct xfs_mount *mp = args->mp; + struct xfs_perag *pag = args->pag; + struct xfs_trans *tp = args->tp; + struct xfs_buf *agbp = NULL; + struct xfs_buf *agflbp = NULL; + struct xfs_alloc_arg targs; /* local allocation arguments */ + xfs_agblock_t bno; /* freelist block */ + xfs_extlen_t need; /* total blocks needed in freelist */ + int error; - pag = args->pag; - tp = args->tp; if (!pag->pagf_init) { - if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags, - &agbp))) - return error; + error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp); + if (error) + goto out_no_agbp; if (!pag->pagf_init) { ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK); ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); - args->agbp = NULL; - return 0; + goto out_agbp_relse; } - } else - agbp = NULL; + } /* - * If this is a metadata preferred pag and we are user data - * then try somewhere else if we are not being asked to - * try harder at this point + * If this is a metadata preferred pag and we are user data then try + * somewhere else if we are not being asked to try harder at this + * point */ if (pag->pagf_metadata && args->userdata && (flags & XFS_ALLOC_FLAG_TRYLOCK)) { ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); - args->agbp = NULL; - return 0; + goto out_agbp_relse; } - if (!(flags & XFS_ALLOC_FLAG_FREEING)) { - /* - * If it looks like there isn't a long enough extent, or enough - * total blocks, reject it. - */ - need = XFS_MIN_FREELIST_PAG(pag, mp); - longest = xfs_alloc_longest_free_extent(mp, pag); - if ((args->minlen + args->alignment + args->minalignslop - 1) > - longest || - ((int)(pag->pagf_freeblks + pag->pagf_flcount - - need - args->total) < (int)args->minleft)) { - if (agbp) - xfs_trans_brelse(tp, agbp); - args->agbp = NULL; - return 0; - } - } + need = xfs_alloc_min_freelist(mp, pag); + if (!xfs_alloc_space_available(args, need, flags)) + goto out_agbp_relse; /* * Get the a.g. freespace buffer. * Can fail if we're not blocking on locks, and it's held. */ - if (agbp == NULL) { - if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags, - &agbp))) - return error; - if (agbp == NULL) { + if (!agbp) { + error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp); + if (error) + goto out_no_agbp; + if (!agbp) { ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK); ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); - args->agbp = NULL; - return 0; - } - } - /* - * Figure out how many blocks we should have in the freelist. - */ - agf = XFS_BUF_TO_AGF(agbp); - need = XFS_MIN_FREELIST(agf, mp); - /* - * If there isn't enough total or single-extent, reject it. - */ - if (!(flags & XFS_ALLOC_FLAG_FREEING)) { - delta = need > be32_to_cpu(agf->agf_flcount) ? - (need - be32_to_cpu(agf->agf_flcount)) : 0; - longest = be32_to_cpu(agf->agf_longest); - longest = (longest > delta) ? 
(longest - delta) : - (be32_to_cpu(agf->agf_flcount) > 0 || longest > 0); - if ((args->minlen + args->alignment + args->minalignslop - 1) > - longest || - ((int)(be32_to_cpu(agf->agf_freeblks) + - be32_to_cpu(agf->agf_flcount) - need - args->total) < - (int)args->minleft)) { - xfs_trans_brelse(tp, agbp); - args->agbp = NULL; - return 0; + goto out_no_agbp; } } + + /* If there isn't enough total space or single-extent, reject it. */ + need = xfs_alloc_min_freelist(mp, pag); + if (!xfs_alloc_space_available(args, need, flags)) + goto out_agbp_relse; + /* * Make the freelist shorter if it's too long. + * + * Note that from this point onwards, we will always release the agf and + * agfl buffers on error. This handles the case where we error out and + * the buffers are clean or may not have been joined to the transaction + * and hence need to be released manually. If they have been joined to + * the transaction, then xfs_trans_brelse() will handle them + * appropriately based on the recursion count and dirty state of the + * buffer. + * + * XXX (dgc): When we have lots of free space, does this buy us + * anything other than extra overhead when we need to put more blocks + * back on the free list? Maybe we should only do this when space is + * getting low or the AGFL is more than half full? */ - while (be32_to_cpu(agf->agf_flcount) > need) { - xfs_buf_t *bp; + while (pag->pagf_flcount > need) { + struct xfs_buf *bp; error = xfs_alloc_get_freelist(tp, agbp, &bno, 0); if (error) - return error; - if ((error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1))) - return error; + goto out_agbp_relse; + error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1); + if (error) + goto out_agbp_relse; bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0); xfs_trans_binval(tp, bp); } - /* - * Initialize the args structure. - */ + memset(&targs, 0, sizeof(targs)); targs.tp = tp; targs.mp = mp; @@ -1965,21 +2022,20 @@ xfs_alloc_fix_freelist( targs.alignment = targs.minlen = targs.prod = targs.isfl = 1; targs.type = XFS_ALLOCTYPE_THIS_AG; targs.pag = pag; - if ((error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp))) - return error; - /* - * Make the freelist longer if it's too short. - */ - while (be32_to_cpu(agf->agf_flcount) < need) { + error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp); + if (error) + goto out_agbp_relse; + + /* Make the freelist longer if it's too short. */ + while (pag->pagf_flcount < need) { targs.agbno = 0; - targs.maxlen = need - be32_to_cpu(agf->agf_flcount); - /* - * Allocate as many blocks as possible at once. - */ - if ((error = xfs_alloc_ag_vextent(&targs))) { - xfs_trans_brelse(tp, agflbp); - return error; - } + targs.maxlen = need - pag->pagf_flcount; + + /* Allocate as many blocks as possible at once. */ + error = xfs_alloc_ag_vextent(&targs); + if (error) + goto out_agflbp_relse; + /* * Stop if we run out. Won't happen if callers are obeying * the restrictions correctly. Can happen for free calls @@ -1988,9 +2044,7 @@ xfs_alloc_fix_freelist( if (targs.agbno == NULLAGBLOCK) { if (flags & XFS_ALLOC_FLAG_FREEING) break; - xfs_trans_brelse(tp, agflbp); - args->agbp = NULL; - return 0; + goto out_agflbp_relse; } /* * Put each allocated block on the list. 
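The reworked xfs_alloc_fix_freelist() above replaces a dozen open-coded release-and-return paths with goto-based unwinding; the matching label block (out_agflbp_relse, out_agbp_relse, out_no_agbp) appears in the hunk just below. A compilable toy version of the idiom, with malloc/free standing in for the AGF/AGFL buffer get and release calls:

#include <errno.h>
#include <stdlib.h>

static int do_freelist_work(void) { return -EIO; }  /* always fails, for demo */

/*
 * Staged unwind: each acquisition adds one label, every failure jumps to
 * the label matching exactly what is held, and cleanups run in reverse
 * order of acquisition.
 */
static int fix_freelist(void)
{
        char *agbp = NULL;      /* stands in for the locked AGF buffer */
        char *agflbp = NULL;    /* stands in for the AGFL buffer */
        int error;

        agbp = malloc(16);
        if (!agbp) {
                error = -ENOMEM;
                goto out_no_agbp;
        }

        agflbp = malloc(16);
        if (!agflbp) {
                error = -ENOMEM;
                goto out_agbp_relse;
        }

        error = do_freelist_work();
        if (error)
                goto out_agflbp_relse;

        free(agflbp);           /* success path keeps nothing in this toy */
        free(agbp);
        return 0;

out_agflbp_relse:
        free(agflbp);
out_agbp_relse:
        free(agbp);
out_no_agbp:
        return error;
}

int main(void)
{
        return fix_freelist() == -EIO ? 0 : 1;
}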
@@ -1999,12 +2053,21 @@ xfs_alloc_fix_freelist( error = xfs_alloc_put_freelist(tp, agbp, agflbp, bno, 0); if (error) - return error; + goto out_agflbp_relse; } } xfs_trans_brelse(tp, agflbp); args->agbp = agbp; return 0; + +out_agflbp_relse: + xfs_trans_brelse(tp, agflbp); +out_agbp_relse: + if (agbp) + xfs_trans_brelse(tp, agbp); +out_no_agbp: + args->agbp = NULL; + return error; } /* diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index d1b4b6a5c894..ca1c8168373a 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -112,6 +112,8 @@ typedef struct xfs_alloc_arg { xfs_extlen_t total; /* total blocks needed in xaction */ xfs_extlen_t alignment; /* align answer to multiple of this */ xfs_extlen_t minalignslop; /* slop for minlen+alignment calcs */ + xfs_agblock_t min_agbno; /* set an agbno range for NEAR allocs */ + xfs_agblock_t max_agbno; /* ... */ xfs_extlen_t len; /* output: actual size of extent */ xfs_alloctype_t type; /* allocation type XFS_ALLOCTYPE_... */ xfs_alloctype_t otype; /* original allocation type */ @@ -128,11 +130,9 @@ typedef struct xfs_alloc_arg { #define XFS_ALLOC_USERDATA 1 /* allocation is for user data*/ #define XFS_ALLOC_INITIAL_USER_DATA 2 /* special case start of file */ -/* - * Find the length of the longest extent in an AG. - */ -xfs_extlen_t -xfs_alloc_longest_free_extent(struct xfs_mount *mp, +xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_mount *mp, + struct xfs_perag *pag, xfs_extlen_t need); +unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp, struct xfs_perag *pag); /* diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 0a472fbe06d4..3349c9a1e845 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -266,7 +266,7 @@ xfs_attr_set( tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; error = xfs_trans_reserve(args.trans, &tres, args.total, 0); if (error) { - xfs_trans_cancel(args.trans, 0); + xfs_trans_cancel(args.trans); return error; } xfs_ilock(dp, XFS_ILOCK_EXCL); @@ -276,7 +276,7 @@ xfs_attr_set( XFS_QMOPT_RES_REGBLKS); if (error) { xfs_iunlock(dp, XFS_ILOCK_EXCL); - xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES); + xfs_trans_cancel(args.trans); return error; } @@ -320,8 +320,7 @@ xfs_attr_set( xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG); } - err2 = xfs_trans_commit(args.trans, - XFS_TRANS_RELEASE_LOG_RES); + err2 = xfs_trans_commit(args.trans); xfs_iunlock(dp, XFS_ILOCK_EXCL); return error ? error : err2; @@ -383,16 +382,14 @@ xfs_attr_set( * Commit the last in the sequence of transactions. */ xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE); - error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(args.trans); xfs_iunlock(dp, XFS_ILOCK_EXCL); return error; out: - if (args.trans) { - xfs_trans_cancel(args.trans, - XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); - } + if (args.trans) + xfs_trans_cancel(args.trans); xfs_iunlock(dp, XFS_ILOCK_EXCL); return error; } @@ -462,7 +459,7 @@ xfs_attr_remove( error = xfs_trans_reserve(args.trans, &M_RES(mp)->tr_attrrm, XFS_ATTRRM_SPACE_RES(mp), 0); if (error) { - xfs_trans_cancel(args.trans, 0); + xfs_trans_cancel(args.trans); return error; } @@ -501,16 +498,14 @@ xfs_attr_remove( * Commit the last in the sequence of transactions. 
*/ xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE); - error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(args.trans); xfs_iunlock(dp, XFS_ILOCK_EXCL); return error; out: - if (args.trans) { - xfs_trans_cancel(args.trans, - XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); - } + if (args.trans) + xfs_trans_cancel(args.trans); xfs_iunlock(dp, XFS_ILOCK_EXCL); return error; } diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 15105dbc9e28..e9d401ce93bb 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -86,8 +86,83 @@ STATIC void xfs_attr3_leaf_moveents(struct xfs_da_args *args, int move_count); STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index); +/* + * attr3 block 'firstused' conversion helpers. + * + * firstused refers to the offset of the first used byte of the nameval region + * of an attr leaf block. The region starts at the tail of the block and expands + * backwards towards the middle. As such, firstused is initialized to the block + * size for an empty leaf block and is reduced from there. + * + * The attr3 block size is pegged to the fsb size and the maximum fsb is 64k. + * The in-core firstused field is 32-bit and thus supports the maximum fsb size. + * The on-disk field is only 16-bit, however, and overflows at 64k. Since this + * only occurs at exactly 64k, we use zero as a magic on-disk value to represent + * the attr block size. The following helpers manage the conversion between the + * in-core and on-disk formats. + */ + +static void +xfs_attr3_leaf_firstused_from_disk( + struct xfs_da_geometry *geo, + struct xfs_attr3_icleaf_hdr *to, + struct xfs_attr_leafblock *from) +{ + struct xfs_attr3_leaf_hdr *hdr3; + + if (from->hdr.info.magic == cpu_to_be16(XFS_ATTR3_LEAF_MAGIC)) { + hdr3 = (struct xfs_attr3_leaf_hdr *) from; + to->firstused = be16_to_cpu(hdr3->firstused); + } else { + to->firstused = be16_to_cpu(from->hdr.firstused); + } + + /* + * Convert from the magic fsb size value to actual blocksize. This + * should only occur for empty blocks when the block size overflows + * 16-bits. + */ + if (to->firstused == XFS_ATTR3_LEAF_NULLOFF) { + ASSERT(!to->count && !to->usedbytes); + ASSERT(geo->blksize > USHRT_MAX); + to->firstused = geo->blksize; + } +} + +static void +xfs_attr3_leaf_firstused_to_disk( + struct xfs_da_geometry *geo, + struct xfs_attr_leafblock *to, + struct xfs_attr3_icleaf_hdr *from) +{ + struct xfs_attr3_leaf_hdr *hdr3; + uint32_t firstused; + + /* magic value should only be seen on disk */ + ASSERT(from->firstused != XFS_ATTR3_LEAF_NULLOFF); + + /* + * Scale down the 32-bit in-core firstused value to the 16-bit on-disk + * value. This only overflows at the max supported value of 64k. Use the + * magic on-disk value to represent block size in this case. 
+ */ + firstused = from->firstused; + if (firstused > USHRT_MAX) { + ASSERT(from->firstused == geo->blksize); + firstused = XFS_ATTR3_LEAF_NULLOFF; + } + + if (from->magic == XFS_ATTR3_LEAF_MAGIC) { + hdr3 = (struct xfs_attr3_leaf_hdr *) to; + hdr3->firstused = cpu_to_be16(firstused); + } else { + to->hdr.firstused = cpu_to_be16(firstused); + } +} + void xfs_attr3_leaf_hdr_from_disk( + struct xfs_da_geometry *geo, struct xfs_attr3_icleaf_hdr *to, struct xfs_attr_leafblock *from) { @@ -104,7 +179,7 @@ xfs_attr3_leaf_hdr_from_disk( to->magic = be16_to_cpu(hdr3->info.hdr.magic); to->count = be16_to_cpu(hdr3->count); to->usedbytes = be16_to_cpu(hdr3->usedbytes); - to->firstused = be16_to_cpu(hdr3->firstused); + xfs_attr3_leaf_firstused_from_disk(geo, to, from); to->holes = hdr3->holes; for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { @@ -118,7 +193,7 @@ xfs_attr3_leaf_hdr_from_disk( to->magic = be16_to_cpu(from->hdr.info.magic); to->count = be16_to_cpu(from->hdr.count); to->usedbytes = be16_to_cpu(from->hdr.usedbytes); - to->firstused = be16_to_cpu(from->hdr.firstused); + xfs_attr3_leaf_firstused_from_disk(geo, to, from); to->holes = from->hdr.holes; for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) { @@ -129,10 +204,11 @@ xfs_attr3_leaf_hdr_from_disk( void xfs_attr3_leaf_hdr_to_disk( + struct xfs_da_geometry *geo, struct xfs_attr_leafblock *to, struct xfs_attr3_icleaf_hdr *from) { - int i; + int i; ASSERT(from->magic == XFS_ATTR_LEAF_MAGIC || from->magic == XFS_ATTR3_LEAF_MAGIC); @@ -145,7 +221,7 @@ xfs_attr3_leaf_hdr_to_disk( hdr3->info.hdr.magic = cpu_to_be16(from->magic); hdr3->count = cpu_to_be16(from->count); hdr3->usedbytes = cpu_to_be16(from->usedbytes); - hdr3->firstused = cpu_to_be16(from->firstused); + xfs_attr3_leaf_firstused_to_disk(geo, to, from); hdr3->holes = from->holes; hdr3->pad1 = 0; @@ -160,7 +236,7 @@ xfs_attr3_leaf_hdr_to_disk( to->hdr.info.magic = cpu_to_be16(from->magic); to->hdr.count = cpu_to_be16(from->count); to->hdr.usedbytes = cpu_to_be16(from->usedbytes); - to->hdr.firstused = cpu_to_be16(from->firstused); + xfs_attr3_leaf_firstused_to_disk(geo, to, from); to->hdr.holes = from->holes; to->hdr.pad1 = 0; @@ -178,7 +254,7 @@ xfs_attr3_leaf_verify( struct xfs_attr_leafblock *leaf = bp->b_addr; struct xfs_attr3_icleaf_hdr ichdr; - xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); if (xfs_sb_version_hascrc(&mp->m_sb)) { struct xfs_da3_node_hdr *hdr3 = bp->b_addr; @@ -498,8 +574,8 @@ xfs_attr_shortform_add(xfs_da_args_t *args, int forkoff) * After the last attribute is removed revert to original inode format, * making all literal area available to the data fork once more. 
*/ -STATIC void -xfs_attr_fork_reset( +void +xfs_attr_fork_remove( struct xfs_inode *ip, struct xfs_trans *tp) { @@ -565,7 +641,7 @@ xfs_attr_shortform_remove(xfs_da_args_t *args) (mp->m_flags & XFS_MOUNT_ATTR2) && (dp->i_d.di_format != XFS_DINODE_FMT_BTREE) && !(args->op_flags & XFS_DA_OP_ADDNAME)) { - xfs_attr_fork_reset(dp, args->trans); + xfs_attr_fork_remove(dp, args->trans); } else { xfs_idata_realloc(dp, -size, XFS_ATTR_FORK); dp->i_d.di_forkoff = xfs_attr_shortform_bytesfit(dp, totsize); @@ -757,9 +833,10 @@ xfs_attr_shortform_allfit( struct xfs_attr3_icleaf_hdr leafhdr; int bytes; int i; + struct xfs_mount *mp = bp->b_target->bt_mount; leaf = bp->b_addr; - xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf); + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf); entry = xfs_attr3_leaf_entryp(leaf); bytes = sizeof(struct xfs_attr_sf_hdr); @@ -812,7 +889,7 @@ xfs_attr3_leaf_to_shortform( memcpy(tmpbuffer, bp->b_addr, args->geo->blksize); leaf = (xfs_attr_leafblock_t *)tmpbuffer; - xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); entry = xfs_attr3_leaf_entryp(leaf); /* XXX (dgc): buffer is about to be marked stale - why zero it? */ @@ -828,7 +905,7 @@ xfs_attr3_leaf_to_shortform( if (forkoff == -1) { ASSERT(dp->i_mount->m_flags & XFS_MOUNT_ATTR2); ASSERT(dp->i_d.di_format != XFS_DINODE_FMT_BTREE); - xfs_attr_fork_reset(dp, args->trans); + xfs_attr_fork_remove(dp, args->trans); goto out; } @@ -923,7 +1000,7 @@ xfs_attr3_leaf_to_node( btree = dp->d_ops->node_tree_p(node); leaf = bp2->b_addr; - xfs_attr3_leaf_hdr_from_disk(&icleafhdr, leaf); + xfs_attr3_leaf_hdr_from_disk(args->geo, &icleafhdr, leaf); entries = xfs_attr3_leaf_entryp(leaf); /* both on-disk, don't endian-flip twice */ @@ -988,7 +1065,7 @@ xfs_attr3_leaf_create( } ichdr.freemap[0].size = ichdr.firstused - ichdr.freemap[0].base; - xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr); + xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr); xfs_trans_log_buf(args->trans, bp, 0, args->geo->blksize - 1); *bpp = bp; @@ -1073,7 +1150,7 @@ xfs_attr3_leaf_add( trace_xfs_attr_leaf_add(args); leaf = bp->b_addr; - xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); ASSERT(args->index >= 0 && args->index <= ichdr.count); entsize = xfs_attr_leaf_newentsize(args, NULL); @@ -1126,7 +1203,7 @@ xfs_attr3_leaf_add( tmp = xfs_attr3_leaf_add_work(bp, &ichdr, args, 0); out_log_hdr: - xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr); + xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr); xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, &leaf->hdr, xfs_attr3_leaf_hdr_size(leaf))); @@ -1294,7 +1371,7 @@ xfs_attr3_leaf_compact( ichdr_dst->freemap[0].base; /* write the header back to initialise the underlying buffer */ - xfs_attr3_leaf_hdr_to_disk(leaf_dst, ichdr_dst); + xfs_attr3_leaf_hdr_to_disk(args->geo, leaf_dst, ichdr_dst); /* * Copy all entry's in the same (sorted) order, @@ -1344,9 +1421,10 @@ xfs_attr_leaf_order( { struct xfs_attr3_icleaf_hdr ichdr1; struct xfs_attr3_icleaf_hdr ichdr2; + struct xfs_mount *mp = leaf1_bp->b_target->bt_mount; - xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1_bp->b_addr); - xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2_bp->b_addr); + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr1, leaf1_bp->b_addr); + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr2, leaf2_bp->b_addr); return xfs_attr3_leaf_order(leaf1_bp, &ichdr1, leaf2_bp, &ichdr2); } @@ -1388,8 +1466,8 @@ xfs_attr3_leaf_rebalance( ASSERT(blk2->magic == 
XFS_ATTR_LEAF_MAGIC); leaf1 = blk1->bp->b_addr; leaf2 = blk2->bp->b_addr; - xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1); - xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2); + xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr1, leaf1); + xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr2, leaf2); ASSERT(ichdr2.count == 0); args = state->args; @@ -1490,8 +1568,8 @@ xfs_attr3_leaf_rebalance( ichdr1.count, count); } - xfs_attr3_leaf_hdr_to_disk(leaf1, &ichdr1); - xfs_attr3_leaf_hdr_to_disk(leaf2, &ichdr2); + xfs_attr3_leaf_hdr_to_disk(state->args->geo, leaf1, &ichdr1); + xfs_attr3_leaf_hdr_to_disk(state->args->geo, leaf2, &ichdr2); xfs_trans_log_buf(args->trans, blk1->bp, 0, args->geo->blksize - 1); xfs_trans_log_buf(args->trans, blk2->bp, 0, args->geo->blksize - 1); @@ -1684,7 +1762,7 @@ xfs_attr3_leaf_toosmall( */ blk = &state->path.blk[ state->path.active-1 ]; leaf = blk->bp->b_addr; - xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr, leaf); bytes = xfs_attr3_leaf_hdr_size(leaf) + ichdr.count * sizeof(xfs_attr_leaf_entry_t) + ichdr.usedbytes; @@ -1740,7 +1818,7 @@ xfs_attr3_leaf_toosmall( if (error) return error; - xfs_attr3_leaf_hdr_from_disk(&ichdr2, bp->b_addr); + xfs_attr3_leaf_hdr_from_disk(state->args->geo, &ichdr2, bp->b_addr); bytes = state->args->geo->blksize - (state->args->geo->blksize >> 2) - @@ -1805,7 +1883,7 @@ xfs_attr3_leaf_remove( trace_xfs_attr_leaf_remove(args); leaf = bp->b_addr; - xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); ASSERT(ichdr.count > 0 && ichdr.count < args->geo->blksize / 8); ASSERT(args->index >= 0 && args->index < ichdr.count); @@ -1918,12 +1996,11 @@ xfs_attr3_leaf_remove( tmp = be16_to_cpu(entry->nameidx); } ichdr.firstused = tmp; - if (!ichdr.firstused) - ichdr.firstused = tmp - XFS_ATTR_LEAF_NAME_ALIGN; + ASSERT(ichdr.firstused != 0); } else { ichdr.holes = 1; /* mark as needing compaction */ } - xfs_attr3_leaf_hdr_to_disk(leaf, &ichdr); + xfs_attr3_leaf_hdr_to_disk(args->geo, leaf, &ichdr); xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, &leaf->hdr, xfs_attr3_leaf_hdr_size(leaf))); @@ -1957,8 +2034,8 @@ xfs_attr3_leaf_unbalance( drop_leaf = drop_blk->bp->b_addr; save_leaf = save_blk->bp->b_addr; - xfs_attr3_leaf_hdr_from_disk(&drophdr, drop_leaf); - xfs_attr3_leaf_hdr_from_disk(&savehdr, save_leaf); + xfs_attr3_leaf_hdr_from_disk(state->args->geo, &drophdr, drop_leaf); + xfs_attr3_leaf_hdr_from_disk(state->args->geo, &savehdr, save_leaf); entry = xfs_attr3_leaf_entryp(drop_leaf); /* @@ -2012,7 +2089,7 @@ xfs_attr3_leaf_unbalance( tmphdr.firstused = state->args->geo->blksize; /* write the header to the temp buffer to initialise it */ - xfs_attr3_leaf_hdr_to_disk(tmp_leaf, &tmphdr); + xfs_attr3_leaf_hdr_to_disk(state->args->geo, tmp_leaf, &tmphdr); if (xfs_attr3_leaf_order(save_blk->bp, &savehdr, drop_blk->bp, &drophdr)) { @@ -2039,7 +2116,7 @@ xfs_attr3_leaf_unbalance( kmem_free(tmp_leaf); } - xfs_attr3_leaf_hdr_to_disk(save_leaf, &savehdr); + xfs_attr3_leaf_hdr_to_disk(state->args->geo, save_leaf, &savehdr); xfs_trans_log_buf(state->args->trans, save_blk->bp, 0, state->args->geo->blksize - 1); @@ -2085,7 +2162,7 @@ xfs_attr3_leaf_lookup_int( trace_xfs_attr_leaf_lookup(args); leaf = bp->b_addr; - xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); entries = xfs_attr3_leaf_entryp(leaf); ASSERT(ichdr.count < args->geo->blksize / 8); @@ -2190,7 +2267,7 @@ xfs_attr3_leaf_getvalue( int 
valuelen; leaf = bp->b_addr; - xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); ASSERT(ichdr.count < args->geo->blksize / 8); ASSERT(args->index < ichdr.count); @@ -2391,8 +2468,9 @@ xfs_attr_leaf_lasthash( { struct xfs_attr3_icleaf_hdr ichdr; struct xfs_attr_leaf_entry *entries; + struct xfs_mount *mp = bp->b_target->bt_mount; - xfs_attr3_leaf_hdr_from_disk(&ichdr, bp->b_addr); + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, bp->b_addr); entries = xfs_attr3_leaf_entryp(bp->b_addr); if (count) *count = ichdr.count; @@ -2486,7 +2564,7 @@ xfs_attr3_leaf_clearflag( ASSERT(entry->flags & XFS_ATTR_INCOMPLETE); #ifdef DEBUG - xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); ASSERT(args->index < ichdr.count); ASSERT(args->index >= 0); @@ -2550,7 +2628,7 @@ xfs_attr3_leaf_setflag( leaf = bp->b_addr; #ifdef DEBUG - xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf); ASSERT(args->index < ichdr.count); ASSERT(args->index >= 0); #endif @@ -2629,11 +2707,11 @@ xfs_attr3_leaf_flipflags( entry2 = &xfs_attr3_leaf_entryp(leaf2)[args->index2]; #ifdef DEBUG - xfs_attr3_leaf_hdr_from_disk(&ichdr1, leaf1); + xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr1, leaf1); ASSERT(args->index < ichdr1.count); ASSERT(args->index >= 0); - xfs_attr3_leaf_hdr_from_disk(&ichdr2, leaf2); + xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr2, leaf2); ASSERT(args->index2 < ichdr2.count); ASSERT(args->index2 >= 0); diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index e2929da7c3ba..882c8d338891 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -53,7 +53,7 @@ int xfs_attr_shortform_remove(struct xfs_da_args *args); int xfs_attr_shortform_list(struct xfs_attr_list_context *context); int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp); int xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes); - +void xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp); /* * Internal routines when attribute fork size == XFS_LBSIZE(mp). 
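The geometry argument plumbed through xfs_attr3_leaf_hdr_from_disk()/to_disk() above exists so the firstused helpers at the top of xfs_attr_leaf.c can translate between a 32-bit in-core offset and the 16-bit on-disk field, with zero (XFS_ATTR3_LEAF_NULLOFF) as a magic encoding of an empty 64k leaf. A standalone model of that conversion, using plain integers instead of the leaf header structures:

#include <assert.h>
#include <stdint.h>

#define NULLOFF 0       /* models XFS_ATTR3_LEAF_NULLOFF */

/*
 * The in-core value is 32-bit, the on-disk field is 16-bit, and the only
 * value that cannot be represented (the block size of an empty 64k leaf)
 * is encoded as the magic value 0.
 */
static uint16_t firstused_to_disk(uint32_t firstused, uint32_t blksize)
{
        if (firstused > UINT16_MAX) {
                assert(firstused == blksize);   /* only an empty 64k leaf overflows */
                return NULLOFF;
        }
        return (uint16_t)firstused;
}

static uint32_t firstused_from_disk(uint16_t ondisk, uint32_t blksize)
{
        if (ondisk == NULLOFF)
                return blksize; /* magic value: empty leaf, block-sized offset */
        return ondisk;
}

int main(void)
{
        /* an empty 64k leaf round-trips through the magic encoding */
        assert(firstused_from_disk(firstused_to_disk(65536, 65536), 65536) == 65536);
        /* ordinary offsets are stored directly */
        assert(firstused_from_disk(firstused_to_disk(4000, 4096), 4096) == 4000);
        return 0;
}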
@@ -100,9 +100,11 @@ int xfs_attr_leaf_newentsize(struct xfs_da_args *args, int *local); int xfs_attr3_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, xfs_dablk_t bno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); -void xfs_attr3_leaf_hdr_from_disk(struct xfs_attr3_icleaf_hdr *to, +void xfs_attr3_leaf_hdr_from_disk(struct xfs_da_geometry *geo, + struct xfs_attr3_icleaf_hdr *to, struct xfs_attr_leafblock *from); -void xfs_attr3_leaf_hdr_to_disk(struct xfs_attr_leafblock *to, +void xfs_attr3_leaf_hdr_to_disk(struct xfs_da_geometry *geo, + struct xfs_attr_leafblock *to, struct xfs_attr3_icleaf_hdr *from); #endif /* __XFS_ATTR_LEAF_H__ */ diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 20de88d1bf86..dd714037c322 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -159,11 +159,10 @@ xfs_attr3_rmt_write_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_buf_log_item *bip = bp->b_fspriv; + int blksize = mp->m_attr_geo->blksize; char *ptr; int len; xfs_daddr_t bno; - int blksize = mp->m_attr_geo->blksize; /* no verification of non-crc buffers */ if (!xfs_sb_version_hascrc(&mp->m_sb)) @@ -175,16 +174,22 @@ xfs_attr3_rmt_write_verify( ASSERT(len >= blksize); while (len > 0) { + struct xfs_attr3_rmt_hdr *rmt = (struct xfs_attr3_rmt_hdr *)ptr; + if (!xfs_attr3_rmt_verify(mp, ptr, blksize, bno)) { xfs_buf_ioerror(bp, -EFSCORRUPTED); xfs_verifier_error(bp); return; } - if (bip) { - struct xfs_attr3_rmt_hdr *rmt; - rmt = (struct xfs_attr3_rmt_hdr *)ptr; - rmt->rm_lsn = cpu_to_be64(bip->bli_item.li_lsn); + /* + * Ensure we aren't writing bogus LSNs to disk. See + * xfs_attr3_rmt_hdr_set() for the explanation. + */ + if (rmt->rm_lsn != cpu_to_be64(NULLCOMMITLSN)) { + xfs_buf_ioerror(bp, -EFSCORRUPTED); + xfs_verifier_error(bp); + return; } xfs_update_cksum(ptr, blksize, XFS_ATTR3_RMT_CRC_OFF); @@ -221,6 +226,18 @@ xfs_attr3_rmt_hdr_set( rmt->rm_owner = cpu_to_be64(ino); rmt->rm_blkno = cpu_to_be64(bno); + /* + * Remote attribute blocks are written synchronously, so we don't + * have an LSN that we can stamp in them that makes any sense to log + * recovery. To ensure that log recovery handles overwrites of these + * blocks sanely (i.e. once they've been freed and reallocated as some + * other type of metadata) we need to ensure that the LSN has a value + * that tells log recovery to ignore the LSN and overwrite the buffer + * with whatever is in its log. To do this, we use the magic + * NULLCOMMITLSN to indicate that the LSN is invalid. + */ + rmt->rm_lsn = cpu_to_be64(NULLCOMMITLSN); + return sizeof(struct xfs_attr3_rmt_hdr); } @@ -434,14 +451,21 @@ xfs_attr_rmtval_set( /* * Allocate a single extent, up to the size of the value. + * + * Note that we have to consider this a data allocation as we + * write the remote attribute without logging the contents. + * Hence we must ensure that we aren't using blocks that are on + * the busy list so that we don't overwrite blocks which have + * recently been freed but their transactions are not yet + * committed to disk. If we overwrite the contents of a busy + * extent and then crash then the block may not contain the + * correct metadata after log recovery occurs. 
*/ xfs_bmap_init(args->flist, args->firstblock); nmap = 1; error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno, - blkcnt, - XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, - args->firstblock, args->total, &map, &nmap, - args->flist); + blkcnt, XFS_BMAPI_ATTRFORK, args->firstblock, + args->total, &map, &nmap, args->flist); if (!error) { error = xfs_bmap_finish(&args->trans, args->flist, &committed); diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 61ec015dca16..63e05b663380 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -244,30 +244,6 @@ xfs_bmap_forkoff_reset( } } -/* - * Debug/sanity checking code - */ - -STATIC int -xfs_bmap_sanity_check( - struct xfs_mount *mp, - struct xfs_buf *bp, - int level) -{ - struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); - - if (block->bb_magic != cpu_to_be32(XFS_BMAP_CRC_MAGIC) && - block->bb_magic != cpu_to_be32(XFS_BMAP_MAGIC)) - return 0; - - if (be16_to_cpu(block->bb_level) != level || - be16_to_cpu(block->bb_numrecs) == 0 || - be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0]) - return 0; - - return 1; -} - #ifdef DEBUG STATIC struct xfs_buf * xfs_bmap_get_bp( @@ -410,9 +386,6 @@ xfs_bmap_check_leaf_extents( goto error_norelse; } block = XFS_BUF_TO_BLOCK(bp); - XFS_WANT_CORRUPTED_GOTO( - xfs_bmap_sanity_check(mp, bp, level), - error0); if (level == 0) break; @@ -424,7 +397,8 @@ xfs_bmap_check_leaf_extents( xfs_check_block(block, mp, 0, 0); pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); bno = be64_to_cpu(*pp); - XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0); + XFS_WANT_CORRUPTED_GOTO(mp, + XFS_FSB_SANITY_CHECK(mp, bno), error0); if (bp_release) { bp_release = 0; xfs_trans_brelse(NULL, bp); @@ -1029,7 +1003,7 @@ xfs_bmap_add_attrfork_btree( if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat))) goto error0; /* must be at least one entry */ - XFS_WANT_CORRUPTED_GOTO(stat == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, stat == 1, error0); if ((error = xfs_btree_new_iroot(cur, flags, &stat))) goto error0; if (stat == 0) { @@ -1138,7 +1112,6 @@ xfs_bmap_add_attrfork( int committed; /* xaction was committed */ int logflags; /* logging flags */ int error; /* error return value */ - int cancel_flags = 0; ASSERT(XFS_IFORK_Q(ip) == 0); @@ -1150,17 +1123,15 @@ xfs_bmap_add_attrfork( tp->t_flags |= XFS_TRANS_RESERVE; error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ? 
XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : XFS_QMOPT_RES_REGBLKS); if (error) goto trans_cancel; - cancel_flags |= XFS_TRANS_ABORT; if (XFS_IFORK_Q(ip)) goto trans_cancel; if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) { @@ -1244,14 +1215,14 @@ xfs_bmap_add_attrfork( error = xfs_bmap_finish(&tp, &flist, &committed); if (error) goto bmap_cancel; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; bmap_cancel: xfs_bmap_cancel(&flist); trans_cancel: - xfs_trans_cancel(tp, cancel_flags); + xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -1311,14 +1282,12 @@ xfs_bmap_read_extents( if (error) return error; block = XFS_BUF_TO_BLOCK(bp); - XFS_WANT_CORRUPTED_GOTO( - xfs_bmap_sanity_check(mp, bp, level), - error0); if (level == 0) break; pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]); bno = be64_to_cpu(*pp); - XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0); + XFS_WANT_CORRUPTED_GOTO(mp, + XFS_FSB_SANITY_CHECK(mp, bno), error0); xfs_trans_brelse(tp, bp); } /* @@ -1345,9 +1314,6 @@ xfs_bmap_read_extents( XFS_ERRLEVEL_LOW, ip->i_mount, block); goto error0; } - XFS_WANT_CORRUPTED_GOTO( - xfs_bmap_sanity_check(mp, bp, 0), - error0); /* * Read-ahead the next leaf block, if any. */ @@ -1755,7 +1721,9 @@ xfs_bmap_add_extent_delay_real( xfs_filblks_t temp=0; /* value for da_new calculations */ xfs_filblks_t temp2=0;/* value for da_new calculations */ int tmp_rval; /* partial logging flags */ + struct xfs_mount *mp; + mp = bma->tp ? bma->tp->t_mountp : NULL; ifp = XFS_IFORK_PTR(bma->ip, XFS_DATA_FORK); ASSERT(bma->idx >= 0); @@ -1866,15 +1834,15 @@ xfs_bmap_add_extent_delay_real( RIGHT.br_blockcount, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); error = xfs_btree_delete(bma->cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); error = xfs_btree_decrement(bma->cur, 0, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + @@ -1907,7 +1875,7 @@ xfs_bmap_add_extent_delay_real( &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + @@ -1938,7 +1906,7 @@ xfs_bmap_add_extent_delay_real( RIGHT.br_blockcount, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); error = xfs_bmbt_update(bma->cur, PREV.br_startoff, new->br_startblock, PREV.br_blockcount + @@ -1968,12 +1936,12 @@ xfs_bmap_add_extent_delay_real( &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 0, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; error = xfs_btree_insert(bma->cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } break; @@ -2001,7 +1969,7 @@ xfs_bmap_add_extent_delay_real( &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); error = xfs_bmbt_update(bma->cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + @@ -2038,12 +2006,12 @@ xfs_bmap_add_extent_delay_real( &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 0, 
done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; error = xfs_btree_insert(bma->cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { @@ -2084,7 +2052,7 @@ xfs_bmap_add_extent_delay_real( RIGHT.br_blockcount, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); error = xfs_bmbt_update(bma->cur, new->br_startoff, new->br_startblock, new->br_blockcount + @@ -2122,12 +2090,12 @@ xfs_bmap_add_extent_delay_real( &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 0, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; error = xfs_btree_insert(bma->cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { @@ -2191,12 +2159,12 @@ xfs_bmap_add_extent_delay_real( &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 0, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); bma->cur->bc_rec.b.br_state = XFS_EXT_NORM; error = xfs_btree_insert(bma->cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { @@ -2212,9 +2180,8 @@ xfs_bmap_add_extent_delay_real( diff = (int)(temp + temp2 - startblockval(PREV.br_startblock) - (bma->cur ? bma->cur->bc_private.b.allocated : 0)); if (diff > 0) { - error = xfs_icsb_modify_counters(bma->ip->i_mount, - XFS_SBS_FDBLOCKS, - -((int64_t)diff), 0); + error = xfs_mod_fdblocks(bma->ip->i_mount, + -((int64_t)diff), false); ASSERT(!error); if (error) goto done; @@ -2265,9 +2232,8 @@ xfs_bmap_add_extent_delay_real( temp += bma->cur->bc_private.b.allocated; ASSERT(temp <= da_old); if (temp < da_old) - xfs_icsb_modify_counters(bma->ip->i_mount, - XFS_SBS_FDBLOCKS, - (int64_t)(da_old - temp), 0); + xfs_mod_fdblocks(bma->ip->i_mount, + (int64_t)(da_old - temp), false); } /* clear out the allocated field, done with it now in any case. 
*/ @@ -2309,6 +2275,7 @@ xfs_bmap_add_extent_unwritten_real( /* left is 0, right is 1, prev is 2 */ int rval=0; /* return value (logging flags) */ int state = 0;/* state bits, accessed thru macros */ + struct xfs_mount *mp = tp->t_mountp; *logflagsp = 0; @@ -2421,19 +2388,19 @@ xfs_bmap_add_extent_unwritten_real( RIGHT.br_startblock, RIGHT.br_blockcount, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_btree_delete(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_btree_delete(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + PREV.br_blockcount + @@ -2464,13 +2431,13 @@ xfs_bmap_add_extent_unwritten_real( PREV.br_startblock, PREV.br_blockcount, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_btree_delete(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_bmbt_update(cur, LEFT.br_startoff, LEFT.br_startblock, LEFT.br_blockcount + PREV.br_blockcount, @@ -2499,13 +2466,13 @@ xfs_bmap_add_extent_unwritten_real( RIGHT.br_startblock, RIGHT.br_blockcount, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_btree_delete(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_btree_decrement(cur, 0, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_bmbt_update(cur, new->br_startoff, new->br_startblock, new->br_blockcount + RIGHT.br_blockcount, @@ -2532,7 +2499,7 @@ xfs_bmap_add_extent_unwritten_real( new->br_startblock, new->br_blockcount, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_bmbt_update(cur, new->br_startoff, new->br_startblock, new->br_blockcount, newext))) @@ -2569,7 +2536,7 @@ xfs_bmap_add_extent_unwritten_real( PREV.br_startblock, PREV.br_blockcount, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_bmbt_update(cur, PREV.br_startoff + new->br_blockcount, PREV.br_startblock + new->br_blockcount, @@ -2611,7 +2578,7 @@ xfs_bmap_add_extent_unwritten_real( PREV.br_startblock, PREV.br_blockcount, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_bmbt_update(cur, PREV.br_startoff + new->br_blockcount, PREV.br_startblock + new->br_blockcount, @@ -2621,7 +2588,7 @@ xfs_bmap_add_extent_unwritten_real( cur->bc_rec.b = *new; if ((error = xfs_btree_insert(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } break; @@ -2651,7 +2618,7 @@ xfs_bmap_add_extent_unwritten_real( 
PREV.br_startblock, PREV.br_blockcount, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_bmbt_update(cur, PREV.br_startoff, PREV.br_startblock, PREV.br_blockcount - new->br_blockcount, @@ -2689,7 +2656,7 @@ xfs_bmap_add_extent_unwritten_real( PREV.br_startblock, PREV.br_blockcount, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); if ((error = xfs_bmbt_update(cur, PREV.br_startoff, PREV.br_startblock, PREV.br_blockcount - new->br_blockcount, @@ -2699,11 +2666,11 @@ xfs_bmap_add_extent_unwritten_real( new->br_startblock, new->br_blockcount, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 0, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); cur->bc_rec.b.br_state = XFS_EXT_NORM; if ((error = xfs_btree_insert(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } break; @@ -2737,7 +2704,7 @@ xfs_bmap_add_extent_unwritten_real( PREV.br_startblock, PREV.br_blockcount, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); /* new right extent - oldext */ if ((error = xfs_bmbt_update(cur, r[1].br_startoff, r[1].br_startblock, r[1].br_blockcount, @@ -2749,7 +2716,7 @@ xfs_bmap_add_extent_unwritten_real( new->br_startoff - PREV.br_startoff; if ((error = xfs_btree_insert(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); /* * Reset the cursor to the position of the new extent * we are about to insert as we can't trust it after @@ -2759,12 +2726,12 @@ xfs_bmap_add_extent_unwritten_real( new->br_startblock, new->br_blockcount, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 0, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); /* new middle extent - newext */ cur->bc_rec.b.br_state = new->br_state; if ((error = xfs_btree_insert(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } break; @@ -2944,8 +2911,8 @@ xfs_bmap_add_extent_hole_delay( } if (oldlen != newlen) { ASSERT(oldlen > newlen); - xfs_icsb_modify_counters(ip->i_mount, XFS_SBS_FDBLOCKS, - (int64_t)(oldlen - newlen), 0); + xfs_mod_fdblocks(ip->i_mount, (int64_t)(oldlen - newlen), + false); /* * Nothing to do for disk quota accounting here. */ @@ -2968,7 +2935,9 @@ xfs_bmap_add_extent_hole_real( xfs_bmbt_irec_t right; /* right neighbor extent entry */ int rval=0; /* return value (logging flags) */ int state; /* state bits, accessed thru macros */ + struct xfs_mount *mp; + mp = bma->tp ? 
bma->tp->t_mountp : NULL; ifp = XFS_IFORK_PTR(bma->ip, whichfork); ASSERT(bma->idx >= 0); @@ -3056,15 +3025,15 @@ xfs_bmap_add_extent_hole_real( &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); error = xfs_btree_delete(bma->cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); error = xfs_btree_decrement(bma->cur, 0, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); error = xfs_bmbt_update(bma->cur, left.br_startoff, left.br_startblock, left.br_blockcount + @@ -3097,7 +3066,7 @@ xfs_bmap_add_extent_hole_real( &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); error = xfs_bmbt_update(bma->cur, left.br_startoff, left.br_startblock, left.br_blockcount + @@ -3131,7 +3100,7 @@ xfs_bmap_add_extent_hole_real( right.br_blockcount, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); error = xfs_bmbt_update(bma->cur, new->br_startoff, new->br_startblock, new->br_blockcount + @@ -3161,12 +3130,12 @@ xfs_bmap_add_extent_hole_real( new->br_blockcount, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 0, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); bma->cur->bc_rec.b.br_state = new->br_state; error = xfs_btree_insert(bma->cur, &i); if (error) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } break; } @@ -3252,12 +3221,24 @@ xfs_bmap_extsize_align( align_alen += temp; align_off -= temp; } + + /* Same adjustment for the end of the requested area. */ + temp = (align_alen % extsz); + if (temp) + align_alen += extsz - temp; + /* - * Same adjustment for the end of the requested area. + * For large extent hint sizes, the aligned extent might be larger than + * MAXEXTLEN. In that case, reduce the size by an extsz so that it pulls + * the length back under MAXEXTLEN. The outer allocation loops handle + * short allocation just fine, so it is safe to do this. We only want to + * do it when we are forced to, though, because it means more allocation + * operations are required. */ - if ((temp = (align_alen % extsz))) { - align_alen += extsz - temp; - } + while (align_alen > MAXEXTLEN) + align_alen -= extsz; + ASSERT(align_alen <= MAXEXTLEN); + /* * If the previous block overlaps with this proposed allocation * then move the start forward without adjusting the length. @@ -3346,7 +3327,9 @@ xfs_bmap_extsize_align( return -EINVAL; } else { ASSERT(orig_off >= align_off); - ASSERT(orig_end <= align_off + align_alen); + /* see MAXEXTLEN handling above */ + ASSERT(orig_end <= align_off + align_alen || + align_alen + extsz > MAXEXTLEN); } #ifdef DEBUG @@ -3535,7 +3518,8 @@ xfs_bmap_longest_free_extent( } } - longest = xfs_alloc_longest_free_extent(mp, pag); + longest = xfs_alloc_longest_free_extent(mp, pag, + xfs_alloc_min_freelist(mp, pag)); if (*blen < longest) *blen = longest; @@ -4127,13 +4111,6 @@ xfs_bmapi_reserve_delalloc( /* Figure out the extent size, adjust alen */ extsz = xfs_get_extsz_hint(ip); if (extsz) { - /* - * Make sure we don't exceed a single extent length when we - * align the extent by reducing length we are going to - * allocate by the maximum amount extent size aligment may - * require. 
- */ - alen = XFS_FILBLKS_MIN(len, MAXEXTLEN - (2 * extsz - 1)); error = xfs_bmap_extsize_align(mp, got, prev, extsz, rt, eof, 1, 0, &aoff, &alen); ASSERT(!error); @@ -4160,18 +4137,15 @@ xfs_bmapi_reserve_delalloc( ASSERT(indlen > 0); if (rt) { - error = xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, - -((int64_t)extsz), 0); + error = xfs_mod_frextents(mp, -((int64_t)extsz)); } else { - error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, - -((int64_t)alen), 0); + error = xfs_mod_fdblocks(mp, -((int64_t)alen), false); } if (error) goto out_unreserve_quota; - error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, - -((int64_t)indlen), 0); + error = xfs_mod_fdblocks(mp, -((int64_t)indlen), false); if (error) goto out_unreserve_blocks; @@ -4198,9 +4172,9 @@ xfs_bmapi_reserve_delalloc( out_unreserve_blocks: if (rt) - xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, extsz, 0); + xfs_mod_frextents(mp, extsz); else - xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, alen, 0); + xfs_mod_fdblocks(mp, alen, false); out_unreserve_quota: if (XFS_IS_QUOTA_ON(mp)) xfs_trans_unreserve_quota_nblks(NULL, ip, (long)alen, 0, rt ? @@ -4448,7 +4422,15 @@ xfs_bmapi_convert_unwritten( error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx, &bma->cur, mval, bma->firstblock, bma->flist, &tmp_logflags); - bma->logflags |= tmp_logflags; + /* + * Log the inode core unconditionally in the unwritten extent conversion + * path because the conversion might not have done so (e.g., if the + * extent count hasn't changed). We need to make sure the inode is dirty + * in the transaction for the sake of fsync(), even if nothing has + * changed, because fsync() will not force the log for this transaction + * unless it sees the inode pinned. + */ + bma->logflags |= tmp_logflags | XFS_ILOG_CORE; if (error) return error; @@ -4801,7 +4783,7 @@ xfs_bmap_del_extent( got.br_startblock, got.br_blockcount, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } da_old = da_new = 0; } else { @@ -4835,7 +4817,7 @@ xfs_bmap_del_extent( } if ((error = xfs_btree_delete(cur, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); break; case 2: @@ -4935,7 +4917,8 @@ xfs_bmap_del_extent( got.br_startblock, temp, &i))) goto done; - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, + i == 1, done); /* * Update the btree record back * to the original value. @@ -4956,7 +4939,7 @@ xfs_bmap_del_extent( error = -ENOSPC; goto done; } - XFS_WANT_CORRUPTED_GOTO(i == 1, done); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } else flags |= xfs_ilog_fext(whichfork); XFS_IFORK_NEXT_SET(ip, whichfork, @@ -5012,10 +4995,8 @@ xfs_bmap_del_extent( * Nothing to do for disk quota accounting here. 
*/ ASSERT(da_old >= da_new); - if (da_old > da_new) { - xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, - (int64_t)(da_old - da_new), 0); - } + if (da_old > da_new) + xfs_mod_fdblocks(mp, (int64_t)(da_old - da_new), false); done: *logflagsp = flags; return error; @@ -5284,14 +5265,13 @@ xfs_bunmapi( rtexts = XFS_FSB_TO_B(mp, del.br_blockcount); do_div(rtexts, mp->m_sb.sb_rextsize); - xfs_mod_incore_sb(mp, XFS_SBS_FREXTENTS, - (int64_t)rtexts, 0); + xfs_mod_frextents(mp, (int64_t)rtexts); (void)xfs_trans_reserve_quota_nblks(NULL, ip, -((long)del.br_blockcount), 0, XFS_QMOPT_RES_RTBLKS); } else { - xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, - (int64_t)del.br_blockcount, 0); + xfs_mod_fdblocks(mp, (int64_t)del.br_blockcount, + false); (void)xfs_trans_reserve_quota_nblks(NULL, ip, -((long)del.br_blockcount), 0, XFS_QMOPT_RES_REGBLKS); @@ -5453,6 +5433,7 @@ xfs_bmse_merge( struct xfs_bmbt_irec left; xfs_filblks_t blockcount; int error, i; + struct xfs_mount *mp = ip->i_mount; xfs_bmbt_get_all(gotp, &got); xfs_bmbt_get_all(leftp, &left); @@ -5487,19 +5468,19 @@ xfs_bmse_merge( got.br_blockcount, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); error = xfs_btree_delete(cur, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); /* lookup and update size of the previous extent */ error = xfs_bmbt_lookup_eq(cur, left.br_startoff, left.br_startblock, left.br_blockcount, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); left.br_blockcount = blockcount; @@ -5518,50 +5499,92 @@ xfs_bmse_shift_one( int *current_ext, struct xfs_bmbt_rec_host *gotp, struct xfs_btree_cur *cur, - int *logflags) + int *logflags, + enum shift_direction direction) { struct xfs_ifork *ifp; + struct xfs_mount *mp; xfs_fileoff_t startoff; - struct xfs_bmbt_rec_host *leftp; + struct xfs_bmbt_rec_host *adj_irecp; struct xfs_bmbt_irec got; - struct xfs_bmbt_irec left; + struct xfs_bmbt_irec adj_irec; int error; int i; + int total_extents; + mp = ip->i_mount; ifp = XFS_IFORK_PTR(ip, whichfork); + total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); xfs_bmbt_get_all(gotp, &got); - startoff = got.br_startoff - offset_shift_fsb; /* delalloc extents should be prevented by caller */ - XFS_WANT_CORRUPTED_RETURN(!isnullstartblock(got.br_startblock)); + XFS_WANT_CORRUPTED_RETURN(mp, !isnullstartblock(got.br_startblock)); - /* - * Check for merge if we've got an extent to the left, otherwise make - * sure there's enough room at the start of the file for the shift. - */ - if (*current_ext) { - /* grab the left extent and check for a large enough hole */ - leftp = xfs_iext_get_ext(ifp, *current_ext - 1); - xfs_bmbt_get_all(leftp, &left); + if (direction == SHIFT_LEFT) { + startoff = got.br_startoff - offset_shift_fsb; + + /* + * Check for merge if we've got an extent to the left, + * otherwise make sure there's enough room at the start + * of the file for the shift. + */ + if (!*current_ext) { + if (got.br_startoff < offset_shift_fsb) + return -EINVAL; + goto update_current_ext; + } + /* + * grab the left extent and check for a large + * enough hole. 
+ */ + adj_irecp = xfs_iext_get_ext(ifp, *current_ext - 1); + xfs_bmbt_get_all(adj_irecp, &adj_irec); - if (startoff < left.br_startoff + left.br_blockcount) + if (startoff < + adj_irec.br_startoff + adj_irec.br_blockcount) return -EINVAL; /* check whether to merge the extent or shift it down */ - if (xfs_bmse_can_merge(&left, &got, offset_shift_fsb)) { + if (xfs_bmse_can_merge(&adj_irec, &got, + offset_shift_fsb)) { return xfs_bmse_merge(ip, whichfork, offset_shift_fsb, - *current_ext, gotp, leftp, cur, - logflags); + *current_ext, gotp, adj_irecp, + cur, logflags); } - } else if (got.br_startoff < offset_shift_fsb) - return -EINVAL; - + } else { + startoff = got.br_startoff + offset_shift_fsb; + /* nothing to move if this is the last extent */ + if (*current_ext >= (total_extents - 1)) + goto update_current_ext; + /* + * If this is not the last extent in the file, make sure there + * is enough room between the current extent and the next extent + * to accommodate the shift. + */ + adj_irecp = xfs_iext_get_ext(ifp, *current_ext + 1); + xfs_bmbt_get_all(adj_irecp, &adj_irec); + if (startoff + got.br_blockcount > adj_irec.br_startoff) + return -EINVAL; + /* + * Unlike a left shift (which involves a hole punch), + * a right shift does not modify extent neighbors + * in any way. We should never find mergeable extents + * in this scenario. Check anyway and warn if we + * encounter two extents that could be one. + */ + if (xfs_bmse_can_merge(&got, &adj_irec, offset_shift_fsb)) + WARN_ON_ONCE(1); + } /* * Increment the extent index for the next iteration, update the start * offset of the in-core extent and update the btree if applicable. */ - (*current_ext)++; +update_current_ext: + if (direction == SHIFT_LEFT) + (*current_ext)++; + else + (*current_ext)--; xfs_bmbt_set_startoff(gotp, startoff); *logflags |= XFS_ILOG_CORE; if (!cur) { @@ -5573,18 +5596,18 @@ xfs_bmse_shift_one( got.br_blockcount, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(mp, i == 1); got.br_startoff = startoff; return xfs_bmbt_update(cur, got.br_startoff, got.br_startblock, - got.br_blockcount, got.br_state); + got.br_blockcount, got.br_state); } /* - * Shift extent records to the left to cover a hole. + * Shift extent records to the left/right to cover/create a hole. * * The maximum number of extents to be shifted in a single operation is - * @num_exts. @start_fsb specifies the file offset to start the shift and the + * @num_exts. @stop_fsb specifies the file offset at which to stop the shift, and the * file offset where we've left off is returned in @next_fsb. @offset_shift_fsb * is the length by which each extent is shifted.
If there is no hole to shift the extents into, this will be considered an invalid operation and we abort @@ -5594,12 +5617,13 @@ int xfs_bmap_shift_extents( struct xfs_trans *tp, struct xfs_inode *ip, - xfs_fileoff_t start_fsb, + xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, int *done, - xfs_fileoff_t *next_fsb, + xfs_fileoff_t stop_fsb, xfs_fsblock_t *firstblock, struct xfs_bmap_free *flist, + enum shift_direction direction, int num_exts) { struct xfs_btree_cur *cur = NULL; @@ -5609,10 +5633,11 @@ xfs_bmap_shift_extents( struct xfs_ifork *ifp; xfs_extnum_t nexts = 0; xfs_extnum_t current_ext; + xfs_extnum_t total_extents; + xfs_extnum_t stop_extent; int error = 0; int whichfork = XFS_DATA_FORK; int logflags = 0; - int total_extents; if (unlikely(XFS_TEST_ERROR( (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && @@ -5628,6 +5653,8 @@ xfs_bmap_shift_extents( ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT); + ASSERT(*next_fsb != NULLFSBLOCK || direction == SHIFT_RIGHT); ifp = XFS_IFORK_PTR(ip, whichfork); if (!(ifp->if_flags & XFS_IFEXTENTS)) { @@ -5645,43 +5672,83 @@ xfs_bmap_shift_extents( } /* + * There may be delalloc extents in the data fork before the range we + * are collapsing out, so we cannot use the count of real extents here. + * Instead we have to calculate it from the incore fork. + */ + total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + if (total_extents == 0) { + *done = 1; + goto del_cursor; + } + + /* + * In case of the first right shift, we need to initialize next_fsb + */ + if (*next_fsb == NULLFSBLOCK) { + gotp = xfs_iext_get_ext(ifp, total_extents - 1); + xfs_bmbt_get_all(gotp, &got); + *next_fsb = got.br_startoff; + if (stop_fsb > *next_fsb) { + *done = 1; + goto del_cursor; + } + } + + /* Lookup the extent index at which we have to stop */ + if (direction == SHIFT_RIGHT) { + gotp = xfs_iext_bno_to_ext(ifp, stop_fsb, &stop_extent); + /* Make stop_extent exclusive of shift range */ + stop_extent--; + } else + stop_extent = total_extents; + + /* * Look up the extent index for the fsb where we start shifting. We can * henceforth iterate with current_ext as extent list changes are locked * out via ilock. * * gotp can be null in 2 cases: 1) if there are no extents or 2) - * start_fsb lies in a hole beyond which there are no extents. Either + * *next_fsb lies in a hole beyond which there are no extents. Either * way, we are done. */ - gotp = xfs_iext_bno_to_ext(ifp, start_fsb, &current_ext); + gotp = xfs_iext_bno_to_ext(ifp, *next_fsb, &current_ext); if (!gotp) { *done = 1; goto del_cursor; } - /* - * There may be delalloc extents in the data fork before the range we - * are collapsing out, so we cannot use the count of real extents here. - * Instead we have to calculate it from the incore fork. - */ - total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); - while (nexts++ < num_exts && current_ext < total_extents) { + /* some sanity checking before we finally start shifting extents */ + if ((direction == SHIFT_LEFT && current_ext >= stop_extent) || + (direction == SHIFT_RIGHT && current_ext <= stop_extent)) { + error = -EIO; + goto del_cursor; + } + + while (nexts++ < num_exts) { error = xfs_bmse_shift_one(ip, whichfork, offset_shift_fsb, - &current_ext, gotp, cur, &logflags); + &current_ext, gotp, cur, &logflags, + direction); if (error) goto del_cursor; + /* + * If there was an extent merge during the shift, the extent
Update the total and grade the next record. + */ + if (direction == SHIFT_LEFT) { + total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + stop_extent = total_extents; + } - /* update total extent count and grab the next record */ - total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); - if (current_ext >= total_extents) + if (current_ext == stop_extent) { + *done = 1; + *next_fsb = NULLFSBLOCK; break; + } gotp = xfs_iext_get_ext(ifp, current_ext); } - /* Check if we are done */ - if (current_ext == total_extents) { - *done = 1; - } else if (next_fsb) { + if (!*done) { xfs_bmbt_get_all(gotp, &got); *next_fsb = got.br_startoff; } @@ -5696,3 +5763,188 @@ del_cursor: return error; } + +/* + * Splits an extent into two extents at split_fsb block such that it is + * the first block of the current_ext. @current_ext is a target extent + * to be split. @split_fsb is a block where the extents is split. + * If split_fsb lies in a hole or the first block of extents, just return 0. + */ +STATIC int +xfs_bmap_split_extent_at( + struct xfs_trans *tp, + struct xfs_inode *ip, + xfs_fileoff_t split_fsb, + xfs_fsblock_t *firstfsb, + struct xfs_bmap_free *free_list) +{ + int whichfork = XFS_DATA_FORK; + struct xfs_btree_cur *cur = NULL; + struct xfs_bmbt_rec_host *gotp; + struct xfs_bmbt_irec got; + struct xfs_bmbt_irec new; /* split extent */ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp; + xfs_fsblock_t gotblkcnt; /* new block count for got */ + xfs_extnum_t current_ext; + int error = 0; + int logflags = 0; + int i = 0; + + if (unlikely(XFS_TEST_ERROR( + (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), + mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + XFS_ERROR_REPORT("xfs_bmap_split_extent_at", + XFS_ERRLEVEL_LOW, mp); + return -EFSCORRUPTED; + } + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + ifp = XFS_IFORK_PTR(ip, whichfork); + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + /* Read in all the extents */ + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + return error; + } + + /* + * gotp can be null in 2 cases: 1) if there are no extents + * or 2) split_fsb lies in a hole beyond which there are + * no extents. Either way, we are done. + */ + gotp = xfs_iext_bno_to_ext(ifp, split_fsb, ¤t_ext); + if (!gotp) + return 0; + + xfs_bmbt_get_all(gotp, &got); + + /* + * Check split_fsb lies in a hole or the start boundary offset + * of the extent. 
+ */ + if (got.br_startoff >= split_fsb) + return 0; + + gotblkcnt = split_fsb - got.br_startoff; + new.br_startoff = split_fsb; + new.br_startblock = got.br_startblock + gotblkcnt; + new.br_blockcount = got.br_blockcount - gotblkcnt; + new.br_state = got.br_state; + + if (ifp->if_flags & XFS_IFBROOT) { + cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); + cur->bc_private.b.firstblock = *firstfsb; + cur->bc_private.b.flist = free_list; + cur->bc_private.b.flags = 0; + error = xfs_bmbt_lookup_eq(cur, got.br_startoff, + got.br_startblock, + got.br_blockcount, + &i); + if (error) + goto del_cursor; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, del_cursor); + } + + xfs_bmbt_set_blockcount(gotp, gotblkcnt); + got.br_blockcount = gotblkcnt; + + logflags = XFS_ILOG_CORE; + if (cur) { + error = xfs_bmbt_update(cur, got.br_startoff, + got.br_startblock, + got.br_blockcount, + got.br_state); + if (error) + goto del_cursor; + } else + logflags |= XFS_ILOG_DEXT; + + /* Add new extent */ + current_ext++; + xfs_iext_insert(ip, current_ext, 1, &new, 0); + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) + 1); + + if (cur) { + error = xfs_bmbt_lookup_eq(cur, new.br_startoff, + new.br_startblock, new.br_blockcount, + &i); + if (error) + goto del_cursor; + XFS_WANT_CORRUPTED_GOTO(mp, i == 0, del_cursor); + cur->bc_rec.b.br_state = new.br_state; + + error = xfs_btree_insert(cur, &i); + if (error) + goto del_cursor; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, del_cursor); + } + + /* + * Convert to a btree if necessary. + */ + if (xfs_bmap_needs_btree(ip, whichfork)) { + int tmp_logflags; /* partial log flag return val */ + + ASSERT(cur == NULL); + error = xfs_bmap_extents_to_btree(tp, ip, firstfsb, free_list, + &cur, 0, &tmp_logflags, whichfork); + logflags |= tmp_logflags; + } + +del_cursor: + if (cur) { + cur->bc_private.b.allocated = 0; + xfs_btree_del_cursor(cur, + error ? 
XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + } + + if (logflags) + xfs_trans_log_inode(tp, ip, logflags); + return error; +} + +int +xfs_bmap_split_extent( + struct xfs_inode *ip, + xfs_fileoff_t split_fsb) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + struct xfs_bmap_free free_list; + xfs_fsblock_t firstfsb; + int committed; + int error; + + tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, + XFS_DIOSTRAT_SPACE_RES(mp, 0), 0); + if (error) { + xfs_trans_cancel(tp); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + + xfs_bmap_init(&free_list, &firstfsb); + + error = xfs_bmap_split_extent_at(tp, ip, split_fsb, + &firstfsb, &free_list); + if (error) + goto out; + + error = xfs_bmap_finish(&tp, &free_list, &committed); + if (error) + goto out; + + return xfs_trans_commit(tp); + +out: + xfs_trans_cancel(tp); + return error; +} diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index b9d8a499d2c4..6aaa0c1c7200 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -166,6 +166,11 @@ static inline void xfs_bmap_init(xfs_bmap_free_t *flp, xfs_fsblock_t *fbp) */ #define XFS_BMAP_MAX_SHIFT_EXTENTS 1 +enum shift_direction { + SHIFT_LEFT = 0, + SHIFT_RIGHT, +}; + #ifdef DEBUG void xfs_bmap_trace_exlist(struct xfs_inode *ip, xfs_extnum_t cnt, int whichfork, unsigned long caller_ip); @@ -211,8 +216,10 @@ int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx, xfs_extnum_t num); uint xfs_default_attroffset(struct xfs_inode *ip); int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip, - xfs_fileoff_t start_fsb, xfs_fileoff_t offset_shift_fsb, - int *done, xfs_fileoff_t *next_fsb, xfs_fsblock_t *firstblock, - struct xfs_bmap_free *flist, int num_exts); + xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, + int *done, xfs_fileoff_t stop_fsb, xfs_fsblock_t *firstblock, + struct xfs_bmap_free *flist, enum shift_direction direction, + int num_exts); +int xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset); #endif /* __XFS_BMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 81cad433df85..c72283dd8d44 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -168,7 +168,7 @@ xfs_btree_check_lptr( xfs_fsblock_t bno, /* btree block disk address */ int level) /* btree block level */ { - XFS_WANT_CORRUPTED_RETURN( + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, level > 0 && bno != NULLFSBLOCK && XFS_FSB_SANITY_CHECK(cur->bc_mp, bno)); @@ -187,7 +187,7 @@ xfs_btree_check_sptr( { xfs_agblock_t agblocks = cur->bc_mp->m_sb.sb_agblocks; - XFS_WANT_CORRUPTED_RETURN( + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, level > 0 && bno != NULLAGBLOCK && bno != 0 && @@ -1825,7 +1825,7 @@ xfs_btree_lookup( error = xfs_btree_increment(cur, 0, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT); *stat = 1; return 0; @@ -2285,7 +2285,7 @@ xfs_btree_rshift( if (error) goto error0; i = xfs_btree_lastrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); error = xfs_btree_increment(tcur, level, &i); if (error) @@ -3138,7 +3138,7 @@ xfs_btree_insert( goto error0; } - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); level++; /* @@ -3582,15 +3582,15 @@ xfs_btree_delrec( * Actually any entry but the 
first would suffice. */ i = xfs_btree_lastrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); error = xfs_btree_increment(tcur, level, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); i = xfs_btree_lastrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); /* Grab a pointer to the block. */ right = xfs_btree_get_block(tcur, level, &rbp); @@ -3634,12 +3634,12 @@ xfs_btree_delrec( rrecs = xfs_btree_get_numrecs(right); if (!xfs_btree_ptr_is_null(cur, &lptr)) { i = xfs_btree_firstrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); error = xfs_btree_decrement(tcur, level, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); } } @@ -3653,13 +3653,13 @@ xfs_btree_delrec( * previous block. */ i = xfs_btree_firstrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); error = xfs_btree_decrement(tcur, level, &i); if (error) goto error0; i = xfs_btree_firstrec(tcur, level); - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0); /* Grab a pointer to the block. */ left = xfs_btree_get_block(tcur, level, &lbp); diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 9cb0115c6bd1..2385f8cd08ab 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -538,12 +538,12 @@ xfs_da3_root_split( oldroot = blk1->bp->b_addr; if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC) || oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC)) { - struct xfs_da3_icnode_hdr nodehdr; + struct xfs_da3_icnode_hdr icnodehdr; - dp->d_ops->node_hdr_from_disk(&nodehdr, oldroot); + dp->d_ops->node_hdr_from_disk(&icnodehdr, oldroot); btree = dp->d_ops->node_tree_p(oldroot); - size = (int)((char *)&btree[nodehdr.count] - (char *)oldroot); - level = nodehdr.level; + size = (int)((char *)&btree[icnodehdr.count] - (char *)oldroot); + level = icnodehdr.level; /* * we are about to copy oldroot to bp, so set up the type diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 0a49b0286372..74bcbabfa523 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -725,7 +725,13 @@ struct xfs_attr3_icleaf_hdr { __uint16_t magic; __uint16_t count; __uint16_t usedbytes; - __uint16_t firstused; + /* + * firstused is 32-bit here instead of 16-bit like the on-disk variant + * to support maximum fsb size of 64k without overflow issues throughout + * the attr code. Instead, the overflow condition is handled on + * conversion to/from disk. + */ + __uint32_t firstused; __u8 holes; struct { __uint16_t base; @@ -734,6 +740,12 @@ struct xfs_attr3_icleaf_hdr { }; /* + * Special value to represent fs block size in the leaf header firstused field. + * Only used when block size overflows the 2-bytes available on disk. + */ +#define XFS_ATTR3_LEAF_NULLOFF 0 + +/* * Flags used in the leaf_entry[i].flags field. * NOTE: the INCOMPLETE bit must not collide with the flags bits specified * on the system call, they are "or"ed together for various operations. 
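The firstused widening above only matters at the on-disk boundary: in core the field is 32 bits, on disk it remains 16 bits, and a 64k block size is the one value that cannot be represented directly. The following is a minimal userspace sketch of how that conversion could behave, assuming hypothetical helper names; the series' real converters live in the attr leaf code and may differ in detail.

#include <stdint.h>

#define XFS_ATTR3_LEAF_NULLOFF	0	/* special "block size" encoding, from the hunk above */

/*
 * incore -> disk: a firstused equal to a 64k block size overflows the
 * 16-bit disk field, so it is encoded as XFS_ATTR3_LEAF_NULLOFF;
 * anything else fits as-is. Hypothetical helper name.
 */
static uint16_t
attr3_firstused_to_disk_sketch(uint32_t firstused, uint32_t blocksize)
{
	if (firstused == blocksize && blocksize > UINT16_MAX)
		return XFS_ATTR3_LEAF_NULLOFF;
	return (uint16_t)firstused;
}

/*
 * disk -> incore: NULLOFF decodes back to the block size, i.e. an empty
 * leaf whose name area has not yet grown down from the end of the block.
 * Hypothetical helper name.
 */
static uint32_t
attr3_firstused_from_disk_sketch(uint16_t disk_firstused, uint32_t blocksize)
{
	if (disk_firstused == XFS_ATTR3_LEAF_NULLOFF)
		return blocksize;
	return disk_firstused;
}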
diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index 5ff31be9b1cd..de1ea16f5748 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -89,7 +89,7 @@ __xfs_dir3_data_check( * so just ensure that the count falls somewhere inside the * block right now. */ - XFS_WANT_CORRUPTED_RETURN(be32_to_cpu(btp->count) < + XFS_WANT_CORRUPTED_RETURN(mp, be32_to_cpu(btp->count) < ((char *)btp - p) / sizeof(struct xfs_dir2_leaf_entry)); break; case cpu_to_be32(XFS_DIR3_DATA_MAGIC): @@ -107,21 +107,21 @@ __xfs_dir3_data_check( bf = ops->data_bestfree_p(hdr); count = lastfree = freeseen = 0; if (!bf[0].length) { - XFS_WANT_CORRUPTED_RETURN(!bf[0].offset); + XFS_WANT_CORRUPTED_RETURN(mp, !bf[0].offset); freeseen |= 1 << 0; } if (!bf[1].length) { - XFS_WANT_CORRUPTED_RETURN(!bf[1].offset); + XFS_WANT_CORRUPTED_RETURN(mp, !bf[1].offset); freeseen |= 1 << 1; } if (!bf[2].length) { - XFS_WANT_CORRUPTED_RETURN(!bf[2].offset); + XFS_WANT_CORRUPTED_RETURN(mp, !bf[2].offset); freeseen |= 1 << 2; } - XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[0].length) >= + XFS_WANT_CORRUPTED_RETURN(mp, be16_to_cpu(bf[0].length) >= be16_to_cpu(bf[1].length)); - XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[1].length) >= + XFS_WANT_CORRUPTED_RETURN(mp, be16_to_cpu(bf[1].length) >= be16_to_cpu(bf[2].length)); /* * Loop over the data/unused entries. @@ -134,18 +134,18 @@ __xfs_dir3_data_check( * doesn't need to be there. */ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { - XFS_WANT_CORRUPTED_RETURN(lastfree == 0); - XFS_WANT_CORRUPTED_RETURN( + XFS_WANT_CORRUPTED_RETURN(mp, lastfree == 0); + XFS_WANT_CORRUPTED_RETURN(mp, be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) == (char *)dup - (char *)hdr); dfp = xfs_dir2_data_freefind(hdr, bf, dup); if (dfp) { i = (int)(dfp - bf); - XFS_WANT_CORRUPTED_RETURN( + XFS_WANT_CORRUPTED_RETURN(mp, (freeseen & (1 << i)) == 0); freeseen |= 1 << i; } else { - XFS_WANT_CORRUPTED_RETURN( + XFS_WANT_CORRUPTED_RETURN(mp, be16_to_cpu(dup->length) <= be16_to_cpu(bf[2].length)); } @@ -160,13 +160,13 @@ __xfs_dir3_data_check( * The linear search is crude but this is DEBUG code. */ dep = (xfs_dir2_data_entry_t *)p; - XFS_WANT_CORRUPTED_RETURN(dep->namelen != 0); - XFS_WANT_CORRUPTED_RETURN( + XFS_WANT_CORRUPTED_RETURN(mp, dep->namelen != 0); + XFS_WANT_CORRUPTED_RETURN(mp, !xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber))); - XFS_WANT_CORRUPTED_RETURN( + XFS_WANT_CORRUPTED_RETURN(mp, be16_to_cpu(*ops->data_entry_tag_p(dep)) == (char *)dep - (char *)hdr); - XFS_WANT_CORRUPTED_RETURN( + XFS_WANT_CORRUPTED_RETURN(mp, ops->data_get_ftype(dep) < XFS_DIR3_FT_MAX); count++; lastfree = 0; @@ -183,14 +183,15 @@ __xfs_dir3_data_check( be32_to_cpu(lep[i].hashval) == hash) break; } - XFS_WANT_CORRUPTED_RETURN(i < be32_to_cpu(btp->count)); + XFS_WANT_CORRUPTED_RETURN(mp, + i < be32_to_cpu(btp->count)); } p += ops->data_entsize(dep->namelen); } /* * Need to have seen all the entries and all the bestfree slots. 
*/ - XFS_WANT_CORRUPTED_RETURN(freeseen == 7); + XFS_WANT_CORRUPTED_RETURN(mp, freeseen == 7); if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) { for (i = stale = 0; i < be32_to_cpu(btp->count); i++) { @@ -198,13 +199,13 @@ __xfs_dir3_data_check( cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) stale++; if (i > 0) - XFS_WANT_CORRUPTED_RETURN( + XFS_WANT_CORRUPTED_RETURN(mp, be32_to_cpu(lep[i].hashval) >= be32_to_cpu(lep[i - 1].hashval)); } - XFS_WANT_CORRUPTED_RETURN(count == + XFS_WANT_CORRUPTED_RETURN(mp, count == be32_to_cpu(btp->count) - be32_to_cpu(btp->stale)); - XFS_WANT_CORRUPTED_RETURN(stale == be32_to_cpu(btp->stale)); + XFS_WANT_CORRUPTED_RETURN(mp, stale == be32_to_cpu(btp->stale)); } return 0; } diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 8eb718979383..a0ae572051de 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -170,7 +170,7 @@ typedef struct xfs_sb { __uint32_t sb_features_log_incompat; __uint32_t sb_crc; /* superblock crc */ - __uint32_t sb_pad; + xfs_extlen_t sb_spino_align; /* sparse inode chunk alignment */ xfs_ino_t sb_pquotino; /* project quota inode */ xfs_lsn_t sb_lsn; /* last write sequence */ @@ -256,7 +256,7 @@ typedef struct xfs_dsb { __be32 sb_features_log_incompat; __le32 sb_crc; /* superblock crc */ - __be32 sb_pad; + __be32 sb_spino_align; /* sparse inode chunk alignment */ __be64 sb_pquotino; /* project quota inode */ __be64 sb_lsn; /* last write sequence */ @@ -264,68 +264,6 @@ typedef struct xfs_dsb { /* must be padded to 64 bit alignment */ } xfs_dsb_t; -/* - * Sequence number values for the fields. - */ -typedef enum { - XFS_SBS_MAGICNUM, XFS_SBS_BLOCKSIZE, XFS_SBS_DBLOCKS, XFS_SBS_RBLOCKS, - XFS_SBS_REXTENTS, XFS_SBS_UUID, XFS_SBS_LOGSTART, XFS_SBS_ROOTINO, - XFS_SBS_RBMINO, XFS_SBS_RSUMINO, XFS_SBS_REXTSIZE, XFS_SBS_AGBLOCKS, - XFS_SBS_AGCOUNT, XFS_SBS_RBMBLOCKS, XFS_SBS_LOGBLOCKS, - XFS_SBS_VERSIONNUM, XFS_SBS_SECTSIZE, XFS_SBS_INODESIZE, - XFS_SBS_INOPBLOCK, XFS_SBS_FNAME, XFS_SBS_BLOCKLOG, - XFS_SBS_SECTLOG, XFS_SBS_INODELOG, XFS_SBS_INOPBLOG, XFS_SBS_AGBLKLOG, - XFS_SBS_REXTSLOG, XFS_SBS_INPROGRESS, XFS_SBS_IMAX_PCT, XFS_SBS_ICOUNT, - XFS_SBS_IFREE, XFS_SBS_FDBLOCKS, XFS_SBS_FREXTENTS, XFS_SBS_UQUOTINO, - XFS_SBS_GQUOTINO, XFS_SBS_QFLAGS, XFS_SBS_FLAGS, XFS_SBS_SHARED_VN, - XFS_SBS_INOALIGNMT, XFS_SBS_UNIT, XFS_SBS_WIDTH, XFS_SBS_DIRBLKLOG, - XFS_SBS_LOGSECTLOG, XFS_SBS_LOGSECTSIZE, XFS_SBS_LOGSUNIT, - XFS_SBS_FEATURES2, XFS_SBS_BAD_FEATURES2, XFS_SBS_FEATURES_COMPAT, - XFS_SBS_FEATURES_RO_COMPAT, XFS_SBS_FEATURES_INCOMPAT, - XFS_SBS_FEATURES_LOG_INCOMPAT, XFS_SBS_CRC, XFS_SBS_PAD, - XFS_SBS_PQUOTINO, XFS_SBS_LSN, - XFS_SBS_FIELDCOUNT -} xfs_sb_field_t; - -/* - * Mask values, defined based on the xfs_sb_field_t values. - * Only define the ones we're using. 
- */ -#define XFS_SB_MVAL(x) (1LL << XFS_SBS_ ## x) -#define XFS_SB_UUID XFS_SB_MVAL(UUID) -#define XFS_SB_FNAME XFS_SB_MVAL(FNAME) -#define XFS_SB_ROOTINO XFS_SB_MVAL(ROOTINO) -#define XFS_SB_RBMINO XFS_SB_MVAL(RBMINO) -#define XFS_SB_RSUMINO XFS_SB_MVAL(RSUMINO) -#define XFS_SB_VERSIONNUM XFS_SB_MVAL(VERSIONNUM) -#define XFS_SB_UQUOTINO XFS_SB_MVAL(UQUOTINO) -#define XFS_SB_GQUOTINO XFS_SB_MVAL(GQUOTINO) -#define XFS_SB_QFLAGS XFS_SB_MVAL(QFLAGS) -#define XFS_SB_SHARED_VN XFS_SB_MVAL(SHARED_VN) -#define XFS_SB_UNIT XFS_SB_MVAL(UNIT) -#define XFS_SB_WIDTH XFS_SB_MVAL(WIDTH) -#define XFS_SB_ICOUNT XFS_SB_MVAL(ICOUNT) -#define XFS_SB_IFREE XFS_SB_MVAL(IFREE) -#define XFS_SB_FDBLOCKS XFS_SB_MVAL(FDBLOCKS) -#define XFS_SB_FEATURES2 (XFS_SB_MVAL(FEATURES2) | \ - XFS_SB_MVAL(BAD_FEATURES2)) -#define XFS_SB_FEATURES_COMPAT XFS_SB_MVAL(FEATURES_COMPAT) -#define XFS_SB_FEATURES_RO_COMPAT XFS_SB_MVAL(FEATURES_RO_COMPAT) -#define XFS_SB_FEATURES_INCOMPAT XFS_SB_MVAL(FEATURES_INCOMPAT) -#define XFS_SB_FEATURES_LOG_INCOMPAT XFS_SB_MVAL(FEATURES_LOG_INCOMPAT) -#define XFS_SB_CRC XFS_SB_MVAL(CRC) -#define XFS_SB_PQUOTINO XFS_SB_MVAL(PQUOTINO) -#define XFS_SB_NUM_BITS ((int)XFS_SBS_FIELDCOUNT) -#define XFS_SB_ALL_BITS ((1LL << XFS_SB_NUM_BITS) - 1) -#define XFS_SB_MOD_BITS \ - (XFS_SB_UUID | XFS_SB_ROOTINO | XFS_SB_RBMINO | XFS_SB_RSUMINO | \ - XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \ - XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH | \ - XFS_SB_ICOUNT | XFS_SB_IFREE | XFS_SB_FDBLOCKS | XFS_SB_FEATURES2 | \ - XFS_SB_FEATURES_COMPAT | XFS_SB_FEATURES_RO_COMPAT | \ - XFS_SB_FEATURES_INCOMPAT | XFS_SB_FEATURES_LOG_INCOMPAT | \ - XFS_SB_PQUOTINO) - /* * Misc. Flags - warning - these will be cleared by xfs_repair unless @@ -519,8 +457,10 @@ xfs_sb_has_ro_compat_feature( } #define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */ +#define XFS_SB_FEAT_INCOMPAT_SPINODES (1 << 1) /* sparse inode chunks */ #define XFS_SB_FEAT_INCOMPAT_ALL \ - (XFS_SB_FEAT_INCOMPAT_FTYPE) + (XFS_SB_FEAT_INCOMPAT_FTYPE| \ + XFS_SB_FEAT_INCOMPAT_SPINODES) #define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL static inline bool @@ -568,6 +508,12 @@ static inline int xfs_sb_version_hasfinobt(xfs_sb_t *sbp) (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT); } +static inline bool xfs_sb_version_hassparseinodes(struct xfs_sb *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && + xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_SPINODES); +} + /* * end of superblock version macros */ @@ -820,19 +766,6 @@ typedef struct xfs_agfl { #define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc) - -#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels) -#define XFS_MIN_FREELIST_RAW(bl,cl,mp) \ - (MIN(bl + 1, XFS_AG_MAXLEVELS(mp)) + MIN(cl + 1, XFS_AG_MAXLEVELS(mp))) -#define XFS_MIN_FREELIST(a,mp) \ - (XFS_MIN_FREELIST_RAW( \ - be32_to_cpu((a)->agf_levels[XFS_BTNUM_BNOi]), \ - be32_to_cpu((a)->agf_levels[XFS_BTNUM_CNTi]), mp)) -#define XFS_MIN_FREELIST_PAG(pag,mp) \ - (XFS_MIN_FREELIST_RAW( \ - (unsigned int)(pag)->pagf_levels[XFS_BTNUM_BNOi], \ - (unsigned int)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp)) - #define XFS_AGB_TO_FSB(mp,agno,agbno) \ (((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno)) #define XFS_FSB_TO_AGNO(mp,fsbno) \ @@ -1278,26 +1211,54 @@ typedef __uint64_t xfs_inofree_t; #define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1) #define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i)) +#define XFS_INOBT_HOLEMASK_FULL 0 /* holemask for full chunk */ +#define 
XFS_INOBT_HOLEMASK_BITS (NBBY * sizeof(__uint16_t)) +#define XFS_INODES_PER_HOLEMASK_BIT \ + (XFS_INODES_PER_CHUNK / (NBBY * sizeof(__uint16_t))) + static inline xfs_inofree_t xfs_inobt_maskn(int i, int n) { return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i; } /* - * Data record structure + * The on-disk inode record structure has two formats. The original "full" + * format uses a 4-byte freecount. The "sparse" format uses a 1-byte freecount + * and replaces the 3 high-order freecount bytes with the holemask and inode + * count. + * + * The holemask of the sparse record format allows an inode chunk to have holes + * that refer to blocks not owned by the inode record. This facilitates inode + * allocation in the event of severe free space fragmentation. */ typedef struct xfs_inobt_rec { __be32 ir_startino; /* starting inode number */ - __be32 ir_freecount; /* count of free inodes (set bits) */ + union { + struct { + __be32 ir_freecount; /* count of free inodes */ + } f; + struct { + __be16 ir_holemask;/* hole mask for sparse chunks */ + __u8 ir_count; /* total inode count */ + __u8 ir_freecount; /* count of free inodes */ + } sp; + } ir_u; __be64 ir_free; /* free inode mask */ } xfs_inobt_rec_t; typedef struct xfs_inobt_rec_incore { xfs_agino_t ir_startino; /* starting inode number */ - __int32_t ir_freecount; /* count of free inodes (set bits) */ + __uint16_t ir_holemask; /* hole mask for sparse chunks */ + __uint8_t ir_count; /* total inode count */ + __uint8_t ir_freecount; /* count of free inodes (set bits) */ xfs_inofree_t ir_free; /* free inode mask */ } xfs_inobt_rec_incore_t; +static inline bool xfs_inobt_issparse(uint16_t holemask) +{ + /* non-zero holemask represents a sparse rec. */ + return holemask; +} /* * Key structure @@ -1515,8 +1476,8 @@ struct xfs_acl { sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp))) /* On-disk XFS extended attribute names */ -#define SGI_ACL_FILE (unsigned char *)"SGI_ACL_FILE" -#define SGI_ACL_DEFAULT (unsigned char *)"SGI_ACL_DEFAULT" +#define SGI_ACL_FILE "SGI_ACL_FILE" +#define SGI_ACL_DEFAULT "SGI_ACL_DEFAULT" #define SGI_ACL_FILE_SIZE (sizeof(SGI_ACL_FILE)-1) #define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1) diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 18dc721ca19f..89689c6a43e2 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -239,6 +239,7 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_V5SB 0x8000 /* version 5 superblock */ #define XFS_FSOP_GEOM_FLAGS_FTYPE 0x10000 /* inode directory types */ #define XFS_FSOP_GEOM_FLAGS_FINOBT 0x20000 /* free inode btree */ +#define XFS_FSOP_GEOM_FLAGS_SPINODES 0x40000 /* sparse inode chunks */ /* * Minimum and maximum sizes need for growth checks.
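To make the sparse record format above concrete: XFS_INODES_PER_HOLEMASK_BIT works out to 64 / 16 = 4, so each holemask bit stands for four inodes, and a set bit marks those four inodes as holes. A self-contained sketch in the spirit of the xfs_inobt_irec_to_allocmask() helper that later hunks call (illustrative, not the series' exact code) expands a holemask into a 64-bit bitmap of allocated inodes:

#include <stdint.h>

/*
 * Expand a 16-bit holemask into a 64-bit allocation bitmap: each clear
 * holemask bit contributes four 1-bits (allocated inodes), each set bit
 * contributes four 0-bits (holes). Sketch only.
 */
static uint64_t
inobt_holemask_to_allocmask_sketch(uint16_t holemask)
{
	uint64_t	allocmask = 0;
	int		nbits = 64 / 16; /* XFS_INODES_PER_HOLEMASK_BIT */
	int		i;

	for (i = 0; i < 16; i++) {
		if (!(holemask & (1u << i)))
			allocmask |= ((1ULL << nbits) - 1) << (i * nbits);
	}
	return allocmask;
}

With a mask like this, the first free inode of a sparse record reduces to xfs_lowbit64(allocmask & ir_free), which is exactly the shape of the xfs_inobt_first_free_inode() hunk further down.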
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 116ef1ddb3e3..66efc702452a 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -65,6 +65,8 @@ xfs_inobt_lookup( int *stat) /* success/failure */ { cur->bc_rec.i.ir_startino = ino; + cur->bc_rec.i.ir_holemask = 0; + cur->bc_rec.i.ir_count = 0; cur->bc_rec.i.ir_freecount = 0; cur->bc_rec.i.ir_free = 0; return xfs_btree_lookup(cur, dir, stat); @@ -82,7 +84,14 @@ xfs_inobt_update( union xfs_btree_rec rec; rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino); - rec.inobt.ir_freecount = cpu_to_be32(irec->ir_freecount); + if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) { + rec.inobt.ir_u.sp.ir_holemask = cpu_to_be16(irec->ir_holemask); + rec.inobt.ir_u.sp.ir_count = irec->ir_count; + rec.inobt.ir_u.sp.ir_freecount = irec->ir_freecount; + } else { + /* ir_holemask/ir_count not supported on-disk */ + rec.inobt.ir_u.f.ir_freecount = cpu_to_be32(irec->ir_freecount); + } rec.inobt.ir_free = cpu_to_be64(irec->ir_free); return xfs_btree_update(cur, &rec); } @@ -100,12 +109,27 @@ xfs_inobt_get_rec( int error; error = xfs_btree_get_rec(cur, &rec, stat); - if (!error && *stat == 1) { - irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino); - irec->ir_freecount = be32_to_cpu(rec->inobt.ir_freecount); - irec->ir_free = be64_to_cpu(rec->inobt.ir_free); + if (error || *stat == 0) + return error; + + irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino); + if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) { + irec->ir_holemask = be16_to_cpu(rec->inobt.ir_u.sp.ir_holemask); + irec->ir_count = rec->inobt.ir_u.sp.ir_count; + irec->ir_freecount = rec->inobt.ir_u.sp.ir_freecount; + } else { + /* + * ir_holemask/ir_count not supported on-disk. Fill in hardcoded + * values for full inode chunks. + */ + irec->ir_holemask = XFS_INOBT_HOLEMASK_FULL; + irec->ir_count = XFS_INODES_PER_CHUNK; + irec->ir_freecount = + be32_to_cpu(rec->inobt.ir_u.f.ir_freecount); } - return error; + irec->ir_free = be64_to_cpu(rec->inobt.ir_free); + + return 0; } /* @@ -114,10 +138,14 @@ xfs_inobt_get_rec( STATIC int xfs_inobt_insert_rec( struct xfs_btree_cur *cur, + __uint16_t holemask, + __uint8_t count, __int32_t freecount, xfs_inofree_t free, int *stat) { + cur->bc_rec.i.ir_holemask = holemask; + cur->bc_rec.i.ir_count = count; cur->bc_rec.i.ir_freecount = freecount; cur->bc_rec.i.ir_free = free; return xfs_btree_insert(cur, stat); @@ -154,7 +182,9 @@ xfs_inobt_insert( } ASSERT(i == 0); - error = xfs_inobt_insert_rec(cur, XFS_INODES_PER_CHUNK, + error = xfs_inobt_insert_rec(cur, XFS_INOBT_HOLEMASK_FULL, + XFS_INODES_PER_CHUNK, + XFS_INODES_PER_CHUNK, XFS_INOBT_ALL_FREE, &i); if (error) { xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); @@ -220,6 +250,7 @@ xfs_ialloc_inode_init( struct xfs_mount *mp, struct xfs_trans *tp, struct list_head *buffer_list, + int icount, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_agblock_t length, @@ -275,7 +306,7 @@ xfs_ialloc_inode_init( * they track in the AIL as if they were physically logged. */ if (tp) - xfs_icreate_log(tp, agno, agbno, mp->m_ialloc_inos, + xfs_icreate_log(tp, agno, agbno, icount, mp->m_sb.sb_inodesize, length, gen); } else version = 2; @@ -347,6 +378,214 @@ xfs_ialloc_inode_init( } /* + * Align startino and allocmask for a recently allocated sparse chunk such that + * they are fit for insertion (or merge) into the on-disk inode btrees. + * + * Background: + * + * When enabled, sparse inode support increases the inode alignment from cluster + * size to inode chunk size. 
This means that the minimum range between two + * non-adjacent inode records in the inobt is large enough for a full inode + * record. This allows for cluster-sized, cluster-aligned block allocation + * without needing to worry about whether the resulting inode record overlaps with + * another record in the tree. Without this basic rule, we would have to deal + * with the consequences of overlap by potentially undoing recent allocations in + * the inode allocation codepath. + * + * Because of this alignment rule (which is enforced on mount), there are two + * inobt possibilities for newly allocated sparse chunks. One is that the + * aligned inode record for the chunk covers a range of inodes not already + * covered in the inobt (i.e., it is safe to insert a new sparse record). The + * other is that a record already exists at the aligned startino that considers + * the newly allocated range as sparse. In the latter case, record content is + * merged in the hope that sparse inode chunks fill to full chunks over time. + */ +STATIC void +xfs_align_sparse_ino( + struct xfs_mount *mp, + xfs_agino_t *startino, + uint16_t *allocmask) +{ + xfs_agblock_t agbno; + xfs_agblock_t mod; + int offset; + + agbno = XFS_AGINO_TO_AGBNO(mp, *startino); + mod = agbno % mp->m_sb.sb_inoalignmt; + if (!mod) + return; + + /* calculate the inode offset and align startino */ + offset = mod << mp->m_sb.sb_inopblog; + *startino -= offset; + + /* + * Since startino has been aligned down, left shift allocmask such that + * it continues to represent the same physical inodes relative to the + * new startino. + */ + *allocmask <<= offset / XFS_INODES_PER_HOLEMASK_BIT; +} + +/* + * Determine whether the source inode record can merge into the target. Both + * records must be sparse, the inode ranges must match and there must be no + * allocation overlap between the records. + */ +STATIC bool +__xfs_inobt_can_merge( + struct xfs_inobt_rec_incore *trec, /* tgt record */ + struct xfs_inobt_rec_incore *srec) /* src record */ +{ + uint64_t talloc; + uint64_t salloc; + + /* records must cover the same inode range */ + if (trec->ir_startino != srec->ir_startino) + return false; + + /* both records must be sparse */ + if (!xfs_inobt_issparse(trec->ir_holemask) || + !xfs_inobt_issparse(srec->ir_holemask)) + return false; + + /* both records must track some inodes */ + if (!trec->ir_count || !srec->ir_count) + return false; + + /* can't exceed capacity of a full record */ + if (trec->ir_count + srec->ir_count > XFS_INODES_PER_CHUNK) + return false; + + /* verify there is no allocation overlap */ + talloc = xfs_inobt_irec_to_allocmask(trec); + salloc = xfs_inobt_irec_to_allocmask(srec); + if (talloc & salloc) + return false; + + return true; +} + +/* + * Merge the source inode record into the target. The caller must call + * __xfs_inobt_can_merge() to ensure the merge is valid. + */ +STATIC void +__xfs_inobt_rec_merge( + struct xfs_inobt_rec_incore *trec, /* target */ + struct xfs_inobt_rec_incore *srec) /* src */ +{ + ASSERT(trec->ir_startino == srec->ir_startino); + + /* combine the counts */ + trec->ir_count += srec->ir_count; + trec->ir_freecount += srec->ir_freecount; + + /* + * Merge the holemask and free mask. For both fields, 0 bits refer to + * allocated inodes. We combine the allocated ranges with bitwise AND. + */ + trec->ir_holemask &= srec->ir_holemask; + trec->ir_free &= srec->ir_free; +} + +/* + * Insert a new sparse inode chunk into the associated inode btree.
The inode + * record for the sparse chunk is pre-aligned to a startino that should match + * any pre-existing sparse inode record in the tree. This allows sparse chunks + * to fill over time. + * + * This function supports two modes of handling preexisting records depending on + * the merge flag. If merge is true, the provided record is merged with the + * existing record and updated in place. The merged record is returned in nrec. + * If merge is false, an existing record is replaced with the provided record. + * If no preexisting record exists, the provided record is always inserted. + * + * It is considered corruption if a merge is requested and not possible. Given + * the sparse inode alignment constraints, this should never happen. + */ +STATIC int +xfs_inobt_insert_sprec( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *agbp, + int btnum, + struct xfs_inobt_rec_incore *nrec, /* in/out: new/merged rec. */ + bool merge) /* merge or replace */ +{ + struct xfs_btree_cur *cur; + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); + int error; + int i; + struct xfs_inobt_rec_incore rec; + + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum); + + /* the new record is pre-aligned so we know where to look */ + error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i); + if (error) + goto error; + /* if nothing there, insert a new record and return */ + if (i == 0) { + error = xfs_inobt_insert_rec(cur, nrec->ir_holemask, + nrec->ir_count, nrec->ir_freecount, + nrec->ir_free, &i); + if (error) + goto error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error); + + goto out; + } + + /* + * A record exists at this startino. Merge or replace the record + * depending on what we've been asked to do. + */ + if (merge) { + error = xfs_inobt_get_rec(cur, &rec, &i); + if (error) + goto error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error); + XFS_WANT_CORRUPTED_GOTO(mp, + rec.ir_startino == nrec->ir_startino, + error); + + /* + * This should never fail. If we have coexisting records that + * cannot merge, something is seriously wrong. + */ + XFS_WANT_CORRUPTED_GOTO(mp, __xfs_inobt_can_merge(nrec, &rec), + error); + + trace_xfs_irec_merge_pre(mp, agno, rec.ir_startino, + rec.ir_holemask, nrec->ir_startino, + nrec->ir_holemask); + + /* merge to nrec to output the updated record */ + __xfs_inobt_rec_merge(nrec, &rec); + + trace_xfs_irec_merge_post(mp, agno, nrec->ir_startino, + nrec->ir_holemask); + + error = xfs_inobt_rec_check_count(mp, nrec); + if (error) + goto error; + } + + error = xfs_inobt_update(cur, nrec); + if (error) + goto error; + +out: + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + return 0; +error: + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; +} + +/* * Allocate new inodes in the allocation group specified by agbp. * Return 0 for success, else error code. */ @@ -364,11 +603,22 @@ xfs_ialloc_ag_alloc( xfs_agino_t newlen; /* new number of inodes */ int isaligned = 0; /* inode allocation at stripe unit */ /* boundary */ + uint16_t allocmask = (uint16_t) -1; /* init. 
to full chunk */ + struct xfs_inobt_rec_incore rec; struct xfs_perag *pag; + int do_sparse = 0; memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = tp->t_mountp; + args.fsbno = NULLFSBLOCK; + +#ifdef DEBUG + /* randomly do sparse inode allocations */ + if (xfs_sb_version_hassparseinodes(&tp->t_mountp->m_sb) && + args.mp->m_ialloc_min_blks < args.mp->m_ialloc_blks) + do_sparse = prandom_u32() & 1; +#endif /* * Locking will ensure that we don't have two callers in here @@ -376,7 +626,8 @@ xfs_ialloc_ag_alloc( */ newlen = args.mp->m_ialloc_inos; if (args.mp->m_maxicount && - args.mp->m_sb.sb_icount + newlen > args.mp->m_maxicount) + percpu_counter_read_positive(&args.mp->m_icount) + newlen > + args.mp->m_maxicount) return -ENOSPC; args.minlen = args.maxlen = args.mp->m_ialloc_blks; /* @@ -389,6 +640,8 @@ xfs_ialloc_ag_alloc( agno = be32_to_cpu(agi->agi_seqno); args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + args.mp->m_ialloc_blks; + if (do_sparse) + goto sparse_alloc; if (likely(newino != NULLAGINO && (args.agbno < be32_to_cpu(agi->agi_length)))) { args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); @@ -427,8 +680,7 @@ xfs_ialloc_ag_alloc( * subsequent requests. */ args.minalignslop = 0; - } else - args.fsbno = NULLFSBLOCK; + } if (unlikely(args.fsbno == NULLFSBLOCK)) { /* @@ -479,6 +731,47 @@ xfs_ialloc_ag_alloc( return error; } + /* + * Finally, try a sparse allocation if the filesystem supports it and + * the sparse allocation length is smaller than a full chunk. + */ + if (xfs_sb_version_hassparseinodes(&args.mp->m_sb) && + args.mp->m_ialloc_min_blks < args.mp->m_ialloc_blks && + args.fsbno == NULLFSBLOCK) { +sparse_alloc: + args.type = XFS_ALLOCTYPE_NEAR_BNO; + args.agbno = be32_to_cpu(agi->agi_root); + args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); + args.alignment = args.mp->m_sb.sb_spino_align; + args.prod = 1; + + args.minlen = args.mp->m_ialloc_min_blks; + args.maxlen = args.minlen; + + /* + * The inode record will be aligned to full chunk size. We must + * prevent sparse allocation from AG boundaries that result in + * invalid inode records, such as records that start at agbno 0 + * or extend beyond the AG. + * + * Set min agbno to the first aligned, non-zero agbno and max to + * the last aligned agbno that is at least one full chunk from + * the end of the AG. + */ + args.min_agbno = args.mp->m_sb.sb_inoalignmt; + args.max_agbno = round_down(args.mp->m_sb.sb_agblocks, + args.mp->m_sb.sb_inoalignmt) - + args.mp->m_ialloc_blks; + + error = xfs_alloc_vextent(&args); + if (error) + return error; + + newlen = args.len << args.mp->m_sb.sb_inopblog; + ASSERT(newlen <= XFS_INODES_PER_CHUNK); + allocmask = (1 << (newlen / XFS_INODES_PER_HOLEMASK_BIT)) - 1; + } + if (args.fsbno == NULLFSBLOCK) { *alloc = 0; return 0; @@ -494,8 +787,8 @@ xfs_ialloc_ag_alloc( * rather than a linear progression to prevent the next generation * number from being easily guessable. */ - error = xfs_ialloc_inode_init(args.mp, tp, NULL, agno, args.agbno, - args.len, prandom_u32()); + error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, agno, + args.agbno, args.len, prandom_u32()); if (error) return error; @@ -503,6 +796,73 @@ xfs_ialloc_ag_alloc( * Convert the results. */ newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0); + + if (xfs_inobt_issparse(~allocmask)) { + /* + * We've allocated a sparse chunk. Align the startino and mask. 
+ */ + xfs_align_sparse_ino(args.mp, &newino, &allocmask); + + rec.ir_startino = newino; + rec.ir_holemask = ~allocmask; + rec.ir_count = newlen; + rec.ir_freecount = newlen; + rec.ir_free = XFS_INOBT_ALL_FREE; + + /* + * Insert the sparse record into the inobt and allow for a merge + * if necessary. If a merge does occur, rec is updated to the + * merged record. + */ + error = xfs_inobt_insert_sprec(args.mp, tp, agbp, XFS_BTNUM_INO, + &rec, true); + if (error == -EFSCORRUPTED) { + xfs_alert(args.mp, + "invalid sparse inode record: ino 0x%llx holemask 0x%x count %u", + XFS_AGINO_TO_INO(args.mp, agno, + rec.ir_startino), + rec.ir_holemask, rec.ir_count); + xfs_force_shutdown(args.mp, SHUTDOWN_CORRUPT_INCORE); + } + if (error) + return error; + + /* + * We can't merge the part we've just allocated as for the inobt + * due to finobt semantics. The original record may or may not + * exist independent of whether physical inodes exist in this + * sparse chunk. + * + * We must update the finobt record based on the inobt record. + * rec contains the fully merged and up to date inobt record + * from the previous call. Set merge false to replace any + * existing record with this one. + */ + if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) { + error = xfs_inobt_insert_sprec(args.mp, tp, agbp, + XFS_BTNUM_FINO, &rec, + false); + if (error) + return error; + } + } else { + /* full chunk - insert new records to both btrees */ + error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen, + XFS_BTNUM_INO); + if (error) + return error; + + if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) { + error = xfs_inobt_insert(args.mp, tp, agbp, newino, + newlen, XFS_BTNUM_FINO); + if (error) + return error; + } + } + + /* + * Update AGI counts and newino. + */ be32_add_cpu(&agi->agi_count, newlen); be32_add_cpu(&agi->agi_freecount, newlen); pag = xfs_perag_get(args.mp, agno); @@ -511,20 +871,6 @@ xfs_ialloc_ag_alloc( agi->agi_newino = cpu_to_be32(newino); /* - * Insert records describing the new inode chunk into the btrees. - */ - error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen, - XFS_BTNUM_INO); - if (error) - return error; - - if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) { - error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen, - XFS_BTNUM_FINO); - if (error) - return error; - } - /* * Log allocation group header fields */ xfs_ialloc_log_agi(tp, agbp, @@ -644,7 +990,7 @@ xfs_ialloc_ag_select( * if we fail allocation due to alignment issues then it is most * likely a real ENOSPC condition. */ - ineed = mp->m_ialloc_blks; + ineed = mp->m_ialloc_min_blks; if (flags && ineed > 1) ineed += xfs_ialloc_cluster_alignment(mp); longest = pag->pagf_longest; @@ -700,7 +1046,7 @@ xfs_ialloc_next_rec( error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); } return 0; @@ -724,13 +1070,34 @@ xfs_ialloc_get_rec( error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); } return 0; } /* + * Return the offset of the first free inode in the record. If the inode chunk + * is sparsely allocated, we convert the record holemask to inode granularity + * and mask off the unallocated regions from the inode free mask. 
+ */ +STATIC int +xfs_inobt_first_free_inode( + struct xfs_inobt_rec_incore *rec) +{ + xfs_inofree_t realfree; + + /* if there are no holes, return the first available offset */ + if (!xfs_inobt_issparse(rec->ir_holemask)) + return xfs_lowbit64(rec->ir_free); + + realfree = xfs_inobt_irec_to_allocmask(rec); + realfree &= rec->ir_free; + + return xfs_lowbit64(realfree); +} + +/* * Allocate an inode using the inobt-only algorithm. */ STATIC int @@ -783,12 +1150,12 @@ xfs_dialloc_ag_inobt( error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); error = xfs_inobt_get_rec(cur, &rec, &j); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(j == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, j == 1, error0); if (rec.ir_freecount > 0) { /* @@ -944,23 +1311,23 @@ newino: error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); for (;;) { error = xfs_inobt_get_rec(cur, &rec, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); if (rec.ir_freecount > 0) break; error = xfs_btree_increment(cur, 0, &i); if (error) goto error0; - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); } alloc_inode: - offset = xfs_lowbit64(rec.ir_free); + offset = xfs_inobt_first_free_inode(&rec); ASSERT(offset >= 0); ASSERT(offset < XFS_INODES_PER_CHUNK); ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % @@ -1016,7 +1383,7 @@ xfs_dialloc_ag_finobt_near( error = xfs_inobt_get_rec(lcur, rec, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(lcur->bc_mp, i == 1); /* * See if we've landed in the parent inode record. The finobt @@ -1039,10 +1406,10 @@ xfs_dialloc_ag_finobt_near( error = xfs_inobt_get_rec(rcur, &rrec, &j); if (error) goto error_rcur; - XFS_WANT_CORRUPTED_GOTO(j == 1, error_rcur); + XFS_WANT_CORRUPTED_GOTO(lcur->bc_mp, j == 1, error_rcur); } - XFS_WANT_CORRUPTED_GOTO(i == 1 || j == 1, error_rcur); + XFS_WANT_CORRUPTED_GOTO(lcur->bc_mp, i == 1 || j == 1, error_rcur); if (i == 1 && j == 1) { /* * Both the left and right records are valid. 
Choose the closer @@ -1095,7 +1462,7 @@ xfs_dialloc_ag_finobt_newino( error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); return 0; } } @@ -1106,12 +1473,12 @@ xfs_dialloc_ag_finobt_newino( error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); error = xfs_inobt_get_rec(cur, rec, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); return 0; } @@ -1133,19 +1500,19 @@ xfs_dialloc_ag_update_inobt( error = xfs_inobt_lookup(cur, frec->ir_startino, XFS_LOOKUP_EQ, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); error = xfs_inobt_get_rec(cur, &rec, &i); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(i == 1); + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1); ASSERT((XFS_AGINO_TO_OFFSET(cur->bc_mp, rec.ir_startino) % XFS_INODES_PER_CHUNK) == 0); rec.ir_free &= ~XFS_INOBT_MASK(offset); rec.ir_freecount--; - XFS_WANT_CORRUPTED_RETURN((rec.ir_free == frec->ir_free) && + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, (rec.ir_free == frec->ir_free) && (rec.ir_freecount == frec->ir_freecount)); return xfs_inobt_update(cur, &rec); @@ -1209,7 +1576,7 @@ xfs_dialloc_ag( if (error) goto error_cur; - offset = xfs_lowbit64(rec.ir_free); + offset = xfs_inobt_first_free_inode(&rec); ASSERT(offset >= 0); ASSERT(offset < XFS_INODES_PER_CHUNK); ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % @@ -1338,9 +1705,13 @@ xfs_dialloc( * If we have already hit the ceiling of inode blocks then clear * okalloc so we scan all available agi structures for a free * inode. + * + * Read a rough value of mp->m_icount via percpu_counter_read_positive(), + * which sacrifices precision but improves performance. */ if (mp->m_maxicount && - mp->m_sb.sb_icount + mp->m_ialloc_inos > mp->m_maxicount) { + percpu_counter_read_positive(&mp->m_icount) + mp->m_ialloc_inos + > mp->m_maxicount) { noroom = 1; okalloc = 0; } @@ -1434,6 +1805,83 @@ out_error: return error; } +/* + * Free the blocks of an inode chunk. We must consider that the inode chunk + * might be sparse and only free the regions that are allocated as part of the + * chunk. + */ +STATIC void +xfs_difree_inode_chunk( + struct xfs_mount *mp, + xfs_agnumber_t agno, + struct xfs_inobt_rec_incore *rec, + struct xfs_bmap_free *flist) +{ + xfs_agblock_t sagbno = XFS_AGINO_TO_AGBNO(mp, rec->ir_startino); + int startidx, endidx; + int nextbit; + xfs_agblock_t agbno; + int contigblk; + DECLARE_BITMAP(holemask, XFS_INOBT_HOLEMASK_BITS); + + if (!xfs_inobt_issparse(rec->ir_holemask)) { + /* not sparse, calculate extent info directly */ + xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, + XFS_AGINO_TO_AGBNO(mp, rec->ir_startino)), + mp->m_ialloc_blks, flist, mp); + return; + } + + /* holemask is only 16-bits (fits in an unsigned long) */ + ASSERT(sizeof(rec->ir_holemask) <= sizeof(holemask[0])); + holemask[0] = rec->ir_holemask; + + /* + * Find contiguous ranges of zeroes (i.e., allocated regions) in the + * holemask and convert the start/end index of each range to an extent. + * We start with the start and end index both pointing at the first 0 in + * the mask.
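[Illustration] On the m_icount ceiling check above: percpu_counter_read_positive() returns a cheap, possibly stale value, which is acceptable for a heuristic ENOSPC test, while a precise answer requires summing every CPU's contribution. A toy userspace model of that tradeoff — a sketch of the idea only, not the kernel's percpu_counter implementation:

#include <stdio.h>

#define NR_CPUS 4

/* toy model: a shared base plus per-cpu deltas that fold in lazily */
struct pcp_counter {
	long base;
	long delta[NR_CPUS];
};

/* fast but approximate: ignores the unfolded per-cpu deltas */
static long pcp_read_positive(const struct pcp_counter *c)
{
	return c->base > 0 ? c->base : 0;
}

/* precise but expensive: folds every per-cpu delta */
static long pcp_sum(const struct pcp_counter *c)
{
	long v = c->base;
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		v += c->delta[cpu];
	return v;
}

int main(void)
{
	struct pcp_counter icount = { 1000, { 3, -1, 7, 2 } };
	long maxicount = 1008, ialloc_inos = 4;

	/* heuristic ceiling check tolerates the slack, as in xfs_dialloc() */
	if (pcp_read_positive(&icount) + ialloc_inos > maxicount)
		puts("over the ceiling (approximate)");
	printf("approximate %ld vs precise %ld\n",
	       pcp_read_positive(&icount), pcp_sum(&icount));
	return 0;
}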
+ */ + startidx = endidx = find_first_zero_bit(holemask, + XFS_INOBT_HOLEMASK_BITS); + nextbit = startidx + 1; + while (startidx < XFS_INOBT_HOLEMASK_BITS) { + nextbit = find_next_zero_bit(holemask, XFS_INOBT_HOLEMASK_BITS, + nextbit); + /* + * If the next zero bit is contiguous, update the end index of + * the current range and continue. + */ + if (nextbit != XFS_INOBT_HOLEMASK_BITS && + nextbit == endidx + 1) { + endidx = nextbit; + goto next; + } + + /* + * nextbit is not contiguous with the current end index. Convert + * the current start/end to an extent and add it to the free + * list. + */ + agbno = sagbno + (startidx * XFS_INODES_PER_HOLEMASK_BIT) / + mp->m_sb.sb_inopblock; + contigblk = ((endidx - startidx + 1) * + XFS_INODES_PER_HOLEMASK_BIT) / + mp->m_sb.sb_inopblock; + + ASSERT(agbno % mp->m_sb.sb_spino_align == 0); + ASSERT(contigblk % mp->m_sb.sb_spino_align == 0); + xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, agbno), contigblk, + flist, mp); + + /* reset range to current bit and carry on... */ + startidx = endidx = nextbit; + +next: + nextbit++; + } +} + STATIC int xfs_difree_inobt( struct xfs_mount *mp, @@ -1441,8 +1889,7 @@ xfs_difree_inobt( struct xfs_buf *agbp, xfs_agino_t agino, struct xfs_bmap_free *flist, - int *deleted, - xfs_ino_t *first_ino, + struct xfs_icluster *xic, struct xfs_inobt_rec_incore *orec) { struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); @@ -1475,14 +1922,14 @@ xfs_difree_inobt( __func__, error); goto error0; } - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); error = xfs_inobt_get_rec(cur, &rec, &i); if (error) { xfs_warn(mp, "%s: xfs_inobt_get_rec() returned error %d.", __func__, error); goto error0; } - XFS_WANT_CORRUPTED_GOTO(i == 1, error0); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error0); /* * Get the offset in the inode chunk. */ @@ -1496,20 +1943,23 @@ xfs_difree_inobt( rec.ir_freecount++; /* - * When an inode cluster is free, it becomes eligible for removal + * When an inode chunk is free, it becomes eligible for removal. Don't + * remove the chunk if the block size is large enough for multiple inode + * chunks (that might not be free). */ if (!(mp->m_flags & XFS_MOUNT_IKEEP) && - (rec.ir_freecount == mp->m_ialloc_inos)) { - - *deleted = 1; - *first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino); + rec.ir_free == XFS_INOBT_ALL_FREE && + mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) { + xic->deleted = 1; + xic->first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino); + xic->alloc = xfs_inobt_irec_to_allocmask(&rec); /* * Remove the inode cluster from the AGI B+Tree, adjust the * AGI and Superblock inode counts, and mark the disk space * to be freed when the transaction is committed. */ - ilen = mp->m_ialloc_inos; + ilen = rec.ir_freecount; be32_add_cpu(&agi->agi_count, -ilen); be32_add_cpu(&agi->agi_freecount, -(ilen - 1)); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT); @@ -1525,11 +1975,9 @@ xfs_difree_inobt( goto error0; } - xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, - XFS_AGINO_TO_AGBNO(mp, rec.ir_startino)), - mp->m_ialloc_blks, flist, mp); + xfs_difree_inode_chunk(mp, agno, &rec, flist); } else { - *deleted = 0; + xic->deleted = 0; error = xfs_inobt_update(cur, &rec); if (error) { @@ -1592,9 +2040,11 @@ xfs_difree_finobt( * freed an inode in a previously fully allocated chunk. If not, * something is out of sync. 
*/ - XFS_WANT_CORRUPTED_GOTO(ibtrec->ir_freecount == 1, error); + XFS_WANT_CORRUPTED_GOTO(mp, ibtrec->ir_freecount == 1, error); - error = xfs_inobt_insert_rec(cur, ibtrec->ir_freecount, + error = xfs_inobt_insert_rec(cur, ibtrec->ir_holemask, + ibtrec->ir_count, + ibtrec->ir_freecount, ibtrec->ir_free, &i); if (error) goto error; @@ -1613,12 +2063,12 @@ xfs_difree_finobt( error = xfs_inobt_get_rec(cur, &rec, &i); if (error) goto error; - XFS_WANT_CORRUPTED_GOTO(i == 1, error); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error); rec.ir_free |= XFS_INOBT_MASK(offset); rec.ir_freecount++; - XFS_WANT_CORRUPTED_GOTO((rec.ir_free == ibtrec->ir_free) && + XFS_WANT_CORRUPTED_GOTO(mp, (rec.ir_free == ibtrec->ir_free) && (rec.ir_freecount == ibtrec->ir_freecount), error); @@ -1629,8 +2079,13 @@ xfs_difree_finobt( * free inode. Hence, if all of the inodes are free and we aren't * keeping inode chunks permanently on disk, remove the record. * Otherwise, update the record with the new information. + * + * Note that we currently can't free chunks when the block size is large + * enough for multiple chunks. Leave the finobt record to remain in sync + * with the inobt. */ - if (rec.ir_freecount == mp->m_ialloc_inos && + if (rec.ir_free == XFS_INOBT_ALL_FREE && + mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK && !(mp->m_flags & XFS_MOUNT_IKEEP)) { error = xfs_btree_delete(cur, &i); if (error) @@ -1666,8 +2121,7 @@ xfs_difree( struct xfs_trans *tp, /* transaction pointer */ xfs_ino_t inode, /* inode to be freed */ struct xfs_bmap_free *flist, /* extents to free */ - int *deleted,/* set if inode cluster was deleted */ - xfs_ino_t *first_ino)/* first inode in deleted cluster */ + struct xfs_icluster *xic) /* cluster info if deleted */ { /* REFERENCED */ xfs_agblock_t agbno; /* block number containing inode */ @@ -1718,8 +2172,7 @@ xfs_difree( /* * Fix up the inode allocation btree. */ - error = xfs_difree_inobt(mp, tp, agbp, agino, flist, deleted, first_ino, - &rec); + error = xfs_difree_inobt(mp, tp, agbp, agino, flist, xic, &rec); if (error) goto error0; diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index 100007d56449..6e450df2979b 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -28,6 +28,13 @@ struct xfs_btree_cur; /* Move inodes in clusters of this size */ #define XFS_INODE_BIG_CLUSTER_SIZE 8192 +struct xfs_icluster { + bool deleted; /* record is deleted */ + xfs_ino_t first_ino; /* first inode number */ + uint64_t alloc; /* inode phys. allocation bitmap for + * sparse chunks */ +}; + /* Calculate and return the number of filesystem blocks per inode cluster */ static inline int xfs_icluster_size_fsb( @@ -44,8 +51,7 @@ xfs_icluster_size_fsb( static inline struct xfs_dinode * xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o) { - return (struct xfs_dinode *) - (xfs_buf_offset(b, o << (mp)->m_sb.sb_inodelog)); + return xfs_buf_offset(b, o << (mp)->m_sb.sb_inodelog); } /* @@ -90,8 +96,7 @@ xfs_difree( struct xfs_trans *tp, /* transaction pointer */ xfs_ino_t inode, /* inode to be freed */ struct xfs_bmap_free *flist, /* extents to free */ - int *deleted, /* set if inode cluster was deleted */ - xfs_ino_t *first_ino); /* first inode in deleted cluster */ + struct xfs_icluster *ifree); /* cluster info if deleted */ /* * Return the location of the inode in imap, for mapping it into a buffer. 
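[Illustration] The xfs_difree_inode_chunk() walk shown above coalesces contiguous runs of clear holemask bits (allocated regions) and frees each run as a single extent. Here is the same walk in self-contained form, under a hypothetical geometry of four inodes per holemask bit and eight inodes per block; it is a sketch of the technique, not the kernel code:

#include <stdint.h>
#include <stdio.h>

#define HOLEMASK_BITS           16
#define INODES_PER_HOLEMASK_BIT 4
#define INODES_PER_BLOCK        8    /* hypothetical sb_inopblock */

/* return the index of the next clear bit at or after 'from' */
static int next_zero(uint16_t mask, int from)
{
	int i;

	for (i = from; i < HOLEMASK_BITS; i++)
		if (!(mask & (1 << i)))
			return i;
	return HOLEMASK_BITS;
}

int main(void)
{
	uint16_t holemask = 0x0ff0;  /* the middle of the chunk is a hole */
	int start, end, next;

	start = end = next_zero(holemask, 0);
	next = start + 1;
	while (start < HOLEMASK_BITS) {
		next = next_zero(holemask, next);
		if (next != HOLEMASK_BITS && next == end + 1) {
			end = next;     /* run still contiguous: extend it */
		} else {
			/* convert the finished run to a block extent */
			int agbno = start * INODES_PER_HOLEMASK_BIT /
				    INODES_PER_BLOCK;
			int blks = (end - start + 1) *
				   INODES_PER_HOLEMASK_BIT / INODES_PER_BLOCK;
			printf("free %d block(s) at agbno %d\n", blks, agbno);
			start = end = next;  /* restart at the next run */
		}
		next++;
	}
	/* prints: free 2 block(s) at agbno 0, free 2 block(s) at agbno 6 */
	return 0;
}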
@@ -156,7 +161,7 @@ int xfs_inobt_get_rec(struct xfs_btree_cur *cur, * Inode chunk initialisation routine */ int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp, - struct list_head *buffer_list, + struct list_head *buffer_list, int icount, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_agblock_t length, unsigned int gen); diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 964c465ca69c..674ad8f760be 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -167,7 +167,16 @@ xfs_inobt_init_rec_from_cur( union xfs_btree_rec *rec) { rec->inobt.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino); - rec->inobt.ir_freecount = cpu_to_be32(cur->bc_rec.i.ir_freecount); + if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) { + rec->inobt.ir_u.sp.ir_holemask = + cpu_to_be16(cur->bc_rec.i.ir_holemask); + rec->inobt.ir_u.sp.ir_count = cur->bc_rec.i.ir_count; + rec->inobt.ir_u.sp.ir_freecount = cur->bc_rec.i.ir_freecount; + } else { + /* ir_holemask/ir_count not supported on-disk */ + rec->inobt.ir_u.f.ir_freecount = + cpu_to_be32(cur->bc_rec.i.ir_freecount); + } rec->inobt.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free); } @@ -418,3 +427,85 @@ xfs_inobt_maxrecs( return blocklen / sizeof(xfs_inobt_rec_t); return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t)); } + +/* + * Convert the inode record holemask to an inode allocation bitmap. The inode + * allocation bitmap is inode granularity and specifies whether an inode is + * physically allocated on disk (not whether the inode is considered allocated + * or free by the fs). + * + * A bit value of 1 means the inode is allocated, a value of 0 means it is free. + */ +uint64_t +xfs_inobt_irec_to_allocmask( + struct xfs_inobt_rec_incore *rec) +{ + uint64_t bitmap = 0; + uint64_t inodespbit; + int nextbit; + uint allocbitmap; + + /* + * The holemask has 16-bits for a 64 inode record. Therefore each + * holemask bit represents multiple inodes. Create a mask of bits to set + * in the allocmask for each holemask bit. + */ + inodespbit = (1 << XFS_INODES_PER_HOLEMASK_BIT) - 1; + + /* + * Allocated inodes are represented by 0 bits in holemask. Invert the 0 + * bits to 1 and convert to a uint so we can use xfs_next_bit(). Mask + * anything beyond the 16 holemask bits since this casts to a larger + * type. + */ + allocbitmap = ~rec->ir_holemask & ((1 << XFS_INOBT_HOLEMASK_BITS) - 1); + + /* + * allocbitmap is the inverted holemask so every set bit represents + * allocated inodes. To expand from 16-bit holemask granularity to + * 64-bit (e.g., bit-per-inode), set inodespbit bits in the target + * bitmap for every holemask bit. + */ + nextbit = xfs_next_bit(&allocbitmap, 1, 0); + while (nextbit != -1) { + ASSERT(nextbit < (sizeof(rec->ir_holemask) * NBBY)); + + bitmap |= (inodespbit << + (nextbit * XFS_INODES_PER_HOLEMASK_BIT)); + + nextbit = xfs_next_bit(&allocbitmap, 1, nextbit + 1); + } + + return bitmap; +} + +#if defined(DEBUG) || defined(XFS_WARN) +/* + * Verify that an in-core inode record has a valid inode count. 
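[Illustration] xfs_inobt_irec_to_allocmask() above expands the 16-bit holemask into a 64-bit per-inode bitmap, and the debug helper described next simply compares the bitmap's population count with ir_count. A compact userspace equivalent, using a compiler builtin where the kernel uses xfs_next_bit(); constants are stand-ins:

#include <stdint.h>
#include <stdio.h>

#define HOLEMASK_BITS           16
#define INODES_PER_HOLEMASK_BIT 4

/* expand a 16-bit holemask into a 64-bit per-inode allocation bitmap */
static uint64_t irec_to_allocmask(uint16_t holemask)
{
	uint64_t bitmap = 0;
	int i;

	for (i = 0; i < HOLEMASK_BITS; i++)
		if (!(holemask & (1 << i)))  /* clear bit == allocated */
			bitmap |= ((1ULL << INODES_PER_HOLEMASK_BIT) - 1)
					<< (i * INODES_PER_HOLEMASK_BIT);
	return bitmap;
}

int main(void)
{
	uint16_t holemask = 0xff00;  /* upper half of the chunk is a hole */
	int ir_count = 32;           /* inodes the record claims exist */
	uint64_t bitmap = irec_to_allocmask(holemask);
	int n = __builtin_popcountll(bitmap);

	printf("allocmask 0x%016llx, popcount %d: %s\n",
	       (unsigned long long)bitmap, n,
	       n == ir_count ? "ok" : "corrupt");
	return 0;
}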
+ */ +int +xfs_inobt_rec_check_count( + struct xfs_mount *mp, + struct xfs_inobt_rec_incore *rec) +{ + int inocount = 0; + int nextbit = 0; + uint64_t allocbmap; + int wordsz; + + wordsz = sizeof(allocbmap) / sizeof(unsigned int); + allocbmap = xfs_inobt_irec_to_allocmask(rec); + + nextbit = xfs_next_bit((uint *) &allocbmap, wordsz, nextbit); + while (nextbit != -1) { + inocount++; + nextbit = xfs_next_bit((uint *) &allocbmap, wordsz, + nextbit + 1); + } + + if (inocount != rec->ir_count) + return -EFSCORRUPTED; + + return 0; +} +#endif /* DEBUG */ diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h index d7ebea72c2d0..bd88453217ce 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.h +++ b/fs/xfs/libxfs/xfs_ialloc_btree.h @@ -62,4 +62,14 @@ extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *, xfs_btnum_t); extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int); +/* ir_holemask to inode allocation bitmap conversion */ +uint64_t xfs_inobt_irec_to_allocmask(struct xfs_inobt_rec_incore *); + +#if defined(DEBUG) || defined(XFS_WARN) +int xfs_inobt_rec_check_count(struct xfs_mount *, + struct xfs_inobt_rec_incore *); +#else +#define xfs_inobt_rec_check_count(mp, rec) 0 +#endif /* DEBUG */ + #endif /* __XFS_IALLOC_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 002b6b3a1988..6526e7696184 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -46,8 +46,7 @@ xfs_inobp_check( j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog; for (i = 0; i < j; i++) { - dip = (xfs_dinode_t *)xfs_buf_offset(bp, - i * mp->m_sb.sb_inodesize); + dip = xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize); if (!dip->di_next_unlinked) { xfs_alert(mp, "Detected bogus zero next_unlinked field in inode %d buffer 0x%llx.", @@ -86,8 +85,7 @@ xfs_inode_buf_verify( int di_ok; xfs_dinode_t *dip; - dip = (struct xfs_dinode *)xfs_buf_offset(bp, - (i << mp->m_sb.sb_inodelog)); + dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog)); di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && XFS_DINODE_GOOD_VERSION(dip->di_version); if (unlikely(XFS_TEST_ERROR(!di_ok, mp, @@ -186,7 +184,7 @@ xfs_imap_to_bp( } *bpp = bp; - *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset); + *dipp = xfs_buf_offset(bp, imap->im_boffset); return 0; } diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index b0a5fe95a3e2..df9851c46b5c 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -111,14 +111,6 @@ xfs_mount_validate_sb( bool check_inprogress, bool check_version) { - - /* - * If the log device and data device have the - * same device number, the log is internal. - * Consequently, the sb_logstart should be non-zero. If - * we have a zero sb_logstart in this case, we may be trying to mount - * a volume filesystem in a non-volume manner. - */ if (sbp->sb_magicnum != XFS_SB_MAGIC) { xfs_warn(mp, "bad magic number"); return -EWRONGFS; @@ -182,6 +174,27 @@ xfs_mount_validate_sb( return -EFSCORRUPTED; } + /* + * Full inode chunks must be aligned to inode chunk size when + * sparse inodes are enabled to support the sparse chunk + * allocation algorithm and prevent overlapping inode records. + */ + if (xfs_sb_version_hassparseinodes(sbp)) { + uint32_t align; + + xfs_alert(mp, + "EXPERIMENTAL sparse inode feature enabled. 
Use at your own risk!"); + + align = XFS_INODES_PER_CHUNK * sbp->sb_inodesize + >> sbp->sb_blocklog; + if (sbp->sb_inoalignmt != align) { + xfs_warn(mp, +"Inode block alignment (%u) must match chunk size (%u) for sparse inodes.", + sbp->sb_inoalignmt, align); + return -EINVAL; + } + } + if (unlikely( sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { xfs_warn(mp, @@ -382,7 +395,7 @@ __xfs_sb_from_disk( be32_to_cpu(from->sb_features_log_incompat); /* crc is only used on disk, not in memory; just init to 0 here. */ to->sb_crc = 0; - to->sb_pad = 0; + to->sb_spino_align = be32_to_cpu(from->sb_spino_align); to->sb_pquotino = be64_to_cpu(from->sb_pquotino); to->sb_lsn = be64_to_cpu(from->sb_lsn); /* Convert on-disk flags to in-memory flags? */ @@ -524,7 +537,7 @@ xfs_sb_to_disk( cpu_to_be32(from->sb_features_incompat); to->sb_features_log_incompat = cpu_to_be32(from->sb_features_log_incompat); - to->sb_pad = 0; + to->sb_spino_align = cpu_to_be32(from->sb_spino_align); to->sb_lsn = cpu_to_be64(from->sb_lsn); } } @@ -697,6 +710,11 @@ xfs_sb_mount_common( mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK, sbp->sb_inopblock); mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog; + + if (sbp->sb_spino_align) + mp->m_ialloc_min_blks = sbp->sb_spino_align; + else + mp->m_ialloc_min_blks = mp->m_ialloc_blks; } /* @@ -743,17 +761,15 @@ xfs_initialize_perag_data( btree += pag->pagf_btreeblks; xfs_perag_put(pag); } - /* - * Overwrite incore superblock counters with just-read data - */ + + /* Overwrite incore superblock counters with just-read data */ spin_lock(&mp->m_sb_lock); sbp->sb_ifree = ifree; sbp->sb_icount = ialloc; sbp->sb_fdblocks = bfree + bfreelst + btree; spin_unlock(&mp->m_sb_lock); - /* Fixup the per-cpu counters as well. */ - xfs_icsb_reinit_counters(mp); + xfs_reinit_percpu_counters(mp); return 0; } @@ -771,6 +787,10 @@ xfs_log_sb( struct xfs_mount *mp = tp->t_mountp; struct xfs_buf *bp = xfs_trans_getsb(tp, mp, 0); + mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount); + mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree); + mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks); + xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb); xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF); xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb)); @@ -798,12 +818,12 @@ xfs_sync_sb( tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_CHANGE, KM_SLEEP); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } xfs_log_sb(tp); if (wait) xfs_trans_set_sync(tp); - return xfs_trans_commit(tp, 0); + return xfs_trans_commit(tp); } diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 8dda4b321343..5be529707903 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -182,12 +182,6 @@ int xfs_log_calc_minimum_size(struct xfs_mount *); #define XFS_TRANS_FREEZE_PROT 0x40 /* Transaction has elevated writer count in superblock */ /* - * Values for call flags parameter. - */ -#define XFS_TRANS_RELEASE_LOG_RES 0x4 -#define XFS_TRANS_ABORT 0x8 - -/* * Field values for xfs_trans_mod_sb. 
*/ #define XFS_TRANS_SB_ICOUNT 0x00000001 diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h index 2d5bdfce6d8f..797815012c0e 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.h +++ b/fs/xfs/libxfs/xfs_trans_resv.h @@ -73,9 +73,9 @@ struct xfs_trans_resv { * 2 trees * (2 blocks/level * max depth - 1) * block size */ #define XFS_ALLOCFREE_LOG_RES(mp,nx) \ - ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * XFS_AG_MAXLEVELS(mp) - 1))) + ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * (mp)->m_ag_maxlevels - 1))) #define XFS_ALLOCFREE_LOG_COUNT(mp,nx) \ - ((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1))) + ((nx) * (2 * (2 * (mp)->m_ag_maxlevels - 1))) /* * Per-directory log reservation for any directory change. diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h index bf9c4579334d..41e0428d8175 100644 --- a/fs/xfs/libxfs/xfs_trans_space.h +++ b/fs/xfs/libxfs/xfs_trans_space.h @@ -67,7 +67,7 @@ #define XFS_DIOSTRAT_SPACE_RES(mp, v) \ (XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + (v)) #define XFS_GROWFS_SPACE_RES(mp) \ - (2 * XFS_AG_MAXLEVELS(mp)) + (2 * (mp)->m_ag_maxlevels) #define XFS_GROWFSRT_SPACE_RES(mp,b) \ ((b) + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK)) #define XFS_LINK_SPACE_RES(mp,nl) \ diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 3a9b7a1b8704..3859f5e27a4d 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -31,7 +31,6 @@ #include "xfs_bmap.h" #include "xfs_bmap_util.h" #include "xfs_bmap_btree.h" -#include <linux/aio.h> #include <linux/gfp.h> #include <linux/mpage.h> #include <linux/pagevec.h> @@ -110,7 +109,7 @@ xfs_setfilesize_trans_alloc( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -146,7 +145,7 @@ xfs_setfilesize( isize = xfs_new_eof(ip, offset + size); if (!isize) { xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return 0; } @@ -156,7 +155,7 @@ xfs_setfilesize( xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - return xfs_trans_commit(tp, 0); + return xfs_trans_commit(tp); } STATIC int @@ -357,7 +356,6 @@ xfs_end_bio( { xfs_ioend_t *ioend = bio->bi_private; - ASSERT(atomic_read(&bio->bi_cnt) >= 1); ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error; /* Toss bio and pass work off to an xfsdatad thread */ @@ -1233,13 +1231,124 @@ xfs_vm_releasepage( return try_to_free_buffers(page); } +/* + * When we map a DIO buffer, we may need to attach an ioend that describes the + * type of write IO we are doing. This passes to the completion function the + * operations it needs to perform. If the mapping is for an overwrite wholly + * within the EOF then we don't need an ioend and so we don't allocate one. + * This avoids the unnecessary overhead of allocating and freeing ioends for + * workloads that don't require transactions on IO completion. + * + * If we get multiple mappings in a single IO, we might be mapping different + * types. 
But because the direct IO can only have a single private pointer, we + * need to ensure that: + * + * a) i) the ioend spans the entire region of unwritten mappings; or + * ii) the ioend spans all the mappings that cross or are beyond EOF; and + * b) if it contains unwritten extents, it is *permanently* marked as such + * + * We could do this by chaining ioends like buffered IO does, but we only + * actually get one IO completion callback from the direct IO, and that spans + * the entire IO regardless of how many mappings and IOs are needed to complete + * the DIO. There is only going to be one reference to the ioend and its life + * cycle is constrained by the DIO completion code. Hence we don't need + * reference counting here. + */ +static void +xfs_map_direct( + struct inode *inode, + struct buffer_head *bh_result, + struct xfs_bmbt_irec *imap, + xfs_off_t offset) +{ + struct xfs_ioend *ioend; + xfs_off_t size = bh_result->b_size; + int type; + + if (ISUNWRITTEN(imap)) + type = XFS_IO_UNWRITTEN; + else + type = XFS_IO_OVERWRITE; + + trace_xfs_gbmap_direct(XFS_I(inode), offset, size, type, imap); + + if (bh_result->b_private) { + ioend = bh_result->b_private; + ASSERT(ioend->io_size > 0); + ASSERT(offset >= ioend->io_offset); + if (offset + size > ioend->io_offset + ioend->io_size) + ioend->io_size = offset - ioend->io_offset + size; + + if (type == XFS_IO_UNWRITTEN && type != ioend->io_type) + ioend->io_type = XFS_IO_UNWRITTEN; + + trace_xfs_gbmap_direct_update(XFS_I(inode), ioend->io_offset, + ioend->io_size, ioend->io_type, + imap); + } else if (type == XFS_IO_UNWRITTEN || + offset + size > i_size_read(inode)) { + ioend = xfs_alloc_ioend(inode, type); + ioend->io_offset = offset; + ioend->io_size = size; + + bh_result->b_private = ioend; + set_buffer_defer_completion(bh_result); + + trace_xfs_gbmap_direct_new(XFS_I(inode), offset, size, type, + imap); + } else { + trace_xfs_gbmap_direct_none(XFS_I(inode), offset, size, type, + imap); + } +} + +/* + * If this is O_DIRECT or the mpage code calling, tell them how large the mapping + * is, so that we can avoid repeated get_blocks calls. + * + * If the mapping spans EOF, then we have to break the mapping up as the mapping + * for blocks beyond EOF must be marked new so that sub block regions can be + * correctly zeroed. We can't do this for mappings within EOF unless the mapping + * was just allocated or is unwritten, otherwise the callers would overwrite + * existing data with zeros. Hence we have to split the mapping into a range up + * to and including EOF, and a second mapping for beyond EOF.
+ */ +static void +xfs_map_trim_size( + struct inode *inode, + sector_t iblock, + struct buffer_head *bh_result, + struct xfs_bmbt_irec *imap, + xfs_off_t offset, + ssize_t size) +{ + xfs_off_t mapping_size; + + mapping_size = imap->br_startoff + imap->br_blockcount - iblock; + mapping_size <<= inode->i_blkbits; + + ASSERT(mapping_size > 0); + if (mapping_size > size) + mapping_size = size; + if (offset < i_size_read(inode) && + offset + mapping_size >= i_size_read(inode)) { + /* limit mapping to block that spans EOF */ + mapping_size = roundup_64(i_size_read(inode) - offset, + 1 << inode->i_blkbits); + } + if (mapping_size > LONG_MAX) + mapping_size = LONG_MAX; + + bh_result->b_size = mapping_size; +} + STATIC int __xfs_get_blocks( struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create, - int direct) + bool direct) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; @@ -1304,6 +1413,7 @@ __xfs_get_blocks( if (error) return error; new = 1; + } else { /* * Delalloc reservations do not require a transaction, @@ -1321,31 +1431,37 @@ __xfs_get_blocks( xfs_iunlock(ip, lockmode); } - - trace_xfs_get_blocks_alloc(ip, offset, size, 0, &imap); + trace_xfs_get_blocks_alloc(ip, offset, size, + ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN + : XFS_IO_DELALLOC, &imap); } else if (nimaps) { - trace_xfs_get_blocks_found(ip, offset, size, 0, &imap); + trace_xfs_get_blocks_found(ip, offset, size, + ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN + : XFS_IO_OVERWRITE, &imap); xfs_iunlock(ip, lockmode); } else { trace_xfs_get_blocks_notfound(ip, offset, size); goto out_unlock; } + /* trim mapping down to size requested */ + if (direct || size > (1 << inode->i_blkbits)) + xfs_map_trim_size(inode, iblock, bh_result, + &imap, offset, size); + + /* + * For unwritten extents do not report a disk address in the buffered + * read case (treat as if we're reading into a hole). + */ if (imap.br_startblock != HOLESTARTBLOCK && - imap.br_startblock != DELAYSTARTBLOCK) { - /* - * For unwritten extents do not report a disk address on - * the read case (treat as if we're reading into a hole). - */ - if (create || !ISUNWRITTEN(&imap)) - xfs_map_buffer(inode, bh_result, &imap, offset); - if (create && ISUNWRITTEN(&imap)) { - if (direct) { - bh_result->b_private = inode; - set_buffer_defer_completion(bh_result); - } + imap.br_startblock != DELAYSTARTBLOCK && + (create || !ISUNWRITTEN(&imap))) { + xfs_map_buffer(inode, bh_result, &imap, offset); + if (ISUNWRITTEN(&imap)) set_buffer_unwritten(bh_result); - } + /* direct IO needs special help */ + if (create && direct) + xfs_map_direct(inode, bh_result, &imap, offset); } /* @@ -1378,39 +1494,6 @@ __xfs_get_blocks( } } - /* - * If this is O_DIRECT or the mpage code calling tell them how large - * the mapping is, so that we can avoid repeated get_blocks calls. - * - * If the mapping spans EOF, then we have to break the mapping up as the - * mapping for blocks beyond EOF must be marked new so that sub block - * regions can be correctly zeroed. We can't do this for mappings within - * EOF unless the mapping was just allocated or is unwritten, otherwise - * the callers would overwrite existing data with zeros. Hence we have - * to split the mapping into a range up to and including EOF, and a - * second mapping for beyond EOF. 
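[Illustration] Worked numbers for the xfs_map_trim_size() logic above, under assumed geometry (4096-byte blocks, in-core EOF at 10000 bytes): a six-block extent is first capped to the requested size, then cut back to the single block that spans EOF so the remainder can be mapped separately and marked new. A standalone sketch of that arithmetic:

#include <stdio.h>

int main(void)
{
	int blkbits = 12;                        /* 4096-byte blocks */
	long long isize = 10000;                 /* in-core EOF */
	long long offset = 8192, size = 16384;   /* requested mapping */
	long long startoff = 2, blockcount = 6, iblock = 2;

	/* size of the extent from the requested block onward, in bytes */
	long long msize = (startoff + blockcount - iblock) << blkbits;

	if (msize > size)
		msize = size;            /* never map more than requested */
	if (offset < isize && offset + msize >= isize)
		/* crosses EOF: stop at the block that spans EOF so blocks
		 * beyond it can be mapped separately and marked new */
		msize = ((isize - offset + (1LL << blkbits) - 1)
				>> blkbits) << blkbits;

	printf("trimmed mapping: %lld bytes\n", msize);  /* 4096 */
	return 0;
}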
- */ - if (direct || size > (1 << inode->i_blkbits)) { - xfs_off_t mapping_size; - - mapping_size = imap.br_startoff + imap.br_blockcount - iblock; - mapping_size <<= inode->i_blkbits; - - ASSERT(mapping_size > 0); - if (mapping_size > size) - mapping_size = size; - if (offset < i_size_read(inode) && - offset + mapping_size >= i_size_read(inode)) { - /* limit mapping to block that spans EOF */ - mapping_size = roundup_64(i_size_read(inode) - offset, - 1 << inode->i_blkbits); - } - if (mapping_size > LONG_MAX) - mapping_size = LONG_MAX; - - bh_result->b_size = mapping_size; - } - return 0; out_unlock: @@ -1425,25 +1508,93 @@ xfs_get_blocks( struct buffer_head *bh_result, int create) { - return __xfs_get_blocks(inode, iblock, bh_result, create, 0); + return __xfs_get_blocks(inode, iblock, bh_result, create, false); } -STATIC int +int xfs_get_blocks_direct( struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { - return __xfs_get_blocks(inode, iblock, bh_result, create, 1); + return __xfs_get_blocks(inode, iblock, bh_result, create, true); +} + +static void +__xfs_end_io_direct_write( + struct inode *inode, + struct xfs_ioend *ioend, + loff_t offset, + ssize_t size) +{ + struct xfs_mount *mp = XFS_I(inode)->i_mount; + + if (XFS_FORCED_SHUTDOWN(mp) || ioend->io_error) + goto out_end_io; + + /* + * dio completion end_io functions are only called on writes if more + * than 0 bytes was written. + */ + ASSERT(size > 0); + + /* + * The ioend only maps whole blocks, while the IO may be sector aligned. + * Hence the ioend offset/size may not match the IO offset/size exactly. + * Because we don't map overwrites within EOF into the ioend, the offset + * may not match, but only if the endio spans EOF. Either way, write + * the IO sizes into the ioend so that completion processing does the + * right thing. + */ + ASSERT(offset + size <= ioend->io_offset + ioend->io_size); + ioend->io_size = size; + ioend->io_offset = offset; + + /* + * The ioend tells us whether we are doing unwritten extent conversion + * or an append transaction that updates the on-disk file size. These + * cases are the only cases where we should *potentially* be needing + * to update the VFS inode size. + * + * We need to update the in-core inode size here so that we don't end up + * with the on-disk inode size being outside the in-core inode size. We + * have no other method of updating EOF for AIO, so always do it here + * if necessary. + * + * We need to lock the test/set EOF update as we can be racing with + * other IO completions here to update the EOF. Failing to serialise + * here can result in EOF moving backwards and Bad Things Happen when + * that occurs. + */ + spin_lock(&XFS_I(inode)->i_flags_lock); + if (offset + size > i_size_read(inode)) + i_size_write(inode, offset + size); + spin_unlock(&XFS_I(inode)->i_flags_lock); + + /* + * If we are doing an append IO that needs to update the EOF on disk, + * do the transaction reserve now so we can use common end io + * processing. Stashing the error (if there is one) in the ioend will + * result in the ioend processing passing on the error if it is + * possible as we can't return it from here. + */ + if (ioend->io_type == XFS_IO_OVERWRITE) + ioend->io_error = xfs_setfilesize_trans_alloc(ioend); + +out_end_io: + xfs_end_io(&ioend->io_work); + return; } /* * Complete a direct I/O write request. 
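[Illustration] A small picture of why __xfs_end_io_direct_write() above tests and sets EOF under i_flags_lock: completions can run concurrently and out of order, and EOF must only ever move forward. A userspace sketch with a mutex standing in for the spinlock (names and sizes are hypothetical):

#include <pthread.h>
#include <stdio.h>

static long long isize;
static pthread_mutex_t isize_lock = PTHREAD_MUTEX_INITIALIZER;

/* completions can race; test and set EOF under one lock so it
 * never moves backwards */
static void dio_complete(long long offset, long long size)
{
	pthread_mutex_lock(&isize_lock);
	if (offset + size > isize)
		isize = offset + size;
	pthread_mutex_unlock(&isize_lock);
}

static void *io_a(void *arg) { dio_complete(0, 8192);    return NULL; }
static void *io_b(void *arg) { dio_complete(8192, 4096); return NULL; }

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, io_a, NULL);
	pthread_create(&b, NULL, io_b, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	printf("isize = %lld\n", isize);   /* 12288 in either order */
	return 0;
}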
* - * If the private argument is non-NULL __xfs_get_blocks signals us that we - * need to issue a transaction to convert the range from unwritten to written - * extents. + * The ioend structure is passed from __xfs_get_blocks() to tell us what to do. + * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite + * wholly within the EOF and so there is nothing for us to do. Note that in this + * case the completion can be called in interrupt context, whereas if we have an + * ioend we will always be called in task context (i.e. from a workqueue). */ STATIC void xfs_end_io_direct_write( @@ -1453,66 +1604,93 @@ xfs_end_io_direct_write( void *private) { struct inode *inode = file_inode(iocb->ki_filp); - struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; + struct xfs_ioend *ioend = private; - if (XFS_FORCED_SHUTDOWN(mp)) + trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size, + ioend ? ioend->io_type : 0, NULL); + + if (!ioend) { + ASSERT(offset + size <= i_size_read(inode)); return; + } - /* - * While the generic direct I/O code updates the inode size, it does - * so only after the end_io handler is called, which means our - * end_io handler thinks the on-disk size is outside the in-core - * size. To prevent this just update it a little bit earlier here. - */ - if (offset + size > i_size_read(inode)) - i_size_write(inode, offset + size); + __xfs_end_io_direct_write(inode, ioend, offset, size); +} + +/* + * For DAX we need a mapping buffer callback for unwritten extent conversion + * when page faults allocate blocks and then zero them. Note that in this + * case the mapping indicated by the ioend may extend beyond EOF. We most + * definitely do not want to extend EOF here, so we trim back the ioend size to + * EOF. + */ +#ifdef CONFIG_FS_DAX +void +xfs_end_io_dax_write( + struct buffer_head *bh, + int uptodate) +{ + struct xfs_ioend *ioend = bh->b_private; + struct inode *inode = ioend->io_inode; + ssize_t size = ioend->io_size; + + ASSERT(IS_DAX(ioend->io_inode)); + + /* if there was an error zeroing, then don't convert it */ + if (!uptodate) + ioend->io_error = -EIO; /* - * For direct I/O we do not know if we need to allocate blocks or not, - * so we can't preallocate an append transaction, as that results in - * nested reservations and log space deadlocks. Hence allocate the - * transaction here. While this is sub-optimal and can block IO - * completion for some time, we're stuck with doing it this way until - * we can pass the ioend to the direct IO allocation callbacks and - * avoid nesting that way. + * Trim update to EOF, so we don't extend EOF during unwritten extent + * conversion of partial EOF blocks. 
*/ - if (private && size > 0) { - xfs_iomap_write_unwritten(ip, offset, size); - } else if (offset + size > ip->i_d.di_size) { - struct xfs_trans *tp; - int error; - - tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0); - if (error) { - xfs_trans_cancel(tp, 0); - return; - } + spin_lock(&XFS_I(inode)->i_flags_lock); + if (ioend->io_offset + size > i_size_read(inode)) + size = i_size_read(inode) - ioend->io_offset; + spin_unlock(&XFS_I(inode)->i_flags_lock); + + __xfs_end_io_direct_write(inode, ioend, ioend->io_offset, size); - xfs_setfilesize(ip, tp, offset, size); - } +} +#else +void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate) { } +#endif + +static inline ssize_t +xfs_vm_do_dio( + struct inode *inode, + struct kiocb *iocb, + struct iov_iter *iter, + loff_t offset, + void (*endio)(struct kiocb *iocb, + loff_t offset, + ssize_t size, + void *private), + int flags) +{ + struct block_device *bdev; + + if (IS_DAX(inode)) + return dax_do_io(iocb, inode, iter, offset, + xfs_get_blocks_direct, endio, 0); + + bdev = xfs_find_bdev_for_inode(inode); + return __blockdev_direct_IO(iocb, inode, bdev, iter, offset, + xfs_get_blocks_direct, endio, NULL, flags); } STATIC ssize_t xfs_vm_direct_IO( - int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t offset) { struct inode *inode = iocb->ki_filp->f_mapping->host; - struct block_device *bdev = xfs_find_bdev_for_inode(inode); - if (rw & WRITE) { - return __blockdev_direct_IO(rw, iocb, inode, bdev, iter, - offset, xfs_get_blocks_direct, - xfs_end_io_direct_write, NULL, - DIO_ASYNC_EXTEND); - } - return __blockdev_direct_IO(rw, iocb, inode, bdev, iter, - offset, xfs_get_blocks_direct, - NULL, NULL, 0); + if (iov_iter_rw(iter) == WRITE) + return xfs_vm_do_dio(inode, iocb, iter, offset, + xfs_end_io_direct_write, DIO_ASYNC_EXTEND); + return xfs_vm_do_dio(inode, iocb, iter, offset, NULL, 0); } /* @@ -1763,6 +1941,7 @@ xfs_vm_set_page_dirty( loff_t end_offset; loff_t offset; int newly_dirty; + struct mem_cgroup *memcg; if (unlikely(!mapping)) return !TestSetPageDirty(page); @@ -1782,6 +1961,11 @@ xfs_vm_set_page_dirty( offset += 1 << inode->i_blkbits; } while (bh != head); } + /* + * Use mem_group_begin_page_stat() to keep PageDirty synchronized with + * per-memcg dirty page counters. + */ + memcg = mem_cgroup_begin_page_stat(page); newly_dirty = !TestSetPageDirty(page); spin_unlock(&mapping->private_lock); @@ -1792,13 +1976,15 @@ xfs_vm_set_page_dirty( spin_lock_irqsave(&mapping->tree_lock, flags); if (page->mapping) { /* Race with truncate? 
*/ WARN_ON_ONCE(!PageUptodate(page)); - account_page_dirtied(page, mapping); + account_page_dirtied(page, mapping, memcg); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } spin_unlock_irqrestore(&mapping->tree_lock, flags); - __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } + mem_cgroup_end_page_stat(memcg); + if (newly_dirty) + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); return newly_dirty; } diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index ac644e0137a4..86afd1ac7895 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -53,7 +53,12 @@ typedef struct xfs_ioend { } xfs_ioend_t; extern const struct address_space_operations xfs_address_space_operations; -extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int); + +int xfs_get_blocks(struct inode *inode, sector_t offset, + struct buffer_head *map_bh, int create); +int xfs_get_blocks_direct(struct inode *inode, sector_t offset, + struct buffer_head *map_bh, int create); +void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate); extern void xfs_count_page_state(struct page *, int *, int *); diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index 83af4c149635..2bb959ada45b 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -132,9 +132,10 @@ xfs_attr3_leaf_inactive( int size; int tmp; int i; + struct xfs_mount *mp = bp->b_target->bt_mount; leaf = bp->b_addr; - xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); /* * Count the number of "remote" value extents. @@ -379,23 +380,30 @@ xfs_attr3_root_inactive( return error; } +/* + * xfs_attr_inactive kills all traces of an attribute fork on an inode. It + * removes both the on-disk and in-memory inode fork. Note that this also has to + * handle the condition of inodes without attributes but with an attribute fork + * configured, so we can't use xfs_inode_hasattr() here. + * + * The in-memory attribute fork is removed even on error. + */ int -xfs_attr_inactive(xfs_inode_t *dp) +xfs_attr_inactive( + struct xfs_inode *dp) { - xfs_trans_t *trans; - xfs_mount_t *mp; - int error; + struct xfs_trans *trans; + struct xfs_mount *mp; + int lock_mode = XFS_ILOCK_SHARED; + int error = 0; mp = dp->i_mount; ASSERT(! XFS_NOT_DQATTACHED(mp, dp)); - xfs_ilock(dp, XFS_ILOCK_SHARED); - if (!xfs_inode_hasattr(dp) || - dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { - xfs_iunlock(dp, XFS_ILOCK_SHARED); - return 0; - } - xfs_iunlock(dp, XFS_ILOCK_SHARED); + xfs_ilock(dp, lock_mode); + if (!XFS_IFORK_Q(dp)) + goto out_destroy_fork; + xfs_iunlock(dp, lock_mode); /* * Start our first transaction of the day. @@ -407,13 +415,17 @@ xfs_attr_inactive(xfs_inode_t *dp) * the inode in every transaction to let it float upward through * the log. */ + lock_mode = 0; trans = xfs_trans_alloc(mp, XFS_TRANS_ATTRINVAL); error = xfs_trans_reserve(trans, &M_RES(mp)->tr_attrinval, 0, 0); - if (error) { - xfs_trans_cancel(trans, 0); - return error; - } - xfs_ilock(dp, XFS_ILOCK_EXCL); + if (error) + goto out_cancel; + + lock_mode = XFS_ILOCK_EXCL; + xfs_ilock(dp, lock_mode); + + if (!XFS_IFORK_Q(dp)) + goto out_cancel; /* * No need to make quota reservations here. We expect to release some @@ -422,28 +434,36 @@ xfs_attr_inactive(xfs_inode_t *dp) xfs_trans_ijoin(trans, dp, 0); /* - * Decide on what work routines to call based on the inode size. + * Invalidate and truncate the attribute fork extents. 
Make sure the + * fork actually has attributes as otherwise the invalidation has no + * blocks to read and returns an error. In this case, just do the fork + * removal below. */ - if (!xfs_inode_hasattr(dp) || - dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) { - error = 0; - goto out; - } - error = xfs_attr3_root_inactive(&trans, dp); - if (error) - goto out; + if (xfs_inode_hasattr(dp) && + dp->i_d.di_aformat != XFS_DINODE_FMT_LOCAL) { + error = xfs_attr3_root_inactive(&trans, dp); + if (error) + goto out_cancel; - error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0); - if (error) - goto out; + error = xfs_itruncate_extents(&trans, dp, XFS_ATTR_FORK, 0); + if (error) + goto out_cancel; + } - error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES); - xfs_iunlock(dp, XFS_ILOCK_EXCL); + /* Reset the attribute fork - this also destroys the in-core fork */ + xfs_attr_fork_remove(dp, trans); + error = xfs_trans_commit(trans); + xfs_iunlock(dp, lock_mode); return error; -out: - xfs_trans_cancel(trans, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); - xfs_iunlock(dp, XFS_ILOCK_EXCL); +out_cancel: + xfs_trans_cancel(trans); +out_destroy_fork: + /* kill the in-core attr fork before we drop the inode lock */ + if (dp->i_afp) + xfs_idestroy_fork(dp, XFS_ATTR_FORK); + if (lock_mode) + xfs_iunlock(dp, lock_mode); return error; } diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index a43d370d2c58..65fb37a18e92 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -225,6 +225,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) int error, i; struct xfs_buf *bp; struct xfs_inode *dp = context->dp; + struct xfs_mount *mp = dp->i_mount; trace_xfs_attr_node_list(context); @@ -256,7 +257,8 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) case XFS_ATTR_LEAF_MAGIC: case XFS_ATTR3_LEAF_MAGIC: leaf = bp->b_addr; - xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf); + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, + &leafhdr, leaf); entries = xfs_attr3_leaf_entryp(leaf); if (cursor->hashval > be32_to_cpu( entries[leafhdr.count - 1].hashval)) { @@ -340,7 +342,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) xfs_trans_brelse(NULL, bp); return error; } - xfs_attr3_leaf_hdr_from_disk(&leafhdr, leaf); + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf); if (context->seen_enough || leafhdr.forw == 0) break; cursor->blkno = leafhdr.forw; @@ -368,11 +370,12 @@ xfs_attr3_leaf_list_int( struct xfs_attr_leaf_entry *entry; int retval; int i; + struct xfs_mount *mp = context->dp->i_mount; trace_xfs_attr_list_leaf(context); leaf = bp->b_addr; - xfs_attr3_leaf_hdr_from_disk(&ichdr, leaf); + xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); entries = xfs_attr3_leaf_entryp(leaf); cursor = context->cursor; diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 22a5dcb70b32..0f34886cf726 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -75,28 +75,20 @@ xfs_bmap_finish( xfs_efi_log_item_t *efi; /* extent free intention */ int error; /* error return value */ xfs_bmap_free_item_t *free; /* free extent item */ - struct xfs_trans_res tres; /* new log reservation */ xfs_mount_t *mp; /* filesystem mount structure */ xfs_bmap_free_item_t *next; /* next item on free list */ - xfs_trans_t *ntp; /* new transaction pointer */ ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); if (flist->xbf_count == 0) { *committed = 0; return 0; } - ntp = *tp; - efi = xfs_trans_get_efi(ntp, flist->xbf_count); + efi = xfs_trans_get_efi(*tp, flist->xbf_count); for (free = 
flist->xbf_first; free; free = free->xbfi_next) - xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock, + xfs_trans_log_efi_extent(*tp, efi, free->xbfi_startblock, free->xbfi_blockcount); - tres.tr_logres = ntp->t_log_res; - tres.tr_logcount = ntp->t_log_count; - tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; - ntp = xfs_trans_dup(*tp); - error = xfs_trans_commit(*tp, 0); - *tp = ntp; + error = xfs_trans_roll(tp, NULL); *committed = 1; /* * We have a new transaction, so we should return committed=1, @@ -105,19 +97,10 @@ xfs_bmap_finish( if (error) return error; - /* - * transaction commit worked ok so we can drop the extra ticket - * reference that we gained in xfs_trans_dup() - */ - xfs_log_ticket_put(ntp->t_ticket); - - error = xfs_trans_reserve(ntp, &tres, 0, 0); - if (error) - return error; - efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count); + efd = xfs_trans_get_efd(*tp, efi, flist->xbf_count); for (free = flist->xbf_first; free != NULL; free = next) { next = free->xbfi_next; - if ((error = xfs_free_extent(ntp, free->xbfi_startblock, + if ((error = xfs_free_extent(*tp, free->xbfi_startblock, free->xbfi_blockcount))) { /* * The bmap free list will be cleaned up at a @@ -127,7 +110,7 @@ xfs_bmap_finish( * happens, since this transaction may not be * dirty yet. */ - mp = ntp->t_mountp; + mp = (*tp)->t_mountp; if (!XFS_FORCED_SHUTDOWN(mp)) xfs_force_shutdown(mp, (error == -EFSCORRUPTED) ? @@ -135,7 +118,7 @@ xfs_bmap_finish( SHUTDOWN_META_IO_ERROR); return error; } - xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock, + xfs_trans_log_efd_extent(*tp, efd, free->xbfi_startblock, free->xbfi_blockcount); xfs_bmap_del_free(flist, NULL, free); } @@ -878,7 +861,7 @@ xfs_free_eofblocks( if (need_iolock) { if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return -EAGAIN; } } @@ -886,7 +869,7 @@ xfs_free_eofblocks( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); if (error) { ASSERT(XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); if (need_iolock) xfs_iunlock(ip, XFS_IOLOCK_EXCL); return error; @@ -908,12 +891,9 @@ xfs_free_eofblocks( * If we get an error at this point we simply don't * bother truncating the file. */ - xfs_trans_cancel(tp, - (XFS_TRANS_RELEASE_LOG_RES | - XFS_TRANS_ABORT)); + xfs_trans_cancel(tp); } else { - error = xfs_trans_commit(tp, - XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (!error) xfs_inode_clear_eofblocks_tag(ip); } @@ -1026,7 +1006,7 @@ xfs_alloc_file_space( * Free the transaction structure. 
*/ ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); break; } xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -1053,7 +1033,7 @@ xfs_alloc_file_space( goto error0; } - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); if (error) { break; @@ -1077,7 +1057,7 @@ error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag); error1: /* Just cancel transaction */ - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -1133,14 +1113,29 @@ xfs_zero_remaining_bytes( break; ASSERT(imap.br_blockcount >= 1); ASSERT(imap.br_startoff == offset_fsb); + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + + if (imap.br_startblock == HOLESTARTBLOCK || + imap.br_state == XFS_EXT_UNWRITTEN) { + /* skip the entire extent */ + lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + + imap.br_blockcount) - 1; + continue; + } + lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1; if (lastoffset > endoff) lastoffset = endoff; - if (imap.br_startblock == HOLESTARTBLOCK) - continue; - ASSERT(imap.br_startblock != DELAYSTARTBLOCK); - if (imap.br_state == XFS_EXT_UNWRITTEN) + + /* DAX can just zero the backing device directly */ + if (IS_DAX(VFS_I(ip))) { + error = dax_zero_page_range(VFS_I(ip), offset, + lastoffset - offset + 1, + xfs_get_blocks_direct); + if (error) + return error; continue; + } error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp, @@ -1289,7 +1284,7 @@ xfs_free_file_space( * Free the transaction structure. */ ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); break; } xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -1320,7 +1315,7 @@ xfs_free_file_space( goto error0; } - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); } @@ -1330,7 +1325,7 @@ xfs_free_file_space( error0: xfs_bmap_cancel(&free_list); error1: - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); goto out; } @@ -1376,22 +1371,19 @@ out: } /* - * xfs_collapse_file_space() - * This routine frees disk space and shift extent for the given file. - * The first thing we do is to free data blocks in the specified range - * by calling xfs_free_file_space(). It would also sync dirty data - * and invalidate page cache over the region on which collapse range - * is working. And Shift extent records to the left to cover a hole. - * RETURNS: - * 0 on success - * errno on error - * + * @next_fsb will keep track of the extent currently undergoing shift. + * @stop_fsb will keep track of the extent at which we have to stop. + * If we are shifting left, we will start with block (offset + len) and + * shift each extent till last extent. + * If we are shifting right, we will start with last extent inside file space + * and continue until we reach the block corresponding to offset. 
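[Illustration] The direction setup just described can be made concrete with toy numbers (hypothetical 4096-byte blocks; -1 stands in for the NULLFSBLOCK sentinel that tells xfs_bmap_shift_extents() to start from the last extent). A sketch of that setup arithmetic only:

#include <stdio.h>

enum shift_direction { SHIFT_LEFT, SHIFT_RIGHT };

int main(void)
{
	int blkbits = 12;   /* 4096-byte blocks */
	long long offset = 16384, len = 8192, isize = 65536;
	long long next_fsb, stop_fsb;
	int d;

	for (d = SHIFT_LEFT; d <= SHIFT_RIGHT; d++) {
		if (d == SHIFT_LEFT) {
			/* collapse: walk from offset+len up to EOF */
			next_fsb = (offset + len) >> blkbits;
			stop_fsb = isize >> blkbits;
		} else {
			/* insert: start from the last extent (sentinel)
			 * and walk back down to offset */
			next_fsb = -1;
			stop_fsb = offset >> blkbits;
		}
		printf("%s: next_fsb=%lld stop_fsb=%lld\n",
		       d == SHIFT_LEFT ? "collapse" : "insert",
		       next_fsb, stop_fsb);
	}
	return 0;
}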
*/ -int -xfs_collapse_file_space( - struct xfs_inode *ip, - xfs_off_t offset, - xfs_off_t len) +static int +xfs_shift_file_space( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t len, + enum shift_direction direction) { int done = 0; struct xfs_mount *mp = ip->i_mount; @@ -1400,21 +1392,26 @@ xfs_collapse_file_space( struct xfs_bmap_free free_list; xfs_fsblock_t first_block; int committed; - xfs_fileoff_t start_fsb; + xfs_fileoff_t stop_fsb; xfs_fileoff_t next_fsb; xfs_fileoff_t shift_fsb; - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT); - trace_xfs_collapse_file_space(ip); + if (direction == SHIFT_LEFT) { + next_fsb = XFS_B_TO_FSB(mp, offset + len); + stop_fsb = XFS_B_TO_FSB(mp, VFS_I(ip)->i_size); + } else { + /* + * If right shift, delegate the work of initialization of + * next_fsb to xfs_bmap_shift_extent as it has ilock held. + */ + next_fsb = NULLFSBLOCK; + stop_fsb = XFS_B_TO_FSB(mp, offset); + } - next_fsb = XFS_B_TO_FSB(mp, offset + len); shift_fsb = XFS_B_TO_FSB(mp, len); - error = xfs_free_file_space(ip, offset, len); - if (error) - return error; - /* * Trim eofblocks to avoid shifting uninitialized post-eof preallocation * into the accessible region of the file. @@ -1427,20 +1424,28 @@ xfs_collapse_file_space( /* * Writeback and invalidate cache for the remainder of the file as we're - * about to shift down every extent from the collapse range to EOF. The - * free of the collapse range above might have already done some of - * this, but we shouldn't rely on it to do anything outside of the range - * that was freed. + * about to shift down every extent from offset to EOF. */ error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, - offset + len, -1); + offset, -1); if (error) return error; error = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, - (offset + len) >> PAGE_CACHE_SHIFT, -1); + offset >> PAGE_CACHE_SHIFT, -1); if (error) return error; + /* + * The extent shifting code works on extent granularity. So, if + * stop_fsb is not the starting block of an extent, we need to split + * the extent at stop_fsb.
+ */ + if (direction == SHIFT_RIGHT) { + error = xfs_bmap_split_extent(ip, stop_fsb); + if (error) + return error; + } + while (!error && !done) { tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); /* @@ -1452,7 +1457,7 @@ xfs_collapse_file_space( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, XFS_DIOSTRAT_SPACE_RES(mp, 0), 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); break; } @@ -1464,7 +1469,7 @@ xfs_collapse_file_space( if (error) goto out; - xfs_trans_ijoin(tp, ip, 0); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_bmap_init(&free_list, &first_block); @@ -1472,10 +1477,9 @@ xfs_collapse_file_space( * We are using the write transaction in which max 2 bmbt * updates are allowed */ - start_fsb = next_fsb; - error = xfs_bmap_shift_extents(tp, ip, start_fsb, shift_fsb, - &done, &next_fsb, &first_block, &free_list, - XFS_BMAP_MAX_SHIFT_EXTENTS); + error = xfs_bmap_shift_extents(tp, ip, &next_fsb, shift_fsb, + &done, stop_fsb, &first_block, &free_list, + direction, XFS_BMAP_MAX_SHIFT_EXTENTS); if (error) goto out; @@ -1483,19 +1487,71 @@ xfs_collapse_file_space( if (error) goto out; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); - xfs_iunlock(ip, XFS_ILOCK_EXCL); + error = xfs_trans_commit(tp); } return error; out: - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); - xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_trans_cancel(tp); return error; } /* + * xfs_collapse_file_space() + * This routine frees disk space and shifts extents for the given file. + * The first thing we do is to free data blocks in the specified range + * by calling xfs_free_file_space(). It also syncs dirty data and + * invalidates the page cache over the region on which the collapse + * range operates, and shifts extent records to the left to cover the hole. + * RETURNS: + * 0 on success + * errno on error + * + */ +int +xfs_collapse_file_space( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t len) +{ + int error; + + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + trace_xfs_collapse_file_space(ip); + + error = xfs_free_file_space(ip, offset, len); + if (error) + return error; + + return xfs_shift_file_space(ip, offset, len, SHIFT_LEFT); +} + +/* + * xfs_insert_file_space() + * This routine creates hole space by shifting extents for the given file. + * The first thing we do is to sync dirty data and invalidate the page cache + * over the region on which the insert range operates. We then split an extent + * into two extents at the given offset by calling xfs_bmap_split_extent, + * and shift all extent records lying between [offset, + * last allocated extent] to the right to make room for the hole. + * RETURNS: + * 0 on success + * errno on error + */ +int +xfs_insert_file_space( + struct xfs_inode *ip, + loff_t offset, + loff_t len) +{ + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + trace_xfs_insert_file_space(ip); + + return xfs_shift_file_space(ip, offset, len, SHIFT_RIGHT); +} + +/* * We need to check that the format of the data fork in the temporary inode is * valid for the target inode before doing the swap. This is not a problem with * attr1 because of the fixed fork offset, but attr2 has a dynamically sized @@ -1599,13 +1655,6 @@ xfs_swap_extent_flush( /* Verify O_DIRECT for ftmp */ if (VFS_I(ip)->i_mapping->nrpages) return -EINVAL; - - /* - * Don't try to swap extents on mmap()d files because we can't lock - * out races against page faults safely.
- */ - if (mapping_mapped(VFS_I(ip)->i_mapping)) - return -EBUSY; return 0; } @@ -1633,13 +1682,14 @@ xfs_swap_extents( } /* - * Lock up the inodes against other IO and truncate to begin with. - * Then we can ensure the inodes are flushed and have no page cache - * safely. Once we have done this we can take the ilocks and do the rest - * of the checks. + * Lock the inodes against other IO, page faults and truncate to + * begin with. Then we can safely ensure the inodes are flushed and + * have no page cache. Once we have done this we can take the ilocks + * and do the rest of the checks. */ - lock_flags = XFS_IOLOCK_EXCL; + lock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL); + xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL); /* Verify that both files have the same format */ if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) { @@ -1663,11 +1713,19 @@ xfs_swap_extents( tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); goto out_unlock; } + + /* + * Lock and join the inodes to the transaction so that transaction commit + * or cancel will unlock the inodes from this point onwards. + */ xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); lock_flags |= XFS_ILOCK_EXCL; + xfs_trans_ijoin(tp, ip, lock_flags); + xfs_trans_ijoin(tp, tip, lock_flags); + /* Verify all data are being swapped */ if (sxp->sx_offset != 0 || @@ -1720,9 +1778,6 @@ xfs_swap_extents( goto out_trans_cancel; } - xfs_trans_ijoin(tp, ip, lock_flags); - xfs_trans_ijoin(tp, tip, lock_flags); - /* * Before we've swapped the forks, lets set the owners of the forks * appropriately. We have to do this as we are demand paging the btree @@ -1841,7 +1896,7 @@ xfs_swap_extents( if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); trace_xfs_swap_extent_after(ip, 0); trace_xfs_swap_extent_after(tip, 1); @@ -1855,6 +1910,6 @@ out_unlock: goto out; out_trans_cancel: - xfs_trans_cancel(tp, 0); - goto out_unlock; + xfs_trans_cancel(tp); + goto out; } diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index 736429a72a12..af97d9a1dfb4 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -63,6 +63,8 @@ int xfs_zero_file_space(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t len); int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset, xfs_off_t len); +int xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset, + xfs_off_t len); /* EOF block manipulation functions */ bool xfs_can_free_eofblocks(struct xfs_inode *ip, bool force); diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 1790b00bea7a..a4b7d92e946c 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1419,9 +1419,9 @@ xfs_buf_submit_wait( return error; } -xfs_caddr_t +void * xfs_buf_offset( - xfs_buf_t *bp, + struct xfs_buf *bp, size_t offset) { struct page *page; @@ -1431,7 +1431,7 @@ xfs_buf_offset( offset += bp->b_offset; page = bp->b_pages[offset >> PAGE_SHIFT]; - return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1)); + return page_address(page) + (offset & (PAGE_SIZE-1)); } /* diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 75ff5d5a7d2e..331c1ccf8264 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -299,7 +299,7 @@ extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO) /* Buffer Utility Routines */ -extern xfs_caddr_t 
xfs_buf_offset(xfs_buf_t *, size_t); +extern void *xfs_buf_offset(struct xfs_buf *, size_t); /* Delayed Write Buffer Routines */ extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *); diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 507d96a57ac7..092d652bc03d 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -537,9 +537,9 @@ xfs_buf_item_push( /* has a previous flush failed due to IO errors? */ if ((bp->b_flags & XBF_WRITE_FAIL) && - ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS:")) { + ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS: Failing async write")) { xfs_warn(bp->b_target->bt_mount, -"Detected failing async write on buffer block 0x%llx. Retrying async write.", +"Failing async write on buffer block 0x%llx. Retrying async write.", (long long)bp->b_bn); } diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 799e5a2d334d..e85a9519a5ae 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -84,7 +84,7 @@ xfs_trim_extents( error = xfs_alloc_get_rec(cur, &fbno, &flen, &i); if (error) goto out_del_cursor; - XFS_WANT_CORRUPTED_GOTO(i == 1, out_del_cursor); + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_del_cursor); ASSERT(flen <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_longest)); /* diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 02c01bbbc789..4143dc75dca4 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -568,8 +568,6 @@ xfs_qm_dqread( struct xfs_buf *bp; struct xfs_trans *tp = NULL; int error; - int cancelflags = 0; - dqp = kmem_zone_zalloc(xfs_qm_dqzone, KM_SLEEP); @@ -617,7 +615,6 @@ xfs_qm_dqread( XFS_QM_DQALLOC_SPACE_RES(mp), 0); if (error) goto error1; - cancelflags = XFS_TRANS_RELEASE_LOG_RES; } /* @@ -632,7 +629,6 @@ xfs_qm_dqread( * allocate (ENOENT). */ trace_xfs_dqread_fail(dqp); - cancelflags |= XFS_TRANS_ABORT; goto error1; } @@ -670,7 +666,7 @@ xfs_qm_dqread( xfs_trans_brelse(tp, bp); if (tp) { - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) goto error0; } @@ -680,7 +676,7 @@ xfs_qm_dqread( error1: if (tp) - xfs_trans_cancel(tp, cancelflags); + xfs_trans_cancel(tp); error0: xfs_qm_dqdestroy(dqp); *O_dqpp = NULL; diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 3ee186ac1093..74d0e5966ebc 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -127,11 +127,11 @@ xfs_error_report( struct xfs_mount *mp, const char *filename, int linenum, - inst_t *ra) + void *ra) { if (level <= xfs_error_level) { xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT, - "Internal error %s at line %d of file %s. Caller %pF", + "Internal error %s at line %d of file %s. 
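The xfs_caddr_t removal does not change the address arithmetic in xfs_buf_offset(): a linear offset into a multi-page buffer is split into a page index and a byte offset within that page. A standalone sketch of that math, with a hypothetical buf_offset() helper and 4k pages assumed:

#include <stddef.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1u << PAGE_SHIFT)

/* map a linear offset within a multi-page buffer to an address */
static void *buf_offset(void **pages, size_t offset)
{
        void *page = pages[offset >> PAGE_SHIFT];         /* page index */

        return (char *)page + (offset & (PAGE_SIZE - 1)); /* byte in page */
}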
Caller %pS", tag, linenum, filename, ra); xfs_stack_trace(); @@ -146,7 +146,7 @@ xfs_corruption_error( void *p, const char *filename, int linenum, - inst_t *ra) + void *ra) { if (level <= xfs_error_level) xfs_hex_dump(p, 64); diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 279a76e52791..4ed3042a0f16 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -21,10 +21,10 @@ struct xfs_mount; extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp, - const char *filename, int linenum, inst_t *ra); + const char *filename, int linenum, void *ra); extern void xfs_corruption_error(const char *tag, int level, struct xfs_mount *mp, void *p, const char *filename, - int linenum, inst_t *ra); + int linenum, void *ra); extern void xfs_verifier_error(struct xfs_buf *bp); #define XFS_ERROR_REPORT(e, lvl, mp) \ @@ -40,25 +40,25 @@ extern void xfs_verifier_error(struct xfs_buf *bp); /* * Macros to set EFSCORRUPTED & return/branch. */ -#define XFS_WANT_CORRUPTED_GOTO(x,l) \ +#define XFS_WANT_CORRUPTED_GOTO(mp, x, l) \ { \ int fs_is_ok = (x); \ ASSERT(fs_is_ok); \ if (unlikely(!fs_is_ok)) { \ XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_GOTO", \ - XFS_ERRLEVEL_LOW, NULL); \ + XFS_ERRLEVEL_LOW, mp); \ error = -EFSCORRUPTED; \ goto l; \ } \ } -#define XFS_WANT_CORRUPTED_RETURN(x) \ +#define XFS_WANT_CORRUPTED_RETURN(mp, x) \ { \ int fs_is_ok = (x); \ ASSERT(fs_is_ok); \ if (unlikely(!fs_is_ok)) { \ XFS_ERROR_REPORT("XFS_WANT_CORRUPTED_RETURN", \ - XFS_ERRLEVEL_LOW, NULL); \ + XFS_ERRLEVEL_LOW, mp); \ return -EFSCORRUPTED; \ } \ } diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index b97359ba2648..652cd3c5b58c 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c @@ -215,7 +215,7 @@ xfs_fs_get_parent( int error; struct xfs_inode *cip; - error = xfs_lookup(XFS_I(child->d_inode), &xfs_name_dotdot, &cip, NULL); + error = xfs_lookup(XFS_I(d_inode(child)), &xfs_name_dotdot, &cip, NULL); if (unlikely(error)) return ERR_PTR(error); diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index cb7fe64cdbfa..adc8f8fdd145 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -239,7 +239,7 @@ xfs_efi_init( xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); efip->efi_format.efi_nextents = nextents; - efip->efi_format.efi_id = (__psint_t)(void*)efip; + efip->efi_format.efi_id = (uintptr_t)(void *)efip; atomic_set(&efip->efi_next_extent, 0); atomic_set(&efip->efi_refcount, 2); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index a2e1cb8a568b..db4acc1c3e73 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -38,10 +38,10 @@ #include "xfs_icache.h" #include "xfs_pnfs.h" -#include <linux/aio.h> #include <linux/dcache.h> #include <linux/falloc.h> #include <linux/pagevec.h> +#include <linux/backing-dev.h> static const struct vm_operations_struct xfs_file_vm_ops; @@ -80,14 +80,15 @@ xfs_rw_ilock_demote( } /* - * xfs_iozero + * xfs_iozero clears the specified range supplied via the page cache (except in + * the DAX case). Writes through the page cache will allocate blocks over holes, + * though the callers usually map the holes first and avoid them. If a block is + * not completely zeroed, then it will be read from disk before being partially + * zeroed. * - * xfs_iozero clears the specified range of buffer supplied, - * and marks all the affected blocks as valid and modified. If - * an affected block is not allocated, it will be allocated. 
If - * an affected block is not completely overwritten, and is not - * valid before the operation, it will be read from disk before - * being partially zeroed. + * In the DAX case, we can just directly write to the underlying pages. This + * will not allocate blocks, but will avoid holes and unwritten extents and so + * not do unnecessary work. */ int xfs_iozero( @@ -97,7 +98,8 @@ xfs_iozero( { struct page *page; struct address_space *mapping; - int status; + int status = 0; + mapping = VFS_I(ip)->i_mapping; do { @@ -109,23 +111,30 @@ xfs_iozero( if (bytes > count) bytes = count; - status = pagecache_write_begin(NULL, mapping, pos, bytes, - AOP_FLAG_UNINTERRUPTIBLE, - &page, &fsdata); - if (status) - break; + if (IS_DAX(VFS_I(ip))) { + status = dax_zero_page_range(VFS_I(ip), pos, bytes, + xfs_get_blocks_direct); + if (status) + break; + } else { + status = pagecache_write_begin(NULL, mapping, pos, bytes, + AOP_FLAG_UNINTERRUPTIBLE, + &page, &fsdata); + if (status) + break; - zero_user(page, offset, bytes); + zero_user(page, offset, bytes); - status = pagecache_write_end(NULL, mapping, pos, bytes, bytes, - page, fsdata); - WARN_ON(status <= 0); /* can't return less than zero! */ + status = pagecache_write_end(NULL, mapping, pos, bytes, + bytes, page, fsdata); + WARN_ON(status <= 0); /* can't return less than zero! */ + status = 0; + } pos += bytes; count -= bytes; - status = 0; } while (count); - return (-status); + return status; } int @@ -139,7 +148,7 @@ xfs_update_prealloc_flags( tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID); error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -161,7 +170,7 @@ xfs_update_prealloc_flags( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); if (flags & XFS_PREALLOC_SYNC) xfs_trans_set_sync(tp); - return xfs_trans_commit(tp, 0); + return xfs_trans_commit(tp); } /* @@ -280,12 +289,12 @@ xfs_file_read_iter( XFS_STATS_INC(xs_read_calls); - if (unlikely(file->f_flags & O_DIRECT)) + if (unlikely(iocb->ki_flags & IOCB_DIRECT)) ioflags |= XFS_IO_ISDIRECT; if (file->f_mode & FMODE_NOCMTIME) ioflags |= XFS_IO_INVIS; - if (unlikely(ioflags & XFS_IO_ISDIRECT)) { + if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) { xfs_buftarg_t *target = XFS_IS_REALTIME_INODE(ip) ? 
mp->m_rtdev_targp : mp->m_ddev_targp; @@ -379,7 +388,11 @@ xfs_file_splice_read( trace_xfs_file_splice_read(ip, count, *ppos, ioflags); - ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); + /* for dax, we need to avoid the page cache */ + if (IS_DAX(VFS_I(ip))) + ret = default_file_splice_read(infilp, ppos, pipe, count, flags); + else + ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); if (ret > 0) XFS_STATS_ADD(xs_read_bytes, ret); @@ -545,44 +558,74 @@ xfs_zero_eof( */ STATIC ssize_t xfs_file_aio_write_checks( - struct file *file, - loff_t *pos, - size_t *count, + struct kiocb *iocb, + struct iov_iter *from, int *iolock) { + struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); - int error = 0; + ssize_t error = 0; + size_t count = iov_iter_count(from); restart: - error = generic_write_checks(file, pos, count, S_ISBLK(inode->i_mode)); - if (error) + error = generic_write_checks(iocb, from); + if (error <= 0) return error; - error = xfs_break_layouts(inode, iolock); + error = xfs_break_layouts(inode, iolock, true); if (error) return error; + /* For changing security info in file_remove_privs() we need i_mutex */ + if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) { + xfs_rw_iunlock(ip, *iolock); + *iolock = XFS_IOLOCK_EXCL; + xfs_rw_ilock(ip, *iolock); + goto restart; + } /* * If the offset is beyond the size of the file, we need to zero any * blocks that fall between the existing EOF and the start of this * write. If zeroing is needed and we are currently holding the * iolock shared, we need to update it to exclusive which implies * having to redo all checks before. + * + * We need to serialise against EOF updates that occur in IO + * completions here. We want to make sure that nobody is changing the + * size while we do this check until we have placed an IO barrier (i.e. + * hold the XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. + * The spinlock effectively forms a memory barrier once we have the + * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value + * and hence be able to correctly determine if we need to run zeroing. */ - if (*pos > i_size_read(inode)) { + spin_lock(&ip->i_flags_lock); + if (iocb->ki_pos > i_size_read(inode)) { bool zero = false; + spin_unlock(&ip->i_flags_lock); if (*iolock == XFS_IOLOCK_SHARED) { xfs_rw_iunlock(ip, *iolock); *iolock = XFS_IOLOCK_EXCL; xfs_rw_ilock(ip, *iolock); + iov_iter_reexpand(from, count); + + /* + * We now have an IO submission barrier in place, but + * AIO can do EOF updates during IO completion and hence + * we now need to wait for all of them to drain. Non-AIO + * DIO will have drained before we are given the + * XFS_IOLOCK_EXCL, and so for most cases this wait is a + * no-op. + */ + inode_dio_wait(inode); goto restart; } - error = xfs_zero_eof(ip, *pos, i_size_read(inode), &zero); + error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), &zero); if (error) return error; - } + } else + spin_unlock(&ip->i_flags_lock); /* * Updating the timestamps will grab the ilock again from @@ -601,7 +644,9 @@ restart: * setgid bits if the process is not being run by root. This keeps * people from modifying setuid and setgid binaries. 
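The restart logic in xfs_file_aio_write_checks() follows a common pattern: checks made under the shared iolock must be redone after upgrading to the exclusive lock, because the inode may have changed in the window where no lock was held (rwsems cannot be upgraded in place). A minimal pthreads sketch of the pattern, with hypothetical names:

#include <pthread.h>
#include <stdbool.h>

static pthread_rwlock_t iolock = PTHREAD_RWLOCK_INITIALIZER;

void write_checks(bool *need_exclusive_work)
{
        bool exclusive = false;

        pthread_rwlock_rdlock(&iolock);
restart:
        if (*need_exclusive_work && !exclusive) {
                /* cannot upgrade in place: drop, retake, then re-check */
                pthread_rwlock_unlock(&iolock);
                pthread_rwlock_wrlock(&iolock);
                exclusive = true;
                goto restart;
        }
        /* ... checks are now stable under the lock we hold ... */
        pthread_rwlock_unlock(&iolock);
}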
*/ - return file_remove_suid(file); + if (!IS_NOSEC(inode)) + return file_remove_privs(file); + return 0; } /* @@ -644,11 +689,13 @@ xfs_file_dio_aio_write( int iolock; size_t count = iov_iter_count(from); loff_t pos = iocb->ki_pos; + loff_t end; + struct iov_iter data; struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; /* DIO must be aligned to device logical sector size */ - if ((pos | count) & target->bt_logical_sectormask) + if (!IS_DAX(inode) && ((pos | count) & target->bt_logical_sectormask)) return -EINVAL; /* "unaligned" here means not aligned to a filesystem block */ @@ -679,14 +726,16 @@ xfs_file_dio_aio_write( xfs_rw_ilock(ip, iolock); } - ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock); + ret = xfs_file_aio_write_checks(iocb, from, &iolock); if (ret) goto out; - iov_iter_truncate(from, count); + count = iov_iter_count(from); + pos = iocb->ki_pos; + end = pos + count - 1; if (mapping->nrpages) { ret = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, - pos, pos + count - 1); + pos, end); if (ret) goto out; /* @@ -696,7 +745,7 @@ xfs_file_dio_aio_write( */ ret = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, pos >> PAGE_CACHE_SHIFT, - (pos + count - 1) >> PAGE_CACHE_SHIFT); + end >> PAGE_CACHE_SHIFT); WARN_ON_ONCE(ret); ret = 0; } @@ -713,13 +762,30 @@ xfs_file_dio_aio_write( } trace_xfs_file_direct_write(ip, count, iocb->ki_pos, 0); - ret = generic_file_direct_write(iocb, from, pos); + data = *from; + ret = mapping->a_ops->direct_IO(iocb, &data, pos); + + /* see generic_file_direct_write() for why this is necessary */ + if (mapping->nrpages) { + invalidate_inode_pages2_range(mapping, + pos >> PAGE_CACHE_SHIFT, + end >> PAGE_CACHE_SHIFT); + } + + if (ret > 0) { + pos += ret; + iov_iter_advance(from, ret); + iocb->ki_pos = pos; + } out: xfs_rw_iunlock(ip, iolock); - /* No fallback to buffered IO on errors for XFS. */ - ASSERT(ret < 0 || ret == count); + /* + * No fallback to buffered IO on errors for XFS. DAX can result in + * partial writes, but direct IO will either complete fully or fail. 
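The invalidation bounds in the direct IO path are byte offsets converted to inclusive page indices, with end = pos + count - 1 covering the last byte written. Restated as a standalone helper (sketch; PAGE_SHIFT of 12 assumed):

#define PAGE_SHIFT 12

struct page_range { unsigned long first, last; };

static struct page_range bytes_to_pages(long long pos, long long count)
{
        struct page_range r;

        r.first = pos >> PAGE_SHIFT;                    /* page of first byte */
        r.last = (pos + count - 1) >> PAGE_SHIFT;       /* page of last byte */
        return r;
}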
+ */ + ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip))); return ret; } @@ -735,24 +801,22 @@ xfs_file_buffered_aio_write( ssize_t ret; int enospc = 0; int iolock = XFS_IOLOCK_EXCL; - loff_t pos = iocb->ki_pos; - size_t count = iov_iter_count(from); xfs_rw_ilock(ip, iolock); - ret = xfs_file_aio_write_checks(file, &pos, &count, &iolock); + ret = xfs_file_aio_write_checks(iocb, from, &iolock); if (ret) goto out; - iov_iter_truncate(from, count); /* We can write back this queue in page reclaim */ current->backing_dev_info = inode_to_bdi(inode); write_retry: - trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0); - ret = generic_perform_write(file, from, pos); + trace_xfs_file_buffered_write(ip, iov_iter_count(from), + iocb->ki_pos, 0); + ret = generic_perform_write(file, from, iocb->ki_pos); if (likely(ret >= 0)) - iocb->ki_pos = pos + ret; + iocb->ki_pos += ret; /* * If we hit a space limit, try to free up some lingering preallocated @@ -804,7 +868,7 @@ xfs_file_write_iter( if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; - if (unlikely(file->f_flags & O_DIRECT)) + if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode)) ret = xfs_file_dio_aio_write(iocb, from); else ret = xfs_file_buffered_aio_write(iocb, from); @@ -822,6 +886,11 @@ xfs_file_write_iter( return ret; } +#define XFS_FALLOC_FL_SUPPORTED \ + (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \ + FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | \ + FALLOC_FL_INSERT_RANGE) + STATIC long xfs_file_fallocate( struct file *file, @@ -835,18 +904,21 @@ xfs_file_fallocate( enum xfs_prealloc_flags flags = 0; uint iolock = XFS_IOLOCK_EXCL; loff_t new_size = 0; + bool do_file_insert = 0; if (!S_ISREG(inode->i_mode)) return -EINVAL; - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | - FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE)) + if (mode & ~XFS_FALLOC_FL_SUPPORTED) return -EOPNOTSUPP; xfs_ilock(ip, iolock); - error = xfs_break_layouts(inode, &iolock); + error = xfs_break_layouts(inode, &iolock, false); if (error) goto out_unlock; + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + iolock |= XFS_MMAPLOCK_EXCL; + if (mode & FALLOC_FL_PUNCH_HOLE) { error = xfs_free_file_space(ip, offset, len); if (error) @@ -873,6 +945,27 @@ xfs_file_fallocate( error = xfs_collapse_file_space(ip, offset, len); if (error) goto out_unlock; + } else if (mode & FALLOC_FL_INSERT_RANGE) { + unsigned blksize_mask = (1 << inode->i_blkbits) - 1; + + new_size = i_size_read(inode) + len; + if (offset & blksize_mask || len & blksize_mask) { + error = -EINVAL; + goto out_unlock; + } + + /* check the new inode size does not wrap through zero */ + if (new_size > inode->i_sb->s_maxbytes) { + error = -EFBIG; + goto out_unlock; + } + + /* Offset should be less than i_size */ + if (offset >= i_size_read(inode)) { + error = -EINVAL; + goto out_unlock; + } + do_file_insert = 1; } else { flags |= XFS_PREALLOC_SET; @@ -907,8 +1000,19 @@ xfs_file_fallocate( iattr.ia_valid = ATTR_SIZE; iattr.ia_size = new_size; error = xfs_setattr_size(ip, &iattr); + if (error) + goto out_unlock; } + /* + * Perform hole insertion now that the file size has been + * updated, so that if we crash during the operation we don't + * leave shifted extents past EOF and hence lose access to + * the data contained within them. 
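The new FALLOC_FL_INSERT_RANGE argument checks can be read in isolation. A sketch with the block size and limits passed in as parameters (hypothetical helper, not the kernel code):

#include <errno.h>

/* the FALLOC_FL_INSERT_RANGE argument checks, restated standalone */
static int check_insert_range(long long offset, long long len,
                              long long isize, long long maxbytes,
                              unsigned int blksize)
{
        unsigned int mask = blksize - 1;

        if ((offset & mask) || (len & mask))
                return -EINVAL;         /* must be block aligned */
        if (isize + len > maxbytes)
                return -EFBIG;          /* new size past fs limits */
        if (offset >= isize)
                return -EINVAL;         /* must be inside the file */
        return 0;
}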
+ */ + if (do_file_insert) + error = xfs_insert_file_space(ip, offset, len); + out_unlock: xfs_iunlock(ip, iolock); return error; @@ -985,31 +1089,6 @@ xfs_file_readdir( return xfs_readdir(ip, ctx, bufsize); } -STATIC int -xfs_file_mmap( - struct file *filp, - struct vm_area_struct *vma) -{ - vma->vm_ops = &xfs_file_vm_ops; - - file_accessed(filp); - return 0; -} - -/* - * mmap()d file has taken write protection fault and is being made - * writable. We can set the page state up correctly for a writable - * page, which means we can do correct delalloc accounting (ENOSPC - * checking!) and unwritten extent mapping. - */ -STATIC int -xfs_vm_page_mkwrite( - struct vm_area_struct *vma, - struct vm_fault *vmf) -{ - return block_page_mkwrite(vma, vmf, xfs_get_blocks); -} - /* * This type is designed to indicate the type of offset we would like * to search from page cache for xfs_seek_hole_data(). @@ -1385,10 +1464,101 @@ xfs_file_llseek( } } +/* + * Locking for serialisation of IO during page faults. This results in a lock + * ordering of: + * + * mmap_sem (MM) + * sb_start_pagefault(vfs, freeze) + * i_mmap_lock (XFS - truncate serialisation) + * page_lock (MM) + * i_lock (XFS - extent map serialisation) + */ + +/* + * mmap()d file has taken write protection fault and is being made writable. We + * can set the page state up correctly for a writable page, which means we can + * do correct delalloc accounting (ENOSPC checking!) and unwritten extent + * mapping. + */ +STATIC int +xfs_filemap_page_mkwrite( + struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vma->vm_file); + int ret; + + trace_xfs_filemap_page_mkwrite(XFS_I(inode)); + + sb_start_pagefault(inode->i_sb); + file_update_time(vma->vm_file); + xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); + + if (IS_DAX(inode)) { + ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_direct, + xfs_end_io_dax_write); + } else { + ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks); + ret = block_page_mkwrite_return(ret); + } + + xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); + sb_end_pagefault(inode->i_sb); + + return ret; +} + +STATIC int +xfs_filemap_fault( + struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vma->vm_file); + int ret; + + trace_xfs_filemap_fault(XFS_I(inode)); + + /* DAX can shortcut the normal fault path on write faults! */ + if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(inode)) + return xfs_filemap_page_mkwrite(vma, vmf); + + xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); + if (IS_DAX(inode)) { + /* + * we do not want to trigger unwritten extent conversion on read + * faults - that is unnecessary overhead and would also require + * changes to xfs_get_blocks_direct() to map unwritten extent + * ioend for conversion on read-only mappings. 
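The ->page_mkwrite path added here is driven by the first store into a clean MAP_SHARED page. A userspace sketch that exercises it (assumes the file exists and is at least one page long):

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        char *p;
        int fd;

        if (argc != 2)
                return 1;
        fd = open(argv[1], O_RDWR);
        if (fd < 0) {
                perror("open");
                return 1;
        }
        p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED)
                return 1;
        p[0] = 'x';                     /* write fault -> ->page_mkwrite */
        msync(p, 4096, MS_SYNC);        /* write the dirtied page back */
        munmap(p, 4096);
        close(fd);
        return 0;
}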
+ */ + ret = __dax_fault(vma, vmf, xfs_get_blocks_direct, NULL); + } else + ret = filemap_fault(vma, vmf); + xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); + + return ret; +} + +static const struct vm_operations_struct xfs_file_vm_ops = { + .fault = xfs_filemap_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = xfs_filemap_page_mkwrite, +}; + +STATIC int +xfs_file_mmap( + struct file *filp, + struct vm_area_struct *vma) +{ + file_accessed(filp); + vma->vm_ops = &xfs_file_vm_ops; + if (IS_DAX(file_inode(filp))) + vma->vm_flags |= VM_MIXEDMAP; + return 0; +} + const struct file_operations xfs_file_operations = { .llseek = xfs_file_llseek, - .read = new_sync_read, - .write = new_sync_write, .read_iter = xfs_file_read_iter, .write_iter = xfs_file_write_iter, .splice_read = xfs_file_splice_read, @@ -1415,9 +1585,3 @@ const struct file_operations xfs_dir_file_operations = { #endif .fsync = xfs_dir_fsync, }; - -static const struct vm_operations_struct xfs_file_vm_ops = { - .fault = filemap_fault, - .map_pages = filemap_map_pages, - .page_mkwrite = xfs_vm_page_mkwrite, -}; diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index a2e86e8a0fea..c4c130f9bfb6 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -196,7 +196,8 @@ xfs_filestream_pick_ag( goto next_ag; } - longest = xfs_alloc_longest_free_extent(mp, pag); + longest = xfs_alloc_longest_free_extent(mp, pag, + xfs_alloc_min_freelist(mp, pag)); if (((minlen && longest >= minlen) || (!minlen && pag->pagf_freeblks >= minfree)) && (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) || @@ -294,7 +295,7 @@ xfs_filestream_get_parent( if (!parent) goto out_dput; - dir = igrab(parent->d_inode); + dir = igrab(d_inode(parent)); dput(parent); out_dput: @@ -322,7 +323,7 @@ xfs_filestream_lookup_ag( pip = xfs_filestream_get_parent(ip); if (!pip) - goto out; + return NULLAGNUMBER; mru = xfs_mru_cache_lookup(mp->m_filestream, pip->i_ino); if (mru) { diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 74efe5b760dc..9b3438a7680f 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -101,7 +101,9 @@ xfs_fs_geometry( (xfs_sb_version_hasftype(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_FTYPE : 0) | (xfs_sb_version_hasfinobt(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_FINOBT : 0); + XFS_FSOP_GEOM_FLAGS_FINOBT : 0) | + (xfs_sb_version_hassparseinodes(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_SPINODES : 0); geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ? mp->m_sb.sb_logsectsize : BBSIZE; geo->rtsectsize = mp->m_sb.sb_blocksize; @@ -201,7 +203,7 @@ xfs_growfs_data_private( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata, XFS_GROWFS_SPACE_RES(mp), 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -489,7 +491,7 @@ xfs_growfs_data_private( if (dpct) xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct); xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); if (error) return error; @@ -557,7 +559,7 @@ xfs_growfs_data_private( return saved_error ? 
saved_error : error; error0: - xfs_trans_cancel(tp, XFS_TRANS_ABORT); + xfs_trans_cancel(tp); return error; } @@ -637,12 +639,13 @@ xfs_fs_counts( xfs_mount_t *mp, xfs_fsop_counts_t *cnt) { - xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); + cnt->allocino = percpu_counter_read_positive(&mp->m_icount); + cnt->freeino = percpu_counter_read_positive(&mp->m_ifree); + cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) - + XFS_ALLOC_SET_ASIDE(mp); + spin_lock(&mp->m_sb_lock); - cnt->freedata = mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); cnt->freertx = mp->m_sb.sb_frextents; - cnt->freeino = mp->m_sb.sb_ifree; - cnt->allocino = mp->m_sb.sb_icount; spin_unlock(&mp->m_sb_lock); return 0; } @@ -692,14 +695,9 @@ xfs_reserve_blocks( * what to do. This means that the amount of free space can * change while we do this, so we need to retry if we end up * trying to reserve more space than is available. - * - * We also use the xfs_mod_incore_sb() interface so that we - * don't have to care about whether per cpu counter are - * enabled, disabled or even compiled in.... */ retry: spin_lock(&mp->m_sb_lock); - xfs_icsb_sync_counters_locked(mp, 0); /* * If our previous reservation was larger than the current value, @@ -716,7 +714,8 @@ retry: } else { __int64_t free; - free = mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); + free = percpu_counter_sum(&mp->m_fdblocks) - + XFS_ALLOC_SET_ASIDE(mp); if (!free) goto out; /* ENOSPC and fdblks_delta = 0 */ @@ -755,8 +754,7 @@ out: * the extra reserve blocks from the reserve..... */ int error; - error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, - fdblks_delta, 0); + error = xfs_mod_fdblocks(mp, fdblks_delta, 0); if (error == -ENOSPC) goto retry; } diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 9771b7ef62ed..76a9f2783282 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -439,11 +439,11 @@ again: *ipp = ip; /* - * If we have a real type for an on-disk inode, we can set ops(&unlock) + * If we have a real type for an on-disk inode, we can setup the inode * now. If it's a new inode being created, xfs_ialloc will handle it. */ if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0) - xfs_setup_inode(ip); + xfs_setup_existing_inode(ip); return 0; out_error_or_again: diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 6163767aa856..3da9f4da4f3d 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -117,24 +117,34 @@ xfs_ilock_attr_map_shared( } /* - * The xfs inode contains 2 locks: a multi-reader lock called the - * i_iolock and a multi-reader lock called the i_lock. This routine - * allows either or both of the locks to be obtained. + * The xfs inode contains 3 multi-reader locks: the i_iolock the i_mmap_lock and + * the i_lock. This routine allows various combinations of the locks to be + * obtained. * - * The 2 locks should always be ordered so that the IO lock is - * obtained first in order to prevent deadlock. + * The 3 locks should always be ordered so that the IO lock is obtained first, + * the mmap lock second and the ilock last in order to prevent deadlock. * - * ip -- the inode being locked - * lock_flags -- this parameter indicates the inode's locks - * to be locked. 
It can be: - * XFS_IOLOCK_SHARED, - * XFS_IOLOCK_EXCL, - * XFS_ILOCK_SHARED, - * XFS_ILOCK_EXCL, - * XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED, - * XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL, - * XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED, - * XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL + * Basic locking order: + * + * i_iolock -> i_mmap_lock -> page_lock -> i_ilock + * + * mmap_sem locking order: + * + * i_iolock -> page lock -> mmap_sem + * mmap_sem -> i_mmap_lock -> page_lock + * + * The difference in mmap_sem locking order means that we cannot hold the + * i_mmap_lock over syscall-based read(2)/write(2) IO. These IO paths can + * fault in pages during copy in/out (for buffered IO) or require the mmap_sem + * in get_user_pages() to map the user pages into the kernel address space for + * direct IO. Similarly, the i_iolock cannot be taken inside a page fault because + * page faults already hold the mmap_sem. + * + * Hence, to serialise fully against both syscall and mmap based IO, we need to + * take both the i_iolock and the i_mmap_lock. These locks should *only* be both + * taken in places where we need to invalidate the page cache in a race-free + * manner (e.g. truncate, hole punch and other extent manipulation + * functions). */ void xfs_ilock( @@ -150,6 +160,8 @@ xfs_ilock( */ ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) != + (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); @@ -159,6 +171,11 @@ xfs_ilock( else if (lock_flags & XFS_IOLOCK_SHARED) mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); + if (lock_flags & XFS_MMAPLOCK_EXCL) + mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags)); + else if (lock_flags & XFS_MMAPLOCK_SHARED) + mraccess_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags)); + if (lock_flags & XFS_ILOCK_EXCL) mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); else if (lock_flags & XFS_ILOCK_SHARED) @@ -191,6 +208,8 @@ xfs_ilock_nowait( */ ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) != + (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); @@ -202,21 +221,35 @@ xfs_ilock_nowait( if (!mrtryaccess(&ip->i_iolock)) goto out; } + + if (lock_flags & XFS_MMAPLOCK_EXCL) { + if (!mrtryupdate(&ip->i_mmaplock)) + goto out_undo_iolock; + } else if (lock_flags & XFS_MMAPLOCK_SHARED) { + if (!mrtryaccess(&ip->i_mmaplock)) + goto out_undo_iolock; + } + if (lock_flags & XFS_ILOCK_EXCL) { if (!mrtryupdate(&ip->i_lock)) - goto out_undo_iolock; + goto out_undo_mmaplock; } else if (lock_flags & XFS_ILOCK_SHARED) { if (!mrtryaccess(&ip->i_lock)) - goto out_undo_iolock; + goto out_undo_mmaplock; } return 1; - out_undo_iolock: +out_undo_mmaplock: + if (lock_flags & XFS_MMAPLOCK_EXCL) + mrunlock_excl(&ip->i_mmaplock); + else if (lock_flags & XFS_MMAPLOCK_SHARED) + mrunlock_shared(&ip->i_mmaplock); +out_undo_iolock: if (lock_flags & XFS_IOLOCK_EXCL) mrunlock_excl(&ip->i_iolock); else if (lock_flags & XFS_IOLOCK_SHARED) mrunlock_shared(&ip->i_iolock); - out: +out: return 0; } @@ -244,6 +277,8 @@ xfs_iunlock( */ ASSERT((lock_flags & 
(XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); + ASSERT((lock_flags & (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)) != + (XFS_MMAPLOCK_SHARED | XFS_MMAPLOCK_EXCL)); ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); @@ -254,6 +289,11 @@ xfs_iunlock( else if (lock_flags & XFS_IOLOCK_SHARED) mrunlock_shared(&ip->i_iolock); + if (lock_flags & XFS_MMAPLOCK_EXCL) + mrunlock_excl(&ip->i_mmaplock); + else if (lock_flags & XFS_MMAPLOCK_SHARED) + mrunlock_shared(&ip->i_mmaplock); + if (lock_flags & XFS_ILOCK_EXCL) mrunlock_excl(&ip->i_lock); else if (lock_flags & XFS_ILOCK_SHARED) @@ -271,11 +311,14 @@ xfs_ilock_demote( xfs_inode_t *ip, uint lock_flags) { - ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)); - ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0); + ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)); + ASSERT((lock_flags & + ~(XFS_IOLOCK_EXCL|XFS_MMAPLOCK_EXCL|XFS_ILOCK_EXCL)) == 0); if (lock_flags & XFS_ILOCK_EXCL) mrdemote(&ip->i_lock); + if (lock_flags & XFS_MMAPLOCK_EXCL) + mrdemote(&ip->i_mmaplock); if (lock_flags & XFS_IOLOCK_EXCL) mrdemote(&ip->i_iolock); @@ -294,6 +337,12 @@ xfs_isilocked( return rwsem_is_locked(&ip->i_lock.mr_lock); } + if (lock_flags & (XFS_MMAPLOCK_EXCL|XFS_MMAPLOCK_SHARED)) { + if (!(lock_flags & XFS_MMAPLOCK_SHARED)) + return !!ip->i_mmaplock.mr_writer; + return rwsem_is_locked(&ip->i_mmaplock.mr_lock); + } + if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) { if (!(lock_flags & XFS_IOLOCK_SHARED)) return !!ip->i_iolock.mr_writer; @@ -314,14 +363,27 @@ int xfs_lock_delays; #endif /* - * Bump the subclass so xfs_lock_inodes() acquires each lock with - * a different value + * Bump the subclass so xfs_lock_inodes() acquires each lock with a different + * value. This shouldn't be called for page fault locking, but we also need to + * ensure we don't overrun the number of lockdep subclasses for the iolock or + * mmaplock as that is limited to 12 by the mmap lock lockdep annotations. */ static inline int xfs_lock_inumorder(int lock_mode, int subclass) { - if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) + if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) { + ASSERT(subclass + XFS_LOCK_INUMORDER < + (1 << (XFS_MMAPLOCK_SHIFT - XFS_IOLOCK_SHIFT))); lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT; + } + + if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) { + ASSERT(subclass + XFS_LOCK_INUMORDER < + (1 << (XFS_ILOCK_SHIFT - XFS_MMAPLOCK_SHIFT))); + lock_mode |= (subclass + XFS_LOCK_INUMORDER) << + XFS_MMAPLOCK_SHIFT; + } + if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT; @@ -329,15 +391,14 @@ xfs_lock_inumorder(int lock_mode, int subclass) } /* - * The following routine will lock n inodes in exclusive mode. - * We assume the caller calls us with the inodes in i_ino order. + * The following routine will lock n inodes in exclusive mode. We assume the + * caller calls us with the inodes in i_ino order. * - * We need to detect deadlock where an inode that we lock - * is in the AIL and we start waiting for another inode that is locked - * by a thread in a long running transaction (such as truncate). This can - * result in deadlock since the long running trans might need to wait - * for the inode we just locked in order to push the tail and free space - * in the log. 
+ * We need to detect deadlock where an inode that we lock is in the AIL and we + * start waiting for another inode that is locked by a thread in a long running + * transaction (such as truncate). This can result in deadlock since the long + * running trans might need to wait for the inode we just locked in order to + * push the tail and free space in the log. */ void xfs_lock_inodes( @@ -348,30 +409,27 @@ xfs_lock_inodes( int attempts = 0, i, j, try_lock; xfs_log_item_t *lp; - ASSERT(ips && (inodes >= 2)); /* we need at least two */ + /* currently supports between 2 and 5 inodes */ + ASSERT(ips && inodes >= 2 && inodes <= 5); try_lock = 0; i = 0; - again: for (; i < inodes; i++) { ASSERT(ips[i]); - if (i && (ips[i] == ips[i-1])) /* Already locked */ + if (i && (ips[i] == ips[i - 1])) /* Already locked */ continue; /* - * If try_lock is not set yet, make sure all locked inodes - * are not in the AIL. - * If any are, set try_lock to be used later. + * If try_lock is not set yet, make sure all locked inodes are + * not in the AIL. If any are, set try_lock to be used later. */ - if (!try_lock) { for (j = (i - 1); j >= 0 && !try_lock; j--) { lp = (xfs_log_item_t *)ips[j]->i_itemp; - if (lp && (lp->li_flags & XFS_LI_IN_AIL)) { + if (lp && (lp->li_flags & XFS_LI_IN_AIL)) try_lock++; - } } } @@ -381,51 +439,42 @@ again: * we can't get any, we must release all we have * and try again. */ + if (!try_lock) { + xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i)); + continue; + } + + /* try_lock means we have an inode locked that is in the AIL. */ + ASSERT(i != 0); + if (xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) + continue; - if (try_lock) { - /* try_lock must be 0 if i is 0. */ + /* + * Unlock all previous guys and try again. xfs_iunlock will try + * to push the tail if the inode is in the AIL. + */ + attempts++; + for (j = i - 1; j >= 0; j--) { /* - * try_lock means we have an inode locked - * that is in the AIL. + * Check to see if we've already unlocked this one. Not + * the first one going back, and the inode ptr is the + * same. */ - ASSERT(i != 0); - if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) { - attempts++; - - /* - * Unlock all previous guys and try again. - * xfs_iunlock will try to push the tail - * if the inode is in the AIL. - */ - - for(j = i - 1; j >= 0; j--) { - - /* - * Check to see if we've already - * unlocked this one. - * Not the first one going back, - * and the inode ptr is the same. - */ - if ((j != (i - 1)) && ips[j] == - ips[j+1]) - continue; - - xfs_iunlock(ips[j], lock_mode); - } + if (j != (i - 1) && ips[j] == ips[j + 1]) + continue; - if ((attempts % 5) == 0) { - delay(1); /* Don't just spin the CPU */ + xfs_iunlock(ips[j], lock_mode); + } + + if ((attempts % 5) == 0) { + delay(1); /* Don't just spin the CPU */ #ifdef DEBUG - xfs_lock_delays++; + xfs_lock_delays++; #endif - } - i = 0; - try_lock = 0; - goto again; - } - } else { - xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i)); } + i = 0; + try_lock = 0; + goto again; } #ifdef DEBUG @@ -440,10 +489,10 @@ again: } /* - * xfs_lock_two_inodes() can only be used to lock one type of lock - * at a time - the iolock or the ilock, but not both at once. If - * we lock both at once, lockdep will report false positives saying - * we have violated locking orders. + * xfs_lock_two_inodes() can only be used to lock one type of lock at a time - + * the iolock, the mmaplock or the ilock, but not more than one at a time. 
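The restructured loop implements ordered locking with a trylock fallback: block on the first lock, trylock the rest once a possible waiter exists, and on failure drop everything and retry. A simplified pthreads sketch of that strategy (the duplicate-inode and AIL tests from the kernel code are omitted):

#include <pthread.h>
#include <sched.h>

/* lock all n mutexes, assumed sorted into a global order by the caller */
static int lock_all(pthread_mutex_t **locks, int n)
{
        int i, j, attempts = 0;

again:
        for (i = 0; i < n; i++) {
                if (i == 0) {
                        pthread_mutex_lock(locks[0]);   /* block on the first */
                        continue;
                }
                if (pthread_mutex_trylock(locks[i]) == 0)
                        continue;
                /* drop everything we hold and retry from the start */
                for (j = i - 1; j >= 0; j--)
                        pthread_mutex_unlock(locks[j]);
                attempts++;
                if ((attempts % 5) == 0)
                        sched_yield();  /* don't just spin the CPU */
                goto again;
        }
        return attempts;
}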
If we + * lock more than one at a time, lockdep will report false positives saying we + * have violated locking orders. */ void xfs_lock_two_inodes( @@ -455,8 +504,12 @@ xfs_lock_two_inodes( int attempts = 0; xfs_log_item_t *lp; - if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) - ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0); + if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) { + ASSERT(!(lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL))); + ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); + } else if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) + ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); + ASSERT(ip0->i_ino != ip1->i_ino); if (ip0->i_ino > ip1->i_ino) { @@ -818,7 +871,7 @@ xfs_ialloc( xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, flags); - /* now that we have an i_mode we can setup inode ops and unlock */ + /* now that we have an i_mode we can setup the inode structure */ xfs_setup_inode(ip); *ipp = ip; @@ -852,7 +905,6 @@ xfs_dir_ialloc( { xfs_trans_t *tp; - xfs_trans_t *ntp; xfs_inode_t *ip; xfs_buf_t *ialloc_context = NULL; int code; @@ -901,8 +953,6 @@ xfs_dir_ialloc( * to succeed the second time. */ if (ialloc_context) { - struct xfs_trans_res tres; - /* * Normally, xfs_trans_commit releases all the locks. * We call bhold to hang on to the ialloc_context across @@ -911,12 +961,6 @@ xfs_dir_ialloc( * allocation group. */ xfs_trans_bhold(tp, ialloc_context); - /* - * Save the log reservation so we can use - * them in the next transaction. - */ - tres.tr_logres = xfs_trans_get_log_res(tp); - tres.tr_logcount = xfs_trans_get_log_count(tp); /* * We want the quota changes to be associated with the next @@ -932,35 +976,9 @@ xfs_dir_ialloc( tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY); } - ntp = xfs_trans_dup(tp); - code = xfs_trans_commit(tp, 0); - tp = ntp; - if (committed != NULL) { + code = xfs_trans_roll(&tp, 0); + if (committed != NULL) *committed = 1; - } - /* - * If we get an error during the commit processing, - * release the buffer that is still held and return - * to the caller. - */ - if (code) { - xfs_buf_relse(ialloc_context); - if (dqinfo) { - tp->t_dqinfo = dqinfo; - xfs_trans_free_dqinfo(tp); - } - *tpp = ntp; - *ipp = NULL; - return code; - } - - /* - * transaction commit worked ok so we can drop the extra ticket - * reference that we gained in xfs_trans_dup() - */ - xfs_log_ticket_put(tp->t_ticket); - tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; - code = xfs_trans_reserve(tp, &tres, 0, 0); /* * Re-attach the quota info that we detached from prev trx. @@ -972,7 +990,7 @@ xfs_dir_ialloc( if (code) { xfs_buf_relse(ialloc_context); - *tpp = ntp; + *tpp = tp; *ipp = NULL; return code; } @@ -1074,7 +1092,6 @@ xfs_create( xfs_bmap_free_t free_list; xfs_fsblock_t first_block; bool unlock_dp_on_error = false; - uint cancel_flags; int committed; prid_t prid; struct xfs_dquot *udqp = NULL; @@ -1111,8 +1128,6 @@ xfs_create( tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE); } - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; - /* * Initially assume that the file does not exist and * reserve the resources for that case. 
If that is not @@ -1130,10 +1145,9 @@ xfs_create( resblks = 0; error = xfs_trans_reserve(tp, tres, 0, 0); } - if (error) { - cancel_flags = 0; + if (error) goto out_trans_cancel; - } + xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); unlock_dp_on_error = true; @@ -1164,7 +1178,7 @@ xfs_create( if (error) { if (error == -ENOSPC) goto out_trans_cancel; - goto out_trans_abort; + goto out_trans_cancel; } /* @@ -1182,7 +1196,7 @@ xfs_create( resblks - XFS_IALLOC_SPACE_RES(mp) : 0); if (error) { ASSERT(error != -ENOSPC); - goto out_trans_abort; + goto out_trans_cancel; } xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); @@ -1216,7 +1230,7 @@ xfs_create( if (error) goto out_bmap_cancel; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) goto out_release_inode; @@ -1229,18 +1243,18 @@ xfs_create( out_bmap_cancel: xfs_bmap_cancel(&free_list); - out_trans_abort: - cancel_flags |= XFS_TRANS_ABORT; out_trans_cancel: - xfs_trans_cancel(tp, cancel_flags); + xfs_trans_cancel(tp); out_release_inode: /* - * Wait until after the current transaction is aborted to - * release the inode. This prevents recursive transactions - * and deadlocks from xfs_inactive. + * Wait until after the current transaction is aborted to finish the + * setup of the inode and release the inode. This prevents recursive + * transactions and deadlocks from xfs_inactive. */ - if (ip) + if (ip) { + xfs_finish_inode_setup(ip); IRELE(ip); + } xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); @@ -1262,7 +1276,6 @@ xfs_create_tmpfile( struct xfs_inode *ip = NULL; struct xfs_trans *tp = NULL; int error; - uint cancel_flags = XFS_TRANS_RELEASE_LOG_RES; prid_t prid; struct xfs_dquot *udqp = NULL; struct xfs_dquot *gdqp = NULL; @@ -1295,10 +1308,8 @@ xfs_create_tmpfile( resblks = 0; error = xfs_trans_reserve(tp, tres, 0, 0); } - if (error) { - cancel_flags = 0; + if (error) goto out_trans_cancel; - } error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, pdqp, resblks, 1, 0); @@ -1310,7 +1321,7 @@ xfs_create_tmpfile( if (error) { if (error == -ENOSPC) goto out_trans_cancel; - goto out_trans_abort; + goto out_trans_cancel; } if (mp->m_flags & XFS_MOUNT_WSYNC) @@ -1326,9 +1337,9 @@ xfs_create_tmpfile( ip->i_d.di_nlink--; error = xfs_iunlink(tp, ip); if (error) - goto out_trans_abort; + goto out_trans_cancel; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) goto out_release_inode; @@ -1339,18 +1350,18 @@ xfs_create_tmpfile( *ipp = ip; return 0; - out_trans_abort: - cancel_flags |= XFS_TRANS_ABORT; out_trans_cancel: - xfs_trans_cancel(tp, cancel_flags); + xfs_trans_cancel(tp); out_release_inode: /* - * Wait until after the current transaction is aborted to - * release the inode. This prevents recursive transactions - * and deadlocks from xfs_inactive. + * Wait until after the current transaction is aborted to finish the + * setup of the inode and release the inode. This prevents recursive + * transactions and deadlocks from xfs_inactive. 
*/ - if (ip) + if (ip) { + xfs_finish_inode_setup(ip); IRELE(ip); + } xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); @@ -1370,7 +1381,6 @@ xfs_link( int error; xfs_bmap_free_t free_list; xfs_fsblock_t first_block; - int cancel_flags; int committed; int resblks; @@ -1390,17 +1400,14 @@ xfs_link( goto std_return; tp = xfs_trans_alloc(mp, XFS_TRANS_LINK); - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; resblks = XFS_LINK_SPACE_RES(mp, target_name->len); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, resblks, 0); if (error == -ENOSPC) { resblks = 0; error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, 0, 0); } - if (error) { - cancel_flags = 0; + if (error) goto error_return; - } xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL); @@ -1429,19 +1436,19 @@ xfs_link( if (sip->i_d.di_nlink == 0) { error = xfs_iunlink_remove(tp, sip); if (error) - goto abort_return; + goto error_return; } error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino, &first_block, &free_list, resblks); if (error) - goto abort_return; + goto error_return; xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE); error = xfs_bumplink(tp, sip); if (error) - goto abort_return; + goto error_return; /* * If this is a synchronous mount, make sure that the @@ -1455,15 +1462,13 @@ xfs_link( error = xfs_bmap_finish (&tp, &free_list, &committed); if (error) { xfs_bmap_cancel(&free_list); - goto abort_return; + goto error_return; } - return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + return xfs_trans_commit(tp); - abort_return: - cancel_flags |= XFS_TRANS_ABORT; error_return: - xfs_trans_cancel(tp, cancel_flags); + xfs_trans_cancel(tp); std_return: return error; } @@ -1498,7 +1503,6 @@ xfs_itruncate_extents( { struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp = *tpp; - struct xfs_trans *ntp; xfs_bmap_free_t free_list; xfs_fsblock_t first_block; xfs_fileoff_t first_unmap_block; @@ -1556,29 +1560,7 @@ xfs_itruncate_extents( if (error) goto out_bmap_cancel; - if (committed) { - /* - * Mark the inode dirty so it will be logged and - * moved forward in the log as part of every commit. 
- */ - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - } - - ntp = xfs_trans_dup(tp); - error = xfs_trans_commit(tp, 0); - tp = ntp; - - xfs_trans_ijoin(tp, ip, 0); - - if (error) - goto out; - - /* - * Transaction commit worked ok so we can drop the extra ticket - * reference that we gained in xfs_trans_dup() - */ - xfs_log_ticket_put(tp->t_ticket); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + error = xfs_trans_roll(&tp, ip); if (error) goto out; } @@ -1699,7 +1681,7 @@ xfs_inactive_truncate( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); if (error) { ASSERT(XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -1720,7 +1702,7 @@ xfs_inactive_truncate( ASSERT(ip->i_d.di_nextents == 0); - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) goto error_unlock; @@ -1728,7 +1710,7 @@ return 0; error_trans_cancel: - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_trans_cancel(tp); error_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; @@ -1778,7 +1760,7 @@ xfs_inactive_ifree( } else { ASSERT(XFS_FORCED_SHUTDOWN(mp)); } - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES); + xfs_trans_cancel(tp); return error; } @@ -1798,7 +1780,7 @@ xfs_inactive_ifree( __func__, error); xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); } - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -1817,7 +1799,7 @@ xfs_inactive_ifree( if (error) xfs_notice(mp, "%s: xfs_bmap_finish returned error %d", __func__, error); - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) xfs_notice(mp, "%s: xfs_trans_commit returned error %d", __func__, error); @@ -1889,21 +1871,17 @@ xfs_inactive( /* * If there are attributes associated with the file then blow them away * now. The code calls a routine that recursively deconstructs the - * attribute fork. We need to just commit the current transaction - * because we can't use it for xfs_attr_inactive(). + * attribute fork. It also blows away the in-core attribute fork. */ - if (ip->i_d.di_anextents > 0) { - ASSERT(ip->i_d.di_forkoff != 0); - + if (XFS_IFORK_Q(ip)) { error = xfs_attr_inactive(ip); if (error) return; } - if (ip->i_afp) - xfs_idestroy_fork(ip, XFS_ATTR_FORK); - + ASSERT(!ip->i_afp); ASSERT(ip->i_d.di_anextents == 0); + ASSERT(ip->i_d.di_forkoff == 0); /* * Free the inode. @@ -2182,28 +2160,42 @@ xfs_iunlink_remove( */ STATIC int xfs_ifree_cluster( - xfs_inode_t *free_ip, - xfs_trans_t *tp, - xfs_ino_t inum) + xfs_inode_t *free_ip, + xfs_trans_t *tp, + struct xfs_icluster *xic) { xfs_mount_t *mp = free_ip->i_mount; int blks_per_cluster; int inodes_per_cluster; int nbufs; int i, j; + int ioffset; xfs_daddr_t blkno; xfs_buf_t *bp; xfs_inode_t *ip; xfs_inode_log_item_t *iip; xfs_log_item_t *lip; struct xfs_perag *pag; + xfs_ino_t inum; + inum = xic->first_ino; pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum)); blks_per_cluster = xfs_icluster_size_fsb(mp); inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog; nbufs = mp->m_ialloc_blks / blks_per_cluster; for (j = 0; j < nbufs; j++, inum += inodes_per_cluster) { + /* + * The allocation bitmap tells us which inodes of the chunk were + * physically allocated. Skip the cluster if an inode falls into + * a sparse region. 
+ */ + ioffset = inum - xic->first_ino; + if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) { + ASSERT(do_mod(ioffset, inodes_per_cluster) == 0); + continue; + } + blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), XFS_INO_TO_AGBNO(mp, inum)); @@ -2361,8 +2353,7 @@ xfs_ifree( xfs_bmap_free_t *flist) { int error; - int delete; - xfs_ino_t first_ino; + struct xfs_icluster xic = { 0 }; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(ip->i_d.di_nlink == 0); @@ -2378,7 +2369,7 @@ xfs_ifree( if (error) return error; - error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino); + error = xfs_difree(tp, ip->i_ino, flist, &xic); if (error) return error; @@ -2395,8 +2386,8 @@ xfs_ifree( ip->i_d.di_gen++; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - if (delete) - error = xfs_ifree_cluster(ip, tp, first_ino); + if (xic.deleted) + error = xfs_ifree_cluster(ip, tp, &xic); return error; } @@ -2483,7 +2474,6 @@ xfs_remove( int error = 0; xfs_bmap_free_t free_list; xfs_fsblock_t first_block; - int cancel_flags; int committed; uint resblks; @@ -2504,7 +2494,6 @@ xfs_remove( tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR); else tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE); - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; /* * We try to get the real space reservation first, @@ -2523,7 +2512,6 @@ xfs_remove( } if (error) { ASSERT(error != -ENOSPC); - cancel_flags = 0; goto out_trans_cancel; } @@ -2535,7 +2523,6 @@ xfs_remove( /* * If we're removing a directory perform some additional validation. */ - cancel_flags |= XFS_TRANS_ABORT; if (is_dir) { ASSERT(ip->i_d.di_nlink >= 2); if (ip->i_d.di_nlink != 2) { @@ -2591,7 +2578,7 @@ xfs_remove( if (error) goto out_bmap_cancel; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) goto std_return; @@ -2603,7 +2590,7 @@ xfs_remove( out_bmap_cancel: xfs_bmap_cancel(&free_list); out_trans_cancel: - xfs_trans_cancel(tp, cancel_flags); + xfs_trans_cancel(tp); std_return: return error; } @@ -2611,19 +2598,22 @@ xfs_remove( /* * Enter all inodes for a rename transaction into a sorted array. */ +#define __XFS_SORT_INODES 5 STATIC void xfs_sort_for_rename( - xfs_inode_t *dp1, /* in: old (source) directory inode */ - xfs_inode_t *dp2, /* in: new (target) directory inode */ - xfs_inode_t *ip1, /* in: inode of old entry */ - xfs_inode_t *ip2, /* in: inode of new entry, if it - already exists, NULL otherwise. */ - xfs_inode_t **i_tab,/* out: array of inode returned, sorted */ - int *num_inodes) /* out: number of inodes in array */ + struct xfs_inode *dp1, /* in: old (source) directory inode */ + struct xfs_inode *dp2, /* in: new (target) directory inode */ + struct xfs_inode *ip1, /* in: inode of old entry */ + struct xfs_inode *ip2, /* in: inode of new entry */ + struct xfs_inode *wip, /* in: whiteout inode */ + struct xfs_inode **i_tab,/* out: sorted array of inodes */ + int *num_inodes) /* in/out: inodes in array */ { - xfs_inode_t *temp; int i, j; + ASSERT(*num_inodes == __XFS_SORT_INODES); + memset(i_tab, 0, *num_inodes * sizeof(struct xfs_inode *)); + /* * i_tab contains a list of pointers to inodes. We initialize * the table here & we'll sort it. We will then use it to @@ -2631,25 +2621,24 @@ xfs_sort_for_rename( * * Note that the table may contain duplicates. e.g., dp1 == dp2. 
- */ - i_tab[0] = dp1; - i_tab[1] = dp2; - i_tab[2] = ip1; - if (ip2) { - *num_inodes = 4; - i_tab[3] = ip2; - } else { - *num_inodes = 3; - i_tab[3] = NULL; - } + i = 0; + i_tab[i++] = dp1; + i_tab[i++] = dp2; + i_tab[i++] = ip1; + if (ip2) + i_tab[i++] = ip2; + if (wip) + i_tab[i++] = wip; + *num_inodes = i; /* * Sort the elements via bubble sort. (Remember, there are at - * most 4 elements to sort, so this is adequate.) + * most 5 elements to sort, so this is adequate.) */ for (i = 0; i < *num_inodes; i++) { for (j = 1; j < *num_inodes; j++) { if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) { - temp = i_tab[j]; + struct xfs_inode *temp = i_tab[j]; i_tab[j] = i_tab[j-1]; i_tab[j-1] = temp; } @@ -2657,6 +2646,31 @@ } } +static int +xfs_finish_rename( + struct xfs_trans *tp, + struct xfs_bmap_free *free_list) +{ + int committed = 0; + int error; + + /* + * If this is a synchronous mount, make sure that the rename transaction + * goes to disk before returning to the user. + */ + if (tp->t_mountp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) + xfs_trans_set_sync(tp); + + error = xfs_bmap_finish(&tp, free_list, &committed); + if (error) { + xfs_bmap_cancel(free_list); + xfs_trans_cancel(tp); + return error; + } + + return xfs_trans_commit(tp); +} + /* * xfs_cross_rename() * @@ -2685,14 +2699,14 @@ xfs_cross_rename( ip2->i_ino, first_block, free_list, spaceres); if (error) - goto out; + goto out_trans_abort; /* Swap inode number for dirent in second parent */ error = xfs_dir_replace(tp, dp2, name2, ip1->i_ino, first_block, free_list, spaceres); if (error) - goto out; + goto out_trans_abort; /* * If we're renaming one or more directories across different parents, @@ -2707,16 +2721,16 @@ xfs_cross_rename( dp1->i_ino, first_block, free_list, spaceres); if (error) - goto out; + goto out_trans_abort; /* transfer ip2 ".." reference to dp1 */ if (!S_ISDIR(ip1->i_d.di_mode)) { error = xfs_droplink(tp, dp2); if (error) - goto out; + goto out_trans_abort; error = xfs_bumplink(tp, dp1); if (error) - goto out; + goto out_trans_abort; } /* @@ -2734,16 +2748,16 @@ xfs_cross_rename( dp2->i_ino, first_block, free_list, spaceres); if (error) - goto out; + goto out_trans_abort; /* transfer ip1 ".." reference to dp2 */ if (!S_ISDIR(ip2->i_d.di_mode)) { error = xfs_droplink(tp, dp1); if (error) - goto out; + goto out_trans_abort; error = xfs_bumplink(tp, dp2); if (error) - goto out; + goto out_trans_abort; } /* @@ -2771,66 +2785,112 @@ } xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE); -out: + return xfs_finish_rename(tp, free_list); + +out_trans_abort: + xfs_bmap_cancel(free_list); + xfs_trans_cancel(tp); return error; } /* + * xfs_rename_alloc_whiteout() + * + * Return a referenced, unlinked, unlocked inode that can be used as a + * whiteout in a rename transaction. We use a tmpfile inode here so that if we + * crash between allocating the inode and linking it into the rename transaction, + * recovery will free the inode and we won't leak it. + */ +static int +xfs_rename_alloc_whiteout( + struct xfs_inode *dp, + struct xfs_inode **wip) +{ + struct xfs_inode *tmpfile; + int error; + + error = xfs_create_tmpfile(dp, NULL, S_IFCHR | WHITEOUT_MODE, &tmpfile); + if (error) + return error; + + /* + * Prepare the tmpfile inode as if it were created through the VFS. + * Otherwise, the link increment paths will complain about nlink 0->1. 
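The sort itself is small enough to read standalone. A userspace rendering of the same bubble sort over at most five entries (stub inode type; duplicates are allowed, as the kernel comment notes):

/* standalone version of xfs_sort_for_rename()'s ordering step */
struct inode_stub { unsigned long long i_ino; };

static void sort_by_ino(struct inode_stub **tab, int n)
{
        int i, j;

        for (i = 0; i < n; i++) {
                for (j = 1; j < n; j++) {
                        if (tab[j]->i_ino < tab[j - 1]->i_ino) {
                                struct inode_stub *tmp = tab[j];

                                tab[j] = tab[j - 1];
                                tab[j - 1] = tmp;
                        }
                }
        }
}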
+ * Drop the link count as done by d_tmpfile(), complete the inode setup + * and flag it as linkable. + */ + drop_nlink(VFS_I(tmpfile)); + xfs_finish_inode_setup(tmpfile); + VFS_I(tmpfile)->i_state |= I_LINKABLE; + + *wip = tmpfile; + return 0; +} + +/* * xfs_rename */ int xfs_rename( - xfs_inode_t *src_dp, - struct xfs_name *src_name, - xfs_inode_t *src_ip, - xfs_inode_t *target_dp, - struct xfs_name *target_name, - xfs_inode_t *target_ip, - unsigned int flags) + struct xfs_inode *src_dp, + struct xfs_name *src_name, + struct xfs_inode *src_ip, + struct xfs_inode *target_dp, + struct xfs_name *target_name, + struct xfs_inode *target_ip, + unsigned int flags) { - xfs_trans_t *tp = NULL; - xfs_mount_t *mp = src_dp->i_mount; - int new_parent; /* moving to a new dir */ - int src_is_directory; /* src_name is a directory */ - int error; - xfs_bmap_free_t free_list; - xfs_fsblock_t first_block; - int cancel_flags; - int committed; - xfs_inode_t *inodes[4]; - int spaceres; - int num_inodes; + struct xfs_mount *mp = src_dp->i_mount; + struct xfs_trans *tp; + struct xfs_bmap_free free_list; + xfs_fsblock_t first_block; + struct xfs_inode *wip = NULL; /* whiteout inode */ + struct xfs_inode *inodes[__XFS_SORT_INODES]; + int num_inodes = __XFS_SORT_INODES; + bool new_parent = (src_dp != target_dp); + bool src_is_directory = S_ISDIR(src_ip->i_d.di_mode); + int spaceres; + int error; trace_xfs_rename(src_dp, target_dp, src_name, target_name); - new_parent = (src_dp != target_dp); - src_is_directory = S_ISDIR(src_ip->i_d.di_mode); + if ((flags & RENAME_EXCHANGE) && !target_ip) + return -EINVAL; - xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, + /* + * If we are doing a whiteout operation, allocate the whiteout inode + * we will be placing at the target and ensure the type is set + * appropriately. + */ + if (flags & RENAME_WHITEOUT) { + ASSERT(!(flags & (RENAME_NOREPLACE | RENAME_EXCHANGE))); + error = xfs_rename_alloc_whiteout(target_dp, &wip); + if (error) + return error; + + /* setup target dirent info as whiteout */ + src_name->type = XFS_DIR3_FT_CHRDEV; + } + + xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, wip, inodes, &num_inodes); - xfs_bmap_init(&free_list, &first_block); tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME); - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, spaceres, 0); if (error == -ENOSPC) { spaceres = 0; error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, 0, 0); } - if (error) { - xfs_trans_cancel(tp, 0); - goto std_return; - } + if (error) + goto out_trans_cancel; /* * Attach the dquots to the inodes */ error = xfs_qm_vop_rename_dqattach(inodes); - if (error) { - xfs_trans_cancel(tp, cancel_flags); - goto std_return; - } + if (error) + goto out_trans_cancel; /* * Lock all the participating inodes. 
Depending upon whether @@ -2851,6 +2911,8 @@ xfs_rename( xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); if (target_ip) xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL); + if (wip) + xfs_trans_ijoin(tp, wip, XFS_ILOCK_EXCL); /* * If we are using project inheritance, we only allow renames @@ -2860,24 +2922,16 @@ xfs_rename( if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) { error = -EXDEV; - goto error_return; + goto out_trans_cancel; } - /* - * Handle RENAME_EXCHANGE flags - */ - if (flags & RENAME_EXCHANGE) { - if (target_ip == NULL) { - error = -EINVAL; - goto error_return; - } - error = xfs_cross_rename(tp, src_dp, src_name, src_ip, - target_dp, target_name, target_ip, - &free_list, &first_block, spaceres); - if (error) - goto abort_return; - goto finish_rename; - } + xfs_bmap_init(&free_list, &first_block); + + /* RENAME_EXCHANGE is unique from here on. */ + if (flags & RENAME_EXCHANGE) + return xfs_cross_rename(tp, src_dp, src_name, src_ip, + target_dp, target_name, target_ip, + &free_list, &first_block, spaceres); /* * Set up the target. @@ -2890,7 +2944,7 @@ xfs_rename( if (!spaceres) { error = xfs_dir_canenter(tp, target_dp, target_name); if (error) - goto error_return; + goto out_trans_cancel; } /* * If target does not exist and the rename crosses @@ -2900,10 +2954,8 @@ xfs_rename( error = xfs_dir_createname(tp, target_dp, target_name, src_ip->i_ino, &first_block, &free_list, spaceres); - if (error == -ENOSPC) - goto error_return; if (error) - goto abort_return; + goto out_bmap_cancel; xfs_trans_ichgtime(tp, target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); @@ -2911,7 +2963,7 @@ xfs_rename( if (new_parent && src_is_directory) { error = xfs_bumplink(tp, target_dp); if (error) - goto abort_return; + goto out_bmap_cancel; } } else { /* target_ip != NULL */ /* @@ -2926,7 +2978,7 @@ xfs_rename( if (!(xfs_dir_isempty(target_ip)) || (target_ip->i_d.di_nlink > 2)) { error = -EEXIST; - goto error_return; + goto out_trans_cancel; } } @@ -2943,7 +2995,7 @@ xfs_rename( src_ip->i_ino, &first_block, &free_list, spaceres); if (error) - goto abort_return; + goto out_bmap_cancel; xfs_trans_ichgtime(tp, target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); @@ -2954,7 +3006,7 @@ xfs_rename( */ error = xfs_droplink(tp, target_ip); if (error) - goto abort_return; + goto out_bmap_cancel; if (src_is_directory) { /* @@ -2962,7 +3014,7 @@ xfs_rename( */ error = xfs_droplink(tp, target_ip); if (error) - goto abort_return; + goto out_bmap_cancel; } } /* target_ip != NULL */ @@ -2979,7 +3031,7 @@ xfs_rename( &first_block, &free_list, spaceres); ASSERT(error != -EEXIST); if (error) - goto abort_return; + goto out_bmap_cancel; } /* @@ -3005,49 +3057,65 @@ xfs_rename( */ error = xfs_droplink(tp, src_dp); if (error) - goto abort_return; + goto out_bmap_cancel; } - error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, + /* + * For whiteouts, we only need to update the source dirent with the + * inode number of the whiteout inode rather than removing it + * altogether. 
+ */ + if (wip) { + error = xfs_dir_replace(tp, src_dp, src_name, wip->i_ino, &first_block, &free_list, spaceres); + } else + error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, + &first_block, &free_list, spaceres); if (error) - goto abort_return; - - xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); - if (new_parent) - xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); + goto out_bmap_cancel; -finish_rename: /* - * If this is a synchronous mount, make sure that the - * rename transaction goes to disk before returning to - * the user. + * For whiteouts, we need to bump the link count on the whiteout inode. + * This means that failures all the way up to this point leave the inode + * on the unlinked list and so cleanup is a simple matter of dropping + * the remaining reference to it. If we fail here after bumping the link + * count, we're shutting down the filesystem so we'll never see the + * intermediate state on disk. */ - if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { - xfs_trans_set_sync(tp); - } + if (wip) { + ASSERT(VFS_I(wip)->i_nlink == 0 && wip->i_d.di_nlink == 0); + error = xfs_bumplink(tp, wip); + if (error) + goto out_bmap_cancel; + error = xfs_iunlink_remove(tp, wip); + if (error) + goto out_bmap_cancel; + xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE); - error = xfs_bmap_finish(&tp, &free_list, &committed); - if (error) { - xfs_bmap_cancel(&free_list); - xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | - XFS_TRANS_ABORT)); - goto std_return; + /* + * Now we have a real link, clear the "I'm a tmpfile" state + * flag from the inode so it doesn't accidentally get misused in + * future. + */ + VFS_I(wip)->i_state &= ~I_LINKABLE; } - /* - * trans_commit will unlock src_ip, target_ip & decrement - * the vnode references. - */ - return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE); + if (new_parent) + xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE); - abort_return: - cancel_flags |= XFS_TRANS_ABORT; - error_return: + error = xfs_finish_rename(tp, &free_list); + if (wip) + IRELE(wip); + return error; + +out_bmap_cancel: xfs_bmap_cancel(&free_list); - xfs_trans_cancel(tp, cancel_flags); - std_return: +out_trans_cancel: + xfs_trans_cancel(tp); + if (wip) + IRELE(wip); return error; } @@ -3324,7 +3392,7 @@ xfs_iflush_int( ASSERT(ip->i_d.di_version > 1); /* set *dip = inode's place in the buffer */ - dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); + dip = xfs_buf_offset(bp, ip->i_imap.im_boffset); if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC), mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index a1cd55f3f351..8f22d20368d8 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -56,6 +56,7 @@ typedef struct xfs_inode { struct xfs_inode_log_item *i_itemp; /* logging information */ mrlock_t i_lock; /* inode lock */ mrlock_t i_iolock; /* inode IO lock */ + mrlock_t i_mmaplock; /* inode mmap IO lock */ atomic_t i_pincount; /* inode pin count */ spinlock_t i_flags_lock; /* inode i_flags lock */ /* Miscellaneous state. 
*/ @@ -263,15 +264,20 @@ static inline int xfs_isiflocked(struct xfs_inode *ip) #define XFS_IOLOCK_SHARED (1<<1) #define XFS_ILOCK_EXCL (1<<2) #define XFS_ILOCK_SHARED (1<<3) +#define XFS_MMAPLOCK_EXCL (1<<4) +#define XFS_MMAPLOCK_SHARED (1<<5) #define XFS_LOCK_MASK (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \ - | XFS_ILOCK_EXCL | XFS_ILOCK_SHARED) + | XFS_ILOCK_EXCL | XFS_ILOCK_SHARED \ + | XFS_MMAPLOCK_EXCL | XFS_MMAPLOCK_SHARED) #define XFS_LOCK_FLAGS \ { XFS_IOLOCK_EXCL, "IOLOCK_EXCL" }, \ { XFS_IOLOCK_SHARED, "IOLOCK_SHARED" }, \ { XFS_ILOCK_EXCL, "ILOCK_EXCL" }, \ - { XFS_ILOCK_SHARED, "ILOCK_SHARED" } + { XFS_ILOCK_SHARED, "ILOCK_SHARED" }, \ + { XFS_MMAPLOCK_EXCL, "MMAPLOCK_EXCL" }, \ + { XFS_MMAPLOCK_SHARED, "MMAPLOCK_SHARED" } /* @@ -302,17 +308,26 @@ static inline int xfs_isiflocked(struct xfs_inode *ip) #define XFS_IOLOCK_SHIFT 16 #define XFS_IOLOCK_PARENT (XFS_LOCK_PARENT << XFS_IOLOCK_SHIFT) +#define XFS_MMAPLOCK_SHIFT 20 + #define XFS_ILOCK_SHIFT 24 #define XFS_ILOCK_PARENT (XFS_LOCK_PARENT << XFS_ILOCK_SHIFT) #define XFS_ILOCK_RTBITMAP (XFS_LOCK_RTBITMAP << XFS_ILOCK_SHIFT) #define XFS_ILOCK_RTSUM (XFS_LOCK_RTSUM << XFS_ILOCK_SHIFT) -#define XFS_IOLOCK_DEP_MASK 0x00ff0000 +#define XFS_IOLOCK_DEP_MASK 0x000f0000 +#define XFS_MMAPLOCK_DEP_MASK 0x00f00000 #define XFS_ILOCK_DEP_MASK 0xff000000 -#define XFS_LOCK_DEP_MASK (XFS_IOLOCK_DEP_MASK | XFS_ILOCK_DEP_MASK) +#define XFS_LOCK_DEP_MASK (XFS_IOLOCK_DEP_MASK | \ + XFS_MMAPLOCK_DEP_MASK | \ + XFS_ILOCK_DEP_MASK) -#define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) >> XFS_IOLOCK_SHIFT) -#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) >> XFS_ILOCK_SHIFT) +#define XFS_IOLOCK_DEP(flags) (((flags) & XFS_IOLOCK_DEP_MASK) \ + >> XFS_IOLOCK_SHIFT) +#define XFS_MMAPLOCK_DEP(flags) (((flags) & XFS_MMAPLOCK_DEP_MASK) \ + >> XFS_MMAPLOCK_SHIFT) +#define XFS_ILOCK_DEP(flags) (((flags) & XFS_ILOCK_DEP_MASK) \ + >> XFS_ILOCK_SHIFT) /* * For multiple groups support: if S_ISGID bit is set in the parent @@ -391,6 +406,28 @@ int xfs_zero_eof(struct xfs_inode *ip, xfs_off_t offset, int xfs_iozero(struct xfs_inode *ip, loff_t pos, size_t count); +/* from xfs_iops.c */ +/* + * When setting up a newly allocated inode, we need to call + * xfs_finish_inode_setup() once the inode is fully instantiated at + * the VFS level to prevent the rest of the world seeing the inode + * before we've completed instantiation. Otherwise we can do it + * the moment the inode lookup is complete. + */ +extern void xfs_setup_inode(struct xfs_inode *ip); +static inline void xfs_finish_inode_setup(struct xfs_inode *ip) +{ + xfs_iflags_clear(ip, XFS_INEW); + barrier(); + unlock_new_inode(VFS_I(ip)); +} + +static inline void xfs_setup_existing_inode(struct xfs_inode *ip) +{ + xfs_setup_inode(ip); + xfs_finish_inode_setup(ip); +} + #define IHOLD(ip) \ do { \ ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \ diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index ac4feae45eb3..ea7d85af5310 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -82,7 +82,7 @@ xfs_find_handle( error = user_lpath((const char __user *)hreq->path, &path); if (error) return error; - inode = path.dentry->d_inode; + inode = d_inode(path.dentry); } ip = XFS_I(inode); @@ -210,7 +210,7 @@ xfs_open_by_handle( dentry = xfs_handlereq_to_dentry(parfilp, hreq); if (IS_ERR(dentry)) return PTR_ERR(dentry); - inode = dentry->d_inode; + inode = d_inode(dentry); /* Restrict xfs_open_by_handle to directories & regular files. 
*/ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) { @@ -303,7 +303,7 @@ xfs_readlink_by_handle( goto out_dput; } - error = xfs_readlink(XFS_I(dentry->d_inode), link); + error = xfs_readlink(XFS_I(d_inode(dentry)), link); if (error) goto out_kfree; error = readlink_copy(hreq->ohandle, olen, link); @@ -336,7 +336,7 @@ xfs_set_dmattrs( tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -346,7 +346,7 @@ xfs_set_dmattrs( ip->i_d.di_dmstate = state; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); return error; } @@ -376,7 +376,7 @@ xfs_fssetdm_by_handle( return PTR_ERR(dentry); } - if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) { + if (IS_IMMUTABLE(d_inode(dentry)) || IS_APPEND(d_inode(dentry))) { error = -EPERM; goto out; } @@ -386,7 +386,7 @@ xfs_fssetdm_by_handle( goto out; } - error = xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask, + error = xfs_set_dmattrs(XFS_I(d_inode(dentry)), fsd.fsd_dmevmask, fsd.fsd_dmstate); out: @@ -429,7 +429,7 @@ xfs_attrlist_by_handle( goto out_dput; cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; - error = xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen, + error = xfs_attr_list(XFS_I(d_inode(dentry)), kbuf, al_hreq.buflen, al_hreq.flags, cursor); if (error) goto out_kfree; @@ -559,7 +559,7 @@ xfs_attrmulti_by_handle( switch (ops[i].am_opcode) { case ATTR_OP_GET: ops[i].am_error = xfs_attrmulti_attr_get( - dentry->d_inode, attr_name, + d_inode(dentry), attr_name, ops[i].am_attrvalue, &ops[i].am_length, ops[i].am_flags); break; @@ -568,7 +568,7 @@ xfs_attrmulti_by_handle( if (ops[i].am_error) break; ops[i].am_error = xfs_attrmulti_attr_set( - dentry->d_inode, attr_name, + d_inode(dentry), attr_name, ops[i].am_attrvalue, ops[i].am_length, ops[i].am_flags); mnt_drop_write_file(parfilp); @@ -578,7 +578,7 @@ xfs_attrmulti_by_handle( if (ops[i].am_error) break; ops[i].am_error = xfs_attrmulti_attr_remove( - dentry->d_inode, attr_name, + d_inode(dentry), attr_name, ops[i].am_flags); mnt_drop_write_file(parfilp); break; @@ -631,7 +631,7 @@ xfs_ioc_space( if (filp->f_flags & O_DSYNC) flags |= XFS_PREALLOC_SYNC; - if (ioflags & XFS_IO_INVIS) + if (ioflags & XFS_IO_INVIS) flags |= XFS_PREALLOC_INVISIBLE; error = mnt_want_write_file(filp); @@ -639,10 +639,13 @@ xfs_ioc_space( return error; xfs_ilock(ip, iolock); - error = xfs_break_layouts(inode, &iolock); + error = xfs_break_layouts(inode, &iolock, false); if (error) goto out_unlock; + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + iolock |= XFS_MMAPLOCK_EXCL; + switch (bf->l_whence) { case 0: /*SEEK_SET*/ break; @@ -1073,7 +1076,7 @@ xfs_ioctl_setattr_get_trans( return tp; out_cancel: - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return ERR_PTR(error); } @@ -1250,7 +1253,7 @@ xfs_ioctl_setattr( else ip->i_d.di_extsize = 0; - code = xfs_trans_commit(tp, 0); + code = xfs_trans_commit(tp); /* * Release any dquot(s) the inode had kept before chown. 
@@ -1262,7 +1265,7 @@ xfs_ioctl_setattr( return code; error_trans_cancel: - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); error_free_dquots: xfs_qm_dqrele(udqp); xfs_qm_dqrele(pdqp); @@ -1335,11 +1338,11 @@ xfs_ioc_setxflags( error = xfs_ioctl_setattr_xflags(tp, ip, &fa); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); goto out_drop_write; } - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); out_drop_write: mnt_drop_write_file(filp); return error; diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index bfc7c7c8a0c8..b88bdc85dd3d 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -375,7 +375,7 @@ xfs_compat_attrlist_by_handle( goto out_dput; cursor = (attrlist_cursor_kern_t *)&al_hreq.pos; - error = xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen, + error = xfs_attr_list(XFS_I(d_inode(dentry)), kbuf, al_hreq.buflen, al_hreq.flags, cursor); if (error) goto out_kfree; @@ -445,7 +445,7 @@ xfs_compat_attrmulti_by_handle( switch (ops[i].am_opcode) { case ATTR_OP_GET: ops[i].am_error = xfs_attrmulti_attr_get( - dentry->d_inode, attr_name, + d_inode(dentry), attr_name, compat_ptr(ops[i].am_attrvalue), &ops[i].am_length, ops[i].am_flags); break; @@ -454,7 +454,7 @@ xfs_compat_attrmulti_by_handle( if (ops[i].am_error) break; ops[i].am_error = xfs_attrmulti_attr_set( - dentry->d_inode, attr_name, + d_inode(dentry), attr_name, compat_ptr(ops[i].am_attrvalue), ops[i].am_length, ops[i].am_flags); mnt_drop_write_file(parfilp); @@ -464,7 +464,7 @@ xfs_compat_attrmulti_by_handle( if (ops[i].am_error) break; ops[i].am_error = xfs_attrmulti_attr_remove( - dentry->d_inode, attr_name, + d_inode(dentry), attr_name, ops[i].am_flags); mnt_drop_write_file(parfilp); break; @@ -504,7 +504,7 @@ xfs_compat_fssetdm_by_handle( if (IS_ERR(dentry)) return PTR_ERR(dentry); - if (IS_IMMUTABLE(dentry->d_inode) || IS_APPEND(dentry->d_inode)) { + if (IS_IMMUTABLE(d_inode(dentry)) || IS_APPEND(d_inode(dentry))) { error = -EPERM; goto out; } @@ -514,7 +514,7 @@ xfs_compat_fssetdm_by_handle( goto out; } - error = xfs_set_dmattrs(XFS_I(dentry->d_inode), fsd.fsd_dmevmask, + error = xfs_set_dmattrs(XFS_I(d_inode(dentry)), fsd.fsd_dmevmask, fsd.fsd_dmstate); out: diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index ccb1dd0d509e..1f86033171c8 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -183,7 +183,7 @@ xfs_iomap_write_direct( * Check for running out of space, note: need lock to return */ if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -213,7 +213,7 @@ xfs_iomap_write_direct( error = xfs_bmap_finish(&tp, &free_list, &committed); if (error) goto out_bmap_cancel; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) goto out_unlock; @@ -236,7 +236,7 @@ out_bmap_cancel: xfs_bmap_cancel(&free_list); xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag); out_trans_cancel: - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_trans_cancel(tp); goto out_unlock; } @@ -460,8 +460,7 @@ xfs_iomap_prealloc_size( alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(MAXEXTLEN), alloc_blocks); - xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); - freesp = mp->m_sb.sb_fdblocks; + freesp = percpu_counter_read_positive(&mp->m_fdblocks); if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) { shift = 2; if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT]) @@ -691,7 +690,7 @@ xfs_iomap_write_allocate( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, nres, 0); if 
(error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -761,7 +760,7 @@ xfs_iomap_write_allocate( if (error) goto trans_cancel; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) goto error0; @@ -792,7 +791,7 @@ xfs_iomap_write_allocate( trans_cancel: xfs_bmap_cancel(&free_list); - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_trans_cancel(tp); error0: xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; @@ -854,7 +853,7 @@ xfs_iomap_write_unwritten( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -891,7 +890,7 @@ xfs_iomap_write_unwritten( if (error) goto error_on_bmapi_transaction; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); if (error) return error; @@ -915,7 +914,7 @@ xfs_iomap_write_unwritten( error_on_bmapi_transaction: xfs_bmap_cancel(&free_list); - xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT)); + xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index e53a90331422..766b23f86ce9 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -41,7 +41,6 @@ #include <linux/capability.h> #include <linux/xattr.h> -#include <linux/namei.h> #include <linux/posix_acl.h> #include <linux/security.h> #include <linux/fiemap.h> @@ -187,6 +186,8 @@ xfs_generic_create( else d_instantiate(dentry, inode); + xfs_finish_inode_setup(ip); + out_free_acl: if (default_acl) posix_acl_release(default_acl); @@ -195,6 +196,7 @@ xfs_generic_create( return error; out_cleanup_inode: + xfs_finish_inode_setup(ip); if (!tmpfile) xfs_cleanup_inode(dir, inode, dentry); iput(inode); @@ -301,7 +303,7 @@ xfs_vn_link( struct inode *dir, struct dentry *dentry) { - struct inode *inode = old_dentry->d_inode; + struct inode *inode = d_inode(old_dentry); struct xfs_name name; int error; @@ -326,7 +328,7 @@ xfs_vn_unlink( xfs_dentry_to_name(&name, dentry, 0); - error = xfs_remove(XFS_I(dir), &name, XFS_I(dentry->d_inode)); + error = xfs_remove(XFS_I(dir), &name, XFS_I(d_inode(dentry))); if (error) return error; @@ -367,9 +369,11 @@ xfs_vn_symlink( goto out_cleanup_inode; d_instantiate(dentry, inode); + xfs_finish_inode_setup(cip); return 0; out_cleanup_inode: + xfs_finish_inode_setup(cip); xfs_cleanup_inode(dir, inode, dentry); iput(inode); out: @@ -384,22 +388,22 @@ xfs_vn_rename( struct dentry *ndentry, unsigned int flags) { - struct inode *new_inode = ndentry->d_inode; + struct inode *new_inode = d_inode(ndentry); int omode = 0; struct xfs_name oname; struct xfs_name nname; - if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; /* if we are exchanging files, we need to set i_mode of both files */ if (flags & RENAME_EXCHANGE) - omode = ndentry->d_inode->i_mode; + omode = d_inode(ndentry)->i_mode; xfs_dentry_to_name(&oname, odentry, omode); - xfs_dentry_to_name(&nname, ndentry, odentry->d_inode->i_mode); + xfs_dentry_to_name(&nname, ndentry, d_inode(odentry)->i_mode); - return xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode), + return xfs_rename(XFS_I(odir), &oname, XFS_I(d_inode(odentry)), XFS_I(ndir), &nname, new_inode ? 
XFS_I(new_inode) : NULL, flags); } @@ -409,10 +413,10 @@ xfs_vn_rename( * we need to be very careful about how much stack we use. * uio is kmalloced for this reason... */ -STATIC void * +STATIC const char * xfs_vn_follow_link( struct dentry *dentry, - struct nameidata *nd) + void **cookie) { char *link; int error = -ENOMEM; @@ -421,18 +425,16 @@ xfs_vn_follow_link( if (!link) goto out_err; - error = xfs_readlink(XFS_I(dentry->d_inode), link); + error = xfs_readlink(XFS_I(d_inode(dentry)), link); if (unlikely(error)) goto out_kfree; - nd_set_link(nd, link); - return NULL; + return *cookie = link; out_kfree: kfree(link); out_err: - nd_set_link(nd, ERR_PTR(error)); - return NULL; + return ERR_PTR(error); } STATIC int @@ -441,7 +443,7 @@ xfs_vn_getattr( struct dentry *dentry, struct kstat *stat) { - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; @@ -697,7 +699,7 @@ xfs_setattr_nonsize( if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -728,7 +730,7 @@ xfs_setattr_nonsize( return 0; out_trans_cancel: - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); out_dqrele: xfs_qm_dqrele(udqp); @@ -750,7 +752,6 @@ xfs_setattr_size( struct xfs_trans *tp; int error; uint lock_flags = 0; - uint commit_flags = 0; bool did_zeroing = false; trace_xfs_setattr(ip); @@ -766,6 +767,7 @@ xfs_setattr_size( return error; ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + ASSERT(xfs_isilocked(ip, XFS_MMAPLOCK_EXCL)); ASSERT(S_ISREG(ip->i_d.di_mode)); ASSERT((iattr->ia_valid & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0); @@ -829,61 +831,36 @@ xfs_setattr_size( inode_dio_wait(inode); /* - * Do all the page cache truncate work outside the transaction context - * as the "lock" order is page lock->log space reservation. i.e. - * locking pages inside the transaction can ABBA deadlock with - * writeback. We have to do the VFS inode size update before we truncate - * the pagecache, however, to avoid racing with page faults beyond the - * new EOF they are not serialised against truncate operations except by - * page locks and size updates. + * We've already locked out new page faults, so now we can safely remove + * pages from the page cache knowing they won't get refaulted until we + * drop the XFS_MMAP_EXCL lock after the extent manipulations are + * complete. The truncate_setsize() call also cleans partial EOF page + * PTEs on extending truncates and hence ensures sub-page block size + * filesystems are correctly handled, too. * - * Hence we are in a situation where a truncate can fail with ENOMEM - * from xfs_trans_reserve(), but having already truncated the in-memory - * version of the file (i.e. made user visible changes). There's not - * much we can do about this, except to hope that the caller sees ENOMEM - * and retries the truncate operation. + * We have to do all the page cache truncate work outside the + * transaction context as the "lock" order is page lock->log space + * reservation as defined by extent allocation in the writeback path. + * Hence a truncate can fail with ENOMEM from xfs_trans_reserve(), but + * having already truncated the in-memory version of the file (i.e. made + * user visible changes). 
There's not much we can do about this, except + * to hope that the caller sees ENOMEM and retries the truncate + * operation. */ - error = block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks); + if (IS_DAX(inode)) + error = dax_truncate_page(inode, newsize, xfs_get_blocks_direct); + else + error = block_truncate_page(inode->i_mapping, newsize, + xfs_get_blocks); if (error) return error; truncate_setsize(inode, newsize); - /* - * The "we can't serialise against page faults" pain gets worse. - * - * If the file is mapped then we have to clean the page at the old EOF - * when extending the file. Extending the file can expose changes the - * underlying page mapping (e.g. from beyond EOF to a hole or - * unwritten), and so on the next attempt to write to that page we need - * to remap it for write. i.e. we need .page_mkwrite() to be called. - * Hence we need to clean the page to clean the pte and so a new write - * fault will be triggered appropriately. - * - * If we do it before we change the inode size, then we can race with a - * page fault that maps the page with exactly the same problem. If we do - * it after we change the file size, then a new page fault can come in - * and allocate space before we've run the rest of the truncate - * transaction. That's kinda grotesque, but it's better than have data - * over a hole, and so that's the lesser evil that has been chosen here. - * - * The real solution, however, is to have some mechanism for locking out - * page faults while a truncate is in progress. - */ - if (newsize > oldsize && mapping_mapped(VFS_I(ip)->i_mapping)) { - error = filemap_write_and_wait_range( - VFS_I(ip)->i_mapping, - round_down(oldsize, PAGE_CACHE_SIZE), - round_up(oldsize, PAGE_CACHE_SIZE) - 1); - if (error) - return error; - } - tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); if (error) goto out_trans_cancel; - commit_flags = XFS_TRANS_RELEASE_LOG_RES; lock_flags |= XFS_ILOCK_EXCL; xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); @@ -923,7 +900,7 @@ xfs_setattr_size( if (newsize <= oldsize) { error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize); if (error) - goto out_trans_abort; + goto out_trans_cancel; /* * Truncated "down", so we're removing references to old data @@ -950,16 +927,14 @@ xfs_setattr_size( if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); out_unlock: if (lock_flags) xfs_iunlock(ip, lock_flags); return error; -out_trans_abort: - commit_flags |= XFS_TRANS_ABORT; out_trans_cancel: - xfs_trans_cancel(tp, commit_flags); + xfs_trans_cancel(tp); goto out_unlock; } @@ -968,16 +943,20 @@ xfs_vn_setattr( struct dentry *dentry, struct iattr *iattr) { - struct xfs_inode *ip = XFS_I(dentry->d_inode); + struct xfs_inode *ip = XFS_I(d_inode(dentry)); int error; if (iattr->ia_valid & ATTR_SIZE) { uint iolock = XFS_IOLOCK_EXCL; xfs_ilock(ip, iolock); - error = xfs_break_layouts(dentry->d_inode, &iolock); - if (!error) + error = xfs_break_layouts(d_inode(dentry), &iolock, true); + if (!error) { + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + iolock |= XFS_MMAPLOCK_EXCL; + error = xfs_setattr_size(ip, iattr); + } xfs_iunlock(ip, iolock); } else { error = xfs_setattr_nonsize(ip, iattr, 0); @@ -1002,7 +981,7 @@ xfs_vn_update_time( tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + 
xfs_trans_cancel(tp); return error; } @@ -1024,7 +1003,7 @@ xfs_vn_update_time( } xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP); - return xfs_trans_commit(tp, 0); + return xfs_trans_commit(tp); } #define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) @@ -1209,35 +1188,31 @@ xfs_diflags_to_iflags( struct inode *inode, struct xfs_inode *ip) { - if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE) + uint16_t flags = ip->i_d.di_flags; + + inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | + S_NOATIME | S_DAX); + + if (flags & XFS_DIFLAG_IMMUTABLE) inode->i_flags |= S_IMMUTABLE; - else - inode->i_flags &= ~S_IMMUTABLE; - if (ip->i_d.di_flags & XFS_DIFLAG_APPEND) + if (flags & XFS_DIFLAG_APPEND) inode->i_flags |= S_APPEND; - else - inode->i_flags &= ~S_APPEND; - if (ip->i_d.di_flags & XFS_DIFLAG_SYNC) + if (flags & XFS_DIFLAG_SYNC) inode->i_flags |= S_SYNC; - else - inode->i_flags &= ~S_SYNC; - if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME) + if (flags & XFS_DIFLAG_NOATIME) inode->i_flags |= S_NOATIME; - else - inode->i_flags &= ~S_NOATIME; + /* XXX: Also needs an on-disk per inode flag! */ + if (ip->i_mount->m_flags & XFS_MOUNT_DAX) + inode->i_flags |= S_DAX; } /* - * Initialize the Linux inode, set up the operation vectors and - * unlock the inode. - * - * When reading existing inodes from disk this is called directly - * from xfs_iget, when creating a new inode it is called from - * xfs_ialloc after setting up the inode. + * Initialize the Linux inode and set up the operation vectors. * - * We are always called with an uninitialised linux inode here. - * We need to initialise the necessary fields and take a reference - * on it. + * When reading existing inodes from disk this is called directly from xfs_iget, + * when creating a new inode it is called from xfs_ialloc after setting up the + * inode. These callers have different criteria for clearing XFS_INEW, so leave + * it up to the caller to deal with unlocking the inode appropriately. */ void xfs_setup_inode( @@ -1324,9 +1299,4 @@ xfs_setup_inode( inode_has_no_xattr(inode); cache_no_acl(inode); } - - xfs_iflags_clear(ip, XFS_INEW); - barrier(); - - unlock_new_inode(inode); } diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h index ea7a98e9cb70..a0f84abb0d09 100644 --- a/fs/xfs/xfs_iops.h +++ b/fs/xfs/xfs_iops.h @@ -25,8 +25,6 @@ extern const struct file_operations xfs_dir_file_operations; extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size); -extern void xfs_setup_inode(struct xfs_inode *); - /* * Internal setattr interfaces. 
*/ diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 82e314258f73..f41b0c3fddab 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -229,7 +229,7 @@ xfs_bulkstat_grab_ichunk( error = xfs_inobt_get_rec(cur, irec, &stat); if (error) return error; - XFS_WANT_CORRUPTED_RETURN(stat == 1); + XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, stat == 1); /* Check if the record contains the inode in request */ if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino) { @@ -252,7 +252,7 @@ xfs_bulkstat_grab_ichunk( } irec->ir_free |= xfs_inobt_maskn(0, idx); - *icount = XFS_INODES_PER_CHUNK - irec->ir_freecount; + *icount = irec->ir_count - irec->ir_freecount; } return 0; @@ -415,6 +415,8 @@ xfs_bulkstat( goto del_cursor; if (icount) { irbp->ir_startino = r.ir_startino; + irbp->ir_holemask = r.ir_holemask; + irbp->ir_count = r.ir_count; irbp->ir_freecount = r.ir_freecount; irbp->ir_free = r.ir_free; irbp++; @@ -447,13 +449,15 @@ xfs_bulkstat( * If this chunk has any allocated inodes, save it. * Also start read-ahead now for this chunk. */ - if (r.ir_freecount < XFS_INODES_PER_CHUNK) { + if (r.ir_freecount < r.ir_count) { xfs_bulkstat_ichunk_ra(mp, agno, &r); irbp->ir_startino = r.ir_startino; + irbp->ir_holemask = r.ir_holemask; + irbp->ir_count = r.ir_count; irbp->ir_freecount = r.ir_freecount; irbp->ir_free = r.ir_free; irbp++; - icount += XFS_INODES_PER_CHUNK - r.ir_freecount; + icount += r.ir_count - r.ir_freecount; } error = xfs_btree_increment(cur, 0, &stat); if (error || stat == 0) { @@ -599,8 +603,7 @@ xfs_inumbers( agino = r.ir_startino + XFS_INODES_PER_CHUNK - 1; buffer[bufidx].xi_startino = XFS_AGINO_TO_INO(mp, agno, r.ir_startino); - buffer[bufidx].xi_alloccount = - XFS_INODES_PER_CHUNK - r.ir_freecount; + buffer[bufidx].xi_alloccount = r.ir_count - r.ir_freecount; buffer[bufidx].xi_allocmask = ~r.ir_free; if (++bufidx == bcount) { long written; diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index c31d2c2eadc4..85f883dd6207 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -32,26 +32,12 @@ typedef unsigned int __uint32_t; typedef signed long long int __int64_t; typedef unsigned long long int __uint64_t; -typedef __uint32_t inst_t; /* an instruction */ - typedef __s64 xfs_off_t; /* <file offset> type */ typedef unsigned long long xfs_ino_t; /* <inode> type */ typedef __s64 xfs_daddr_t; /* <disk address> type */ -typedef char * xfs_caddr_t; /* <core address> type */ typedef __u32 xfs_dev_t; typedef __u32 xfs_nlink_t; -/* __psint_t is the same size as a pointer */ -#if (BITS_PER_LONG == 32) -typedef __int32_t __psint_t; -typedef __uint32_t __psunsigned_t; -#elif (BITS_PER_LONG == 64) -typedef __int64_t __psint_t; -typedef __uint64_t __psunsigned_t; -#else -#error BITS_PER_LONG must be 32 or 64 -#endif - #include "xfs_types.h" #include "kmem.h" @@ -116,15 +102,6 @@ typedef __uint64_t __psunsigned_t; #undef XFS_NATIVE_HOST #endif -/* - * Feature macros (disable/enable) - */ -#ifdef CONFIG_SMP -#define HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */ -#else -#undef HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */ -#endif - #define irix_sgid_inherit xfs_params.sgid_inherit.val #define irix_symlink_mode xfs_params.symlink_mode.val #define xfs_panic_mask xfs_params.panic_mask.val diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index bcc7cfabb787..08d4fe46f0fa 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -109,7 +109,7 @@ xlog_ungrant_log_space( STATIC void xlog_verify_dest_ptr( struct xlog *log, - char *ptr); + void *ptr); 
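The xlog_verify_iclog() hunks that follow drop the private xfs_caddr_t and __psint_t typedefs in favour of plain char/void pointers plus the standard ptrdiff_t and uintptr_t types. A self-contained illustration of the same idiom; struct op_header and its fields are invented stand-ins for the real xlog_op_header_t layout:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct op_header {			/* hypothetical, for illustration */
	uint32_t oh_len;
	uint8_t  oh_clientid;
};

int main(void)
{
	struct op_header hdr = { 16, 0x69 };
	void *base_ptr = &hdr;
	void *p = &hdr.oh_clientid;

	/* ptrdiff_t replaces the old (__psint_t) pointer-difference casts */
	ptrdiff_t field_offset = (char *)p - (char *)base_ptr;

	/* uintptr_t replaces casting a pointer to __psint_t for arithmetic */
	uintptr_t addr = (uintptr_t)&hdr.oh_len;

	printf("clientid offset %td, oh_len at %#lx\n",
	       field_offset, (unsigned long)addr);
	return 0;
}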
STATIC void xlog_verify_grant_tail( struct xlog *log); @@ -513,7 +513,7 @@ xfs_log_done( struct xfs_mount *mp, struct xlog_ticket *ticket, struct xlog_in_core **iclog, - uint flags) + bool regrant) { struct xlog *log = mp->m_log; xfs_lsn_t lsn = 0; @@ -526,14 +526,11 @@ xfs_log_done( (((ticket->t_flags & XLOG_TIC_INITED) == 0) && (xlog_commit_record(log, ticket, iclog, &lsn)))) { lsn = (xfs_lsn_t) -1; - if (ticket->t_flags & XLOG_TIC_PERM_RESERV) { - flags |= XFS_LOG_REL_PERM_RESERV; - } + regrant = false; } - if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) == 0 || - (flags & XFS_LOG_REL_PERM_RESERV)) { + if (!regrant) { trace_xfs_log_done_nonperm(log, ticket); /* @@ -541,7 +538,6 @@ xfs_log_done( * request has been made to release a permanent reservation. */ xlog_ungrant_log_space(log, ticket); - xfs_log_ticket_put(ticket); } else { trace_xfs_log_done_perm(log, ticket); @@ -553,6 +549,7 @@ xfs_log_done( ticket->t_flags |= XLOG_TIC_INITED; } + xfs_log_ticket_put(ticket); return lsn; } @@ -1447,7 +1444,7 @@ xlog_alloc_log( iclog->ic_bp = bp; iclog->ic_data = bp->b_addr; #ifdef DEBUG - log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header); + log->l_iclog_bak[i] = &iclog->ic_header; #endif head = &iclog->ic_header; memset(head, 0, sizeof(xlog_rec_header_t)); @@ -1602,7 +1599,7 @@ xlog_pack_data( int i, j, k; int size = iclog->ic_offset + roundoff; __be32 cycle_lsn; - xfs_caddr_t dp; + char *dp; cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn); @@ -3664,7 +3661,7 @@ xlog_ticket_alloc( void xlog_verify_dest_ptr( struct xlog *log, - char *ptr) + void *ptr) { int i; int good_ptr = 0; @@ -3767,9 +3764,8 @@ xlog_verify_iclog( xlog_op_header_t *ophead; xlog_in_core_t *icptr; xlog_in_core_2_t *xhdr; - xfs_caddr_t ptr; - xfs_caddr_t base_ptr; - __psint_t field_offset; + void *base_ptr, *ptr, *p; + ptrdiff_t field_offset; __uint8_t clientid; int len, i, j, k, op_len; int idx; @@ -3788,9 +3784,9 @@ xlog_verify_iclog( if (iclog->ic_header.h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) xfs_emerg(log->l_mp, "%s: invalid magic num", __func__); - ptr = (xfs_caddr_t) &iclog->ic_header; - for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&iclog->ic_header) + count; - ptr += BBSIZE) { + base_ptr = ptr = &iclog->ic_header; + p = &iclog->ic_header; + for (ptr += BBSIZE; ptr < base_ptr + count; ptr += BBSIZE) { if (*(__be32 *)ptr == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) xfs_emerg(log->l_mp, "%s: unexpected magic num", __func__); @@ -3798,20 +3794,19 @@ xlog_verify_iclog( /* check fields */ len = be32_to_cpu(iclog->ic_header.h_num_logops); - ptr = iclog->ic_datap; - base_ptr = ptr; - ophead = (xlog_op_header_t *)ptr; + base_ptr = ptr = iclog->ic_datap; + ophead = ptr; xhdr = iclog->ic_data; for (i = 0; i < len; i++) { - ophead = (xlog_op_header_t *)ptr; + ophead = ptr; /* clientid is only 1 byte */ - field_offset = (__psint_t) - ((xfs_caddr_t)&(ophead->oh_clientid) - base_ptr); + p = &ophead->oh_clientid; + field_offset = p - base_ptr; if (!syncing || (field_offset & 0x1ff)) { clientid = ophead->oh_clientid; } else { - idx = BTOBBT((xfs_caddr_t)&(ophead->oh_clientid) - iclog->ic_datap); + idx = BTOBBT((char *)&ophead->oh_clientid - iclog->ic_datap); if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); @@ -3829,13 +3824,13 @@ xlog_verify_iclog( (unsigned long)field_offset); /* check length */ - field_offset = (__psint_t) - ((xfs_caddr_t)&(ophead->oh_len) - base_ptr); + p = &ophead->oh_len; + field_offset = p - base_ptr; if (!syncing || 
(field_offset & 0x1ff)) { op_len = be32_to_cpu(ophead->oh_len); } else { - idx = BTOBBT((__psint_t)&ophead->oh_len - - (__psint_t)iclog->ic_datap); + idx = BTOBBT((uintptr_t)&ophead->oh_len - + (uintptr_t)iclog->ic_datap); if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 84e0deb95abd..fa27aaec72cb 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -111,15 +111,6 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2) #define XFS_LSN_CMP(x,y) _lsn_cmp(x,y) /* - * Macros, structures, prototypes for interface to the log manager. - */ - -/* - * Flags to xfs_log_done() - */ -#define XFS_LOG_REL_PERM_RESERV 0x1 - -/* * Flags to xfs_log_force() * * XFS_LOG_SYNC: Synchronous force in-core log to disk @@ -138,7 +129,7 @@ struct xfs_log_callback; xfs_lsn_t xfs_log_done(struct xfs_mount *mp, struct xlog_ticket *ticket, struct xlog_in_core **iclog, - uint flags); + bool regrant); int _xfs_log_force(struct xfs_mount *mp, uint flags, int *log_forced); @@ -183,7 +174,7 @@ struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); void xfs_log_ticket_put(struct xlog_ticket *ticket); void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_lsn_t *commit_lsn, int flags); + xfs_lsn_t *commit_lsn, bool regrant); bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); void xfs_log_work_queue(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 45cc0ce18adf..abc2ccbff739 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -624,7 +624,7 @@ restart: spin_unlock(&cil->xc_push_lock); /* xfs_log_done always frees the ticket on error. */ - commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0); + commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, false); if (commit_lsn == -1) goto out_abort; @@ -773,14 +773,10 @@ xfs_log_commit_cil( struct xfs_mount *mp, struct xfs_trans *tp, xfs_lsn_t *commit_lsn, - int flags) + bool regrant) { struct xlog *log = mp->m_log; struct xfs_cil *cil = log->l_cilp; - int log_flags = 0; - - if (flags & XFS_TRANS_RELEASE_LOG_RES) - log_flags = XFS_LOG_REL_PERM_RESERV; /* lock out background commit */ down_read(&cil->xc_ctx_lock); @@ -795,7 +791,7 @@ xfs_log_commit_cil( if (commit_lsn) *commit_lsn = tp->t_commit_lsn; - xfs_log_done(mp, tp->t_ticket, NULL, log_flags); + xfs_log_done(mp, tp->t_ticket, NULL, regrant); xfs_trans_unreserve_and_mod_sb(tp); /* @@ -809,7 +805,7 @@ xfs_log_commit_cil( * the log items. This affects (at least) processing of stale buffers, * inodes and EFIs. */ - xfs_trans_free_items(tp, tp->t_commit_lsn, 0); + xfs_trans_free_items(tp, tp->t_commit_lsn, false); xlog_cil_push_background(log); diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index db7cbdeb2b42..1c87c8abfbed 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -409,7 +409,7 @@ struct xlog { /* The following field are used for debugging; need to hold icloglock */ #ifdef DEBUG - char *l_iclog_bak[XLOG_MAX_ICLOGS]; + void *l_iclog_bak[XLOG_MAX_ICLOGS]; #endif }; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index a5a945fc3bdc..480ebba8464f 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -147,7 +147,7 @@ xlog_put_bp( * Return the address of the start of the given block number's data * in a log buffer. The buffer covers a log sector-aligned region. 
*/ -STATIC xfs_caddr_t +STATIC char * xlog_align( struct xlog *log, xfs_daddr_t blk_no, @@ -203,7 +203,7 @@ xlog_bread( xfs_daddr_t blk_no, int nbblks, struct xfs_buf *bp, - xfs_caddr_t *offset) + char **offset) { int error; @@ -225,9 +225,9 @@ xlog_bread_offset( xfs_daddr_t blk_no, /* block to read from */ int nbblks, /* blocks to read */ struct xfs_buf *bp, - xfs_caddr_t offset) + char *offset) { - xfs_caddr_t orig_offset = bp->b_addr; + char *orig_offset = bp->b_addr; int orig_len = BBTOB(bp->b_length); int error, error2; @@ -396,7 +396,7 @@ xlog_find_cycle_start( xfs_daddr_t *last_blk, uint cycle) { - xfs_caddr_t offset; + char *offset; xfs_daddr_t mid_blk; xfs_daddr_t end_blk; uint mid_cycle; @@ -443,7 +443,7 @@ xlog_find_verify_cycle( uint cycle; xfs_buf_t *bp; xfs_daddr_t bufblks; - xfs_caddr_t buf = NULL; + char *buf = NULL; int error = 0; /* @@ -509,7 +509,7 @@ xlog_find_verify_log_record( { xfs_daddr_t i; xfs_buf_t *bp; - xfs_caddr_t offset = NULL; + char *offset = NULL; xlog_rec_header_t *head = NULL; int error = 0; int smallmem = 0; @@ -616,7 +616,7 @@ xlog_find_head( xfs_daddr_t *return_head_blk) { xfs_buf_t *bp; - xfs_caddr_t offset; + char *offset; xfs_daddr_t new_blk, first_blk, start_blk, last_blk, head_blk; int num_scan_bblks; uint first_half_cycle, last_half_cycle; @@ -891,7 +891,7 @@ xlog_find_tail( { xlog_rec_header_t *rhead; xlog_op_header_t *op_head; - xfs_caddr_t offset = NULL; + char *offset = NULL; xfs_buf_t *bp; int error, i, found; xfs_daddr_t umount_data_blk; @@ -1099,7 +1099,7 @@ xlog_find_zeroed( xfs_daddr_t *blk_no) { xfs_buf_t *bp; - xfs_caddr_t offset; + char *offset; uint first_cycle, last_cycle; xfs_daddr_t new_blk, last_blk, start_blk; xfs_daddr_t num_scan_bblks; @@ -1199,7 +1199,7 @@ bp_err: STATIC void xlog_add_record( struct xlog *log, - xfs_caddr_t buf, + char *buf, int cycle, int block, int tail_cycle, @@ -1227,7 +1227,7 @@ xlog_write_log_records( int tail_cycle, int tail_block) { - xfs_caddr_t offset; + char *offset; xfs_buf_t *bp; int balign, ealign; int sectbb = log->l_sectBBsize; @@ -1789,8 +1789,7 @@ xlog_recover_do_inode_buffer( return -EFSCORRUPTED; } - buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp, - next_unlinked_offset); + buffer_nextp = xfs_buf_offset(bp, next_unlinked_offset); *buffer_nextp = *logged_nextp; /* @@ -1798,7 +1797,7 @@ xlog_recover_do_inode_buffer( * have to leave the inode in a consistent state for whoever * reads it next.... */ - xfs_dinode_calc_crc(mp, (struct xfs_dinode *) + xfs_dinode_calc_crc(mp, xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize)); } @@ -1887,9 +1886,14 @@ xlog_recover_get_buf_lsn( uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid; break; case XFS_ATTR3_RMT_MAGIC: - lsn = be64_to_cpu(((struct xfs_attr3_rmt_hdr *)blk)->rm_lsn); - uuid = &((struct xfs_attr3_rmt_hdr *)blk)->rm_uuid; - break; + /* + * Remote attr blocks are written synchronously, rather than + * being logged. That means they do not contain a valid LSN + * (i.e. transactionally ordered) in them, and hence any time we + * see a buffer to replay over the top of a remote attribute + * block we should simply do so. 
+ */ + goto recover_immediately; case XFS_SB_MAGIC: lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn); uuid = &((struct xfs_dsb *)blk)->sb_uuid; @@ -2503,8 +2507,8 @@ xlog_recover_inode_pass2( xfs_buf_t *bp; xfs_dinode_t *dip; int len; - xfs_caddr_t src; - xfs_caddr_t dest; + char *src; + char *dest; int error; int attr_index; uint fields; @@ -2546,7 +2550,7 @@ xlog_recover_inode_pass2( goto out_release; } ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); - dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset); + dip = xfs_buf_offset(bp, in_f->ilf_boffset); /* * Make sure the place we're flushing out to really looks @@ -2885,7 +2889,7 @@ xlog_recover_dquot_pass2( return error; ASSERT(bp); - ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset); + ddq = xfs_buf_offset(bp, dq_f->qlf_boffset); /* * If the dquot has an LSN in it, recover the dquot only if it's less @@ -3068,12 +3072,22 @@ xlog_recover_do_icreate_pass2( return -EINVAL; } - /* existing allocation is fixed value */ - ASSERT(count == mp->m_ialloc_inos); - ASSERT(length == mp->m_ialloc_blks); - if (count != mp->m_ialloc_inos || - length != mp->m_ialloc_blks) { - xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2"); + /* + * The inode chunk is either full or sparse and we only support + * m_ialloc_min_blks sized sparse allocations at this time. + */ + if (length != mp->m_ialloc_blks && + length != mp->m_ialloc_min_blks) { + xfs_warn(log->l_mp, + "%s: unsupported chunk length", __FUNCTION__); + return -EINVAL; + } + + /* verify inode count is consistent with extent length */ + if ((count >> mp->m_sb.sb_inopblog) != length) { + xfs_warn(log->l_mp, + "%s: inconsistent inode count and chunk length", + __FUNCTION__); return -EINVAL; } @@ -3091,8 +3105,8 @@ xlog_recover_do_icreate_pass2( XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0)) return 0; - xfs_ialloc_inode_init(mp, NULL, buffer_list, agno, agbno, length, - be32_to_cpu(icl->icl_gen)); + xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno, length, + be32_to_cpu(icl->icl_gen)); return 0; } @@ -3364,17 +3378,17 @@ STATIC int xlog_recover_add_to_cont_trans( struct xlog *log, struct xlog_recover *trans, - xfs_caddr_t dp, + char *dp, int len) { xlog_recover_item_t *item; - xfs_caddr_t ptr, old_ptr; + char *ptr, *old_ptr; int old_len; if (list_empty(&trans->r_itemq)) { /* finish copying rest of trans header */ xlog_recover_add_item(&trans->r_itemq); - ptr = (xfs_caddr_t) &trans->r_theader + + ptr = (char *)&trans->r_theader + sizeof(xfs_trans_header_t) - len; memcpy(ptr, dp, len); return 0; @@ -3410,12 +3424,12 @@ STATIC int xlog_recover_add_to_trans( struct xlog *log, struct xlog_recover *trans, - xfs_caddr_t dp, + char *dp, int len) { xfs_inode_log_format_t *in_f; /* any will do */ xlog_recover_item_t *item; - xfs_caddr_t ptr; + char *ptr; if (!len) return 0; @@ -3504,7 +3518,7 @@ STATIC int xlog_recovery_process_trans( struct xlog *log, struct xlog_recover *trans, - xfs_caddr_t dp, + char *dp, unsigned int len, unsigned int flags, int pass) @@ -3611,8 +3625,8 @@ xlog_recover_process_ophdr( struct hlist_head rhash[], struct xlog_rec_header *rhead, struct xlog_op_header *ohead, - xfs_caddr_t dp, - xfs_caddr_t end, + char *dp, + char *end, int pass) { struct xlog_recover *trans; @@ -3661,11 +3675,11 @@ xlog_recover_process_data( struct xlog *log, struct hlist_head rhash[], struct xlog_rec_header *rhead, - xfs_caddr_t dp, + char *dp, int pass) { struct xlog_op_header *ohead; - xfs_caddr_t end; + char *end; int num_logops; int error; @@ -3751,11 +3765,11 @@ 
xlog_recover_process_efi( } set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); return error; abort_error: - xfs_trans_cancel(tp, XFS_TRANS_ABORT); + xfs_trans_cancel(tp); return error; } @@ -3857,13 +3871,13 @@ xlog_recover_clear_agi_bucket( xfs_trans_log_buf(tp, agibp, offset, (offset + sizeof(xfs_agino_t) - 1)); - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); if (error) goto out_error; return; out_abort: - xfs_trans_cancel(tp, XFS_TRANS_ABORT); + xfs_trans_cancel(tp); out_error: xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno); return; @@ -4010,7 +4024,7 @@ xlog_recover_process_iunlinks( STATIC int xlog_unpack_data_crc( struct xlog_rec_header *rhead, - xfs_caddr_t dp, + char *dp, struct xlog *log) { __le32 crc; @@ -4040,7 +4054,7 @@ xlog_unpack_data_crc( STATIC int xlog_unpack_data( struct xlog_rec_header *rhead, - xfs_caddr_t dp, + char *dp, struct xlog *log) { int i, j, k; @@ -4122,7 +4136,7 @@ xlog_do_recovery_pass( { xlog_rec_header_t *rhead; xfs_daddr_t blk_no; - xfs_caddr_t offset; + char *offset; xfs_buf_t *hbp, *dbp; int error = 0, h_size; int bblks, split_bblks; @@ -4463,10 +4477,10 @@ xlog_do_recover( xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp)); ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC); ASSERT(xfs_sb_good_version(sbp)); + xfs_reinit_percpu_counters(log->l_mp); + xfs_buf_relse(bp); - /* We've re-read the superblock so re-initialize per-cpu counters */ - xfs_icsb_reinit_counters(log->l_mp); xlog_recover_check_summary(log); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 4fa80e63eea2..461e791efad7 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -43,18 +43,6 @@ #include "xfs_sysfs.h" -#ifdef HAVE_PERCPU_SB -STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t, - int); -STATIC void xfs_icsb_balance_counter_locked(xfs_mount_t *, xfs_sb_field_t, - int); -STATIC void xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t); -#else - -#define xfs_icsb_balance_counter(mp, a, b) do { } while (0) -#define xfs_icsb_balance_counter_locked(mp, a, b) do { } while (0) -#endif - static DEFINE_MUTEX(xfs_uuid_table_mutex); static int xfs_uuid_table_size; static uuid_t *xfs_uuid_table; @@ -347,8 +335,7 @@ reread: goto reread; } - /* Initialize per-cpu counters */ - xfs_icsb_reinit_counters(mp); + xfs_reinit_percpu_counters(mp); /* no need to be quiet anymore, so reset the buf ops */ bp->b_ops = &xfs_sb_buf_ops; @@ -738,6 +725,22 @@ xfs_mountfs( } /* + * If enabled, sparse inode chunk alignment is expected to match the + * cluster size. Full inode chunk alignment must match the chunk size, + * but that is checked on sb read verification... + */ + if (xfs_sb_version_hassparseinodes(&mp->m_sb) && + mp->m_sb.sb_spino_align != + XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)) { + xfs_warn(mp, + "Sparse inode block alignment (%u) must match cluster size (%llu).", + mp->m_sb.sb_spino_align, + XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)); + error = -EINVAL; + goto out_remove_uuid; + } + + /* * Set inode alignment fields */ xfs_set_inoalignment(mp); @@ -1087,8 +1090,6 @@ xfs_log_sbcount(xfs_mount_t *mp) if (!xfs_fs_writable(mp, SB_FREEZE_COMPLETE)) return 0; - xfs_icsb_sync_counters(mp, 0); - /* * we don't need to do this if we are updating the superblock * counters on every modification. 
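The xfs_mount.c rewrite that follows replaces the one big switch in xfs_mod_incore_sb_unlocked() with dedicated per-counter helpers (xfs_mod_icount, xfs_mod_ifree, xfs_mod_fdblocks) built on generic per-cpu counters, each using the same apply/verify/back-out pattern. A stand-alone sketch of that pattern, with a plain C11 atomic standing in for the kernel's percpu_counter; the initial value and error code are illustrative:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static atomic_llong icount = 64;	/* stand-in for mp->m_icount */

/* Apply the delta first, then back it out if the counter went negative. */
static int mod_icount(int64_t delta)
{
	long long now = atomic_fetch_add(&icount, delta) + delta;

	if (now < 0) {
		atomic_fetch_sub(&icount, delta);	/* undo */
		return -22;				/* -EINVAL analogue */
	}
	return 0;
}

int main(void)
{
	printf("grow by 64:    %d\n", mod_icount(64));	 /* succeeds */
	printf("shrink by 256: %d\n", mod_icount(-256)); /* fails, undone */
	printf("icount now:    %lld\n",
	       (long long)atomic_load(&icount));
	return 0;
}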
@@ -1100,252 +1101,141 @@ xfs_log_sbcount(xfs_mount_t *mp) } /* - * xfs_mod_incore_sb_unlocked() is a utility routine commonly used to apply - * a delta to a specified field in the in-core superblock. Simply - * switch on the field indicated and apply the delta to that field. - * Fields are not allowed to dip below zero, so if the delta would - * do this do not apply it and return EINVAL. - * - * The m_sb_lock must be held when this routine is called. + * Deltas for the inode count are +/-64, hence we use a large batch size + * of 128 so we don't need to take the counter lock on every update. */ -STATIC int -xfs_mod_incore_sb_unlocked( - xfs_mount_t *mp, - xfs_sb_field_t field, - int64_t delta, - int rsvd) +#define XFS_ICOUNT_BATCH 128 +int +xfs_mod_icount( + struct xfs_mount *mp, + int64_t delta) { - int scounter; /* short counter for 32 bit fields */ - long long lcounter; /* long counter for 64 bit fields */ - long long res_used, rem; - - /* - * With the in-core superblock spin lock held, switch - * on the indicated field. Apply the delta to the - * proper field. If the fields value would dip below - * 0, then do not apply the delta and return EINVAL. - */ - switch (field) { - case XFS_SBS_ICOUNT: - lcounter = (long long)mp->m_sb.sb_icount; - lcounter += delta; - if (lcounter < 0) { - ASSERT(0); - return -EINVAL; - } - mp->m_sb.sb_icount = lcounter; - return 0; - case XFS_SBS_IFREE: - lcounter = (long long)mp->m_sb.sb_ifree; - lcounter += delta; - if (lcounter < 0) { - ASSERT(0); - return -EINVAL; - } - mp->m_sb.sb_ifree = lcounter; - return 0; - case XFS_SBS_FDBLOCKS: - lcounter = (long long) - mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); - res_used = (long long)(mp->m_resblks - mp->m_resblks_avail); - - if (delta > 0) { /* Putting blocks back */ - if (res_used > delta) { - mp->m_resblks_avail += delta; - } else { - rem = delta - res_used; - mp->m_resblks_avail = mp->m_resblks; - lcounter += rem; - } - } else { /* Taking blocks away */ - lcounter += delta; - if (lcounter >= 0) { - mp->m_sb.sb_fdblocks = lcounter + - XFS_ALLOC_SET_ASIDE(mp); - return 0; - } - - /* - * We are out of blocks, use any available reserved - * blocks if were allowed to. - */ - if (!rsvd) - return -ENOSPC; - - lcounter = (long long)mp->m_resblks_avail + delta; - if (lcounter >= 0) { - mp->m_resblks_avail = lcounter; - return 0; - } - printk_once(KERN_WARNING - "Filesystem \"%s\": reserve blocks depleted! 
" - "Consider increasing reserve pool size.", - mp->m_fsname); - return -ENOSPC; - } - - mp->m_sb.sb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp); - return 0; - case XFS_SBS_FREXTENTS: - lcounter = (long long)mp->m_sb.sb_frextents; - lcounter += delta; - if (lcounter < 0) { - return -ENOSPC; - } - mp->m_sb.sb_frextents = lcounter; - return 0; - case XFS_SBS_DBLOCKS: - lcounter = (long long)mp->m_sb.sb_dblocks; - lcounter += delta; - if (lcounter < 0) { - ASSERT(0); - return -EINVAL; - } - mp->m_sb.sb_dblocks = lcounter; - return 0; - case XFS_SBS_AGCOUNT: - scounter = mp->m_sb.sb_agcount; - scounter += delta; - if (scounter < 0) { - ASSERT(0); - return -EINVAL; - } - mp->m_sb.sb_agcount = scounter; - return 0; - case XFS_SBS_IMAX_PCT: - scounter = mp->m_sb.sb_imax_pct; - scounter += delta; - if (scounter < 0) { - ASSERT(0); - return -EINVAL; - } - mp->m_sb.sb_imax_pct = scounter; - return 0; - case XFS_SBS_REXTSIZE: - scounter = mp->m_sb.sb_rextsize; - scounter += delta; - if (scounter < 0) { - ASSERT(0); - return -EINVAL; - } - mp->m_sb.sb_rextsize = scounter; - return 0; - case XFS_SBS_RBMBLOCKS: - scounter = mp->m_sb.sb_rbmblocks; - scounter += delta; - if (scounter < 0) { - ASSERT(0); - return -EINVAL; - } - mp->m_sb.sb_rbmblocks = scounter; - return 0; - case XFS_SBS_RBLOCKS: - lcounter = (long long)mp->m_sb.sb_rblocks; - lcounter += delta; - if (lcounter < 0) { - ASSERT(0); - return -EINVAL; - } - mp->m_sb.sb_rblocks = lcounter; - return 0; - case XFS_SBS_REXTENTS: - lcounter = (long long)mp->m_sb.sb_rextents; - lcounter += delta; - if (lcounter < 0) { - ASSERT(0); - return -EINVAL; - } - mp->m_sb.sb_rextents = lcounter; - return 0; - case XFS_SBS_REXTSLOG: - scounter = mp->m_sb.sb_rextslog; - scounter += delta; - if (scounter < 0) { - ASSERT(0); - return -EINVAL; - } - mp->m_sb.sb_rextslog = scounter; - return 0; - default: + __percpu_counter_add(&mp->m_icount, delta, XFS_ICOUNT_BATCH); + if (__percpu_counter_compare(&mp->m_icount, 0, XFS_ICOUNT_BATCH) < 0) { ASSERT(0); + percpu_counter_add(&mp->m_icount, -delta); return -EINVAL; } + return 0; } -/* - * xfs_mod_incore_sb() is used to change a field in the in-core - * superblock structure by the specified delta. This modification - * is protected by the m_sb_lock. Just use the xfs_mod_incore_sb_unlocked() - * routine to do the work. - */ int -xfs_mod_incore_sb( +xfs_mod_ifree( struct xfs_mount *mp, - xfs_sb_field_t field, - int64_t delta, - int rsvd) + int64_t delta) { - int status; - -#ifdef HAVE_PERCPU_SB - ASSERT(field < XFS_SBS_ICOUNT || field > XFS_SBS_FDBLOCKS); -#endif - spin_lock(&mp->m_sb_lock); - status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd); - spin_unlock(&mp->m_sb_lock); - - return status; + percpu_counter_add(&mp->m_ifree, delta); + if (percpu_counter_compare(&mp->m_ifree, 0) < 0) { + ASSERT(0); + percpu_counter_add(&mp->m_ifree, -delta); + return -EINVAL; + } + return 0; } /* - * Change more than one field in the in-core superblock structure at a time. - * - * The fields and changes to those fields are specified in the array of - * xfs_mod_sb structures passed in. Either all of the specified deltas - * will be applied or none of them will. If any modified field dips below 0, - * then all modifications will be backed out and EINVAL will be returned. - * - * Note that this function may not be used for the superblock values that - * are tracked with the in-memory per-cpu counters - a direct call to - * xfs_icsb_modify_counters is required for these. 
+ * Deltas for the block count can vary from 1 to very large, but lock contention + * only occurs on frequent small block count updates such as in the delayed + * allocation path for buffered writes (page a time updates). Hence we set + * a large batch count (1024) to minimise global counter updates except when + * we get near to ENOSPC and we have to be very accurate with our updates. */ +#define XFS_FDBLOCKS_BATCH 1024 int -xfs_mod_incore_sb_batch( +xfs_mod_fdblocks( struct xfs_mount *mp, - xfs_mod_sb_t *msb, - uint nmsb, - int rsvd) + int64_t delta, + bool rsvd) { - xfs_mod_sb_t *msbp; - int error = 0; + int64_t lcounter; + long long res_used; + s32 batch; + + if (delta > 0) { + /* + * If the reserve pool is depleted, put blocks back into it + * first. Most of the time the pool is full. + */ + if (likely(mp->m_resblks == mp->m_resblks_avail)) { + percpu_counter_add(&mp->m_fdblocks, delta); + return 0; + } + + spin_lock(&mp->m_sb_lock); + res_used = (long long)(mp->m_resblks - mp->m_resblks_avail); + + if (res_used > delta) { + mp->m_resblks_avail += delta; + } else { + delta -= res_used; + mp->m_resblks_avail = mp->m_resblks; + percpu_counter_add(&mp->m_fdblocks, delta); + } + spin_unlock(&mp->m_sb_lock); + return 0; + } /* - * Loop through the array of mod structures and apply each individually. - * If any fail, then back out all those which have already been applied. - * Do all of this within the scope of the m_sb_lock so that all of the - * changes will be atomic. + * Taking blocks away, need to be more accurate the closer we + * are to zero. + * + * If the counter has a value of less than 2 * max batch size, + * then make everything serialise as we are real close to + * ENOSPC. + */ + if (__percpu_counter_compare(&mp->m_fdblocks, 2 * XFS_FDBLOCKS_BATCH, + XFS_FDBLOCKS_BATCH) < 0) + batch = 1; + else + batch = XFS_FDBLOCKS_BATCH; + + __percpu_counter_add(&mp->m_fdblocks, delta, batch); + if (__percpu_counter_compare(&mp->m_fdblocks, XFS_ALLOC_SET_ASIDE(mp), + XFS_FDBLOCKS_BATCH) >= 0) { + /* we had space! */ + return 0; + } + + /* + * lock up the sb for dipping into reserves before releasing the space + * that took us to ENOSPC. */ spin_lock(&mp->m_sb_lock); - for (msbp = msb; msbp < (msb + nmsb); msbp++) { - ASSERT(msbp->msb_field < XFS_SBS_ICOUNT || - msbp->msb_field > XFS_SBS_FDBLOCKS); + percpu_counter_add(&mp->m_fdblocks, -delta); + if (!rsvd) + goto fdblocks_enospc; - error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field, - msbp->msb_delta, rsvd); - if (error) - goto unwind; + lcounter = (long long)mp->m_resblks_avail + delta; + if (lcounter >= 0) { + mp->m_resblks_avail = lcounter; + spin_unlock(&mp->m_sb_lock); + return 0; } + printk_once(KERN_WARNING + "Filesystem \"%s\": reserve blocks depleted! 
" + "Consider increasing reserve pool size.", + mp->m_fsname); +fdblocks_enospc: spin_unlock(&mp->m_sb_lock); - return 0; + return -ENOSPC; +} -unwind: - while (--msbp >= msb) { - error = xfs_mod_incore_sb_unlocked(mp, msbp->msb_field, - -msbp->msb_delta, rsvd); - ASSERT(error == 0); - } +int +xfs_mod_frextents( + struct xfs_mount *mp, + int64_t delta) +{ + int64_t lcounter; + int ret = 0; + + spin_lock(&mp->m_sb_lock); + lcounter = mp->m_sb.sb_frextents + delta; + if (lcounter < 0) + ret = -ENOSPC; + else + mp->m_sb.sb_frextents = lcounter; spin_unlock(&mp->m_sb_lock); - return error; + return ret; } /* @@ -1407,573 +1297,3 @@ xfs_dev_is_read_only( } return 0; } - -#ifdef HAVE_PERCPU_SB -/* - * Per-cpu incore superblock counters - * - * Simple concept, difficult implementation - * - * Basically, replace the incore superblock counters with a distributed per cpu - * counter for contended fields (e.g. free block count). - * - * Difficulties arise in that the incore sb is used for ENOSPC checking, and - * hence needs to be accurately read when we are running low on space. Hence - * there is a method to enable and disable the per-cpu counters based on how - * much "stuff" is available in them. - * - * Basically, a counter is enabled if there is enough free resource to justify - * running a per-cpu fast-path. If the per-cpu counter runs out (i.e. a local - * ENOSPC), then we disable the counters to synchronise all callers and - * re-distribute the available resources. - * - * If, once we redistributed the available resources, we still get a failure, - * we disable the per-cpu counter and go through the slow path. - * - * The slow path is the current xfs_mod_incore_sb() function. This means that - * when we disable a per-cpu counter, we need to drain its resources back to - * the global superblock. We do this after disabling the counter to prevent - * more threads from queueing up on the counter. - * - * Essentially, this means that we still need a lock in the fast path to enable - * synchronisation between the global counters and the per-cpu counters. This - * is not a problem because the lock will be local to a CPU almost all the time - * and have little contention except when we get to ENOSPC conditions. - * - * Basically, this lock becomes a barrier that enables us to lock out the fast - * path while we do things like enabling and disabling counters and - * synchronising the counters. - * - * Locking rules: - * - * 1. m_sb_lock before picking up per-cpu locks - * 2. per-cpu locks always picked up via for_each_online_cpu() order - * 3. accurate counter sync requires m_sb_lock + per cpu locks - * 4. modifying per-cpu counters requires holding per-cpu lock - * 5. modifying global counters requires holding m_sb_lock - * 6. enabling or disabling a counter requires holding the m_sb_lock - * and _none_ of the per-cpu locks. - * - * Disabled counters are only ever re-enabled by a balance operation - * that results in more free resources per CPU than a given threshold. - * To ensure counters don't remain disabled, they are rebalanced when - * the global resource goes above a higher threshold (i.e. some hysteresis - * is present to prevent thrashing). - */ - -#ifdef CONFIG_HOTPLUG_CPU -/* - * hot-plug CPU notifier support. - * - * We need a notifier per filesystem as we need to be able to identify - * the filesystem to balance the counters out. 
This is achieved by - * having a notifier block embedded in the xfs_mount_t and doing pointer - * magic to get the mount pointer from the notifier block address. - */ -STATIC int -xfs_icsb_cpu_notify( - struct notifier_block *nfb, - unsigned long action, - void *hcpu) -{ - xfs_icsb_cnts_t *cntp; - xfs_mount_t *mp; - - mp = (xfs_mount_t *)container_of(nfb, xfs_mount_t, m_icsb_notifier); - cntp = (xfs_icsb_cnts_t *) - per_cpu_ptr(mp->m_sb_cnts, (unsigned long)hcpu); - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - /* Easy Case - initialize the area and locks, and - * then rebalance when online does everything else for us. */ - memset(cntp, 0, sizeof(xfs_icsb_cnts_t)); - break; - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - xfs_icsb_lock(mp); - xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0); - xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0); - xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0); - xfs_icsb_unlock(mp); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - /* Disable all the counters, then fold the dead cpu's - * count into the total on the global superblock and - * re-enable the counters. */ - xfs_icsb_lock(mp); - spin_lock(&mp->m_sb_lock); - xfs_icsb_disable_counter(mp, XFS_SBS_ICOUNT); - xfs_icsb_disable_counter(mp, XFS_SBS_IFREE); - xfs_icsb_disable_counter(mp, XFS_SBS_FDBLOCKS); - - mp->m_sb.sb_icount += cntp->icsb_icount; - mp->m_sb.sb_ifree += cntp->icsb_ifree; - mp->m_sb.sb_fdblocks += cntp->icsb_fdblocks; - - memset(cntp, 0, sizeof(xfs_icsb_cnts_t)); - - xfs_icsb_balance_counter_locked(mp, XFS_SBS_ICOUNT, 0); - xfs_icsb_balance_counter_locked(mp, XFS_SBS_IFREE, 0); - xfs_icsb_balance_counter_locked(mp, XFS_SBS_FDBLOCKS, 0); - spin_unlock(&mp->m_sb_lock); - xfs_icsb_unlock(mp); - break; - } - - return NOTIFY_OK; -} -#endif /* CONFIG_HOTPLUG_CPU */ - -int -xfs_icsb_init_counters( - xfs_mount_t *mp) -{ - xfs_icsb_cnts_t *cntp; - int i; - - mp->m_sb_cnts = alloc_percpu(xfs_icsb_cnts_t); - if (mp->m_sb_cnts == NULL) - return -ENOMEM; - - for_each_online_cpu(i) { - cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i); - memset(cntp, 0, sizeof(xfs_icsb_cnts_t)); - } - - mutex_init(&mp->m_icsb_mutex); - - /* - * start with all counters disabled so that the - * initial balance kicks us off correctly - */ - mp->m_icsb_counters = -1; - -#ifdef CONFIG_HOTPLUG_CPU - mp->m_icsb_notifier.notifier_call = xfs_icsb_cpu_notify; - mp->m_icsb_notifier.priority = 0; - register_hotcpu_notifier(&mp->m_icsb_notifier); -#endif /* CONFIG_HOTPLUG_CPU */ - - return 0; -} - -void -xfs_icsb_reinit_counters( - xfs_mount_t *mp) -{ - xfs_icsb_lock(mp); - /* - * start with all counters disabled so that the - * initial balance kicks us off correctly - */ - mp->m_icsb_counters = -1; - xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0); - xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0); - xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0); - xfs_icsb_unlock(mp); -} - -void -xfs_icsb_destroy_counters( - xfs_mount_t *mp) -{ - if (mp->m_sb_cnts) { - unregister_hotcpu_notifier(&mp->m_icsb_notifier); - free_percpu(mp->m_sb_cnts); - } - mutex_destroy(&mp->m_icsb_mutex); -} - -STATIC void -xfs_icsb_lock_cntr( - xfs_icsb_cnts_t *icsbp) -{ - while (test_and_set_bit(XFS_ICSB_FLAG_LOCK, &icsbp->icsb_flags)) { - ndelay(1000); - } -} - -STATIC void -xfs_icsb_unlock_cntr( - xfs_icsb_cnts_t *icsbp) -{ - clear_bit(XFS_ICSB_FLAG_LOCK, &icsbp->icsb_flags); -} - - -STATIC void -xfs_icsb_lock_all_counters( - xfs_mount_t *mp) -{ - xfs_icsb_cnts_t *cntp; - int i; - - for_each_online_cpu(i) { - cntp = 
(xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i); - xfs_icsb_lock_cntr(cntp); - } -} - -STATIC void -xfs_icsb_unlock_all_counters( - xfs_mount_t *mp) -{ - xfs_icsb_cnts_t *cntp; - int i; - - for_each_online_cpu(i) { - cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i); - xfs_icsb_unlock_cntr(cntp); - } -} - -STATIC void -xfs_icsb_count( - xfs_mount_t *mp, - xfs_icsb_cnts_t *cnt, - int flags) -{ - xfs_icsb_cnts_t *cntp; - int i; - - memset(cnt, 0, sizeof(xfs_icsb_cnts_t)); - - if (!(flags & XFS_ICSB_LAZY_COUNT)) - xfs_icsb_lock_all_counters(mp); - - for_each_online_cpu(i) { - cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i); - cnt->icsb_icount += cntp->icsb_icount; - cnt->icsb_ifree += cntp->icsb_ifree; - cnt->icsb_fdblocks += cntp->icsb_fdblocks; - } - - if (!(flags & XFS_ICSB_LAZY_COUNT)) - xfs_icsb_unlock_all_counters(mp); -} - -STATIC int -xfs_icsb_counter_disabled( - xfs_mount_t *mp, - xfs_sb_field_t field) -{ - ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS)); - return test_bit(field, &mp->m_icsb_counters); -} - -STATIC void -xfs_icsb_disable_counter( - xfs_mount_t *mp, - xfs_sb_field_t field) -{ - xfs_icsb_cnts_t cnt; - - ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS)); - - /* - * If we are already disabled, then there is nothing to do - * here. We check before locking all the counters to avoid - * the expensive lock operation when being called in the - * slow path and the counter is already disabled. This is - * safe because the only time we set or clear this state is under - * the m_icsb_mutex. - */ - if (xfs_icsb_counter_disabled(mp, field)) - return; - - xfs_icsb_lock_all_counters(mp); - if (!test_and_set_bit(field, &mp->m_icsb_counters)) { - /* drain back to superblock */ - - xfs_icsb_count(mp, &cnt, XFS_ICSB_LAZY_COUNT); - switch(field) { - case XFS_SBS_ICOUNT: - mp->m_sb.sb_icount = cnt.icsb_icount; - break; - case XFS_SBS_IFREE: - mp->m_sb.sb_ifree = cnt.icsb_ifree; - break; - case XFS_SBS_FDBLOCKS: - mp->m_sb.sb_fdblocks = cnt.icsb_fdblocks; - break; - default: - BUG(); - } - } - - xfs_icsb_unlock_all_counters(mp); -} - -STATIC void -xfs_icsb_enable_counter( - xfs_mount_t *mp, - xfs_sb_field_t field, - uint64_t count, - uint64_t resid) -{ - xfs_icsb_cnts_t *cntp; - int i; - - ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS)); - - xfs_icsb_lock_all_counters(mp); - for_each_online_cpu(i) { - cntp = per_cpu_ptr(mp->m_sb_cnts, i); - switch (field) { - case XFS_SBS_ICOUNT: - cntp->icsb_icount = count + resid; - break; - case XFS_SBS_IFREE: - cntp->icsb_ifree = count + resid; - break; - case XFS_SBS_FDBLOCKS: - cntp->icsb_fdblocks = count + resid; - break; - default: - BUG(); - break; - } - resid = 0; - } - clear_bit(field, &mp->m_icsb_counters); - xfs_icsb_unlock_all_counters(mp); -} - -void -xfs_icsb_sync_counters_locked( - xfs_mount_t *mp, - int flags) -{ - xfs_icsb_cnts_t cnt; - - xfs_icsb_count(mp, &cnt, flags); - - if (!xfs_icsb_counter_disabled(mp, XFS_SBS_ICOUNT)) - mp->m_sb.sb_icount = cnt.icsb_icount; - if (!xfs_icsb_counter_disabled(mp, XFS_SBS_IFREE)) - mp->m_sb.sb_ifree = cnt.icsb_ifree; - if (!xfs_icsb_counter_disabled(mp, XFS_SBS_FDBLOCKS)) - mp->m_sb.sb_fdblocks = cnt.icsb_fdblocks; -} - -/* - * Accurate update of per-cpu counters to incore superblock - */ -void -xfs_icsb_sync_counters( - xfs_mount_t *mp, - int flags) -{ - spin_lock(&mp->m_sb_lock); - xfs_icsb_sync_counters_locked(mp, flags); - spin_unlock(&mp->m_sb_lock); -} - -/* - * Balance and enable/disable counters as necessary. 
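The xfs_icsb_lock_cntr()/xfs_icsb_unlock_cntr() pair removed above builds a tiny spinlock out of test_and_set_bit(). Roughly the same construction in portable C11 atomics, as a sketch rather than the kernel code:

#include <stdatomic.h>
#include <stdio.h>

#define FLAG_LOCK 0UL

static atomic_ulong flags;

static void cntr_lock(void)
{
	/* spin until we are the one who flips the bit from 0 to 1 */
	while (atomic_fetch_or(&flags, 1UL << FLAG_LOCK) & (1UL << FLAG_LOCK))
		;                       /* the kernel version ndelay()s here */
}

static void cntr_unlock(void)
{
	atomic_fetch_and(&flags, ~(1UL << FLAG_LOCK));
}

int main(void)
{
	cntr_lock();
	puts("counter locked");
	cntr_unlock();
	puts("counter unlocked");
	return 0;
}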
- * - * Thresholds for re-enabling counters are somewhat magic. inode counts are - * chosen to be the same number as single on disk allocation chunk per CPU, and - * free blocks is something far enough zero that we aren't going thrash when we - * get near ENOSPC. We also need to supply a minimum we require per cpu to - * prevent looping endlessly when xfs_alloc_space asks for more than will - * be distributed to a single CPU but each CPU has enough blocks to be - * reenabled. - * - * Note that we can be called when counters are already disabled. - * xfs_icsb_disable_counter() optimises the counter locking in this case to - * prevent locking every per-cpu counter needlessly. - */ - -#define XFS_ICSB_INO_CNTR_REENABLE (uint64_t)64 -#define XFS_ICSB_FDBLK_CNTR_REENABLE(mp) \ - (uint64_t)(512 + XFS_ALLOC_SET_ASIDE(mp)) -STATIC void -xfs_icsb_balance_counter_locked( - xfs_mount_t *mp, - xfs_sb_field_t field, - int min_per_cpu) -{ - uint64_t count, resid; - int weight = num_online_cpus(); - uint64_t min = (uint64_t)min_per_cpu; - - /* disable counter and sync counter */ - xfs_icsb_disable_counter(mp, field); - - /* update counters - first CPU gets residual*/ - switch (field) { - case XFS_SBS_ICOUNT: - count = mp->m_sb.sb_icount; - resid = do_div(count, weight); - if (count < max(min, XFS_ICSB_INO_CNTR_REENABLE)) - return; - break; - case XFS_SBS_IFREE: - count = mp->m_sb.sb_ifree; - resid = do_div(count, weight); - if (count < max(min, XFS_ICSB_INO_CNTR_REENABLE)) - return; - break; - case XFS_SBS_FDBLOCKS: - count = mp->m_sb.sb_fdblocks; - resid = do_div(count, weight); - if (count < max(min, XFS_ICSB_FDBLK_CNTR_REENABLE(mp))) - return; - break; - default: - BUG(); - count = resid = 0; /* quiet, gcc */ - break; - } - - xfs_icsb_enable_counter(mp, field, count, resid); -} - -STATIC void -xfs_icsb_balance_counter( - xfs_mount_t *mp, - xfs_sb_field_t fields, - int min_per_cpu) -{ - spin_lock(&mp->m_sb_lock); - xfs_icsb_balance_counter_locked(mp, fields, min_per_cpu); - spin_unlock(&mp->m_sb_lock); -} - -int -xfs_icsb_modify_counters( - xfs_mount_t *mp, - xfs_sb_field_t field, - int64_t delta, - int rsvd) -{ - xfs_icsb_cnts_t *icsbp; - long long lcounter; /* long counter for 64 bit fields */ - int ret = 0; - - might_sleep(); -again: - preempt_disable(); - icsbp = this_cpu_ptr(mp->m_sb_cnts); - - /* - * if the counter is disabled, go to slow path - */ - if (unlikely(xfs_icsb_counter_disabled(mp, field))) - goto slow_path; - xfs_icsb_lock_cntr(icsbp); - if (unlikely(xfs_icsb_counter_disabled(mp, field))) { - xfs_icsb_unlock_cntr(icsbp); - goto slow_path; - } - - switch (field) { - case XFS_SBS_ICOUNT: - lcounter = icsbp->icsb_icount; - lcounter += delta; - if (unlikely(lcounter < 0)) - goto balance_counter; - icsbp->icsb_icount = lcounter; - break; - - case XFS_SBS_IFREE: - lcounter = icsbp->icsb_ifree; - lcounter += delta; - if (unlikely(lcounter < 0)) - goto balance_counter; - icsbp->icsb_ifree = lcounter; - break; - - case XFS_SBS_FDBLOCKS: - BUG_ON((mp->m_resblks - mp->m_resblks_avail) != 0); - - lcounter = icsbp->icsb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); - lcounter += delta; - if (unlikely(lcounter < 0)) - goto balance_counter; - icsbp->icsb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp); - break; - default: - BUG(); - break; - } - xfs_icsb_unlock_cntr(icsbp); - preempt_enable(); - return 0; - -slow_path: - preempt_enable(); - - /* - * serialise with a mutex so we don't burn lots of cpu on - * the superblock lock. 
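xfs_icsb_balance_counter_locked() above divides the global value across CPUs with do_div(), hands the remainder to the first CPU, and refuses to re-enable the counter if the per-CPU share would fall under the threshold. The arithmetic, reduced to a standalone sketch with illustrative numbers:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define NCPUS        4
#define REENABLE_MIN 64         /* like XFS_ICSB_INO_CNTR_REENABLE */

static uint64_t percpu[NCPUS];

static int balance(uint64_t total)
{
	uint64_t count = total / NCPUS;
	uint64_t resid = total % NCPUS; /* what do_div() hands back */

	if (count < REENABLE_MIN)
		return -1;              /* shares too small: stay disabled */

	for (int i = 0; i < NCPUS; i++) {
		percpu[i] = count + resid;      /* first CPU gets residual */
		resid = 0;
	}
	return 0;
}

int main(void)
{
	if (balance(1030) == 0)         /* 257 each, cpu0 gets 259 */
		for (int i = 0; i < NCPUS; i++)
			printf("cpu%d: %" PRIu64 "\n", i, percpu[i]);
	return 0;
}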
We still need to hold the superblock - * lock, however, when we modify the global structures. - */ - xfs_icsb_lock(mp); - - /* - * Now running atomically. - * - * If the counter is enabled, someone has beaten us to rebalancing. - * Drop the lock and try again in the fast path.... - */ - if (!(xfs_icsb_counter_disabled(mp, field))) { - xfs_icsb_unlock(mp); - goto again; - } - - /* - * The counter is currently disabled. Because we are - * running atomically here, we know a rebalance cannot - * be in progress. Hence we can go straight to operating - * on the global superblock. We do not call xfs_mod_incore_sb() - * here even though we need to get the m_sb_lock. Doing so - * will cause us to re-enter this function and deadlock. - * Hence we get the m_sb_lock ourselves and then call - * xfs_mod_incore_sb_unlocked() as the unlocked path operates - * directly on the global counters. - */ - spin_lock(&mp->m_sb_lock); - ret = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd); - spin_unlock(&mp->m_sb_lock); - - /* - * Now that we've modified the global superblock, we - * may be able to re-enable the distributed counters - * (e.g. lots of space just got freed). After that - * we are done. - */ - if (ret != -ENOSPC) - xfs_icsb_balance_counter(mp, field, 0); - xfs_icsb_unlock(mp); - return ret; - -balance_counter: - xfs_icsb_unlock_cntr(icsbp); - preempt_enable(); - - /* - * We may have multiple threads here if multiple per-cpu - * counters run dry at the same time. This will mean we can - * do more balances than strictly necessary but it is not - * the common slowpath case. - */ - xfs_icsb_lock(mp); - - /* - * running atomically. - * - * This will leave the counter in the correct state for future - * accesses. After the rebalance, we simply try again and our retry - * will either succeed through the fast path or slow path without - * another balance operation being required. - */ - xfs_icsb_balance_counter(mp, field, delta); - xfs_icsb_unlock(mp); - goto again; -} - -#endif diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 0d8abd6364d9..7999e91cd49a 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -18,8 +18,6 @@ #ifndef __XFS_MOUNT_H__ #define __XFS_MOUNT_H__ -#ifdef __KERNEL__ - struct xlog; struct xfs_inode; struct xfs_mru_cache; @@ -29,44 +27,6 @@ struct xfs_quotainfo; struct xfs_dir_ops; struct xfs_da_geometry; -#ifdef HAVE_PERCPU_SB - -/* - * Valid per-cpu incore superblock counters. Note that if you add new counters, - * you may need to define new counter disabled bit field descriptors as there - * are more possible fields in the superblock that can fit in a bitfield on a - * 32 bit platform. The XFS_SBS_* values for the current current counters just - * fit. 
- */ -typedef struct xfs_icsb_cnts { - uint64_t icsb_fdblocks; - uint64_t icsb_ifree; - uint64_t icsb_icount; - unsigned long icsb_flags; -} xfs_icsb_cnts_t; - -#define XFS_ICSB_FLAG_LOCK (1 << 0) /* counter lock bit */ - -#define XFS_ICSB_LAZY_COUNT (1 << 1) /* accuracy not needed */ - -extern int xfs_icsb_init_counters(struct xfs_mount *); -extern void xfs_icsb_reinit_counters(struct xfs_mount *); -extern void xfs_icsb_destroy_counters(struct xfs_mount *); -extern void xfs_icsb_sync_counters(struct xfs_mount *, int); -extern void xfs_icsb_sync_counters_locked(struct xfs_mount *, int); -extern int xfs_icsb_modify_counters(struct xfs_mount *, xfs_sb_field_t, - int64_t, int); - -#else -#define xfs_icsb_init_counters(mp) (0) -#define xfs_icsb_destroy_counters(mp) do { } while (0) -#define xfs_icsb_reinit_counters(mp) do { } while (0) -#define xfs_icsb_sync_counters(mp, flags) do { } while (0) -#define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0) -#define xfs_icsb_modify_counters(mp, field, delta, rsvd) \ - xfs_mod_incore_sb(mp, field, delta, rsvd) -#endif - /* dynamic preallocation free space thresholds, 5% down to 1% */ enum { XFS_LOWSP_1_PCNT = 0, @@ -81,8 +41,13 @@ typedef struct xfs_mount { struct super_block *m_super; xfs_tid_t m_tid; /* next unused tid for fs */ struct xfs_ail *m_ail; /* fs active log item list */ - xfs_sb_t m_sb; /* copy of fs superblock */ + + struct xfs_sb m_sb; /* copy of fs superblock */ spinlock_t m_sb_lock; /* sb counter lock */ + struct percpu_counter m_icount; /* allocated inodes counter */ + struct percpu_counter m_ifree; /* free inodes counter */ + struct percpu_counter m_fdblocks; /* free block counter */ + struct xfs_buf *m_sb_bp; /* buffer for superblock */ char *m_fsname; /* filesystem name */ int m_fsname_len; /* strlen of fs name */ @@ -136,6 +101,8 @@ typedef struct xfs_mount { __uint64_t m_flags; /* global mount flags */ int m_ialloc_inos; /* inodes in inode allocation */ int m_ialloc_blks; /* blocks in inode allocation */ + int m_ialloc_min_blks;/* min blocks in sparse inode + * allocation */ int m_inoalign_mask;/* mask sb_inoalignmt if used */ uint m_qflags; /* quota status flags */ struct xfs_trans_resv m_resv; /* precomputed res values */ @@ -152,12 +119,6 @@ typedef struct xfs_mount { const struct xfs_dir_ops *m_nondir_inode_ops; /* !dir inode ops */ uint m_chsize; /* size of next field */ atomic_t m_active_trans; /* number trans frozen */ -#ifdef HAVE_PERCPU_SB - xfs_icsb_cnts_t __percpu *m_sb_cnts; /* per-cpu superblock counters */ - unsigned long m_icsb_counters; /* disabled per-cpu counters */ - struct notifier_block m_icsb_notifier; /* hotplug cpu notifier */ - struct mutex m_icsb_mutex; /* balancer sync lock */ -#endif struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ struct delayed_work m_reclaim_work; /* background inode reclaim */ struct delayed_work m_eofblocks_work; /* background eof blocks @@ -220,6 +181,8 @@ typedef struct xfs_mount { allocator */ #define XFS_MOUNT_NOATTR2 (1ULL << 25) /* disable use of attr2 format */ +#define XFS_MOUNT_DAX (1ULL << 62) /* TEST ONLY! */ + /* * Default minimum read and write sizes. 
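The m_icount/m_ifree/m_fdblocks fields added above are generic percpu counters, and a batched read of one is only accurate to within nr_cpus * batch. Here is a userspace model of the two-step comparison that implies — a cheap check against the limit plus slack first, an exact sum only when the result is too close to call. The struct and names are hypothetical, not the kernel implementation.

#include <stdint.h>
#include <stdio.h>

#define NCPUS 4
#define BATCH 1024                      /* like XFS_FDBLOCKS_BATCH */

struct pcounter {
	int64_t global;
	int64_t local[NCPUS];           /* each bounded by +/-BATCH */
};

static int64_t exact_sum(const struct pcounter *c)
{
	int64_t sum = c->global;

	for (int i = 0; i < NCPUS; i++)
		sum += c->local[i];
	return sum;
}

/* returns <0, 0, >0 like __percpu_counter_compare() */
static int compare(const struct pcounter *c, int64_t rhs)
{
	int64_t slack = (int64_t)NCPUS * BATCH;
	int64_t sum;

	if (c->global > rhs + slack)
		return 1;               /* definitely above, no sum needed */
	if (c->global < rhs - slack)
		return -1;              /* definitely below */
	/* too close to call: pay for the accurate count */
	sum = exact_sum(c);
	return (sum > rhs) - (sum < rhs);
}

int main(void)
{
	struct pcounter c = { .global = 10000, .local = { 900, -200, 50, 0 } };

	printf("vs 1000: %d\n", compare(&c, 1000)); /* fast path: above */
	printf("vs 9500: %d\n", compare(&c, 9500)); /* needs the exact sum */
	return 0;
}

This is why xfs_mod_fdblocks() drops to a batch of 1 near ENOSPC: shrinking the batch shrinks the slack, trading speed for accuracy exactly when accuracy matters.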
@@ -301,35 +264,6 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) } /* - * Per-cpu superblock locking functions - */ -#ifdef HAVE_PERCPU_SB -static inline void -xfs_icsb_lock(xfs_mount_t *mp) -{ - mutex_lock(&mp->m_icsb_mutex); -} - -static inline void -xfs_icsb_unlock(xfs_mount_t *mp) -{ - mutex_unlock(&mp->m_icsb_mutex); -} -#else -#define xfs_icsb_lock(mp) -#define xfs_icsb_unlock(mp) -#endif - -/* - * This structure is for use by the xfs_mod_incore_sb_batch() routine. - * xfs_growfs can specify a few fields which are more than int limit - */ -typedef struct xfs_mod_sb { - xfs_sb_field_t msb_field; /* Field to modify, see below */ - int64_t msb_delta; /* Change to make to specified field */ -} xfs_mod_sb_t; - -/* * Per-ag incore structure, copies of information in agf and agi, to improve the * performance of allocation group selection. */ @@ -383,11 +317,14 @@ extern __uint64_t xfs_default_resblks(xfs_mount_t *mp); extern int xfs_mountfs(xfs_mount_t *mp); extern int xfs_initialize_perag(xfs_mount_t *mp, xfs_agnumber_t agcount, xfs_agnumber_t *maxagi); - extern void xfs_unmountfs(xfs_mount_t *); -extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int); -extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *, - uint, int); + +extern int xfs_mod_icount(struct xfs_mount *mp, int64_t delta); +extern int xfs_mod_ifree(struct xfs_mount *mp, int64_t delta); +extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta, + bool reserved); +extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta); + extern int xfs_mount_log_sb(xfs_mount_t *); extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int); extern int xfs_readsb(xfs_mount_t *, int); @@ -399,6 +336,4 @@ extern int xfs_dev_is_read_only(struct xfs_mount *, char *); extern void xfs_set_low_space_thresholds(struct xfs_mount *); -#endif /* __KERNEL__ */ - #endif /* __XFS_MOUNT_H__ */ diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index 30ecca3037e3..f8a674d7f092 100644 --- a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c @@ -437,7 +437,7 @@ xfs_mru_cache_insert( if (!mru || !mru->lists) return -EINVAL; - if (radix_tree_preload(GFP_KERNEL)) + if (radix_tree_preload(GFP_NOFS)) return -ENOMEM; INIT_LIST_HEAD(&elem->list_node); diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 365dd57ea760..ab4a6066f7ca 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -31,7 +31,8 @@ int xfs_break_layouts( struct inode *inode, - uint *iolock) + uint *iolock, + bool with_imutex) { struct xfs_inode *ip = XFS_I(inode); int error; @@ -40,8 +41,12 @@ xfs_break_layouts( while ((error = break_layout(inode, false) == -EWOULDBLOCK)) { xfs_iunlock(ip, *iolock); + if (with_imutex && (*iolock & XFS_IOLOCK_EXCL)) + mutex_unlock(&inode->i_mutex); error = break_layout(inode, true); *iolock = XFS_IOLOCK_EXCL; + if (with_imutex) + mutex_lock(&inode->i_mutex); xfs_ilock(ip, *iolock); } @@ -301,7 +306,7 @@ xfs_fs_commit_blocks( tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); goto out_drop_iolock; } @@ -316,7 +321,7 @@ xfs_fs_commit_blocks( } xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); out_drop_iolock: xfs_iunlock(ip, XFS_IOLOCK_EXCL); diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h index b7fbfce660f6..8147ac108820 100644 --- a/fs/xfs/xfs_pnfs.h +++ b/fs/xfs/xfs_pnfs.h @@ -8,9 +8,10 @@ int xfs_fs_map_blocks(struct inode *inode, 
loff_t offset, u64 length, int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps, struct iattr *iattr); -int xfs_break_layouts(struct inode *inode, uint *iolock); +int xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex); #else -static inline int xfs_break_layouts(struct inode *inode, uint *iolock) +static inline int +xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex) { return 0; } diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index fbbb9e62e274..eac9549efd52 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -719,6 +719,7 @@ xfs_qm_qino_alloc( xfs_trans_t *tp; int error; int committed; + bool need_alloc = true; *ip = NULL; /* @@ -747,6 +748,7 @@ xfs_qm_qino_alloc( return error; mp->m_sb.sb_gquotino = NULLFSINO; mp->m_sb.sb_pquotino = NULLFSINO; + need_alloc = false; } } @@ -754,16 +756,15 @@ xfs_qm_qino_alloc( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_create, XFS_QM_QINOCREATE_SPACE_RES(mp), 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } - if (!*ip) { + if (need_alloc) { error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, &committed); if (error) { - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | - XFS_TRANS_ABORT); + xfs_trans_cancel(tp); return error; } } @@ -794,11 +795,14 @@ xfs_qm_qino_alloc( spin_unlock(&mp->m_sb_lock); xfs_log_sb(tp); - if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) { + error = xfs_trans_commit(tp); + if (error) { + ASSERT(XFS_FORCED_SHUTDOWN(mp)); xfs_alert(mp, "%s failed (error %d)!", __func__, error); - return error; } - return 0; + if (need_alloc) + xfs_finish_inode_setup(*ip); + return error; } diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 0d4d3590cf85..996a04064894 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -168,10 +168,6 @@ extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t, uint, struct qc_dqblk *); extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint, struct qc_dqblk *); -extern int xfs_qm_scall_getqstat(struct xfs_mount *, - struct fs_quota_stat *); -extern int xfs_qm_scall_getqstatv(struct xfs_mount *, - struct fs_quota_statv *); extern int xfs_qm_scall_quotaon(struct xfs_mount *, uint); extern int xfs_qm_scall_quotaoff(struct xfs_mount *, uint); diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 9b965db45800..3640c6e896af 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -38,7 +38,6 @@ STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint); STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *, uint); -STATIC uint xfs_qm_export_flags(uint); /* * Turn off quota accounting and/or enforcement for all udquots and/or @@ -240,7 +239,7 @@ xfs_qm_scall_trunc_qfile( tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_IOLOCK_EXCL); goto out_put; } @@ -253,15 +252,14 @@ xfs_qm_scall_trunc_qfile( error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0); if (error) { - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | - XFS_TRANS_ABORT); + xfs_trans_cancel(tp); goto out_unlock; } ASSERT(ip->i_d.di_nextents == 0); xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); @@ -389,159 +387,6 @@ xfs_qm_scall_quotaon( 
return 0; } - -/* - * Return quota status information, such as uquota-off, enforcements, etc. - * for Q_XGETQSTAT command. - */ -int -xfs_qm_scall_getqstat( - struct xfs_mount *mp, - struct fs_quota_stat *out) -{ - struct xfs_quotainfo *q = mp->m_quotainfo; - struct xfs_inode *uip = NULL; - struct xfs_inode *gip = NULL; - struct xfs_inode *pip = NULL; - bool tempuqip = false; - bool tempgqip = false; - bool temppqip = false; - - memset(out, 0, sizeof(fs_quota_stat_t)); - - out->qs_version = FS_QSTAT_VERSION; - out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags & - (XFS_ALL_QUOTA_ACCT| - XFS_ALL_QUOTA_ENFD)); - uip = q->qi_uquotaip; - gip = q->qi_gquotaip; - pip = q->qi_pquotaip; - if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) { - if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, - 0, 0, &uip) == 0) - tempuqip = true; - } - if (!gip && mp->m_sb.sb_gquotino != NULLFSINO) { - if (xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, - 0, 0, &gip) == 0) - tempgqip = true; - } - /* - * Q_XGETQSTAT doesn't have room for both group and project quotas. - * So, allow the project quota values to be copied out only if - * there is no group quota information available. - */ - if (!gip) { - if (!pip && mp->m_sb.sb_pquotino != NULLFSINO) { - if (xfs_iget(mp, NULL, mp->m_sb.sb_pquotino, - 0, 0, &pip) == 0) - temppqip = true; - } - } else - pip = NULL; - if (uip) { - out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino; - out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks; - out->qs_uquota.qfs_nextents = uip->i_d.di_nextents; - if (tempuqip) - IRELE(uip); - } - - if (gip) { - out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino; - out->qs_gquota.qfs_nblks = gip->i_d.di_nblocks; - out->qs_gquota.qfs_nextents = gip->i_d.di_nextents; - if (tempgqip) - IRELE(gip); - } - if (pip) { - out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino; - out->qs_gquota.qfs_nblks = pip->i_d.di_nblocks; - out->qs_gquota.qfs_nextents = pip->i_d.di_nextents; - if (temppqip) - IRELE(pip); - } - out->qs_incoredqs = q->qi_dquots; - out->qs_btimelimit = q->qi_btimelimit; - out->qs_itimelimit = q->qi_itimelimit; - out->qs_rtbtimelimit = q->qi_rtbtimelimit; - out->qs_bwarnlimit = q->qi_bwarnlimit; - out->qs_iwarnlimit = q->qi_iwarnlimit; - - return 0; -} - -/* - * Return quota status information, such as uquota-off, enforcements, etc. - * for Q_XGETQSTATV command, to support separate project quota field. 
- */ -int -xfs_qm_scall_getqstatv( - struct xfs_mount *mp, - struct fs_quota_statv *out) -{ - struct xfs_quotainfo *q = mp->m_quotainfo; - struct xfs_inode *uip = NULL; - struct xfs_inode *gip = NULL; - struct xfs_inode *pip = NULL; - bool tempuqip = false; - bool tempgqip = false; - bool temppqip = false; - - out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags & - (XFS_ALL_QUOTA_ACCT| - XFS_ALL_QUOTA_ENFD)); - out->qs_uquota.qfs_ino = mp->m_sb.sb_uquotino; - out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino; - out->qs_pquota.qfs_ino = mp->m_sb.sb_pquotino; - - uip = q->qi_uquotaip; - gip = q->qi_gquotaip; - pip = q->qi_pquotaip; - if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) { - if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, - 0, 0, &uip) == 0) - tempuqip = true; - } - if (!gip && mp->m_sb.sb_gquotino != NULLFSINO) { - if (xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, - 0, 0, &gip) == 0) - tempgqip = true; - } - if (!pip && mp->m_sb.sb_pquotino != NULLFSINO) { - if (xfs_iget(mp, NULL, mp->m_sb.sb_pquotino, - 0, 0, &pip) == 0) - temppqip = true; - } - if (uip) { - out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks; - out->qs_uquota.qfs_nextents = uip->i_d.di_nextents; - if (tempuqip) - IRELE(uip); - } - - if (gip) { - out->qs_gquota.qfs_nblks = gip->i_d.di_nblocks; - out->qs_gquota.qfs_nextents = gip->i_d.di_nextents; - if (tempgqip) - IRELE(gip); - } - if (pip) { - out->qs_pquota.qfs_nblks = pip->i_d.di_nblocks; - out->qs_pquota.qfs_nextents = pip->i_d.di_nextents; - if (temppqip) - IRELE(pip); - } - out->qs_incoredqs = q->qi_dquots; - out->qs_btimelimit = q->qi_btimelimit; - out->qs_itimelimit = q->qi_itimelimit; - out->qs_rtbtimelimit = q->qi_rtbtimelimit; - out->qs_bwarnlimit = q->qi_bwarnlimit; - out->qs_iwarnlimit = q->qi_iwarnlimit; - - return 0; -} - #define XFS_QC_MASK \ (QC_LIMIT_MASK | QC_TIMER_MASK | QC_WARNS_MASK) @@ -591,7 +436,7 @@ xfs_qm_scall_setqlim( tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_setqlim, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); goto out_rele; } @@ -702,7 +547,7 @@ xfs_qm_scall_setqlim( dqp->dq_flags |= XFS_DQ_DIRTY; xfs_trans_log_dquot(tp, dqp); - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); out_rele: xfs_qm_dqrele(dqp); @@ -725,7 +570,7 @@ xfs_qm_log_quotaoff_end( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_equotaoff, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -739,8 +584,7 @@ xfs_qm_log_quotaoff_end( * We don't care about quotoff's performance. */ xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0); - return error; + return xfs_trans_commit(tp); } @@ -759,7 +603,7 @@ xfs_qm_log_quotaoff( tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_quotaoff, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); goto out; } @@ -778,7 +622,7 @@ xfs_qm_log_quotaoff( * We don't care about quotoff's performance. 
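Both Q_XGETQSTAT handlers deleted above are replaced by the generic ->get_state path, but the userspace ABI is unchanged: the same quotactl(2) call keeps working. A hedged example of that call, assuming the XFS quota definitions from linux/dqblk_xfs.h and a placeholder device path:

#include <stdio.h>
#include <string.h>
#include <sys/quota.h>
#include <linux/dqblk_xfs.h>

int main(void)
{
	struct fs_quota_stat qstat;

	memset(&qstat, 0, sizeof(qstat));
	/* "/dev/sdb1" is a placeholder for an XFS block device */
	if (quotactl(QCMD(Q_XGETQSTAT, USRQUOTA), "/dev/sdb1", 0,
		     (void *)&qstat) != 0) {
		perror("quotactl(Q_XGETQSTAT)");
		return 1;
	}
	printf("version %d, user quota ino %llu\n",
	       qstat.qs_version,
	       (unsigned long long)qstat.qs_uquota.qfs_ino);
	return 0;
}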
*/ xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); if (error) goto out; @@ -873,28 +717,6 @@ out_put: return error; } -STATIC uint -xfs_qm_export_flags( - uint flags) -{ - uint uflags; - - uflags = 0; - if (flags & XFS_UQUOTA_ACCT) - uflags |= FS_QUOTA_UDQ_ACCT; - if (flags & XFS_GQUOTA_ACCT) - uflags |= FS_QUOTA_GDQ_ACCT; - if (flags & XFS_PQUOTA_ACCT) - uflags |= FS_QUOTA_PDQ_ACCT; - if (flags & XFS_UQUOTA_ENFD) - uflags |= FS_QUOTA_UDQ_ENFD; - if (flags & XFS_GQUOTA_ENFD) - uflags |= FS_QUOTA_GDQ_ENFD; - if (flags & XFS_PQUOTA_ENFD) - uflags |= FS_QUOTA_PDQ_ENFD; - return uflags; -} - STATIC int xfs_dqrele_inode( diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index 5376dd406ba2..ce6506adab7b 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -55,7 +55,6 @@ struct xfs_trans; typedef struct xfs_dqtrx { struct xfs_dquot *qt_dquot; /* the dquot this refers to */ ulong qt_blk_res; /* blks reserved on a dquot */ - ulong qt_blk_res_used; /* blks used from the reservation */ ulong qt_ino_res; /* inode reserved on a dquot */ ulong qt_ino_res_used; /* inodes used from the reservation */ long qt_bcount_delta; /* dquot blk count changes */ diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index 6923905ab33d..7795e0d01382 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -23,10 +23,81 @@ #include "xfs_inode.h" #include "xfs_quota.h" #include "xfs_trans.h" +#include "xfs_trace.h" +#include "xfs_icache.h" #include "xfs_qm.h" #include <linux/quota.h> +static void +xfs_qm_fill_state( + struct qc_type_state *tstate, + struct xfs_mount *mp, + struct xfs_inode *ip, + xfs_ino_t ino) +{ + struct xfs_quotainfo *q = mp->m_quotainfo; + bool tempqip = false; + + tstate->ino = ino; + if (!ip && ino == NULLFSINO) + return; + if (!ip) { + if (xfs_iget(mp, NULL, ino, 0, 0, &ip)) + return; + tempqip = true; + } + tstate->flags |= QCI_SYSFILE; + tstate->blocks = ip->i_d.di_nblocks; + tstate->nextents = ip->i_d.di_nextents; + tstate->spc_timelimit = q->qi_btimelimit; + tstate->ino_timelimit = q->qi_itimelimit; + tstate->rt_spc_timelimit = q->qi_rtbtimelimit; + tstate->spc_warnlimit = q->qi_bwarnlimit; + tstate->ino_warnlimit = q->qi_iwarnlimit; + tstate->rt_spc_warnlimit = q->qi_rtbwarnlimit; + if (tempqip) + IRELE(ip); +} + +/* + * Return quota status information, such as enforcements, quota file inode + * numbers etc. 
+ */ +static int +xfs_fs_get_quota_state( + struct super_block *sb, + struct qc_state *state) +{ + struct xfs_mount *mp = XFS_M(sb); + struct xfs_quotainfo *q = mp->m_quotainfo; + + memset(state, 0, sizeof(*state)); + if (!XFS_IS_QUOTA_RUNNING(mp)) + return 0; + state->s_incoredqs = q->qi_dquots; + if (XFS_IS_UQUOTA_RUNNING(mp)) + state->s_state[USRQUOTA].flags |= QCI_ACCT_ENABLED; + if (XFS_IS_UQUOTA_ENFORCED(mp)) + state->s_state[USRQUOTA].flags |= QCI_LIMITS_ENFORCED; + if (XFS_IS_GQUOTA_RUNNING(mp)) + state->s_state[GRPQUOTA].flags |= QCI_ACCT_ENABLED; + if (XFS_IS_GQUOTA_ENFORCED(mp)) + state->s_state[GRPQUOTA].flags |= QCI_LIMITS_ENFORCED; + if (XFS_IS_PQUOTA_RUNNING(mp)) + state->s_state[PRJQUOTA].flags |= QCI_ACCT_ENABLED; + if (XFS_IS_PQUOTA_ENFORCED(mp)) + state->s_state[PRJQUOTA].flags |= QCI_LIMITS_ENFORCED; + + xfs_qm_fill_state(&state->s_state[USRQUOTA], mp, q->qi_uquotaip, + mp->m_sb.sb_uquotino); + xfs_qm_fill_state(&state->s_state[GRPQUOTA], mp, q->qi_gquotaip, + mp->m_sb.sb_gquotino); + xfs_qm_fill_state(&state->s_state[PRJQUOTA], mp, q->qi_pquotaip, + mp->m_sb.sb_pquotino); + return 0; +} + STATIC int xfs_quota_type(int type) { @@ -40,28 +111,40 @@ xfs_quota_type(int type) } } -STATIC int -xfs_fs_get_xstate( +#define XFS_QC_SETINFO_MASK (QC_TIMER_MASK | QC_WARNS_MASK) + +/* + * Adjust quota timers & warnings + */ +static int +xfs_fs_set_info( struct super_block *sb, - struct fs_quota_stat *fqs) + int type, + struct qc_info *info) { - struct xfs_mount *mp = XFS_M(sb); + struct xfs_mount *mp = XFS_M(sb); + struct qc_dqblk newlim; + if (sb->s_flags & MS_RDONLY) + return -EROFS; if (!XFS_IS_QUOTA_RUNNING(mp)) return -ENOSYS; - return xfs_qm_scall_getqstat(mp, fqs); -} + if (!XFS_IS_QUOTA_ON(mp)) + return -ESRCH; + if (info->i_fieldmask & ~XFS_QC_SETINFO_MASK) + return -EINVAL; + if ((info->i_fieldmask & XFS_QC_SETINFO_MASK) == 0) + return 0; -STATIC int -xfs_fs_get_xstatev( - struct super_block *sb, - struct fs_quota_statv *fqs) -{ - struct xfs_mount *mp = XFS_M(sb); + newlim.d_fieldmask = info->i_fieldmask; + newlim.d_spc_timer = info->i_spc_timelimit; + newlim.d_ino_timer = info->i_ino_timelimit; + newlim.d_rt_spc_timer = info->i_rt_spc_timelimit; + newlim.d_ino_warns = info->i_ino_warnlimit; + newlim.d_spc_warns = info->i_spc_warnlimit; + newlim.d_rt_spc_warns = info->i_rt_spc_warnlimit; - if (!XFS_IS_QUOTA_RUNNING(mp)) - return -ENOSYS; - return xfs_qm_scall_getqstatv(mp, fqs); + return xfs_qm_scall_setqlim(mp, 0, xfs_quota_type(type), &newlim); } static unsigned int @@ -178,8 +261,8 @@ xfs_fs_set_dqblk( } const struct quotactl_ops xfs_quotactl_operations = { - .get_xstatev = xfs_fs_get_xstatev, - .get_xstate = xfs_fs_get_xstate, + .get_state = xfs_fs_get_quota_state, + .set_info = xfs_fs_set_info, .quota_enable = xfs_quota_enable, .quota_disable = xfs_quota_disable, .rm_xquota = xfs_fs_rm_xquota, diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index f2079b6911cc..f4e8c06eee26 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -780,7 +780,6 @@ xfs_growfs_rt_alloc( * Allocate space to the file, as necessary. */ while (oblocks < nblocks) { - int cancelflags = 0; xfs_trans_t *tp; tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC); @@ -792,7 +791,6 @@ xfs_growfs_rt_alloc( resblks, 0); if (error) goto error_cancel; - cancelflags = XFS_TRANS_RELEASE_LOG_RES; /* * Lock the inode. */ @@ -804,7 +802,6 @@ xfs_growfs_rt_alloc( * Allocate blocks to the bitmap file. 
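xfs_fs_set_info() above guards its field mask twice: reject bits outside XFS_QC_SETINFO_MASK, then short-circuit when none of the supported bits are set. The same guard pattern in isolation, with placeholder flag values and -22 standing in for -EINVAL:

#include <stdio.h>

#define QC_TIMER_MASK 0x1
#define QC_WARNS_MASK 0x2
#define SETINFO_MASK  (QC_TIMER_MASK | QC_WARNS_MASK)

static int set_info(unsigned int fieldmask)
{
	if (fieldmask & ~SETINFO_MASK)
		return -22;     /* caller asked for bits we do not support */
	if ((fieldmask & SETINFO_MASK) == 0)
		return 0;       /* nothing relevant to us: succeed silently */
	/* ...translate timers/warning limits into a qc_dqblk here... */
	return 0;
}

int main(void)
{
	/* prints: -22 0 0 */
	printf("%d %d %d\n", set_info(0x8), set_info(0x0), set_info(0x3));
	return 0;
}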
*/ nmap = 1; - cancelflags |= XFS_TRANS_ABORT; error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks, XFS_BMAPI_METADATA, &firstblock, resblks, &map, &nmap, &flist); @@ -818,14 +815,13 @@ xfs_growfs_rt_alloc( error = xfs_bmap_finish(&tp, &flist, &committed); if (error) goto error_cancel; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) goto error; /* * Now we need to clear the allocated blocks. * Do this one block per transaction, to keep it simple. */ - cancelflags = 0; for (bno = map.br_startoff, fsbno = map.br_startblock; bno < map.br_startoff + map.br_blockcount; bno++, fsbno++) { @@ -851,7 +847,7 @@ xfs_growfs_rt_alloc( if (bp == NULL) { error = -EIO; error_cancel: - xfs_trans_cancel(tp, cancelflags); + xfs_trans_cancel(tp); goto error; } memset(bp->b_addr, 0, mp->m_sb.sb_blocksize); @@ -859,7 +855,7 @@ error_cancel: /* * Commit the transaction. */ - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); if (error) goto error; } @@ -973,7 +969,6 @@ xfs_growfs_rt( bmbno < nrbmblocks; bmbno++) { xfs_trans_t *tp; - int cancelflags = 0; *nmp = *mp; nsbp = &nmp->m_sb; @@ -1015,7 +1010,6 @@ xfs_growfs_rt( mp->m_rbmip->i_d.di_size = nsbp->sb_rbmblocks * nsbp->sb_blocksize; xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); - cancelflags |= XFS_TRANS_ABORT; /* * Get the summary inode into the transaction. */ @@ -1062,7 +1056,7 @@ xfs_growfs_rt( nsbp->sb_rextents - sbp->sb_rextents, &bp, &sumbno); if (error) { error_cancel: - xfs_trans_cancel(tp, cancelflags); + xfs_trans_cancel(tp); break; } /* @@ -1076,7 +1070,7 @@ error_cancel: mp->m_rsumlevels = nrsumlevels; mp->m_rsumsize = nrsumsize; - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); if (error) break; } diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 8fcc4ccc5c79..1fb16562c159 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -109,11 +109,11 @@ static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */ #define MNTOPT_GQUOTANOENF "gqnoenforce"/* group quota limit enforcement */ #define MNTOPT_PQUOTANOENF "pqnoenforce"/* project quota limit enforcement */ #define MNTOPT_QUOTANOENF "qnoenforce" /* same as uqnoenforce */ -#define MNTOPT_DELAYLOG "delaylog" /* Delayed logging enabled */ -#define MNTOPT_NODELAYLOG "nodelaylog" /* Delayed logging disabled */ #define MNTOPT_DISCARD "discard" /* Discard unused blocks */ #define MNTOPT_NODISCARD "nodiscard" /* Do not discard unused blocks */ +#define MNTOPT_DAX "dax" /* Enable direct access to bdev pages */ + /* * Table driven mount option parser. 
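The new MNTOPT_DAX token defined below arrives through the mount data string, so from userspace it is just another -o option. A sketch of the equivalent mount(2) call; the device, mountpoint, and the presence of a DAX-capable block device are all assumptions:

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* equivalent of: mount -o dax /dev/pmem0 /mnt/test (needs root) */
	if (mount("/dev/pmem0", "/mnt/test", "xfs", 0, "dax") != 0) {
		perror("mount");
		return 1;
	}
	puts("mounted with -o dax");
	return 0;
}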
* @@ -361,28 +361,14 @@ xfs_parseargs( } else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) { mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE); mp->m_qflags &= ~XFS_GQUOTA_ENFD; - } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) { - xfs_warn(mp, - "delaylog is the default now, option is deprecated."); - } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { - xfs_warn(mp, - "nodelaylog support has been removed, option is deprecated."); } else if (!strcmp(this_char, MNTOPT_DISCARD)) { mp->m_flags |= XFS_MOUNT_DISCARD; } else if (!strcmp(this_char, MNTOPT_NODISCARD)) { mp->m_flags &= ~XFS_MOUNT_DISCARD; - } else if (!strcmp(this_char, "ihashsize")) { - xfs_warn(mp, - "ihashsize no longer used, option is deprecated."); - } else if (!strcmp(this_char, "osyncisdsync")) { - xfs_warn(mp, - "osyncisdsync has no effect, option is deprecated."); - } else if (!strcmp(this_char, "osyncisosync")) { - xfs_warn(mp, - "osyncisosync has no effect, option is deprecated."); - } else if (!strcmp(this_char, "irixsgid")) { - xfs_warn(mp, - "irixsgid is now a sysctl(2) variable, option is deprecated."); +#ifdef CONFIG_FS_DAX + } else if (!strcmp(this_char, MNTOPT_DAX)) { + mp->m_flags |= XFS_MOUNT_DAX; +#endif } else { xfs_warn(mp, "unknown mount option [%s].", this_char); return -EINVAL; @@ -472,8 +458,8 @@ done: } struct proc_xfs_info { - int flag; - char *str; + uint64_t flag; + char *str; }; STATIC int @@ -494,6 +480,7 @@ xfs_showargs( { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, { XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD }, { XFS_MOUNT_SMALL_INUMS, "," MNTOPT_32BITINODE }, + { XFS_MOUNT_DAX, "," MNTOPT_DAX }, { 0, NULL } }; static struct proc_xfs_info xfs_info_unset[] = { @@ -986,6 +973,8 @@ xfs_fs_inode_init_once( atomic_set(&ip->i_pincount, 0); spin_lock_init(&ip->i_flags_lock); + mrlock_init(&ip->i_mmaplock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, + "xfsino", ip->i_ino); mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER, "xfsino", ip->i_ino); } @@ -1033,23 +1022,6 @@ xfs_free_fsname( kfree(mp->m_logname); } -STATIC void -xfs_fs_put_super( - struct super_block *sb) -{ - struct xfs_mount *mp = XFS_M(sb); - - xfs_filestream_unmount(mp); - xfs_unmountfs(mp); - - xfs_freesb(mp); - xfs_icsb_destroy_counters(mp); - xfs_destroy_mount_workqueues(mp); - xfs_close_devices(mp); - xfs_free_fsname(mp); - kfree(mp); -} - STATIC int xfs_fs_sync_fs( struct super_block *sb, @@ -1083,8 +1055,11 @@ xfs_fs_statfs( { struct xfs_mount *mp = XFS_M(dentry->d_sb); xfs_sb_t *sbp = &mp->m_sb; - struct xfs_inode *ip = XFS_I(dentry->d_inode); + struct xfs_inode *ip = XFS_I(d_inode(dentry)); __uint64_t fakeinos, id; + __uint64_t icount; + __uint64_t ifree; + __uint64_t fdblocks; xfs_extlen_t lsize; __int64_t ffree; @@ -1095,17 +1070,21 @@ xfs_fs_statfs( statp->f_fsid.val[0] = (u32)id; statp->f_fsid.val[1] = (u32)(id >> 32); - xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); + icount = percpu_counter_sum(&mp->m_icount); + ifree = percpu_counter_sum(&mp->m_ifree); + fdblocks = percpu_counter_sum(&mp->m_fdblocks); spin_lock(&mp->m_sb_lock); statp->f_bsize = sbp->sb_blocksize; lsize = sbp->sb_logstart ? 
sbp->sb_logblocks : 0; statp->f_blocks = sbp->sb_dblocks - lsize; - statp->f_bfree = statp->f_bavail = - sbp->sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); + spin_unlock(&mp->m_sb_lock); + + statp->f_bfree = fdblocks - XFS_ALLOC_SET_ASIDE(mp); + statp->f_bavail = statp->f_bfree; + fakeinos = statp->f_bfree << sbp->sb_inopblog; - statp->f_files = - MIN(sbp->sb_icount + fakeinos, (__uint64_t)XFS_MAXINUMBER); + statp->f_files = MIN(icount + fakeinos, (__uint64_t)XFS_MAXINUMBER); if (mp->m_maxicount) statp->f_files = min_t(typeof(statp->f_files), statp->f_files, @@ -1117,10 +1096,9 @@ xfs_fs_statfs( sbp->sb_icount); /* make sure statp->f_ffree does not underflow */ - ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree); + ffree = statp->f_files - (icount - ifree); statp->f_ffree = max_t(__int64_t, ffree, 0); - spin_unlock(&mp->m_sb_lock); if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && ((mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_PQUOTA_ENFD))) == @@ -1256,6 +1234,12 @@ xfs_fs_remount( /* ro -> rw */ if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) { + if (mp->m_flags & XFS_MOUNT_NORECOVERY) { + xfs_warn(mp, + "ro->rw transition prohibited on norecovery mount"); + return -EINVAL; + } + mp->m_flags &= ~XFS_MOUNT_RDONLY; /* @@ -1401,6 +1385,51 @@ xfs_finish_flags( return 0; } +static int +xfs_init_percpu_counters( + struct xfs_mount *mp) +{ + int error; + + error = percpu_counter_init(&mp->m_icount, 0, GFP_KERNEL); + if (error) + return -ENOMEM; + + error = percpu_counter_init(&mp->m_ifree, 0, GFP_KERNEL); + if (error) + goto free_icount; + + error = percpu_counter_init(&mp->m_fdblocks, 0, GFP_KERNEL); + if (error) + goto free_ifree; + + return 0; + +free_ifree: + percpu_counter_destroy(&mp->m_ifree); +free_icount: + percpu_counter_destroy(&mp->m_icount); + return -ENOMEM; +} + +void +xfs_reinit_percpu_counters( + struct xfs_mount *mp) +{ + percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount); + percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree); + percpu_counter_set(&mp->m_fdblocks, mp->m_sb.sb_fdblocks); +} + +static void +xfs_destroy_percpu_counters( + struct xfs_mount *mp) +{ + percpu_counter_destroy(&mp->m_icount); + percpu_counter_destroy(&mp->m_ifree); + percpu_counter_destroy(&mp->m_fdblocks); +} + STATIC int xfs_fs_fill_super( struct super_block *sb, @@ -1449,7 +1478,7 @@ xfs_fs_fill_super( if (error) goto out_close_devices; - error = xfs_icsb_init_counters(mp); + error = xfs_init_percpu_counters(mp); if (error) goto out_destroy_workqueues; @@ -1485,6 +1514,20 @@ xfs_fs_fill_super( if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5) sb->s_flags |= MS_I_VERSION; + if (mp->m_flags & XFS_MOUNT_DAX) { + xfs_warn(mp, + "DAX enabled. 
Warning: EXPERIMENTAL, use at your own risk"); + if (sb->s_blocksize != PAGE_SIZE) { + xfs_alert(mp, + "Filesystem block size invalid for DAX Turning DAX off."); + mp->m_flags &= ~XFS_MOUNT_DAX; + } else if (!sb->s_bdev->bd_disk->fops->direct_access) { + xfs_alert(mp, + "Block device does not support DAX Turning DAX off."); + mp->m_flags &= ~XFS_MOUNT_DAX; + } + } + error = xfs_mountfs(mp); if (error) goto out_filestream_unmount; @@ -1507,7 +1550,7 @@ xfs_fs_fill_super( out_free_sb: xfs_freesb(mp); out_destroy_counters: - xfs_icsb_destroy_counters(mp); + xfs_destroy_percpu_counters(mp); out_destroy_workqueues: xfs_destroy_mount_workqueues(mp); out_close_devices: @@ -1524,6 +1567,24 @@ out_destroy_workqueues: goto out_free_sb; } +STATIC void +xfs_fs_put_super( + struct super_block *sb) +{ + struct xfs_mount *mp = XFS_M(sb); + + xfs_notice(mp, "Unmounting Filesystem"); + xfs_filestream_unmount(mp); + xfs_unmountfs(mp); + + xfs_freesb(mp); + xfs_destroy_percpu_counters(mp); + xfs_destroy_mount_workqueues(mp); + xfs_close_devices(mp); + xfs_free_fsname(mp); + kfree(mp); +} + STATIC struct dentry * xfs_fs_mount( struct file_system_type *fs_type, diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index 2b830c2f322e..499058fea303 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -72,6 +72,8 @@ extern const struct export_operations xfs_export_operations; extern const struct xattr_handler *xfs_xattr_handlers[]; extern const struct quotactl_ops xfs_quotactl_operations; +extern void xfs_reinit_percpu_counters(struct xfs_mount *mp); + #define XFS_M(sb) ((struct xfs_mount *)((sb)->s_fs_info)) #endif /* __XFS_SUPER_H__ */ diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 25791df6f638..4be27b0210af 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -104,7 +104,7 @@ xfs_readlink_bmap( cur_chunk += sizeof(struct xfs_dsymlink_hdr); } - memcpy(link + offset, bp->b_addr, byte_cnt); + memcpy(link + offset, cur_chunk, byte_cnt); pathlen -= byte_cnt; offset += byte_cnt; @@ -177,8 +177,7 @@ xfs_symlink( int pathlen; struct xfs_bmap_free free_list; xfs_fsblock_t first_block; - bool unlock_dp_on_error = false; - uint cancel_flags; + bool unlock_dp_on_error = false; int committed; xfs_fileoff_t first_fsb; xfs_filblks_t fs_blocks; @@ -221,10 +220,9 @@ xfs_symlink( XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT, &udqp, &gdqp, &pdqp); if (error) - goto std_return; + return error; tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK); - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; /* * The symlink will fit into the inode data fork? * There can't be any attributes so we get the whole variable part. @@ -239,10 +237,8 @@ xfs_symlink( resblks = 0; error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, 0, 0); } - if (error) { - cancel_flags = 0; - goto error_return; - } + if (error) + goto out_trans_cancel; xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); unlock_dp_on_error = true; @@ -252,7 +248,7 @@ xfs_symlink( */ if (dp->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) { error = -EPERM; - goto error_return; + goto out_trans_cancel; } /* @@ -261,7 +257,7 @@ xfs_symlink( error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, pdqp, resblks, 1, 0); if (error) - goto error_return; + goto out_trans_cancel; /* * Check for ability to enter directory entry, if no space reserved. 
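The xfs_symlink() rework in progress here replaces the old cancel_flags bookkeeping with the kernel's standard goto-unwind error style: one label per acquired resource, released in reverse order, with success falling through the tail of the unwind. A standalone illustration of that idiom:

#include <stdio.h>
#include <stdlib.h>

static int do_op(void)
{
	int err = -1;
	char *a, *b = NULL;
	FILE *f;

	a = malloc(64);
	if (!a)
		goto out;
	b = malloc(64);
	if (!b)
		goto out_free_a;
	f = fopen("/tmp/demo", "w");
	if (!f)
		goto out_free_b;

	fputs("ok\n", f);
	err = 0;                /* success exits through the unwind below */

	fclose(f);
out_free_b:
	free(b);
out_free_a:
	free(a);
out:
	return err;
}

int main(void)
{
	return do_op() ? 1 : 0;
}

Each failure site jumps past the resources it never acquired, which is exactly what the out_bmap_cancel/out_trans_cancel/out_release_inode labels do in the patched function.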
@@ -269,7 +265,7 @@ xfs_symlink( if (!resblks) { error = xfs_dir_canenter(tp, dp, link_name); if (error) - goto error_return; + goto out_trans_cancel; } /* * Initialize the bmap freelist prior to calling either @@ -282,15 +278,14 @@ xfs_symlink( */ error = xfs_dir_ialloc(&tp, dp, S_IFLNK | (mode & ~S_IFMT), 1, 0, prid, resblks > 0, &ip, NULL); - if (error) { - if (error == -ENOSPC) - goto error_return; - goto error1; - } + if (error) + goto out_trans_cancel; /* - * An error after we've joined dp to the transaction will result in the - * transaction cancel unlocking dp so don't do it explicitly in the + * Now we join the directory inode to the transaction. We do not do it + * earlier because xfs_dir_ialloc might commit the previous transaction + * (and release all the locks). An error from here on will result in + * the transaction cancel unlocking dp so don't do it explicitly in the * error path. */ xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); @@ -330,7 +325,7 @@ xfs_symlink( XFS_BMAPI_METADATA, &first_block, resblks, mval, &nmaps, &free_list); if (error) - goto error2; + goto out_bmap_cancel; if (resblks) resblks -= fs_blocks; @@ -348,7 +343,7 @@ xfs_symlink( BTOBB(byte_cnt), 0); if (!bp) { error = -ENOMEM; - goto error2; + goto out_bmap_cancel; } bp->b_ops = &xfs_symlink_buf_ops; @@ -378,7 +373,7 @@ xfs_symlink( error = xfs_dir_createname(tp, dp, link_name, ip->i_ino, &first_block, &free_list, resblks); if (error) - goto error2; + goto out_bmap_cancel; xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); @@ -392,10 +387,13 @@ xfs_symlink( } error = xfs_bmap_finish(&tp, &free_list, &committed); - if (error) { - goto error2; - } - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + if (error) + goto out_bmap_cancel; + + error = xfs_trans_commit(tp); + if (error) + goto out_release_inode; + xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); xfs_qm_dqrele(pdqp); @@ -403,20 +401,27 @@ xfs_symlink( *ipp = ip; return 0; - error2: - IRELE(ip); - error1: +out_bmap_cancel: xfs_bmap_cancel(&free_list); - cancel_flags |= XFS_TRANS_ABORT; - error_return: - xfs_trans_cancel(tp, cancel_flags); +out_trans_cancel: + xfs_trans_cancel(tp); +out_release_inode: + /* + * Wait until after the current transaction is aborted to finish the + * setup of the inode and release the inode. This prevents recursive + * transactions and deadlocks from xfs_inactive. + */ + if (ip) { + xfs_finish_inode_setup(ip); + IRELE(ip); + } + xfs_qm_dqrele(udqp); xfs_qm_dqrele(gdqp); xfs_qm_dqrele(pdqp); if (unlock_dp_on_error) xfs_iunlock(dp, XFS_ILOCK_EXCL); - std_return: return error; } @@ -454,7 +459,7 @@ xfs_inactive_symlink_rmt( tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -523,7 +528,7 @@ xfs_inactive_symlink_rmt( /* * Commit the transaction containing extent freeing and EFDs. 
*/ - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) { ASSERT(XFS_FORCED_SHUTDOWN(mp)); goto error_unlock; @@ -542,7 +547,7 @@ xfs_inactive_symlink_rmt( error_bmap_cancel: xfs_bmap_cancel(&free_list); error_trans_cancel: - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_trans_cancel(tp); error_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 51372e34d988..8d916d33d93d 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -115,7 +115,7 @@ DECLARE_EVENT_CLASS(xfs_perag_class, __entry->refcount = refcount; __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d agno %u refcount %d caller %pf", + TP_printk("dev %d:%d agno %u refcount %d caller %ps", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->refcount, @@ -239,7 +239,7 @@ TRACE_EVENT(xfs_iext_insert, __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " - "offset %lld block %lld count %lld flag %d caller %pf", + "offset %lld block %lld count %lld flag %d caller %ps", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), @@ -283,7 +283,7 @@ DECLARE_EVENT_CLASS(xfs_bmap_class, __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " - "offset %lld block %lld count %lld flag %d caller %pf", + "offset %lld block %lld count %lld flag %d caller %ps", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), @@ -329,7 +329,7 @@ DECLARE_EVENT_CLASS(xfs_buf_class, __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d bno 0x%llx nblks 0x%x hold %d pincount %d " - "lock %d flags %s caller %pf", + "lock %d flags %s caller %ps", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->bno, __entry->nblks, @@ -402,7 +402,7 @@ DECLARE_EVENT_CLASS(xfs_buf_flags_class, __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " - "lock %d flags %s caller %pf", + "lock %d flags %s caller %ps", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->bno, __entry->buffer_length, @@ -447,7 +447,7 @@ TRACE_EVENT(xfs_buf_ioerror, __entry->caller_ip = caller_ip; ), TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " - "lock %d error %d flags %s caller %pf", + "lock %d error %d flags %s caller %ps", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->bno, __entry->buffer_length, @@ -613,7 +613,7 @@ DECLARE_EVENT_CLASS(xfs_lock_class, __entry->lock_flags = lock_flags; __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d ino 0x%llx flags %s caller %pf", + TP_printk("dev %d:%d ino 0x%llx flags %s caller %ps", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __print_flags(__entry->lock_flags, "|", XFS_LOCK_FLAGS), @@ -664,6 +664,7 @@ DEFINE_INODE_EVENT(xfs_alloc_file_space); DEFINE_INODE_EVENT(xfs_free_file_space); DEFINE_INODE_EVENT(xfs_zero_file_space); DEFINE_INODE_EVENT(xfs_collapse_file_space); +DEFINE_INODE_EVENT(xfs_insert_file_space); DEFINE_INODE_EVENT(xfs_readdir); #ifdef CONFIG_XFS_POSIX_ACL DEFINE_INODE_EVENT(xfs_get_acl); @@ -685,6 +686,9 @@ DEFINE_INODE_EVENT(xfs_inode_set_eofblocks_tag); DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag); DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid); +DEFINE_INODE_EVENT(xfs_filemap_fault); +DEFINE_INODE_EVENT(xfs_filemap_page_mkwrite); + 
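The trace format changes above replace "%pf" with "%ps": both resolve a code address to a symbol name, and %ps is the correct form for a plain function pointer. A rough userspace analogue using dladdr(3) — glibc-specific; build with -rdynamic (and -ldl on older glibc) so the symbol is visible:

#define _GNU_SOURCE
#include <dlfcn.h>
#include <stdio.h>

void probe_target(void)
{
}

int main(void)
{
	Dl_info info;

	if (dladdr((void *)probe_target, &info) && info.dli_sname)
		printf("caller %s\n", info.dli_sname);  /* like %ps */
	else
		printf("caller %p\n", (void *)probe_target);
	return 0;
}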
DECLARE_EVENT_CLASS(xfs_iref_class, TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), TP_ARGS(ip, caller_ip), @@ -702,7 +706,7 @@ DECLARE_EVENT_CLASS(xfs_iref_class, __entry->pincount = atomic_read(&ip->i_pincount); __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %pf", + TP_printk("dev %d:%d ino 0x%llx count %d pincount %d caller %ps", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->count, @@ -734,6 +738,53 @@ TRACE_EVENT(xfs_iomap_prealloc_size, __entry->blocks, __entry->shift, __entry->writeio_blocks) ) +TRACE_EVENT(xfs_irec_merge_pre, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino, + uint16_t holemask, xfs_agino_t nagino, uint16_t nholemask), + TP_ARGS(mp, agno, agino, holemask, nagino, nholemask), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + __field(uint16_t, holemask) + __field(xfs_agino_t, nagino) + __field(uint16_t, nholemask) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agino = agino; + __entry->holemask = holemask; + __entry->nagino = nagino; + __entry->nholemask = nholemask; + ), + TP_printk("dev %d:%d agno %d inobt (%u:0x%x) new (%u:0x%x)", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, + __entry->agino, __entry->holemask, __entry->nagino, + __entry->nholemask) +) + +TRACE_EVENT(xfs_irec_merge_post, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino, + uint16_t holemask), + TP_ARGS(mp, agno, agino, holemask), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + __field(uint16_t, holemask) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agino = agino; + __entry->holemask = holemask; + ), + TP_printk("dev %d:%d agno %d inobt (%u:0x%x)", MAJOR(__entry->dev), + MINOR(__entry->dev), __entry->agno, __entry->agino, + __entry->holemask) +) + #define DEFINE_IREF_EVENT(name) \ DEFINE_EVENT(xfs_iref_class, name, \ TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \ @@ -1217,6 +1268,11 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_found); DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc); DEFINE_IOMAP_EVENT(xfs_get_blocks_found); DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc); +DEFINE_IOMAP_EVENT(xfs_gbmap_direct); +DEFINE_IOMAP_EVENT(xfs_gbmap_direct_new); +DEFINE_IOMAP_EVENT(xfs_gbmap_direct_update); +DEFINE_IOMAP_EVENT(xfs_gbmap_direct_none); +DEFINE_IOMAP_EVENT(xfs_gbmap_direct_endio); DECLARE_EVENT_CLASS(xfs_simple_io_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), @@ -1333,7 +1389,7 @@ TRACE_EVENT(xfs_bunmap, __entry->flags = flags; ), TP_printk("dev %d:%d ino 0x%llx size 0x%llx bno 0x%llx len 0x%llx" - "flags %s caller %pf", + "flags %s caller %ps", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, @@ -1466,7 +1522,7 @@ TRACE_EVENT(xfs_agf, ), TP_printk("dev %d:%d agno %u flags %s length %u roots b %u c %u " "levels b %u c %u flfirst %u fllast %u flcount %u " - "freeblks %u longest %u caller %pf", + "freeblks %u longest %u caller %ps", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __print_flags(__entry->flags, "|", XFS_AGF_FLAGS),
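
TRACE_EVENT(xfs_irec_merge_pre) and TRACE_EVENT(xfs_irec_merge_post) above each generate a callable trace_<name>() hook. The sketch below shows how a caller would bracket an inode-record merge with them; the function name is hypothetical and the merge body is elided, with only the ir_startino/ir_holemask fields of the in-core inobt record assumed:

/* Hypothetical caller sketch; the real merge logic is not part of this diff. */
STATIC void
xfs_inobt_rec_merge_traced(
	struct xfs_mount		*mp,
	xfs_agnumber_t			agno,
	struct xfs_inobt_rec_incore	*rec,	/* record being updated */
	struct xfs_inobt_rec_incore	*nrec)	/* record merged into it */
{
	trace_xfs_irec_merge_pre(mp, agno, rec->ir_startino,
				 rec->ir_holemask, nrec->ir_startino,
				 nrec->ir_holemask);

	/* ... merge nrec into rec, clearing the covered holemask bits ... */

	trace_xfs_irec_merge_post(mp, agno, rec->ir_startino,
				  rec->ir_holemask);
}

When tracepoints are configured out, the generated trace_*() calls compile to nothing, which is why they are cheap enough to leave on paths like record merging.
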
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index eb90cd59a0ec..0582a27107d4 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -113,7 +113,7 @@ xfs_trans_free( * blocks. Locks and log items, however, are not inherited. They must * be added to the new transaction explicitly. */ -xfs_trans_t * +STATIC xfs_trans_t * xfs_trans_dup( xfs_trans_t *tp) { @@ -173,7 +173,7 @@ xfs_trans_reserve( uint rtextents) { int error = 0; - int rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; + bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; /* Mark this thread as being in a transaction */ current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); @@ -184,8 +184,7 @@ * fail if the count would go below zero. */ if (blocks > 0) { - error = xfs_icsb_modify_counters(tp->t_mountp, XFS_SBS_FDBLOCKS, - -((int64_t)blocks), rsvd); + error = xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd); if (error != 0) { current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); return -ENOSPC; } @@ -236,8 +235,7 @@ xfs_trans_reserve( * fail if the count would go below zero. */ if (rtextents > 0) { - error = xfs_mod_incore_sb(tp->t_mountp, XFS_SBS_FREXTENTS, - -((int64_t)rtextents), rsvd); + error = xfs_mod_frextents(tp->t_mountp, -((int64_t)rtextents)); if (error) { error = -ENOSPC; goto undo_log; } @@ -253,14 +251,7 @@ */ undo_log: if (resp->tr_logres > 0) { - int log_flags; - - if (resp->tr_logflags & XFS_TRANS_PERM_LOG_RES) { - log_flags = XFS_LOG_REL_PERM_RESERV; - } else { - log_flags = 0; - } - xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, log_flags); + xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, false); tp->t_ticket = NULL; tp->t_log_res = 0; tp->t_flags &= ~XFS_TRANS_PERM_LOG_RES; } @@ -268,8 +259,7 @@ undo_blocks: if (blocks > 0) { - xfs_icsb_modify_counters(tp->t_mountp, XFS_SBS_FDBLOCKS, - (int64_t)blocks, rsvd); + xfs_mod_fdblocks(tp->t_mountp, (int64_t)blocks, rsvd); tp->t_blk_res = 0; } @@ -488,6 +478,54 @@ xfs_trans_apply_sb_deltas( sizeof(sbp->sb_frextents) - 1); } +STATIC int +xfs_sb_mod8( + uint8_t *field, + int8_t delta) +{ + int8_t counter = *field; + + counter += delta; + if (counter < 0) { + ASSERT(0); + return -EINVAL; + } + *field = counter; + return 0; +} + +STATIC int +xfs_sb_mod32( + uint32_t *field, + int32_t delta) +{ + int32_t counter = *field; + + counter += delta; + if (counter < 0) { + ASSERT(0); + return -EINVAL; + } + *field = counter; + return 0; +} + +STATIC int +xfs_sb_mod64( + uint64_t *field, + int64_t delta) +{ + int64_t counter = *field; + + counter += delta; + if (counter < 0) { + ASSERT(0); + return -EINVAL; + } + *field = counter; + return 0; +} + /* * xfs_trans_unreserve_and_mod_sb() is called to release unused reservations * and apply superblock counter changes to the in-core superblock. The @@ -495,13 +533,6 @@ xfs_trans_apply_sb_deltas( * applied to the in-core superblock. The idea is that that has already been * done. * - * This is done efficiently with a single call to xfs_mod_incore_sb_batch(). - * However, we have to ensure that we only modify each superblock field only - * once because the application of the delta values may not be atomic. That can - * lead to ENOSPC races occurring if we have two separate modifcations of the - * free space counter to put back the entire reservation and then take away - * what we used. - * * If we are not logging superblock counters, then the inode allocated/free and * used block counts are not updated in the on disk superblock.
In this case, * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we @@ -509,21 +540,15 @@ xfs_trans_apply_sb_deltas( */ void xfs_trans_unreserve_and_mod_sb( - xfs_trans_t *tp) + struct xfs_trans *tp) { - xfs_mod_sb_t msb[9]; /* If you add cases, add entries */ - xfs_mod_sb_t *msbp; - xfs_mount_t *mp = tp->t_mountp; - /* REFERENCED */ - int error; - int rsvd; - int64_t blkdelta = 0; - int64_t rtxdelta = 0; - int64_t idelta = 0; - int64_t ifreedelta = 0; - - msbp = msb; - rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; + struct xfs_mount *mp = tp->t_mountp; + bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; + int64_t blkdelta = 0; + int64_t rtxdelta = 0; + int64_t idelta = 0; + int64_t ifreedelta = 0; + int error; /* calculate deltas */ if (tp->t_blk_res > 0) @@ -547,97 +572,115 @@ xfs_trans_unreserve_and_mod_sb( /* apply the per-cpu counters */ if (blkdelta) { - error = xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, - blkdelta, rsvd); + error = xfs_mod_fdblocks(mp, blkdelta, rsvd); if (error) goto out; } if (idelta) { - error = xfs_icsb_modify_counters(mp, XFS_SBS_ICOUNT, - idelta, rsvd); + error = xfs_mod_icount(mp, idelta); if (error) goto out_undo_fdblocks; } if (ifreedelta) { - error = xfs_icsb_modify_counters(mp, XFS_SBS_IFREE, - ifreedelta, rsvd); + error = xfs_mod_ifree(mp, ifreedelta); if (error) goto out_undo_icount; } + if (rtxdelta == 0 && !(tp->t_flags & XFS_TRANS_SB_DIRTY)) + return; + /* apply remaining deltas */ - if (rtxdelta != 0) { - msbp->msb_field = XFS_SBS_FREXTENTS; - msbp->msb_delta = rtxdelta; - msbp++; + spin_lock(&mp->m_sb_lock); + if (rtxdelta) { + error = xfs_sb_mod64(&mp->m_sb.sb_frextents, rtxdelta); + if (error) + goto out_undo_ifree; } - if (tp->t_flags & XFS_TRANS_SB_DIRTY) { - if (tp->t_dblocks_delta != 0) { - msbp->msb_field = XFS_SBS_DBLOCKS; - msbp->msb_delta = tp->t_dblocks_delta; - msbp++; - } - if (tp->t_agcount_delta != 0) { - msbp->msb_field = XFS_SBS_AGCOUNT; - msbp->msb_delta = tp->t_agcount_delta; - msbp++; - } - if (tp->t_imaxpct_delta != 0) { - msbp->msb_field = XFS_SBS_IMAX_PCT; - msbp->msb_delta = tp->t_imaxpct_delta; - msbp++; - } - if (tp->t_rextsize_delta != 0) { - msbp->msb_field = XFS_SBS_REXTSIZE; - msbp->msb_delta = tp->t_rextsize_delta; - msbp++; - } - if (tp->t_rbmblocks_delta != 0) { - msbp->msb_field = XFS_SBS_RBMBLOCKS; - msbp->msb_delta = tp->t_rbmblocks_delta; - msbp++; - } - if (tp->t_rblocks_delta != 0) { - msbp->msb_field = XFS_SBS_RBLOCKS; - msbp->msb_delta = tp->t_rblocks_delta; - msbp++; - } - if (tp->t_rextents_delta != 0) { - msbp->msb_field = XFS_SBS_REXTENTS; - msbp->msb_delta = tp->t_rextents_delta; - msbp++; - } - if (tp->t_rextslog_delta != 0) { - msbp->msb_field = XFS_SBS_REXTSLOG; - msbp->msb_delta = tp->t_rextslog_delta; - msbp++; - } + if (tp->t_dblocks_delta != 0) { + error = xfs_sb_mod64(&mp->m_sb.sb_dblocks, tp->t_dblocks_delta); + if (error) + goto out_undo_frextents; } - - /* - * If we need to change anything, do it. 
- */ - if (msbp > msb) { - error = xfs_mod_incore_sb_batch(tp->t_mountp, msb, - (uint)(msbp - msb), rsvd); + if (tp->t_agcount_delta != 0) { + error = xfs_sb_mod32(&mp->m_sb.sb_agcount, tp->t_agcount_delta); if (error) - goto out_undo_ifreecount; + goto out_undo_dblocks; } - + if (tp->t_imaxpct_delta != 0) { + error = xfs_sb_mod8(&mp->m_sb.sb_imax_pct, tp->t_imaxpct_delta); + if (error) + goto out_undo_agcount; + } + if (tp->t_rextsize_delta != 0) { + error = xfs_sb_mod32(&mp->m_sb.sb_rextsize, + tp->t_rextsize_delta); + if (error) + goto out_undo_imaxpct; + } + if (tp->t_rbmblocks_delta != 0) { + error = xfs_sb_mod32(&mp->m_sb.sb_rbmblocks, + tp->t_rbmblocks_delta); + if (error) + goto out_undo_rextsize; + } + if (tp->t_rblocks_delta != 0) { + error = xfs_sb_mod64(&mp->m_sb.sb_rblocks, tp->t_rblocks_delta); + if (error) + goto out_undo_rbmblocks; + } + if (tp->t_rextents_delta != 0) { + error = xfs_sb_mod64(&mp->m_sb.sb_rextents, + tp->t_rextents_delta); + if (error) + goto out_undo_rblocks; + } + if (tp->t_rextslog_delta != 0) { + error = xfs_sb_mod8(&mp->m_sb.sb_rextslog, + tp->t_rextslog_delta); + if (error) + goto out_undo_rextents; + } + spin_unlock(&mp->m_sb_lock); return; -out_undo_ifreecount: +out_undo_rextents: + if (tp->t_rextents_delta) + xfs_sb_mod64(&mp->m_sb.sb_rextents, -tp->t_rextents_delta); +out_undo_rblocks: + if (tp->t_rblocks_delta) + xfs_sb_mod64(&mp->m_sb.sb_rblocks, -tp->t_rblocks_delta); +out_undo_rbmblocks: + if (tp->t_rbmblocks_delta) + xfs_sb_mod32(&mp->m_sb.sb_rbmblocks, -tp->t_rbmblocks_delta); +out_undo_rextsize: + if (tp->t_rextsize_delta) + xfs_sb_mod32(&mp->m_sb.sb_rextsize, -tp->t_rextsize_delta); +out_undo_imaxpct: + if (tp->t_imaxpct_delta) + xfs_sb_mod8(&mp->m_sb.sb_imax_pct, -tp->t_imaxpct_delta); +out_undo_agcount: + if (tp->t_agcount_delta) + xfs_sb_mod32(&mp->m_sb.sb_agcount, -tp->t_agcount_delta); +out_undo_dblocks: + if (tp->t_dblocks_delta) + xfs_sb_mod64(&mp->m_sb.sb_dblocks, -tp->t_dblocks_delta); +out_undo_frextents: + if (rtxdelta) + xfs_sb_mod64(&mp->m_sb.sb_frextents, -rtxdelta); +out_undo_ifree: + spin_unlock(&mp->m_sb_lock); if (ifreedelta) - xfs_icsb_modify_counters(mp, XFS_SBS_IFREE, -ifreedelta, rsvd); + xfs_mod_ifree(mp, -ifreedelta); out_undo_icount: if (idelta) - xfs_icsb_modify_counters(mp, XFS_SBS_ICOUNT, -idelta, rsvd); + xfs_mod_icount(mp, -idelta); out_undo_fdblocks: if (blkdelta) - xfs_icsb_modify_counters(mp, XFS_SBS_FDBLOCKS, -blkdelta, rsvd); + xfs_mod_fdblocks(mp, -blkdelta, rsvd); out: ASSERT(error == 0); return; @@ -694,7 +737,7 @@ void xfs_trans_free_items( struct xfs_trans *tp, xfs_lsn_t commit_lsn, - int flags) + bool abort) { struct xfs_log_item_desc *lidp, *next; @@ -705,7 +748,7 @@ xfs_trans_free_items( if (commit_lsn != NULLCOMMITLSN) lip->li_ops->iop_committing(lip, commit_lsn); - if (flags & XFS_TRANS_ABORT) + if (abort) lip->li_flags |= XFS_LI_ABORTED; lip->li_ops->iop_unlock(lip); @@ -842,27 +885,17 @@ xfs_trans_committed_bulk( * have already been unlocked as if the commit had succeeded. * Do not reference the transaction structure after this call. */ -int -xfs_trans_commit( +static int +__xfs_trans_commit( struct xfs_trans *tp, - uint flags) + bool regrant) { struct xfs_mount *mp = tp->t_mountp; xfs_lsn_t commit_lsn = -1; int error = 0; - int log_flags = 0; int sync = tp->t_flags & XFS_TRANS_SYNC; /* - * Determine whether this commit is releasing a permanent - * log reservation or not.
- */ - if (flags & XFS_TRANS_RELEASE_LOG_RES) { - ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); - log_flags = XFS_LOG_REL_PERM_RESERV; - } - - /* * If there is nothing to be logged by the transaction, * then unlock all of the items associated with the * transaction and free the transaction structure. @@ -886,7 +919,7 @@ xfs_trans_commit( xfs_trans_apply_sb_deltas(tp); xfs_trans_apply_dquot_deltas(tp); - xfs_log_commit_cil(mp, tp, &commit_lsn, flags); + xfs_log_commit_cil(mp, tp, &commit_lsn, regrant); current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); xfs_trans_free(tp); @@ -914,18 +947,25 @@ out_unreserve: */ xfs_trans_unreserve_and_mod_dquots(tp); if (tp->t_ticket) { - commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, log_flags); + commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, regrant); if (commit_lsn == -1 && !error) error = -EIO; } current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); - xfs_trans_free_items(tp, NULLCOMMITLSN, error ? XFS_TRANS_ABORT : 0); + xfs_trans_free_items(tp, NULLCOMMITLSN, !!error); xfs_trans_free(tp); XFS_STATS_INC(xs_trans_empty); return error; } +int +xfs_trans_commit( + struct xfs_trans *tp) +{ + return __xfs_trans_commit(tp, false); +} + /* * Unlock all of the transaction's items and free the transaction. * The transaction must not have modified any of its items, because @@ -936,29 +976,22 @@ out_unreserve: */ void xfs_trans_cancel( - xfs_trans_t *tp, - int flags) + struct xfs_trans *tp) { - int log_flags; - xfs_mount_t *mp = tp->t_mountp; + struct xfs_mount *mp = tp->t_mountp; + bool dirty = (tp->t_flags & XFS_TRANS_DIRTY); /* - * See if the caller is being too lazy to figure out if - * the transaction really needs an abort. - */ - if ((flags & XFS_TRANS_ABORT) && !(tp->t_flags & XFS_TRANS_DIRTY)) - flags &= ~XFS_TRANS_ABORT; - /* * See if the caller is relying on us to shut down the * filesystem. This happens in paths where we detect * corruption and decide to give up. */ - if ((tp->t_flags & XFS_TRANS_DIRTY) && !XFS_FORCED_SHUTDOWN(mp)) { + if (dirty && !XFS_FORCED_SHUTDOWN(mp)) { XFS_ERROR_REPORT("xfs_trans_cancel", XFS_ERRLEVEL_LOW, mp); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); } #ifdef DEBUG - if (!(flags & XFS_TRANS_ABORT) && !XFS_FORCED_SHUTDOWN(mp)) { + if (!dirty && !XFS_FORCED_SHUTDOWN(mp)) { struct xfs_log_item_desc *lidp; list_for_each_entry(lidp, &tp->t_items, lid_trans) @@ -968,27 +1001,20 @@ xfs_trans_cancel( xfs_trans_unreserve_and_mod_sb(tp); xfs_trans_unreserve_and_mod_dquots(tp); - if (tp->t_ticket) { - if (flags & XFS_TRANS_RELEASE_LOG_RES) { - ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); - log_flags = XFS_LOG_REL_PERM_RESERV; - } else { - log_flags = 0; - } - xfs_log_done(mp, tp->t_ticket, NULL, log_flags); - } + if (tp->t_ticket) + xfs_log_done(mp, tp->t_ticket, NULL, false); /* mark this thread as no longer being in a transaction */ current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); - xfs_trans_free_items(tp, NULLCOMMITLSN, flags); + xfs_trans_free_items(tp, NULLCOMMITLSN, dirty); xfs_trans_free(tp); } /* * Roll from one trans in the sequence of PERMANENT transactions to * the next: permanent transactions are only flushed out when - * committed with XFS_TRANS_RELEASE_LOG_RES, but we still want as soon + * committed with xfs_trans_commit(), but we still want as soon * as possible to let chunks of it go to the log. So we commit the * chunk we've been working on and get a new transaction to continue. */ @@ -1005,7 +1031,8 @@ xfs_trans_roll( * Ensure that the inode is always logged. 
*/ trans = *tpp; - xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE); + if (dp) + xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE); /* * Copy the critical parameters from one trans to the next. @@ -1021,20 +1048,13 @@ * is in progress. The caller takes the responsibility to cancel * the duplicate transaction that gets returned. */ - error = xfs_trans_commit(trans, 0); + error = __xfs_trans_commit(trans, true); if (error) return error; trans = *tpp; /* - * transaction commit worked ok so we can drop the extra ticket - * reference that we gained in xfs_trans_dup() - */ - xfs_log_ticket_put(trans->t_ticket); - - - /* * Reserve space in the log for the next transaction. * This also pushes items in the "AIL", the list of logged items, * out to disk if they are taking up space at the tail of the log @@ -1050,6 +1070,7 @@ if (error) return error; - xfs_trans_ijoin(trans, dp, 0); + if (dp) + xfs_trans_ijoin(trans, dp, 0); return 0; } diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index b5bc1ab3c4da..3b21b4e5e467 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -133,8 +133,6 @@ typedef struct xfs_trans { * XFS transaction mechanism exported interfaces that are * actually macros. */ -#define xfs_trans_get_log_res(tp) ((tp)->t_log_res) -#define xfs_trans_get_log_count(tp) ((tp)->t_log_count) #define xfs_trans_get_block_res(tp) ((tp)->t_blk_res) #define xfs_trans_set_sync(tp) ((tp)->t_flags |= XFS_TRANS_SYNC) @@ -153,7 +151,6 @@ typedef struct xfs_trans { */ xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint); xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint, xfs_km_flags_t); -xfs_trans_t *xfs_trans_dup(xfs_trans_t *); int xfs_trans_reserve(struct xfs_trans *, struct xfs_trans_res *, uint, uint); void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t); @@ -228,9 +225,9 @@ void xfs_trans_log_efd_extent(xfs_trans_t *, struct xfs_efd_log_item *, xfs_fsblock_t, xfs_extlen_t); -int xfs_trans_commit(xfs_trans_t *, uint flags); +int xfs_trans_commit(struct xfs_trans *); int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *); -void xfs_trans_cancel(xfs_trans_t *, int); +void xfs_trans_cancel(xfs_trans_t *); int xfs_trans_ail_init(struct xfs_mount *); void xfs_trans_ail_destroy(struct xfs_mount *); diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 573aefb5a573..1098cf490189 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -159,7 +159,7 @@ xfs_trans_ail_cursor_next( { struct xfs_log_item *lip = cur->item; - if ((__psint_t)lip & 1) + if ((uintptr_t)lip & 1) lip = xfs_ail_min(ailp); if (lip) cur->item = xfs_ail_next(ailp, lip); @@ -196,7 +196,7 @@ xfs_trans_ail_cursor_clear( list_for_each_entry(cur, &ailp->xa_cursors, list) { if (cur->item == lip) cur->item = (struct xfs_log_item *) - ((__psint_t)cur->item | 1); + ((uintptr_t)cur->item | 1); } } @@ -287,7 +287,7 @@ xfs_ail_splice( * find the place in the AIL where the items belong. */ lip = cur ?
cur->item : NULL; - if (!lip || (__psint_t) lip & 1) + if (!lip || (uintptr_t)lip & 1) lip = __xfs_trans_ail_cursor_last(ailp, lsn); /* diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 76a16df55ef7..ce78534a047e 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -90,8 +90,9 @@ xfs_trans_dup_dqinfo( xfs_trans_t *ntp) { xfs_dqtrx_t *oq, *nq; - int i,j; + int i, j; xfs_dqtrx_t *oqa, *nqa; + ulong blk_res_used; if (!otp->t_dqinfo) return; @@ -102,18 +103,23 @@ xfs_trans_dup_dqinfo( * Because the quota blk reservation is carried forward, * it is also necessary to carry forward the DQ_DIRTY flag. */ - if(otp->t_flags & XFS_TRANS_DQ_DIRTY) + if (otp->t_flags & XFS_TRANS_DQ_DIRTY) ntp->t_flags |= XFS_TRANS_DQ_DIRTY; for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) { oqa = otp->t_dqinfo->dqs[j]; nqa = ntp->t_dqinfo->dqs[j]; for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { + blk_res_used = 0; + if (oqa[i].qt_dquot == NULL) break; oq = &oqa[i]; nq = &nqa[i]; + if (oq->qt_blk_res && oq->qt_bcount_delta > 0) + blk_res_used = oq->qt_bcount_delta; + nq->qt_dquot = oq->qt_dquot; nq->qt_bcount_delta = nq->qt_icount_delta = 0; nq->qt_rtbcount_delta = 0; @@ -121,8 +127,8 @@ xfs_trans_dup_dqinfo( /* * Transfer whatever is left of the reservations. */ - nq->qt_blk_res = oq->qt_blk_res - oq->qt_blk_res_used; - oq->qt_blk_res = oq->qt_blk_res_used; + nq->qt_blk_res = oq->qt_blk_res - blk_res_used; + oq->qt_blk_res = blk_res_used; nq->qt_rtblk_res = oq->qt_rtblk_res - oq->qt_rtblk_res_used; @@ -239,10 +245,6 @@ xfs_trans_mod_dquot( * disk blocks used. */ case XFS_TRANS_DQ_BCOUNT: - if (qtrx->qt_blk_res && delta > 0) { - qtrx->qt_blk_res_used += (ulong)delta; - ASSERT(qtrx->qt_blk_res >= qtrx->qt_blk_res_used); - } qtrx->qt_bcount_delta += delta; break; @@ -423,15 +425,19 @@ xfs_trans_apply_dquot_deltas( * reservation that a transaction structure knows of. 
*/ if (qtrx->qt_blk_res != 0) { - if (qtrx->qt_blk_res != qtrx->qt_blk_res_used) { - if (qtrx->qt_blk_res > - qtrx->qt_blk_res_used) + ulong blk_res_used = 0; + + if (qtrx->qt_bcount_delta > 0) + blk_res_used = qtrx->qt_bcount_delta; + + if (qtrx->qt_blk_res != blk_res_used) { + if (qtrx->qt_blk_res > blk_res_used) dqp->q_res_bcount -= (xfs_qcnt_t) (qtrx->qt_blk_res - - qtrx->qt_blk_res_used); + blk_res_used); else dqp->q_res_bcount -= (xfs_qcnt_t) - (qtrx->qt_blk_res_used - + (blk_res_used - qtrx->qt_blk_res); } } else { diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index bd1281862ad7..1b736294558a 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -30,7 +30,7 @@ void xfs_trans_init(struct xfs_mount *); void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); void xfs_trans_del_item(struct xfs_log_item *); void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn, - int flags); + bool abort); void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); void xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv, diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 69f6e475de97..c036815183cb 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -35,7 +35,7 @@ static int xfs_xattr_get(struct dentry *dentry, const char *name, void *value, size_t size, int xflags) { - struct xfs_inode *ip = XFS_I(dentry->d_inode); + struct xfs_inode *ip = XFS_I(d_inode(dentry)); int error, asize = size; if (strcmp(name, "") == 0) @@ -57,7 +57,7 @@ static int xfs_xattr_set(struct dentry *dentry, const char *name, const void *value, size_t size, int flags, int xflags) { - struct xfs_inode *ip = XFS_I(dentry->d_inode); + struct xfs_inode *ip = XFS_I(d_inode(dentry)); if (strcmp(name, "") == 0) return -EINVAL; @@ -197,7 +197,7 @@ xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size) { struct xfs_attr_list_context context; struct attrlist_cursor_kern cursor = { 0 }; - struct inode *inode = dentry->d_inode; + struct inode *inode = d_inode(dentry); int error; /* |
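
The xfs_xattr.c hunks above are part of a tree-wide conversion from open-coded dentry->d_inode dereferences to the d_inode() accessor. The helper itself is a one-line static inline; the value of the conversion is that every read of the field now goes through a single function that later dentry work can instrument. A sketch of the idiom, reduced to the one field it touches (the real helper lives in include/linux/dcache.h):

/*
 * Accessor idiom sketch: centralizing the read gives one place
 * to add assertions or remapping later.
 */
struct inode;

struct dentry {
	struct inode *d_inode;	/* NULL for a negative dentry */
	/* ... */
};

static inline struct inode *d_inode(const struct dentry *dentry)
{
	return dentry->d_inode;
}

Callers then read XFS_I(d_inode(dentry)) exactly as the converted xfs_xattr_get() above does.
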