diff options
Diffstat (limited to 'fs')
69 files changed, 1100 insertions, 532 deletions
@@ -141,9 +141,6 @@ static void aio_free_ring(struct kioctx *ctx) for (i = 0; i < ctx->nr_pages; i++) put_page(ctx->ring_pages[i]); - if (ctx->mmap_size) - vm_munmap(ctx->mmap_base, ctx->mmap_size); - if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) kfree(ctx->ring_pages); } @@ -322,11 +319,6 @@ static void free_ioctx(struct kioctx *ctx) aio_free_ring(ctx); - spin_lock(&aio_nr_lock); - BUG_ON(aio_nr - ctx->max_reqs > aio_nr); - aio_nr -= ctx->max_reqs; - spin_unlock(&aio_nr_lock); - pr_debug("freeing %p\n", ctx); /* @@ -435,17 +427,24 @@ static void kill_ioctx(struct kioctx *ctx) { if (!atomic_xchg(&ctx->dead, 1)) { hlist_del_rcu(&ctx->list); - /* Between hlist_del_rcu() and dropping the initial ref */ - synchronize_rcu(); /* - * We can't punt to workqueue here because put_ioctx() -> - * free_ioctx() will unmap the ringbuffer, and that has to be - * done in the original process's context. kill_ioctx_rcu/work() - * exist for exit_aio(), as in that path free_ioctx() won't do - * the unmap. + * It'd be more correct to do this in free_ioctx(), after all + * the outstanding kiocbs have finished - but by then io_destroy + * has already returned, so io_setup() could potentially return + * -EAGAIN with no ioctxs actually in use (as far as userspace + * could tell). */ - kill_ioctx_work(&ctx->rcu_work); + spin_lock(&aio_nr_lock); + BUG_ON(aio_nr - ctx->max_reqs > aio_nr); + aio_nr -= ctx->max_reqs; + spin_unlock(&aio_nr_lock); + + if (ctx->mmap_size) + vm_munmap(ctx->mmap_base, ctx->mmap_size); + + /* Between hlist_del_rcu() and dropping the initial ref */ + call_rcu(&ctx->rcu_head, kill_ioctx_rcu); } } @@ -495,10 +494,7 @@ void exit_aio(struct mm_struct *mm) */ ctx->mmap_size = 0; - if (!atomic_xchg(&ctx->dead, 1)) { - hlist_del_rcu(&ctx->list); - call_rcu(&ctx->rcu_head, kill_ioctx_rcu); - } + kill_ioctx(ctx); } } diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 8615ee89ab55..f95dddced968 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -265,8 +265,8 @@ befs_readdir(struct file *filp, void *dirent, filldir_t filldir) result = filldir(dirent, keybuf, keysize, filp->f_pos, (ino_t) value, d_type); } - - filp->f_pos++; + if (!result) + filp->f_pos++; befs_debug(sb, "<--- befs_readdir() filp->f_pos %Ld", filp->f_pos); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e7b3cb5286a5..b8b60b660c8f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2859,8 +2859,8 @@ fail_qgroup: btrfs_free_qgroup_config(fs_info); fail_trans_kthread: kthread_stop(fs_info->transaction_kthread); - del_fs_roots(fs_info); btrfs_cleanup_transaction(fs_info->tree_root); + del_fs_roots(fs_info); fail_cleaner: kthread_stop(fs_info->cleaner_kthread); @@ -3512,15 +3512,15 @@ int close_ctree(struct btrfs_root *root) percpu_counter_sum(&fs_info->delalloc_bytes)); } - free_root_pointers(fs_info, 1); - btrfs_free_block_groups(fs_info); + btrfs_stop_all_workers(fs_info); + del_fs_roots(fs_info); - iput(fs_info->btree_inode); + free_root_pointers(fs_info, 1); - btrfs_stop_all_workers(fs_info); + iput(fs_info->btree_inode); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY if (btrfs_test_opt(root, CHECK_INTEGRITY)) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index af978f7682b3..17f3064b4a3e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8012,6 +8012,9 @@ int btrfs_drop_inode(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; + if (root == NULL) + return 1; + /* the snap/subvol tree is on deleting */ if (btrfs_root_refs(&root->root_item) == 0 && root != root->fs_info->tree_root) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 395b82031a42..4febca4fc2de 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4082,7 +4082,7 @@ out: return inode; } -static struct reloc_control *alloc_reloc_control(void) +static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) { struct reloc_control *rc; @@ -4093,7 +4093,8 @@ static struct reloc_control *alloc_reloc_control(void) INIT_LIST_HEAD(&rc->reloc_roots); backref_cache_init(&rc->backref_cache); mapping_tree_init(&rc->reloc_root_tree); - extent_io_tree_init(&rc->processed_blocks, NULL); + extent_io_tree_init(&rc->processed_blocks, + fs_info->btree_inode->i_mapping); return rc; } @@ -4110,7 +4111,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) int rw = 0; int err = 0; - rc = alloc_reloc_control(); + rc = alloc_reloc_control(fs_info); if (!rc) return -ENOMEM; @@ -4311,7 +4312,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) if (list_empty(&reloc_roots)) goto out; - rc = alloc_reloc_control(); + rc = alloc_reloc_control(root->fs_info); if (!rc) { err = -ENOMEM; goto out; diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 202dd3d68be0..ebbf680378e2 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -191,27 +191,23 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count) } /** - * Encode the flock and fcntl locks for the given inode into the pagelist. - * Format is: #fcntl locks, sequential fcntl locks, #flock locks, - * sequential flock locks. - * Must be called with lock_flocks() already held. - * If we encounter more of a specific lock type than expected, - * we return the value 1. + * Encode the flock and fcntl locks for the given inode into the ceph_filelock + * array. Must be called with lock_flocks() already held. + * If we encounter more of a specific lock type than expected, return -ENOSPC. */ -int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist, - int num_fcntl_locks, int num_flock_locks) +int ceph_encode_locks_to_buffer(struct inode *inode, + struct ceph_filelock *flocks, + int num_fcntl_locks, int num_flock_locks) { struct file_lock *lock; - struct ceph_filelock cephlock; int err = 0; int seen_fcntl = 0; int seen_flock = 0; + int l = 0; dout("encoding %d flock and %d fcntl locks", num_flock_locks, num_fcntl_locks); - err = ceph_pagelist_append(pagelist, &num_fcntl_locks, sizeof(u32)); - if (err) - goto fail; + for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { if (lock->fl_flags & FL_POSIX) { ++seen_fcntl; @@ -219,19 +215,12 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist, err = -ENOSPC; goto fail; } - err = lock_to_ceph_filelock(lock, &cephlock); + err = lock_to_ceph_filelock(lock, &flocks[l]); if (err) goto fail; - err = ceph_pagelist_append(pagelist, &cephlock, - sizeof(struct ceph_filelock)); + ++l; } - if (err) - goto fail; } - - err = ceph_pagelist_append(pagelist, &num_flock_locks, sizeof(u32)); - if (err) - goto fail; for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { if (lock->fl_flags & FL_FLOCK) { ++seen_flock; @@ -239,19 +228,51 @@ int ceph_encode_locks(struct inode *inode, struct ceph_pagelist *pagelist, err = -ENOSPC; goto fail; } - err = lock_to_ceph_filelock(lock, &cephlock); + err = lock_to_ceph_filelock(lock, &flocks[l]); if (err) goto fail; - err = ceph_pagelist_append(pagelist, &cephlock, - sizeof(struct ceph_filelock)); + ++l; } - if (err) - goto fail; } fail: return err; } +/** + * Copy the encoded flock and fcntl locks into the pagelist. + * Format is: #fcntl locks, sequential fcntl locks, #flock locks, + * sequential flock locks. + * Returns zero on success. + */ +int ceph_locks_to_pagelist(struct ceph_filelock *flocks, + struct ceph_pagelist *pagelist, + int num_fcntl_locks, int num_flock_locks) +{ + int err = 0; + __le32 nlocks; + + nlocks = cpu_to_le32(num_fcntl_locks); + err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks)); + if (err) + goto out_fail; + + err = ceph_pagelist_append(pagelist, flocks, + num_fcntl_locks * sizeof(*flocks)); + if (err) + goto out_fail; + + nlocks = cpu_to_le32(num_flock_locks); + err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks)); + if (err) + goto out_fail; + + err = ceph_pagelist_append(pagelist, + &flocks[num_fcntl_locks], + num_flock_locks * sizeof(*flocks)); +out_fail: + return err; +} + /* * Given a pointer to a lock, convert it to a ceph filelock */ diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 4f22671a5bd4..4d2920304be8 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2478,39 +2478,44 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, if (recon_state->flock) { int num_fcntl_locks, num_flock_locks; - struct ceph_pagelist_cursor trunc_point; - - ceph_pagelist_set_cursor(pagelist, &trunc_point); - do { - lock_flocks(); - ceph_count_locks(inode, &num_fcntl_locks, - &num_flock_locks); - rec.v2.flock_len = (2*sizeof(u32) + - (num_fcntl_locks+num_flock_locks) * - sizeof(struct ceph_filelock)); - unlock_flocks(); - - /* pre-alloc pagelist */ - ceph_pagelist_truncate(pagelist, &trunc_point); - err = ceph_pagelist_append(pagelist, &rec, reclen); - if (!err) - err = ceph_pagelist_reserve(pagelist, - rec.v2.flock_len); - - /* encode locks */ - if (!err) { - lock_flocks(); - err = ceph_encode_locks(inode, - pagelist, - num_fcntl_locks, - num_flock_locks); - unlock_flocks(); - } - } while (err == -ENOSPC); + struct ceph_filelock *flocks; + +encode_again: + lock_flocks(); + ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); + unlock_flocks(); + flocks = kmalloc((num_fcntl_locks+num_flock_locks) * + sizeof(struct ceph_filelock), GFP_NOFS); + if (!flocks) { + err = -ENOMEM; + goto out_free; + } + lock_flocks(); + err = ceph_encode_locks_to_buffer(inode, flocks, + num_fcntl_locks, + num_flock_locks); + unlock_flocks(); + if (err) { + kfree(flocks); + if (err == -ENOSPC) + goto encode_again; + goto out_free; + } + /* + * number of encoded locks is stable, so copy to pagelist + */ + rec.v2.flock_len = cpu_to_le32(2*sizeof(u32) + + (num_fcntl_locks+num_flock_locks) * + sizeof(struct ceph_filelock)); + err = ceph_pagelist_append(pagelist, &rec, reclen); + if (!err) + err = ceph_locks_to_pagelist(flocks, pagelist, + num_fcntl_locks, + num_flock_locks); + kfree(flocks); } else { err = ceph_pagelist_append(pagelist, &rec, reclen); } - out_free: kfree(path); out_dput: diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 8696be2ff679..7ccfdb4aea2e 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -822,8 +822,13 @@ extern const struct export_operations ceph_export_ops; extern int ceph_lock(struct file *file, int cmd, struct file_lock *fl); extern int ceph_flock(struct file *file, int cmd, struct file_lock *fl); extern void ceph_count_locks(struct inode *inode, int *p_num, int *f_num); -extern int ceph_encode_locks(struct inode *i, struct ceph_pagelist *p, - int p_locks, int f_locks); +extern int ceph_encode_locks_to_buffer(struct inode *inode, + struct ceph_filelock *flocks, + int num_fcntl_locks, + int num_flock_locks); +extern int ceph_locks_to_pagelist(struct ceph_filelock *flocks, + struct ceph_pagelist *pagelist, + int num_fcntl_locks, int num_flock_locks); extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c); /* debugfs.c */ diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 5b97e56ddbca..e3bc39bb9d12 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3279,8 +3279,8 @@ build_unc_path_to_root(const struct smb_vol *vol, pos = full_path + unc_len; if (pplen) { - *pos++ = CIFS_DIR_SEP(cifs_sb); - strncpy(pos, vol->prepath, pplen); + *pos = CIFS_DIR_SEP(cifs_sb); + strncpy(pos + 1, vol->prepath, pplen); pos += pplen; } diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 201f0a0d6b0a..a7abbea2c096 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -295,6 +295,12 @@ static int ecryptfs_release(struct inode *inode, struct file *file) static int ecryptfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) { + int rc; + + rc = filemap_write_and_wait(file->f_mapping); + if (rc) + return rc; + return vfs_fsync(ecryptfs_file_to_lower(file), datasync); } diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c index bfb531564319..8dd524f32284 100644 --- a/fs/efivarfs/file.c +++ b/fs/efivarfs/file.c @@ -44,8 +44,11 @@ static ssize_t efivarfs_file_write(struct file *file, bytes = efivar_entry_set_get_size(var, attributes, &datasize, data, &set); - if (!set && bytes) + if (!set && bytes) { + if (bytes == -ENOENT) + bytes = -EIO; goto out; + } if (bytes == -ENOENT) { drop_nlink(inode); @@ -76,7 +79,14 @@ static ssize_t efivarfs_file_read(struct file *file, char __user *userbuf, int err; err = efivar_entry_size(var, &datasize); - if (err) + + /* + * efivarfs represents uncommitted variables with + * zero-length files. Reading them should return EOF. + */ + if (err == -ENOENT) + return 0; + else if (err) return err; data = kmalloc(datasize + sizeof(attributes), GFP_KERNEL); diff --git a/fs/exec.c b/fs/exec.c index 643019585574..ffd7a813ad3d 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1135,13 +1135,6 @@ void setup_new_exec(struct linux_binprm * bprm) set_dumpable(current->mm, suid_dumpable); } - /* - * Flush performance counters when crossing a - * security domain: - */ - if (!get_dumpable(current->mm)) - perf_event_exit_task(current); - /* An exec changes our domain. We are no longer part of the thread group */ @@ -1205,6 +1198,15 @@ void install_exec_creds(struct linux_binprm *bprm) commit_creds(bprm->cred); bprm->cred = NULL; + + /* + * Disable monitoring for regular users + * when executing setuid binaries. Must + * wait until new credentials are committed + * by commit_creds() above + */ + if (get_dumpable(current->mm) != SUID_DUMP_USER) + perf_event_exit_task(current); /* * cred_guard_mutex must be held at least to this point to prevent * ptrace_attach() from altering our determination of the task's diff --git a/fs/file_table.c b/fs/file_table.c index cd4d87a82951..485dc0eddd67 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -306,17 +306,18 @@ void fput(struct file *file) { if (atomic_long_dec_and_test(&file->f_count)) { struct task_struct *task = current; + unsigned long flags; + file_sb_list_del(file); - if (unlikely(in_interrupt() || task->flags & PF_KTHREAD)) { - unsigned long flags; - spin_lock_irqsave(&delayed_fput_lock, flags); - list_add(&file->f_u.fu_list, &delayed_fput_list); - schedule_work(&delayed_fput_work); - spin_unlock_irqrestore(&delayed_fput_lock, flags); - return; + if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) { + init_task_work(&file->f_u.fu_rcuhead, ____fput); + if (!task_work_add(task, &file->f_u.fu_rcuhead, true)) + return; } - init_task_work(&file->f_u.fu_rcuhead, ____fput); - task_work_add(task, &file->f_u.fu_rcuhead, true); + spin_lock_irqsave(&delayed_fput_lock, flags); + list_add(&file->f_u.fu_list, &delayed_fput_list); + schedule_work(&delayed_fput_work); + spin_unlock_irqrestore(&delayed_fput_lock, flags); } } diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 254df56b847b..f3f783dc4f75 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -180,6 +180,8 @@ u64 fuse_get_attr_version(struct fuse_conn *fc) static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) { struct inode *inode; + struct dentry *parent; + struct fuse_conn *fc; inode = ACCESS_ONCE(entry->d_inode); if (inode && is_bad_inode(inode)) @@ -187,10 +189,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) else if (fuse_dentry_time(entry) < get_jiffies_64()) { int err; struct fuse_entry_out outarg; - struct fuse_conn *fc; struct fuse_req *req; struct fuse_forget_link *forget; - struct dentry *parent; u64 attr_version; /* For negative dentries, always do a fresh lookup */ @@ -241,8 +241,14 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) entry_attr_timeout(&outarg), attr_version); fuse_change_entry_timeout(entry, &outarg); + } else if (inode) { + fc = get_fuse_conn(inode); + if (fc->readdirplus_auto) { + parent = dget_parent(entry); + fuse_advise_use_readdirplus(parent->d_inode); + dput(parent); + } } - fuse_advise_use_readdirplus(inode); return 1; } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index d1c9b85b3f58..35f281033142 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -16,6 +16,7 @@ #include <linux/compat.h> #include <linux/swap.h> #include <linux/aio.h> +#include <linux/falloc.h> static const struct file_operations fuse_direct_io_file_operations; @@ -1278,7 +1279,10 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov, iov_iter_init(&ii, iov, nr_segs, count, 0); - req = fuse_get_req(fc, fuse_iter_npages(&ii)); + if (io->async) + req = fuse_get_req_for_background(fc, fuse_iter_npages(&ii)); + else + req = fuse_get_req(fc, fuse_iter_npages(&ii)); if (IS_ERR(req)) return PTR_ERR(req); @@ -1314,7 +1318,11 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov, break; if (count) { fuse_put_request(fc, req); - req = fuse_get_req(fc, fuse_iter_npages(&ii)); + if (io->async) + req = fuse_get_req_for_background(fc, + fuse_iter_npages(&ii)); + else + req = fuse_get_req(fc, fuse_iter_npages(&ii)); if (IS_ERR(req)) break; } @@ -2365,6 +2373,11 @@ static void fuse_do_truncate(struct file *file) fuse_do_setattr(inode, &attr, file); } +static inline loff_t fuse_round_up(loff_t off) +{ + return round_up(off, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT); +} + static ssize_t fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) @@ -2372,6 +2385,7 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, ssize_t ret = 0; struct file *file = iocb->ki_filp; struct fuse_file *ff = file->private_data; + bool async_dio = ff->fc->async_dio; loff_t pos = 0; struct inode *inode; loff_t i_size; @@ -2383,10 +2397,10 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, i_size = i_size_read(inode); /* optimization for short read */ - if (rw != WRITE && offset + count > i_size) { + if (async_dio && rw != WRITE && offset + count > i_size) { if (offset >= i_size) return 0; - count = i_size - offset; + count = min_t(loff_t, count, fuse_round_up(i_size - offset)); } io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL); @@ -2404,7 +2418,7 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, * By default, we want to optimize all I/Os with async request * submission to the client filesystem if supported. */ - io->async = ff->fc->async_dio; + io->async = async_dio; io->iocb = iocb; /* @@ -2412,7 +2426,7 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, * to wait on real async I/O requests, so we must submit this request * synchronously. */ - if (!is_sync_kiocb(iocb) && (offset + count > i_size)) + if (!is_sync_kiocb(iocb) && (offset + count > i_size) && rw == WRITE) io->async = false; if (rw == WRITE) @@ -2424,7 +2438,7 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, fuse_aio_complete(io, ret < 0 ? ret : 0, -1); /* we have a non-extending, async request, so return */ - if (ret > 0 && !is_sync_kiocb(iocb)) + if (!is_sync_kiocb(iocb)) return -EIOCBQUEUED; ret = wait_on_sync_kiocb(iocb); @@ -2446,6 +2460,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, loff_t length) { struct fuse_file *ff = file->private_data; + struct inode *inode = file->f_inode; struct fuse_conn *fc = ff->fc; struct fuse_req *req; struct fuse_fallocate_in inarg = { @@ -2455,13 +2470,23 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, .mode = mode }; int err; + bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) || + (mode & FALLOC_FL_PUNCH_HOLE); if (fc->no_fallocate) return -EOPNOTSUPP; + if (lock_inode) { + mutex_lock(&inode->i_mutex); + if (mode & FALLOC_FL_PUNCH_HOLE) + fuse_set_nowrite(inode); + } + req = fuse_get_req_nopages(fc); - if (IS_ERR(req)) - return PTR_ERR(req); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out; + } req->in.h.opcode = FUSE_FALLOCATE; req->in.h.nodeid = ff->nodeid; @@ -2476,6 +2501,25 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, } fuse_put_request(fc, req); + if (err) + goto out; + + /* we could have extended the file */ + if (!(mode & FALLOC_FL_KEEP_SIZE)) + fuse_write_update_size(inode, offset + length); + + if (mode & FALLOC_FL_PUNCH_HOLE) + truncate_pagecache_range(inode, offset, offset + length - 1); + + fuse_invalidate_attr(inode); + +out: + if (lock_inode) { + if (mode & FALLOC_FL_PUNCH_HOLE) + fuse_release_nowrite(inode); + mutex_unlock(&inode->i_mutex); + } + return err; } diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 6201f81e4d3a..9a0cdde14a08 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -867,10 +867,11 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->dont_mask = 1; if (arg->flags & FUSE_AUTO_INVAL_DATA) fc->auto_inval_data = 1; - if (arg->flags & FUSE_DO_READDIRPLUS) + if (arg->flags & FUSE_DO_READDIRPLUS) { fc->do_readdirplus = 1; - if (arg->flags & FUSE_READDIRPLUS_AUTO) - fc->readdirplus_auto = 1; + if (arg->flags & FUSE_READDIRPLUS_AUTO) + fc->readdirplus_auto = 1; + } if (arg->flags & FUSE_ASYNC_DIO) fc->async_dio = 1; } else { diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 1dc9a13ce6bb..93b5809c20bb 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -1286,17 +1286,26 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize) if (ret) return ret; + ret = get_write_access(inode); + if (ret) + return ret; + inode_dio_wait(inode); ret = gfs2_rs_alloc(GFS2_I(inode)); if (ret) - return ret; + goto out; oldsize = inode->i_size; - if (newsize >= oldsize) - return do_grow(inode, newsize); + if (newsize >= oldsize) { + ret = do_grow(inode, newsize); + goto out; + } - return do_shrink(inode, oldsize, newsize); + ret = do_shrink(inode, oldsize, newsize); +out: + put_write_access(inode); + return ret; } int gfs2_truncatei_resume(struct gfs2_inode *ip) diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index c3e82bd23179..b631c9043460 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -354,22 +354,31 @@ static __be64 *gfs2_dir_get_hash_table(struct gfs2_inode *ip) return ERR_PTR(-EIO); } - hc = kmalloc(hsize, GFP_NOFS); - ret = -ENOMEM; + hc = kmalloc(hsize, GFP_NOFS | __GFP_NOWARN); + if (hc == NULL) + hc = __vmalloc(hsize, GFP_NOFS, PAGE_KERNEL); + if (hc == NULL) return ERR_PTR(-ENOMEM); ret = gfs2_dir_read_data(ip, hc, hsize); if (ret < 0) { - kfree(hc); + if (is_vmalloc_addr(hc)) + vfree(hc); + else + kfree(hc); return ERR_PTR(ret); } spin_lock(&inode->i_lock); - if (ip->i_hash_cache) - kfree(hc); - else + if (ip->i_hash_cache) { + if (is_vmalloc_addr(hc)) + vfree(hc); + else + kfree(hc); + } else { ip->i_hash_cache = hc; + } spin_unlock(&inode->i_lock); return ip->i_hash_cache; @@ -385,7 +394,10 @@ void gfs2_dir_hash_inval(struct gfs2_inode *ip) { __be64 *hc = ip->i_hash_cache; ip->i_hash_cache = NULL; - kfree(hc); + if (is_vmalloc_addr(hc)) + vfree(hc); + else + kfree(hc); } static inline int gfs2_dirent_sentinel(const struct gfs2_dirent *dent) @@ -1113,7 +1125,10 @@ static int dir_double_exhash(struct gfs2_inode *dip) if (IS_ERR(hc)) return PTR_ERR(hc); - h = hc2 = kmalloc(hsize_bytes * 2, GFP_NOFS); + h = hc2 = kmalloc(hsize_bytes * 2, GFP_NOFS | __GFP_NOWARN); + if (hc2 == NULL) + hc2 = __vmalloc(hsize_bytes * 2, GFP_NOFS, PAGE_KERNEL); + if (!hc2) return -ENOMEM; @@ -1145,7 +1160,10 @@ fail: gfs2_dinode_out(dip, dibh->b_data); brelse(dibh); out_kfree: - kfree(hc2); + if (is_vmalloc_addr(hc2)) + vfree(hc2); + else + kfree(hc2); return error; } @@ -1846,6 +1864,8 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); ht = kzalloc(size, GFP_NOFS); + if (ht == NULL) + ht = vzalloc(size); if (!ht) return -ENOMEM; @@ -1933,7 +1953,10 @@ out_rlist: gfs2_rlist_free(&rlist); gfs2_quota_unhold(dip); out: - kfree(ht); + if (is_vmalloc_addr(ht)) + vfree(ht); + else + kfree(ht); return error; } diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index acd16764b133..ad0dc38d87ab 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -402,16 +402,20 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) /* Update file times before taking page lock */ file_update_time(vma->vm_file); + ret = get_write_access(inode); + if (ret) + goto out; + ret = gfs2_rs_alloc(ip); if (ret) - return ret; + goto out_write_access; gfs2_size_hint(vma->vm_file, pos, PAGE_CACHE_SIZE); gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); ret = gfs2_glock_nq(&gh); if (ret) - goto out; + goto out_uninit; set_bit(GLF_DIRTY, &ip->i_gl->gl_flags); set_bit(GIF_SW_PAGED, &ip->i_flags); @@ -480,12 +484,15 @@ out_quota_unlock: gfs2_quota_unlock(ip); out_unlock: gfs2_glock_dq(&gh); -out: +out_uninit: gfs2_holder_uninit(&gh); if (ret == 0) { set_page_dirty(page); wait_for_stable_page(page); } +out_write_access: + put_write_access(inode); +out: sb_end_pagefault(inode->i_sb); return block_page_mkwrite_return(ret); } @@ -594,10 +601,10 @@ static int gfs2_release(struct inode *inode, struct file *file) kfree(file->private_data); file->private_data = NULL; - if ((file->f_mode & FMODE_WRITE) && - (atomic_read(&inode->i_writecount) == 1)) - gfs2_rs_delete(ip); + if (!(file->f_mode & FMODE_WRITE)) + return 0; + gfs2_rs_delete(ip); return 0; } diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 8833a4f264e3..62b484e4a9e4 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -189,6 +189,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, return inode; fail_refresh: + ip->i_iopen_gh.gh_flags |= GL_NOCACHE; ip->i_iopen_gh.gh_gl->gl_object = NULL; gfs2_glock_dq_uninit(&ip->i_iopen_gh); fail_iopen: diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 68b4c8f1fce8..6c33d7b6e0c4 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -419,7 +419,9 @@ static void gfs2_before_commit(struct gfs2_sbd *sdp, unsigned int limit, if (total > limit) num = limit; gfs2_log_unlock(sdp); - page = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_METADATA, num + 1, num); + page = gfs2_get_log_desc(sdp, + is_databuf ? GFS2_LOG_DESC_JDATA : + GFS2_LOG_DESC_METADATA, num + 1, num); ld = page_address(page); gfs2_log_lock(sdp); ptr = (__be64 *)(ld + 1); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 5232525934ae..9809156e3d04 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -638,8 +638,10 @@ void gfs2_rs_deltree(struct gfs2_blkreserv *rs) */ void gfs2_rs_delete(struct gfs2_inode *ip) { + struct inode *inode = &ip->i_inode; + down_write(&ip->i_rw_mutex); - if (ip->i_res) { + if (ip->i_res && atomic_read(&inode->i_writecount) <= 1) { gfs2_rs_deltree(ip->i_res); BUG_ON(ip->i_res->rs_free); kmem_cache_free(gfs2_rsrv_cachep, ip->i_res); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 917c8e1eb4ae..e5639dec66c4 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1444,6 +1444,7 @@ static void gfs2_evict_inode(struct inode *inode) /* Must not read inode block until block type has been verified */ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh); if (unlikely(error)) { + ip->i_iopen_gh.gh_flags |= GL_NOCACHE; gfs2_glock_dq_uninit(&ip->i_iopen_gh); goto out; } @@ -1514,8 +1515,10 @@ out_unlock: if (gfs2_rs_active(ip->i_res)) gfs2_rs_deltree(ip->i_res); - if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) + if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { + ip->i_iopen_gh.gh_flags |= GL_NOCACHE; gfs2_glock_dq(&ip->i_iopen_gh); + } gfs2_holder_uninit(&ip->i_iopen_gh); gfs2_glock_dq_uninit(&gh); if (error && error != GLR_TRYFAILED && error != -EROFS) @@ -1534,6 +1537,7 @@ out: ip->i_gl = NULL; if (ip->i_iopen_gh.gh_gl) { ip->i_iopen_gh.gh_gl->gl_object = NULL; + ip->i_iopen_gh.gh_flags |= GL_NOCACHE; gfs2_glock_dq_uninit(&ip->i_iopen_gh); } } diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index 546f6d39713a..834ac13c04b7 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -33,25 +33,27 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence) if (whence == SEEK_DATA || whence == SEEK_HOLE) return -EINVAL; + mutex_lock(&i->i_mutex); hpfs_lock(s); /*printk("dir lseek\n");*/ if (new_off == 0 || new_off == 1 || new_off == 11 || new_off == 12 || new_off == 13) goto ok; - mutex_lock(&i->i_mutex); pos = ((loff_t) hpfs_de_as_down_as_possible(s, hpfs_inode->i_dno) << 4) + 1; while (pos != new_off) { if (map_pos_dirent(i, &pos, &qbh)) hpfs_brelse4(&qbh); else goto fail; if (pos == 12) goto fail; } - mutex_unlock(&i->i_mutex); + hpfs_add_pos(i, &filp->f_pos); ok: + filp->f_pos = new_off; hpfs_unlock(s); - return filp->f_pos = new_off; -fail: mutex_unlock(&i->i_mutex); + return new_off; +fail: /*printk("illegal lseek: %016llx\n", new_off);*/ hpfs_unlock(s); + mutex_unlock(&i->i_mutex); return -ESPIPE; } diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index 3027f4dbbab5..e4ba5fe4c3b5 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -109,10 +109,14 @@ static void hpfs_write_failed(struct address_space *mapping, loff_t to) { struct inode *inode = mapping->host; + hpfs_lock(inode->i_sb); + if (to > inode->i_size) { truncate_pagecache(inode, to, inode->i_size); hpfs_truncate(inode); } + + hpfs_unlock(inode->i_sb); } static int hpfs_write_begin(struct file *file, struct address_space *mapping, diff --git a/fs/internal.h b/fs/internal.h index eaa75f75b625..68121584ae37 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -132,6 +132,12 @@ extern struct dentry *__d_alloc(struct super_block *, const struct qstr *); extern ssize_t __kernel_write(struct file *, const char *, size_t, loff_t *); /* + * splice.c + */ +extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, + loff_t *opos, size_t len, unsigned int flags); + +/* * pipe.c */ extern const struct file_operations pipefifo_fops; diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index c57499dca89c..360d27c48887 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -2009,7 +2009,13 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp) bio->bi_end_io = lbmIODone; bio->bi_private = bp; - submit_bio(READ_SYNC, bio); + /*check if journaling to disk has been disabled*/ + if (log->no_integrity) { + bio->bi_size = 0; + lbmIODone(bio, 0); + } else { + submit_bio(READ_SYNC, bio); + } wait_event(bp->l_ioevent, (bp->l_flag != lbmREAD)); diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 2003e830ed1c..788e0a9c1fb0 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -611,11 +611,28 @@ static int jfs_freeze(struct super_block *sb) { struct jfs_sb_info *sbi = JFS_SBI(sb); struct jfs_log *log = sbi->log; + int rc = 0; if (!(sb->s_flags & MS_RDONLY)) { txQuiesce(sb); - lmLogShutdown(log); - updateSuper(sb, FM_CLEAN); + rc = lmLogShutdown(log); + if (rc) { + jfs_error(sb, "jfs_freeze: lmLogShutdown failed"); + + /* let operations fail rather than hang */ + txResume(sb); + + return rc; + } + rc = updateSuper(sb, FM_CLEAN); + if (rc) { + jfs_err("jfs_freeze: updateSuper failed\n"); + /* + * Don't fail here. Everything succeeded except + * marking the superblock clean, so there's really + * no harm in leaving it frozen for now. + */ + } } return 0; } @@ -627,13 +644,18 @@ static int jfs_unfreeze(struct super_block *sb) int rc = 0; if (!(sb->s_flags & MS_RDONLY)) { - updateSuper(sb, FM_MOUNT); - if ((rc = lmLogInit(log))) - jfs_err("jfs_unlock failed with return code %d", rc); - else - txResume(sb); + rc = updateSuper(sb, FM_MOUNT); + if (rc) { + jfs_error(sb, "jfs_unfreeze: updateSuper failed"); + goto out; + } + rc = lmLogInit(log); + if (rc) + jfs_error(sb, "jfs_unfreeze: lmLogInit failed"); +out: + txResume(sb); } - return 0; + return rc; } static struct dentry *jfs_do_mount(struct file_system_type *fs_type, diff --git a/fs/namei.c b/fs/namei.c index 85e40d1c0a8f..9ed9361223c0 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1976,7 +1976,7 @@ static int path_lookupat(int dfd, const char *name, err = complete_walk(nd); if (!err && nd->flags & LOOKUP_DIRECTORY) { - if (!nd->inode->i_op->lookup) { + if (!can_lookup(nd->inode)) { path_put(&nd->path); err = -ENOTDIR; } @@ -2850,7 +2850,7 @@ finish_lookup: if ((open_flag & O_CREAT) && S_ISDIR(nd->inode->i_mode)) goto out; error = -ENOTDIR; - if ((nd->flags & LOOKUP_DIRECTORY) && !nd->inode->i_op->lookup) + if ((nd->flags & LOOKUP_DIRECTORY) && !can_lookup(nd->inode)) goto out; audit_inode(name, nd->path.dentry, 0); finish_open: diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 816326093656..6792ce11f2bf 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -1029,15 +1029,6 @@ static int ncp_rmdir(struct inode *dir, struct dentry *dentry) DPRINTK("ncp_rmdir: removing %s/%s\n", dentry->d_parent->d_name.name, dentry->d_name.name); - /* - * fail with EBUSY if there are still references to this - * directory. - */ - dentry_unhash(dentry); - error = -EBUSY; - if (!d_unhashed(dentry)) - goto out; - len = sizeof(__name); error = ncp_io2vol(server, __name, &len, dentry->d_name.name, dentry->d_name.len, !ncp_preserve_case(dir)); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 4e2fe714d5c2..d7ba5616989c 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1078,7 +1078,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) struct nfs4_state *state = opendata->state; struct nfs_inode *nfsi = NFS_I(state->inode); struct nfs_delegation *delegation; - int open_mode = opendata->o_arg.open_flags & (O_EXCL|O_TRUNC); + int open_mode = opendata->o_arg.open_flags; fmode_t fmode = opendata->o_arg.fmode; nfs4_stateid stateid; int ret = -EAGAIN; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index a366107a7331..2d7525fbcf25 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1942,6 +1942,7 @@ static int nfs23_validate_mount_data(void *options, args->namlen = data->namlen; args->bsize = data->bsize; + args->auth_flavors[0] = RPC_AUTH_UNIX; if (data->flags & NFS_MOUNT_SECFLAVOUR) args->auth_flavors[0] = data->pseudoflavor; if (!args->nfs_server.hostname) @@ -2637,6 +2638,7 @@ static int nfs4_validate_mount_data(void *options, goto out_no_address; args->nfs_server.port = ntohs(((struct sockaddr_in *)sap)->sin_port); + args->auth_flavors[0] = RPC_AUTH_UNIX; if (data->auth_flavourlen) { if (data->auth_flavourlen > 1) goto out_inval_auth; diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index b3fdd1a323d6..e68588e6b1e8 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1408,6 +1408,7 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, mres->lockname_len, mres->lockname); ret = -EFAULT; spin_unlock(&res->spinlock); + dlm_lockres_put(res); goto leave; } res->state |= DLM_LOCK_RES_MIGRATING; diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 04ee1b57c243..b4a5cdf9dbc5 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -947,7 +947,7 @@ leave: ocfs2_free_dir_lookup_result(&orphan_insert); ocfs2_free_dir_lookup_result(&lookup); - if (status) + if (status && (status != -ENOTEMPTY)) mlog_errno(status); return status; @@ -2216,7 +2216,7 @@ out: brelse(orphan_dir_bh); - return 0; + return ret; } int ocfs2_create_inode_in_orphan(struct inode *dir, diff --git a/fs/pnode.c b/fs/pnode.c index 3d2a7141b87a..9af0df15256e 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -83,7 +83,8 @@ static int do_make_slave(struct mount *mnt) if (peer_mnt == mnt) peer_mnt = NULL; } - if (IS_MNT_SHARED(mnt) && list_empty(&mnt->mnt_share)) + if (mnt->mnt_group_id && IS_MNT_SHARED(mnt) && + list_empty(&mnt->mnt_share)) mnt_release_group_id(mnt); list_del_init(&mnt->mnt_share); diff --git a/fs/proc/base.c b/fs/proc/base.c index dd51e50001fe..c3834dad09b3 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2118,6 +2118,7 @@ static int show_timer(struct seq_file *m, void *v) nstr[notify & ~SIGEV_THREAD_ID], (notify & SIGEV_THREAD_ID) ? "tid" : "pid", pid_nr_ns(timer->it_pid, tp->ns)); + seq_printf(m, "ClockID: %d\n", timer->it_clock); return 0; } diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c index bd4b5a740ff1..bdfabdaefdce 100644 --- a/fs/proc/kmsg.c +++ b/fs/proc/kmsg.c @@ -21,12 +21,12 @@ extern wait_queue_head_t log_wait; static int kmsg_open(struct inode * inode, struct file * file) { - return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_FILE); + return do_syslog(SYSLOG_ACTION_OPEN, NULL, 0, SYSLOG_FROM_PROC); } static int kmsg_release(struct inode * inode, struct file * file) { - (void) do_syslog(SYSLOG_ACTION_CLOSE, NULL, 0, SYSLOG_FROM_FILE); + (void) do_syslog(SYSLOG_ACTION_CLOSE, NULL, 0, SYSLOG_FROM_PROC); return 0; } @@ -34,15 +34,15 @@ static ssize_t kmsg_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { if ((file->f_flags & O_NONBLOCK) && - !do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE)) + !do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_PROC)) return -EAGAIN; - return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_FILE); + return do_syslog(SYSLOG_ACTION_READ, buf, count, SYSLOG_FROM_PROC); } static unsigned int kmsg_poll(struct file *file, poll_table *wait) { poll_wait(file, &log_wait, wait); - if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_FILE)) + if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_PROC)) return POLLIN | POLLRDNORM; return 0; } diff --git a/fs/qnx6/dir.c b/fs/qnx6/dir.c index 8798d065e400..afa6be6fc397 100644 --- a/fs/qnx6/dir.c +++ b/fs/qnx6/dir.c @@ -120,7 +120,7 @@ static int qnx6_readdir(struct file *filp, void *dirent, filldir_t filldir) struct inode *inode = file_inode(filp); struct super_block *s = inode->i_sb; struct qnx6_sb_info *sbi = QNX6_SB(s); - loff_t pos = filp->f_pos & (QNX6_DIR_ENTRY_SIZE - 1); + loff_t pos = filp->f_pos & ~(QNX6_DIR_ENTRY_SIZE - 1); unsigned long npages = dir_pages(inode); unsigned long n = pos >> PAGE_CACHE_SHIFT; unsigned start = (pos & ~PAGE_CACHE_MASK) / QNX6_DIR_ENTRY_SIZE; diff --git a/fs/read_write.c b/fs/read_write.c index 03430008704e..2cefa417be34 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1064,6 +1064,7 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, struct fd in, out; struct inode *in_inode, *out_inode; loff_t pos; + loff_t out_pos; ssize_t retval; int fl; @@ -1077,12 +1078,14 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, if (!(in.file->f_mode & FMODE_READ)) goto fput_in; retval = -ESPIPE; - if (!ppos) - ppos = &in.file->f_pos; - else + if (!ppos) { + pos = in.file->f_pos; + } else { + pos = *ppos; if (!(in.file->f_mode & FMODE_PREAD)) goto fput_in; - retval = rw_verify_area(READ, in.file, ppos, count); + } + retval = rw_verify_area(READ, in.file, &pos, count); if (retval < 0) goto fput_in; count = retval; @@ -1099,7 +1102,8 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, retval = -EINVAL; in_inode = file_inode(in.file); out_inode = file_inode(out.file); - retval = rw_verify_area(WRITE, out.file, &out.file->f_pos, count); + out_pos = out.file->f_pos; + retval = rw_verify_area(WRITE, out.file, &out_pos, count); if (retval < 0) goto fput_out; count = retval; @@ -1107,7 +1111,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, if (!max) max = min(in_inode->i_sb->s_maxbytes, out_inode->i_sb->s_maxbytes); - pos = *ppos; if (unlikely(pos + count > max)) { retval = -EOVERFLOW; if (pos >= max) @@ -1126,18 +1129,23 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, if (in.file->f_flags & O_NONBLOCK) fl = SPLICE_F_NONBLOCK; #endif - retval = do_splice_direct(in.file, ppos, out.file, count, fl); + retval = do_splice_direct(in.file, &pos, out.file, &out_pos, count, fl); if (retval > 0) { add_rchar(current, retval); add_wchar(current, retval); fsnotify_access(in.file); fsnotify_modify(out.file); + out.file->f_pos = out_pos; + if (ppos) + *ppos = pos; + else + in.file->f_pos = pos; } inc_syscr(current); inc_syscw(current); - if (*ppos > max) + if (pos > max) retval = -EOVERFLOW; fput_out: diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c index 66c53b642a88..6c2d136561cb 100644 --- a/fs/reiserfs/dir.c +++ b/fs/reiserfs/dir.c @@ -204,6 +204,8 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent, next_pos = deh_offset(deh) + 1; if (item_moved(&tmp_ih, &path_to_entry)) { + set_cpu_key_k_offset(&pos_key, + next_pos); goto research; } } /* for */ diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 77d6d47abc83..f844533792ee 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1811,11 +1811,16 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, TYPE_STAT_DATA, SD_SIZE, MAX_US_INT); memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE); args.dirid = le32_to_cpu(ih.ih_key.k_dir_id); - if (insert_inode_locked4(inode, args.objectid, - reiserfs_find_actor, &args) < 0) { + + reiserfs_write_unlock(inode->i_sb); + err = insert_inode_locked4(inode, args.objectid, + reiserfs_find_actor, &args); + reiserfs_write_lock(inode->i_sb); + if (err) { err = -EINVAL; goto out_bad_inode; } + if (old_format_only(sb)) /* not a perfect generation count, as object ids can be reused, but ** this is as good as reiserfs can do right now. diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 4cce1d9552fb..821bcf70e467 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -318,7 +318,19 @@ static int delete_one_xattr(struct dentry *dentry, void *data) static int chown_one_xattr(struct dentry *dentry, void *data) { struct iattr *attrs = data; - return reiserfs_setattr(dentry, attrs); + int ia_valid = attrs->ia_valid; + int err; + + /* + * We only want the ownership bits. Otherwise, we'll do + * things like change a directory to a regular file if + * ATTR_MODE is set. + */ + attrs->ia_valid &= (ATTR_UID|ATTR_GID); + err = reiserfs_setattr(dentry, attrs); + attrs->ia_valid = ia_valid; + + return err; } /* No i_mutex, but the inode is unconnected. */ diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index d7c01ef64eda..6c8767fdfc6a 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -443,6 +443,9 @@ int reiserfs_acl_chmod(struct inode *inode) int depth; int error; + if (IS_PRIVATE(inode)) + return 0; + if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; diff --git a/fs/splice.c b/fs/splice.c index e6b25598c8c4..d37431dd60a1 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1274,7 +1274,7 @@ static int direct_splice_actor(struct pipe_inode_info *pipe, { struct file *file = sd->u.file; - return do_splice_from(pipe, file, &file->f_pos, sd->total_len, + return do_splice_from(pipe, file, sd->opos, sd->total_len, sd->flags); } @@ -1283,6 +1283,7 @@ static int direct_splice_actor(struct pipe_inode_info *pipe, * @in: file to splice from * @ppos: input file offset * @out: file to splice to + * @opos: output file offset * @len: number of bytes to splice * @flags: splice modifier flags * @@ -1294,7 +1295,7 @@ static int direct_splice_actor(struct pipe_inode_info *pipe, * */ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, - size_t len, unsigned int flags) + loff_t *opos, size_t len, unsigned int flags) { struct splice_desc sd = { .len = len, @@ -1302,6 +1303,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, .flags = flags, .pos = *ppos, .u.file = out, + .opos = opos, }; long ret; @@ -1325,7 +1327,7 @@ static long do_splice(struct file *in, loff_t __user *off_in, { struct pipe_inode_info *ipipe; struct pipe_inode_info *opipe; - loff_t offset, *off; + loff_t offset; long ret; ipipe = get_pipe_info(in); @@ -1356,13 +1358,15 @@ static long do_splice(struct file *in, loff_t __user *off_in, return -EINVAL; if (copy_from_user(&offset, off_out, sizeof(loff_t))) return -EFAULT; - off = &offset; - } else - off = &out->f_pos; + } else { + offset = out->f_pos; + } - ret = do_splice_from(ipipe, out, off, len, flags); + ret = do_splice_from(ipipe, out, &offset, len, flags); - if (off_out && copy_to_user(off_out, off, sizeof(loff_t))) + if (!off_out) + out->f_pos = offset; + else if (copy_to_user(off_out, &offset, sizeof(loff_t))) ret = -EFAULT; return ret; @@ -1376,13 +1380,15 @@ static long do_splice(struct file *in, loff_t __user *off_in, return -EINVAL; if (copy_from_user(&offset, off_in, sizeof(loff_t))) return -EFAULT; - off = &offset; - } else - off = &in->f_pos; + } else { + offset = in->f_pos; + } - ret = do_splice_to(in, off, opipe, len, flags); + ret = do_splice_to(in, &offset, opipe, len, flags); - if (off_in && copy_to_user(off_in, off, sizeof(loff_t))) + if (!off_in) + in->f_pos = offset; + else if (copy_to_user(off_in, &offset, sizeof(loff_t))) ret = -EFAULT; return ret; diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index de08c92f2e23..605af512aec2 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -349,31 +349,50 @@ static unsigned int vfs_dent_type(uint8_t type) static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir) { int err, over = 0; + loff_t pos = file->f_pos; struct qstr nm; union ubifs_key key; struct ubifs_dent_node *dent; struct inode *dir = file_inode(file); struct ubifs_info *c = dir->i_sb->s_fs_info; - dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, file->f_pos); + dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, pos); - if (file->f_pos > UBIFS_S_KEY_HASH_MASK || file->f_pos == 2) + if (pos > UBIFS_S_KEY_HASH_MASK || pos == 2) /* * The directory was seek'ed to a senseless position or there * are no more entries. */ return 0; + if (file->f_version == 0) { + /* + * The file was seek'ed, which means that @file->private_data + * is now invalid. This may also be just the first + * 'ubifs_readdir()' invocation, in which case + * @file->private_data is NULL, and the below code is + * basically a no-op. + */ + kfree(file->private_data); + file->private_data = NULL; + } + + /* + * 'generic_file_llseek()' unconditionally sets @file->f_version to + * zero, and we use this for detecting whether the file was seek'ed. + */ + file->f_version = 1; + /* File positions 0 and 1 correspond to "." and ".." */ - if (file->f_pos == 0) { + if (pos == 0) { ubifs_assert(!file->private_data); over = filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR); if (over) return 0; - file->f_pos = 1; + file->f_pos = pos = 1; } - if (file->f_pos == 1) { + if (pos == 1) { ubifs_assert(!file->private_data); over = filldir(dirent, "..", 2, 1, parent_ino(file->f_path.dentry), DT_DIR); @@ -389,7 +408,7 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir) goto out; } - file->f_pos = key_hash_flash(c, &dent->key); + file->f_pos = pos = key_hash_flash(c, &dent->key); file->private_data = dent; } @@ -397,17 +416,16 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir) if (!dent) { /* * The directory was seek'ed to and is now readdir'ed. - * Find the entry corresponding to @file->f_pos or the - * closest one. + * Find the entry corresponding to @pos or the closest one. */ - dent_key_init_hash(c, &key, dir->i_ino, file->f_pos); + dent_key_init_hash(c, &key, dir->i_ino, pos); nm.name = NULL; dent = ubifs_tnc_next_ent(c, &key, &nm); if (IS_ERR(dent)) { err = PTR_ERR(dent); goto out; } - file->f_pos = key_hash_flash(c, &dent->key); + file->f_pos = pos = key_hash_flash(c, &dent->key); file->private_data = dent; } @@ -419,7 +437,7 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir) ubifs_inode(dir)->creat_sqnum); nm.len = le16_to_cpu(dent->nlen); - over = filldir(dirent, dent->name, nm.len, file->f_pos, + over = filldir(dirent, dent->name, nm.len, pos, le64_to_cpu(dent->inum), vfs_dent_type(dent->type)); if (over) @@ -435,9 +453,17 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir) } kfree(file->private_data); - file->f_pos = key_hash_flash(c, &dent->key); + file->f_pos = pos = key_hash_flash(c, &dent->key); file->private_data = dent; cond_resched(); + + if (file->f_version == 0) + /* + * The file was seek'ed meanwhile, lets return and start + * reading direntries from the new position on the next + * invocation. + */ + return 0; } out: @@ -448,15 +474,13 @@ out: kfree(file->private_data); file->private_data = NULL; + /* 2 is a special value indicating that there are no more direntries */ file->f_pos = 2; return 0; } -/* If a directory is seeked, we have to free saved readdir() state */ static loff_t ubifs_dir_llseek(struct file *file, loff_t offset, int whence) { - kfree(file->private_data); - file->private_data = NULL; return generic_file_llseek(file, offset, whence); } diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 1d32f1d52763..306d883d89bc 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -21,6 +21,8 @@ #include "xfs_bmap_btree.h" #include "xfs_inode.h" #include "xfs_vnodeops.h" +#include "xfs_sb.h" +#include "xfs_mount.h" #include "xfs_trace.h" #include <linux/slab.h> #include <linux/xattr.h> @@ -34,7 +36,9 @@ */ STATIC struct posix_acl * -xfs_acl_from_disk(struct xfs_acl *aclp) +xfs_acl_from_disk( + struct xfs_acl *aclp, + int max_entries) { struct posix_acl_entry *acl_e; struct posix_acl *acl; @@ -42,7 +46,7 @@ xfs_acl_from_disk(struct xfs_acl *aclp) unsigned int count, i; count = be32_to_cpu(aclp->acl_cnt); - if (count > XFS_ACL_MAX_ENTRIES) + if (count > max_entries) return ERR_PTR(-EFSCORRUPTED); acl = posix_acl_alloc(count, GFP_KERNEL); @@ -108,9 +112,9 @@ xfs_get_acl(struct inode *inode, int type) struct xfs_inode *ip = XFS_I(inode); struct posix_acl *acl; struct xfs_acl *xfs_acl; - int len = sizeof(struct xfs_acl); unsigned char *ea_name; int error; + int len; acl = get_cached_acl(inode, type); if (acl != ACL_NOT_CACHED) @@ -133,8 +137,8 @@ xfs_get_acl(struct inode *inode, int type) * If we have a cached ACLs value just return it, not need to * go out to the disk. */ - - xfs_acl = kzalloc(sizeof(struct xfs_acl), GFP_KERNEL); + len = XFS_ACL_MAX_SIZE(ip->i_mount); + xfs_acl = kzalloc(len, GFP_KERNEL); if (!xfs_acl) return ERR_PTR(-ENOMEM); @@ -153,7 +157,7 @@ xfs_get_acl(struct inode *inode, int type) goto out; } - acl = xfs_acl_from_disk(xfs_acl); + acl = xfs_acl_from_disk(xfs_acl, XFS_ACL_MAX_ENTRIES(ip->i_mount)); if (IS_ERR(acl)) goto out; @@ -189,16 +193,17 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl) if (acl) { struct xfs_acl *xfs_acl; - int len; + int len = XFS_ACL_MAX_SIZE(ip->i_mount); - xfs_acl = kzalloc(sizeof(struct xfs_acl), GFP_KERNEL); + xfs_acl = kzalloc(len, GFP_KERNEL); if (!xfs_acl) return -ENOMEM; xfs_acl_to_disk(xfs_acl, acl); - len = sizeof(struct xfs_acl) - - (sizeof(struct xfs_acl_entry) * - (XFS_ACL_MAX_ENTRIES - acl->a_count)); + + /* subtract away the unused acl entries */ + len -= sizeof(struct xfs_acl_entry) * + (XFS_ACL_MAX_ENTRIES(ip->i_mount) - acl->a_count); error = -xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl, len, ATTR_ROOT); @@ -243,7 +248,7 @@ xfs_set_mode(struct inode *inode, umode_t mode) static int xfs_acl_exists(struct inode *inode, unsigned char *name) { - int len = sizeof(struct xfs_acl); + int len = XFS_ACL_MAX_SIZE(XFS_M(inode->i_sb)); return (xfs_attr_get(XFS_I(inode), name, NULL, &len, ATTR_ROOT|ATTR_KERNOVAL) == 0); @@ -379,7 +384,7 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name, goto out_release; error = -EINVAL; - if (acl->a_count > XFS_ACL_MAX_ENTRIES) + if (acl->a_count > XFS_ACL_MAX_ENTRIES(XFS_M(inode->i_sb))) goto out_release; if (type == ACL_TYPE_ACCESS) { diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index 39632d941354..4016a567b83c 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -22,19 +22,36 @@ struct inode; struct posix_acl; struct xfs_inode; -#define XFS_ACL_MAX_ENTRIES 25 #define XFS_ACL_NOT_PRESENT (-1) /* On-disk XFS access control list structure */ +struct xfs_acl_entry { + __be32 ae_tag; + __be32 ae_id; + __be16 ae_perm; + __be16 ae_pad; /* fill the implicit hole in the structure */ +}; + struct xfs_acl { - __be32 acl_cnt; - struct xfs_acl_entry { - __be32 ae_tag; - __be32 ae_id; - __be16 ae_perm; - } acl_entry[XFS_ACL_MAX_ENTRIES]; + __be32 acl_cnt; + struct xfs_acl_entry acl_entry[0]; }; +/* + * The number of ACL entries allowed is defined by the on-disk format. + * For v4 superblocks, that is limited to 25 entries. For v5 superblocks, it is + * limited only by the maximum size of the xattr that stores the information. + */ +#define XFS_ACL_MAX_ENTRIES(mp) \ + (xfs_sb_version_hascrc(&mp->m_sb) \ + ? (XATTR_SIZE_MAX - sizeof(struct xfs_acl)) / \ + sizeof(struct xfs_acl_entry) \ + : 25) + +#define XFS_ACL_MAX_SIZE(mp) \ + (sizeof(struct xfs_acl) + \ + sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp))) + /* On-disk XFS extended attribute names */ #define SGI_ACL_FILE (unsigned char *)"SGI_ACL_FILE" #define SGI_ACL_DEFAULT (unsigned char *)"SGI_ACL_DEFAULT" diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 0bce1b348580..31d3cd129269 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -1412,7 +1412,7 @@ xfs_attr3_leaf_add_work( name_rmt->valuelen = 0; name_rmt->valueblk = 0; args->rmtblkno = 1; - args->rmtblkcnt = XFS_B_TO_FSB(mp, args->valuelen); + args->rmtblkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen); } xfs_trans_log_buf(args->trans, bp, XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index), @@ -1445,11 +1445,12 @@ xfs_attr3_leaf_add_work( STATIC void xfs_attr3_leaf_compact( struct xfs_da_args *args, - struct xfs_attr3_icleaf_hdr *ichdr_d, + struct xfs_attr3_icleaf_hdr *ichdr_dst, struct xfs_buf *bp) { - xfs_attr_leafblock_t *leaf_s, *leaf_d; - struct xfs_attr3_icleaf_hdr ichdr_s; + struct xfs_attr_leafblock *leaf_src; + struct xfs_attr_leafblock *leaf_dst; + struct xfs_attr3_icleaf_hdr ichdr_src; struct xfs_trans *trans = args->trans; struct xfs_mount *mp = trans->t_mountp; char *tmpbuffer; @@ -1457,29 +1458,38 @@ xfs_attr3_leaf_compact( trace_xfs_attr_leaf_compact(args); tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP); - ASSERT(tmpbuffer != NULL); memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(mp)); memset(bp->b_addr, 0, XFS_LBSIZE(mp)); + leaf_src = (xfs_attr_leafblock_t *)tmpbuffer; + leaf_dst = bp->b_addr; /* - * Copy basic information + * Copy the on-disk header back into the destination buffer to ensure + * all the information in the header that is not part of the incore + * header structure is preserved. */ - leaf_s = (xfs_attr_leafblock_t *)tmpbuffer; - leaf_d = bp->b_addr; - ichdr_s = *ichdr_d; /* struct copy */ - ichdr_d->firstused = XFS_LBSIZE(mp); - ichdr_d->usedbytes = 0; - ichdr_d->count = 0; - ichdr_d->holes = 0; - ichdr_d->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_s); - ichdr_d->freemap[0].size = ichdr_d->firstused - ichdr_d->freemap[0].base; + memcpy(bp->b_addr, tmpbuffer, xfs_attr3_leaf_hdr_size(leaf_src)); + + /* Initialise the incore headers */ + ichdr_src = *ichdr_dst; /* struct copy */ + ichdr_dst->firstused = XFS_LBSIZE(mp); + ichdr_dst->usedbytes = 0; + ichdr_dst->count = 0; + ichdr_dst->holes = 0; + ichdr_dst->freemap[0].base = xfs_attr3_leaf_hdr_size(leaf_src); + ichdr_dst->freemap[0].size = ichdr_dst->firstused - + ichdr_dst->freemap[0].base; + + + /* write the header back to initialise the underlying buffer */ + xfs_attr3_leaf_hdr_to_disk(leaf_dst, ichdr_dst); /* * Copy all entry's in the same (sorted) order, * but allocate name/value pairs packed and in sequence. */ - xfs_attr3_leaf_moveents(leaf_s, &ichdr_s, 0, leaf_d, ichdr_d, 0, - ichdr_s.count, mp); + xfs_attr3_leaf_moveents(leaf_src, &ichdr_src, 0, leaf_dst, ichdr_dst, 0, + ichdr_src.count, mp); /* * this logs the entire buffer, but the caller must write the header * back to the buffer when it is finished modifying it. @@ -2181,14 +2191,24 @@ xfs_attr3_leaf_unbalance( struct xfs_attr_leafblock *tmp_leaf; struct xfs_attr3_icleaf_hdr tmphdr; - tmp_leaf = kmem_alloc(state->blocksize, KM_SLEEP); - memset(tmp_leaf, 0, state->blocksize); - memset(&tmphdr, 0, sizeof(tmphdr)); + tmp_leaf = kmem_zalloc(state->blocksize, KM_SLEEP); + /* + * Copy the header into the temp leaf so that all the stuff + * not in the incore header is present and gets copied back in + * once we've moved all the entries. + */ + memcpy(tmp_leaf, save_leaf, xfs_attr3_leaf_hdr_size(save_leaf)); + + memset(&tmphdr, 0, sizeof(tmphdr)); tmphdr.magic = savehdr.magic; tmphdr.forw = savehdr.forw; tmphdr.back = savehdr.back; tmphdr.firstused = state->blocksize; + + /* write the header to the temp buffer to initialise it */ + xfs_attr3_leaf_hdr_to_disk(tmp_leaf, &tmphdr); + if (xfs_attr3_leaf_order(save_blk->bp, &savehdr, drop_blk->bp, &drophdr)) { xfs_attr3_leaf_moveents(drop_leaf, &drophdr, 0, @@ -2334,8 +2354,9 @@ xfs_attr3_leaf_lookup_int( args->index = probe; args->valuelen = be32_to_cpu(name_rmt->valuelen); args->rmtblkno = be32_to_cpu(name_rmt->valueblk); - args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount, - args->valuelen); + args->rmtblkcnt = xfs_attr3_rmt_blocks( + args->dp->i_mount, + args->valuelen); return XFS_ERROR(EEXIST); } } @@ -2386,7 +2407,8 @@ xfs_attr3_leaf_getvalue( ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0); valuelen = be32_to_cpu(name_rmt->valuelen); args->rmtblkno = be32_to_cpu(name_rmt->valueblk); - args->rmtblkcnt = XFS_B_TO_FSB(args->dp->i_mount, valuelen); + args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount, + valuelen); if (args->flags & ATTR_KERNOVAL) { args->valuelen = valuelen; return 0; @@ -2712,7 +2734,8 @@ xfs_attr3_leaf_list_int( args.valuelen = valuelen; args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS); args.rmtblkno = be32_to_cpu(name_rmt->valueblk); - args.rmtblkcnt = XFS_B_TO_FSB(args.dp->i_mount, valuelen); + args.rmtblkcnt = xfs_attr3_rmt_blocks( + args.dp->i_mount, valuelen); retval = xfs_attr_rmtval_get(&args); if (retval) return retval; @@ -3235,7 +3258,7 @@ xfs_attr3_leaf_inactive( name_rmt = xfs_attr3_leaf_name_remote(leaf, i); if (name_rmt->valueblk) { lp->valueblk = be32_to_cpu(name_rmt->valueblk); - lp->valuelen = XFS_B_TO_FSB(dp->i_mount, + lp->valuelen = xfs_attr3_rmt_blocks(dp->i_mount, be32_to_cpu(name_rmt->valuelen)); lp++; } diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h index f9d7846097e2..444a7704596c 100644 --- a/fs/xfs/xfs_attr_leaf.h +++ b/fs/xfs/xfs_attr_leaf.h @@ -128,6 +128,7 @@ struct xfs_attr3_leaf_hdr { __u8 holes; __u8 pad1; struct xfs_attr_leaf_map freemap[XFS_ATTR_LEAF_MAPSIZE]; + __be32 pad2; /* 64 bit alignment */ }; #define XFS_ATTR3_LEAF_CRC_OFF (offsetof(struct xfs_attr3_leaf_hdr, info.crc)) diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c index dee84466dcc9..ef6b0c124528 100644 --- a/fs/xfs/xfs_attr_remote.c +++ b/fs/xfs/xfs_attr_remote.c @@ -47,22 +47,55 @@ * Each contiguous block has a header, so it is not just a simple attribute * length to FSB conversion. */ -static int +int xfs_attr3_rmt_blocks( struct xfs_mount *mp, int attrlen) { - int buflen = XFS_ATTR3_RMT_BUF_SPACE(mp, - mp->m_sb.sb_blocksize); - return (attrlen + buflen - 1) / buflen; + if (xfs_sb_version_hascrc(&mp->m_sb)) { + int buflen = XFS_ATTR3_RMT_BUF_SPACE(mp, mp->m_sb.sb_blocksize); + return (attrlen + buflen - 1) / buflen; + } + return XFS_B_TO_FSB(mp, attrlen); +} + +/* + * Checking of the remote attribute header is split into two parts. The verifier + * does CRC, location and bounds checking, the unpacking function checks the + * attribute parameters and owner. + */ +static bool +xfs_attr3_rmt_hdr_ok( + struct xfs_mount *mp, + void *ptr, + xfs_ino_t ino, + uint32_t offset, + uint32_t size, + xfs_daddr_t bno) +{ + struct xfs_attr3_rmt_hdr *rmt = ptr; + + if (bno != be64_to_cpu(rmt->rm_blkno)) + return false; + if (offset != be32_to_cpu(rmt->rm_offset)) + return false; + if (size != be32_to_cpu(rmt->rm_bytes)) + return false; + if (ino != be64_to_cpu(rmt->rm_owner)) + return false; + + /* ok */ + return true; } static bool xfs_attr3_rmt_verify( - struct xfs_buf *bp) + struct xfs_mount *mp, + void *ptr, + int fsbsize, + xfs_daddr_t bno) { - struct xfs_mount *mp = bp->b_target->bt_mount; - struct xfs_attr3_rmt_hdr *rmt = bp->b_addr; + struct xfs_attr3_rmt_hdr *rmt = ptr; if (!xfs_sb_version_hascrc(&mp->m_sb)) return false; @@ -70,7 +103,9 @@ xfs_attr3_rmt_verify( return false; if (!uuid_equal(&rmt->rm_uuid, &mp->m_sb.sb_uuid)) return false; - if (bp->b_bn != be64_to_cpu(rmt->rm_blkno)) + if (be64_to_cpu(rmt->rm_blkno) != bno) + return false; + if (be32_to_cpu(rmt->rm_bytes) > fsbsize - sizeof(*rmt)) return false; if (be32_to_cpu(rmt->rm_offset) + be32_to_cpu(rmt->rm_bytes) >= XATTR_SIZE_MAX) @@ -86,17 +121,40 @@ xfs_attr3_rmt_read_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; + char *ptr; + int len; + bool corrupt = false; + xfs_daddr_t bno; /* no verification of non-crc buffers */ if (!xfs_sb_version_hascrc(&mp->m_sb)) return; - if (!xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length), - XFS_ATTR3_RMT_CRC_OFF) || - !xfs_attr3_rmt_verify(bp)) { + ptr = bp->b_addr; + bno = bp->b_bn; + len = BBTOB(bp->b_length); + ASSERT(len >= XFS_LBSIZE(mp)); + + while (len > 0) { + if (!xfs_verify_cksum(ptr, XFS_LBSIZE(mp), + XFS_ATTR3_RMT_CRC_OFF)) { + corrupt = true; + break; + } + if (!xfs_attr3_rmt_verify(mp, ptr, XFS_LBSIZE(mp), bno)) { + corrupt = true; + break; + } + len -= XFS_LBSIZE(mp); + ptr += XFS_LBSIZE(mp); + bno += mp->m_bsize; + } + + if (corrupt) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); xfs_buf_ioerror(bp, EFSCORRUPTED); - } + } else + ASSERT(len == 0); } static void @@ -105,23 +163,39 @@ xfs_attr3_rmt_write_verify( { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_buf_log_item *bip = bp->b_fspriv; + char *ptr; + int len; + xfs_daddr_t bno; /* no verification of non-crc buffers */ if (!xfs_sb_version_hascrc(&mp->m_sb)) return; - if (!xfs_attr3_rmt_verify(bp)) { - XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); - xfs_buf_ioerror(bp, EFSCORRUPTED); - return; - } + ptr = bp->b_addr; + bno = bp->b_bn; + len = BBTOB(bp->b_length); + ASSERT(len >= XFS_LBSIZE(mp)); + + while (len > 0) { + if (!xfs_attr3_rmt_verify(mp, ptr, XFS_LBSIZE(mp), bno)) { + XFS_CORRUPTION_ERROR(__func__, + XFS_ERRLEVEL_LOW, mp, bp->b_addr); + xfs_buf_ioerror(bp, EFSCORRUPTED); + return; + } + if (bip) { + struct xfs_attr3_rmt_hdr *rmt; + + rmt = (struct xfs_attr3_rmt_hdr *)ptr; + rmt->rm_lsn = cpu_to_be64(bip->bli_item.li_lsn); + } + xfs_update_cksum(ptr, XFS_LBSIZE(mp), XFS_ATTR3_RMT_CRC_OFF); - if (bip) { - struct xfs_attr3_rmt_hdr *rmt = bp->b_addr; - rmt->rm_lsn = cpu_to_be64(bip->bli_item.li_lsn); + len -= XFS_LBSIZE(mp); + ptr += XFS_LBSIZE(mp); + bno += mp->m_bsize; } - xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length), - XFS_ATTR3_RMT_CRC_OFF); + ASSERT(len == 0); } const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = { @@ -129,15 +203,16 @@ const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = { .verify_write = xfs_attr3_rmt_write_verify, }; -static int +STATIC int xfs_attr3_rmt_hdr_set( struct xfs_mount *mp, + void *ptr, xfs_ino_t ino, uint32_t offset, uint32_t size, - struct xfs_buf *bp) + xfs_daddr_t bno) { - struct xfs_attr3_rmt_hdr *rmt = bp->b_addr; + struct xfs_attr3_rmt_hdr *rmt = ptr; if (!xfs_sb_version_hascrc(&mp->m_sb)) return 0; @@ -147,36 +222,107 @@ xfs_attr3_rmt_hdr_set( rmt->rm_bytes = cpu_to_be32(size); uuid_copy(&rmt->rm_uuid, &mp->m_sb.sb_uuid); rmt->rm_owner = cpu_to_be64(ino); - rmt->rm_blkno = cpu_to_be64(bp->b_bn); - bp->b_ops = &xfs_attr3_rmt_buf_ops; + rmt->rm_blkno = cpu_to_be64(bno); return sizeof(struct xfs_attr3_rmt_hdr); } /* - * Checking of the remote attribute header is split into two parts. the verifier - * does CRC, location and bounds checking, the unpacking function checks the - * attribute parameters and owner. + * Helper functions to copy attribute data in and out of the one disk extents */ -static bool -xfs_attr3_rmt_hdr_ok( - struct xfs_mount *mp, - xfs_ino_t ino, - uint32_t offset, - uint32_t size, - struct xfs_buf *bp) +STATIC int +xfs_attr_rmtval_copyout( + struct xfs_mount *mp, + struct xfs_buf *bp, + xfs_ino_t ino, + int *offset, + int *valuelen, + char **dst) { - struct xfs_attr3_rmt_hdr *rmt = bp->b_addr; + char *src = bp->b_addr; + xfs_daddr_t bno = bp->b_bn; + int len = BBTOB(bp->b_length); - if (offset != be32_to_cpu(rmt->rm_offset)) - return false; - if (size != be32_to_cpu(rmt->rm_bytes)) - return false; - if (ino != be64_to_cpu(rmt->rm_owner)) - return false; + ASSERT(len >= XFS_LBSIZE(mp)); - /* ok */ - return true; + while (len > 0 && *valuelen > 0) { + int hdr_size = 0; + int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, XFS_LBSIZE(mp)); + + byte_cnt = min_t(int, *valuelen, byte_cnt); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + if (!xfs_attr3_rmt_hdr_ok(mp, src, ino, *offset, + byte_cnt, bno)) { + xfs_alert(mp, +"remote attribute header mismatch bno/off/len/owner (0x%llx/0x%x/Ox%x/0x%llx)", + bno, *offset, byte_cnt, ino); + return EFSCORRUPTED; + } + hdr_size = sizeof(struct xfs_attr3_rmt_hdr); + } + + memcpy(*dst, src + hdr_size, byte_cnt); + + /* roll buffer forwards */ + len -= XFS_LBSIZE(mp); + src += XFS_LBSIZE(mp); + bno += mp->m_bsize; + + /* roll attribute data forwards */ + *valuelen -= byte_cnt; + *dst += byte_cnt; + *offset += byte_cnt; + } + return 0; +} + +STATIC void +xfs_attr_rmtval_copyin( + struct xfs_mount *mp, + struct xfs_buf *bp, + xfs_ino_t ino, + int *offset, + int *valuelen, + char **src) +{ + char *dst = bp->b_addr; + xfs_daddr_t bno = bp->b_bn; + int len = BBTOB(bp->b_length); + + ASSERT(len >= XFS_LBSIZE(mp)); + + while (len > 0 && *valuelen > 0) { + int hdr_size; + int byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, XFS_LBSIZE(mp)); + + byte_cnt = min(*valuelen, byte_cnt); + hdr_size = xfs_attr3_rmt_hdr_set(mp, dst, ino, *offset, + byte_cnt, bno); + + memcpy(dst + hdr_size, *src, byte_cnt); + + /* + * If this is the last block, zero the remainder of it. + * Check that we are actually the last block, too. + */ + if (byte_cnt + hdr_size < XFS_LBSIZE(mp)) { + ASSERT(*valuelen - byte_cnt == 0); + ASSERT(len == XFS_LBSIZE(mp)); + memset(dst + hdr_size + byte_cnt, 0, + XFS_LBSIZE(mp) - hdr_size - byte_cnt); + } + + /* roll buffer forwards */ + len -= XFS_LBSIZE(mp); + dst += XFS_LBSIZE(mp); + bno += mp->m_bsize; + + /* roll attribute data forwards */ + *valuelen -= byte_cnt; + *src += byte_cnt; + *offset += byte_cnt; + } } /* @@ -190,13 +336,12 @@ xfs_attr_rmtval_get( struct xfs_bmbt_irec map[ATTR_RMTVALUE_MAPSIZE]; struct xfs_mount *mp = args->dp->i_mount; struct xfs_buf *bp; - xfs_daddr_t dblkno; xfs_dablk_t lblkno = args->rmtblkno; - void *dst = args->value; + char *dst = args->value; int valuelen = args->valuelen; int nmap; int error; - int blkcnt; + int blkcnt = args->rmtblkcnt; int i; int offset = 0; @@ -207,52 +352,36 @@ xfs_attr_rmtval_get( while (valuelen > 0) { nmap = ATTR_RMTVALUE_MAPSIZE; error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, - args->rmtblkcnt, map, &nmap, + blkcnt, map, &nmap, XFS_BMAPI_ATTRFORK); if (error) return error; ASSERT(nmap >= 1); for (i = 0; (i < nmap) && (valuelen > 0); i++) { - int byte_cnt; - char *src; + xfs_daddr_t dblkno; + int dblkcnt; ASSERT((map[i].br_startblock != DELAYSTARTBLOCK) && (map[i].br_startblock != HOLESTARTBLOCK)); dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); - blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); + dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, - dblkno, blkcnt, 0, &bp, + dblkno, dblkcnt, 0, &bp, &xfs_attr3_rmt_buf_ops); if (error) return error; - byte_cnt = min_t(int, valuelen, BBTOB(bp->b_length)); - byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, byte_cnt); - - src = bp->b_addr; - if (xfs_sb_version_hascrc(&mp->m_sb)) { - if (!xfs_attr3_rmt_hdr_ok(mp, args->dp->i_ino, - offset, byte_cnt, bp)) { - xfs_alert(mp, -"remote attribute header does not match required off/len/owner (0x%x/Ox%x,0x%llx)", - offset, byte_cnt, args->dp->i_ino); - xfs_buf_relse(bp); - return EFSCORRUPTED; - - } - - src += sizeof(struct xfs_attr3_rmt_hdr); - } - - memcpy(dst, src, byte_cnt); + error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino, + &offset, &valuelen, + &dst); xfs_buf_relse(bp); + if (error) + return error; - offset += byte_cnt; - dst += byte_cnt; - valuelen -= byte_cnt; - + /* roll attribute extent map forwards */ lblkno += map[i].br_blockcount; + blkcnt -= map[i].br_blockcount; } } ASSERT(valuelen == 0); @@ -270,17 +399,13 @@ xfs_attr_rmtval_set( struct xfs_inode *dp = args->dp; struct xfs_mount *mp = dp->i_mount; struct xfs_bmbt_irec map; - struct xfs_buf *bp; - xfs_daddr_t dblkno; xfs_dablk_t lblkno; xfs_fileoff_t lfileoff = 0; - void *src = args->value; + char *src = args->value; int blkcnt; int valuelen; int nmap; int error; - int hdrcnt = 0; - bool crcs = xfs_sb_version_hascrc(&mp->m_sb); int offset = 0; trace_xfs_attr_rmtval_set(args); @@ -289,24 +414,14 @@ xfs_attr_rmtval_set( * Find a "hole" in the attribute address space large enough for * us to drop the new attribute's value into. Because CRC enable * attributes have headers, we can't just do a straight byte to FSB - * conversion. We calculate the worst case block count in this case - * and we may not need that many, so we have to handle this when - * allocating the blocks below. + * conversion and have to take the header space into account. */ - if (!crcs) - blkcnt = XFS_B_TO_FSB(mp, args->valuelen); - else - blkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen); - + blkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen); error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff, XFS_ATTR_FORK); if (error) return error; - /* Start with the attribute data. We'll allocate the rest afterwards. */ - if (crcs) - blkcnt = XFS_B_TO_FSB(mp, args->valuelen); - args->rmtblkno = lblkno = (xfs_dablk_t)lfileoff; args->rmtblkcnt = blkcnt; @@ -349,26 +464,6 @@ xfs_attr_rmtval_set( (map.br_startblock != HOLESTARTBLOCK)); lblkno += map.br_blockcount; blkcnt -= map.br_blockcount; - hdrcnt++; - - /* - * If we have enough blocks for the attribute data, calculate - * how many extra blocks we need for headers. We might run - * through this multiple times in the case that the additional - * headers in the blocks needed for the data fragments spills - * into requiring more blocks. e.g. for 512 byte blocks, we'll - * spill for another block every 9 headers we require in this - * loop. - */ - if (crcs && blkcnt == 0) { - int total_len; - - total_len = args->valuelen + - hdrcnt * sizeof(struct xfs_attr3_rmt_hdr); - blkcnt = XFS_B_TO_FSB(mp, total_len); - blkcnt -= args->rmtblkcnt; - args->rmtblkcnt += blkcnt; - } /* * Start the next trans in the chain. @@ -385,18 +480,19 @@ xfs_attr_rmtval_set( * the INCOMPLETE flag. */ lblkno = args->rmtblkno; + blkcnt = args->rmtblkcnt; valuelen = args->valuelen; while (valuelen > 0) { - int byte_cnt; - char *buf; + struct xfs_buf *bp; + xfs_daddr_t dblkno; + int dblkcnt; + + ASSERT(blkcnt > 0); - /* - * Try to remember where we decided to put the value. - */ xfs_bmap_init(args->flist, args->firstblock); nmap = 1; error = xfs_bmapi_read(dp, (xfs_fileoff_t)lblkno, - args->rmtblkcnt, &map, &nmap, + blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK); if (error) return(error); @@ -405,41 +501,27 @@ xfs_attr_rmtval_set( (map.br_startblock != HOLESTARTBLOCK)); dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), - blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); + dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); - bp = xfs_buf_get(mp->m_ddev_targp, dblkno, blkcnt, 0); + bp = xfs_buf_get(mp->m_ddev_targp, dblkno, dblkcnt, 0); if (!bp) return ENOMEM; bp->b_ops = &xfs_attr3_rmt_buf_ops; - byte_cnt = BBTOB(bp->b_length); - byte_cnt = XFS_ATTR3_RMT_BUF_SPACE(mp, byte_cnt); - if (valuelen < byte_cnt) - byte_cnt = valuelen; - - buf = bp->b_addr; - buf += xfs_attr3_rmt_hdr_set(mp, dp->i_ino, offset, - byte_cnt, bp); - memcpy(buf, src, byte_cnt); - - if (byte_cnt < BBTOB(bp->b_length)) - xfs_buf_zero(bp, byte_cnt, - BBTOB(bp->b_length) - byte_cnt); + xfs_attr_rmtval_copyin(mp, bp, args->dp->i_ino, &offset, + &valuelen, &src); error = xfs_bwrite(bp); /* GROT: NOTE: synchronous write */ xfs_buf_relse(bp); if (error) return error; - src += byte_cnt; - valuelen -= byte_cnt; - offset += byte_cnt; - hdrcnt--; + /* roll attribute extent map forwards */ lblkno += map.br_blockcount; + blkcnt -= map.br_blockcount; } ASSERT(valuelen == 0); - ASSERT(hdrcnt == 0); return 0; } @@ -448,33 +530,40 @@ xfs_attr_rmtval_set( * out-of-line buffer that it is stored on. */ int -xfs_attr_rmtval_remove(xfs_da_args_t *args) +xfs_attr_rmtval_remove( + struct xfs_da_args *args) { - xfs_mount_t *mp; - xfs_bmbt_irec_t map; - xfs_buf_t *bp; - xfs_daddr_t dblkno; - xfs_dablk_t lblkno; - int valuelen, blkcnt, nmap, error, done, committed; + struct xfs_mount *mp = args->dp->i_mount; + xfs_dablk_t lblkno; + int blkcnt; + int error; + int done; trace_xfs_attr_rmtval_remove(args); - mp = args->dp->i_mount; - /* - * Roll through the "value", invalidating the attribute value's - * blocks. + * Roll through the "value", invalidating the attribute value's blocks. + * Note that args->rmtblkcnt is the minimum number of data blocks we'll + * see for a CRC enabled remote attribute. Each extent will have a + * header, and so we may have more blocks than we realise here. If we + * fail to map the blocks correctly, we'll have problems with the buffer + * lookups. */ lblkno = args->rmtblkno; - valuelen = args->rmtblkcnt; - while (valuelen > 0) { + blkcnt = args->rmtblkcnt; + while (blkcnt > 0) { + struct xfs_bmbt_irec map; + struct xfs_buf *bp; + xfs_daddr_t dblkno; + int dblkcnt; + int nmap; + /* * Try to remember where we decided to put the value. */ nmap = 1; error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno, - args->rmtblkcnt, &map, &nmap, - XFS_BMAPI_ATTRFORK); + blkcnt, &map, &nmap, XFS_BMAPI_ATTRFORK); if (error) return(error); ASSERT(nmap == 1); @@ -482,21 +571,20 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) (map.br_startblock != HOLESTARTBLOCK)); dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), - blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); + dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); /* * If the "remote" value is in the cache, remove it. */ - bp = xfs_incore(mp->m_ddev_targp, dblkno, blkcnt, XBF_TRYLOCK); + bp = xfs_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK); if (bp) { xfs_buf_stale(bp); xfs_buf_relse(bp); bp = NULL; } - valuelen -= map.br_blockcount; - lblkno += map.br_blockcount; + blkcnt -= map.br_blockcount; } /* @@ -506,6 +594,8 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) blkcnt = args->rmtblkcnt; done = 0; while (!done) { + int committed; + xfs_bmap_init(args->flist, args->firstblock); error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, XFS_BMAPI_ATTRFORK | XFS_BMAPI_METADATA, diff --git a/fs/xfs/xfs_attr_remote.h b/fs/xfs/xfs_attr_remote.h index c7cca60a062a..92a8fd7977cc 100644 --- a/fs/xfs/xfs_attr_remote.h +++ b/fs/xfs/xfs_attr_remote.h @@ -20,6 +20,14 @@ #define XFS_ATTR3_RMT_MAGIC 0x5841524d /* XARM */ +/* + * There is one of these headers per filesystem block in a remote attribute. + * This is done to ensure there is a 1:1 mapping between the attribute value + * length and the number of blocks needed to store the attribute. This makes the + * verification of a buffer a little more complex, but greatly simplifies the + * allocation, reading and writing of these attributes as we don't have to guess + * the number of blocks needed to store the attribute data. + */ struct xfs_attr3_rmt_hdr { __be32 rm_magic; __be32 rm_offset; @@ -39,6 +47,8 @@ struct xfs_attr3_rmt_hdr { extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops; +int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen); + int xfs_attr_rmtval_get(struct xfs_da_args *args); int xfs_attr_rmtval_set(struct xfs_da_args *args); int xfs_attr_rmtval_remove(struct xfs_da_args *args); diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index 8804b8a3c310..0903960410a2 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -2544,7 +2544,17 @@ xfs_btree_new_iroot( if (error) goto error0; + /* + * we can't just memcpy() the root in for CRC enabled btree blocks. + * In that case have to also ensure the blkno remains correct + */ memcpy(cblock, block, xfs_btree_block_len(cur)); + if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) { + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + cblock->bb_u.l.bb_blkno = cpu_to_be64(cbp->b_bn); + else + cblock->bb_u.s.bb_blkno = cpu_to_be64(cbp->b_bn); + } be16_add_cpu(&block->bb_level, 1); xfs_btree_set_numrecs(block, 1); diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 0d2554299688..1b2472a46e46 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -513,6 +513,7 @@ _xfs_buf_find( xfs_alert(btp->bt_mount, "%s: Block out of range: block 0x%llx, EOFS 0x%llx ", __func__, blkno, eofs); + WARN_ON(1); return NULL; } diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index cf263476d6b4..4ec431777048 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -262,12 +262,7 @@ xfs_buf_item_format_segment( vecp->i_addr = xfs_buf_offset(bp, buffer_offset); vecp->i_len = nbits * XFS_BLF_CHUNK; vecp->i_type = XLOG_REG_TYPE_BCHUNK; -/* - * You would think we need to bump the nvecs here too, but we do not - * this number is used by recovery, and it gets confused by the boundary - * split here - * nvecs++; - */ + nvecs++; vecp++; first_bit = next_bit; last_bit = next_bit; diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index f852b082a084..c407e1ccff43 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -219,6 +219,14 @@ xfs_swap_extents( int taforkblks = 0; __uint64_t tmp; + /* + * We have no way of updating owner information in the BMBT blocks for + * each inode on CRC enabled filesystems, so to avoid corrupting the + * this metadata we simply don't allow extent swaps to occur. + */ + if (xfs_sb_version_hascrc(&mp->m_sb)) + return XFS_ERROR(EINVAL); + tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL); if (!tempifp) { error = XFS_ERROR(ENOMEM); diff --git a/fs/xfs/xfs_dir2_format.h b/fs/xfs/xfs_dir2_format.h index a3b1bd841a80..7826782b8d78 100644 --- a/fs/xfs/xfs_dir2_format.h +++ b/fs/xfs/xfs_dir2_format.h @@ -266,6 +266,7 @@ struct xfs_dir3_blk_hdr { struct xfs_dir3_data_hdr { struct xfs_dir3_blk_hdr hdr; xfs_dir2_data_free_t best_free[XFS_DIR2_DATA_FD_COUNT]; + __be32 pad; /* 64 bit alignment */ }; #define XFS_DIR3_DATA_CRC_OFF offsetof(struct xfs_dir3_data_hdr, hdr.crc) @@ -477,7 +478,7 @@ struct xfs_dir3_leaf_hdr { struct xfs_da3_blkinfo info; /* header for da routines */ __be16 count; /* count of entries */ __be16 stale; /* count of stale entries */ - __be32 pad; + __be32 pad; /* 64 bit alignment */ }; struct xfs_dir3_icleaf_hdr { @@ -715,6 +716,7 @@ struct xfs_dir3_free_hdr { __be32 firstdb; /* db of first entry */ __be32 nvalid; /* count of valid entries */ __be32 nused; /* count of used entries */ + __be32 pad; /* 64 bit alignment */ }; struct xfs_dir3_free { diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index 5246de4912d4..2226a00acd15 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -263,18 +263,19 @@ xfs_dir3_free_get_buf( * Initialize the new block to be empty, and remember * its first slot as our empty slot. */ - hdr.magic = XFS_DIR2_FREE_MAGIC; - hdr.firstdb = 0; - hdr.nused = 0; - hdr.nvalid = 0; + memset(bp->b_addr, 0, sizeof(struct xfs_dir3_free_hdr)); + memset(&hdr, 0, sizeof(hdr)); + if (xfs_sb_version_hascrc(&mp->m_sb)) { struct xfs_dir3_free_hdr *hdr3 = bp->b_addr; hdr.magic = XFS_DIR3_FREE_MAGIC; + hdr3->hdr.blkno = cpu_to_be64(bp->b_bn); hdr3->hdr.owner = cpu_to_be64(dp->i_ino); uuid_copy(&hdr3->hdr.uuid, &mp->m_sb.sb_uuid); - } + } else + hdr.magic = XFS_DIR2_FREE_MAGIC; xfs_dir3_free_hdr_to_disk(bp->b_addr, &hdr); *bpp = bp; return 0; @@ -1921,8 +1922,6 @@ xfs_dir2_node_addname_int( */ freehdr.firstdb = (fbno - XFS_DIR2_FREE_FIRSTDB(mp)) * xfs_dir3_free_max_bests(mp); - free->hdr.nvalid = 0; - free->hdr.nused = 0; } else { free = fbp->b_addr; bests = xfs_dir3_free_bests_p(mp, free); diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index a41f8bf1da37..044e97a33c8d 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -249,8 +249,11 @@ xfs_qm_init_dquot_blk( d->dd_diskdq.d_version = XFS_DQUOT_VERSION; d->dd_diskdq.d_id = cpu_to_be32(curid); d->dd_diskdq.d_flags = type; - if (xfs_sb_version_hascrc(&mp->m_sb)) + if (xfs_sb_version_hascrc(&mp->m_sb)) { uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid); + xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + } } xfs_trans_dquot_buf(tp, bp, @@ -286,23 +289,6 @@ xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp) dqp->q_low_space[XFS_QLOWSP_5_PCNT] = space * 5; } -STATIC void -xfs_dquot_buf_calc_crc( - struct xfs_mount *mp, - struct xfs_buf *bp) -{ - struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; - int i; - - if (!xfs_sb_version_hascrc(&mp->m_sb)) - return; - - for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++, d++) { - xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk), - offsetof(struct xfs_dqblk, dd_crc)); - } -} - STATIC bool xfs_dquot_buf_verify_crc( struct xfs_mount *mp, @@ -328,12 +314,11 @@ xfs_dquot_buf_verify_crc( for (i = 0; i < ndquots; i++, d++) { if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk), - offsetof(struct xfs_dqblk, dd_crc))) + XFS_DQUOT_CRC_OFF)) return false; if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_uuid)) return false; } - return true; } @@ -393,6 +378,11 @@ xfs_dquot_buf_read_verify( } } +/* + * we don't calculate the CRC here as that is done when the dquot is flushed to + * the buffer after the update is done. This ensures that the dquot in the + * buffer always has an up-to-date CRC value. + */ void xfs_dquot_buf_write_verify( struct xfs_buf *bp) @@ -404,7 +394,6 @@ xfs_dquot_buf_write_verify( xfs_buf_ioerror(bp, EFSCORRUPTED); return; } - xfs_dquot_buf_calc_crc(mp, bp); } const struct xfs_buf_ops xfs_dquot_buf_ops = { @@ -1151,11 +1140,17 @@ xfs_qm_dqflush( * copy the lsn into the on-disk dquot now while we have the in memory * dquot here. This can't be done later in the write verifier as we * can't get access to the log item at that point in time. + * + * We also calculate the CRC here so that the on-disk dquot in the + * buffer always has a valid CRC. This ensures there is no possibility + * of a dquot without an up-to-date CRC getting to disk. */ if (xfs_sb_version_hascrc(&mp->m_sb)) { struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddqp; dqb->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn); + xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); } /* diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index 6dda3f949b04..d04695545397 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -236,6 +236,7 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_PROJID32 0x0800 /* 32-bit project IDs */ #define XFS_FSOP_GEOM_FLAGS_DIRV2CI 0x1000 /* ASCII only CI names */ #define XFS_FSOP_GEOM_FLAGS_LAZYSB 0x4000 /* lazy superblock counters */ +#define XFS_FSOP_GEOM_FLAGS_V5SB 0x8000 /* version 5 superblock */ /* diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 87595b211da1..3c3644ea825b 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -99,7 +99,9 @@ xfs_fs_geometry( (xfs_sb_version_hasattr2(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_ATTR2 : 0) | (xfs_sb_version_hasprojid32bit(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_PROJID32 : 0); + XFS_FSOP_GEOM_FLAGS_PROJID32 : 0) | + (xfs_sb_version_hascrc(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_V5SB : 0); geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ? mp->m_sb.sb_logsectsize : BBSIZE; geo->rtsectsize = mp->m_sb.sb_blocksize; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index efbe1accb6ca..7f7be5f98f52 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1638,6 +1638,10 @@ xfs_iunlink( dip->di_next_unlinked = agi->agi_unlinked[bucket_index]; offset = ip->i_imap.im_boffset + offsetof(xfs_dinode_t, di_next_unlinked); + + /* need to recalc the inode CRC if appropriate */ + xfs_dinode_calc_crc(mp, dip); + xfs_trans_inode_buf(tp, ibp); xfs_trans_log_buf(tp, ibp, offset, (offset + sizeof(xfs_agino_t) - 1)); @@ -1723,6 +1727,10 @@ xfs_iunlink_remove( dip->di_next_unlinked = cpu_to_be32(NULLAGINO); offset = ip->i_imap.im_boffset + offsetof(xfs_dinode_t, di_next_unlinked); + + /* need to recalc the inode CRC if appropriate */ + xfs_dinode_calc_crc(mp, dip); + xfs_trans_inode_buf(tp, ibp); xfs_trans_log_buf(tp, ibp, offset, (offset + sizeof(xfs_agino_t) - 1)); @@ -1796,6 +1804,10 @@ xfs_iunlink_remove( dip->di_next_unlinked = cpu_to_be32(NULLAGINO); offset = ip->i_imap.im_boffset + offsetof(xfs_dinode_t, di_next_unlinked); + + /* need to recalc the inode CRC if appropriate */ + xfs_dinode_calc_crc(mp, dip); + xfs_trans_inode_buf(tp, ibp); xfs_trans_log_buf(tp, ibp, offset, (offset + sizeof(xfs_agino_t) - 1)); @@ -1809,6 +1821,10 @@ xfs_iunlink_remove( last_dip->di_next_unlinked = cpu_to_be32(next_agino); ASSERT(next_agino != 0); offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked); + + /* need to recalc the inode CRC if appropriate */ + xfs_dinode_calc_crc(mp, last_dip); + xfs_trans_inode_buf(tp, last_ibp); xfs_trans_log_buf(tp, last_ibp, offset, (offset + sizeof(xfs_agino_t) - 1)); diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index d82efaa2ac73..ca9ecaa81112 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -455,6 +455,28 @@ xfs_vn_getattr( return 0; } +static void +xfs_setattr_mode( + struct xfs_trans *tp, + struct xfs_inode *ip, + struct iattr *iattr) +{ + struct inode *inode = VFS_I(ip); + umode_t mode = iattr->ia_mode; + + ASSERT(tp); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + mode &= ~S_ISGID; + + ip->i_d.di_mode &= S_IFMT; + ip->i_d.di_mode |= mode & ~S_IFMT; + + inode->i_mode &= S_IFMT; + inode->i_mode |= mode & ~S_IFMT; +} + int xfs_setattr_nonsize( struct xfs_inode *ip, @@ -606,18 +628,8 @@ xfs_setattr_nonsize( /* * Change file access modes. */ - if (mask & ATTR_MODE) { - umode_t mode = iattr->ia_mode; - - if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) - mode &= ~S_ISGID; - - ip->i_d.di_mode &= S_IFMT; - ip->i_d.di_mode |= mode & ~S_IFMT; - - inode->i_mode &= S_IFMT; - inode->i_mode |= mode & ~S_IFMT; - } + if (mask & ATTR_MODE) + xfs_setattr_mode(tp, ip, iattr); /* * Change file access or modified times. @@ -714,9 +726,8 @@ xfs_setattr_size( return XFS_ERROR(error); ASSERT(S_ISREG(ip->i_d.di_mode)); - ASSERT((mask & (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| - ATTR_MTIME_SET|ATTR_KILL_SUID|ATTR_KILL_SGID| - ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0); + ASSERT((mask & (ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET| + ATTR_MTIME_SET|ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0); if (!(flags & XFS_ATTR_NOLOCK)) { lock_flags |= XFS_IOLOCK_EXCL; @@ -860,6 +871,12 @@ xfs_setattr_size( xfs_inode_clear_eofblocks_tag(ip); } + /* + * Change file access modes. + */ + if (mask & ATTR_MODE) + xfs_setattr_mode(tp, ip, iattr); + if (mask & ATTR_CTIME) { inode->i_ctime = iattr->ia_ctime; ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 93f03ec17eec..7cf5e4eafe28 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1599,10 +1599,43 @@ xlog_recover_add_to_trans( } /* - * Sort the log items in the transaction. Cancelled buffers need - * to be put first so they are processed before any items that might - * modify the buffers. If they are cancelled, then the modifications - * don't need to be replayed. + * Sort the log items in the transaction. + * + * The ordering constraints are defined by the inode allocation and unlink + * behaviour. The rules are: + * + * 1. Every item is only logged once in a given transaction. Hence it + * represents the last logged state of the item. Hence ordering is + * dependent on the order in which operations need to be performed so + * required initial conditions are always met. + * + * 2. Cancelled buffers are recorded in pass 1 in a separate table and + * there's nothing to replay from them so we can simply cull them + * from the transaction. However, we can't do that until after we've + * replayed all the other items because they may be dependent on the + * cancelled buffer and replaying the cancelled buffer can remove it + * form the cancelled buffer table. Hence they have tobe done last. + * + * 3. Inode allocation buffers must be replayed before inode items that + * read the buffer and replay changes into it. + * + * 4. Inode unlink buffers must be replayed after inode items are replayed. + * This ensures that inodes are completely flushed to the inode buffer + * in a "free" state before we remove the unlinked inode list pointer. + * + * Hence the ordering needs to be inode allocation buffers first, inode items + * second, inode unlink buffers third and cancelled buffers last. + * + * But there's a problem with that - we can't tell an inode allocation buffer + * apart from a regular buffer, so we can't separate them. We can, however, + * tell an inode unlink buffer from the others, and so we can separate them out + * from all the other buffers and move them to last. + * + * Hence, 4 lists, in order from head to tail: + * - buffer_list for all buffers except cancelled/inode unlink buffers + * - item_list for all non-buffer items + * - inode_buffer_list for inode unlink buffers + * - cancel_list for the cancelled buffers */ STATIC int xlog_recover_reorder_trans( @@ -1612,6 +1645,10 @@ xlog_recover_reorder_trans( { xlog_recover_item_t *item, *n; LIST_HEAD(sort_list); + LIST_HEAD(cancel_list); + LIST_HEAD(buffer_list); + LIST_HEAD(inode_buffer_list); + LIST_HEAD(inode_list); list_splice_init(&trans->r_itemq, &sort_list); list_for_each_entry_safe(item, n, &sort_list, ri_list) { @@ -1619,12 +1656,18 @@ xlog_recover_reorder_trans( switch (ITEM_TYPE(item)) { case XFS_LI_BUF: - if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) { + if (buf_f->blf_flags & XFS_BLF_CANCEL) { trace_xfs_log_recover_item_reorder_head(log, trans, item, pass); - list_move(&item->ri_list, &trans->r_itemq); + list_move(&item->ri_list, &cancel_list); + break; + } + if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { + list_move(&item->ri_list, &inode_buffer_list); break; } + list_move_tail(&item->ri_list, &buffer_list); + break; case XFS_LI_INODE: case XFS_LI_DQUOT: case XFS_LI_QUOTAOFF: @@ -1632,7 +1675,7 @@ xlog_recover_reorder_trans( case XFS_LI_EFI: trace_xfs_log_recover_item_reorder_tail(log, trans, item, pass); - list_move_tail(&item->ri_list, &trans->r_itemq); + list_move_tail(&item->ri_list, &inode_list); break; default: xfs_warn(log->l_mp, @@ -1643,6 +1686,14 @@ xlog_recover_reorder_trans( } } ASSERT(list_empty(&sort_list)); + if (!list_empty(&buffer_list)) + list_splice(&buffer_list, &trans->r_itemq); + if (!list_empty(&inode_list)) + list_splice_tail(&inode_list, &trans->r_itemq); + if (!list_empty(&inode_buffer_list)) + list_splice_tail(&inode_buffer_list, &trans->r_itemq); + if (!list_empty(&cancel_list)) + list_splice_tail(&cancel_list, &trans->r_itemq); return 0; } @@ -1794,7 +1845,13 @@ xlog_recover_do_inode_buffer( xfs_agino_t *buffer_nextp; trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); - bp->b_ops = &xfs_inode_buf_ops; + + /* + * Post recovery validation only works properly on CRC enabled + * filesystems. + */ + if (xfs_sb_version_hascrc(&mp->m_sb)) + bp->b_ops = &xfs_inode_buf_ops; inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog; for (i = 0; i < inodes_per_buf; i++) { @@ -1861,6 +1918,15 @@ xlog_recover_do_inode_buffer( buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp, next_unlinked_offset); *buffer_nextp = *logged_nextp; + + /* + * If necessary, recalculate the CRC in the on-disk inode. We + * have to leave the inode in a consistent state for whoever + * reads it next.... + */ + xfs_dinode_calc_crc(mp, (struct xfs_dinode *) + xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize)); + } return 0; @@ -2097,6 +2163,17 @@ xlog_recover_do_reg_buffer( ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT)); /* + * The dirty regions logged in the buffer, even though + * contiguous, may span multiple chunks. This is because the + * dirty region may span a physical page boundary in a buffer + * and hence be split into two separate vectors for writing into + * the log. Hence we need to trim nbits back to the length of + * the current region being copied out of the log. + */ + if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT)) + nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT; + + /* * Do a sanity check if this is a dquot buffer. Just checking * the first dquot in the buffer should do. XXXThis is * probably a good thing to do for other buf types also. @@ -2134,7 +2211,16 @@ xlog_recover_do_reg_buffer( /* Shouldn't be any more regions */ ASSERT(i == item->ri_total); - xlog_recovery_validate_buf_type(mp, bp, buf_f); + /* + * We can only do post recovery validation on items on CRC enabled + * fielsystems as we need to know when the buffer was written to be able + * to determine if we should have replayed the item. If we replay old + * metadata over a newer buffer, then it will enter a temporarily + * inconsistent state resulting in verification failures. Hence for now + * just avoid the verification stage for non-crc filesystems + */ + if (xfs_sb_version_hascrc(&mp->m_sb)) + xlog_recovery_validate_buf_type(mp, bp, buf_f); } /* @@ -2255,6 +2341,12 @@ xfs_qm_dqcheck( d->dd_diskdq.d_flags = type; d->dd_diskdq.d_id = cpu_to_be32(id); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + uuid_copy(&d->dd_uuid, &mp->m_sb.sb_uuid); + xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + } + return errs; } @@ -2782,6 +2874,10 @@ xlog_recover_dquot_pass2( } memcpy(ddq, recddq, item->ri_buf[1].i_len); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + } ASSERT(dq_f->qlf_size == 2); ASSERT(bp->b_target->bt_mount == mp); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index f6bfbd734669..e8e310c05097 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -314,7 +314,8 @@ STATIC int xfs_mount_validate_sb( xfs_mount_t *mp, xfs_sb_t *sbp, - bool check_inprogress) + bool check_inprogress, + bool check_version) { /* @@ -337,9 +338,10 @@ xfs_mount_validate_sb( /* * Version 5 superblock feature mask validation. Reject combinations the - * kernel cannot support up front before checking anything else. + * kernel cannot support up front before checking anything else. For + * write validation, we don't need to check feature masks. */ - if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) { + if (check_version && XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) { xfs_alert(mp, "Version 5 superblock detected. This kernel has EXPERIMENTAL support enabled!\n" "Use of these features in this kernel is at your own risk!"); @@ -675,7 +677,8 @@ xfs_sb_to_disk( static int xfs_sb_verify( - struct xfs_buf *bp) + struct xfs_buf *bp, + bool check_version) { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_sb sb; @@ -686,7 +689,8 @@ xfs_sb_verify( * Only check the in progress field for the primary superblock as * mkfs.xfs doesn't clear it from secondary superblocks. */ - return xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR); + return xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR, + check_version); } /* @@ -719,7 +723,7 @@ xfs_sb_read_verify( goto out_error; } } - error = xfs_sb_verify(bp); + error = xfs_sb_verify(bp, true); out_error: if (error) { @@ -758,7 +762,7 @@ xfs_sb_write_verify( struct xfs_buf_log_item *bip = bp->b_fspriv; int error; - error = xfs_sb_verify(bp); + error = xfs_sb_verify(bp, false); if (error) { XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, bp->b_addr); xfs_buf_ioerror(bp, error); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index f41702b43003..b75c9bb6e71e 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -41,6 +41,7 @@ #include "xfs_qm.h" #include "xfs_trace.h" #include "xfs_icache.h" +#include "xfs_cksum.h" /* * The global quota manager. There is only one of these for the entire @@ -839,7 +840,7 @@ xfs_qm_reset_dqcounts( xfs_dqid_t id, uint type) { - xfs_disk_dquot_t *ddq; + struct xfs_dqblk *dqb; int j; trace_xfs_reset_dqcounts(bp, _RET_IP_); @@ -853,8 +854,12 @@ xfs_qm_reset_dqcounts( do_div(j, sizeof(xfs_dqblk_t)); ASSERT(mp->m_quotainfo->qi_dqperchunk == j); #endif - ddq = bp->b_addr; + dqb = bp->b_addr; for (j = 0; j < mp->m_quotainfo->qi_dqperchunk; j++) { + struct xfs_disk_dquot *ddq; + + ddq = (struct xfs_disk_dquot *)&dqb[j]; + /* * Do a sanity check, and if needed, repair the dqblk. Don't * output any warnings because it's perfectly possible to @@ -871,7 +876,12 @@ xfs_qm_reset_dqcounts( ddq->d_bwarns = 0; ddq->d_iwarns = 0; ddq->d_rtbwarns = 0; - ddq = (xfs_disk_dquot_t *) ((xfs_dqblk_t *)ddq + 1); + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + xfs_update_cksum((char *)&dqb[j], + sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + } } } @@ -907,19 +917,29 @@ xfs_qm_dqiter_bufs( XFS_FSB_TO_DADDR(mp, bno), mp->m_quotainfo->qi_dqchunklen, 0, &bp, &xfs_dquot_buf_ops); - if (error) - break; /* - * XXX(hch): need to figure out if it makes sense to validate - * the CRC here. + * CRC and validation errors will return a EFSCORRUPTED here. If + * this occurs, re-read without CRC validation so that we can + * repair the damage via xfs_qm_reset_dqcounts(). This process + * will leave a trace in the log indicating corruption has + * been detected. */ + if (error == EFSCORRUPTED) { + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, + XFS_FSB_TO_DADDR(mp, bno), + mp->m_quotainfo->qi_dqchunklen, 0, &bp, + NULL); + } + + if (error) + break; + xfs_qm_reset_dqcounts(mp, bp, firstid, type); xfs_buf_delwri_queue(bp, buffer_list); xfs_buf_relse(bp); - /* - * goto the next block. - */ + + /* goto the next block. */ bno++; firstid += mp->m_quotainfo->qi_dqperchunk; } diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index c41190cad6e9..6cdf6ffc36a1 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -489,31 +489,36 @@ xfs_qm_scall_setqlim( if ((newlim->d_fieldmask & XFS_DQ_MASK) == 0) return 0; - tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); - error = xfs_trans_reserve(tp, 0, XFS_QM_SETQLIM_LOG_RES(mp), - 0, 0, XFS_DEFAULT_LOG_COUNT); - if (error) { - xfs_trans_cancel(tp, 0); - return (error); - } - /* * We don't want to race with a quotaoff so take the quotaoff lock. - * (We don't hold an inode lock, so there's nothing else to stop - * a quotaoff from happening). (XXXThis doesn't currently happen - * because we take the vfslock before calling xfs_qm_sysent). + * We don't hold an inode lock, so there's nothing else to stop + * a quotaoff from happening. */ mutex_lock(&q->qi_quotaofflock); /* - * Get the dquot (locked), and join it to the transaction. - * Allocate the dquot if this doesn't exist. + * Get the dquot (locked) before we start, as we need to do a + * transaction to allocate it if it doesn't exist. Once we have the + * dquot, unlock it so we can start the next transaction safely. We hold + * a reference to the dquot, so it's safe to do this unlock/lock without + * it being reclaimed in the mean time. */ - if ((error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp))) { - xfs_trans_cancel(tp, XFS_TRANS_ABORT); + error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp); + if (error) { ASSERT(error != ENOENT); goto out_unlock; } + xfs_dqunlock(dqp); + + tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); + error = xfs_trans_reserve(tp, 0, XFS_QM_SETQLIM_LOG_RES(mp), + 0, 0, XFS_DEFAULT_LOG_COUNT); + if (error) { + xfs_trans_cancel(tp, 0); + goto out_rele; + } + + xfs_dqlock(dqp); xfs_trans_dqjoin(tp, dqp); ddq = &dqp->q_core; @@ -621,9 +626,10 @@ xfs_qm_scall_setqlim( xfs_trans_log_dquot(tp, dqp); error = xfs_trans_commit(tp, 0); - xfs_qm_dqrele(dqp); - out_unlock: +out_rele: + xfs_qm_dqrele(dqp); +out_unlock: mutex_unlock(&q->qi_quotaofflock); return error; } diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index c61e31c7d997..c38068f26c55 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -87,6 +87,8 @@ typedef struct xfs_dqblk { uuid_t dd_uuid; /* location information */ } xfs_dqblk_t; +#define XFS_DQUOT_CRC_OFF offsetof(struct xfs_dqblk, dd_crc) + /* * flags for q_flags field in the dquot. */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index ea341cea68cb..3033ba5e9762 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1373,6 +1373,17 @@ xfs_finish_flags( } /* + * V5 filesystems always use attr2 format for attributes. + */ + if (xfs_sb_version_hascrc(&mp->m_sb) && + (mp->m_flags & XFS_MOUNT_NOATTR2)) { + xfs_warn(mp, +"Cannot mount a V5 filesystem as %s. %s is always enabled for V5 filesystems.", + MNTOPT_NOATTR2, MNTOPT_ATTR2); + return XFS_ERROR(EINVAL); + } + + /* * mkfs'ed attr2 will turn on attr2 mount unless explicitly * told by noattr2 to turn it off */ diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 5f234389327c..195a403e1522 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -56,16 +56,9 @@ xfs_symlink_blocks( struct xfs_mount *mp, int pathlen) { - int fsblocks = 0; - int len = pathlen; + int buflen = XFS_SYMLINK_BUF_SPACE(mp, mp->m_sb.sb_blocksize); - do { - fsblocks++; - len -= XFS_SYMLINK_BUF_SPACE(mp, mp->m_sb.sb_blocksize); - } while (len > 0); - - ASSERT(fsblocks <= XFS_SYMLINK_MAPS); - return fsblocks; + return (pathlen + buflen - 1) / buflen; } static int @@ -405,7 +398,7 @@ xfs_symlink( if (pathlen <= XFS_LITINO(mp, dp->i_d.di_version)) fs_blocks = 0; else - fs_blocks = XFS_B_TO_FSB(mp, pathlen); + fs_blocks = xfs_symlink_blocks(mp, pathlen); resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks); error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0, XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT); @@ -512,7 +505,7 @@ xfs_symlink( cur_chunk = target_path; offset = 0; for (n = 0; n < nmaps; n++) { - char *buf; + char *buf; d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); @@ -525,9 +518,7 @@ xfs_symlink( bp->b_ops = &xfs_symlink_buf_ops; byte_cnt = XFS_SYMLINK_BUF_SPACE(mp, byte_cnt); - if (pathlen < byte_cnt) { - byte_cnt = pathlen; - } + byte_cnt = min(byte_cnt, pathlen); buf = bp->b_addr; buf += xfs_symlink_hdr_set(mp, ip->i_ino, offset, @@ -542,6 +533,7 @@ xfs_symlink( xfs_trans_log_buf(tp, bp, 0, (buf + byte_cnt - 1) - (char *)bp->b_addr); } + ASSERT(pathlen == 0); } /* |