diff options
Diffstat (limited to 'fs')
249 files changed, 4188 insertions, 2803 deletions
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 2a5de610dd8f..bdabb2765d1b 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -483,6 +483,9 @@ static int v9fs_test_inode(struct inode *inode, void *data) if (v9inode->qid.type != st->qid.type) return 0; + + if (v9inode->qid.path != st->qid.path) + return 0; return 1; } diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 70f9887c59a9..7f6ae21a27b3 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -87,6 +87,9 @@ static int v9fs_test_inode_dotl(struct inode *inode, void *data) if (v9inode->qid.type != st->qid.type) return 0; + + if (v9inode->qid.path != st->qid.path) + return 0; return 1; } diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 8b75463cb211..af03c2a901eb 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -94,13 +94,13 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, if (v9ses->cache) sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_SIZE; - sb->s_flags |= MS_ACTIVE | MS_DIRSYNC | MS_NOATIME; + sb->s_flags |= SB_ACTIVE | SB_DIRSYNC | SB_NOATIME; if (!v9ses->cache) - sb->s_flags |= MS_SYNCHRONOUS; + sb->s_flags |= SB_SYNCHRONOUS; #ifdef CONFIG_9P_FS_POSIX_ACL if ((v9ses->flags & V9FS_ACL_MASK) == V9FS_POSIX_ACL) - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; #endif return 0; diff --git a/fs/adfs/super.c b/fs/adfs/super.c index c9fdfb112933..cfda2c7caedc 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -213,7 +213,7 @@ static int parse_options(struct super_block *sb, char *options) static int adfs_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - *flags |= MS_NODIRATIME; + *flags |= SB_NODIRATIME; return parse_options(sb, data); } @@ -372,7 +372,7 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent) struct inode *root; int ret = -EINVAL; - sb->s_flags |= MS_NODIRATIME; + sb->s_flags |= SB_NODIRATIME; asb = kzalloc(sizeof(*asb), GFP_KERNEL); if (!asb) diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index 185d5ab7e986..0f0e6925e97d 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -453,7 +453,7 @@ affs_error(struct super_block *sb, const char *function, const char *fmt, ...) pr_crit("error (device %s): %s(): %pV\n", sb->s_id, function, &vaf); if (!sb_rdonly(sb)) pr_warn("Remounting filesystem read-only\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; va_end(args); } diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c index 2b1399611d9e..5ba9ef2742f6 100644 --- a/fs/affs/bitmap.c +++ b/fs/affs/bitmap.c @@ -250,12 +250,12 @@ int affs_init_bitmap(struct super_block *sb, int *flags) int i, res = 0; struct affs_sb_info *sbi = AFFS_SB(sb); - if (*flags & MS_RDONLY) + if (*flags & SB_RDONLY) return 0; if (!AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->bm_flag) { pr_notice("Bitmap invalid - mounting %s read only\n", sb->s_id); - *flags |= MS_RDONLY; + *flags |= SB_RDONLY; return 0; } @@ -288,7 +288,7 @@ int affs_init_bitmap(struct super_block *sb, int *flags) if (affs_checksum_block(sb, bh)) { pr_warn("Bitmap %u invalid - mounting %s read only.\n", bm->bm_key, sb->s_id); - *flags |= MS_RDONLY; + *flags |= SB_RDONLY; goto out; } pr_debug("read bitmap block %d: %d\n", blk, bm->bm_key); diff --git a/fs/affs/super.c b/fs/affs/super.c index 884bedab7266..1117e36134cc 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -356,7 +356,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) sb->s_magic = AFFS_SUPER_MAGIC; sb->s_op = &affs_sops; - sb->s_flags |= MS_NODIRATIME; + sb->s_flags |= SB_NODIRATIME; sbi = kzalloc(sizeof(struct affs_sb_info), GFP_KERNEL); if (!sbi) @@ -466,7 +466,7 @@ got_root: if ((chksum == FS_DCFFS || chksum == MUFS_DCFFS || chksum == FS_DCOFS || chksum == MUFS_DCOFS) && !sb_rdonly(sb)) { pr_notice("Dircache FS - mounting %s read only\n", sb->s_id); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } switch (chksum) { case MUFS_FS: @@ -488,7 +488,7 @@ got_root: /* fall thru */ case FS_OFS: affs_set_opt(sbi->s_flags, SF_OFS); - sb->s_flags |= MS_NOEXEC; + sb->s_flags |= SB_NOEXEC; break; case MUFS_DCOFS: case MUFS_INTLOFS: @@ -497,7 +497,7 @@ got_root: case FS_INTLOFS: affs_set_opt(sbi->s_flags, SF_INTL); affs_set_opt(sbi->s_flags, SF_OFS); - sb->s_flags |= MS_NOEXEC; + sb->s_flags |= SB_NOEXEC; break; default: pr_err("Unknown filesystem on device %s: %08X\n", @@ -513,7 +513,7 @@ got_root: sig, sig[3] + '0', blocksize); } - sb->s_flags |= MS_NODEV | MS_NOSUID; + sb->s_flags |= SB_NODEV | SB_NOSUID; sbi->s_data_blksize = sb->s_blocksize; if (affs_test_opt(sbi->s_flags, SF_OFS)) @@ -570,7 +570,7 @@ affs_remount(struct super_block *sb, int *flags, char *data) pr_debug("%s(flags=0x%x,opts=\"%s\")\n", __func__, *flags, data); sync_filesystem(sb); - *flags |= MS_NODIRATIME; + *flags |= SB_NODIRATIME; memcpy(volume, sbi->s_volume, 32); if (!parse_options(data, &uid, &gid, &mode, &reserved, &root_block, @@ -596,10 +596,10 @@ affs_remount(struct super_block *sb, int *flags, char *data) memcpy(sbi->s_volume, volume, 32); spin_unlock(&sbi->symlink_lock); - if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb)) + if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) return 0; - if (*flags & MS_RDONLY) + if (*flags & SB_RDONLY) affs_free_bitmap(sb); else res = affs_init_bitmap(sb, flags); diff --git a/fs/afs/cell.c b/fs/afs/cell.c index 1858c91169e4..9bb921d120d0 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -207,13 +207,8 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net, rcu_read_lock(); cell = afs_lookup_cell_rcu(net, name, namesz); rcu_read_unlock(); - if (!IS_ERR(cell)) { - if (excl) { - afs_put_cell(net, cell); - return ERR_PTR(-EEXIST); - } + if (!IS_ERR(cell)) goto wait_for_cell; - } } /* Assume we're probably going to create a cell and preallocate and diff --git a/fs/afs/dir.c b/fs/afs/dir.c index ab618d32554c..ff8d5bf4354f 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -765,6 +765,8 @@ static void afs_vnode_new_inode(struct afs_fs_cursor *fc, if (fc->ac.error < 0) return; + d_drop(new_dentry); + inode = afs_iget(fc->vnode->vfs_inode.i_sb, fc->key, newfid, newstatus, newcb, fc->cbi); if (IS_ERR(inode)) { @@ -775,9 +777,7 @@ static void afs_vnode_new_inode(struct afs_fs_cursor *fc, return; } - d_instantiate(new_dentry, inode); - if (d_unhashed(new_dentry)) - d_rehash(new_dentry); + d_add(new_dentry, inode); } /* @@ -818,6 +818,8 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) ret = afs_end_vnode_operation(&fc); if (ret < 0) goto error_key; + } else { + goto error_key; } key_put(key); @@ -972,7 +974,7 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, struct afs_fs_cursor fc; struct afs_file_status newstatus; struct afs_callback newcb; - struct afs_vnode *dvnode = dvnode = AFS_FS_I(dir); + struct afs_vnode *dvnode = AFS_FS_I(dir); struct afs_fid newfid; struct key *key; int ret; @@ -1006,6 +1008,8 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode, ret = afs_end_vnode_operation(&fc); if (ret < 0) goto error_key; + } else { + goto error_key; } key_put(key); @@ -1053,7 +1057,7 @@ static int afs_link(struct dentry *from, struct inode *dir, if (afs_begin_vnode_operation(&fc, dvnode, key)) { if (mutex_lock_interruptible_nested(&vnode->io_lock, 1) < 0) { afs_end_vnode_operation(&fc); - return -ERESTARTSYS; + goto error_key; } while (afs_select_fileserver(&fc)) { @@ -1071,6 +1075,8 @@ static int afs_link(struct dentry *from, struct inode *dir, ret = afs_end_vnode_operation(&fc); if (ret < 0) goto error_key; + } else { + goto error_key; } key_put(key); @@ -1130,6 +1136,8 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry, ret = afs_end_vnode_operation(&fc); if (ret < 0) goto error_key; + } else { + goto error_key; } key_put(key); @@ -1180,7 +1188,7 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, if (orig_dvnode != new_dvnode) { if (mutex_lock_interruptible_nested(&new_dvnode->io_lock, 1) < 0) { afs_end_vnode_operation(&fc); - return -ERESTARTSYS; + goto error_key; } } while (afs_select_fileserver(&fc)) { @@ -1199,14 +1207,9 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry, goto error_key; } - key_put(key); - _leave(" = 0"); - return 0; - error_key: key_put(key); error: - d_drop(new_dentry); _leave(" = %d", ret); return ret; } diff --git a/fs/afs/flock.c b/fs/afs/flock.c index 7571a5dfd5a3..c40ba2fe3cbe 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -170,7 +170,7 @@ void afs_lock_work(struct work_struct *work) { struct afs_vnode *vnode = container_of(work, struct afs_vnode, lock_work.work); - struct file_lock *fl; + struct file_lock *fl, *next; afs_lock_type_t type; struct key *key; int ret; @@ -179,117 +179,136 @@ void afs_lock_work(struct work_struct *work) spin_lock(&vnode->lock); - if (test_bit(AFS_VNODE_UNLOCKING, &vnode->flags)) { +again: + _debug("wstate %u for %p", vnode->lock_state, vnode); + switch (vnode->lock_state) { + case AFS_VNODE_LOCK_NEED_UNLOCK: _debug("unlock"); + vnode->lock_state = AFS_VNODE_LOCK_UNLOCKING; spin_unlock(&vnode->lock); /* attempt to release the server lock; if it fails, we just - * wait 5 minutes and it'll time out anyway */ - ret = afs_release_lock(vnode, vnode->unlock_key); + * wait 5 minutes and it'll expire anyway */ + ret = afs_release_lock(vnode, vnode->lock_key); if (ret < 0) printk(KERN_WARNING "AFS:" " Failed to release lock on {%x:%x} error %d\n", vnode->fid.vid, vnode->fid.vnode, ret); spin_lock(&vnode->lock); - key_put(vnode->unlock_key); - vnode->unlock_key = NULL; - clear_bit(AFS_VNODE_UNLOCKING, &vnode->flags); - } + key_put(vnode->lock_key); + vnode->lock_key = NULL; + vnode->lock_state = AFS_VNODE_LOCK_NONE; + + if (list_empty(&vnode->pending_locks)) { + spin_unlock(&vnode->lock); + return; + } + + /* The new front of the queue now owns the state variables. */ + next = list_entry(vnode->pending_locks.next, + struct file_lock, fl_u.afs.link); + vnode->lock_key = afs_file_key(next->fl_file); + vnode->lock_type = (next->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE; + vnode->lock_state = AFS_VNODE_LOCK_WAITING_FOR_CB; + goto again; - /* if we've got a lock, then it must be time to extend that lock as AFS - * locks time out after 5 minutes */ - if (!list_empty(&vnode->granted_locks)) { + /* If we've already got a lock, then it must be time to extend that + * lock as AFS locks time out after 5 minutes. + */ + case AFS_VNODE_LOCK_GRANTED: _debug("extend"); - if (test_and_set_bit(AFS_VNODE_LOCKING, &vnode->flags)) - BUG(); - fl = list_entry(vnode->granted_locks.next, - struct file_lock, fl_u.afs.link); - key = key_get(afs_file_key(fl->fl_file)); + ASSERT(!list_empty(&vnode->granted_locks)); + + key = key_get(vnode->lock_key); + vnode->lock_state = AFS_VNODE_LOCK_EXTENDING; spin_unlock(&vnode->lock); - ret = afs_extend_lock(vnode, key); - clear_bit(AFS_VNODE_LOCKING, &vnode->flags); + ret = afs_extend_lock(vnode, key); /* RPC */ key_put(key); - switch (ret) { - case 0: + + if (ret < 0) + pr_warning("AFS: Failed to extend lock on {%x:%x} error %d\n", + vnode->fid.vid, vnode->fid.vnode, ret); + + spin_lock(&vnode->lock); + + if (vnode->lock_state != AFS_VNODE_LOCK_EXTENDING) + goto again; + vnode->lock_state = AFS_VNODE_LOCK_GRANTED; + + if (ret == 0) afs_schedule_lock_extension(vnode); - break; - default: - /* ummm... we failed to extend the lock - retry - * extension shortly */ - printk(KERN_WARNING "AFS:" - " Failed to extend lock on {%x:%x} error %d\n", - vnode->fid.vid, vnode->fid.vnode, ret); + else queue_delayed_work(afs_lock_manager, &vnode->lock_work, HZ * 10); - break; - } - _leave(" [extend]"); + spin_unlock(&vnode->lock); + _leave(" [ext]"); return; - } - /* if we don't have a granted lock, then we must've been called back by - * the server, and so if might be possible to get a lock we're - * currently waiting for */ - if (!list_empty(&vnode->pending_locks)) { + /* If we don't have a granted lock, then we must've been called + * back by the server, and so if might be possible to get a + * lock we're currently waiting for. + */ + case AFS_VNODE_LOCK_WAITING_FOR_CB: _debug("get"); - if (test_and_set_bit(AFS_VNODE_LOCKING, &vnode->flags)) - BUG(); - fl = list_entry(vnode->pending_locks.next, - struct file_lock, fl_u.afs.link); - key = key_get(afs_file_key(fl->fl_file)); - type = (fl->fl_type == F_RDLCK) ? - AFS_LOCK_READ : AFS_LOCK_WRITE; + key = key_get(vnode->lock_key); + type = vnode->lock_type; + vnode->lock_state = AFS_VNODE_LOCK_SETTING; spin_unlock(&vnode->lock); - ret = afs_set_lock(vnode, key, type); - clear_bit(AFS_VNODE_LOCKING, &vnode->flags); + ret = afs_set_lock(vnode, key, type); /* RPC */ + key_put(key); + + spin_lock(&vnode->lock); switch (ret) { case -EWOULDBLOCK: _debug("blocked"); break; case 0: _debug("acquired"); - if (type == AFS_LOCK_READ) - set_bit(AFS_VNODE_READLOCKED, &vnode->flags); - else - set_bit(AFS_VNODE_WRITELOCKED, &vnode->flags); - ret = AFS_LOCK_GRANTED; + vnode->lock_state = AFS_VNODE_LOCK_GRANTED; + /* Fall through */ default: - spin_lock(&vnode->lock); - /* the pending lock may have been withdrawn due to a - * signal */ - if (list_entry(vnode->pending_locks.next, - struct file_lock, fl_u.afs.link) == fl) { - fl->fl_u.afs.state = ret; - if (ret == AFS_LOCK_GRANTED) - afs_grant_locks(vnode, fl); - else - list_del_init(&fl->fl_u.afs.link); - wake_up(&fl->fl_wait); - spin_unlock(&vnode->lock); - } else { + /* Pass the lock or the error onto the first locker in + * the list - if they're looking for this type of lock. + * If they're not, we assume that whoever asked for it + * took a signal. + */ + if (list_empty(&vnode->pending_locks)) { _debug("withdrawn"); - clear_bit(AFS_VNODE_READLOCKED, &vnode->flags); - clear_bit(AFS_VNODE_WRITELOCKED, &vnode->flags); - spin_unlock(&vnode->lock); - afs_release_lock(vnode, key); - if (!list_empty(&vnode->pending_locks)) - afs_lock_may_be_available(vnode); + vnode->lock_state = AFS_VNODE_LOCK_NEED_UNLOCK; + goto again; } - break; + + fl = list_entry(vnode->pending_locks.next, + struct file_lock, fl_u.afs.link); + type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE; + if (vnode->lock_type != type) { + _debug("changed"); + vnode->lock_state = AFS_VNODE_LOCK_NEED_UNLOCK; + goto again; + } + + fl->fl_u.afs.state = ret; + if (ret == 0) + afs_grant_locks(vnode, fl); + else + list_del_init(&fl->fl_u.afs.link); + wake_up(&fl->fl_wait); + spin_unlock(&vnode->lock); + _leave(" [granted]"); + return; } - key_put(key); - _leave(" [pend]"); + + default: + /* Looks like a lock request was withdrawn. */ + spin_unlock(&vnode->lock); + _leave(" [no]"); return; } - - /* looks like the lock request was withdrawn on a signal */ - spin_unlock(&vnode->lock); - _leave(" [no locks]"); } /* @@ -298,15 +317,105 @@ void afs_lock_work(struct work_struct *work) * AF_RXRPC * - the caller must hold the vnode lock */ -static void afs_defer_unlock(struct afs_vnode *vnode, struct key *key) +static void afs_defer_unlock(struct afs_vnode *vnode) { - cancel_delayed_work(&vnode->lock_work); - if (!test_and_clear_bit(AFS_VNODE_READLOCKED, &vnode->flags) && - !test_and_clear_bit(AFS_VNODE_WRITELOCKED, &vnode->flags)) - BUG(); - if (test_and_set_bit(AFS_VNODE_UNLOCKING, &vnode->flags)) - BUG(); - vnode->unlock_key = key_get(key); + _enter(""); + + if (vnode->lock_state == AFS_VNODE_LOCK_GRANTED || + vnode->lock_state == AFS_VNODE_LOCK_EXTENDING) { + cancel_delayed_work(&vnode->lock_work); + + vnode->lock_state = AFS_VNODE_LOCK_NEED_UNLOCK; + afs_lock_may_be_available(vnode); + } +} + +/* + * Check that our view of the file metadata is up to date and check to see + * whether we think that we have a locking permit. + */ +static int afs_do_setlk_check(struct afs_vnode *vnode, struct key *key, + afs_lock_type_t type, bool can_sleep) +{ + afs_access_t access; + int ret; + + /* Make sure we've got a callback on this file and that our view of the + * data version is up to date. + */ + ret = afs_validate(vnode, key); + if (ret < 0) + return ret; + + /* Check the permission set to see if we're actually going to be + * allowed to get a lock on this file. + */ + ret = afs_check_permit(vnode, key, &access); + if (ret < 0) + return ret; + + /* At a rough estimation, you need LOCK, WRITE or INSERT perm to + * read-lock a file and WRITE or INSERT perm to write-lock a file. + * + * We can't rely on the server to do this for us since if we want to + * share a read lock that we already have, we won't go the server. + */ + if (type == AFS_LOCK_READ) { + if (!(access & (AFS_ACE_INSERT | AFS_ACE_WRITE | AFS_ACE_LOCK))) + return -EACCES; + if (vnode->status.lock_count == -1 && !can_sleep) + return -EAGAIN; /* Write locked */ + } else { + if (!(access & (AFS_ACE_INSERT | AFS_ACE_WRITE))) + return -EACCES; + if (vnode->status.lock_count != 0 && !can_sleep) + return -EAGAIN; /* Locked */ + } + + return 0; +} + +/* + * Remove the front runner from the pending queue. + * - The caller must hold vnode->lock. + */ +static void afs_dequeue_lock(struct afs_vnode *vnode, struct file_lock *fl) +{ + struct file_lock *next; + + _enter(""); + + /* ->lock_type, ->lock_key and ->lock_state only belong to this + * file_lock if we're at the front of the pending queue or if we have + * the lock granted or if the lock_state is NEED_UNLOCK or UNLOCKING. + */ + if (vnode->granted_locks.next == &fl->fl_u.afs.link && + vnode->granted_locks.prev == &fl->fl_u.afs.link) { + list_del_init(&fl->fl_u.afs.link); + afs_defer_unlock(vnode); + return; + } + + if (!list_empty(&vnode->granted_locks) || + vnode->pending_locks.next != &fl->fl_u.afs.link) { + list_del_init(&fl->fl_u.afs.link); + return; + } + + list_del_init(&fl->fl_u.afs.link); + key_put(vnode->lock_key); + vnode->lock_key = NULL; + vnode->lock_state = AFS_VNODE_LOCK_NONE; + + if (list_empty(&vnode->pending_locks)) + return; + + /* The new front of the queue now owns the state variables. */ + next = list_entry(vnode->pending_locks.next, + struct file_lock, fl_u.afs.link); + vnode->lock_key = afs_file_key(next->fl_file); + vnode->lock_type = (next->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE; + vnode->lock_state = AFS_VNODE_LOCK_WAITING_FOR_CB; afs_lock_may_be_available(vnode); } @@ -315,7 +424,7 @@ static void afs_defer_unlock(struct afs_vnode *vnode, struct key *key) */ static int afs_do_setlk(struct file *file, struct file_lock *fl) { - struct inode *inode = file_inode(file); + struct inode *inode = locks_inode(file); struct afs_vnode *vnode = AFS_FS_I(inode); afs_lock_type_t type; struct key *key = afs_file_key(file); @@ -333,165 +442,136 @@ static int afs_do_setlk(struct file *file, struct file_lock *fl) type = (fl->fl_type == F_RDLCK) ? AFS_LOCK_READ : AFS_LOCK_WRITE; - spin_lock(&inode->i_lock); - - /* make sure we've got a callback on this file and that our view of the - * data version is up to date */ - ret = afs_validate(vnode, key); + ret = afs_do_setlk_check(vnode, key, type, fl->fl_flags & FL_SLEEP); if (ret < 0) - goto error; - - if (vnode->status.lock_count != 0 && !(fl->fl_flags & FL_SLEEP)) { - ret = -EAGAIN; - goto error; - } + return ret; spin_lock(&vnode->lock); - /* if we've already got a readlock on the server then we can instantly + /* If we've already got a readlock on the server then we instantly * grant another readlock, irrespective of whether there are any - * pending writelocks */ + * pending writelocks. + */ if (type == AFS_LOCK_READ && - vnode->flags & (1 << AFS_VNODE_READLOCKED)) { + vnode->lock_state == AFS_VNODE_LOCK_GRANTED && + vnode->lock_type == AFS_LOCK_READ) { _debug("instant readlock"); - ASSERTCMP(vnode->flags & - ((1 << AFS_VNODE_LOCKING) | - (1 << AFS_VNODE_WRITELOCKED)), ==, 0); ASSERT(!list_empty(&vnode->granted_locks)); - goto sharing_existing_lock; + goto share_existing_lock; } - /* if there's no-one else with a lock on this vnode, then we need to - * ask the server for a lock */ - if (list_empty(&vnode->pending_locks) && - list_empty(&vnode->granted_locks)) { - _debug("not locked"); - ASSERTCMP(vnode->flags & - ((1 << AFS_VNODE_LOCKING) | - (1 << AFS_VNODE_READLOCKED) | - (1 << AFS_VNODE_WRITELOCKED)), ==, 0); - list_add_tail(&fl->fl_u.afs.link, &vnode->pending_locks); - set_bit(AFS_VNODE_LOCKING, &vnode->flags); - spin_unlock(&vnode->lock); + list_add_tail(&fl->fl_u.afs.link, &vnode->pending_locks); - ret = afs_set_lock(vnode, key, type); - clear_bit(AFS_VNODE_LOCKING, &vnode->flags); - switch (ret) { - case 0: - _debug("acquired"); - goto acquired_server_lock; - case -EWOULDBLOCK: - _debug("would block"); - spin_lock(&vnode->lock); - ASSERT(list_empty(&vnode->granted_locks)); - ASSERTCMP(vnode->pending_locks.next, ==, - &fl->fl_u.afs.link); - goto wait; - default: - spin_lock(&vnode->lock); - list_del_init(&fl->fl_u.afs.link); - spin_unlock(&vnode->lock); - goto error; - } - } + if (vnode->lock_state != AFS_VNODE_LOCK_NONE) + goto need_to_wait; - /* otherwise, we need to wait for a local lock to become available */ - _debug("wait local"); - list_add_tail(&fl->fl_u.afs.link, &vnode->pending_locks); -wait: - if (!(fl->fl_flags & FL_SLEEP)) { - _debug("noblock"); - ret = -EAGAIN; - goto abort_attempt; - } + /* We don't have a lock on this vnode and we aren't currently waiting + * for one either, so ask the server for a lock. + * + * Note that we need to be careful if we get interrupted by a signal + * after dispatching the request as we may still get the lock, even + * though we don't wait for the reply (it's not too bad a problem - the + * lock will expire in 10 mins anyway). + */ + _debug("not locked"); + vnode->lock_key = key_get(key); + vnode->lock_type = type; + vnode->lock_state = AFS_VNODE_LOCK_SETTING; spin_unlock(&vnode->lock); - /* now we need to sleep and wait for the lock manager thread to get the - * lock from the server */ - _debug("sleep"); - ret = wait_event_interruptible(fl->fl_wait, - fl->fl_u.afs.state <= AFS_LOCK_GRANTED); - if (fl->fl_u.afs.state <= AFS_LOCK_GRANTED) { - ret = fl->fl_u.afs.state; - if (ret < 0) - goto error; - spin_lock(&vnode->lock); - goto given_lock; - } - - /* we were interrupted, but someone may still be in the throes of - * giving us the lock */ - _debug("intr"); - ASSERTCMP(ret, ==, -ERESTARTSYS); + ret = afs_set_lock(vnode, key, type); /* RPC */ spin_lock(&vnode->lock); - if (fl->fl_u.afs.state <= AFS_LOCK_GRANTED) { - ret = fl->fl_u.afs.state; - if (ret < 0) { - spin_unlock(&vnode->lock); - goto error; - } - goto given_lock; - } + switch (ret) { + default: + goto abort_attempt; -abort_attempt: - /* we aren't going to get the lock, either because we're unwilling to - * wait, or because some signal happened */ - _debug("abort"); - if (list_empty(&vnode->granted_locks) && - vnode->pending_locks.next == &fl->fl_u.afs.link) { - if (vnode->pending_locks.prev != &fl->fl_u.afs.link) { - /* kick the next pending lock into having a go */ - list_del_init(&fl->fl_u.afs.link); - afs_lock_may_be_available(vnode); - } - } else { - list_del_init(&fl->fl_u.afs.link); + case -EWOULDBLOCK: + /* The server doesn't have a lock-waiting queue, so the client + * will have to retry. The server will break the outstanding + * callbacks on a file when a lock is released. + */ + _debug("would block"); + ASSERT(list_empty(&vnode->granted_locks)); + ASSERTCMP(vnode->pending_locks.next, ==, &fl->fl_u.afs.link); + vnode->lock_state = AFS_VNODE_LOCK_WAITING_FOR_CB; + goto need_to_wait; + + case 0: + _debug("acquired"); + break; } - spin_unlock(&vnode->lock); - goto error; -acquired_server_lock: /* we've acquired a server lock, but it needs to be renewed after 5 * mins */ - spin_lock(&vnode->lock); + vnode->lock_state = AFS_VNODE_LOCK_GRANTED; afs_schedule_lock_extension(vnode); - if (type == AFS_LOCK_READ) - set_bit(AFS_VNODE_READLOCKED, &vnode->flags); - else - set_bit(AFS_VNODE_WRITELOCKED, &vnode->flags); -sharing_existing_lock: + +share_existing_lock: /* the lock has been granted as far as we're concerned... */ fl->fl_u.afs.state = AFS_LOCK_GRANTED; list_move_tail(&fl->fl_u.afs.link, &vnode->granted_locks); + given_lock: /* ... but we do still need to get the VFS's blessing */ - ASSERT(!(vnode->flags & (1 << AFS_VNODE_LOCKING))); - ASSERT((vnode->flags & ((1 << AFS_VNODE_READLOCKED) | - (1 << AFS_VNODE_WRITELOCKED))) != 0); + spin_unlock(&vnode->lock); + ret = posix_lock_file(file, fl, NULL); if (ret < 0) goto vfs_rejected_lock; - spin_unlock(&vnode->lock); - /* again, make sure we've got a callback on this file and, again, make + /* Again, make sure we've got a callback on this file and, again, make * sure that our view of the data version is up to date (we ignore - * errors incurred here and deal with the consequences elsewhere) */ + * errors incurred here and deal with the consequences elsewhere). + */ afs_validate(vnode, key); + _leave(" = 0"); + return 0; -error: - spin_unlock(&inode->i_lock); +need_to_wait: + /* We're going to have to wait. Either this client doesn't have a lock + * on the server yet and we need to wait for a callback to occur, or + * the client does have a lock on the server, but it belongs to some + * other process(es) and is incompatible with the lock we want. + */ + ret = -EAGAIN; + if (fl->fl_flags & FL_SLEEP) { + spin_unlock(&vnode->lock); + + _debug("sleep"); + ret = wait_event_interruptible(fl->fl_wait, + fl->fl_u.afs.state != AFS_LOCK_PENDING); + + spin_lock(&vnode->lock); + } + + if (fl->fl_u.afs.state == AFS_LOCK_GRANTED) + goto given_lock; + if (fl->fl_u.afs.state < 0) + ret = fl->fl_u.afs.state; + +abort_attempt: + /* we aren't going to get the lock, either because we're unwilling to + * wait, or because some signal happened */ + _debug("abort"); + afs_dequeue_lock(vnode, fl); + +error_unlock: + spin_unlock(&vnode->lock); _leave(" = %d", ret); return ret; vfs_rejected_lock: - /* the VFS rejected the lock we just obtained, so we have to discard - * what we just got */ + /* The VFS rejected the lock we just obtained, so we have to discard + * what we just got. We defer this to the lock manager work item to + * deal with. + */ _debug("vfs refused %d", ret); + spin_lock(&vnode->lock); list_del_init(&fl->fl_u.afs.link); if (list_empty(&vnode->granted_locks)) - afs_defer_unlock(vnode, key); - goto abort_attempt; + afs_defer_unlock(vnode); + goto error_unlock; } /* @@ -499,34 +579,21 @@ vfs_rejected_lock: */ static int afs_do_unlk(struct file *file, struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host); - struct key *key = afs_file_key(file); + struct afs_vnode *vnode = AFS_FS_I(locks_inode(file)); int ret; _enter("{%x:%u},%u", vnode->fid.vid, vnode->fid.vnode, fl->fl_type); + /* Flush all pending writes before doing anything with locks. */ + vfs_fsync(file, 0); + /* only whole-file unlocks are supported */ if (fl->fl_start != 0 || fl->fl_end != OFFSET_MAX) return -EINVAL; - fl->fl_ops = &afs_lock_ops; - INIT_LIST_HEAD(&fl->fl_u.afs.link); - fl->fl_u.afs.state = AFS_LOCK_PENDING; - - spin_lock(&vnode->lock); ret = posix_lock_file(file, fl, NULL); - if (ret < 0) { - spin_unlock(&vnode->lock); - _leave(" = %d [vfs]", ret); - return ret; - } - - /* discard the server lock only if all granted locks are gone */ - if (list_empty(&vnode->granted_locks)) - afs_defer_unlock(vnode, key); - spin_unlock(&vnode->lock); - _leave(" = 0"); - return 0; + _leave(" = %d [%u]", ret, vnode->lock_state); + return ret; } /* @@ -534,7 +601,7 @@ static int afs_do_unlk(struct file *file, struct file_lock *fl) */ static int afs_do_getlk(struct file *file, struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(file->f_mapping->host); + struct afs_vnode *vnode = AFS_FS_I(locks_inode(file)); struct key *key = afs_file_key(file); int ret, lock_count; @@ -542,29 +609,25 @@ static int afs_do_getlk(struct file *file, struct file_lock *fl) fl->fl_type = F_UNLCK; - inode_lock(&vnode->vfs_inode); - /* check local lock records first */ - ret = 0; posix_test_lock(file, fl); if (fl->fl_type == F_UNLCK) { /* no local locks; consult the server */ ret = afs_fetch_status(vnode, key); if (ret < 0) goto error; - lock_count = vnode->status.lock_count; - if (lock_count) { - if (lock_count > 0) - fl->fl_type = F_RDLCK; - else - fl->fl_type = F_WRLCK; - fl->fl_start = 0; - fl->fl_end = OFFSET_MAX; - } + + lock_count = READ_ONCE(vnode->status.lock_count); + if (lock_count > 0) + fl->fl_type = F_RDLCK; + else + fl->fl_type = F_WRLCK; + fl->fl_start = 0; + fl->fl_end = OFFSET_MAX; } + ret = 0; error: - inode_unlock(&vnode->vfs_inode); _leave(" = %d [%hd]", ret, fl->fl_type); return ret; } @@ -574,7 +637,7 @@ error: */ int afs_lock(struct file *file, int cmd, struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); + struct afs_vnode *vnode = AFS_FS_I(locks_inode(file)); _enter("{%x:%u},%d,{t=%x,fl=%x,r=%Ld:%Ld}", vnode->fid.vid, vnode->fid.vnode, cmd, @@ -597,7 +660,7 @@ int afs_lock(struct file *file, int cmd, struct file_lock *fl) */ int afs_flock(struct file *file, int cmd, struct file_lock *fl) { - struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); + struct afs_vnode *vnode = AFS_FS_I(locks_inode(file)); _enter("{%x:%u},%d,{t=%x,fl=%x}", vnode->fid.vid, vnode->fid.vnode, cmd, @@ -627,9 +690,13 @@ int afs_flock(struct file *file, int cmd, struct file_lock *fl) */ static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl) { + struct afs_vnode *vnode = AFS_FS_I(locks_inode(fl->fl_file)); + _enter(""); + spin_lock(&vnode->lock); list_add(&new->fl_u.afs.link, &fl->fl_u.afs.link); + spin_unlock(&vnode->lock); } /* @@ -638,7 +705,12 @@ static void afs_fl_copy_lock(struct file_lock *new, struct file_lock *fl) */ static void afs_fl_release_private(struct file_lock *fl) { + struct afs_vnode *vnode = AFS_FS_I(locks_inode(fl->fl_file)); + _enter(""); - list_del_init(&fl->fl_u.afs.link); + spin_lock(&vnode->lock); + afs_dequeue_lock(vnode, fl); + _debug("state %u for %p", vnode->lock_state, vnode); + spin_unlock(&vnode->lock); } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index bd8dcee7e066..e03910cebdd4 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -430,6 +430,16 @@ struct afs_volume { u8 name[AFS_MAXVOLNAME + 1]; /* NUL-padded volume name */ }; +enum afs_lock_state { + AFS_VNODE_LOCK_NONE, /* The vnode has no lock on the server */ + AFS_VNODE_LOCK_WAITING_FOR_CB, /* We're waiting for the server to break the callback */ + AFS_VNODE_LOCK_SETTING, /* We're asking the server for a lock */ + AFS_VNODE_LOCK_GRANTED, /* We have a lock on the server */ + AFS_VNODE_LOCK_EXTENDING, /* We're extending a lock on the server */ + AFS_VNODE_LOCK_NEED_UNLOCK, /* We need to unlock on the server */ + AFS_VNODE_LOCK_UNLOCKING, /* We're telling the server to unlock */ +}; + /* * AFS inode private data */ @@ -454,18 +464,16 @@ struct afs_vnode { #define AFS_VNODE_ZAP_DATA 3 /* set if vnode's data should be invalidated */ #define AFS_VNODE_DELETED 4 /* set if vnode deleted on server */ #define AFS_VNODE_MOUNTPOINT 5 /* set if vnode is a mountpoint symlink */ -#define AFS_VNODE_LOCKING 6 /* set if waiting for lock on vnode */ -#define AFS_VNODE_READLOCKED 7 /* set if vnode is read-locked on the server */ -#define AFS_VNODE_WRITELOCKED 8 /* set if vnode is write-locked on the server */ -#define AFS_VNODE_UNLOCKING 9 /* set if vnode is being unlocked on the server */ -#define AFS_VNODE_AUTOCELL 10 /* set if Vnode is an auto mount point */ -#define AFS_VNODE_PSEUDODIR 11 /* set if Vnode is a pseudo directory */ +#define AFS_VNODE_AUTOCELL 6 /* set if Vnode is an auto mount point */ +#define AFS_VNODE_PSEUDODIR 7 /* set if Vnode is a pseudo directory */ struct list_head wb_keys; /* List of keys available for writeback */ struct list_head pending_locks; /* locks waiting to be granted */ struct list_head granted_locks; /* locks granted on this file */ struct delayed_work lock_work; /* work to be done in locking */ - struct key *unlock_key; /* key to be used in unlocking */ + struct key *lock_key; /* Key to be used in lock ops */ + enum afs_lock_state lock_state : 8; + afs_lock_type_t lock_type : 8; /* outstanding callback notification on this file */ struct afs_cb_interest *cb_interest; /* Server on which this resides */ @@ -843,6 +851,7 @@ extern void afs_clear_permits(struct afs_vnode *); extern void afs_cache_permit(struct afs_vnode *, struct key *, unsigned int); extern void afs_zap_permits(struct rcu_head *); extern struct key *afs_request_key(struct afs_cell *); +extern int afs_check_permit(struct afs_vnode *, struct key *, afs_access_t *); extern int afs_permission(struct inode *, int); extern void __exit afs_clean_up_permit_cache(void); diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c index e728ca1776c9..d04511fb3879 100644 --- a/fs/afs/rotate.c +++ b/fs/afs/rotate.c @@ -46,8 +46,7 @@ bool afs_begin_vnode_operation(struct afs_fs_cursor *fc, struct afs_vnode *vnode return false; } - if (test_bit(AFS_VNODE_READLOCKED, &vnode->flags) || - test_bit(AFS_VNODE_WRITELOCKED, &vnode->flags)) + if (vnode->lock_state != AFS_VNODE_LOCK_NONE) fc->flags |= AFS_FS_CURSOR_CUR_ONLY; return true; } @@ -117,7 +116,7 @@ static void afs_busy(struct afs_volume *volume, u32 abort_code) case VSALVAGING: m = "being salvaged"; break; default: m = "busy"; break; } - + pr_notice("kAFS: Volume %u '%s' is %s\n", volume->vid, volume->name, m); } @@ -438,24 +437,67 @@ bool afs_select_current_fileserver(struct afs_fs_cursor *fc) _enter(""); - if (!cbi) { - fc->ac.error = -ESTALE; + switch (fc->ac.error) { + case SHRT_MAX: + if (!cbi) { + fc->ac.error = -ESTALE; + fc->flags |= AFS_FS_CURSOR_STOP; + return false; + } + + fc->cbi = afs_get_cb_interest(vnode->cb_interest); + + read_lock(&cbi->server->fs_lock); + alist = rcu_dereference_protected(cbi->server->addresses, + lockdep_is_held(&cbi->server->fs_lock)); + afs_get_addrlist(alist); + read_unlock(&cbi->server->fs_lock); + if (!alist) { + fc->ac.error = -ESTALE; + fc->flags |= AFS_FS_CURSOR_STOP; + return false; + } + + fc->ac.alist = alist; + fc->ac.addr = NULL; + fc->ac.start = READ_ONCE(alist->index); + fc->ac.index = fc->ac.start; + fc->ac.error = 0; + fc->ac.begun = false; + goto iterate_address; + + case 0: + default: + /* Success or local failure. Stop. */ fc->flags |= AFS_FS_CURSOR_STOP; + _leave(" = f [okay/local %d]", fc->ac.error); return false; - } - read_lock(&cbi->server->fs_lock); - alist = afs_get_addrlist(cbi->server->addresses); - read_unlock(&cbi->server->fs_lock); - if (!alist) { - fc->ac.error = -ESTALE; + case -ECONNABORTED: fc->flags |= AFS_FS_CURSOR_STOP; + _leave(" = f [abort]"); return false; + + case -ENETUNREACH: + case -EHOSTUNREACH: + case -ECONNREFUSED: + case -ETIMEDOUT: + case -ETIME: + _debug("no conn"); + goto iterate_address; } - fc->ac.alist = alist; - fc->ac.error = 0; - return true; +iterate_address: + /* Iterate over the current server's address list to try and find an + * address on which it will respond to us. + */ + if (afs_iterate_addresses(&fc->ac)) { + _leave(" = t"); + return true; + } + + afs_end_cursor(&fc->ac); + return false; } /* diff --git a/fs/afs/security.c b/fs/afs/security.c index 46a881a4d08f..2b00097101b3 100644 --- a/fs/afs/security.c +++ b/fs/afs/security.c @@ -284,8 +284,8 @@ someone_else_changed_it: * permitted to be accessed with this authorisation, and if so, what access it * is granted */ -static int afs_check_permit(struct afs_vnode *vnode, struct key *key, - afs_access_t *_access) +int afs_check_permit(struct afs_vnode *vnode, struct key *key, + afs_access_t *_access) { struct afs_permits *permits; bool valid = false; diff --git a/fs/afs/server_list.c b/fs/afs/server_list.c index 26bad7032bba..0ab3f8457839 100644 --- a/fs/afs/server_list.c +++ b/fs/afs/server_list.c @@ -17,7 +17,7 @@ void afs_put_serverlist(struct afs_net *net, struct afs_server_list *slist) { int i; - if (refcount_dec_and_test(&slist->usage)) { + if (slist && refcount_dec_and_test(&slist->usage)) { for (i = 0; i < slist->nr_servers; i++) { afs_put_cb_interest(net, slist->servers[i].cb_interest); afs_put_server(net, slist->servers[i].server); diff --git a/fs/afs/super.c b/fs/afs/super.c index 875b5eb02242..d3f97da61bdf 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -496,10 +496,10 @@ static struct dentry *afs_mount(struct file_system_type *fs_type, if (ret < 0) goto error_sb; as = NULL; - sb->s_flags |= MS_ACTIVE; + sb->s_flags |= SB_ACTIVE; } else { _debug("reuse"); - ASSERTCMP(sb->s_flags, &, MS_ACTIVE); + ASSERTCMP(sb->s_flags, &, SB_ACTIVE); afs_destroy_sbi(as); as = NULL; } diff --git a/fs/afs/write.c b/fs/afs/write.c index 18e46e31523c..cb5f8a3df577 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -119,6 +119,11 @@ try_again: } if (f != t) { + if (PageWriteback(page)) { + trace_afs_page_dirty(vnode, tracepoint_string("alrdy"), + page->index, priv); + goto flush_conflicting_write; + } if (to < f || from > t) goto flush_conflicting_write; if (from < f) @@ -1297,20 +1297,10 @@ static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr, static long read_events(struct kioctx *ctx, long min_nr, long nr, struct io_event __user *event, - struct timespec __user *timeout) + ktime_t until) { - ktime_t until = KTIME_MAX; long ret = 0; - if (timeout) { - struct timespec ts; - - if (unlikely(copy_from_user(&ts, timeout, sizeof(ts)))) - return -EFAULT; - - until = timespec_to_ktime(ts); - } - /* * Note that aio_read_events() is being called as the conditional - i.e. * we're calling it after prepare_to_wait() has set task state to @@ -1826,6 +1816,25 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, return ret; } +static long do_io_getevents(aio_context_t ctx_id, + long min_nr, + long nr, + struct io_event __user *events, + struct timespec64 *ts) +{ + ktime_t until = ts ? timespec64_to_ktime(*ts) : KTIME_MAX; + struct kioctx *ioctx = lookup_ioctx(ctx_id); + long ret = -EINVAL; + + if (likely(ioctx)) { + if (likely(min_nr <= nr && min_nr >= 0)) + ret = read_events(ioctx, min_nr, nr, events, until); + percpu_ref_put(&ioctx->users); + } + + return ret; +} + /* io_getevents: * Attempts to read at least min_nr events and up to nr events from * the completion queue for the aio_context specified by ctx_id. If @@ -1844,15 +1853,14 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, struct io_event __user *, events, struct timespec __user *, timeout) { - struct kioctx *ioctx = lookup_ioctx(ctx_id); - long ret = -EINVAL; + struct timespec64 ts; - if (likely(ioctx)) { - if (likely(min_nr <= nr && min_nr >= 0)) - ret = read_events(ioctx, min_nr, nr, events, timeout); - percpu_ref_put(&ioctx->users); + if (timeout) { + if (unlikely(get_timespec64(&ts, timeout))) + return -EFAULT; } - return ret; + + return do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL); } #ifdef CONFIG_COMPAT @@ -1862,17 +1870,14 @@ COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id, struct io_event __user *, events, struct compat_timespec __user *, timeout) { - struct timespec t; - struct timespec __user *ut = NULL; + struct timespec64 t; if (timeout) { - if (compat_get_timespec(&t, timeout)) + if (compat_get_timespec64(&t, timeout)) return -EFAULT; - ut = compat_alloc_user_space(sizeof(*ut)); - if (copy_to_user(ut, &t, sizeof(t))) - return -EFAULT; } - return sys_io_getevents(ctx_id, min_nr, nr, events, ut); + + return do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL); } #endif diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index d79ced925861..82e8f6edfb48 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -281,8 +281,8 @@ static int autofs4_mount_wait(const struct path *path, bool rcu_walk) pr_debug("waiting for mount name=%pd\n", path->dentry); status = autofs4_wait(sbi, path, NFY_MOUNT); pr_debug("mount wait done status=%d\n", status); - ino->last_used = jiffies; } + ino->last_used = jiffies; return status; } @@ -321,21 +321,16 @@ static struct dentry *autofs4_mountpoint_changed(struct path *path) */ if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) { struct dentry *parent = dentry->d_parent; + struct autofs_info *ino; struct dentry *new; new = d_lookup(parent, &dentry->d_name); if (!new) return NULL; - if (new == dentry) - dput(new); - else { - struct autofs_info *ino; - - ino = autofs4_dentry_ino(new); - ino->last_used = jiffies; - dput(path->dentry); - path->dentry = new; - } + ino = autofs4_dentry_ino(new); + ino->last_used = jiffies; + dput(path->dentry); + path->dentry = new; } return path->dentry; } diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 4ac49d038bf3..8fc41705c7cd 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -81,7 +81,8 @@ static int autofs4_write(struct autofs_sb_info *sbi, spin_unlock_irqrestore(¤t->sighand->siglock, flags); } - return (bytes > 0); + /* if 'wr' returned 0 (impossible) we assume -EIO (safe) */ + return bytes == 0 ? 0 : wr < 0 ? wr : -EIO; } static void autofs4_notify_daemon(struct autofs_sb_info *sbi, @@ -95,6 +96,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, } pkt; struct file *pipe = NULL; size_t pktsz; + int ret; pr_debug("wait id = 0x%08lx, name = %.*s, type=%d\n", (unsigned long) wq->wait_queue_token, @@ -169,7 +171,18 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, mutex_unlock(&sbi->wq_mutex); if (autofs4_write(sbi, pipe, &pkt, pktsz)) + switch (ret = autofs4_write(sbi, pipe, &pkt, pktsz)) { + case 0: + break; + case -ENOMEM: + case -ERESTARTSYS: + /* Just fail this one */ + autofs4_wait_release(sbi, wq->wait_queue_token, ret); + break; + default: autofs4_catatonic_mode(sbi); + break; + } fput(pipe); } diff --git a/fs/befs/ChangeLog b/fs/befs/ChangeLog index 75a461cfaca6..16f2dfe8c2f7 100644 --- a/fs/befs/ChangeLog +++ b/fs/befs/ChangeLog @@ -365,7 +365,7 @@ Version 0.4 (2001-10-28) (fs/befs/super.c) * Tell the kernel to only mount befs read-only. - By setting the MS_RDONLY flag in befs_read_super(). + By setting the SB_RDONLY flag in befs_read_super(). Not that it was possible to write before. But now the kernel won't even try. (fs/befs/super.c) diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index a92355cc453b..ee236231cafa 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -841,7 +841,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent) if (!sb_rdonly(sb)) { befs_warning(sb, "No write support. Marking filesystem read-only"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } /* @@ -948,7 +948,7 @@ static int befs_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - if (!(*flags & MS_RDONLY)) + if (!(*flags & SB_RDONLY)) return -EINVAL; return 0; } diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 5429b035e249..429326b6e2e7 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1498,7 +1498,9 @@ static bool elf_fdpic_dump_segments(struct coredump_params *cprm) struct vm_area_struct *vma; for (vma = current->mm->mmap; vma; vma = vma->vm_next) { +#ifdef CONFIG_MMU unsigned long addr; +#endif if (!maydump(vma, cprm->mm_flags)) continue; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index b35ce16b3df3..5982c8a71f02 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -295,7 +295,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, unsigned long len, u64 disk_start, unsigned long compressed_len, struct page **compressed_pages, - unsigned long nr_pages) + unsigned long nr_pages, + unsigned int write_flags) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct bio *bio = NULL; @@ -327,7 +328,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, bdev = fs_info->fs_devices->latest_bdev; bio = btrfs_bio_alloc(bdev, first_byte); - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + bio->bi_opf = REQ_OP_WRITE | write_flags; bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; refcount_set(&cb->pending_bios, 1); @@ -374,7 +375,7 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, bio_put(bio); bio = btrfs_bio_alloc(bdev, first_byte); - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + bio->bi_opf = REQ_OP_WRITE | write_flags; bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; bio_add_page(bio, page, PAGE_SIZE, 0); @@ -1528,5 +1529,5 @@ unsigned int btrfs_compress_str2level(const char *str) if (str[4] == ':' && '1' <= str[5] && str[5] <= '9' && str[6] == 0) return str[5] - '0'; - return 0; + return BTRFS_ZLIB_DEFAULT_LEVEL; } diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index da20755ebf21..0868cc554f14 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -34,6 +34,8 @@ /* Maximum size of data before compression */ #define BTRFS_MAX_UNCOMPRESSED (SZ_128K) +#define BTRFS_ZLIB_DEFAULT_LEVEL 3 + struct compressed_bio { /* number of bios pending for this compressed extent */ refcount_t pending_bios; @@ -91,7 +93,8 @@ blk_status_t btrfs_submit_compressed_write(struct inode *inode, u64 start, unsigned long len, u64 disk_start, unsigned long compressed_len, struct page **compressed_pages, - unsigned long nr_pages); + unsigned long nr_pages, + unsigned int write_flags); blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f7df5536ab61..13c260b525a1 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2957,7 +2957,7 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) */ static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info) { - return fs_info->sb->s_flags & MS_RDONLY || btrfs_fs_closing(fs_info); + return fs_info->sb->s_flags & SB_RDONLY || btrfs_fs_closing(fs_info); } static inline void free_fs_info(struct btrfs_fs_info *fs_info) @@ -3180,6 +3180,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput, int nr); int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, + unsigned int extra_bits, struct extent_state **cached_state, int dedupe); int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, struct btrfs_root *new_root, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index efce9a2fa9be..10a2a579cc7f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -610,7 +610,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, * that we don't try and read the other copies of this block, just * return -EIO. */ - if (found_level == 0 && btrfs_check_leaf(root, eb)) { + if (found_level == 0 && btrfs_check_leaf_full(root, eb)) { set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); ret = -EIO; } @@ -3848,7 +3848,13 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) buf->len, fs_info->dirty_metadata_batch); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - if (btrfs_header_level(buf) == 0 && btrfs_check_leaf(root, buf)) { + /* + * Since btrfs_mark_buffer_dirty() can be called with item pointer set + * but item data not updated. + * So here we should only check item pointers, not item data. + */ + if (btrfs_header_level(buf) == 0 && + btrfs_check_leaf_relaxed(root, buf)) { btrfs_print_leaf(buf); ASSERT(0); } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 7208ecef7088..4497f937e8fb 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3502,13 +3502,6 @@ again: goto again; } - /* We've already setup this transaction, go ahead and exit */ - if (block_group->cache_generation == trans->transid && - i_size_read(inode)) { - dcs = BTRFS_DC_SETUP; - goto out_put; - } - /* * We want to set the generation to 0, that way if anything goes wrong * from here on out we know not to trust this cache when we load up next @@ -3532,6 +3525,13 @@ again: } WARN_ON(ret); + /* We've already setup this transaction, go ahead and exit */ + if (block_group->cache_generation == trans->transid && + i_size_read(inode)) { + dcs = BTRFS_DC_SETUP; + goto out_put; + } + if (i_size_read(inode) > 0) { ret = btrfs_check_trunc_cache_free_space(fs_info, &fs_info->global_block_rsv); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 16045ea86fc1..012d63870b99 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1984,7 +1984,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, struct btrfs_bio *bbio = NULL; int ret; - ASSERT(!(fs_info->sb->s_flags & MS_RDONLY)); + ASSERT(!(fs_info->sb->s_flags & SB_RDONLY)); BUG_ON(!mirror_num); bio = btrfs_io_bio_alloc(1); @@ -3253,7 +3253,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode, delalloc_start, delalloc_end, &page_started, - nr_written); + nr_written, wbc); /* File system has been set read-only */ if (ret) { SetPageError(page); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 4a8861379d3e..93dcae0c3183 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -116,7 +116,8 @@ struct extent_io_ops { */ int (*fill_delalloc)(void *private_data, struct page *locked_page, u64 start, u64 end, int *page_started, - unsigned long *nr_written); + unsigned long *nr_written, + struct writeback_control *wbc); int (*writepage_start_hook)(struct page *page, u64 start, u64 end); void (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, @@ -365,10 +366,11 @@ int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state); static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached_state) + u64 end, unsigned int extra_bits, + struct extent_state **cached_state) { return set_extent_bit(tree, start, end, - EXTENT_DELALLOC | EXTENT_UPTODATE, + EXTENT_DELALLOC | EXTENT_UPTODATE | extra_bits, NULL, cached_state, GFP_NOFS); } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index f80254d82f40..eb1bac7c8553 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -477,6 +477,47 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages) } } +static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, + const u64 start, + const u64 len, + struct extent_state **cached_state) +{ + u64 search_start = start; + const u64 end = start + len - 1; + + while (search_start < end) { + const u64 search_len = end - search_start + 1; + struct extent_map *em; + u64 em_len; + int ret = 0; + + em = btrfs_get_extent(inode, NULL, 0, search_start, + search_len, 0); + if (IS_ERR(em)) + return PTR_ERR(em); + + if (em->block_start != EXTENT_MAP_HOLE) + goto next; + + em_len = em->len; + if (em->start < search_start) + em_len -= search_start - em->start; + if (em_len > search_len) + em_len = search_len; + + ret = set_extent_bit(&inode->io_tree, search_start, + search_start + em_len - 1, + EXTENT_DELALLOC_NEW, + NULL, cached_state, GFP_NOFS); +next: + search_start = extent_map_end(em); + free_extent_map(em); + if (ret) + return ret; + } + return 0; +} + /* * after copy_from_user, pages need to be dirtied and we need to make * sure holes are created between the current EOF and the start of @@ -497,14 +538,34 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages, u64 end_of_last_block; u64 end_pos = pos + write_bytes; loff_t isize = i_size_read(inode); + unsigned int extra_bits = 0; start_pos = pos & ~((u64) fs_info->sectorsize - 1); num_bytes = round_up(write_bytes + pos - start_pos, fs_info->sectorsize); end_of_last_block = start_pos + num_bytes - 1; + + if (!btrfs_is_free_space_inode(BTRFS_I(inode))) { + if (start_pos >= isize && + !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)) { + /* + * There can't be any extents following eof in this case + * so just set the delalloc new bit for the range + * directly. + */ + extra_bits |= EXTENT_DELALLOC_NEW; + } else { + err = btrfs_find_new_delalloc_bytes(BTRFS_I(inode), + start_pos, + num_bytes, cached); + if (err) + return err; + } + } + err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, - cached, 0); + extra_bits, cached, 0); if (err) return err; @@ -1404,47 +1465,6 @@ fail: } -static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, - const u64 start, - const u64 len, - struct extent_state **cached_state) -{ - u64 search_start = start; - const u64 end = start + len - 1; - - while (search_start < end) { - const u64 search_len = end - search_start + 1; - struct extent_map *em; - u64 em_len; - int ret = 0; - - em = btrfs_get_extent(inode, NULL, 0, search_start, - search_len, 0); - if (IS_ERR(em)) - return PTR_ERR(em); - - if (em->block_start != EXTENT_MAP_HOLE) - goto next; - - em_len = em->len; - if (em->start < search_start) - em_len -= search_start - em->start; - if (em_len > search_len) - em_len = search_len; - - ret = set_extent_bit(&inode->io_tree, search_start, - search_start + em_len - 1, - EXTENT_DELALLOC_NEW, - NULL, cached_state, GFP_NOFS); -next: - search_start = extent_map_end(em); - free_extent_map(em); - if (ret) - return ret; - } - return 0; -} - /* * This function locks the extent and properly waits for data=ordered extents * to finish before allowing the pages to be modified if need. @@ -1473,10 +1493,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, + round_up(pos + write_bytes - start_pos, fs_info->sectorsize) - 1; - if (start_pos < inode->vfs_inode.i_size || - (inode->flags & BTRFS_INODE_PREALLOC)) { + if (start_pos < inode->vfs_inode.i_size) { struct btrfs_ordered_extent *ordered; - unsigned int clear_bits; lock_extent_bits(&inode->io_tree, start_pos, last_pos, cached_state); @@ -1498,19 +1516,10 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, } if (ordered) btrfs_put_ordered_extent(ordered); - ret = btrfs_find_new_delalloc_bytes(inode, start_pos, - last_pos - start_pos + 1, - cached_state); - clear_bits = EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG; - if (ret) - clear_bits |= EXTENT_DELALLOC_NEW | EXTENT_LOCKED; - clear_extent_bit(&inode->io_tree, start_pos, - last_pos, clear_bits, - (clear_bits & EXTENT_LOCKED) ? 1 : 0, - 0, cached_state, GFP_NOFS); - if (ret) - return ret; + clear_extent_bit(&inode->io_tree, start_pos, last_pos, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, + 0, 0, cached_state, GFP_NOFS); *lockstart = start_pos; *lockend = last_pos; ret = 1; @@ -2048,6 +2057,8 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) len = (u64)end - (u64)start + 1; trace_btrfs_sync_file(file, datasync); + btrfs_init_log_ctx(&ctx, inode); + /* * We write the dirty pages in the range and wait until they complete * out of the ->i_mutex. If so, we can flush the dirty pages by @@ -2194,8 +2205,6 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) } trans->sync = true; - btrfs_init_log_ctx(&ctx, inode); - ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx); if (ret < 0) { /* Fallthrough and commit/free transaction. */ @@ -2253,6 +2262,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ret = btrfs_end_transaction(trans); } out: + ASSERT(list_empty(&ctx.list)); err = file_check_and_advance_wb_err(file); if (!ret) ret = err; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index cdc9f4015ec3..4426d1c73e50 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1264,7 +1264,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, /* Lock all pages first so we can lock the extent safely. */ ret = io_ctl_prepare_pages(io_ctl, inode, 0); if (ret) - goto out; + goto out_unlock; lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, &cached_state); @@ -1358,6 +1358,7 @@ out_nospc_locked: out_nospc: cleanup_write_cache_enospc(inode, io_ctl, &cached_state); +out_unlock: if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) up_write(&block_group->data_rwsem); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b93fe05a39c7..993061f83067 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -378,6 +378,7 @@ struct async_cow { struct page *locked_page; u64 start; u64 end; + unsigned int write_flags; struct list_head extents; struct btrfs_work work; }; @@ -857,7 +858,8 @@ retry: async_extent->ram_size, ins.objectid, ins.offset, async_extent->pages, - async_extent->nr_pages)) { + async_extent->nr_pages, + async_cow->write_flags)) { struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct page *p = async_extent->pages[0]; const u64 start = async_extent->start; @@ -1191,7 +1193,8 @@ static noinline void async_cow_free(struct btrfs_work *work) static int cow_file_range_async(struct inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, - unsigned long *nr_written) + unsigned long *nr_written, + unsigned int write_flags) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct async_cow *async_cow; @@ -1208,6 +1211,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, async_cow->root = root; async_cow->locked_page = locked_page; async_cow->start = start; + async_cow->write_flags = write_flags; if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS && !btrfs_test_opt(fs_info, FORCE_COMPRESS)) @@ -1577,11 +1581,13 @@ static inline int need_force_cow(struct inode *inode, u64 start, u64 end) */ static int run_delalloc_range(void *private_data, struct page *locked_page, u64 start, u64 end, int *page_started, - unsigned long *nr_written) + unsigned long *nr_written, + struct writeback_control *wbc) { struct inode *inode = private_data; int ret; int force_cow = need_force_cow(inode, start, end); + unsigned int write_flags = wbc_to_write_flags(wbc); if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) { ret = run_delalloc_nocow(inode, locked_page, start, end, @@ -1596,7 +1602,8 @@ static int run_delalloc_range(void *private_data, struct page *locked_page, set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &BTRFS_I(inode)->runtime_flags); ret = cow_file_range_async(inode, locked_page, start, end, - page_started, nr_written); + page_started, nr_written, + write_flags); } if (ret) btrfs_cleanup_ordered_extents(inode, start, end - start + 1); @@ -2025,11 +2032,12 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans, } int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, + unsigned int extra_bits, struct extent_state **cached_state, int dedupe) { WARN_ON((end & (PAGE_SIZE - 1)) == 0); return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, - cached_state); + extra_bits, cached_state); } /* see btrfs_writepage_start_hook for details on why this is required */ @@ -2090,7 +2098,7 @@ again: goto out; } - btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state, + btrfs_set_extent_delalloc(inode, page_start, page_end, 0, &cached_state, 0); ClearPageChecked(page); set_page_dirty(page); @@ -4790,7 +4798,7 @@ again: EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS); - ret = btrfs_set_extent_delalloc(inode, block_start, block_end, + ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0, &cached_state, 0); if (ret) { unlock_extent_cached(io_tree, block_start, block_end, @@ -5438,6 +5446,14 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, goto out_err; btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); + if (location->type != BTRFS_INODE_ITEM_KEY && + location->type != BTRFS_ROOT_ITEM_KEY) { + btrfs_warn(root->fs_info, +"%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))", + __func__, name, btrfs_ino(BTRFS_I(dir)), + location->objectid, location->type, location->offset); + goto out_err; + } out: btrfs_free_path(path); return ret; @@ -5754,8 +5770,6 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) return inode; } - BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY); - index = srcu_read_lock(&fs_info->subvol_srcu); ret = fixup_tree_root_location(fs_info, dir, dentry, &location, &sub_root); @@ -9150,7 +9164,7 @@ again: EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS); - ret = btrfs_set_extent_delalloc(inode, page_start, end, + ret = btrfs_set_extent_delalloc(inode, page_start, end, 0, &cached_state, 0); if (ret) { unlock_extent_cached(io_tree, page_start, page_end, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index fd172a93d11a..d748ad1c3620 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1172,7 +1172,7 @@ again: if (!i_done || ret) goto out; - if (!(inode->i_sb->s_flags & MS_ACTIVE)) + if (!(inode->i_sb->s_flags & SB_ACTIVE)) goto out; /* @@ -1333,7 +1333,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, * make sure we stop running if someone unmounts * the FS */ - if (!(inode->i_sb->s_flags & MS_ACTIVE)) + if (!(inode->i_sb->s_flags & SB_ACTIVE)) break; if (btrfs_defrag_cancelled(fs_info)) { diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 4cf2eb67eba6..f0c3f00e97cb 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3268,7 +3268,8 @@ static int relocate_file_extent_cluster(struct inode *inode, nr++; } - btrfs_set_extent_delalloc(inode, page_start, page_end, NULL, 0); + btrfs_set_extent_delalloc(inode, page_start, page_end, 0, NULL, + 0); set_page_dirty(page); unlock_extent(&BTRFS_I(inode)->io_tree, diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index c10e4c70f02d..20d3300bd268 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -3521,7 +3521,40 @@ out: } /* - * Check if ino ino1 is an ancestor of inode ino2 in the given root. + * Check if inode ino2, or any of its ancestors, is inode ino1. + * Return 1 if true, 0 if false and < 0 on error. + */ +static int check_ino_in_path(struct btrfs_root *root, + const u64 ino1, + const u64 ino1_gen, + const u64 ino2, + const u64 ino2_gen, + struct fs_path *fs_path) +{ + u64 ino = ino2; + + if (ino1 == ino2) + return ino1_gen == ino2_gen; + + while (ino > BTRFS_FIRST_FREE_OBJECTID) { + u64 parent; + u64 parent_gen; + int ret; + + fs_path_reset(fs_path); + ret = get_first_ref(root, ino, &parent, &parent_gen, fs_path); + if (ret < 0) + return ret; + if (parent == ino1) + return parent_gen == ino1_gen; + ino = parent; + } + return 0; +} + +/* + * Check if ino ino1 is an ancestor of inode ino2 in the given root for any + * possible path (in case ino2 is not a directory and has multiple hard links). * Return 1 if true, 0 if false and < 0 on error. */ static int is_ancestor(struct btrfs_root *root, @@ -3530,36 +3563,91 @@ static int is_ancestor(struct btrfs_root *root, const u64 ino2, struct fs_path *fs_path) { - u64 ino = ino2; - bool free_path = false; + bool free_fs_path = false; int ret = 0; + struct btrfs_path *path = NULL; + struct btrfs_key key; if (!fs_path) { fs_path = fs_path_alloc(); if (!fs_path) return -ENOMEM; - free_path = true; + free_fs_path = true; } - while (ino > BTRFS_FIRST_FREE_OBJECTID) { - u64 parent; - u64 parent_gen; + path = alloc_path_for_send(); + if (!path) { + ret = -ENOMEM; + goto out; + } - fs_path_reset(fs_path); - ret = get_first_ref(root, ino, &parent, &parent_gen, fs_path); - if (ret < 0) { - if (ret == -ENOENT && ino == ino2) - ret = 0; - goto out; + key.objectid = ino2; + key.type = BTRFS_INODE_REF_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + while (true) { + struct extent_buffer *leaf = path->nodes[0]; + int slot = path->slots[0]; + u32 cur_offset = 0; + u32 item_size; + + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + if (ret > 0) + break; + continue; } - if (parent == ino1) { - ret = parent_gen == ino1_gen ? 1 : 0; - goto out; + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != ino2) + break; + if (key.type != BTRFS_INODE_REF_KEY && + key.type != BTRFS_INODE_EXTREF_KEY) + break; + + item_size = btrfs_item_size_nr(leaf, slot); + while (cur_offset < item_size) { + u64 parent; + u64 parent_gen; + + if (key.type == BTRFS_INODE_EXTREF_KEY) { + unsigned long ptr; + struct btrfs_inode_extref *extref; + + ptr = btrfs_item_ptr_offset(leaf, slot); + extref = (struct btrfs_inode_extref *) + (ptr + cur_offset); + parent = btrfs_inode_extref_parent(leaf, + extref); + cur_offset += sizeof(*extref); + cur_offset += btrfs_inode_extref_name_len(leaf, + extref); + } else { + parent = key.offset; + cur_offset = item_size; + } + + ret = get_inode_info(root, parent, NULL, &parent_gen, + NULL, NULL, NULL, NULL); + if (ret < 0) + goto out; + ret = check_ino_in_path(root, ino1, ino1_gen, + parent, parent_gen, fs_path); + if (ret) + goto out; } - ino = parent; + path->slots[0]++; } + ret = 0; out: - if (free_path) + btrfs_free_path(path); + if (free_fs_path) fs_path_free(fs_path); return ret; } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 65af029559b5..3a4dce153645 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -107,7 +107,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) return; if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; btrfs_info(fs_info, "forced readonly"); /* * Note that a running device replace operation is not @@ -137,7 +137,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function /* * Special case: if the error is EROFS, and we're already - * under MS_RDONLY, then it is safe here. + * under SB_RDONLY, then it is safe here. */ if (errno == -EROFS && sb_rdonly(sb)) return; @@ -168,7 +168,7 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); /* Don't go through full error handling during mount */ - if (sb->s_flags & MS_BORN) + if (sb->s_flags & SB_BORN) btrfs_handle_error(fs_info); } @@ -507,9 +507,18 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, token == Opt_compress_force || strncmp(args[0].from, "zlib", 4) == 0) { compress_type = "zlib"; + info->compress_type = BTRFS_COMPRESS_ZLIB; - info->compress_level = - btrfs_compress_str2level(args[0].from); + info->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL; + /* + * args[0] contains uninitialized data since + * for these tokens we don't expect any + * parameter. + */ + if (token != Opt_compress && + token != Opt_compress_force) + info->compress_level = + btrfs_compress_str2level(args[0].from); btrfs_set_opt(info->mount_opt, COMPRESS); btrfs_clear_opt(info->mount_opt, NODATACOW); btrfs_clear_opt(info->mount_opt, NODATASUM); @@ -625,7 +634,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, break; case Opt_acl: #ifdef CONFIG_BTRFS_FS_POSIX_ACL - info->sb->s_flags |= MS_POSIXACL; + info->sb->s_flags |= SB_POSIXACL; break; #else btrfs_err(info, "support for ACL not compiled in!"); @@ -633,7 +642,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, goto out; #endif case Opt_noacl: - info->sb->s_flags &= ~MS_POSIXACL; + info->sb->s_flags &= ~SB_POSIXACL; break; case Opt_notreelog: btrfs_set_and_info(info, NOTREELOG, @@ -851,7 +860,7 @@ check: /* * Extra check for current option against current flag */ - if (btrfs_test_opt(info, NOLOGREPLAY) && !(new_flags & MS_RDONLY)) { + if (btrfs_test_opt(info, NOLOGREPLAY) && !(new_flags & SB_RDONLY)) { btrfs_err(info, "nologreplay must be used with ro mount option"); ret = -EINVAL; @@ -1147,7 +1156,7 @@ static int btrfs_fill_super(struct super_block *sb, sb->s_xattr = btrfs_xattr_handlers; sb->s_time_gran = 1; #ifdef CONFIG_BTRFS_FS_POSIX_ACL - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; #endif sb->s_flags |= SB_I_VERSION; sb->s_iflags |= SB_I_CGROUPWB; @@ -1180,7 +1189,7 @@ static int btrfs_fill_super(struct super_block *sb, } cleancache_init_fs(sb); - sb->s_flags |= MS_ACTIVE; + sb->s_flags |= SB_ACTIVE; return 0; fail_close: @@ -1277,7 +1286,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",flushoncommit"); if (btrfs_test_opt(info, DISCARD)) seq_puts(seq, ",discard"); - if (!(info->sb->s_flags & MS_POSIXACL)) + if (!(info->sb->s_flags & SB_POSIXACL)) seq_puts(seq, ",noacl"); if (btrfs_test_opt(info, SPACE_CACHE)) seq_puts(seq, ",space_cache"); @@ -1409,11 +1418,11 @@ static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid, mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, newargs); if (PTR_ERR_OR_ZERO(mnt) == -EBUSY) { - if (flags & MS_RDONLY) { - mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY, + if (flags & SB_RDONLY) { + mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~SB_RDONLY, device_name, newargs); } else { - mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, + mnt = vfs_kern_mount(&btrfs_fs_type, flags | SB_RDONLY, device_name, newargs); if (IS_ERR(mnt)) { root = ERR_CAST(mnt); @@ -1565,7 +1574,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, u64 subvol_objectid = 0; int error = 0; - if (!(flags & MS_RDONLY)) + if (!(flags & SB_RDONLY)) mode |= FMODE_WRITE; error = btrfs_parse_early_options(data, mode, fs_type, @@ -1619,13 +1628,13 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, if (error) goto error_fs_info; - if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) { + if (!(flags & SB_RDONLY) && fs_devices->rw_devices == 0) { error = -EACCES; goto error_close_devices; } bdev = fs_devices->latest_bdev; - s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | MS_NOSEC, + s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | SB_NOSEC, fs_info); if (IS_ERR(s)) { error = PTR_ERR(s); @@ -1635,7 +1644,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, if (s->s_root) { btrfs_close_devices(fs_devices); free_fs_info(fs_info); - if ((flags ^ s->s_flags) & MS_RDONLY) + if ((flags ^ s->s_flags) & SB_RDONLY) error = -EBUSY; } else { snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); @@ -1702,11 +1711,11 @@ static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info, { if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) && (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) || - (flags & MS_RDONLY))) { + (flags & SB_RDONLY))) { /* wait for any defraggers to finish */ wait_event(fs_info->transaction_wait, (atomic_read(&fs_info->defrag_running) == 0)); - if (flags & MS_RDONLY) + if (flags & SB_RDONLY) sync_filesystem(fs_info->sb); } } @@ -1766,10 +1775,10 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) btrfs_resize_thread_pool(fs_info, fs_info->thread_pool_size, old_thread_pool_size); - if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb)) + if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) goto out; - if (*flags & MS_RDONLY) { + if (*flags & SB_RDONLY) { /* * this also happens on 'umount -rf' or on shutdown, when * the filesystem is busy. @@ -1781,10 +1790,10 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) /* avoid complains from lockdep et al. */ up(&fs_info->uuid_tree_rescan_sem); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; /* - * Setting MS_RDONLY will put the cleaner thread to + * Setting SB_RDONLY will put the cleaner thread to * sleep at the next loop if it's already active. * If it's already asleep, we'll leave unused block * groups on disk until we're mounted read-write again @@ -1856,7 +1865,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) goto restore; } } - sb->s_flags &= ~MS_RDONLY; + sb->s_flags &= ~SB_RDONLY; set_bit(BTRFS_FS_OPEN, &fs_info->flags); } @@ -1866,9 +1875,9 @@ out: return 0; restore: - /* We've hit an error - don't reset MS_RDONLY */ + /* We've hit an error - don't reset SB_RDONLY */ if (sb_rdonly(sb)) - old_flags |= MS_RDONLY; + old_flags |= SB_RDONLY; sb->s_flags = old_flags; fs_info->mount_opt = old_opts; fs_info->compress_type = old_compress_type; diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index d06b1c931d05..2e7f64a3b22b 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -114,7 +114,7 @@ static int test_find_delalloc(u32 sectorsize) * |--- delalloc ---| * |--- search ---| */ - set_extent_delalloc(&tmp, 0, sectorsize - 1, NULL); + set_extent_delalloc(&tmp, 0, sectorsize - 1, 0, NULL); start = 0; end = 0; found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, @@ -145,7 +145,7 @@ static int test_find_delalloc(u32 sectorsize) test_msg("Couldn't find the locked page\n"); goto out_bits; } - set_extent_delalloc(&tmp, sectorsize, max_bytes - 1, NULL); + set_extent_delalloc(&tmp, sectorsize, max_bytes - 1, 0, NULL); start = test_start; end = 0; found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, @@ -200,7 +200,7 @@ static int test_find_delalloc(u32 sectorsize) * * We are re-using our test_start from above since it works out well. */ - set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL); + set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, 0, NULL); start = test_start; end = 0; found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index f797642c013d..30affb60da51 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -968,7 +968,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) btrfs_test_inode_set_ops(inode); /* [BTRFS_MAX_EXTENT_SIZE] */ - ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1, + ret = btrfs_set_extent_delalloc(inode, 0, BTRFS_MAX_EXTENT_SIZE - 1, 0, NULL, 0); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); @@ -984,7 +984,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) /* [BTRFS_MAX_EXTENT_SIZE][sectorsize] */ ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE, BTRFS_MAX_EXTENT_SIZE + sectorsize - 1, - NULL, 0); + 0, NULL, 0); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); goto out; @@ -1018,7 +1018,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE >> 1, (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1, - NULL, 0); + 0, NULL, 0); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); goto out; @@ -1036,7 +1036,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize, (BTRFS_MAX_EXTENT_SIZE << 1) + 3 * sectorsize - 1, - NULL, 0); + 0, NULL, 0); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); goto out; @@ -1053,7 +1053,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) */ ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + sectorsize, - BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, NULL, 0); + BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL, 0); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); goto out; @@ -1089,7 +1089,7 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) */ ret = btrfs_set_extent_delalloc(inode, BTRFS_MAX_EXTENT_SIZE + sectorsize, - BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, NULL, 0); + BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, 0, NULL, 0); if (ret) { test_msg("btrfs_set_extent_delalloc returned %d\n", ret); goto out; diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 114fc5f0ecc5..ce4ed6ec8f39 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -242,7 +242,8 @@ static int check_leaf_item(struct btrfs_root *root, return ret; } -int btrfs_check_leaf(struct btrfs_root *root, struct extent_buffer *leaf) +static int check_leaf(struct btrfs_root *root, struct extent_buffer *leaf, + bool check_item_data) { struct btrfs_fs_info *fs_info = root->fs_info; /* No valid key type is 0, so all key should be larger than this key */ @@ -361,10 +362,15 @@ int btrfs_check_leaf(struct btrfs_root *root, struct extent_buffer *leaf) return -EUCLEAN; } - /* Check if the item size and content meet other criteria */ - ret = check_leaf_item(root, leaf, &key, slot); - if (ret < 0) - return ret; + if (check_item_data) { + /* + * Check if the item size and content meet other + * criteria + */ + ret = check_leaf_item(root, leaf, &key, slot); + if (ret < 0) + return ret; + } prev_key.objectid = key.objectid; prev_key.type = key.type; @@ -374,6 +380,17 @@ int btrfs_check_leaf(struct btrfs_root *root, struct extent_buffer *leaf) return 0; } +int btrfs_check_leaf_full(struct btrfs_root *root, struct extent_buffer *leaf) +{ + return check_leaf(root, leaf, true); +} + +int btrfs_check_leaf_relaxed(struct btrfs_root *root, + struct extent_buffer *leaf) +{ + return check_leaf(root, leaf, false); +} + int btrfs_check_node(struct btrfs_root *root, struct extent_buffer *node) { unsigned long nr = btrfs_header_nritems(node); diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index 96c486e95d70..3d53e8d6fda0 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -20,7 +20,19 @@ #include "ctree.h" #include "extent_io.h" -int btrfs_check_leaf(struct btrfs_root *root, struct extent_buffer *leaf); +/* + * Comprehensive leaf checker. + * Will check not only the item pointers, but also every possible member + * in item data. + */ +int btrfs_check_leaf_full(struct btrfs_root *root, struct extent_buffer *leaf); + +/* + * Less strict leaf checker. + * Will only check item pointers, not reading item data. + */ +int btrfs_check_leaf_relaxed(struct btrfs_root *root, + struct extent_buffer *leaf); int btrfs_check_node(struct btrfs_root *root, struct extent_buffer *node); #endif diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index aa7c71cff575..7bf9b31561db 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4102,7 +4102,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, if (ordered_io_err) { ctx->io_err = -EIO; - return 0; + return ctx->io_err; } btrfs_init_map_token(&token); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f1ecb938ba4d..49810b70afd3 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -189,6 +189,7 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices) struct btrfs_device, dev_list); list_del(&device->dev_list); rcu_string_free(device->name); + bio_put(device->flush_bio); kfree(device); } kfree(fs_devices); @@ -578,6 +579,7 @@ static void btrfs_free_stale_device(struct btrfs_device *cur_dev) fs_devs->num_devices--; list_del(&dev->dev_list); rcu_string_free(dev->name); + bio_put(dev->flush_bio); kfree(dev); } break; @@ -630,6 +632,7 @@ static noinline int device_list_add(const char *path, name = rcu_string_strdup(path, GFP_NOFS); if (!name) { + bio_put(device->flush_bio); kfree(device); return -ENOMEM; } @@ -742,6 +745,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) name = rcu_string_strdup(orig_dev->name->str, GFP_KERNEL); if (!name) { + bio_put(device->flush_bio); kfree(device); goto error; } @@ -807,6 +811,7 @@ again: list_del_init(&device->dev_list); fs_devices->num_devices--; rcu_string_free(device->name); + bio_put(device->flush_bio); kfree(device); } @@ -1750,20 +1755,24 @@ static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info, key.offset = device->devid; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret < 0) - goto out; - - if (ret > 0) { - ret = -ENOENT; + if (ret) { + if (ret > 0) + ret = -ENOENT; + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); goto out; } ret = btrfs_del_item(trans, root, path); - if (ret) - goto out; + if (ret) { + btrfs_abort_transaction(trans, ret); + btrfs_end_transaction(trans); + } + out: btrfs_free_path(path); - btrfs_commit_transaction(trans); + if (!ret) + ret = btrfs_commit_transaction(trans); return ret; } @@ -1993,7 +2002,7 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info, fs_devices = srcdev->fs_devices; list_del_rcu(&srcdev->dev_list); - list_del_rcu(&srcdev->dev_alloc_list); + list_del(&srcdev->dev_alloc_list); fs_devices->num_devices--; if (srcdev->missing) fs_devices->missing_devices--; @@ -2349,6 +2358,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path name = rcu_string_strdup(device_path, GFP_KERNEL); if (!name) { + bio_put(device->flush_bio); kfree(device); ret = -ENOMEM; goto error; @@ -2358,6 +2368,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { rcu_string_free(device->name); + bio_put(device->flush_bio); kfree(device); ret = PTR_ERR(trans); goto error; @@ -2384,7 +2395,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); if (seeding_dev) { - sb->s_flags &= ~MS_RDONLY; + sb->s_flags &= ~SB_RDONLY; ret = btrfs_prepare_sprout(fs_info); if (ret) { btrfs_abort_transaction(trans, ret); @@ -2497,10 +2508,11 @@ error_sysfs: btrfs_sysfs_rm_device_link(fs_info->fs_devices, device); error_trans: if (seeding_dev) - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; if (trans) btrfs_end_transaction(trans); rcu_string_free(device->name); + bio_put(device->flush_bio); kfree(device); error: blkdev_put(bdev, FMODE_EXCL); @@ -2567,6 +2579,7 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, name = rcu_string_strdup(device_path, GFP_KERNEL); if (!name) { + bio_put(device->flush_bio); kfree(device); ret = -ENOMEM; goto error; @@ -6284,6 +6297,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, ret = find_next_devid(fs_info, &tmp); if (ret) { + bio_put(dev->flush_bio); kfree(dev); return ERR_PTR(ret); } diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index ff5d32cf9578..a14b2c974c9e 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1160,7 +1160,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, struct ceph_inode_info *ci = cap->ci; struct inode *inode = &ci->vfs_inode; struct cap_msg_args arg; - int held, revoking, dropping; + int held, revoking; int wake = 0; int delayed = 0; int ret; @@ -1168,7 +1168,6 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, held = cap->issued | cap->implemented; revoking = cap->implemented & ~cap->issued; retain &= ~revoking; - dropping = cap->issued & ~retain; dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n", inode, cap, cap->session, @@ -1712,7 +1711,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, /* if we are unmounting, flush any unused caps immediately. */ if (mdsc->stopping) - is_delayed = 1; + is_delayed = true; spin_lock(&ci->i_ceph_lock); @@ -3189,8 +3188,8 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, int dirty = le32_to_cpu(m->dirty); int cleaned = 0; bool drop = false; - bool wake_ci = 0; - bool wake_mdsc = 0; + bool wake_ci = false; + bool wake_mdsc = false; list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) { if (cf->tid == flush_tid) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index f2550a076edc..ab81652198c4 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -493,6 +493,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_wb_ref = 0; ci->i_wrbuffer_ref = 0; ci->i_wrbuffer_ref_head = 0; + atomic_set(&ci->i_filelock_ref, 0); ci->i_shared_gen = 0; ci->i_rdcache_gen = 0; ci->i_rdcache_revoking = 0; @@ -786,7 +787,6 @@ static int fill_inode(struct inode *inode, struct page *locked_page, /* update inode */ ci->i_version = le64_to_cpu(info->version); - inode->i_version++; inode->i_rdev = le32_to_cpu(info->rdev); inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; @@ -1185,6 +1185,7 @@ retry_lookup: ceph_snap(d_inode(dn)) != tvino.snap)) { dout(" dn %p points to wrong inode %p\n", dn, d_inode(dn)); + ceph_dir_clear_ordered(dir); d_delete(dn); dput(dn); goto retry_lookup; @@ -1322,6 +1323,7 @@ retry_lookup: dout(" %p links to %p %llx.%llx, not %llx.%llx\n", dn, d_inode(dn), ceph_vinop(d_inode(dn)), ceph_vinop(in)); + ceph_dir_clear_ordered(dir); d_invalidate(dn); have_lease = false; } @@ -1573,6 +1575,7 @@ retry_lookup: ceph_snap(d_inode(dn)) != tvino.snap)) { dout(" dn %p points to wrong inode %p\n", dn, d_inode(dn)); + __ceph_dir_clear_ordered(ci); d_delete(dn); dput(dn); goto retry_lookup; @@ -1597,7 +1600,9 @@ retry_lookup: &req->r_caps_reservation); if (ret < 0) { pr_err("fill_inode badness on %p\n", in); - if (d_really_is_negative(dn)) + if (d_really_is_positive(dn)) + __ceph_dir_clear_ordered(ci); + else iput(in); d_drop(dn); err = ret; diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index e7cce412f2cf..9e66f69ee8a5 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -30,19 +30,52 @@ void __init ceph_flock_init(void) get_random_bytes(&lock_secret, sizeof(lock_secret)); } +static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src) +{ + struct inode *inode = file_inode(src->fl_file); + atomic_inc(&ceph_inode(inode)->i_filelock_ref); +} + +static void ceph_fl_release_lock(struct file_lock *fl) +{ + struct inode *inode = file_inode(fl->fl_file); + struct ceph_inode_info *ci = ceph_inode(inode); + if (atomic_dec_and_test(&ci->i_filelock_ref)) { + /* clear error when all locks are released */ + spin_lock(&ci->i_ceph_lock); + ci->i_ceph_flags &= ~CEPH_I_ERROR_FILELOCK; + spin_unlock(&ci->i_ceph_lock); + } +} + +static const struct file_lock_operations ceph_fl_lock_ops = { + .fl_copy_lock = ceph_fl_copy_lock, + .fl_release_private = ceph_fl_release_lock, +}; + /** * Implement fcntl and flock locking functions. */ -static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, +static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode, int cmd, u8 wait, struct file_lock *fl) { - struct inode *inode = file_inode(file); struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_mds_request *req; int err; u64 length = 0; u64 owner; + if (operation == CEPH_MDS_OP_SETFILELOCK) { + /* + * increasing i_filelock_ref closes race window between + * handling request reply and adding file_lock struct to + * inode. Otherwise, auth caps may get trimmed in the + * window. Caller function will decrease the counter. + */ + fl->fl_ops = &ceph_fl_lock_ops; + atomic_inc(&ceph_inode(inode)->i_filelock_ref); + } + if (operation != CEPH_MDS_OP_SETFILELOCK || cmd == CEPH_LOCK_UNLOCK) wait = 0; @@ -180,10 +213,12 @@ static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc, */ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) { - u8 lock_cmd; - int err; - u8 wait = 0; + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + int err = 0; u16 op = CEPH_MDS_OP_SETFILELOCK; + u8 wait = 0; + u8 lock_cmd; if (!(fl->fl_flags & FL_POSIX)) return -ENOLCK; @@ -199,6 +234,26 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) else if (IS_SETLKW(cmd)) wait = 1; + spin_lock(&ci->i_ceph_lock); + if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) { + err = -EIO; + } else if (op == CEPH_MDS_OP_SETFILELOCK) { + /* + * increasing i_filelock_ref closes race window between + * handling request reply and adding file_lock struct to + * inode. Otherwise, i_auth_cap may get trimmed in the + * window. Caller function will decrease the counter. + */ + fl->fl_ops = &ceph_fl_lock_ops; + atomic_inc(&ci->i_filelock_ref); + } + spin_unlock(&ci->i_ceph_lock); + if (err < 0) { + if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK == fl->fl_type) + posix_lock_file(file, fl, NULL); + return err; + } + if (F_RDLCK == fl->fl_type) lock_cmd = CEPH_LOCK_SHARED; else if (F_WRLCK == fl->fl_type) @@ -206,16 +261,16 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) else lock_cmd = CEPH_LOCK_UNLOCK; - err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, lock_cmd, wait, fl); + err = ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, lock_cmd, wait, fl); if (!err) { - if (op != CEPH_MDS_OP_GETFILELOCK) { + if (op == CEPH_MDS_OP_SETFILELOCK) { dout("mds locked, locking locally"); err = posix_lock_file(file, fl, NULL); - if (err && (CEPH_MDS_OP_SETFILELOCK == op)) { + if (err) { /* undo! This should only happen if * the kernel detects local * deadlock. */ - ceph_lock_message(CEPH_LOCK_FCNTL, op, file, + ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, CEPH_LOCK_UNLOCK, 0, fl); dout("got %d on posix_lock_file, undid lock", err); @@ -227,9 +282,11 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) int ceph_flock(struct file *file, int cmd, struct file_lock *fl) { - u8 lock_cmd; - int err; + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + int err = 0; u8 wait = 0; + u8 lock_cmd; if (!(fl->fl_flags & FL_FLOCK)) return -ENOLCK; @@ -239,6 +296,21 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) dout("ceph_flock, fl_file: %p", fl->fl_file); + spin_lock(&ci->i_ceph_lock); + if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) { + err = -EIO; + } else { + /* see comment in ceph_lock */ + fl->fl_ops = &ceph_fl_lock_ops; + atomic_inc(&ci->i_filelock_ref); + } + spin_unlock(&ci->i_ceph_lock); + if (err < 0) { + if (F_UNLCK == fl->fl_type) + locks_lock_file_wait(file, fl); + return err; + } + if (IS_SETLKW(cmd)) wait = 1; @@ -250,13 +322,13 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) lock_cmd = CEPH_LOCK_UNLOCK; err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, - file, lock_cmd, wait, fl); + inode, lock_cmd, wait, fl); if (!err) { err = locks_lock_file_wait(file, fl); if (err) { ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, - file, CEPH_LOCK_UNLOCK, 0, fl); + inode, CEPH_LOCK_UNLOCK, 0, fl); dout("got %d on locks_lock_file_wait, undid lock", err); } } @@ -288,6 +360,37 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count) *flock_count, *fcntl_count); } +/* + * Given a pointer to a lock, convert it to a ceph filelock + */ +static int lock_to_ceph_filelock(struct file_lock *lock, + struct ceph_filelock *cephlock) +{ + int err = 0; + cephlock->start = cpu_to_le64(lock->fl_start); + cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1); + cephlock->client = cpu_to_le64(0); + cephlock->pid = cpu_to_le64((u64)lock->fl_pid); + cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner)); + + switch (lock->fl_type) { + case F_RDLCK: + cephlock->type = CEPH_LOCK_SHARED; + break; + case F_WRLCK: + cephlock->type = CEPH_LOCK_EXCL; + break; + case F_UNLCK: + cephlock->type = CEPH_LOCK_UNLOCK; + break; + default: + dout("Have unknown lock type %d", lock->fl_type); + err = -EINVAL; + } + + return err; +} + /** * Encode the flock and fcntl locks for the given inode into the ceph_filelock * array. Must be called with inode->i_lock already held. @@ -356,50 +459,22 @@ int ceph_locks_to_pagelist(struct ceph_filelock *flocks, if (err) goto out_fail; - err = ceph_pagelist_append(pagelist, flocks, - num_fcntl_locks * sizeof(*flocks)); - if (err) - goto out_fail; + if (num_fcntl_locks > 0) { + err = ceph_pagelist_append(pagelist, flocks, + num_fcntl_locks * sizeof(*flocks)); + if (err) + goto out_fail; + } nlocks = cpu_to_le32(num_flock_locks); err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks)); if (err) goto out_fail; - err = ceph_pagelist_append(pagelist, - &flocks[num_fcntl_locks], - num_flock_locks * sizeof(*flocks)); -out_fail: - return err; -} - -/* - * Given a pointer to a lock, convert it to a ceph filelock - */ -int lock_to_ceph_filelock(struct file_lock *lock, - struct ceph_filelock *cephlock) -{ - int err = 0; - cephlock->start = cpu_to_le64(lock->fl_start); - cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1); - cephlock->client = cpu_to_le64(0); - cephlock->pid = cpu_to_le64((u64)lock->fl_pid); - cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner)); - - switch (lock->fl_type) { - case F_RDLCK: - cephlock->type = CEPH_LOCK_SHARED; - break; - case F_WRLCK: - cephlock->type = CEPH_LOCK_EXCL; - break; - case F_UNLCK: - cephlock->type = CEPH_LOCK_UNLOCK; - break; - default: - dout("Have unknown lock type %d", lock->fl_type); - err = -EINVAL; + if (num_flock_locks > 0) { + err = ceph_pagelist_append(pagelist, &flocks[num_fcntl_locks], + num_flock_locks * sizeof(*flocks)); } - +out_fail: return err; } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 0687ab3c3267..ab69dcb70e8a 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1039,22 +1039,23 @@ void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, * session caps */ -/* caller holds s_cap_lock, we drop it */ -static void cleanup_cap_releases(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session) - __releases(session->s_cap_lock) +static void detach_cap_releases(struct ceph_mds_session *session, + struct list_head *target) { - LIST_HEAD(tmp_list); - list_splice_init(&session->s_cap_releases, &tmp_list); + lockdep_assert_held(&session->s_cap_lock); + + list_splice_init(&session->s_cap_releases, target); session->s_num_cap_releases = 0; - spin_unlock(&session->s_cap_lock); + dout("dispose_cap_releases mds%d\n", session->s_mds); +} - dout("cleanup_cap_releases mds%d\n", session->s_mds); - while (!list_empty(&tmp_list)) { +static void dispose_cap_releases(struct ceph_mds_client *mdsc, + struct list_head *dispose) +{ + while (!list_empty(dispose)) { struct ceph_cap *cap; /* zero out the in-progress message */ - cap = list_first_entry(&tmp_list, - struct ceph_cap, session_caps); + cap = list_first_entry(dispose, struct ceph_cap, session_caps); list_del(&cap->session_caps); ceph_put_cap(mdsc, cap); } @@ -1215,6 +1216,13 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, } spin_unlock(&mdsc->cap_dirty_lock); + if (atomic_read(&ci->i_filelock_ref) > 0) { + /* make further file lock syscall return -EIO */ + ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK; + pr_warn_ratelimited(" dropping file locks for %p %lld\n", + inode, ceph_ino(inode)); + } + if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) { list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove); ci->i_prealloc_cap_flush = NULL; @@ -1244,6 +1252,8 @@ static void remove_session_caps(struct ceph_mds_session *session) { struct ceph_fs_client *fsc = session->s_mdsc->fsc; struct super_block *sb = fsc->sb; + LIST_HEAD(dispose); + dout("remove_session_caps on %p\n", session); iterate_session_caps(session, remove_session_caps_cb, fsc); @@ -1278,10 +1288,12 @@ static void remove_session_caps(struct ceph_mds_session *session) } // drop cap expires and unlock s_cap_lock - cleanup_cap_releases(session->s_mdsc, session); + detach_cap_releases(session, &dispose); BUG_ON(session->s_nr_caps > 0); BUG_ON(!list_empty(&session->s_cap_flushing)); + spin_unlock(&session->s_cap_lock); + dispose_cap_releases(session->s_mdsc, &dispose); } /* @@ -1462,6 +1474,11 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) goto out; if ((used | wanted) & CEPH_CAP_ANY_WR) goto out; + /* Note: it's possible that i_filelock_ref becomes non-zero + * after dropping auth caps. It doesn't hurt because reply + * of lock mds request will re-add auth caps. */ + if (atomic_read(&ci->i_filelock_ref) > 0) + goto out; } /* The inode has cached pages, but it's no longer used. * we can safely drop it */ @@ -2827,7 +2844,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, struct ceph_mds_cap_reconnect v2; struct ceph_mds_cap_reconnect_v1 v1; } rec; - struct ceph_inode_info *ci; + struct ceph_inode_info *ci = cap->ci; struct ceph_reconnect_state *recon_state = arg; struct ceph_pagelist *pagelist = recon_state->pagelist; char *path; @@ -2836,8 +2853,6 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, u64 snap_follows; struct dentry *dentry; - ci = cap->ci; - dout(" adding %p ino %llx.%llx cap %p %lld %s\n", inode, ceph_vinop(inode), cap, cap->cap_id, ceph_cap_string(cap->issued)); @@ -2870,7 +2885,8 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, rec.v2.issued = cpu_to_le32(cap->issued); rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); rec.v2.pathbase = cpu_to_le64(pathbase); - rec.v2.flock_len = 0; + rec.v2.flock_len = (__force __le32) + ((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1); } else { rec.v1.cap_id = cpu_to_le64(cap->cap_id); rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); @@ -2894,26 +2910,37 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, if (recon_state->msg_version >= 2) { int num_fcntl_locks, num_flock_locks; - struct ceph_filelock *flocks; + struct ceph_filelock *flocks = NULL; size_t struct_len, total_len = 0; u8 struct_v = 0; encode_again: - ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); - flocks = kmalloc((num_fcntl_locks+num_flock_locks) * - sizeof(struct ceph_filelock), GFP_NOFS); - if (!flocks) { - err = -ENOMEM; - goto out_free; + if (rec.v2.flock_len) { + ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); + } else { + num_fcntl_locks = 0; + num_flock_locks = 0; } - err = ceph_encode_locks_to_buffer(inode, flocks, - num_fcntl_locks, - num_flock_locks); - if (err) { + if (num_fcntl_locks + num_flock_locks > 0) { + flocks = kmalloc((num_fcntl_locks + num_flock_locks) * + sizeof(struct ceph_filelock), GFP_NOFS); + if (!flocks) { + err = -ENOMEM; + goto out_free; + } + err = ceph_encode_locks_to_buffer(inode, flocks, + num_fcntl_locks, + num_flock_locks); + if (err) { + kfree(flocks); + flocks = NULL; + if (err == -ENOSPC) + goto encode_again; + goto out_free; + } + } else { kfree(flocks); - if (err == -ENOSPC) - goto encode_again; - goto out_free; + flocks = NULL; } if (recon_state->msg_version >= 3) { @@ -2993,6 +3020,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, int s_nr_caps; struct ceph_pagelist *pagelist; struct ceph_reconnect_state recon_state; + LIST_HEAD(dispose); pr_info("mds%d reconnect start\n", mds); @@ -3026,7 +3054,9 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, */ session->s_cap_reconnect = 1; /* drop old cap expires; we're about to reestablish that state */ - cleanup_cap_releases(mdsc, session); + detach_cap_releases(session, &dispose); + spin_unlock(&session->s_cap_lock); + dispose_cap_releases(mdsc, &dispose); /* trim unused caps to reduce MDS's cache rejoin time */ if (mdsc->fsc->sb->s_root) @@ -3857,14 +3887,14 @@ void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg) goto err_out; } return; + bad: pr_err("error decoding fsmap\n"); err_out: mutex_lock(&mdsc->mutex); - mdsc->mdsmap_err = -ENOENT; + mdsc->mdsmap_err = err; __wake_requests(mdsc, &mdsc->waiting_for_map); mutex_unlock(&mdsc->mutex); - return; } /* diff --git a/fs/ceph/super.c b/fs/ceph/super.c index e4082afedcb1..a62d2a9841dc 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -84,8 +84,9 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_ffree = -1; buf->f_namelen = NAME_MAX; - /* leave fsid little-endian, regardless of host endianness */ - fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1); + /* Must convert the fsid, for consistent values across arches */ + fsid = le64_to_cpu(*(__le64 *)(&monmap->fsid)) ^ + le64_to_cpu(*((__le64 *)&monmap->fsid + 1)); buf->f_fsid.val[0] = fsid & 0xffffffff; buf->f_fsid.val[1] = fsid >> 32; @@ -330,11 +331,11 @@ static int parse_fsopt_token(char *c, void *private) break; #ifdef CONFIG_CEPH_FS_POSIX_ACL case Opt_acl: - fsopt->sb_flags |= MS_POSIXACL; + fsopt->sb_flags |= SB_POSIXACL; break; #endif case Opt_noacl: - fsopt->sb_flags &= ~MS_POSIXACL; + fsopt->sb_flags &= ~SB_POSIXACL; break; default: BUG_ON(token); @@ -519,7 +520,7 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",nopoolperm"); #ifdef CONFIG_CEPH_FS_POSIX_ACL - if (fsopt->sb_flags & MS_POSIXACL) + if (fsopt->sb_flags & SB_POSIXACL) seq_puts(m, ",acl"); else seq_puts(m, ",noacl"); @@ -987,7 +988,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, dout("ceph_mount\n"); #ifdef CONFIG_CEPH_FS_POSIX_ACL - flags |= MS_POSIXACL; + flags |= SB_POSIXACL; #endif err = parse_mount_options(&fsopt, &opt, flags, data, dev_name); if (err < 0) { diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 3e27a28aa44a..2beeec07fa76 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -352,6 +352,7 @@ struct ceph_inode_info { int i_pin_ref; int i_rd_ref, i_rdcache_ref, i_wr_ref, i_wb_ref; int i_wrbuffer_ref, i_wrbuffer_ref_head; + atomic_t i_filelock_ref; u32 i_shared_gen; /* increment each time we get FILE_SHARED */ u32 i_rdcache_gen; /* incremented each time we get FILE_CACHE. */ u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */ @@ -487,6 +488,8 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, #define CEPH_I_KICK_FLUSH (1 << 9) /* kick flushing caps */ #define CEPH_I_FLUSH_SNAPS (1 << 10) /* need flush snapss */ #define CEPH_I_ERROR_WRITE (1 << 11) /* have seen write errors */ +#define CEPH_I_ERROR_FILELOCK (1 << 12) /* have seen file lock errors */ + /* * We set the ERROR_WRITE bit when we start seeing write errors on an inode @@ -1011,7 +1014,6 @@ extern int ceph_encode_locks_to_buffer(struct inode *inode, extern int ceph_locks_to_pagelist(struct ceph_filelock *flocks, struct ceph_pagelist *pagelist, int num_fcntl_locks, int num_flock_locks); -extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c); /* debugfs.c */ extern int ceph_fs_debugfs_init(struct ceph_fs_client *client); diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index cbd216b57239..350fa55a1bf7 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -42,7 +42,7 @@ #define CIFS_MOUNT_MULTIUSER 0x20000 /* multiuser mount */ #define CIFS_MOUNT_STRICT_IO 0x40000 /* strict cache mode */ #define CIFS_MOUNT_RWPIDFORWARD 0x80000 /* use pid forwarding for rw */ -#define CIFS_MOUNT_POSIXACL 0x100000 /* mirror of MS_POSIXACL in mnt_cifs_flags */ +#define CIFS_MOUNT_POSIXACL 0x100000 /* mirror of SB_POSIXACL in mnt_cifs_flags */ #define CIFS_MOUNT_CIFS_BACKUPUID 0x200000 /* backup intent bit for a user */ #define CIFS_MOUNT_CIFS_BACKUPGID 0x400000 /* backup intent bit for a group */ #define CIFS_MOUNT_MAP_SFM_CHR 0x800000 /* SFM/MAC mapping for illegal chars */ diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 8c8b75d33f31..31b7565b1617 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -125,7 +125,7 @@ cifs_read_super(struct super_block *sb) tcon = cifs_sb_master_tcon(cifs_sb); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIXACL) - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; if (tcon->ses->capabilities & tcon->ses->server->vals->cap_large_files) sb->s_maxbytes = MAX_LFS_FILESIZE; @@ -497,7 +497,7 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_puts(s, ",cifsacl"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) seq_puts(s, ",dynperm"); - if (root->d_sb->s_flags & MS_POSIXACL) + if (root->d_sb->s_flags & SB_POSIXACL) seq_puts(s, ",acl"); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) seq_puts(s, ",mfsymlinks"); @@ -573,7 +573,7 @@ static int cifs_show_stats(struct seq_file *s, struct dentry *root) static int cifs_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - *flags |= MS_NODIRATIME; + *flags |= SB_NODIRATIME; return 0; } @@ -708,7 +708,7 @@ cifs_do_mount(struct file_system_type *fs_type, rc = cifs_mount(cifs_sb, volume_info); if (rc) { - if (!(flags & MS_SILENT)) + if (!(flags & SB_SILENT)) cifs_dbg(VFS, "cifs_mount failed w/return code = %d\n", rc); root = ERR_PTR(rc); @@ -720,7 +720,7 @@ cifs_do_mount(struct file_system_type *fs_type, mnt_data.flags = flags; /* BB should we make this contingent on mount parm? */ - flags |= MS_NODIRATIME | MS_NOATIME; + flags |= SB_NODIRATIME | SB_NOATIME; sb = sget(fs_type, cifs_match_super, cifs_set_super, flags, &mnt_data); if (IS_ERR(sb)) { @@ -739,7 +739,7 @@ cifs_do_mount(struct file_system_type *fs_type, goto out_super; } - sb->s_flags |= MS_ACTIVE; + sb->s_flags |= SB_ACTIVE; } root = cifs_get_root(volume_info, sb); diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index e185b2853eab..b16583594d1a 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -559,8 +559,8 @@ struct smb_vol { CIFS_MOUNT_MULTIUSER | CIFS_MOUNT_STRICT_IO | \ CIFS_MOUNT_CIFS_BACKUPUID | CIFS_MOUNT_CIFS_BACKUPGID) -#define CIFS_MS_MASK (MS_RDONLY | MS_MANDLOCK | MS_NOEXEC | MS_NOSUID | \ - MS_NODEV | MS_SYNCHRONOUS) +#define CIFS_MS_MASK (SB_RDONLY | SB_MANDLOCK | SB_NOEXEC | SB_NOSUID | \ + SB_NODEV | SB_SYNCHRONOUS) struct cifs_mnt_data { struct cifs_sb_info *cifs_sb; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 7c732cb44164..ecb99079363a 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -985,7 +985,7 @@ retry_iget5_locked: } cifs_fattr_to_inode(inode, fattr); - if (sb->s_flags & MS_NOATIME) + if (sb->s_flags & SB_NOATIME) inode->i_flags |= S_NOATIME | S_NOCMTIME; if (inode->i_state & I_NEW) { inode->i_ino = hash; diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 52f975d848a0..316af84674f1 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -117,7 +117,7 @@ static int cifs_xattr_set(const struct xattr_handler *handler, #ifdef CONFIG_CIFS_POSIX if (!value) goto out; - if (sb->s_flags & MS_POSIXACL) + if (sb->s_flags & SB_POSIXACL) rc = CIFSSMBSetPosixACL(xid, pTcon, full_path, value, (const int)size, ACL_TYPE_ACCESS, cifs_sb->local_nls, @@ -129,7 +129,7 @@ static int cifs_xattr_set(const struct xattr_handler *handler, #ifdef CONFIG_CIFS_POSIX if (!value) goto out; - if (sb->s_flags & MS_POSIXACL) + if (sb->s_flags & SB_POSIXACL) rc = CIFSSMBSetPosixACL(xid, pTcon, full_path, value, (const int)size, ACL_TYPE_DEFAULT, cifs_sb->local_nls, @@ -266,7 +266,7 @@ static int cifs_xattr_get(const struct xattr_handler *handler, case XATTR_ACL_ACCESS: #ifdef CONFIG_CIFS_POSIX - if (sb->s_flags & MS_POSIXACL) + if (sb->s_flags & SB_POSIXACL) rc = CIFSSMBGetPosixACL(xid, pTcon, full_path, value, size, ACL_TYPE_ACCESS, cifs_sb->local_nls, @@ -276,7 +276,7 @@ static int cifs_xattr_get(const struct xattr_handler *handler, case XATTR_ACL_DEFAULT: #ifdef CONFIG_CIFS_POSIX - if (sb->s_flags & MS_POSIXACL) + if (sb->s_flags & SB_POSIXACL) rc = CIFSSMBGetPosixACL(xid, pTcon, full_path, value, size, ACL_TYPE_DEFAULT, cifs_sb->local_nls, diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 6f0a6a4d5faa..97424cf206c0 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -96,7 +96,7 @@ void coda_destroy_inodecache(void) static int coda_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - *flags |= MS_NOATIME; + *flags |= SB_NOATIME; return 0; } @@ -188,7 +188,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent) mutex_unlock(&vc->vc_mutex); sb->s_fs_info = vc; - sb->s_flags |= MS_NOATIME; + sb->s_flags |= SB_NOATIME; sb->s_blocksize = 4096; /* XXXXX what do we put here?? */ sb->s_blocksize_bits = 12; sb->s_magic = CODA_SUPER_MAGIC; diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index a37f003530d7..1175a1722411 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c @@ -447,8 +447,7 @@ int venus_fsync(struct super_block *sb, struct CodaFid *fid) UPARG(CODA_FSYNC); inp->coda_fsync.VFid = *fid; - error = coda_upcall(coda_vcp(sb), sizeof(union inputArgs), - &outsize, inp); + error = coda_upcall(coda_vcp(sb), insize, &outsize, inp); CODA_FREE(inp, insize); return error; diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index bd5d91e119ca..5fc5dc660600 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -54,8 +54,6 @@ #include <linux/if_tun.h> #include <linux/ctype.h> #include <linux/syscalls.h> -#include <linux/i2c.h> -#include <linux/i2c-dev.h> #include <linux/atalk.h> #include <linux/gfp.h> #include <linux/cec.h> @@ -137,22 +135,6 @@ static int do_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return vfs_ioctl(file, cmd, arg); } -static int w_long(struct file *file, - unsigned int cmd, compat_ulong_t __user *argp) -{ - int err; - unsigned long __user *valp = compat_alloc_user_space(sizeof(*valp)); - - if (valp == NULL) - return -EFAULT; - err = do_ioctl(file, cmd, (unsigned long)valp); - if (err) - return err; - if (convert_in_user(valp, argp)) - return -EFAULT; - return 0; -} - struct compat_video_event { int32_t type; compat_time_t timestamp; @@ -671,96 +653,6 @@ static int serial_struct_ioctl(struct file *file, return err; } -/* - * I2C layer ioctls - */ - -struct i2c_msg32 { - u16 addr; - u16 flags; - u16 len; - compat_caddr_t buf; -}; - -struct i2c_rdwr_ioctl_data32 { - compat_caddr_t msgs; /* struct i2c_msg __user *msgs */ - u32 nmsgs; -}; - -struct i2c_smbus_ioctl_data32 { - u8 read_write; - u8 command; - u32 size; - compat_caddr_t data; /* union i2c_smbus_data *data */ -}; - -struct i2c_rdwr_aligned { - struct i2c_rdwr_ioctl_data cmd; - struct i2c_msg msgs[0]; -}; - -static int do_i2c_rdwr_ioctl(struct file *file, - unsigned int cmd, struct i2c_rdwr_ioctl_data32 __user *udata) -{ - struct i2c_rdwr_aligned __user *tdata; - struct i2c_msg __user *tmsgs; - struct i2c_msg32 __user *umsgs; - compat_caddr_t datap; - u32 nmsgs; - int i; - - if (get_user(nmsgs, &udata->nmsgs)) - return -EFAULT; - if (nmsgs > I2C_RDWR_IOCTL_MAX_MSGS) - return -EINVAL; - - if (get_user(datap, &udata->msgs)) - return -EFAULT; - umsgs = compat_ptr(datap); - - tdata = compat_alloc_user_space(sizeof(*tdata) + - nmsgs * sizeof(struct i2c_msg)); - tmsgs = &tdata->msgs[0]; - - if (put_user(nmsgs, &tdata->cmd.nmsgs) || - put_user(tmsgs, &tdata->cmd.msgs)) - return -EFAULT; - - for (i = 0; i < nmsgs; i++) { - if (copy_in_user(&tmsgs[i].addr, &umsgs[i].addr, 3*sizeof(u16))) - return -EFAULT; - if (get_user(datap, &umsgs[i].buf) || - put_user(compat_ptr(datap), &tmsgs[i].buf)) - return -EFAULT; - } - return do_ioctl(file, cmd, (unsigned long)tdata); -} - -static int do_i2c_smbus_ioctl(struct file *file, - unsigned int cmd, struct i2c_smbus_ioctl_data32 __user *udata) -{ - struct i2c_smbus_ioctl_data __user *tdata; - union { - /* beginnings of those have identical layouts */ - struct i2c_smbus_ioctl_data32 data32; - struct i2c_smbus_ioctl_data data; - } v; - - tdata = compat_alloc_user_space(sizeof(*tdata)); - if (tdata == NULL) - return -ENOMEM; - - memset(&v, 0, sizeof(v)); - if (copy_from_user(&v.data32, udata, sizeof(v.data32))) - return -EFAULT; - v.data.data = compat_ptr(v.data32.data); - - if (copy_to_user(tdata, &v.data, sizeof(v.data))) - return -EFAULT; - - return do_ioctl(file, cmd, (unsigned long)tdata); -} - #define RTC_IRQP_READ32 _IOR('p', 0x0b, compat_ulong_t) #define RTC_IRQP_SET32 _IOW('p', 0x0c, compat_ulong_t) #define RTC_EPOCH_READ32 _IOR('p', 0x0d, compat_ulong_t) @@ -1283,13 +1175,6 @@ COMPATIBLE_IOCTL(PCIIOC_CONTROLLER) COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_IO) COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_MEM) COMPATIBLE_IOCTL(PCIIOC_WRITE_COMBINE) -/* i2c */ -COMPATIBLE_IOCTL(I2C_SLAVE) -COMPATIBLE_IOCTL(I2C_SLAVE_FORCE) -COMPATIBLE_IOCTL(I2C_TENBIT) -COMPATIBLE_IOCTL(I2C_PEC) -COMPATIBLE_IOCTL(I2C_RETRIES) -COMPATIBLE_IOCTL(I2C_TIMEOUT) /* hiddev */ COMPATIBLE_IOCTL(HIDIOCGVERSION) COMPATIBLE_IOCTL(HIDIOCAPPLICATION) @@ -1464,13 +1349,6 @@ static long do_ioctl_trans(unsigned int cmd, case TIOCGSERIAL: case TIOCSSERIAL: return serial_struct_ioctl(file, cmd, argp); - /* i2c */ - case I2C_FUNCS: - return w_long(file, cmd, argp); - case I2C_RDWR: - return do_i2c_rdwr_ioctl(file, cmd, argp); - case I2C_SMBUS: - return do_i2c_smbus_ioctl(file, cmd, argp); /* Not implemented in the native kernel */ case RTC_IRQP_READ32: case RTC_IRQP_SET32: @@ -1580,6 +1458,7 @@ COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, case FICLONE: case FICLONERANGE: case FIDEDUPERANGE: + case FS_IOC_FIEMAP: goto do_ioctl; case FIBMAP: diff --git a/fs/coredump.c b/fs/coredump.c index 52c63d6c9143..1e2c87acac9b 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -680,16 +680,11 @@ void do_coredump(const siginfo_t *siginfo) * privs and don't want to unlink another user's coredump. */ if (!need_suid_safe) { - mm_segment_t old_fs; - - old_fs = get_fs(); - set_fs(KERNEL_DS); /* * If it doesn't exist, that's fine. If there's some * other problem, we'll catch it at the filp_open(). */ - (void) sys_unlink((const char __user *)cn.corename); - set_fs(old_fs); + do_unlinkat(AT_FDCWD, getname_kernel(cn.corename)); } /* diff --git a/fs/cramfs/Kconfig b/fs/cramfs/Kconfig index 11b29d491b7c..f937082f3244 100644 --- a/fs/cramfs/Kconfig +++ b/fs/cramfs/Kconfig @@ -1,6 +1,5 @@ config CRAMFS - tristate "Compressed ROM file system support (cramfs) (OBSOLETE)" - depends on BLOCK + tristate "Compressed ROM file system support (cramfs)" select ZLIB_INFLATE help Saying Y here includes support for CramFs (Compressed ROM File @@ -16,7 +15,39 @@ config CRAMFS cramfs. Note that the root file system (the one containing the directory /) cannot be compiled as a module. - This filesystem is obsoleted by SquashFS, which is much better - in terms of performance and features. + This filesystem is limited in capabilities and performance on + purpose to remain small and low on RAM usage. It is most suitable + for small embedded systems. If you have ample RAM to spare, you may + consider a more capable compressed filesystem such as SquashFS + which is much better in terms of performance and features. + + If unsure, say N. + +config CRAMFS_BLOCKDEV + bool "Support CramFs image over a regular block device" if EXPERT + depends on CRAMFS && BLOCK + default y + help + This option allows the CramFs driver to load data from a regular + block device such a disk partition or a ramdisk. + +config CRAMFS_MTD + bool "Support CramFs image directly mapped in physical memory" + depends on CRAMFS && MTD + default y if !CRAMFS_BLOCKDEV + help + This option allows the CramFs driver to load data directly from + a linear adressed memory range (usually non volatile memory + like flash) instead of going through the block device layer. + This saves some memory since no intermediate buffering is + necessary. + + The location of the CramFs image is determined by a + MTD device capable of direct memory mapping e.g. from + the 'physmap' map driver or a resulting MTD partition. + For example, this would mount the cramfs image stored in + the MTD partition named "xip_fs" on the /mnt mountpoint: + + mount -t cramfs mtd:xip_fs /mnt If unsure, say N. diff --git a/fs/cramfs/README b/fs/cramfs/README index 9d4e7ea311f4..d71b27e0ff15 100644 --- a/fs/cramfs/README +++ b/fs/cramfs/README @@ -49,17 +49,46 @@ same as the start of the (i+1)'th <block> if there is one). The first <block> immediately follows the last <block_pointer> for the file. <block_pointer>s are each 32 bits long. +When the CRAMFS_FLAG_EXT_BLOCK_POINTERS capability bit is set, each +<block_pointer>'s top bits may contain special flags as follows: + +CRAMFS_BLK_FLAG_UNCOMPRESSED (bit 31): + The block data is not compressed and should be copied verbatim. + +CRAMFS_BLK_FLAG_DIRECT_PTR (bit 30): + The <block_pointer> stores the actual block start offset and not + its end, shifted right by 2 bits. The block must therefore be + aligned to a 4-byte boundary. The block size is either blksize + if CRAMFS_BLK_FLAG_UNCOMPRESSED is also specified, otherwise + the compressed data length is included in the first 2 bytes of + the block data. This is used to allow discontiguous data layout + and specific data block alignments e.g. for XIP applications. + + The order of <file_data>'s is a depth-first descent of the directory tree, i.e. the same order as `find -size +0 \( -type f -o -type l \) -print'. <block>: The i'th <block> is the output of zlib's compress function -applied to the i'th blksize-sized chunk of the input data. +applied to the i'th blksize-sized chunk of the input data if the +corresponding CRAMFS_BLK_FLAG_UNCOMPRESSED <block_ptr> bit is not set, +otherwise it is the input data directly. (For the last <block> of the file, the input may of course be smaller.) Each <block> may be a different size. (See <block_pointer> above.) + <block>s are merely byte-aligned, not generally u32-aligned. +When CRAMFS_BLK_FLAG_DIRECT_PTR is specified then the corresponding +<block> may be located anywhere and not necessarily contiguous with +the previous/next blocks. In that case it is minimally u32-aligned. +If CRAMFS_BLK_FLAG_UNCOMPRESSED is also specified then the size is always +blksize except for the last block which is limited by the file length. +If CRAMFS_BLK_FLAG_DIRECT_PTR is set and CRAMFS_BLK_FLAG_UNCOMPRESSED +is not set then the first 2 bytes of the block contains the size of the +remaining block data as this cannot be determined from the placement of +logically adjacent blocks. + Holes ----- diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 7919967488cb..017b0ab19bc4 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -15,10 +15,15 @@ #include <linux/module.h> #include <linux/fs.h> +#include <linux/file.h> #include <linux/pagemap.h> +#include <linux/pfn_t.h> +#include <linux/ramfs.h> #include <linux/init.h> #include <linux/string.h> #include <linux/blkdev.h> +#include <linux/mtd/mtd.h> +#include <linux/mtd/super.h> #include <linux/slab.h> #include <linux/vfs.h> #include <linux/mutex.h> @@ -36,6 +41,9 @@ struct cramfs_sb_info { unsigned long blocks; unsigned long files; unsigned long flags; + void *linear_virt_addr; + resource_size_t linear_phys_addr; + size_t mtd_point_size; }; static inline struct cramfs_sb_info *CRAMFS_SB(struct super_block *sb) @@ -46,6 +54,7 @@ static inline struct cramfs_sb_info *CRAMFS_SB(struct super_block *sb) static const struct super_operations cramfs_ops; static const struct inode_operations cramfs_dir_inode_operations; static const struct file_operations cramfs_directory_operations; +static const struct file_operations cramfs_physmem_fops; static const struct address_space_operations cramfs_aops; static DEFINE_MUTEX(read_mutex); @@ -93,6 +102,10 @@ static struct inode *get_cramfs_inode(struct super_block *sb, case S_IFREG: inode->i_fop = &generic_ro_fops; inode->i_data.a_ops = &cramfs_aops; + if (IS_ENABLED(CONFIG_CRAMFS_MTD) && + CRAMFS_SB(sb)->flags & CRAMFS_FLAG_EXT_BLOCK_POINTERS && + CRAMFS_SB(sb)->linear_phys_addr) + inode->i_fop = &cramfs_physmem_fops; break; case S_IFDIR: inode->i_op = &cramfs_dir_inode_operations; @@ -140,6 +153,9 @@ static struct inode *get_cramfs_inode(struct super_block *sb, * BLKS_PER_BUF*PAGE_SIZE, so that the caller doesn't need to * worry about end-of-buffer issues even when decompressing a full * page cache. + * + * Note: This is all optimized away at compile time when + * CONFIG_CRAMFS_BLOCKDEV=n. */ #define READ_BUFFERS (2) /* NEXT_BUFFER(): Loop over [0..(READ_BUFFERS-1)]. */ @@ -160,10 +176,10 @@ static struct super_block *buffer_dev[READ_BUFFERS]; static int next_buffer; /* - * Returns a pointer to a buffer containing at least LEN bytes of - * filesystem starting at byte offset OFFSET into the filesystem. + * Populate our block cache and return a pointer to it. */ -static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned int len) +static void *cramfs_blkdev_read(struct super_block *sb, unsigned int offset, + unsigned int len) { struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; struct page *pages[BLKS_PER_BUF]; @@ -239,49 +255,278 @@ static void *cramfs_read(struct super_block *sb, unsigned int offset, unsigned i return read_buffers[buffer] + offset; } +/* + * Return a pointer to the linearly addressed cramfs image in memory. + */ +static void *cramfs_direct_read(struct super_block *sb, unsigned int offset, + unsigned int len) +{ + struct cramfs_sb_info *sbi = CRAMFS_SB(sb); + + if (!len) + return NULL; + if (len > sbi->size || offset > sbi->size - len) + return page_address(ZERO_PAGE(0)); + return sbi->linear_virt_addr + offset; +} + +/* + * Returns a pointer to a buffer containing at least LEN bytes of + * filesystem starting at byte offset OFFSET into the filesystem. + */ +static void *cramfs_read(struct super_block *sb, unsigned int offset, + unsigned int len) +{ + struct cramfs_sb_info *sbi = CRAMFS_SB(sb); + + if (IS_ENABLED(CONFIG_CRAMFS_MTD) && sbi->linear_virt_addr) + return cramfs_direct_read(sb, offset, len); + else if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV)) + return cramfs_blkdev_read(sb, offset, len); + else + return NULL; +} + +/* + * For a mapping to be possible, we need a range of uncompressed and + * contiguous blocks. Return the offset for the first block and number of + * valid blocks for which that is true, or zero otherwise. + */ +static u32 cramfs_get_block_range(struct inode *inode, u32 pgoff, u32 *pages) +{ + struct cramfs_sb_info *sbi = CRAMFS_SB(inode->i_sb); + int i; + u32 *blockptrs, first_block_addr; + + /* + * We can dereference memory directly here as this code may be + * reached only when there is a direct filesystem image mapping + * available in memory. + */ + blockptrs = (u32 *)(sbi->linear_virt_addr + OFFSET(inode) + pgoff * 4); + first_block_addr = blockptrs[0] & ~CRAMFS_BLK_FLAGS; + i = 0; + do { + u32 block_off = i * (PAGE_SIZE >> CRAMFS_BLK_DIRECT_PTR_SHIFT); + u32 expect = (first_block_addr + block_off) | + CRAMFS_BLK_FLAG_DIRECT_PTR | + CRAMFS_BLK_FLAG_UNCOMPRESSED; + if (blockptrs[i] != expect) { + pr_debug("range: block %d/%d got %#x expects %#x\n", + pgoff+i, pgoff + *pages - 1, + blockptrs[i], expect); + if (i == 0) + return 0; + break; + } + } while (++i < *pages); + + *pages = i; + return first_block_addr << CRAMFS_BLK_DIRECT_PTR_SHIFT; +} + +#ifdef CONFIG_MMU + +/* + * Return true if the last page of a file in the filesystem image contains + * some other data that doesn't belong to that file. It is assumed that the + * last block is CRAMFS_BLK_FLAG_DIRECT_PTR | CRAMFS_BLK_FLAG_UNCOMPRESSED + * (verified by cramfs_get_block_range() and directly accessible in memory. + */ +static bool cramfs_last_page_is_shared(struct inode *inode) +{ + struct cramfs_sb_info *sbi = CRAMFS_SB(inode->i_sb); + u32 partial, last_page, blockaddr, *blockptrs; + char *tail_data; + + partial = offset_in_page(inode->i_size); + if (!partial) + return false; + last_page = inode->i_size >> PAGE_SHIFT; + blockptrs = (u32 *)(sbi->linear_virt_addr + OFFSET(inode)); + blockaddr = blockptrs[last_page] & ~CRAMFS_BLK_FLAGS; + blockaddr <<= CRAMFS_BLK_DIRECT_PTR_SHIFT; + tail_data = sbi->linear_virt_addr + blockaddr + partial; + return memchr_inv(tail_data, 0, PAGE_SIZE - partial) ? true : false; +} + +static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct inode *inode = file_inode(file); + struct cramfs_sb_info *sbi = CRAMFS_SB(inode->i_sb); + unsigned int pages, max_pages, offset; + unsigned long address, pgoff = vma->vm_pgoff; + char *bailout_reason; + int ret; + + ret = generic_file_readonly_mmap(file, vma); + if (ret) + return ret; + + /* + * Now try to pre-populate ptes for this vma with a direct + * mapping avoiding memory allocation when possible. + */ + + /* Could COW work here? */ + bailout_reason = "vma is writable"; + if (vma->vm_flags & VM_WRITE) + goto bailout; + + max_pages = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT; + bailout_reason = "beyond file limit"; + if (pgoff >= max_pages) + goto bailout; + pages = min(vma_pages(vma), max_pages - pgoff); + + offset = cramfs_get_block_range(inode, pgoff, &pages); + bailout_reason = "unsuitable block layout"; + if (!offset) + goto bailout; + address = sbi->linear_phys_addr + offset; + bailout_reason = "data is not page aligned"; + if (!PAGE_ALIGNED(address)) + goto bailout; + + /* Don't map the last page if it contains some other data */ + if (pgoff + pages == max_pages && cramfs_last_page_is_shared(inode)) { + pr_debug("mmap: %s: last page is shared\n", + file_dentry(file)->d_name.name); + pages--; + } + + if (!pages) { + bailout_reason = "no suitable block remaining"; + goto bailout; + } + + if (pages == vma_pages(vma)) { + /* + * The entire vma is mappable. remap_pfn_range() will + * make it distinguishable from a non-direct mapping + * in /proc/<pid>/maps by substituting the file offset + * with the actual physical address. + */ + ret = remap_pfn_range(vma, vma->vm_start, address >> PAGE_SHIFT, + pages * PAGE_SIZE, vma->vm_page_prot); + } else { + /* + * Let's create a mixed map if we can't map it all. + * The normal paging machinery will take care of the + * unpopulated ptes via cramfs_readpage(). + */ + int i; + vma->vm_flags |= VM_MIXEDMAP; + for (i = 0; i < pages && !ret; i++) { + unsigned long off = i * PAGE_SIZE; + pfn_t pfn = phys_to_pfn_t(address + off, PFN_DEV); + ret = vm_insert_mixed(vma, vma->vm_start + off, pfn); + } + } + + if (!ret) + pr_debug("mapped %s[%lu] at 0x%08lx (%u/%lu pages) " + "to vma 0x%08lx, page_prot 0x%llx\n", + file_dentry(file)->d_name.name, pgoff, + address, pages, vma_pages(vma), vma->vm_start, + (unsigned long long)pgprot_val(vma->vm_page_prot)); + return ret; + +bailout: + pr_debug("%s[%lu]: direct mmap impossible: %s\n", + file_dentry(file)->d_name.name, pgoff, bailout_reason); + /* Didn't manage any direct map, but normal paging is still possible */ + return 0; +} + +#else /* CONFIG_MMU */ + +static int cramfs_physmem_mmap(struct file *file, struct vm_area_struct *vma) +{ + return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -ENOSYS; +} + +static unsigned long cramfs_physmem_get_unmapped_area(struct file *file, + unsigned long addr, unsigned long len, + unsigned long pgoff, unsigned long flags) +{ + struct inode *inode = file_inode(file); + struct super_block *sb = inode->i_sb; + struct cramfs_sb_info *sbi = CRAMFS_SB(sb); + unsigned int pages, block_pages, max_pages, offset; + + pages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; + max_pages = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (pgoff >= max_pages || pages > max_pages - pgoff) + return -EINVAL; + block_pages = pages; + offset = cramfs_get_block_range(inode, pgoff, &block_pages); + if (!offset || block_pages != pages) + return -ENOSYS; + addr = sbi->linear_phys_addr + offset; + pr_debug("get_unmapped for %s ofs %#lx siz %lu at 0x%08lx\n", + file_dentry(file)->d_name.name, pgoff*PAGE_SIZE, len, addr); + return addr; +} + +static unsigned int cramfs_physmem_mmap_capabilities(struct file *file) +{ + return NOMMU_MAP_COPY | NOMMU_MAP_DIRECT | + NOMMU_MAP_READ | NOMMU_MAP_EXEC; +} + +#endif /* CONFIG_MMU */ + +static const struct file_operations cramfs_physmem_fops = { + .llseek = generic_file_llseek, + .read_iter = generic_file_read_iter, + .splice_read = generic_file_splice_read, + .mmap = cramfs_physmem_mmap, +#ifndef CONFIG_MMU + .get_unmapped_area = cramfs_physmem_get_unmapped_area, + .mmap_capabilities = cramfs_physmem_mmap_capabilities, +#endif +}; + static void cramfs_kill_sb(struct super_block *sb) { struct cramfs_sb_info *sbi = CRAMFS_SB(sb); - kill_block_super(sb); + if (IS_ENABLED(CCONFIG_CRAMFS_MTD) && sb->s_mtd) { + if (sbi && sbi->mtd_point_size) + mtd_unpoint(sb->s_mtd, 0, sbi->mtd_point_size); + kill_mtd_super(sb); + } else if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV) && sb->s_bdev) { + kill_block_super(sb); + } kfree(sbi); } static int cramfs_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - *flags |= MS_RDONLY; + *flags |= SB_RDONLY; return 0; } -static int cramfs_fill_super(struct super_block *sb, void *data, int silent) +static int cramfs_read_super(struct super_block *sb, + struct cramfs_super *super, int silent) { - int i; - struct cramfs_super super; + struct cramfs_sb_info *sbi = CRAMFS_SB(sb); unsigned long root_offset; - struct cramfs_sb_info *sbi; - struct inode *root; - - sb->s_flags |= MS_RDONLY; - - sbi = kzalloc(sizeof(struct cramfs_sb_info), GFP_KERNEL); - if (!sbi) - return -ENOMEM; - sb->s_fs_info = sbi; - /* Invalidate the read buffers on mount: think disk change.. */ - mutex_lock(&read_mutex); - for (i = 0; i < READ_BUFFERS; i++) - buffer_blocknr[i] = -1; + /* We don't know the real size yet */ + sbi->size = PAGE_SIZE; /* Read the first block and get the superblock from it */ - memcpy(&super, cramfs_read(sb, 0, sizeof(super)), sizeof(super)); + mutex_lock(&read_mutex); + memcpy(super, cramfs_read(sb, 0, sizeof(*super)), sizeof(*super)); mutex_unlock(&read_mutex); /* Do sanity checks on the superblock */ - if (super.magic != CRAMFS_MAGIC) { + if (super->magic != CRAMFS_MAGIC) { /* check for wrong endianness */ - if (super.magic == CRAMFS_MAGIC_WEND) { + if (super->magic == CRAMFS_MAGIC_WEND) { if (!silent) pr_err("wrong endianness\n"); return -EINVAL; @@ -289,10 +534,12 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent) /* check at 512 byte offset */ mutex_lock(&read_mutex); - memcpy(&super, cramfs_read(sb, 512, sizeof(super)), sizeof(super)); + memcpy(super, + cramfs_read(sb, 512, sizeof(*super)), + sizeof(*super)); mutex_unlock(&read_mutex); - if (super.magic != CRAMFS_MAGIC) { - if (super.magic == CRAMFS_MAGIC_WEND && !silent) + if (super->magic != CRAMFS_MAGIC) { + if (super->magic == CRAMFS_MAGIC_WEND && !silent) pr_err("wrong endianness\n"); else if (!silent) pr_err("wrong magic\n"); @@ -301,34 +548,34 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent) } /* get feature flags first */ - if (super.flags & ~CRAMFS_SUPPORTED_FLAGS) { + if (super->flags & ~CRAMFS_SUPPORTED_FLAGS) { pr_err("unsupported filesystem features\n"); return -EINVAL; } /* Check that the root inode is in a sane state */ - if (!S_ISDIR(super.root.mode)) { + if (!S_ISDIR(super->root.mode)) { pr_err("root is not a directory\n"); return -EINVAL; } /* correct strange, hard-coded permissions of mkcramfs */ - super.root.mode |= (S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH); + super->root.mode |= 0555; - root_offset = super.root.offset << 2; - if (super.flags & CRAMFS_FLAG_FSID_VERSION_2) { - sbi->size = super.size; - sbi->blocks = super.fsid.blocks; - sbi->files = super.fsid.files; + root_offset = super->root.offset << 2; + if (super->flags & CRAMFS_FLAG_FSID_VERSION_2) { + sbi->size = super->size; + sbi->blocks = super->fsid.blocks; + sbi->files = super->fsid.files; } else { sbi->size = 1<<28; sbi->blocks = 0; sbi->files = 0; } - sbi->magic = super.magic; - sbi->flags = super.flags; + sbi->magic = super->magic; + sbi->flags = super->flags; if (root_offset == 0) pr_info("empty filesystem"); - else if (!(super.flags & CRAMFS_FLAG_SHIFTED_ROOT_OFFSET) && + else if (!(super->flags & CRAMFS_FLAG_SHIFTED_ROOT_OFFSET) && ((root_offset != sizeof(struct cramfs_super)) && (root_offset != 512 + sizeof(struct cramfs_super)))) { @@ -336,9 +583,18 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent) return -EINVAL; } + return 0; +} + +static int cramfs_finalize_super(struct super_block *sb, + struct cramfs_inode *cramfs_root) +{ + struct inode *root; + /* Set it all up.. */ + sb->s_flags |= SB_RDONLY; sb->s_op = &cramfs_ops; - root = get_cramfs_inode(sb, &super.root, 0); + root = get_cramfs_inode(sb, cramfs_root, 0); if (IS_ERR(root)) return PTR_ERR(root); sb->s_root = d_make_root(root); @@ -347,10 +603,79 @@ static int cramfs_fill_super(struct super_block *sb, void *data, int silent) return 0; } +static int cramfs_blkdev_fill_super(struct super_block *sb, void *data, + int silent) +{ + struct cramfs_sb_info *sbi; + struct cramfs_super super; + int i, err; + + sbi = kzalloc(sizeof(struct cramfs_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + sb->s_fs_info = sbi; + + /* Invalidate the read buffers on mount: think disk change.. */ + for (i = 0; i < READ_BUFFERS; i++) + buffer_blocknr[i] = -1; + + err = cramfs_read_super(sb, &super, silent); + if (err) + return err; + return cramfs_finalize_super(sb, &super.root); +} + +static int cramfs_mtd_fill_super(struct super_block *sb, void *data, + int silent) +{ + struct cramfs_sb_info *sbi; + struct cramfs_super super; + int err; + + sbi = kzalloc(sizeof(struct cramfs_sb_info), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + sb->s_fs_info = sbi; + + /* Map only one page for now. Will remap it when fs size is known. */ + err = mtd_point(sb->s_mtd, 0, PAGE_SIZE, &sbi->mtd_point_size, + &sbi->linear_virt_addr, &sbi->linear_phys_addr); + if (err || sbi->mtd_point_size != PAGE_SIZE) { + pr_err("unable to get direct memory access to mtd:%s\n", + sb->s_mtd->name); + return err ? : -ENODATA; + } + + pr_info("checking physical address %pap for linear cramfs image\n", + &sbi->linear_phys_addr); + err = cramfs_read_super(sb, &super, silent); + if (err) + return err; + + /* Remap the whole filesystem now */ + pr_info("linear cramfs image on mtd:%s appears to be %lu KB in size\n", + sb->s_mtd->name, sbi->size/1024); + mtd_unpoint(sb->s_mtd, 0, PAGE_SIZE); + err = mtd_point(sb->s_mtd, 0, sbi->size, &sbi->mtd_point_size, + &sbi->linear_virt_addr, &sbi->linear_phys_addr); + if (err || sbi->mtd_point_size != sbi->size) { + pr_err("unable to get direct memory access to mtd:%s\n", + sb->s_mtd->name); + return err ? : -ENODATA; + } + + return cramfs_finalize_super(sb, &super.root); +} + static int cramfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; - u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + u64 id = 0; + + if (sb->s_bdev) + id = huge_encode_dev(sb->s_bdev->bd_dev); + else if (sb->s_dev) + id = huge_encode_dev(sb->s_dev); buf->f_type = CRAMFS_MAGIC; buf->f_bsize = PAGE_SIZE; @@ -502,34 +827,86 @@ static int cramfs_readpage(struct file *file, struct page *page) if (page->index < maxblock) { struct super_block *sb = inode->i_sb; - u32 blkptr_offset = OFFSET(inode) + page->index*4; - u32 start_offset, compr_len; + u32 blkptr_offset = OFFSET(inode) + page->index * 4; + u32 block_ptr, block_start, block_len; + bool uncompressed, direct; - start_offset = OFFSET(inode) + maxblock*4; mutex_lock(&read_mutex); - if (page->index) - start_offset = *(u32 *) cramfs_read(sb, blkptr_offset-4, - 4); - compr_len = (*(u32 *) cramfs_read(sb, blkptr_offset, 4) - - start_offset); - mutex_unlock(&read_mutex); + block_ptr = *(u32 *) cramfs_read(sb, blkptr_offset, 4); + uncompressed = (block_ptr & CRAMFS_BLK_FLAG_UNCOMPRESSED); + direct = (block_ptr & CRAMFS_BLK_FLAG_DIRECT_PTR); + block_ptr &= ~CRAMFS_BLK_FLAGS; + + if (direct) { + /* + * The block pointer is an absolute start pointer, + * shifted by 2 bits. The size is included in the + * first 2 bytes of the data block when compressed, + * or PAGE_SIZE otherwise. + */ + block_start = block_ptr << CRAMFS_BLK_DIRECT_PTR_SHIFT; + if (uncompressed) { + block_len = PAGE_SIZE; + /* if last block: cap to file length */ + if (page->index == maxblock - 1) + block_len = + offset_in_page(inode->i_size); + } else { + block_len = *(u16 *) + cramfs_read(sb, block_start, 2); + block_start += 2; + } + } else { + /* + * The block pointer indicates one past the end of + * the current block (start of next block). If this + * is the first block then it starts where the block + * pointer table ends, otherwise its start comes + * from the previous block's pointer. + */ + block_start = OFFSET(inode) + maxblock * 4; + if (page->index) + block_start = *(u32 *) + cramfs_read(sb, blkptr_offset - 4, 4); + /* Beware... previous ptr might be a direct ptr */ + if (unlikely(block_start & CRAMFS_BLK_FLAG_DIRECT_PTR)) { + /* See comments on earlier code. */ + u32 prev_start = block_start; + block_start = prev_start & ~CRAMFS_BLK_FLAGS; + block_start <<= CRAMFS_BLK_DIRECT_PTR_SHIFT; + if (prev_start & CRAMFS_BLK_FLAG_UNCOMPRESSED) { + block_start += PAGE_SIZE; + } else { + block_len = *(u16 *) + cramfs_read(sb, block_start, 2); + block_start += 2 + block_len; + } + } + block_start &= ~CRAMFS_BLK_FLAGS; + block_len = block_ptr - block_start; + } - if (compr_len == 0) + if (block_len == 0) ; /* hole */ - else if (unlikely(compr_len > (PAGE_SIZE << 1))) { - pr_err("bad compressed blocksize %u\n", - compr_len); + else if (unlikely(block_len > 2*PAGE_SIZE || + (uncompressed && block_len > PAGE_SIZE))) { + mutex_unlock(&read_mutex); + pr_err("bad data blocksize %u\n", block_len); goto err; + } else if (uncompressed) { + memcpy(pgdata, + cramfs_read(sb, block_start, block_len), + block_len); + bytes_filled = block_len; } else { - mutex_lock(&read_mutex); bytes_filled = cramfs_uncompress_block(pgdata, PAGE_SIZE, - cramfs_read(sb, start_offset, compr_len), - compr_len); - mutex_unlock(&read_mutex); - if (unlikely(bytes_filled < 0)) - goto err; + cramfs_read(sb, block_start, block_len), + block_len); } + mutex_unlock(&read_mutex); + if (unlikely(bytes_filled < 0)) + goto err; } memset(pgdata + bytes_filled, 0, PAGE_SIZE - bytes_filled); @@ -573,10 +950,22 @@ static const struct super_operations cramfs_ops = { .statfs = cramfs_statfs, }; -static struct dentry *cramfs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static struct dentry *cramfs_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) { - return mount_bdev(fs_type, flags, dev_name, data, cramfs_fill_super); + struct dentry *ret = ERR_PTR(-ENOPROTOOPT); + + if (IS_ENABLED(CONFIG_CRAMFS_MTD)) { + ret = mount_mtd(fs_type, flags, dev_name, data, + cramfs_mtd_fill_super); + if (!IS_ERR(ret)) + return ret; + } + if (IS_ENABLED(CONFIG_CRAMFS_BLOCKDEV)) { + ret = mount_bdev(fs_type, flags, dev_name, data, + cramfs_blkdev_fill_super); + } + return ret; } static struct file_system_type cramfs_fs_type = { @@ -526,13 +526,13 @@ static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev, static void *dax_insert_mapping_entry(struct address_space *mapping, struct vm_fault *vmf, void *entry, sector_t sector, - unsigned long flags) + unsigned long flags, bool dirty) { struct radix_tree_root *page_tree = &mapping->page_tree; void *new_entry; pgoff_t index = vmf->pgoff; - if (vmf->flags & FAULT_FLAG_WRITE) + if (dirty) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) { @@ -569,7 +569,7 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, entry = new_entry; } - if (vmf->flags & FAULT_FLAG_WRITE) + if (dirty) radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); spin_unlock_irq(&mapping->tree_lock); @@ -627,7 +627,8 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, if (pfn != pmd_pfn(*pmdp)) goto unlock_pmd; - if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp)) + if (!pmd_dirty(*pmdp) + && !pmd_access_permitted(*pmdp, WRITE)) goto unlock_pmd; flush_cache_page(vma, address, pfn); @@ -825,38 +826,42 @@ out: } EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); -static int dax_insert_mapping(struct address_space *mapping, - struct block_device *bdev, struct dax_device *dax_dev, - sector_t sector, size_t size, void *entry, - struct vm_area_struct *vma, struct vm_fault *vmf) +static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos) { - unsigned long vaddr = vmf->address; - void *ret, *kaddr; + return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9; +} + +static int dax_iomap_pfn(struct iomap *iomap, loff_t pos, size_t size, + pfn_t *pfnp) +{ + const sector_t sector = dax_iomap_sector(iomap, pos); pgoff_t pgoff; + void *kaddr; int id, rc; - pfn_t pfn; + long length; - rc = bdev_dax_pgoff(bdev, sector, size, &pgoff); + rc = bdev_dax_pgoff(iomap->bdev, sector, size, &pgoff); if (rc) return rc; - id = dax_read_lock(); - rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn); - if (rc < 0) { - dax_read_unlock(id); - return rc; + length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size), + &kaddr, pfnp); + if (length < 0) { + rc = length; + goto out; } + rc = -EINVAL; + if (PFN_PHYS(length) < size) + goto out; + if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1)) + goto out; + /* For larger pages we need devmap */ + if (length > 1 && !pfn_t_devmap(*pfnp)) + goto out; + rc = 0; +out: dax_read_unlock(id); - - ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0); - if (IS_ERR(ret)) - return PTR_ERR(ret); - - trace_dax_insert_mapping(mapping->host, vmf, ret); - if (vmf->flags & FAULT_FLAG_WRITE) - return vm_insert_mixed_mkwrite(vma, vaddr, pfn); - else - return vm_insert_mixed(vma, vaddr, pfn); + return rc; } /* @@ -882,7 +887,7 @@ static int dax_load_hole(struct address_space *mapping, void *entry, } entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0, - RADIX_DAX_ZERO_PAGE); + RADIX_DAX_ZERO_PAGE, false); if (IS_ERR(entry2)) { ret = VM_FAULT_SIGBUS; goto out; @@ -941,11 +946,6 @@ int __dax_zero_page_range(struct block_device *bdev, } EXPORT_SYMBOL_GPL(__dax_zero_page_range); -static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos) -{ - return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9; -} - static loff_t dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, struct iomap *iomap) @@ -1085,19 +1085,33 @@ static int dax_fault_return(int error) return VM_FAULT_SIGBUS; } -static int dax_iomap_pte_fault(struct vm_fault *vmf, +/* + * MAP_SYNC on a dax mapping guarantees dirty metadata is + * flushed on write-faults (non-cow), but not read-faults. + */ +static bool dax_fault_is_synchronous(unsigned long flags, + struct vm_area_struct *vma, struct iomap *iomap) +{ + return (flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC) + && (iomap->flags & IOMAP_F_DIRTY); +} + +static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, const struct iomap_ops *ops) { - struct address_space *mapping = vmf->vma->vm_file->f_mapping; + struct vm_area_struct *vma = vmf->vma; + struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; unsigned long vaddr = vmf->address; loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; - sector_t sector; struct iomap iomap = { 0 }; unsigned flags = IOMAP_FAULT; int error, major = 0; + bool write = vmf->flags & FAULT_FLAG_WRITE; + bool sync; int vmf_ret = 0; void *entry; + pfn_t pfn; trace_dax_pte_fault(inode, vmf, vmf_ret); /* @@ -1110,7 +1124,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, goto out; } - if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page) + if (write && !vmf->cow_page) flags |= IOMAP_WRITE; entry = grab_mapping_entry(mapping, vmf->pgoff, 0); @@ -1145,9 +1159,9 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, goto error_finish_iomap; } - sector = dax_iomap_sector(&iomap, pos); - if (vmf->cow_page) { + sector_t sector = dax_iomap_sector(&iomap, pos); + switch (iomap.type) { case IOMAP_HOLE: case IOMAP_UNWRITTEN: @@ -1173,22 +1187,55 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, goto finish_iomap; } + sync = dax_fault_is_synchronous(flags, vma, &iomap); + switch (iomap.type) { case IOMAP_MAPPED: if (iomap.flags & IOMAP_F_NEW) { count_vm_event(PGMAJFAULT); - count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); + count_memcg_event_mm(vma->vm_mm, PGMAJFAULT); major = VM_FAULT_MAJOR; } - error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev, - sector, PAGE_SIZE, entry, vmf->vma, vmf); + error = dax_iomap_pfn(&iomap, pos, PAGE_SIZE, &pfn); + if (error < 0) + goto error_finish_iomap; + + entry = dax_insert_mapping_entry(mapping, vmf, entry, + dax_iomap_sector(&iomap, pos), + 0, write && !sync); + if (IS_ERR(entry)) { + error = PTR_ERR(entry); + goto error_finish_iomap; + } + + /* + * If we are doing synchronous page fault and inode needs fsync, + * we can insert PTE into page tables only after that happens. + * Skip insertion for now and return the pfn so that caller can + * insert it after fsync is done. + */ + if (sync) { + if (WARN_ON_ONCE(!pfnp)) { + error = -EIO; + goto error_finish_iomap; + } + *pfnp = pfn; + vmf_ret = VM_FAULT_NEEDDSYNC | major; + goto finish_iomap; + } + trace_dax_insert_mapping(inode, vmf, entry); + if (write) + error = vm_insert_mixed_mkwrite(vma, vaddr, pfn); + else + error = vm_insert_mixed(vma, vaddr, pfn); + /* -EBUSY is fine, somebody else faulted on the same PTE */ if (error == -EBUSY) error = 0; break; case IOMAP_UNWRITTEN: case IOMAP_HOLE: - if (!(vmf->flags & FAULT_FLAG_WRITE)) { + if (!write) { vmf_ret = dax_load_hole(mapping, entry, vmf); goto finish_iomap; } @@ -1223,53 +1270,11 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, } #ifdef CONFIG_FS_DAX_PMD -static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, - loff_t pos, void *entry) -{ - struct address_space *mapping = vmf->vma->vm_file->f_mapping; - const sector_t sector = dax_iomap_sector(iomap, pos); - struct dax_device *dax_dev = iomap->dax_dev; - struct block_device *bdev = iomap->bdev; - struct inode *inode = mapping->host; - const size_t size = PMD_SIZE; - void *ret = NULL, *kaddr; - long length = 0; - pgoff_t pgoff; - pfn_t pfn = {}; - int id; - - if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0) - goto fallback; - - id = dax_read_lock(); - length = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn); - if (length < 0) - goto unlock_fallback; - length = PFN_PHYS(length); - - if (length < size) - goto unlock_fallback; - if (pfn_t_to_pfn(pfn) & PG_PMD_COLOUR) - goto unlock_fallback; - if (!pfn_t_devmap(pfn)) - goto unlock_fallback; - dax_read_unlock(id); - - ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, - RADIX_DAX_PMD); - if (IS_ERR(ret)) - goto fallback; - - trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret); - return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, - pfn, vmf->flags & FAULT_FLAG_WRITE); - -unlock_fallback: - dax_read_unlock(id); -fallback: - trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, pfn, ret); - return VM_FAULT_FALLBACK; -} +/* + * The 'colour' (ie low bits) within a PMD of a page offset. This comes up + * more often than one might expect in the below functions. + */ +#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, void *entry) @@ -1288,7 +1293,7 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, goto fallback; ret = dax_insert_mapping_entry(mapping, vmf, entry, 0, - RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE); + RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false); if (IS_ERR(ret)) goto fallback; @@ -1310,13 +1315,14 @@ fallback: return VM_FAULT_FALLBACK; } -static int dax_iomap_pmd_fault(struct vm_fault *vmf, +static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, const struct iomap_ops *ops) { struct vm_area_struct *vma = vmf->vma; struct address_space *mapping = vma->vm_file->f_mapping; unsigned long pmd_addr = vmf->address & PMD_MASK; bool write = vmf->flags & FAULT_FLAG_WRITE; + bool sync; unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT; struct inode *inode = mapping->host; int result = VM_FAULT_FALLBACK; @@ -1325,6 +1331,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, void *entry; loff_t pos; int error; + pfn_t pfn; /* * Check whether offset isn't beyond end of file now. Caller is @@ -1332,7 +1339,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, * this is a reliable test. */ pgoff = linear_page_index(vma, pmd_addr); - max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT; + max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); trace_dax_pmd_fault(inode, vmf, max_pgoff, 0); @@ -1356,13 +1363,13 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, if ((pmd_addr + PMD_SIZE) > vma->vm_end) goto fallback; - if (pgoff > max_pgoff) { + if (pgoff >= max_pgoff) { result = VM_FAULT_SIGBUS; goto out; } /* If the PMD would extend beyond the file size */ - if ((pgoff | PG_PMD_COLOUR) > max_pgoff) + if ((pgoff | PG_PMD_COLOUR) >= max_pgoff) goto fallback; /* @@ -1400,9 +1407,37 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, if (iomap.offset + iomap.length < pos + PMD_SIZE) goto finish_iomap; + sync = dax_fault_is_synchronous(iomap_flags, vma, &iomap); + switch (iomap.type) { case IOMAP_MAPPED: - result = dax_pmd_insert_mapping(vmf, &iomap, pos, entry); + error = dax_iomap_pfn(&iomap, pos, PMD_SIZE, &pfn); + if (error < 0) + goto finish_iomap; + + entry = dax_insert_mapping_entry(mapping, vmf, entry, + dax_iomap_sector(&iomap, pos), + RADIX_DAX_PMD, write && !sync); + if (IS_ERR(entry)) + goto finish_iomap; + + /* + * If we are doing synchronous page fault and inode needs fsync, + * we can insert PMD into page tables only after that happens. + * Skip insertion for now and return the pfn so that caller can + * insert it after fsync is done. + */ + if (sync) { + if (WARN_ON_ONCE(!pfnp)) + goto finish_iomap; + *pfnp = pfn; + result = VM_FAULT_NEEDDSYNC; + goto finish_iomap; + } + + trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry); + result = vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn, + write); break; case IOMAP_UNWRITTEN: case IOMAP_HOLE: @@ -1442,7 +1477,7 @@ out: return result; } #else -static int dax_iomap_pmd_fault(struct vm_fault *vmf, +static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, const struct iomap_ops *ops) { return VM_FAULT_FALLBACK; @@ -1452,7 +1487,9 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, /** * dax_iomap_fault - handle a page fault on a DAX file * @vmf: The description of the fault - * @ops: iomap ops passed from the file system + * @pe_size: Size of the page to fault in + * @pfnp: PFN to insert for synchronous faults if fsync is required + * @ops: Iomap ops passed from the file system * * When a page fault occurs, filesystems may call this helper in * their fault handler for DAX files. dax_iomap_fault() assumes the caller @@ -1460,15 +1497,98 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, * successfully. */ int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, - const struct iomap_ops *ops) + pfn_t *pfnp, const struct iomap_ops *ops) { switch (pe_size) { case PE_SIZE_PTE: - return dax_iomap_pte_fault(vmf, ops); + return dax_iomap_pte_fault(vmf, pfnp, ops); case PE_SIZE_PMD: - return dax_iomap_pmd_fault(vmf, ops); + return dax_iomap_pmd_fault(vmf, pfnp, ops); default: return VM_FAULT_FALLBACK; } } EXPORT_SYMBOL_GPL(dax_iomap_fault); + +/** + * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables + * @vmf: The description of the fault + * @pe_size: Size of entry to be inserted + * @pfn: PFN to insert + * + * This function inserts writeable PTE or PMD entry into page tables for mmaped + * DAX file. It takes care of marking corresponding radix tree entry as dirty + * as well. + */ +static int dax_insert_pfn_mkwrite(struct vm_fault *vmf, + enum page_entry_size pe_size, + pfn_t pfn) +{ + struct address_space *mapping = vmf->vma->vm_file->f_mapping; + void *entry, **slot; + pgoff_t index = vmf->pgoff; + int vmf_ret, error; + + spin_lock_irq(&mapping->tree_lock); + entry = get_unlocked_mapping_entry(mapping, index, &slot); + /* Did we race with someone splitting entry or so? */ + if (!entry || + (pe_size == PE_SIZE_PTE && !dax_is_pte_entry(entry)) || + (pe_size == PE_SIZE_PMD && !dax_is_pmd_entry(entry))) { + put_unlocked_mapping_entry(mapping, index, entry); + spin_unlock_irq(&mapping->tree_lock); + trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf, + VM_FAULT_NOPAGE); + return VM_FAULT_NOPAGE; + } + radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); + entry = lock_slot(mapping, slot); + spin_unlock_irq(&mapping->tree_lock); + switch (pe_size) { + case PE_SIZE_PTE: + error = vm_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn); + vmf_ret = dax_fault_return(error); + break; +#ifdef CONFIG_FS_DAX_PMD + case PE_SIZE_PMD: + vmf_ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, + pfn, true); + break; +#endif + default: + vmf_ret = VM_FAULT_FALLBACK; + } + put_locked_mapping_entry(mapping, index); + trace_dax_insert_pfn_mkwrite(mapping->host, vmf, vmf_ret); + return vmf_ret; +} + +/** + * dax_finish_sync_fault - finish synchronous page fault + * @vmf: The description of the fault + * @pe_size: Size of entry to be inserted + * @pfn: PFN to insert + * + * This function ensures that the file range touched by the page fault is + * stored persistently on the media and handles inserting of appropriate page + * table entry. + */ +int dax_finish_sync_fault(struct vm_fault *vmf, enum page_entry_size pe_size, + pfn_t pfn) +{ + int err; + loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT; + size_t len = 0; + + if (pe_size == PE_SIZE_PTE) + len = PAGE_SIZE; + else if (pe_size == PE_SIZE_PMD) + len = PMD_SIZE; + else + WARN_ON_ONCE(1); + err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1); + if (err) + return VM_FAULT_SIGBUS; + return dax_insert_pfn_mkwrite(vmf, pe_size, pfn); +} +EXPORT_SYMBOL_GPL(dax_finish_sync_fault); diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index e5e29f8c920b..846ca150d52e 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -36,27 +36,13 @@ #include <linux/scatterlist.h> #include <linux/slab.h> #include <asm/unaligned.h> +#include <linux/kernel.h> #include "ecryptfs_kernel.h" #define DECRYPT 0 #define ENCRYPT 1 /** - * ecryptfs_to_hex - * @dst: Buffer to take hex character representation of contents of - * src; must be at least of size (src_size * 2) - * @src: Buffer to be converted to a hex string representation - * @src_size: number of bytes to convert - */ -void ecryptfs_to_hex(char *dst, char *src, size_t src_size) -{ - int x; - - for (x = 0; x < src_size; x++) - sprintf(&dst[x * 2], "%.2x", (unsigned char)src[x]); -} - -/** * ecryptfs_from_hex * @dst: Buffer to take the bytes from src hex; must be at least of * size (src_size / 2) @@ -899,8 +885,7 @@ static int ecryptfs_process_flags(struct ecryptfs_crypt_stat *crypt_stat, u32 flags; flags = get_unaligned_be32(page_virt); - for (i = 0; i < ((sizeof(ecryptfs_flag_map) - / sizeof(struct ecryptfs_flag_map_elem))); i++) + for (i = 0; i < ARRAY_SIZE(ecryptfs_flag_map); i++) if (flags & ecryptfs_flag_map[i].file_flag) { crypt_stat->flags |= ecryptfs_flag_map[i].local_flag; } else @@ -937,8 +922,7 @@ void ecryptfs_write_crypt_stat_flags(char *page_virt, u32 flags = 0; int i; - for (i = 0; i < ((sizeof(ecryptfs_flag_map) - / sizeof(struct ecryptfs_flag_map_elem))); i++) + for (i = 0; i < ARRAY_SIZE(ecryptfs_flag_map); i++) if (crypt_stat->flags & ecryptfs_flag_map[i].local_flag) flags |= ecryptfs_flag_map[i].file_flag; /* Version is in top 8 bits of the 32-bit flag vector */ @@ -1434,8 +1418,6 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry) page_virt = kmem_cache_alloc(ecryptfs_header_cache, GFP_USER); if (!page_virt) { rc = -ENOMEM; - printk(KERN_ERR "%s: Unable to allocate page_virt\n", - __func__); goto out; } rc = ecryptfs_read_lower(page_virt, 0, crypt_stat->extent_size, @@ -1522,9 +1504,6 @@ ecryptfs_encrypt_filename(struct ecryptfs_filename *filename, filename->encrypted_filename = kmalloc(filename->encrypted_filename_size, GFP_KERNEL); if (!filename->encrypted_filename) { - printk(KERN_ERR "%s: Out of memory whilst attempting " - "to kmalloc [%zd] bytes\n", __func__, - filename->encrypted_filename_size); rc = -ENOMEM; goto out; } @@ -1669,12 +1648,10 @@ ecryptfs_add_new_key_tfm(struct ecryptfs_key_tfm **key_tfm, char *cipher_name, BUG_ON(!mutex_is_locked(&key_tfm_list_mutex)); tmp_tfm = kmem_cache_alloc(ecryptfs_key_tfm_cache, GFP_KERNEL); - if (key_tfm != NULL) + if (key_tfm) (*key_tfm) = tmp_tfm; if (!tmp_tfm) { rc = -ENOMEM; - printk(KERN_ERR "Error attempting to allocate from " - "ecryptfs_key_tfm_cache\n"); goto out; } mutex_init(&tmp_tfm->key_tfm_mutex); @@ -1690,7 +1667,7 @@ ecryptfs_add_new_key_tfm(struct ecryptfs_key_tfm **key_tfm, char *cipher_name, "cipher with name = [%s]; rc = [%d]\n", tmp_tfm->cipher_name, rc); kmem_cache_free(ecryptfs_key_tfm_cache, tmp_tfm); - if (key_tfm != NULL) + if (key_tfm) (*key_tfm) = NULL; goto out; } @@ -1881,7 +1858,7 @@ ecryptfs_decode_from_filename(unsigned char *dst, size_t *dst_size, size_t src_byte_offset = 0; size_t dst_byte_offset = 0; - if (dst == NULL) { + if (!dst) { (*dst_size) = ecryptfs_max_decoded_size(src_size); goto out; } @@ -1949,9 +1926,6 @@ int ecryptfs_encrypt_and_encode_filename( filename = kzalloc(sizeof(*filename), GFP_KERNEL); if (!filename) { - printk(KERN_ERR "%s: Out of memory whilst attempting " - "to kzalloc [%zd] bytes\n", __func__, - sizeof(*filename)); rc = -ENOMEM; goto out; } @@ -1980,9 +1954,6 @@ int ecryptfs_encrypt_and_encode_filename( + encoded_name_no_prefix_size); (*encoded_name) = kmalloc((*encoded_name_size) + 1, GFP_KERNEL); if (!(*encoded_name)) { - printk(KERN_ERR "%s: Out of memory whilst attempting " - "to kzalloc [%zd] bytes\n", __func__, - (*encoded_name_size)); rc = -ENOMEM; kfree(filename->encrypted_filename); kfree(filename); @@ -2064,9 +2035,6 @@ int ecryptfs_decode_and_decrypt_filename(char **plaintext_name, name, name_size); decoded_name = kmalloc(decoded_name_size, GFP_KERNEL); if (!decoded_name) { - printk(KERN_ERR "%s: Out of memory whilst attempting " - "to kmalloc [%zd] bytes\n", __func__, - decoded_name_size); rc = -ENOMEM; goto out; } diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 3fbc0ff79699..e74cb2a0b299 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -31,6 +31,7 @@ #include <crypto/skcipher.h> #include <keys/user-type.h> #include <keys/encrypted-type.h> +#include <linux/kernel.h> #include <linux/fs.h> #include <linux/fs_stack.h> #include <linux/namei.h> @@ -51,7 +52,13 @@ #define ECRYPTFS_XATTR_NAME "user.ecryptfs" void ecryptfs_dump_auth_tok(struct ecryptfs_auth_tok *auth_tok); -extern void ecryptfs_to_hex(char *dst, char *src, size_t src_size); +static inline void +ecryptfs_to_hex(char *dst, char *src, size_t src_size) +{ + char *end = bin2hex(dst, src, src_size); + *end = '\0'; +} + extern void ecryptfs_from_hex(char *dst, char *src, int dst_size); struct ecryptfs_key_record { diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index efc2db42d175..847904aa63a9 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -64,7 +64,6 @@ static int ecryptfs_inode_set(struct inode *inode, void *opaque) /* i_size will be overwritten for encrypted regular files */ fsstack_copy_inode_size(inode, lower_inode); inode->i_ino = lower_inode->i_ino; - inode->i_version++; inode->i_mapping->a_ops = &ecryptfs_aops; if (S_ISLNK(inode->i_mode)) @@ -334,9 +333,6 @@ static struct dentry *ecryptfs_lookup_interpose(struct dentry *dentry, dentry_info = kmem_cache_alloc(ecryptfs_dentry_info_cache, GFP_KERNEL); if (!dentry_info) { - printk(KERN_ERR "%s: Out of memory whilst attempting " - "to allocate ecryptfs_dentry_info struct\n", - __func__); dput(lower_dentry); return ERR_PTR(-ENOMEM); } diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index fa218cd64f74..c89a58cfc991 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -639,11 +639,9 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, int rc = 0; s = kzalloc(sizeof(*s), GFP_KERNEL); - if (!s) { - printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc " - "[%zd] bytes of kernel memory\n", __func__, sizeof(*s)); + if (!s) return -ENOMEM; - } + (*packet_size) = 0; rc = ecryptfs_find_auth_tok_for_sig( &auth_tok_key, @@ -687,7 +685,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, * separator, and then the filename */ s->max_packet_size = (ECRYPTFS_TAG_70_MAX_METADATA_SIZE + s->block_aligned_filename_size); - if (dest == NULL) { + if (!dest) { (*packet_size) = s->max_packet_size; goto out_unlock; } @@ -714,9 +712,6 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, s->block_aligned_filename = kzalloc(s->block_aligned_filename_size, GFP_KERNEL); if (!s->block_aligned_filename) { - printk(KERN_ERR "%s: Out of kernel memory whilst attempting to " - "kzalloc [%zd] bytes\n", __func__, - s->block_aligned_filename_size); rc = -ENOMEM; goto out_unlock; } @@ -769,10 +764,6 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, s->hash_desc = kmalloc(sizeof(*s->hash_desc) + crypto_shash_descsize(s->hash_tfm), GFP_KERNEL); if (!s->hash_desc) { - printk(KERN_ERR "%s: Out of kernel memory whilst attempting to " - "kmalloc [%zd] bytes\n", __func__, - sizeof(*s->hash_desc) + - crypto_shash_descsize(s->hash_tfm)); rc = -ENOMEM; goto out_release_free_unlock; } @@ -925,11 +916,9 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, (*filename_size) = 0; (*filename) = NULL; s = kzalloc(sizeof(*s), GFP_KERNEL); - if (!s) { - printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc " - "[%zd] bytes of kernel memory\n", __func__, sizeof(*s)); + if (!s) return -ENOMEM; - } + if (max_packet_size < ECRYPTFS_TAG_70_MIN_METADATA_SIZE) { printk(KERN_WARNING "%s: max_packet_size is [%zd]; it must be " "at least [%d]\n", __func__, max_packet_size, @@ -1015,9 +1004,6 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, s->decrypted_filename = kmalloc(s->block_aligned_filename_size, GFP_KERNEL); if (!s->decrypted_filename) { - printk(KERN_ERR "%s: Out of memory whilst attempting to " - "kmalloc [%zd] bytes\n", __func__, - s->block_aligned_filename_size); rc = -ENOMEM; goto out_unlock; } @@ -1097,9 +1083,6 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, } (*filename) = kmalloc(((*filename_size) + 1), GFP_KERNEL); if (!(*filename)) { - printk(KERN_ERR "%s: Out of memory whilst attempting to " - "kmalloc [%zd] bytes\n", __func__, - ((*filename_size) + 1)); rc = -ENOMEM; goto out_free_unlock; } @@ -1333,7 +1316,7 @@ parse_tag_1_packet(struct ecryptfs_crypt_stat *crypt_stat, if ((*new_auth_tok)->session_key.encrypted_key_size > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) { printk(KERN_WARNING "Tag 1 packet contains key larger " - "than ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES"); + "than ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES\n"); rc = -EINVAL; goto out; } @@ -2525,11 +2508,9 @@ int ecryptfs_add_keysig(struct ecryptfs_crypt_stat *crypt_stat, char *sig) struct ecryptfs_key_sig *new_key_sig; new_key_sig = kmem_cache_alloc(ecryptfs_key_sig_cache, GFP_KERNEL); - if (!new_key_sig) { - printk(KERN_ERR - "Error allocating from ecryptfs_key_sig_cache\n"); + if (!new_key_sig) return -ENOMEM; - } + memcpy(new_key_sig->keysig, sig, ECRYPTFS_SIG_SIZE_HEX); new_key_sig->keysig[ECRYPTFS_SIG_SIZE_HEX] = '\0'; /* Caller must hold keysig_list_mutex */ @@ -2545,16 +2526,12 @@ ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig, u32 global_auth_tok_flags) { struct ecryptfs_global_auth_tok *new_auth_tok; - int rc = 0; new_auth_tok = kmem_cache_zalloc(ecryptfs_global_auth_tok_cache, GFP_KERNEL); - if (!new_auth_tok) { - rc = -ENOMEM; - printk(KERN_ERR "Error allocating from " - "ecryptfs_global_auth_tok_cache\n"); - goto out; - } + if (!new_auth_tok) + return -ENOMEM; + memcpy(new_auth_tok->sig, sig, ECRYPTFS_SIG_SIZE_HEX); new_auth_tok->flags = global_auth_tok_flags; new_auth_tok->sig[ECRYPTFS_SIG_SIZE_HEX] = '\0'; @@ -2562,7 +2539,6 @@ ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat, list_add(&new_auth_tok->mount_crypt_stat_list, &mount_crypt_stat->global_auth_tok_list); mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex); -out: - return rc; + return 0; } diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 25aeaa7328ba..025d66a705db 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -426,7 +426,7 @@ static int ecryptfs_parse_options(struct ecryptfs_sb_info *sbi, char *options, mount_crypt_stat->global_default_cipher_key_size); if (!cipher_code) { ecryptfs_printk(KERN_ERR, - "eCryptfs doesn't support cipher: %s", + "eCryptfs doesn't support cipher: %s\n", mount_crypt_stat->global_default_cipher_name); rc = -EINVAL; goto out; @@ -560,8 +560,8 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags * Set the POSIX ACL flag based on whether they're enabled in the lower * mount. */ - s->s_flags = flags & ~MS_POSIXACL; - s->s_flags |= path.dentry->d_sb->s_flags & MS_POSIXACL; + s->s_flags = flags & ~SB_POSIXACL; + s->s_flags |= path.dentry->d_sb->s_flags & SB_POSIXACL; /** * Force a read-only eCryptfs mount when: @@ -569,7 +569,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags * 2) The ecryptfs_encrypted_view mount option is specified */ if (sb_rdonly(path.dentry->d_sb) || mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) - s->s_flags |= MS_RDONLY; + s->s_flags |= SB_RDONLY; s->s_maxbytes = path.dentry->d_sb->s_maxbytes; s->s_blocksize = path.dentry->d_sb->s_blocksize; @@ -602,7 +602,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags ecryptfs_set_dentry_private(s->s_root, root_info); root_info->lower_path = path; - s->s_flags |= MS_ACTIVE; + s->s_flags |= SB_ACTIVE; return dget(s->s_root); out_free: @@ -781,7 +781,7 @@ static struct attribute *attributes[] = { NULL, }; -static struct attribute_group attr_group = { +static const struct attribute_group attr_group = { .attrs = attributes, }; diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index 286f10b0363b..9fdd5bcf4564 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c @@ -147,8 +147,6 @@ ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, struct file *file) (*daemon) = kzalloc(sizeof(**daemon), GFP_KERNEL); if (!(*daemon)) { rc = -ENOMEM; - printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of " - "GFP_KERNEL memory\n", __func__, sizeof(**daemon)); goto out; } (*daemon)->file = file; @@ -250,8 +248,6 @@ int ecryptfs_process_response(struct ecryptfs_daemon *daemon, msg_ctx->msg = kmemdup(msg, msg_size, GFP_KERNEL); if (!msg_ctx->msg) { rc = -ENOMEM; - printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of " - "GFP_KERNEL memory\n", __func__, msg_size); goto unlock; } msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_DONE; @@ -386,7 +382,6 @@ int __init ecryptfs_init_messaging(void) GFP_KERNEL); if (!ecryptfs_daemon_hash) { rc = -ENOMEM; - printk(KERN_ERR "%s: Failed to allocate memory\n", __func__); mutex_unlock(&ecryptfs_daemon_hash_mux); goto out; } @@ -398,7 +393,6 @@ int __init ecryptfs_init_messaging(void) GFP_KERNEL); if (!ecryptfs_msg_ctx_arr) { rc = -ENOMEM; - printk(KERN_ERR "%s: Failed to allocate memory\n", __func__); goto out; } mutex_init(&ecryptfs_msg_ctx_lists_mux); @@ -442,15 +436,16 @@ void ecryptfs_release_messaging(void) } if (ecryptfs_daemon_hash) { struct ecryptfs_daemon *daemon; + struct hlist_node *n; int i; mutex_lock(&ecryptfs_daemon_hash_mux); for (i = 0; i < (1 << ecryptfs_hash_bits); i++) { int rc; - hlist_for_each_entry(daemon, - &ecryptfs_daemon_hash[i], - euid_chain) { + hlist_for_each_entry_safe(daemon, n, + &ecryptfs_daemon_hash[i], + euid_chain) { rc = ecryptfs_exorcise_daemon(daemon); if (rc) printk(KERN_ERR "%s: Error whilst " diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c index e4141f257495..f09cacaf8c80 100644 --- a/fs/ecryptfs/miscdev.c +++ b/fs/ecryptfs/miscdev.c @@ -163,12 +163,8 @@ int ecryptfs_send_miscdev(char *data, size_t data_size, struct ecryptfs_message *msg; msg = kmalloc((sizeof(*msg) + data_size), GFP_KERNEL); - if (!msg) { - printk(KERN_ERR "%s: Out of memory whilst attempting " - "to kmalloc(%zd, GFP_KERNEL)\n", __func__, - (sizeof(*msg) + data_size)); + if (!msg) return -ENOMEM; - } mutex_lock(&msg_ctx->mux); msg_ctx->msg = msg; @@ -383,7 +379,7 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf, goto memdup; } else if (count < MIN_MSG_PKT_SIZE || count > MAX_MSG_PKT_SIZE) { printk(KERN_WARNING "%s: Acceptable packet size range is " - "[%d-%zu], but amount of data written is [%zu].", + "[%d-%zu], but amount of data written is [%zu].\n", __func__, MIN_MSG_PKT_SIZE, MAX_MSG_PKT_SIZE, count); return -EINVAL; } diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 1f0c471b4ba3..cdf358b209d9 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -431,8 +431,6 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode) } xattr_virt = kmem_cache_alloc(ecryptfs_xattr_cache, GFP_KERNEL); if (!xattr_virt) { - printk(KERN_ERR "Out of memory whilst attempting to write " - "inode size to xattr\n"); rc = -ENOMEM; goto out; } diff --git a/fs/efs/super.c b/fs/efs/super.c index 65b59009555b..6ffb7ba1547a 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -116,7 +116,7 @@ static void destroy_inodecache(void) static int efs_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - *flags |= MS_RDONLY; + *flags |= SB_RDONLY; return 0; } @@ -311,7 +311,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) #ifdef DEBUG pr_info("forcing read-only mode\n"); #endif - s->s_flags |= MS_RDONLY; + s->s_flags |= SB_RDONLY; } s->s_op = &efs_superblock_operations; s->s_export_op = &efs_export_ops; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 2fabd19cdeea..afd548ebc328 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -276,12 +276,6 @@ static DEFINE_MUTEX(epmutex); /* Used to check for epoll file descriptor inclusion loops */ static struct nested_calls poll_loop_ncalls; -/* Used for safe wake up implementation */ -static struct nested_calls poll_safewake_ncalls; - -/* Used to call file's f_op->poll() under the nested calls boundaries */ -static struct nested_calls poll_readywalk_ncalls; - /* Slab cache used to allocate "struct epitem" */ static struct kmem_cache *epi_cache __read_mostly; @@ -551,40 +545,21 @@ out_unlock: * this special case of epoll. */ #ifdef CONFIG_DEBUG_LOCK_ALLOC -static inline void ep_wake_up_nested(wait_queue_head_t *wqueue, - unsigned long events, int subclass) + +static struct nested_calls poll_safewake_ncalls; + +static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests) { unsigned long flags; + wait_queue_head_t *wqueue = (wait_queue_head_t *)cookie; - spin_lock_irqsave_nested(&wqueue->lock, flags, subclass); - wake_up_locked_poll(wqueue, events); + spin_lock_irqsave_nested(&wqueue->lock, flags, call_nests + 1); + wake_up_locked_poll(wqueue, POLLIN); spin_unlock_irqrestore(&wqueue->lock, flags); -} -#else -static inline void ep_wake_up_nested(wait_queue_head_t *wqueue, - unsigned long events, int subclass) -{ - wake_up_poll(wqueue, events); -} -#endif -static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests) -{ - ep_wake_up_nested((wait_queue_head_t *) cookie, POLLIN, - 1 + call_nests); return 0; } -/* - * Perform a safe wake up of the poll wait list. The problem is that - * with the new callback'd wake up system, it is possible that the - * poll callback is reentered from inside the call to wake_up() done - * on the poll wait queue head. The rule is that we cannot reenter the - * wake up code from the same task more than EP_MAX_NESTS times, - * and we cannot reenter the same wait queue head at all. This will - * enable to have a hierarchy of epoll file descriptor of no more than - * EP_MAX_NESTS deep. - */ static void ep_poll_safewake(wait_queue_head_t *wq) { int this_cpu = get_cpu(); @@ -595,6 +570,15 @@ static void ep_poll_safewake(wait_queue_head_t *wq) put_cpu(); } +#else + +static void ep_poll_safewake(wait_queue_head_t *wq) +{ + wake_up_poll(wq, POLLIN); +} + +#endif + static void ep_remove_wait_queue(struct eppoll_entry *pwq) { wait_queue_head_t *whead; @@ -880,11 +864,33 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file) return 0; } -static inline unsigned int ep_item_poll(struct epitem *epi, poll_table *pt) +static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, + void *priv); +static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, + poll_table *pt); + +/* + * Differs from ep_eventpoll_poll() in that internal callers already have + * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested() + * is correctly annotated. + */ +static unsigned int ep_item_poll(struct epitem *epi, poll_table *pt, int depth) { + struct eventpoll *ep; + bool locked; + pt->_key = epi->event.events; + if (!is_file_epoll(epi->ffd.file)) + return epi->ffd.file->f_op->poll(epi->ffd.file, pt) & + epi->event.events; + + ep = epi->ffd.file->private_data; + poll_wait(epi->ffd.file, &ep->poll_wait, pt); + locked = pt && (pt->_qproc == ep_ptable_queue_proc); - return epi->ffd.file->f_op->poll(epi->ffd.file, pt) & epi->event.events; + return ep_scan_ready_list(epi->ffd.file->private_data, + ep_read_events_proc, &depth, depth, + locked) & epi->event.events; } static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, @@ -892,13 +898,15 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, { struct epitem *epi, *tmp; poll_table pt; + int depth = *(int *)priv; init_poll_funcptr(&pt, NULL); + depth++; list_for_each_entry_safe(epi, tmp, head, rdllink) { - if (ep_item_poll(epi, &pt)) + if (ep_item_poll(epi, &pt, depth)) { return POLLIN | POLLRDNORM; - else { + } else { /* * Item has been dropped into the ready list by the poll * callback, but it's not actually ready, as far as @@ -912,48 +920,20 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, return 0; } -static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, - poll_table *pt); - -struct readyevents_arg { - struct eventpoll *ep; - bool locked; -}; - -static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests) -{ - struct readyevents_arg *arg = priv; - - return ep_scan_ready_list(arg->ep, ep_read_events_proc, NULL, - call_nests + 1, arg->locked); -} - static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) { - int pollflags; struct eventpoll *ep = file->private_data; - struct readyevents_arg arg; - - /* - * During ep_insert() we already hold the ep->mtx for the tfile. - * Prevent re-aquisition. - */ - arg.locked = wait && (wait->_qproc == ep_ptable_queue_proc); - arg.ep = ep; + int depth = 0; /* Insert inside our poll wait queue */ poll_wait(file, &ep->poll_wait, wait); /* * Proceed to find out if wanted events are really available inside - * the ready list. This need to be done under ep_call_nested() - * supervision, since the call to f_op->poll() done on listed files - * could re-enter here. + * the ready list. */ - pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS, - ep_poll_readyevents_proc, &arg, ep, current); - - return pollflags != -1 ? pollflags : 0; + return ep_scan_ready_list(ep, ep_read_events_proc, + &depth, depth, false); } #ifdef CONFIG_PROC_FS @@ -1472,7 +1452,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, * this operation completes, the poll callback can start hitting * the new item. */ - revents = ep_item_poll(epi, &epq.pt); + revents = ep_item_poll(epi, &epq.pt, 1); /* * We have to check if something went wrong during the poll wait queue @@ -1606,7 +1586,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even * Get current event bits. We can safely use the file* here because * its usage count has been increased by the caller of this function. */ - revents = ep_item_poll(epi, &pt); + revents = ep_item_poll(epi, &pt, 1); /* * If the item is "hot" and it is not registered inside the ready @@ -1674,7 +1654,7 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, list_del_init(&epi->rdllink); - revents = ep_item_poll(epi, &pt); + revents = ep_item_poll(epi, &pt, 1); /* * If the event mask intersect the caller-requested one, @@ -2259,7 +2239,6 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd, compat_size_t, sigsetsize) { long err; - compat_sigset_t csigmask; sigset_t ksigmask, sigsaved; /* @@ -2269,9 +2248,8 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd, if (sigmask) { if (sigsetsize != sizeof(compat_sigset_t)) return -EINVAL; - if (copy_from_user(&csigmask, sigmask, sizeof(csigmask))) + if (get_compat_sigset(&ksigmask, sigmask)) return -EFAULT; - sigset_from_compat(&ksigmask, &csigmask); sigsaved = current->blocked; set_current_blocked(&ksigmask); } @@ -2315,11 +2293,10 @@ static int __init eventpoll_init(void) */ ep_nested_calls_init(&poll_loop_ncalls); +#ifdef CONFIG_DEBUG_LOCK_ALLOC /* Initialize the structure used to perform safe poll wait head wake ups */ ep_nested_calls_init(&poll_safewake_ncalls); - - /* Initialize the structure used to perform file's f_op->poll() calls */ - ep_nested_calls_init(&poll_readywalk_ncalls); +#endif /* * We can have many thousands of epitems, so prevent this from @@ -2329,11 +2306,11 @@ static int __init eventpoll_init(void) /* Allocates slab cache used to allocate "struct epitem" items */ epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem), - 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); /* Allocates slab cache used to allocate "struct eppoll_entry" */ pwq_cache = kmem_cache_create("eventpoll_pwq", - sizeof(struct eppoll_entry), 0, SLAB_PANIC, NULL); + sizeof(struct eppoll_entry), 0, SLAB_PANIC|SLAB_ACCOUNT, NULL); return 0; } diff --git a/fs/exec.c b/fs/exec.c index 1d6243d9f2b6..6be2aa0ab26f 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1340,10 +1340,15 @@ void setup_new_exec(struct linux_binprm * bprm) * avoid bad behavior from the prior rlimits. This has to * happen before arch_pick_mmap_layout(), which examines * RLIMIT_STACK, but after the point of no return to avoid - * needing to clean up the change on failure. + * races from other threads changing the limits. This also + * must be protected from races with prlimit() calls. */ + task_lock(current->group_leader); if (current->signal->rlim[RLIMIT_STACK].rlim_cur > _STK_LIM) current->signal->rlim[RLIMIT_STACK].rlim_cur = _STK_LIM; + if (current->signal->rlim[RLIMIT_STACK].rlim_max > _STK_LIM) + current->signal->rlim[RLIMIT_STACK].rlim_max = _STK_LIM; + task_unlock(current->group_leader); } arch_pick_mmap_layout(current->mm); diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index e1b3724bebf2..33db13365c5e 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -548,7 +548,7 @@ do_more: } mark_buffer_dirty(bitmap_bh); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) sync_dirty_buffer(bitmap_bh); group_adjust_blocks(sb, block_group, desc, bh2, group_freed); @@ -1424,7 +1424,7 @@ allocated: percpu_counter_sub(&sbi->s_freeblocks_counter, num); mark_buffer_dirty(bitmap_bh); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) sync_dirty_buffer(bitmap_bh); *errp = 0; diff --git a/fs/ext2/file.c b/fs/ext2/file.c index c67b486488fd..2da67699dc33 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -100,7 +100,7 @@ static int ext2_dax_fault(struct vm_fault *vmf) } down_read(&ei->dax_sem); - ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &ext2_iomap_ops); + ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, &ext2_iomap_ops); up_read(&ei->dax_sem); if (vmf->flags & FAULT_FLAG_WRITE) diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index a1fc3dabca41..6484199b35d1 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -145,7 +145,7 @@ void ext2_free_inode (struct inode * inode) else ext2_release_inode(sb, block_group, is_directory); mark_buffer_dirty(bitmap_bh); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) sync_dirty_buffer(bitmap_bh); brelse(bitmap_bh); @@ -517,7 +517,7 @@ repeat_in_this_group: goto fail; got: mark_buffer_dirty(bitmap_bh); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) sync_dirty_buffer(bitmap_bh); brelse(bitmap_bh); diff --git a/fs/ext2/super.c b/fs/ext2/super.c index e2b6be03e69b..7646818ab266 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -75,7 +75,7 @@ void ext2_error(struct super_block *sb, const char *function, if (test_opt(sb, ERRORS_RO)) { ext2_msg(sb, KERN_CRIT, "error: remounting filesystem read-only"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } } @@ -656,7 +656,7 @@ static int ext2_setup_super (struct super_block * sb, ext2_msg(sb, KERN_ERR, "error: revision level too high, " "forcing read-only mode"); - res = MS_RDONLY; + res = SB_RDONLY; } if (read_only) return res; @@ -924,9 +924,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) sbi->s_resuid = opts.s_resuid; sbi->s_resgid = opts.s_resgid; - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | ((EXT2_SB(sb)->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? - MS_POSIXACL : 0); + SB_POSIXACL : 0); sb->s_iflags |= SB_I_CGROUPWB; if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV && @@ -1178,7 +1178,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) ext2_msg(sb, KERN_WARNING, "warning: mounting ext3 filesystem as ext2"); if (ext2_setup_super (sb, es, sb_rdonly(sb))) - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; ext2_write_super(sb); return 0; @@ -1341,9 +1341,9 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) "dax flag with busy inodes while remounting"); new_opts.s_mount_opt ^= EXT2_MOUNT_DAX; } - if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb)) + if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) goto out_set; - if (*flags & MS_RDONLY) { + if (*flags & SB_RDONLY) { if (le16_to_cpu(es->s_state) & EXT2_VALID_FS || !(sbi->s_mount_state & EXT2_VALID_FS)) goto out_set; @@ -1379,7 +1379,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) */ sbi->s_mount_state = le16_to_cpu(es->s_state); if (!ext2_setup_super (sb, es, 0)) - sb->s_flags &= ~MS_RDONLY; + sb->s_flags &= ~SB_RDONLY; spin_unlock(&sbi->s_lock); ext2_write_super(sb); @@ -1392,8 +1392,8 @@ out_set: sbi->s_mount_opt = new_opts.s_mount_opt; sbi->s_resuid = new_opts.s_resuid; sbi->s_resgid = new_opts.s_resgid; - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | - ((sbi->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); + sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | + ((sbi->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? SB_POSIXACL : 0); spin_unlock(&sbi->s_lock); return 0; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index ad204d2724ac..a0ae27b1bc66 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -28,6 +28,7 @@ #include <linux/quotaops.h> #include <linux/pagevec.h> #include <linux/uio.h> +#include <linux/mman.h> #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" @@ -297,6 +298,7 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf, */ bool write = (vmf->flags & FAULT_FLAG_WRITE) && (vmf->vma->vm_flags & VM_SHARED); + pfn_t pfn; if (write) { sb_start_pagefault(sb); @@ -304,16 +306,20 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf, down_read(&EXT4_I(inode)->i_mmap_sem); handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, EXT4_DATA_TRANS_BLOCKS(sb)); + if (IS_ERR(handle)) { + up_read(&EXT4_I(inode)->i_mmap_sem); + sb_end_pagefault(sb); + return VM_FAULT_SIGBUS; + } } else { down_read(&EXT4_I(inode)->i_mmap_sem); } - if (!IS_ERR(handle)) - result = dax_iomap_fault(vmf, pe_size, &ext4_iomap_ops); - else - result = VM_FAULT_SIGBUS; + result = dax_iomap_fault(vmf, pe_size, &pfn, &ext4_iomap_ops); if (write) { - if (!IS_ERR(handle)) - ext4_journal_stop(handle); + ext4_journal_stop(handle); + /* Handling synchronous page fault? */ + if (result & VM_FAULT_NEEDDSYNC) + result = dax_finish_sync_fault(vmf, pe_size, pfn); up_read(&EXT4_I(inode)->i_mmap_sem); sb_end_pagefault(sb); } else { @@ -351,6 +357,13 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) return -EIO; + /* + * We don't support synchronous mappings for non-DAX files. At least + * until someone comes with a sensible use case. + */ + if (!IS_DAX(file_inode(file)) && (vma->vm_flags & VM_SYNC)) + return -EOPNOTSUPP; + file_accessed(file); if (IS_DAX(file_inode(file))) { vma->vm_ops = &ext4_dax_vm_ops; @@ -469,6 +482,7 @@ const struct file_operations ext4_file_operations = { .compat_ioctl = ext4_compat_ioctl, #endif .mmap = ext4_file_mmap, + .mmap_supported_flags = MAP_SYNC, .open = ext4_file_open, .release = ext4_release_file, .fsync = ext4_sync_file, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8d2b582fb141..7df2c5644e59 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2742,7 +2742,7 @@ static int ext4_writepages(struct address_space *mapping, * If the filesystem has aborted, it is read-only, so return * right away instead of dumping stack traces later on that * will obscure the real source of the problem. We test - * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because + * EXT4_MF_FS_ABORTED instead of sb->s_flag's SB_RDONLY because * the latter could be true if the filesystem is mounted * read-only, and in that case, ext4_writepages should * *never* be called, so if that ever happens, we would want @@ -3384,6 +3384,19 @@ static int ext4_releasepage(struct page *page, gfp_t wait) return try_to_free_buffers(page); } +static bool ext4_inode_datasync_dirty(struct inode *inode) +{ + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; + + if (journal) + return !jbd2_transaction_committed(journal, + EXT4_I(inode)->i_datasync_tid); + /* Any metadata buffers to write? */ + if (!list_empty(&inode->i_mapping->private_list)) + return true; + return inode->i_state & I_DIRTY_DATASYNC; +} + static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap) { @@ -3497,6 +3510,8 @@ retry: } iomap->flags = 0; + if (ext4_inode_datasync_dirty(inode)) + iomap->flags |= IOMAP_F_DIRTY; iomap->bdev = inode->i_sb->s_bdev; iomap->dax_dev = sbi->s_daxdev; iomap->offset = first_block << blkbits; @@ -5168,7 +5183,7 @@ static int ext4_do_update_inode(handle_t *handle, ext4_inode_csum_set(inode, raw_inode, ei); spin_unlock(&ei->i_raw_lock); - if (inode->i_sb->s_flags & MS_LAZYTIME) + if (inode->i_sb->s_flags & SB_LAZYTIME) ext4_update_other_inodes_time(inode->i_sb, inode->i_ino, bh->b_data); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index b7558f292420..1eec25014f62 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -592,6 +592,44 @@ static int ext4_ioc_getfsmap(struct super_block *sb, return 0; } +static long ext4_ioctl_group_add(struct file *file, + struct ext4_new_group_data *input) +{ + struct super_block *sb = file_inode(file)->i_sb; + int err, err2=0; + + err = ext4_resize_begin(sb); + if (err) + return err; + + if (ext4_has_feature_bigalloc(sb)) { + ext4_msg(sb, KERN_ERR, + "Online resizing not supported with bigalloc"); + err = -EOPNOTSUPP; + goto group_add_out; + } + + err = mnt_want_write_file(file); + if (err) + goto group_add_out; + + err = ext4_group_add(sb, input); + if (EXT4_SB(sb)->s_journal) { + jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); + err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + } + if (err == 0) + err = err2; + mnt_drop_write_file(file); + if (!err && ext4_has_group_desc_csum(sb) && + test_opt(sb, INIT_INODE_TABLE)) + err = ext4_register_li_request(sb, input->group); +group_add_out: + ext4_resize_end(sb); + return err; +} + long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -776,44 +814,12 @@ mext_out: case EXT4_IOC_GROUP_ADD: { struct ext4_new_group_data input; - int err, err2=0; - - err = ext4_resize_begin(sb); - if (err) - return err; if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg, - sizeof(input))) { - err = -EFAULT; - goto group_add_out; - } - - if (ext4_has_feature_bigalloc(sb)) { - ext4_msg(sb, KERN_ERR, - "Online resizing not supported with bigalloc"); - err = -EOPNOTSUPP; - goto group_add_out; - } - - err = mnt_want_write_file(filp); - if (err) - goto group_add_out; + sizeof(input))) + return -EFAULT; - err = ext4_group_add(sb, &input); - if (EXT4_SB(sb)->s_journal) { - jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); - err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); - jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); - } - if (err == 0) - err = err2; - mnt_drop_write_file(filp); - if (!err && ext4_has_group_desc_csum(sb) && - test_opt(sb, INIT_INODE_TABLE)) - err = ext4_register_li_request(sb, input.group); -group_add_out: - ext4_resize_end(sb); - return err; + return ext4_ioctl_group_add(filp, &input); } case EXT4_IOC_MIGRATE: @@ -1078,8 +1084,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) break; case EXT4_IOC32_GROUP_ADD: { struct compat_ext4_new_group_input __user *uinput; - struct ext4_new_group_input input; - mm_segment_t old_fs; + struct ext4_new_group_data input; int err; uinput = compat_ptr(arg); @@ -1092,12 +1097,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) &uinput->reserved_blocks); if (err) return -EFAULT; - old_fs = get_fs(); - set_fs(KERNEL_DS); - err = ext4_ioctl(file, EXT4_IOC_GROUP_ADD, - (unsigned long) &input); - set_fs(old_fs); - return err; + return ext4_ioctl_group_add(file, &input); } case EXT4_IOC_MOVE_EXT: case EXT4_IOC_RESIZE_FS: diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 0556cd036b69..7c46693a14d7 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -422,7 +422,7 @@ static void ext4_handle_error(struct super_block *sb) * before ->s_flags update */ smp_wmb(); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } if (test_opt(sb, ERRORS_PANIC)) { if (EXT4_SB(sb)->s_journal && @@ -635,7 +635,7 @@ void __ext4_abort(struct super_block *sb, const char *function, * before ->s_flags update */ smp_wmb(); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; if (EXT4_SB(sb)->s_journal) jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); save_error_info(sb, function, line); @@ -1682,10 +1682,10 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, sb->s_flags |= SB_I_VERSION; return 1; case Opt_lazytime: - sb->s_flags |= MS_LAZYTIME; + sb->s_flags |= SB_LAZYTIME; return 1; case Opt_nolazytime: - sb->s_flags &= ~MS_LAZYTIME; + sb->s_flags &= ~SB_LAZYTIME; return 1; } @@ -2116,7 +2116,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) { ext4_msg(sb, KERN_ERR, "revision level too high, " "forcing read-only mode"); - res = MS_RDONLY; + res = SB_RDONLY; } if (read_only) goto done; @@ -2429,7 +2429,7 @@ static void ext4_orphan_cleanup(struct super_block *sb, if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { /* don't clear list on RO mount w/ errors */ - if (es->s_last_orphan && !(s_flags & MS_RDONLY)) { + if (es->s_last_orphan && !(s_flags & SB_RDONLY)) { ext4_msg(sb, KERN_INFO, "Errors on filesystem, " "clearing orphan list.\n"); es->s_last_orphan = 0; @@ -2438,19 +2438,19 @@ static void ext4_orphan_cleanup(struct super_block *sb, return; } - if (s_flags & MS_RDONLY) { + if (s_flags & SB_RDONLY) { ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs"); - sb->s_flags &= ~MS_RDONLY; + sb->s_flags &= ~SB_RDONLY; } #ifdef CONFIG_QUOTA /* Needed for iput() to work correctly and not trash data */ - sb->s_flags |= MS_ACTIVE; + sb->s_flags |= SB_ACTIVE; /* * Turn on quotas which were not enabled for read-only mounts if * filesystem has quota feature, so that they are updated correctly. */ - if (ext4_has_feature_quota(sb) && (s_flags & MS_RDONLY)) { + if (ext4_has_feature_quota(sb) && (s_flags & SB_RDONLY)) { int ret = ext4_enable_quotas(sb); if (!ret) @@ -2539,7 +2539,7 @@ static void ext4_orphan_cleanup(struct super_block *sb, } } #endif - sb->s_flags = s_flags; /* Restore MS_RDONLY status */ + sb->s_flags = s_flags; /* Restore SB_RDONLY status */ } /* @@ -2741,7 +2741,7 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly) if (ext4_has_feature_readonly(sb)) { ext4_msg(sb, KERN_INFO, "filesystem is read-only"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; return 1; } @@ -3623,8 +3623,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_iflags |= SB_I_CGROUPWB; } - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | - (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); + sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | + (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0); if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV && (ext4_has_compat_features(sb) || @@ -4199,7 +4199,7 @@ no_journal: } if (ext4_setup_super(sb, es, sb_rdonly(sb))) - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; /* determine the minimum size of new large inodes, if present */ if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE && @@ -4693,7 +4693,7 @@ static int ext4_commit_super(struct super_block *sb, int sync) * the clock is set in the future, and this will cause e2fsck * to complain and force a full file system check. */ - if (!(sb->s_flags & MS_RDONLY)) + if (!(sb->s_flags & SB_RDONLY)) es->s_wtime = cpu_to_le32(get_seconds()); if (sb->s_bdev->bd_part) es->s_kbytes_written = @@ -5047,8 +5047,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) ext4_abort(sb, "Abort forced by user"); - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | - (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); + sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | + (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0); es = sbi->s_es; @@ -5057,16 +5057,16 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); } - if (*flags & MS_LAZYTIME) - sb->s_flags |= MS_LAZYTIME; + if (*flags & SB_LAZYTIME) + sb->s_flags |= SB_LAZYTIME; - if ((bool)(*flags & MS_RDONLY) != sb_rdonly(sb)) { + if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) { if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) { err = -EROFS; goto restore_opts; } - if (*flags & MS_RDONLY) { + if (*flags & SB_RDONLY) { err = sync_filesystem(sb); if (err < 0) goto restore_opts; @@ -5078,7 +5078,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) * First of all, the unconditional stuff we have to do * to disable replay of the journal when we next remount */ - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; /* * OK, test if we are remounting a valid rw partition @@ -5140,7 +5140,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) ext4_clear_journal_err(sb, es); sbi->s_mount_state = le16_to_cpu(es->s_state); if (!ext4_setup_super(sb, es, 0)) - sb->s_flags &= ~MS_RDONLY; + sb->s_flags &= ~SB_RDONLY; if (ext4_has_feature_mmp(sb)) if (ext4_multi_mount_protect(sb, le64_to_cpu(es->s_mmp_block))) { @@ -5164,7 +5164,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) } ext4_setup_system_zone(sb); - if (sbi->s_journal == NULL && !(old_sb_flags & MS_RDONLY)) + if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY)) ext4_commit_super(sb, 1); #ifdef CONFIG_QUOTA @@ -5182,7 +5182,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) } #endif - *flags = (*flags & ~MS_LAZYTIME) | (sb->s_flags & MS_LAZYTIME); + *flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME); ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data); kfree(orig_data); return 0; diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index dd2e73e10857..4aa69bc1c70a 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -617,17 +617,17 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) if (!is_set_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG)) return 0; - if (s_flags & MS_RDONLY) { + if (s_flags & SB_RDONLY) { f2fs_msg(sbi->sb, KERN_INFO, "orphan cleanup on readonly fs"); - sbi->sb->s_flags &= ~MS_RDONLY; + sbi->sb->s_flags &= ~SB_RDONLY; } #ifdef CONFIG_QUOTA /* Needed for iput() to work correctly and not trash data */ - sbi->sb->s_flags |= MS_ACTIVE; + sbi->sb->s_flags |= SB_ACTIVE; /* Turn on quotas so that they are updated correctly */ - quota_enabled = f2fs_enable_quota_files(sbi, s_flags & MS_RDONLY); + quota_enabled = f2fs_enable_quota_files(sbi, s_flags & SB_RDONLY); #endif start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi); @@ -658,7 +658,7 @@ out: if (quota_enabled) f2fs_quota_off_umount(sbi->sb); #endif - sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */ + sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */ return err; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f4e094e816c6..6abf26c31d01 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2378,7 +2378,7 @@ static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) static inline int f2fs_readonly(struct super_block *sb) { - return sb->s_flags & MS_RDONLY; + return sb->s_flags & SB_RDONLY; } static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 5d5bba462f26..d844dcb80570 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1005,7 +1005,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, cpc.reason = __get_cp_reason(sbi); gc_more: - if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) { + if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) { ret = -EINVAL; goto stop; } diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 92c57ace1939..b3a14b0429f2 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -598,16 +598,16 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) int quota_enabled; #endif - if (s_flags & MS_RDONLY) { + if (s_flags & SB_RDONLY) { f2fs_msg(sbi->sb, KERN_INFO, "orphan cleanup on readonly fs"); - sbi->sb->s_flags &= ~MS_RDONLY; + sbi->sb->s_flags &= ~SB_RDONLY; } #ifdef CONFIG_QUOTA /* Needed for iput() to work correctly and not trash data */ - sbi->sb->s_flags |= MS_ACTIVE; + sbi->sb->s_flags |= SB_ACTIVE; /* Turn on quotas so that they are updated correctly */ - quota_enabled = f2fs_enable_quota_files(sbi, s_flags & MS_RDONLY); + quota_enabled = f2fs_enable_quota_files(sbi, s_flags & SB_RDONLY); #endif fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", @@ -671,7 +671,7 @@ out: if (quota_enabled) f2fs_quota_off_umount(sbi->sb); #endif - sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */ + sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */ return ret ? ret: err; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a6c5dd450002..708155d9c2e4 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -534,10 +534,10 @@ static int parse_options(struct super_block *sb, char *options) #endif break; case Opt_lazytime: - sb->s_flags |= MS_LAZYTIME; + sb->s_flags |= SB_LAZYTIME; break; case Opt_nolazytime: - sb->s_flags &= ~MS_LAZYTIME; + sb->s_flags &= ~SB_LAZYTIME; break; #ifdef CONFIG_QUOTA case Opt_quota: @@ -1168,7 +1168,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, INLINE_DENTRY); set_opt(sbi, EXTENT_CACHE); set_opt(sbi, NOHEAP); - sbi->sb->s_flags |= MS_LAZYTIME; + sbi->sb->s_flags |= SB_LAZYTIME; set_opt(sbi, FLUSH_MERGE); if (f2fs_sb_mounted_blkzoned(sbi->sb)) { set_opt_mode(sbi, F2FS_MOUNT_LFS); @@ -1236,7 +1236,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) #endif /* recover superblocks we couldn't write due to previous RO mount */ - if (!(*flags & MS_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) { + if (!(*flags & SB_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) { err = f2fs_commit_super(sbi, false); f2fs_msg(sb, KERN_INFO, "Try to recover all the superblocks, ret: %d", err); @@ -1255,17 +1255,17 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) * Previous and new state of filesystem is RO, * so skip checking GC and FLUSH_MERGE conditions. */ - if (f2fs_readonly(sb) && (*flags & MS_RDONLY)) + if (f2fs_readonly(sb) && (*flags & SB_RDONLY)) goto skip; #ifdef CONFIG_QUOTA - if (!f2fs_readonly(sb) && (*flags & MS_RDONLY)) { + if (!f2fs_readonly(sb) && (*flags & SB_RDONLY)) { err = dquot_suspend(sb, -1); if (err < 0) goto restore_opts; } else { /* dquot_resume needs RW */ - sb->s_flags &= ~MS_RDONLY; + sb->s_flags &= ~SB_RDONLY; if (sb_any_quota_suspended(sb)) { dquot_resume(sb, -1); } else if (f2fs_sb_has_quota_ino(sb)) { @@ -1288,7 +1288,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) * or if background_gc = off is passed in mount * option. Also sync the filesystem. */ - if ((*flags & MS_RDONLY) || !test_opt(sbi, BG_GC)) { + if ((*flags & SB_RDONLY) || !test_opt(sbi, BG_GC)) { if (sbi->gc_thread) { stop_gc_thread(sbi); need_restart_gc = true; @@ -1300,7 +1300,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) need_stop_gc = true; } - if (*flags & MS_RDONLY) { + if (*flags & SB_RDONLY) { writeback_inodes_sb(sb, WB_REASON_SYNC); sync_inodes_sb(sb); @@ -1314,7 +1314,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) * We stop issue flush thread if FS is mounted as RO * or if flush_merge is not passed in mount option. */ - if ((*flags & MS_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) { + if ((*flags & SB_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) { clear_opt(sbi, FLUSH_MERGE); destroy_flush_cmd_control(sbi, false); } else { @@ -1329,8 +1329,8 @@ skip: kfree(s_qf_names[i]); #endif /* Update the POSIXACL Flag */ - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | - (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0); + sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | + (test_opt(sbi, POSIX_ACL) ? SB_POSIXACL : 0); return 0; restore_gc: @@ -2472,8 +2472,8 @@ try_onemore: sb->s_export_op = &f2fs_export_ops; sb->s_magic = F2FS_SUPER_MAGIC; sb->s_time_gran = 1; - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | - (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0); + sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | + (test_opt(sbi, POSIX_ACL) ? SB_POSIXACL : 0); memcpy(&sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid)); /* init f2fs-specific super block info */ diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 81cecbe6d7cf..b833ffeee1e1 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -291,7 +291,6 @@ static int fat_parse_long(struct inode *dir, loff_t *pos, } } parse_long: - slots = 0; ds = (struct msdos_dir_slot *)*de; id = ds->id; if (!(id & 0x40)) diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index 48b2336692f9..bac10de678cc 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -392,7 +392,7 @@ static int fat_mirror_bhs(struct super_block *sb, struct buffer_head **bhs, memcpy(c_bh->b_data, bhs[n]->b_data, sb->s_blocksize); set_buffer_uptodate(c_bh); mark_buffer_dirty_inode(c_bh, sbi->fat_inode); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) err = sync_dirty_buffer(c_bh); brelse(c_bh); if (err) @@ -597,7 +597,7 @@ int fat_free_clusters(struct inode *inode, int cluster) } if (nr_bhs + fatent.nr_bhs > MAX_BUF_PER_PAGE) { - if (sb->s_flags & MS_SYNCHRONOUS) { + if (sb->s_flags & SB_SYNCHRONOUS) { err = fat_sync_bhs(bhs, nr_bhs); if (err) goto error; @@ -612,7 +612,7 @@ int fat_free_clusters(struct inode *inode, int cluster) fat_collect_bhs(bhs, &nr_bhs, &fatent); } while (cluster != FAT_ENT_EOF); - if (sb->s_flags & MS_SYNCHRONOUS) { + if (sb->s_flags & SB_SYNCHRONOUS) { err = fat_sync_bhs(bhs, nr_bhs); if (err) goto error; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 30c52394a7ad..20a0a89eaca5 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -779,14 +779,14 @@ static void __exit fat_destroy_inodecache(void) static int fat_remount(struct super_block *sb, int *flags, char *data) { - int new_rdonly; + bool new_rdonly; struct msdos_sb_info *sbi = MSDOS_SB(sb); - *flags |= MS_NODIRATIME | (sbi->options.isvfat ? 0 : MS_NOATIME); + *flags |= SB_NODIRATIME | (sbi->options.isvfat ? 0 : SB_NOATIME); sync_filesystem(sb); /* make sure we update state on remount. */ - new_rdonly = *flags & MS_RDONLY; + new_rdonly = *flags & SB_RDONLY; if (new_rdonly != sb_rdonly(sb)) { if (new_rdonly) fat_set_state(sb, 0, 0); @@ -1352,7 +1352,7 @@ out: if (opts->unicode_xlate) opts->utf8 = 0; if (opts->nfs == FAT_NFS_NOSTALE_RO) { - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; sb->s_export_op = &fat_export_ops_nostale; } @@ -1608,7 +1608,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat, return -ENOMEM; sb->s_fs_info = sbi; - sb->s_flags |= MS_NODIRATIME; + sb->s_flags |= SB_NODIRATIME; sb->s_magic = MSDOS_SUPER_MAGIC; sb->s_op = &fat_sops; sb->s_export_op = &fat_export_ops; diff --git a/fs/fat/misc.c b/fs/fat/misc.c index acc3aa30ee54..f9bdc1e01c98 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -33,7 +33,7 @@ void __fat_fs_error(struct super_block *sb, int report, const char *fmt, ...) if (opts->errors == FAT_ERRORS_PANIC) panic("FAT-fs (%s): fs panic from previous error\n", sb->s_id); else if (opts->errors == FAT_ERRORS_RO && !sb_rdonly(sb)) { - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; fat_msg(sb, KERN_ERR, "Filesystem has been set read-only"); } } diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index 7d6a105d601b..d24d2758a363 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -646,7 +646,7 @@ static void setup(struct super_block *sb) { MSDOS_SB(sb)->dir_ops = &msdos_dir_inode_operations; sb->s_d_op = &msdos_dentry_operations; - sb->s_flags |= MS_NOATIME; + sb->s_flags |= SB_NOATIME; } static int msdos_fill_super(struct super_block *sb, void *data, int silent) diff --git a/fs/fcntl.c b/fs/fcntl.c index 30f47d0f74a0..0522e283a4f4 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -563,6 +563,9 @@ static int put_compat_flock64(const struct flock *kfl, struct compat_flock64 __u { struct compat_flock64 fl; + BUILD_BUG_ON(sizeof(kfl->l_start) > sizeof(ufl->l_start)); + BUILD_BUG_ON(sizeof(kfl->l_len) > sizeof(ufl->l_len)); + memset(&fl, 0, sizeof(struct compat_flock64)); copy_flock_fields(&fl, kfl); if (copy_to_user(ufl, &fl, sizeof(struct compat_flock64))) @@ -632,9 +635,8 @@ COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, if (err) break; err = fixup_compat_flock(&flock); - if (err) - return err; - err = put_compat_flock(&flock, compat_ptr(arg)); + if (!err) + err = put_compat_flock(&flock, compat_ptr(arg)); break; case F_GETLK64: case F_OFD_GETLK: @@ -642,12 +644,8 @@ COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, if (err) break; err = fcntl_getlk(f.file, convert_fcntl_cmd(cmd), &flock); - if (err) - break; - err = fixup_compat_flock(&flock); - if (err) - return err; - err = put_compat_flock64(&flock, compat_ptr(arg)); + if (!err) + err = put_compat_flock64(&flock, compat_ptr(arg)); break; case F_SETLK: case F_SETLKW: diff --git a/fs/fhandle.c b/fs/fhandle.c index 474adc8d2a3a..0ace128f5d23 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -213,8 +213,8 @@ out_err: return retval; } -long do_handle_open(int mountdirfd, - struct file_handle __user *ufh, int open_flag) +static long do_handle_open(int mountdirfd, struct file_handle __user *ufh, + int open_flag) { long retval = 0; struct path path; diff --git a/fs/file.c b/fs/file.c index 4eecbf4244a5..3b080834b870 100644 --- a/fs/file.c +++ b/fs/file.c @@ -593,13 +593,16 @@ void __fd_install(struct files_struct *files, unsigned int fd, { struct fdtable *fdt; - might_sleep(); rcu_read_lock_sched(); - while (unlikely(files->resize_in_progress)) { + if (unlikely(files->resize_in_progress)) { rcu_read_unlock_sched(); - wait_event(files->resize_wait, !files->resize_in_progress); - rcu_read_lock_sched(); + spin_lock(&files->file_lock); + fdt = files_fdtable(files); + BUG_ON(fdt->fd[fd] != NULL); + rcu_assign_pointer(fdt->fd[fd], file); + spin_unlock(&files->file_lock); + return; } /* coupled with smp_wmb() in expand_fdtable() */ smp_rmb(); @@ -632,7 +635,6 @@ int __close_fd(struct files_struct *files, unsigned fd) if (!file) goto out_unlock; rcu_assign_pointer(fdt->fd[fd], NULL); - __clear_close_on_exec(fd, fdt); __put_unused_fd(files, fd); spin_unlock(&files->file_lock); return filp_close(file, files); diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index 455ce5b77e9b..f989efa051a0 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c @@ -116,7 +116,7 @@ vxfs_statfs(struct dentry *dentry, struct kstatfs *bufp) static int vxfs_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - *flags |= MS_RDONLY; + *flags |= SB_RDONLY; return 0; } @@ -220,7 +220,7 @@ static int vxfs_fill_super(struct super_block *sbp, void *dp, int silent) int ret = -EINVAL; u32 j; - sbp->s_flags |= MS_RDONLY; + sbp->s_flags |= SB_RDONLY; infp = kzalloc(sizeof(*infp), GFP_KERNEL); if (!infp) { diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 08f5debd07d1..cea4836385b7 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -490,7 +490,7 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) /* while holding I_WB_SWITCH, no one else can update the association */ spin_lock(&inode->i_lock); - if (!(inode->i_sb->s_flags & MS_ACTIVE) || + if (!(inode->i_sb->s_flags & SB_ACTIVE) || inode->i_state & (I_WB_SWITCH | I_FREEING) || inode_to_wb(inode) == isw->new_wb) { spin_unlock(&inode->i_lock); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 2f504d615d92..624f18bbfd2b 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -130,7 +130,7 @@ static void fuse_evict_inode(struct inode *inode) { truncate_inode_pages_final(&inode->i_data); clear_inode(inode); - if (inode->i_sb->s_flags & MS_ACTIVE) { + if (inode->i_sb->s_flags & SB_ACTIVE) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup); @@ -141,7 +141,7 @@ static void fuse_evict_inode(struct inode *inode) static int fuse_remount_fs(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - if (*flags & MS_MANDLOCK) + if (*flags & SB_MANDLOCK) return -EINVAL; return 0; @@ -1056,10 +1056,10 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) int is_bdev = sb->s_bdev != NULL; err = -EINVAL; - if (sb->s_flags & MS_MANDLOCK) + if (sb->s_flags & SB_MANDLOCK) goto err; - sb->s_flags &= ~(MS_NOSEC | SB_I_VERSION); + sb->s_flags &= ~(SB_NOSEC | SB_I_VERSION); if (!parse_fuse_opt(data, &d, is_bdev)) goto err; @@ -1109,9 +1109,9 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) goto err_dev_free; /* Handle umasking inside the fuse code */ - if (sb->s_flags & MS_POSIXACL) + if (sb->s_flags & SB_POSIXACL) fc->dont_mask = 1; - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; fc->default_permissions = d.default_permissions; fc->allow_other = d.allow_other; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index a3711f543405..ad55eb86a250 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1065,15 +1065,15 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent sdp->sd_args = *args; if (sdp->sd_args.ar_spectator) { - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; set_bit(SDF_RORECOVERY, &sdp->sd_flags); } if (sdp->sd_args.ar_posix_acl) - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; if (sdp->sd_args.ar_nobarrier) set_bit(SDF_NOBARRIERS, &sdp->sd_flags); - sb->s_flags |= MS_NOSEC; + sb->s_flags |= SB_NOSEC; sb->s_magic = GFS2_MAGIC; sb->s_op = &gfs2_super_ops; sb->s_d_op = &gfs2_dops; @@ -1257,7 +1257,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, struct gfs2_args args; struct gfs2_sbd *sdp; - if (!(flags & MS_RDONLY)) + if (!(flags & SB_RDONLY)) mode |= FMODE_WRITE; bdev = blkdev_get_by_path(dev_name, mode, fs_type); @@ -1313,15 +1313,15 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, if (s->s_root) { error = -EBUSY; - if ((flags ^ s->s_flags) & MS_RDONLY) + if ((flags ^ s->s_flags) & SB_RDONLY) goto error_super; } else { snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); sb_set_blocksize(s, block_size(bdev)); - error = fill_super(s, &args, flags & MS_SILENT ? 1 : 0); + error = fill_super(s, &args, flags & SB_SILENT ? 1 : 0); if (error) goto error_super; - s->s_flags |= MS_ACTIVE; + s->s_flags |= SB_ACTIVE; bdev->bd_super = s; } @@ -1365,7 +1365,7 @@ static struct dentry *gfs2_mount_meta(struct file_system_type *fs_type, pr_warn("gfs2 mount does not exist\n"); return ERR_CAST(s); } - if ((flags ^ s->s_flags) & MS_RDONLY) { + if ((flags ^ s->s_flags) & SB_RDONLY) { deactivate_locked_super(s); return ERR_PTR(-EBUSY); } diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 9cb5c9a97d69..d81d46e19726 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1256,10 +1256,10 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) return -EINVAL; if (sdp->sd_args.ar_spectator) - *flags |= MS_RDONLY; + *flags |= SB_RDONLY; - if ((sb->s_flags ^ *flags) & MS_RDONLY) { - if (*flags & MS_RDONLY) + if ((sb->s_flags ^ *flags) & SB_RDONLY) { + if (*flags & SB_RDONLY) error = gfs2_make_fs_ro(sdp); else error = gfs2_make_fs_rw(sdp); @@ -1269,9 +1269,9 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) sdp->sd_args = args; if (sdp->sd_args.ar_posix_acl) - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; else - sb->s_flags &= ~MS_POSIXACL; + sb->s_flags &= ~SB_POSIXACL; if (sdp->sd_args.ar_nobarrier) set_bit(SDF_NOBARRIERS, &sdp->sd_flags); else diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index a85ca8b2c9ba..ca8b72d0a831 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -117,7 +117,7 @@ void gfs2_trans_end(struct gfs2_sbd *sdp) kfree(tr); up_read(&sdp->sd_log_flush_lock); - if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS) + if (sdp->sd_vfs->s_flags & SB_SYNCHRONOUS) gfs2_log_flush(sdp, NULL, NORMAL_FLUSH); if (alloced) sb_end_intwrite(sdp->sd_vfs); diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c index 8aec5e732abf..b63a4df7327b 100644 --- a/fs/hfs/bnode.c +++ b/fs/hfs/bnode.c @@ -98,13 +98,11 @@ void hfs_bnode_clear(struct hfs_bnode *node, int off, int len) void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, struct hfs_bnode *src_node, int src, int len) { - struct hfs_btree *tree; struct page *src_page, *dst_page; hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len); if (!len) return; - tree = src_node->tree; src += src_node->page_offset; dst += dst_node->page_offset; src_page = src_node->page[0]; @@ -237,7 +235,6 @@ struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid) static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) { - struct super_block *sb; struct hfs_bnode *node, *node2; struct address_space *mapping; struct page *page; @@ -249,7 +246,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) return NULL; } - sb = tree->inode->i_sb; size = sizeof(struct hfs_bnode) + tree->pages_per_bnode * sizeof(struct page *); node = kzalloc(size, GFP_KERNEL); diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c index 894994d2c885..460281b1299e 100644 --- a/fs/hfs/mdb.c +++ b/fs/hfs/mdb.c @@ -204,11 +204,11 @@ int hfs_mdb_get(struct super_block *sb) attrib = mdb->drAtrb; if (!(attrib & cpu_to_be16(HFS_SB_ATTRIB_UNMNT))) { pr_warn("filesystem was not cleanly unmounted, running fsck.hfs is recommended. mounting read-only.\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } if ((attrib & cpu_to_be16(HFS_SB_ATTRIB_SLOCK))) { pr_warn("filesystem is marked locked, mounting read-only.\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } if (!sb_rdonly(sb)) { /* Mark the volume uncleanly unmounted in case we crash */ diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 7e0d65e9586c..173876782f73 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -114,18 +114,18 @@ static int hfs_statfs(struct dentry *dentry, struct kstatfs *buf) static int hfs_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - *flags |= MS_NODIRATIME; - if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb)) + *flags |= SB_NODIRATIME; + if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) return 0; - if (!(*flags & MS_RDONLY)) { + if (!(*flags & SB_RDONLY)) { if (!(HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_UNMNT))) { pr_warn("filesystem was not cleanly unmounted, running fsck.hfs is recommended. leaving read-only.\n"); - sb->s_flags |= MS_RDONLY; - *flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; + *flags |= SB_RDONLY; } else if (HFS_SB(sb)->mdb->drAtrb & cpu_to_be16(HFS_SB_ATTRIB_SLOCK)) { pr_warn("filesystem is marked locked, leaving read-only.\n"); - sb->s_flags |= MS_RDONLY; - *flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; + *flags |= SB_RDONLY; } } return 0; @@ -407,7 +407,7 @@ static int hfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_op = &hfs_super_operations; sb->s_xattr = hfs_xattr_handlers; - sb->s_flags |= MS_NODIRATIME; + sb->s_flags |= SB_NODIRATIME; mutex_init(&sbi->bitmap_lock); res = hfs_mdb_get(sb); diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index d77015c3f22c..177fae4e6581 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -127,14 +127,12 @@ void hfs_bnode_clear(struct hfs_bnode *node, int off, int len) void hfs_bnode_copy(struct hfs_bnode *dst_node, int dst, struct hfs_bnode *src_node, int src, int len) { - struct hfs_btree *tree; struct page **src_page, **dst_page; int l; hfs_dbg(BNODE_MOD, "copybytes: %u,%u,%u\n", dst, src, len); if (!len) return; - tree = src_node->tree; src += src_node->page_offset; dst += dst_node->page_offset; src_page = src_node->page + (src >> PAGE_SHIFT); @@ -401,7 +399,6 @@ struct hfs_bnode *hfs_bnode_findhash(struct hfs_btree *tree, u32 cnid) static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) { - struct super_block *sb; struct hfs_bnode *node, *node2; struct address_space *mapping; struct page *page; @@ -414,7 +411,6 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid) return NULL; } - sb = tree->inode->i_sb; size = sizeof(struct hfs_bnode) + tree->pages_per_bnode * sizeof(struct page *); node = kzalloc(size, GFP_KERNEL); diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index e5bb2de2262a..1d458b716957 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -329,9 +329,9 @@ static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf) static int hfsplus_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb)) + if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) return 0; - if (!(*flags & MS_RDONLY)) { + if (!(*flags & SB_RDONLY)) { struct hfsplus_vh *vhdr = HFSPLUS_SB(sb)->s_vhdr; int force = 0; @@ -340,20 +340,20 @@ static int hfsplus_remount(struct super_block *sb, int *flags, char *data) if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) { pr_warn("filesystem was not cleanly unmounted, running fsck.hfsplus is recommended. leaving read-only.\n"); - sb->s_flags |= MS_RDONLY; - *flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; + *flags |= SB_RDONLY; } else if (force) { /* nothing */ } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { pr_warn("filesystem is marked locked, leaving read-only.\n"); - sb->s_flags |= MS_RDONLY; - *flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; + *flags |= SB_RDONLY; } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) { pr_warn("filesystem is marked journaled, leaving read-only.\n"); - sb->s_flags |= MS_RDONLY; - *flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; + *flags |= SB_RDONLY; } } return 0; @@ -455,16 +455,16 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) if (!(vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_UNMNT))) { pr_warn("Filesystem was not cleanly unmounted, running fsck.hfsplus is recommended. mounting read-only.\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } else if (test_and_clear_bit(HFSPLUS_SB_FORCE, &sbi->flags)) { /* nothing */ } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { pr_warn("Filesystem is marked locked, mounting read-only.\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } else if ((vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) && !sb_rdonly(sb)) { pr_warn("write access to a journaled filesystem is not supported, use the force option at your own risk, mounting read-only.\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } err = -EINVAL; diff --git a/fs/hpfs/map.c b/fs/hpfs/map.c index e0e60b148400..7c49f1ef0c85 100644 --- a/fs/hpfs/map.c +++ b/fs/hpfs/map.c @@ -288,7 +288,7 @@ struct dnode *hpfs_map_dnode(struct super_block *s, unsigned secno, goto bail; } if (((31 + de->namelen + de->down*4 + 3) & ~3) != le16_to_cpu(de->length)) { - if (((31 + de->namelen + de->down*4 + 3) & ~3) < le16_to_cpu(de->length) && s->s_flags & MS_RDONLY) goto ok; + if (((31 + de->namelen + de->down*4 + 3) & ~3) < le16_to_cpu(de->length) && s->s_flags & SB_RDONLY) goto ok; hpfs_error(s, "namelen does not match dirent size in dnode %08x, dirent %03x, last %03x", secno, p, pp); goto bail; } diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 1516fb4e28f4..c45a3b9b9ac7 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -78,7 +78,7 @@ void hpfs_error(struct super_block *s, const char *fmt, ...) else { pr_cont("; remounting read-only\n"); mark_dirty(s, 0); - s->s_flags |= MS_RDONLY; + s->s_flags |= SB_RDONLY; } } else if (sb_rdonly(s)) pr_cont("; going on - but anything won't be destroyed because it's read-only\n"); @@ -457,7 +457,7 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data) sync_filesystem(s); - *flags |= MS_NOATIME; + *flags |= SB_NOATIME; hpfs_lock(s); uid = sbi->sb_uid; gid = sbi->sb_gid; @@ -488,7 +488,7 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data) sbi->sb_eas = eas; sbi->sb_chk = chk; sbi->sb_chkdsk = chkdsk; sbi->sb_err = errs; sbi->sb_timeshift = timeshift; - if (!(*flags & MS_RDONLY)) mark_dirty(s, 1); + if (!(*flags & SB_RDONLY)) mark_dirty(s, 1); hpfs_unlock(s); return 0; @@ -614,7 +614,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent) goto bail4; } - s->s_flags |= MS_NOATIME; + s->s_flags |= SB_NOATIME; /* Fill superblock stuff */ s->s_magic = HPFS_SUPER_MAGIC; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 1e76730aac0d..8a85f3f53446 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -639,11 +639,11 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, mutex_unlock(&hugetlb_fault_mutex_table[hash]); /* - * page_put due to reference from alloc_huge_page() * unlock_page because locked by add_to_page_cache() + * page_put due to reference from alloc_huge_page() */ - put_page(page); unlock_page(page); + put_page(page); } if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) diff --git a/fs/inode.c b/fs/inode.c index fd401028a309..03102d6ef044 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -416,7 +416,7 @@ void inode_add_lru(struct inode *inode) { if (!(inode->i_state & (I_DIRTY_ALL | I_SYNC | I_FREEING | I_WILL_FREE)) && - !atomic_read(&inode->i_count) && inode->i_sb->s_flags & MS_ACTIVE) + !atomic_read(&inode->i_count) && inode->i_sb->s_flags & SB_ACTIVE) inode_lru_list_add(inode); } @@ -595,7 +595,7 @@ static void dispose_list(struct list_head *head) * @sb: superblock to operate on * * Make sure that no inodes with zero refcount are retained. This is - * called by superblock shutdown after having MS_ACTIVE flag removed, + * called by superblock shutdown after having SB_ACTIVE flag removed, * so any inode reaching zero refcount during or after that call will * be immediately evicted. */ @@ -1492,7 +1492,7 @@ static void iput_final(struct inode *inode) else drop = generic_drop_inode(inode); - if (!drop && (sb->s_flags & MS_ACTIVE)) { + if (!drop && (sb->s_flags & SB_ACTIVE)) { inode_add_lru(inode); spin_unlock(&inode->i_lock); return; @@ -1644,7 +1644,7 @@ int generic_update_time(struct inode *inode, struct timespec *time, int flags) if (flags & S_MTIME) inode->i_mtime = *time; - if (!(inode->i_sb->s_flags & MS_LAZYTIME) || (flags & S_VERSION)) + if (!(inode->i_sb->s_flags & SB_LAZYTIME) || (flags & S_VERSION)) iflags |= I_DIRTY_SYNC; __mark_inode_dirty(inode, iflags); return 0; @@ -1691,7 +1691,7 @@ bool __atime_needs_update(const struct path *path, struct inode *inode, if (IS_NOATIME(inode)) return false; - if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)) + if ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode)) return false; if (mnt->mnt_flags & MNT_NOATIME) diff --git a/fs/internal.h b/fs/internal.h index 48cee21b4f14..df262f41a0ef 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -55,6 +55,7 @@ extern void __init chrdev_init(void); extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *); extern int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *, unsigned int, struct path *); +long do_unlinkat(int dfd, struct filename *name); /* * namespace.c diff --git a/fs/iomap.c b/fs/iomap.c index b9f74803e56c..47d29ccffaef 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -856,6 +856,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, struct bio *bio; bool need_zeroout = false; int nr_pages, ret; + size_t copied = 0; if ((pos | length | align) & ((1 << blkbits) - 1)) return -EINVAL; @@ -867,7 +868,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, /*FALLTHRU*/ case IOMAP_UNWRITTEN: if (!(dio->flags & IOMAP_DIO_WRITE)) { - iov_iter_zero(length, dio->submit.iter); + length = iov_iter_zero(length, dio->submit.iter); dio->size += length; return length; } @@ -904,8 +905,11 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, } do { - if (dio->error) + size_t n; + if (dio->error) { + iov_iter_revert(dio->submit.iter, copied); return 0; + } bio = bio_alloc(GFP_KERNEL, nr_pages); bio_set_dev(bio, iomap->bdev); @@ -918,20 +922,24 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, ret = bio_iov_iter_get_pages(bio, &iter); if (unlikely(ret)) { bio_put(bio); - return ret; + return copied ? copied : ret; } + n = bio->bi_iter.bi_size; if (dio->flags & IOMAP_DIO_WRITE) { bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE); - task_io_account_write(bio->bi_iter.bi_size); + task_io_account_write(n); } else { bio_set_op_attrs(bio, REQ_OP_READ, 0); if (dio->flags & IOMAP_DIO_DIRTY) bio_set_pages_dirty(bio); } - dio->size += bio->bi_iter.bi_size; - pos += bio->bi_iter.bi_size; + iov_iter_advance(dio->submit.iter, n); + + dio->size += n; + pos += n; + copied += n; nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES); @@ -947,9 +955,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, if (pad) iomap_dio_zero(dio, iomap, pos, fs_block_size - pad); } - - iov_iter_advance(dio->submit.iter, length); - return length; + return copied; } ssize_t diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 447a24d77b89..bc258a4402f6 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -114,7 +114,7 @@ static void destroy_inodecache(void) static int isofs_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - if (!(*flags & MS_RDONLY)) + if (!(*flags & SB_RDONLY)) return -EROFS; return 0; } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index d2a85c9720e9..67546c7ad473 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -737,6 +737,23 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) return err; } +/* Return 1 when transaction with given tid has already committed. */ +int jbd2_transaction_committed(journal_t *journal, tid_t tid) +{ + int ret = 1; + + read_lock(&journal->j_state_lock); + if (journal->j_running_transaction && + journal->j_running_transaction->t_tid == tid) + ret = 0; + if (journal->j_committing_transaction && + journal->j_committing_transaction->t_tid == tid) + ret = 0; + read_unlock(&journal->j_state_lock); + return ret; +} +EXPORT_SYMBOL(jbd2_transaction_committed); + /* * When this function returns the transaction corresponding to tid * will be completed. If the transaction has currently running, start diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index e96c6b05e43e..d8c274d39ddb 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -409,10 +409,10 @@ int jffs2_do_remount_fs(struct super_block *sb, int *flags, char *data) mutex_unlock(&c->alloc_sem); } - if (!(*flags & MS_RDONLY)) + if (!(*flags & SB_RDONLY)) jffs2_start_garbage_collect_thread(c); - *flags |= MS_NOATIME; + *flags |= SB_NOATIME; return 0; } diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index 824e61ede465..c2fbec19c616 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h @@ -59,7 +59,7 @@ static inline void jffs2_init_inode_info(struct jffs2_inode_info *f) } -#define jffs2_is_readonly(c) (OFNI_BS_2SFFJ(c)->s_flags & MS_RDONLY) +#define jffs2_is_readonly(c) (OFNI_BS_2SFFJ(c)->s_flags & SB_RDONLY) #define SECTOR_ADDR(x) ( (((unsigned long)(x) / c->sector_size) * c->sector_size) ) #ifndef CONFIG_JFFS2_FS_WRITEBUFFER diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 153f1c6eb169..f60dee7faf03 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -301,10 +301,10 @@ static int jffs2_fill_super(struct super_block *sb, void *data, int silent) sb->s_op = &jffs2_super_operations; sb->s_export_op = &jffs2_export_ops; - sb->s_flags = sb->s_flags | MS_NOATIME; + sb->s_flags = sb->s_flags | SB_NOATIME; sb->s_xattr = jffs2_xattr_handlers; #ifdef CONFIG_JFFS2_FS_POSIX_ACL - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; #endif ret = jffs2_do_fill_super(sb, data, silent); return ret; diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 2f7b3af5b8b7..90373aebfdca 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -87,7 +87,7 @@ static void jfs_handle_error(struct super_block *sb) else if (sbi->flag & JFS_ERR_REMOUNT_RO) { jfs_err("ERROR: (device %s): remounting filesystem as read-only", sb->s_id); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } /* nothing is done for continue beyond marking the superblock dirty */ @@ -477,7 +477,7 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data) return rc; } - if (sb_rdonly(sb) && !(*flags & MS_RDONLY)) { + if (sb_rdonly(sb) && !(*flags & SB_RDONLY)) { /* * Invalidate any previously read metadata. fsck may have * changed the on-disk data since we mounted r/o @@ -488,12 +488,12 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data) ret = jfs_mount_rw(sb, 1); /* mark the fs r/w for quota activity */ - sb->s_flags &= ~MS_RDONLY; + sb->s_flags &= ~SB_RDONLY; dquot_resume(sb, -1); return ret; } - if (!sb_rdonly(sb) && (*flags & MS_RDONLY)) { + if (!sb_rdonly(sb) && (*flags & SB_RDONLY)) { rc = dquot_suspend(sb, -1); if (rc < 0) return rc; @@ -545,7 +545,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) sbi->flag = flag; #ifdef CONFIG_JFS_POSIX_ACL - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; #endif if (newLVSize) { diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 95a7c88baed9..26dd9a50f383 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -335,7 +335,7 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags, deactivate_locked_super(sb); return ERR_PTR(error); } - sb->s_flags |= MS_ACTIVE; + sb->s_flags |= SB_ACTIVE; mutex_lock(&kernfs_mutex); list_add(&info->node, &root->supers); diff --git a/fs/libfs.c b/fs/libfs.c index 3aabe553fc45..7ff3cb904acd 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -246,7 +246,7 @@ struct dentry *mount_pseudo_xattr(struct file_system_type *fs_type, char *name, struct inode *root; struct qstr d_name = QSTR_INIT(name, strlen(name)); - s = sget_userns(fs_type, NULL, set_anon_super, MS_KERNMOUNT|MS_NOUSER, + s = sget_userns(fs_type, NULL, set_anon_super, SB_KERNMOUNT|SB_NOUSER, &init_user_ns, NULL); if (IS_ERR(s)) return ERR_CAST(s); @@ -277,7 +277,7 @@ struct dentry *mount_pseudo_xattr(struct file_system_type *fs_type, char *name, d_instantiate(dentry, root); s->s_root = dentry; s->s_d_op = dops; - s->s_flags |= MS_ACTIVE; + s->s_flags |= SB_ACTIVE; return dget(s->s_root); Enomem: @@ -578,7 +578,7 @@ int simple_pin_fs(struct file_system_type *type, struct vfsmount **mount, int *c spin_lock(&pin_fs_lock); if (unlikely(!*mount)) { spin_unlock(&pin_fs_lock); - mnt = vfs_kern_mount(type, MS_KERNMOUNT, type->name, NULL); + mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL); if (IS_ERR(mnt)) return PTR_ERR(mnt); spin_lock(&pin_fs_lock); diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 0d4e590e0549..826a89184f90 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -578,8 +578,10 @@ static void nlm_complain_hosts(struct net *net) if (ln->nrhosts == 0) return; - printk(KERN_WARNING "lockd: couldn't shutdown host module for net %p!\n", net); - dprintk("lockd: %lu hosts left in net %p:\n", ln->nrhosts, net); + pr_warn("lockd: couldn't shutdown host module for net %x!\n", + net->ns.inum); + dprintk("lockd: %lu hosts left in net %x:\n", ln->nrhosts, + net->ns.inum); } else { if (nrhosts == 0) return; @@ -590,9 +592,9 @@ static void nlm_complain_hosts(struct net *net) for_each_host(host, chain, nlm_server_hosts) { if (net && host->net != net) continue; - dprintk(" %s (cnt %d use %d exp %ld net %p)\n", + dprintk(" %s (cnt %d use %d exp %ld net %x)\n", host->h_name, atomic_read(&host->h_count), - host->h_inuse, host->h_expires, host->net); + host->h_inuse, host->h_expires, host->net->ns.inum); } } @@ -605,7 +607,8 @@ nlm_shutdown_hosts_net(struct net *net) mutex_lock(&nlm_host_mutex); /* First, make all hosts eligible for gc */ - dprintk("lockd: nuking all hosts in net %p...\n", net); + dprintk("lockd: nuking all hosts in net %x...\n", + net ? net->ns.inum : 0); for_each_host(host, chain, nlm_server_hosts) { if (net && host->net != net) continue; @@ -618,9 +621,8 @@ nlm_shutdown_hosts_net(struct net *net) /* Then, perform a garbage collection pass */ nlm_gc_hosts(net); - mutex_unlock(&nlm_host_mutex); - nlm_complain_hosts(net); + mutex_unlock(&nlm_host_mutex); } /* @@ -646,7 +648,8 @@ nlm_gc_hosts(struct net *net) struct hlist_node *next; struct nlm_host *host; - dprintk("lockd: host garbage collection for net %p\n", net); + dprintk("lockd: host garbage collection for net %x\n", + net ? net->ns.inum : 0); for_each_host(host, chain, nlm_server_hosts) { if (net && host->net != net) continue; @@ -662,9 +665,10 @@ nlm_gc_hosts(struct net *net) if (atomic_read(&host->h_count) || host->h_inuse || time_before(jiffies, host->h_expires)) { dprintk("nlm_gc_hosts skipping %s " - "(cnt %d use %d exp %ld net %p)\n", + "(cnt %d use %d exp %ld net %x)\n", host->h_name, atomic_read(&host->h_count), - host->h_inuse, host->h_expires, host->net); + host->h_inuse, host->h_expires, + host->net->ns.inum); continue; } nlm_destroy_host_locked(host); diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 9fbbd11f9ecb..96cfb2967ac7 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -110,7 +110,8 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res, clnt = nsm_create(host->net, host->nodename); if (IS_ERR(clnt)) { dprintk("lockd: failed to create NSM upcall transport, " - "status=%ld, net=%p\n", PTR_ERR(clnt), host->net); + "status=%ld, net=%x\n", PTR_ERR(clnt), + host->net->ns.inum); return PTR_ERR(clnt); } diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index b837fb7e290a..9c36d614bf89 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -57,6 +57,9 @@ static struct task_struct *nlmsvc_task; static struct svc_rqst *nlmsvc_rqst; unsigned long nlmsvc_timeout; +atomic_t nlm_ntf_refcnt = ATOMIC_INIT(0); +DECLARE_WAIT_QUEUE_HEAD(nlm_ntf_wq); + unsigned int lockd_net_id; /* @@ -259,7 +262,7 @@ static int lockd_up_net(struct svc_serv *serv, struct net *net) if (error < 0) goto err_bind; set_grace_period(net); - dprintk("lockd_up_net: per-net data created; net=%p\n", net); + dprintk("%s: per-net data created; net=%x\n", __func__, net->ns.inum); return 0; err_bind: @@ -274,12 +277,15 @@ static void lockd_down_net(struct svc_serv *serv, struct net *net) if (ln->nlmsvc_users) { if (--ln->nlmsvc_users == 0) { nlm_shutdown_hosts_net(net); + cancel_delayed_work_sync(&ln->grace_period_end); + locks_end_grace(&ln->lockd_manager); svc_shutdown_net(serv, net); - dprintk("lockd_down_net: per-net data destroyed; net=%p\n", net); + dprintk("%s: per-net data destroyed; net=%x\n", + __func__, net->ns.inum); } } else { - printk(KERN_ERR "lockd_down_net: no users! task=%p, net=%p\n", - nlmsvc_task, net); + pr_err("%s: no users! task=%p, net=%x\n", + __func__, nlmsvc_task, net->ns.inum); BUG(); } } @@ -290,7 +296,8 @@ static int lockd_inetaddr_event(struct notifier_block *this, struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; struct sockaddr_in sin; - if (event != NETDEV_DOWN) + if ((event != NETDEV_DOWN) || + !atomic_inc_not_zero(&nlm_ntf_refcnt)) goto out; if (nlmsvc_rqst) { @@ -301,6 +308,8 @@ static int lockd_inetaddr_event(struct notifier_block *this, svc_age_temp_xprts_now(nlmsvc_rqst->rq_server, (struct sockaddr *)&sin); } + atomic_dec(&nlm_ntf_refcnt); + wake_up(&nlm_ntf_wq); out: return NOTIFY_DONE; @@ -317,7 +326,8 @@ static int lockd_inet6addr_event(struct notifier_block *this, struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr; struct sockaddr_in6 sin6; - if (event != NETDEV_DOWN) + if ((event != NETDEV_DOWN) || + !atomic_inc_not_zero(&nlm_ntf_refcnt)) goto out; if (nlmsvc_rqst) { @@ -329,6 +339,8 @@ static int lockd_inet6addr_event(struct notifier_block *this, svc_age_temp_xprts_now(nlmsvc_rqst->rq_server, (struct sockaddr *)&sin6); } + atomic_dec(&nlm_ntf_refcnt); + wake_up(&nlm_ntf_wq); out: return NOTIFY_DONE; @@ -345,10 +357,12 @@ static void lockd_unregister_notifiers(void) #if IS_ENABLED(CONFIG_IPV6) unregister_inet6addr_notifier(&lockd_inet6addr_notifier); #endif + wait_event(nlm_ntf_wq, atomic_read(&nlm_ntf_refcnt) == 0); } static void lockd_svc_exit_thread(void) { + atomic_dec(&nlm_ntf_refcnt); lockd_unregister_notifiers(); svc_exit_thread(nlmsvc_rqst); } @@ -369,9 +383,11 @@ static int lockd_start_svc(struct svc_serv *serv) printk(KERN_WARNING "lockd_up: svc_rqst allocation failed, error=%d\n", error); + lockd_unregister_notifiers(); goto out_rqst; } + atomic_inc(&nlm_ntf_refcnt); svc_sock_update_bufs(serv); serv->sv_maxconn = nlm_max_connections; @@ -459,13 +475,16 @@ int lockd_up(struct net *net) } error = lockd_up_net(serv, net); - if (error < 0) - goto err_net; + if (error < 0) { + lockd_unregister_notifiers(); + goto err_put; + } error = lockd_start_svc(serv); - if (error < 0) - goto err_start; - + if (error < 0) { + lockd_down_net(serv, net); + goto err_put; + } nlmsvc_users++; /* * Note: svc_serv structures have an initial use count of 1, @@ -476,12 +495,6 @@ err_put: err_create: mutex_unlock(&nlmsvc_mutex); return error; - -err_start: - lockd_down_net(serv, net); -err_net: - lockd_unregister_notifiers(); - goto err_put; } EXPORT_SYMBOL_GPL(lockd_up); @@ -678,6 +691,17 @@ static int lockd_init_net(struct net *net) static void lockd_exit_net(struct net *net) { + struct lockd_net *ln = net_generic(net, lockd_net_id); + + WARN_ONCE(!list_empty(&ln->lockd_manager.list), + "net %x %s: lockd_manager.list is not empty\n", + net->ns.inum, __func__); + WARN_ONCE(!list_empty(&ln->nsm_handles), + "net %x %s: nsm_handles list is not empty\n", + net->ns.inum, __func__); + WARN_ONCE(delayed_work_pending(&ln->grace_period_end), + "net %x %s: grace_period_end was not cancelled\n", + net->ns.inum, __func__); } static struct pernet_operations lockd_net_ops = { diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index a563ddbc19e6..4ec3d6e03e76 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -370,7 +370,7 @@ nlmsvc_mark_resources(struct net *net) { struct nlm_host hint; - dprintk("lockd: nlmsvc_mark_resources for net %p\n", net); + dprintk("lockd: %s for net %x\n", __func__, net ? net->ns.inum : 0); hint.net = net; nlm_traverse_files(&hint, nlmsvc_mark_host, NULL); } diff --git a/fs/locks.c b/fs/locks.c index 1bd71c4d663a..21b4dfa289ee 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -141,7 +141,7 @@ static inline bool is_remote_lock(struct file *filp) { - return likely(!(filp->f_path.dentry->d_sb->s_flags & MS_NOREMOTELOCK)); + return likely(!(filp->f_path.dentry->d_sb->s_flags & SB_NOREMOTELOCK)); } static bool lease_breaking(struct file_lock *fl) diff --git a/fs/mbcache.c b/fs/mbcache.c index d818fd236787..b8b8b9ced9f8 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -269,6 +269,9 @@ static unsigned long mb_cache_count(struct shrinker *shrink, struct mb_cache *cache = container_of(shrink, struct mb_cache, c_shrink); + /* Unlikely, but not impossible */ + if (unlikely(cache->c_entry_count < 0)) + return 0; return cache->c_entry_count; } diff --git a/fs/minix/inode.c b/fs/minix/inode.c index b6829d679643..72e308c3e66b 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -125,9 +125,9 @@ static int minix_remount (struct super_block * sb, int * flags, char * data) sync_filesystem(sb); ms = sbi->s_ms; - if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb)) + if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) return 0; - if (*flags & MS_RDONLY) { + if (*flags & SB_RDONLY) { if (ms->s_state & MINIX_VALID_FS || !(sbi->s_mount_state & MINIX_VALID_FS)) return 0; diff --git a/fs/namei.c b/fs/namei.c index 5424b10cfdc4..9cc91fb7f156 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1129,18 +1129,9 @@ static int follow_automount(struct path *path, struct nameidata *nd, * of the daemon to instantiate them before they can be used. */ if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | - LOOKUP_OPEN | LOOKUP_CREATE | - LOOKUP_AUTOMOUNT))) { - /* Positive dentry that isn't meant to trigger an - * automount, EISDIR will allow it to be used, - * otherwise there's no mount here "now" so return - * ENOENT. - */ - if (path->dentry->d_inode) - return -EISDIR; - else - return -ENOENT; - } + LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) && + path->dentry->d_inode) + return -EISDIR; if (path->dentry->d_sb->s_user_ns != &init_user_ns) return -EACCES; @@ -3459,7 +3450,7 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags, goto out; child = vfs_tmpfile(path.dentry, op->mode, op->open_flag); error = PTR_ERR(child); - if (unlikely(IS_ERR(child))) + if (IS_ERR(child)) goto out2; dput(path.dentry); path.dentry = child; @@ -4010,10 +4001,9 @@ EXPORT_SYMBOL(vfs_unlink); * writeout happening, and we don't want to prevent access to the directory * while waiting on the I/O. */ -static long do_unlinkat(int dfd, const char __user *pathname) +long do_unlinkat(int dfd, struct filename *name) { int error; - struct filename *name; struct dentry *dentry; struct path path; struct qstr last; @@ -4022,8 +4012,7 @@ static long do_unlinkat(int dfd, const char __user *pathname) struct inode *delegated_inode = NULL; unsigned int lookup_flags = 0; retry: - name = filename_parentat(dfd, getname(pathname), lookup_flags, - &path, &last, &type); + name = filename_parentat(dfd, name, lookup_flags, &path, &last, &type); if (IS_ERR(name)) return PTR_ERR(name); @@ -4065,12 +4054,12 @@ exit2: mnt_drop_write(path.mnt); exit1: path_put(&path); - putname(name); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; inode = NULL; goto retry; } + putname(name); return error; slashes: @@ -4091,12 +4080,12 @@ SYSCALL_DEFINE3(unlinkat, int, dfd, const char __user *, pathname, int, flag) if (flag & AT_REMOVEDIR) return do_rmdir(dfd, pathname); - return do_unlinkat(dfd, pathname); + return do_unlinkat(dfd, getname(pathname)); } SYSCALL_DEFINE1(unlink, const char __user *, pathname) { - return do_unlinkat(AT_FDCWD, pathname); + return do_unlinkat(AT_FDCWD, getname(pathname)); } int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 129f1937fa2c..41de88cdc053 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -103,7 +103,7 @@ static void destroy_inodecache(void) static int ncp_remount(struct super_block *sb, int *flags, char* data) { sync_filesystem(sb); - *flags |= MS_NODIRATIME; + *flags |= SB_NODIRATIME; return 0; } @@ -547,7 +547,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) else default_bufsize = 1024; - sb->s_flags |= MS_NODIRATIME; /* probably even noatime */ + sb->s_flags |= SB_NODIRATIME; /* probably even noatime */ sb->s_maxbytes = 0xFFFFFFFFU; sb->s_blocksize = 1024; /* Eh... Is this correct? */ sb->s_blocksize_bits = 10; diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c index b60627bcfc62..ef6729568432 100644 --- a/fs/nfs/cache_lib.c +++ b/fs/nfs/cache_lib.c @@ -67,7 +67,7 @@ out: */ void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq) { - if (atomic_dec_and_test(&dreq->count)) + if (refcount_dec_and_test(&dreq->count)) kfree(dreq); } @@ -87,7 +87,7 @@ static struct cache_deferred_req *nfs_dns_cache_defer(struct cache_req *req) dreq = container_of(req, struct nfs_cache_defer_req, req); dreq->deferred_req.revisit = nfs_dns_cache_revisit; - atomic_inc(&dreq->count); + refcount_inc(&dreq->count); return &dreq->deferred_req; } @@ -99,7 +99,7 @@ struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void) dreq = kzalloc(sizeof(*dreq), GFP_KERNEL); if (dreq) { init_completion(&dreq->completion); - atomic_set(&dreq->count, 1); + refcount_set(&dreq->count, 1); dreq->req.defer = nfs_dns_cache_defer; } return dreq; diff --git a/fs/nfs/cache_lib.h b/fs/nfs/cache_lib.h index 4e6236a86cf7..220ee409abc4 100644 --- a/fs/nfs/cache_lib.h +++ b/fs/nfs/cache_lib.h @@ -16,7 +16,7 @@ struct nfs_cache_defer_req { struct cache_req req; struct cache_deferred_req deferred_req; struct completion completion; - atomic_t count; + refcount_t count; }; extern int nfs_cache_upcall(struct cache_detail *cd, char *entry_name); diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index cd9d992feb2e..509dc5adeb8f 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -49,15 +49,15 @@ static int nfs4_callback_up_net(struct svc_serv *serv, struct net *net) if (ret <= 0) goto out_err; nn->nfs_callback_tcpport = ret; - dprintk("NFS: Callback listener port = %u (af %u, net %p)\n", - nn->nfs_callback_tcpport, PF_INET, net); + dprintk("NFS: Callback listener port = %u (af %u, net %x)\n", + nn->nfs_callback_tcpport, PF_INET, net->ns.inum); ret = svc_create_xprt(serv, "tcp", net, PF_INET6, nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS); if (ret > 0) { nn->nfs_callback_tcpport6 = ret; - dprintk("NFS: Callback listener port = %u (af %u, net %p)\n", - nn->nfs_callback_tcpport6, PF_INET6, net); + dprintk("NFS: Callback listener port = %u (af %u, net %x\n", + nn->nfs_callback_tcpport6, PF_INET6, net->ns.inum); } else if (ret != -EAFNOSUPPORT) goto out_err; return 0; @@ -185,7 +185,7 @@ static void nfs_callback_down_net(u32 minorversion, struct svc_serv *serv, struc if (--nn->cb_users[minorversion]) return; - dprintk("NFS: destroy per-net callback data; net=%p\n", net); + dprintk("NFS: destroy per-net callback data; net=%x\n", net->ns.inum); svc_shutdown_net(serv, net); } @@ -198,7 +198,7 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, if (nn->cb_users[minorversion]++) return 0; - dprintk("NFS: create per-net callback data; net=%p\n", net); + dprintk("NFS: create per-net callback data; net=%x\n", net->ns.inum); ret = svc_bind(serv, net); if (ret < 0) { @@ -223,7 +223,7 @@ err_socks: err_bind: nn->cb_users[minorversion]--; dprintk("NFS: Couldn't create callback socket: err = %d; " - "net = %p\n", ret, net); + "net = %x\n", ret, net->ns.inum); return ret; } diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 19151f6c0e97..2435af56b87e 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -440,7 +440,7 @@ static bool referring_call_exists(struct nfs_client *clp, uint32_t nrclists, struct referring_call_list *rclists) { - bool status = 0; + bool status = false; int i, j; struct nfs4_session *session; struct nfs4_slot_table *tbl; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 22880ef6d8dd..0ac2fb1c6b63 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -163,7 +163,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) clp->rpc_ops = clp->cl_nfs_mod->rpc_ops; - atomic_set(&clp->cl_count, 1); + refcount_set(&clp->cl_count, 1); clp->cl_cons_state = NFS_CS_INITING; memcpy(&clp->cl_addr, cl_init->addr, cl_init->addrlen); @@ -269,7 +269,7 @@ void nfs_put_client(struct nfs_client *clp) nn = net_generic(clp->cl_net, nfs_net_id); - if (atomic_dec_and_lock(&clp->cl_count, &nn->nfs_client_lock)) { + if (refcount_dec_and_lock(&clp->cl_count, &nn->nfs_client_lock)) { list_del(&clp->cl_share_link); nfs_cb_idr_remove_locked(clp); spin_unlock(&nn->nfs_client_lock); @@ -314,7 +314,7 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat sap)) continue; - atomic_inc(&clp->cl_count); + refcount_inc(&clp->cl_count); return clp; } return NULL; @@ -1006,7 +1006,7 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source, /* Copy data from the source */ server->nfs_client = source->nfs_client; server->destroy = source->destroy; - atomic_inc(&server->nfs_client->cl_count); + refcount_inc(&server->nfs_client->cl_count); nfs_server_copy_userdata(server, source); server->fsid = fattr->fsid; @@ -1166,7 +1166,7 @@ static int nfs_server_list_show(struct seq_file *m, void *v) clp->rpc_ops->version, rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT), - atomic_read(&clp->cl_count), + refcount_read(&clp->cl_count), clp->cl_hostname); rcu_read_unlock(); diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 606dd3871f66..ade44ca0c66c 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -1041,6 +1041,33 @@ int nfs_delegations_present(struct nfs_client *clp) } /** + * nfs4_refresh_delegation_stateid - Update delegation stateid seqid + * @dst: stateid to refresh + * @inode: inode to check + * + * Returns "true" and updates "dst->seqid" * if inode had a delegation + * that matches our delegation stateid. Otherwise "false" is returned. + */ +bool nfs4_refresh_delegation_stateid(nfs4_stateid *dst, struct inode *inode) +{ + struct nfs_delegation *delegation; + bool ret = false; + if (!inode) + goto out; + + rcu_read_lock(); + delegation = rcu_dereference(NFS_I(inode)->delegation); + if (delegation != NULL && + nfs4_stateid_match_other(dst, &delegation->stateid)) { + dst->seqid = delegation->stateid.seqid; + return ret; + } + rcu_read_unlock(); +out: + return ret; +} + +/** * nfs4_copy_delegation_stateid - Copy inode's state ID information * @inode: inode to check * @flags: delegation type requirement diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index ddaf2644cf13..185a09f37a89 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -62,6 +62,7 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4 int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid, fmode_t type); int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid); bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags, nfs4_stateid *dst, struct rpc_cred **cred); +bool nfs4_refresh_delegation_stateid(nfs4_stateid *dst, struct inode *inode); void nfs_mark_delegation_referenced(struct nfs_delegation *delegation); int nfs4_have_delegation(struct inode *inode, fmode_t flags); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index f439f1c45008..2f3f86726f5b 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -118,13 +118,6 @@ nfs_opendir(struct inode *inode, struct file *filp) goto out; } filp->private_data = ctx; - if (filp->f_path.dentry == filp->f_path.mnt->mnt_root) { - /* This is a mountpoint, so d_revalidate will never - * have been called, so we need to refresh the - * inode (for close-open consistency) ourselves. - */ - __nfs_revalidate_inode(NFS_SERVER(inode), inode); - } out: put_rpccred(cred); return res; @@ -253,7 +246,7 @@ int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descri desc->cache_entry_index = index; return 0; out_eof: - desc->eof = 1; + desc->eof = true; return -EBADCOOKIE; } @@ -307,7 +300,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des if (array->eof_index >= 0) { status = -EBADCOOKIE; if (*desc->dir_cookie == array->last_cookie) - desc->eof = 1; + desc->eof = true; } out: return status; @@ -761,7 +754,7 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc) ent = &array->array[i]; if (!dir_emit(desc->ctx, ent->string.name, ent->string.len, nfs_compat_user_ino64(ent->ino), ent->d_type)) { - desc->eof = 1; + desc->eof = true; break; } desc->ctx->pos++; @@ -773,7 +766,7 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc) ctx->duped = 1; } if (array->eof_index >= 0) - desc->eof = 1; + desc->eof = true; kunmap(desc->page); cache_page_release(desc); @@ -873,7 +866,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) if (res == -EBADCOOKIE) { res = 0; /* This means either end of directory */ - if (*desc->dir_cookie && desc->eof == 0) { + if (*desc->dir_cookie && !desc->eof) { /* Or that the server has 'lost' a cookie */ res = uncached_readdir(desc); if (res == 0) @@ -1241,8 +1234,7 @@ static int nfs_weak_revalidate(struct dentry *dentry, unsigned int flags) return 0; } - if (nfs_mapping_need_revalidate_inode(inode)) - error = __nfs_revalidate_inode(NFS_SERVER(inode), inode); + error = nfs_lookup_verify_inode(inode, flags); dfprintk(LOOKUPCACHE, "NFS: %s: inode %lu is %s\n", __func__, inode->i_ino, error ? "invalid" : "valid"); return !error; @@ -1264,7 +1256,7 @@ static int nfs_dentry_delete(const struct dentry *dentry) /* Unhash it, so that ->d_iput() would be called */ return 1; } - if (!(dentry->d_sb->s_flags & MS_ACTIVE)) { + if (!(dentry->d_sb->s_flags & SB_ACTIVE)) { /* Unhash it, so that ancestors of killed async unlink * files will be cleaned up during umount */ return 1; @@ -1393,6 +1385,7 @@ static int nfs4_lookup_revalidate(struct dentry *, unsigned int); const struct dentry_operations nfs4_dentry_operations = { .d_revalidate = nfs4_lookup_revalidate, + .d_weak_revalidate = nfs_weak_revalidate, .d_delete = nfs_dentry_delete, .d_iput = nfs_dentry_iput, .d_automount = nfs_d_automount, @@ -2064,7 +2057,7 @@ out: * should mark the directories for revalidation. */ d_move(old_dentry, new_dentry); - nfs_set_verifier(new_dentry, + nfs_set_verifier(old_dentry, nfs_save_change_attribute(new_dir)); } else if (error == -ENOENT) nfs_dentry_handle_enoent(old_dentry); @@ -2369,15 +2362,15 @@ void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set) } EXPORT_SYMBOL_GPL(nfs_access_add_cache); -#define NFS_MAY_READ (NFS4_ACCESS_READ) -#define NFS_MAY_WRITE (NFS4_ACCESS_MODIFY | \ - NFS4_ACCESS_EXTEND | \ - NFS4_ACCESS_DELETE) -#define NFS_FILE_MAY_WRITE (NFS4_ACCESS_MODIFY | \ - NFS4_ACCESS_EXTEND) +#define NFS_MAY_READ (NFS_ACCESS_READ) +#define NFS_MAY_WRITE (NFS_ACCESS_MODIFY | \ + NFS_ACCESS_EXTEND | \ + NFS_ACCESS_DELETE) +#define NFS_FILE_MAY_WRITE (NFS_ACCESS_MODIFY | \ + NFS_ACCESS_EXTEND) #define NFS_DIR_MAY_WRITE NFS_MAY_WRITE -#define NFS_MAY_LOOKUP (NFS4_ACCESS_LOOKUP) -#define NFS_MAY_EXECUTE (NFS4_ACCESS_EXECUTE) +#define NFS_MAY_LOOKUP (NFS_ACCESS_LOOKUP) +#define NFS_MAY_EXECUTE (NFS_ACCESS_EXECUTE) static int nfs_access_calc_mask(u32 access_result, umode_t umode) { @@ -2425,9 +2418,14 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) if (!may_block) goto out; - /* Be clever: ask server to check for all possible rights */ - cache.mask = NFS_MAY_LOOKUP | NFS_MAY_EXECUTE - | NFS_MAY_WRITE | NFS_MAY_READ; + /* + * Determine which access bits we want to ask for... + */ + cache.mask = NFS_ACCESS_READ | NFS_ACCESS_MODIFY | NFS_ACCESS_EXTEND; + if (S_ISDIR(inode->i_mode)) + cache.mask |= NFS_ACCESS_DELETE | NFS_ACCESS_LOOKUP; + else + cache.mask |= NFS_ACCESS_EXECUTE; cache.cred = cred; status = NFS_PROTO(inode)->access(inode, &cache); if (status != 0) { diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 0214dd1e1060..81cca49a8375 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -829,23 +829,9 @@ int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK) is_local = 1; - /* - * VFS doesn't require the open mode to match a flock() lock's type. - * NFS, however, may simulate flock() locking with posix locking which - * requires the open mode to match the lock type. - */ - switch (fl->fl_type) { - case F_UNLCK: + /* We're simulating flock() locks using posix locks on the server */ + if (fl->fl_type == F_UNLCK) return do_unlk(filp, cmd, fl, is_local); - case F_RDLCK: - if (!(filp->f_mode & FMODE_READ)) - return -EBADF; - break; - case F_WRLCK: - if (!(filp->f_mode & FMODE_WRITE)) - return -EBADF; - } - return do_setlk(filp, cmd, fl, is_local); } EXPORT_SYMBOL_GPL(nfs_flock); diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 508126eb49f9..4e54d8b5413a 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -471,10 +471,10 @@ filelayout_read_pagelist(struct nfs_pgio_header *hdr) return PNFS_NOT_ATTEMPTED; dprintk("%s USE DS: %s cl_count %d\n", __func__, - ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count)); + ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count)); /* No multipath support. Use first DS */ - atomic_inc(&ds->ds_clp->cl_count); + refcount_inc(&ds->ds_clp->cl_count); hdr->ds_clp = ds->ds_clp; hdr->ds_commit_idx = idx; fh = nfs4_fl_select_ds_fh(lseg, j); @@ -515,10 +515,10 @@ filelayout_write_pagelist(struct nfs_pgio_header *hdr, int sync) dprintk("%s ino %lu sync %d req %zu@%llu DS: %s cl_count %d\n", __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count, - offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count)); + offset, ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count)); hdr->pgio_done_cb = filelayout_write_done_cb; - atomic_inc(&ds->ds_clp->cl_count); + refcount_inc(&ds->ds_clp->cl_count); hdr->ds_clp = ds->ds_clp; hdr->ds_commit_idx = idx; fh = nfs4_fl_select_ds_fh(lseg, j); @@ -1064,9 +1064,9 @@ static int filelayout_initiate_commit(struct nfs_commit_data *data, int how) goto out_err; dprintk("%s ino %lu, how %d cl_count %d\n", __func__, - data->inode->i_ino, how, atomic_read(&ds->ds_clp->cl_count)); + data->inode->i_ino, how, refcount_read(&ds->ds_clp->cl_count)); data->commit_done_cb = filelayout_commit_done_cb; - atomic_inc(&ds->ds_clp->cl_count); + refcount_inc(&ds->ds_clp->cl_count); data->ds_clp = ds->ds_clp; fh = select_ds_fh_from_commit(lseg, data->ds_commit_index); if (fh) diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index b0fa83a60754..c75ad982bcfc 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -187,7 +187,7 @@ ff_layout_add_mirror(struct pnfs_layout_hdr *lo, continue; if (!ff_mirror_match_fh(mirror, pos)) continue; - if (atomic_inc_not_zero(&pos->ref)) { + if (refcount_inc_not_zero(&pos->ref)) { spin_unlock(&inode->i_lock); return pos; } @@ -218,7 +218,7 @@ static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags) mirror = kzalloc(sizeof(*mirror), gfp_flags); if (mirror != NULL) { spin_lock_init(&mirror->lock); - atomic_set(&mirror->ref, 1); + refcount_set(&mirror->ref, 1); INIT_LIST_HEAD(&mirror->mirrors); } return mirror; @@ -242,7 +242,7 @@ static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror) static void ff_layout_put_mirror(struct nfs4_ff_layout_mirror *mirror) { - if (mirror != NULL && atomic_dec_and_test(&mirror->ref)) + if (mirror != NULL && refcount_dec_and_test(&mirror->ref)) ff_layout_free_mirror(mirror); } @@ -1726,10 +1726,10 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) vers = nfs4_ff_layout_ds_version(lseg, idx); dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__, - ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count), vers); + ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count), vers); hdr->pgio_done_cb = ff_layout_read_done_cb; - atomic_inc(&ds->ds_clp->cl_count); + refcount_inc(&ds->ds_clp->cl_count); hdr->ds_clp = ds->ds_clp; fh = nfs4_ff_layout_select_ds_fh(lseg, idx); if (fh) @@ -1785,11 +1785,11 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) dprintk("%s ino %lu sync %d req %zu@%llu DS: %s cl_count %d vers %d\n", __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count, - offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count), + offset, ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count), vers); hdr->pgio_done_cb = ff_layout_write_done_cb; - atomic_inc(&ds->ds_clp->cl_count); + refcount_inc(&ds->ds_clp->cl_count); hdr->ds_clp = ds->ds_clp; hdr->ds_commit_idx = idx; fh = nfs4_ff_layout_select_ds_fh(lseg, idx); @@ -1863,11 +1863,11 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) vers = nfs4_ff_layout_ds_version(lseg, idx); dprintk("%s ino %lu, how %d cl_count %d vers %d\n", __func__, - data->inode->i_ino, how, atomic_read(&ds->ds_clp->cl_count), + data->inode->i_ino, how, refcount_read(&ds->ds_clp->cl_count), vers); data->commit_done_cb = ff_layout_commit_done_cb; data->cred = ds_cred; - atomic_inc(&ds->ds_clp->cl_count); + refcount_inc(&ds->ds_clp->cl_count); data->ds_clp = ds->ds_clp; fh = select_ds_fh_from_commit(lseg, data->ds_commit_index); if (fh) @@ -2286,7 +2286,7 @@ ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo, if (!test_and_clear_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags)) continue; /* mirror refcount put in cleanup_layoutstats */ - if (!atomic_inc_not_zero(&mirror->ref)) + if (!refcount_inc_not_zero(&mirror->ref)) continue; dev = &mirror->mirror_ds->id_node; memcpy(&devinfo->dev_id, &dev->deviceid, NFS4_DEVICEID4_SIZE); diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h index 679cb087ef3f..411798346e48 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.h +++ b/fs/nfs/flexfilelayout/flexfilelayout.h @@ -14,6 +14,7 @@ #define FF_FLAGS_NO_IO_THRU_MDS 2 #define FF_FLAGS_NO_READ_IO 4 +#include <linux/refcount.h> #include "../pnfs.h" /* XXX: Let's filter out insanely large mirror count for now to avoid oom @@ -82,7 +83,7 @@ struct nfs4_ff_layout_mirror { nfs4_stateid stateid; struct rpc_cred __rcu *ro_cred; struct rpc_cred __rcu *rw_cred; - atomic_t ref; + refcount_t ref; spinlock_t lock; unsigned long flags; struct nfs4_ff_layoutstat read_stat; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 1629056aa2c9..b992d2382ffa 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -752,7 +752,7 @@ int nfs_getattr(const struct path *path, struct kstat *stat, * Note that we only have to check the vfsmount flags here: * - NFS always sets S_NOATIME by so checking it would give a * bogus result - * - NFS never sets MS_NOATIME or MS_NODIRATIME so there is + * - NFS never sets SB_NOATIME or SB_NODIRATIME so there is * no point in checking those. */ if ((path->mnt->mnt_flags & MNT_NOATIME) || @@ -783,7 +783,7 @@ EXPORT_SYMBOL_GPL(nfs_getattr); static void nfs_init_lock_context(struct nfs_lock_context *l_ctx) { - atomic_set(&l_ctx->count, 1); + refcount_set(&l_ctx->count, 1); l_ctx->lockowner = current->files; INIT_LIST_HEAD(&l_ctx->list); atomic_set(&l_ctx->io_count, 0); @@ -797,7 +797,7 @@ static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context do { if (pos->lockowner != current->files) continue; - atomic_inc(&pos->count); + refcount_inc(&pos->count); return pos; } while ((pos = list_entry(pos->list.next, typeof(*pos), list)) != head); return NULL; @@ -836,7 +836,7 @@ void nfs_put_lock_context(struct nfs_lock_context *l_ctx) struct nfs_open_context *ctx = l_ctx->open_context; struct inode *inode = d_inode(ctx->dentry); - if (!atomic_dec_and_lock(&l_ctx->count, &inode->i_lock)) + if (!refcount_dec_and_lock(&l_ctx->count, &inode->i_lock)) return; list_del(&l_ctx->list); spin_unlock(&inode->i_lock); @@ -913,7 +913,7 @@ EXPORT_SYMBOL_GPL(alloc_nfs_open_context); struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx) { if (ctx != NULL) - atomic_inc(&ctx->lock_context.count); + refcount_inc(&ctx->lock_context.count); return ctx; } EXPORT_SYMBOL_GPL(get_nfs_open_context); @@ -924,11 +924,11 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) struct super_block *sb = ctx->dentry->d_sb; if (!list_empty(&ctx->list)) { - if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock)) + if (!refcount_dec_and_lock(&ctx->lock_context.count, &inode->i_lock)) return; list_del(&ctx->list); spin_unlock(&inode->i_lock); - } else if (!atomic_dec_and_test(&ctx->lock_context.count)) + } else if (!refcount_dec_and_test(&ctx->lock_context.count)) return; if (inode != NULL) NFS_PROTO(inode)->close_context(ctx, is_sync); @@ -2084,8 +2084,12 @@ static int nfs_net_init(struct net *net) static void nfs_net_exit(struct net *net) { + struct nfs_net *nn = net_generic(net, nfs_net_id); + nfs_fs_proc_net_exit(net); nfs_cleanup_cb_ident_idr(net); + WARN_ON_ONCE(!list_empty(&nn->nfs_client_list)); + WARN_ON_ONCE(!list_empty(&nn->nfs_volume_list)); } static struct pernet_operations nfs_net_ops = { diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 5ab17fd4700a..8357ff69962f 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -10,7 +10,7 @@ #include <linux/nfs_page.h> #include <linux/wait_bit.h> -#define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS) +#define NFS_MS_MASK (SB_RDONLY|SB_NOSUID|SB_NODEV|SB_NOEXEC|SB_SYNCHRONOUS) extern const struct export_operations nfs_export_ops; diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index bc673fb47fb3..49f848fd1f04 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -188,6 +188,7 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry) { struct nfs3_accessargs arg = { .fh = NFS_FH(inode), + .access = entry->mask, }; struct nfs3_accessres res; struct rpc_message msg = { @@ -196,25 +197,9 @@ static int nfs3_proc_access(struct inode *inode, struct nfs_access_entry *entry) .rpc_resp = &res, .rpc_cred = entry->cred, }; - int mode = entry->mask; int status = -ENOMEM; dprintk("NFS call access\n"); - - if (mode & MAY_READ) - arg.access |= NFS3_ACCESS_READ; - if (S_ISDIR(inode->i_mode)) { - if (mode & MAY_WRITE) - arg.access |= NFS3_ACCESS_MODIFY | NFS3_ACCESS_EXTEND | NFS3_ACCESS_DELETE; - if (mode & MAY_EXEC) - arg.access |= NFS3_ACCESS_LOOKUP; - } else { - if (mode & MAY_WRITE) - arg.access |= NFS3_ACCESS_MODIFY | NFS3_ACCESS_EXTEND; - if (mode & MAY_EXEC) - arg.access |= NFS3_ACCESS_EXECUTE; - } - res.fattr = nfs_alloc_fattr(); if (res.fattr == NULL) goto out; diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index dcfcf7fd7438..b374f680830c 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -145,7 +145,7 @@ struct nfs4_lock_state { unsigned long ls_flags; struct nfs_seqid_counter ls_seqid; nfs4_stateid ls_stateid; - atomic_t ls_count; + refcount_t ls_count; fl_owner_t ls_owner; }; @@ -162,6 +162,7 @@ enum { NFS_STATE_POSIX_LOCKS, /* Posix locks are supported */ NFS_STATE_RECOVERY_FAILED, /* OPEN stateid state recovery failed */ NFS_STATE_MAY_NOTIFY_LOCK, /* server may CB_NOTIFY_LOCK */ + NFS_STATE_CHANGE_WAIT, /* A state changing operation is outstanding */ }; struct nfs4_state { @@ -185,6 +186,8 @@ struct nfs4_state { unsigned int n_rdwr; /* Number of read/write references */ fmode_t state; /* State on the server (R,W, or RW) */ atomic_t count; + + wait_queue_head_t waitq; }; @@ -458,6 +461,10 @@ extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); extern int nfs4_select_rw_stateid(struct nfs4_state *, fmode_t, const struct nfs_lock_context *, nfs4_stateid *, struct rpc_cred **); +extern bool nfs4_refresh_open_stateid(nfs4_stateid *dst, + struct nfs4_state *state); +extern bool nfs4_copy_open_stateid(nfs4_stateid *dst, + struct nfs4_state *state); extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask); extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task); @@ -465,7 +472,7 @@ extern void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid); extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid); extern void nfs_release_seqid(struct nfs_seqid *seqid); extern void nfs_free_seqid(struct nfs_seqid *seqid); -extern int nfs4_setup_sequence(const struct nfs_client *client, +extern int nfs4_setup_sequence(struct nfs_client *client, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, struct rpc_task *task); @@ -475,6 +482,7 @@ extern int nfs4_sequence_done(struct rpc_task *task, extern void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp); extern const nfs4_stateid zero_stateid; +extern const nfs4_stateid invalid_stateid; /* nfs4super.c */ struct nfs_mount_info; diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index e9bea90dc017..12bbab0becb4 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -483,7 +483,7 @@ static int nfs4_match_client(struct nfs_client *pos, struct nfs_client *new, * ID and serverowner fields. Wait for CREATE_SESSION * to finish. */ if (pos->cl_cons_state > NFS_CS_READY) { - atomic_inc(&pos->cl_count); + refcount_inc(&pos->cl_count); spin_unlock(&nn->nfs_client_lock); nfs_put_client(*prev); @@ -559,7 +559,7 @@ int nfs40_walk_client_list(struct nfs_client *new, * way that a SETCLIENTID_CONFIRM to pos can succeed is * if new and pos point to the same server: */ - atomic_inc(&pos->cl_count); + refcount_inc(&pos->cl_count); spin_unlock(&nn->nfs_client_lock); nfs_put_client(prev); @@ -715,7 +715,7 @@ int nfs41_walk_client_list(struct nfs_client *new, continue; found: - atomic_inc(&pos->cl_count); + refcount_inc(&pos->cl_count); *result = pos; status = 0; break; @@ -749,7 +749,7 @@ nfs4_find_client_ident(struct net *net, int cb_ident) spin_lock(&nn->nfs_client_lock); clp = idr_find(&nn->cb_ident_idr, cb_ident); if (clp) - atomic_inc(&clp->cl_count); + refcount_inc(&clp->cl_count); spin_unlock(&nn->nfs_client_lock); return clp; } @@ -793,7 +793,7 @@ nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr, spin_lock(&nn->nfs_client_lock); list_for_each_entry(clp, &nn->nfs_client_list, cl_share_link) { - if (nfs4_cb_match_client(addr, clp, minorversion) == false) + if (!nfs4_cb_match_client(addr, clp, minorversion)) continue; if (!nfs4_has_session(clp)) @@ -804,7 +804,7 @@ nfs4_find_client_sessionid(struct net *net, const struct sockaddr *addr, sid->data, NFS4_MAX_SESSIONID_LEN) != 0) continue; - atomic_inc(&clp->cl_count); + refcount_inc(&clp->cl_count); spin_unlock(&nn->nfs_client_lock); return clp; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index f90090e8c959..56fa5a16e097 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -96,6 +96,10 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, struct nfs_open_context *ctx, struct nfs4_label *ilabel, struct nfs4_label *olabel); #ifdef CONFIG_NFS_V4_1 +static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, + struct rpc_cred *cred, + struct nfs4_slot *slot, + bool is_privileged); static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *, struct rpc_cred *); static int nfs41_free_stateid(struct nfs_server *, const nfs4_stateid *, @@ -254,15 +258,12 @@ const u32 nfs4_fsinfo_bitmap[3] = { FATTR4_WORD0_MAXFILESIZE }; const u32 nfs4_fs_locations_bitmap[3] = { - FATTR4_WORD0_TYPE - | FATTR4_WORD0_CHANGE + FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE | FATTR4_WORD0_FSID | FATTR4_WORD0_FILEID | FATTR4_WORD0_FS_LOCATIONS, - FATTR4_WORD1_MODE - | FATTR4_WORD1_NUMLINKS - | FATTR4_WORD1_OWNER + FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP | FATTR4_WORD1_RAWDEV | FATTR4_WORD1_SPACE_USED @@ -644,13 +645,14 @@ static int nfs40_sequence_done(struct rpc_task *task, #if defined(CONFIG_NFS_V4_1) -static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) +static void nfs41_release_slot(struct nfs4_slot *slot) { struct nfs4_session *session; struct nfs4_slot_table *tbl; - struct nfs4_slot *slot = res->sr_slot; bool send_new_highest_used_slotid = false; + if (!slot) + return; tbl = slot->table; session = tbl->session; @@ -676,13 +678,18 @@ static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) send_new_highest_used_slotid = false; out_unlock: spin_unlock(&tbl->slot_tbl_lock); - res->sr_slot = NULL; if (send_new_highest_used_slotid) nfs41_notify_server(session->clp); if (waitqueue_active(&tbl->slot_waitq)) wake_up_all(&tbl->slot_waitq); } +static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) +{ + nfs41_release_slot(res->sr_slot); + res->sr_slot = NULL; +} + static int nfs41_sequence_process(struct rpc_task *task, struct nfs4_sequence_res *res) { @@ -710,13 +717,6 @@ static int nfs41_sequence_process(struct rpc_task *task, /* Check the SEQUENCE operation status */ switch (res->sr_status) { case 0: - /* If previous op on slot was interrupted and we reused - * the seq# and got a reply from the cache, then retry - */ - if (task->tk_status == -EREMOTEIO && interrupted) { - ++slot->seq_nr; - goto retry_nowait; - } /* Update the slot's sequence and clientid lease timer */ slot->seq_done = 1; clp = session->clp; @@ -750,16 +750,16 @@ static int nfs41_sequence_process(struct rpc_task *task, * The slot id we used was probably retired. Try again * using a different slot id. */ + if (slot->seq_nr < slot->table->target_highest_slotid) + goto session_recover; goto retry_nowait; case -NFS4ERR_SEQ_MISORDERED: /* * Was the last operation on this sequence interrupted? * If so, retry after bumping the sequence number. */ - if (interrupted) { - ++slot->seq_nr; - goto retry_nowait; - } + if (interrupted) + goto retry_new_seq; /* * Could this slot have been previously retired? * If so, then the server may be expecting seq_nr = 1! @@ -768,10 +768,11 @@ static int nfs41_sequence_process(struct rpc_task *task, slot->seq_nr = 1; goto retry_nowait; } - break; + goto session_recover; case -NFS4ERR_SEQ_FALSE_RETRY: - ++slot->seq_nr; - goto retry_nowait; + if (interrupted) + goto retry_new_seq; + goto session_recover; default: /* Just update the slot sequence no. */ slot->seq_done = 1; @@ -781,6 +782,11 @@ out: dprintk("%s: Error %d free the slot \n", __func__, res->sr_status); out_noaction: return ret; +session_recover: + nfs4_schedule_session_recovery(session, res->sr_status); + goto retry_nowait; +retry_new_seq: + ++slot->seq_nr; retry_nowait: if (rpc_restart_call_prepare(task)) { nfs41_sequence_free_slot(res); @@ -857,6 +863,17 @@ static const struct rpc_call_ops nfs41_call_sync_ops = { .rpc_call_done = nfs41_call_sync_done, }; +static void +nfs4_sequence_process_interrupted(struct nfs_client *client, + struct nfs4_slot *slot, struct rpc_cred *cred) +{ + struct rpc_task *task; + + task = _nfs41_proc_sequence(client, cred, slot, true); + if (!IS_ERR(task)) + rpc_put_task_async(task); +} + #else /* !CONFIG_NFS_V4_1 */ static int nfs4_sequence_process(struct rpc_task *task, struct nfs4_sequence_res *res) @@ -877,9 +894,34 @@ int nfs4_sequence_done(struct rpc_task *task, } EXPORT_SYMBOL_GPL(nfs4_sequence_done); +static void +nfs4_sequence_process_interrupted(struct nfs_client *client, + struct nfs4_slot *slot, struct rpc_cred *cred) +{ + WARN_ON_ONCE(1); + slot->interrupted = 0; +} + #endif /* !CONFIG_NFS_V4_1 */ -int nfs4_setup_sequence(const struct nfs_client *client, +static +void nfs4_sequence_attach_slot(struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + struct nfs4_slot *slot) +{ + if (!slot) + return; + slot->privileged = args->sa_privileged ? 1 : 0; + args->sa_slot = slot; + + res->sr_slot = slot; + res->sr_timestamp = jiffies; + res->sr_status_flags = 0; + res->sr_status = 1; + +} + +int nfs4_setup_sequence(struct nfs_client *client, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, struct rpc_task *task) @@ -897,29 +939,28 @@ int nfs4_setup_sequence(const struct nfs_client *client, task->tk_timeout = 0; } - spin_lock(&tbl->slot_tbl_lock); - /* The state manager will wait until the slot table is empty */ - if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged) - goto out_sleep; + for (;;) { + spin_lock(&tbl->slot_tbl_lock); + /* The state manager will wait until the slot table is empty */ + if (nfs4_slot_tbl_draining(tbl) && !args->sa_privileged) + goto out_sleep; + + slot = nfs4_alloc_slot(tbl); + if (IS_ERR(slot)) { + /* Try again in 1/4 second */ + if (slot == ERR_PTR(-ENOMEM)) + task->tk_timeout = HZ >> 2; + goto out_sleep; + } + spin_unlock(&tbl->slot_tbl_lock); - slot = nfs4_alloc_slot(tbl); - if (IS_ERR(slot)) { - /* Try again in 1/4 second */ - if (slot == ERR_PTR(-ENOMEM)) - task->tk_timeout = HZ >> 2; - goto out_sleep; + if (likely(!slot->interrupted)) + break; + nfs4_sequence_process_interrupted(client, + slot, task->tk_msg.rpc_cred); } - spin_unlock(&tbl->slot_tbl_lock); - - slot->privileged = args->sa_privileged ? 1 : 0; - args->sa_slot = slot; - res->sr_slot = slot; - if (session) { - res->sr_timestamp = jiffies; - res->sr_status_flags = 0; - res->sr_status = 1; - } + nfs4_sequence_attach_slot(args, res, slot); trace_nfs4_setup_sequence(session, args); out_start: @@ -1044,6 +1085,12 @@ struct nfs4_opendata { int rpc_status; }; +struct nfs4_open_createattrs { + struct nfs4_label *label; + struct iattr *sattr; + const __u32 verf[2]; +}; + static bool nfs4_clear_cap_atomic_open_v1(struct nfs_server *server, int err, struct nfs4_exception *exception) { @@ -1113,8 +1160,7 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p) static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, struct nfs4_state_owner *sp, fmode_t fmode, int flags, - const struct iattr *attrs, - struct nfs4_label *label, + const struct nfs4_open_createattrs *c, enum open_claim_type4 claim, gfp_t gfp_mask) { @@ -1122,6 +1168,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, struct inode *dir = d_inode(parent); struct nfs_server *server = NFS_SERVER(dir); struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t); + struct nfs4_label *label = (c != NULL) ? c->label : NULL; struct nfs4_opendata *p; p = kzalloc(sizeof(*p), gfp_mask); @@ -1187,15 +1234,11 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, case NFS4_OPEN_CLAIM_DELEG_PREV_FH: p->o_arg.fh = NFS_FH(d_inode(dentry)); } - if (attrs != NULL && attrs->ia_valid != 0) { - __u32 verf[2]; - + if (c != NULL && c->sattr != NULL && c->sattr->ia_valid != 0) { p->o_arg.u.attrs = &p->attrs; - memcpy(&p->attrs, attrs, sizeof(p->attrs)); + memcpy(&p->attrs, c->sattr, sizeof(p->attrs)); - verf[0] = jiffies; - verf[1] = current->pid; - memcpy(p->o_arg.u.verifier.data, verf, + memcpy(p->o_arg.u.verifier.data, c->verf, sizeof(p->o_arg.u.verifier.data)); } p->c_arg.fh = &p->o_res.fh; @@ -1334,6 +1377,25 @@ static bool nfs_open_stateid_recover_openmode(struct nfs4_state *state) } #endif /* CONFIG_NFS_V4_1 */ +static void nfs_state_log_update_open_stateid(struct nfs4_state *state) +{ + if (test_and_clear_bit(NFS_STATE_CHANGE_WAIT, &state->flags)) + wake_up_all(&state->waitq); +} + +static void nfs_state_log_out_of_order_open_stateid(struct nfs4_state *state, + const nfs4_stateid *stateid) +{ + u32 state_seqid = be32_to_cpu(state->open_stateid.seqid); + u32 stateid_seqid = be32_to_cpu(stateid->seqid); + + if (stateid_seqid == state_seqid + 1U || + (stateid_seqid == 1U && state_seqid == 0xffffffffU)) + nfs_state_log_update_open_stateid(state); + else + set_bit(NFS_STATE_CHANGE_WAIT, &state->flags); +} + static void nfs_test_and_clear_all_open_stateid(struct nfs4_state *state) { struct nfs_client *clp = state->owner->so_server->nfs_client; @@ -1349,18 +1411,32 @@ static void nfs_test_and_clear_all_open_stateid(struct nfs4_state *state) nfs4_state_mark_reclaim_nograce(clp, state); } +/* + * Check for whether or not the caller may update the open stateid + * to the value passed in by stateid. + * + * Note: This function relies heavily on the server implementing + * RFC7530 Section 9.1.4.2, and RFC5661 Section 8.2.2 + * correctly. + * i.e. The stateid seqids have to be initialised to 1, and + * are then incremented on every state transition. + */ static bool nfs_need_update_open_stateid(struct nfs4_state *state, - const nfs4_stateid *stateid, nfs4_stateid *freeme) + const nfs4_stateid *stateid) { - if (test_and_set_bit(NFS_OPEN_STATE, &state->flags) == 0) - return true; - if (!nfs4_stateid_match_other(stateid, &state->open_stateid)) { - nfs4_stateid_copy(freeme, &state->open_stateid); - nfs_test_and_clear_all_open_stateid(state); + if (test_bit(NFS_OPEN_STATE, &state->flags) == 0 || + !nfs4_stateid_match_other(stateid, &state->open_stateid)) { + if (stateid->seqid == cpu_to_be32(1)) + nfs_state_log_update_open_stateid(state); + else + set_bit(NFS_STATE_CHANGE_WAIT, &state->flags); return true; } - if (nfs4_stateid_is_newer(stateid, &state->open_stateid)) + + if (nfs4_stateid_is_newer(stateid, &state->open_stateid)) { + nfs_state_log_out_of_order_open_stateid(state, stateid); return true; + } return false; } @@ -1399,11 +1475,14 @@ static void nfs_clear_open_stateid_locked(struct nfs4_state *state, if (nfs4_stateid_match_other(stateid, &state->open_stateid) && !nfs4_stateid_is_newer(stateid, &state->open_stateid)) { nfs_resync_open_stateid_locked(state); - return; + goto out; } if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) nfs4_stateid_copy(&state->stateid, stateid); nfs4_stateid_copy(&state->open_stateid, stateid); + trace_nfs4_open_stateid_update(state->inode, stateid, 0); +out: + nfs_state_log_update_open_stateid(state); } static void nfs_clear_open_stateid(struct nfs4_state *state, @@ -1420,29 +1499,60 @@ static void nfs_clear_open_stateid(struct nfs4_state *state, } static void nfs_set_open_stateid_locked(struct nfs4_state *state, - const nfs4_stateid *stateid, fmode_t fmode, - nfs4_stateid *freeme) + const nfs4_stateid *stateid, nfs4_stateid *freeme) { - switch (fmode) { - case FMODE_READ: - set_bit(NFS_O_RDONLY_STATE, &state->flags); + DEFINE_WAIT(wait); + int status = 0; + for (;;) { + + if (!nfs_need_update_open_stateid(state, stateid)) + return; + if (!test_bit(NFS_STATE_CHANGE_WAIT, &state->flags)) break; - case FMODE_WRITE: - set_bit(NFS_O_WRONLY_STATE, &state->flags); + if (status) break; - case FMODE_READ|FMODE_WRITE: - set_bit(NFS_O_RDWR_STATE, &state->flags); + /* Rely on seqids for serialisation with NFSv4.0 */ + if (!nfs4_has_session(NFS_SERVER(state->inode)->nfs_client)) + break; + + prepare_to_wait(&state->waitq, &wait, TASK_KILLABLE); + /* + * Ensure we process the state changes in the same order + * in which the server processed them by delaying the + * update of the stateid until we are in sequence. + */ + write_sequnlock(&state->seqlock); + spin_unlock(&state->owner->so_lock); + rcu_read_unlock(); + trace_nfs4_open_stateid_update_wait(state->inode, stateid, 0); + if (!signal_pending(current)) { + if (schedule_timeout(5*HZ) == 0) + status = -EAGAIN; + else + status = 0; + } else + status = -EINTR; + finish_wait(&state->waitq, &wait); + rcu_read_lock(); + spin_lock(&state->owner->so_lock); + write_seqlock(&state->seqlock); } - if (!nfs_need_update_open_stateid(state, stateid, freeme)) - return; + + if (test_bit(NFS_OPEN_STATE, &state->flags) && + !nfs4_stateid_match_other(stateid, &state->open_stateid)) { + nfs4_stateid_copy(freeme, &state->open_stateid); + nfs_test_and_clear_all_open_stateid(state); + } + if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) nfs4_stateid_copy(&state->stateid, stateid); nfs4_stateid_copy(&state->open_stateid, stateid); + trace_nfs4_open_stateid_update(state->inode, stateid, status); + nfs_state_log_update_open_stateid(state); } -static void __update_open_stateid(struct nfs4_state *state, +static void nfs_state_set_open_stateid(struct nfs4_state *state, const nfs4_stateid *open_stateid, - const nfs4_stateid *deleg_stateid, fmode_t fmode, nfs4_stateid *freeme) { @@ -1450,17 +1560,34 @@ static void __update_open_stateid(struct nfs4_state *state, * Protect the call to nfs4_state_set_mode_locked and * serialise the stateid update */ - spin_lock(&state->owner->so_lock); write_seqlock(&state->seqlock); - if (deleg_stateid != NULL) { - nfs4_stateid_copy(&state->stateid, deleg_stateid); - set_bit(NFS_DELEGATED_STATE, &state->flags); + nfs_set_open_stateid_locked(state, open_stateid, freeme); + switch (fmode) { + case FMODE_READ: + set_bit(NFS_O_RDONLY_STATE, &state->flags); + break; + case FMODE_WRITE: + set_bit(NFS_O_WRONLY_STATE, &state->flags); + break; + case FMODE_READ|FMODE_WRITE: + set_bit(NFS_O_RDWR_STATE, &state->flags); } - if (open_stateid != NULL) - nfs_set_open_stateid_locked(state, open_stateid, fmode, freeme); + set_bit(NFS_OPEN_STATE, &state->flags); + write_sequnlock(&state->seqlock); +} + +static void nfs_state_set_delegation(struct nfs4_state *state, + const nfs4_stateid *deleg_stateid, + fmode_t fmode) +{ + /* + * Protect the call to nfs4_state_set_mode_locked and + * serialise the stateid update + */ + write_seqlock(&state->seqlock); + nfs4_stateid_copy(&state->stateid, deleg_stateid); + set_bit(NFS_DELEGATED_STATE, &state->flags); write_sequnlock(&state->seqlock); - update_open_stateflags(state, fmode); - spin_unlock(&state->owner->so_lock); } static int update_open_stateid(struct nfs4_state *state, @@ -1478,6 +1605,12 @@ static int update_open_stateid(struct nfs4_state *state, fmode &= (FMODE_READ|FMODE_WRITE); rcu_read_lock(); + spin_lock(&state->owner->so_lock); + if (open_stateid != NULL) { + nfs_state_set_open_stateid(state, open_stateid, fmode, &freeme); + ret = 1; + } + deleg_cur = rcu_dereference(nfsi->delegation); if (deleg_cur == NULL) goto no_delegation; @@ -1494,18 +1627,16 @@ static int update_open_stateid(struct nfs4_state *state, goto no_delegation_unlock; nfs_mark_delegation_referenced(deleg_cur); - __update_open_stateid(state, open_stateid, &deleg_cur->stateid, - fmode, &freeme); + nfs_state_set_delegation(state, &deleg_cur->stateid, fmode); ret = 1; no_delegation_unlock: spin_unlock(&deleg_cur->lock); no_delegation: + if (ret) + update_open_stateflags(state, fmode); + spin_unlock(&state->owner->so_lock); rcu_read_unlock(); - if (!ret && open_stateid != NULL) { - __update_open_stateid(state, open_stateid, NULL, fmode, &freeme); - ret = 1; - } if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags)) nfs4_schedule_state_manager(clp); if (freeme.type != 0) @@ -1761,7 +1892,7 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context struct nfs4_opendata *opendata; opendata = nfs4_opendata_alloc(ctx->dentry, state->owner, 0, 0, - NULL, NULL, claim, GFP_NOFS); + NULL, claim, GFP_NOFS); if (opendata == NULL) return ERR_PTR(-ENOMEM); opendata->state = state; @@ -2518,7 +2649,7 @@ static int nfs41_check_expired_locks(struct nfs4_state *state) if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) { struct rpc_cred *cred = lsp->ls_state->owner->so_cred; - atomic_inc(&lsp->ls_count); + refcount_inc(&lsp->ls_count); spin_unlock(&state->state_lock); nfs4_put_lock_state(prev); @@ -2692,8 +2823,7 @@ out: static int _nfs4_do_open(struct inode *dir, struct nfs_open_context *ctx, int flags, - struct iattr *sattr, - struct nfs4_label *label, + const struct nfs4_open_createattrs *c, int *opened) { struct nfs4_state_owner *sp; @@ -2705,6 +2835,8 @@ static int _nfs4_do_open(struct inode *dir, struct nfs4_threshold **ctx_th = &ctx->mdsthreshold; fmode_t fmode = ctx->mode & (FMODE_READ|FMODE_WRITE|FMODE_EXEC); enum open_claim_type4 claim = NFS4_OPEN_CLAIM_NULL; + struct iattr *sattr = c->sattr; + struct nfs4_label *label = c->label; struct nfs4_label *olabel = NULL; int status; @@ -2723,8 +2855,8 @@ static int _nfs4_do_open(struct inode *dir, status = -ENOMEM; if (d_really_is_positive(dentry)) claim = NFS4_OPEN_CLAIM_FH; - opendata = nfs4_opendata_alloc(dentry, sp, fmode, flags, sattr, - label, claim, GFP_KERNEL); + opendata = nfs4_opendata_alloc(dentry, sp, fmode, flags, + c, claim, GFP_KERNEL); if (opendata == NULL) goto err_put_state_owner; @@ -2805,10 +2937,18 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, struct nfs_server *server = NFS_SERVER(dir); struct nfs4_exception exception = { }; struct nfs4_state *res; + struct nfs4_open_createattrs c = { + .label = label, + .sattr = sattr, + .verf = { + [0] = (__u32)jiffies, + [1] = (__u32)current->pid, + }, + }; int status; do { - status = _nfs4_do_open(dir, ctx, flags, sattr, label, opened); + status = _nfs4_do_open(dir, ctx, flags, &c, opened); res = ctx->state; trace_nfs4_open_file(ctx, flags, status); if (status == 0) @@ -3024,18 +3164,20 @@ static void nfs4_close_done(struct rpc_task *task, void *data) calldata->arg.lr_args = NULL; calldata->res.lr_res = NULL; break; + case -NFS4ERR_OLD_STATEID: + if (nfs4_refresh_layout_stateid(&calldata->arg.lr_args->stateid, + calldata->inode)) + goto lr_restart; + /* Fallthrough */ case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_EXPIRED: case -NFS4ERR_BAD_STATEID: - case -NFS4ERR_OLD_STATEID: case -NFS4ERR_UNKNOWN_LAYOUTTYPE: case -NFS4ERR_WRONG_CRED: calldata->arg.lr_args = NULL; calldata->res.lr_res = NULL; - calldata->res.lr_ret = 0; - rpc_restart_call_prepare(task); - return; + goto lr_restart; } } @@ -3051,39 +3193,43 @@ static void nfs4_close_done(struct rpc_task *task, void *data) if (calldata->arg.bitmask != NULL) { calldata->arg.bitmask = NULL; calldata->res.fattr = NULL; - task->tk_status = 0; - rpc_restart_call_prepare(task); - goto out_release; + goto out_restart; } break; + case -NFS4ERR_OLD_STATEID: + /* Did we race with OPEN? */ + if (nfs4_refresh_open_stateid(&calldata->arg.stateid, + state)) + goto out_restart; + goto out_release; case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_EXPIRED: nfs4_free_revoked_stateid(server, &calldata->arg.stateid, task->tk_msg.rpc_cred); - case -NFS4ERR_OLD_STATEID: + /* Fallthrough */ case -NFS4ERR_BAD_STATEID: - if (!nfs4_stateid_match(&calldata->arg.stateid, - &state->open_stateid)) { - rpc_restart_call_prepare(task); - goto out_release; - } - if (calldata->arg.fmode == 0) - break; + break; default: - if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN) { - rpc_restart_call_prepare(task); - goto out_release; - } + if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN) + goto out_restart; } nfs_clear_open_stateid(state, &calldata->arg.stateid, res_stateid, calldata->arg.fmode); out_release: + task->tk_status = 0; nfs_release_seqid(calldata->arg.seqid); nfs_refresh_inode(calldata->inode, &calldata->fattr); dprintk("%s: done, ret = %d!\n", __func__, task->tk_status); + return; +lr_restart: + calldata->res.lr_ret = 0; +out_restart: + task->tk_status = 0; + rpc_restart_call_prepare(task); + goto out_release; } static void nfs4_close_prepare(struct rpc_task *task, void *data) @@ -3103,7 +3249,6 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) is_rdwr = test_bit(NFS_O_RDWR_STATE, &state->flags); is_rdonly = test_bit(NFS_O_RDONLY_STATE, &state->flags); is_wronly = test_bit(NFS_O_WRONLY_STATE, &state->flags); - nfs4_stateid_copy(&calldata->arg.stateid, &state->open_stateid); /* Calculate the change in open mode */ calldata->arg.fmode = 0; if (state->n_rdwr == 0) { @@ -3121,7 +3266,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) calldata->arg.fmode |= FMODE_READ|FMODE_WRITE; if (!nfs4_valid_open_stateid(state) || - test_bit(NFS_OPEN_STATE, &state->flags) == 0) + !nfs4_refresh_open_stateid(&calldata->arg.stateid, state)) call_close = 0; spin_unlock(&state->owner->so_lock); @@ -3215,6 +3360,8 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait) calldata->inode = state->inode; calldata->state = state; calldata->arg.fh = NFS_FH(state->inode); + if (!nfs4_copy_open_stateid(&calldata->arg.stateid, state)) + goto out_free_calldata; /* Serialization for the sequence id */ alloc_seqid = server->nfs_client->cl_mvops->alloc_seqid; calldata->arg.seqid = alloc_seqid(&state->owner->so_seqid, gfp_mask); @@ -3889,6 +4036,7 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry struct nfs4_accessargs args = { .fh = NFS_FH(inode), .bitmask = server->cache_consistency_bitmask, + .access = entry->mask, }; struct nfs4_accessres res = { .server = server, @@ -3899,26 +4047,8 @@ static int _nfs4_proc_access(struct inode *inode, struct nfs_access_entry *entry .rpc_resp = &res, .rpc_cred = entry->cred, }; - int mode = entry->mask; int status = 0; - /* - * Determine which access bits we want to ask for... - */ - if (mode & MAY_READ) - args.access |= NFS4_ACCESS_READ; - if (S_ISDIR(inode->i_mode)) { - if (mode & MAY_WRITE) - args.access |= NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND | NFS4_ACCESS_DELETE; - if (mode & MAY_EXEC) - args.access |= NFS4_ACCESS_LOOKUP; - } else { - if (mode & MAY_WRITE) - args.access |= NFS4_ACCESS_MODIFY | NFS4_ACCESS_EXTEND; - if (mode & MAY_EXEC) - args.access |= NFS4_ACCESS_EXECUTE; - } - res.fattr = nfs_alloc_fattr(); if (res.fattr == NULL) return -ENOMEM; @@ -4843,7 +4973,7 @@ static void nfs4_renew_release(void *calldata) struct nfs4_renewdata *data = calldata; struct nfs_client *clp = data->client; - if (atomic_read(&clp->cl_count) > 1) + if (refcount_read(&clp->cl_count) > 1) nfs4_schedule_state_renewal(clp); nfs_put_client(clp); kfree(data); @@ -4891,7 +5021,7 @@ static int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred, if (renew_flags == 0) return 0; - if (!atomic_inc_not_zero(&clp->cl_count)) + if (!refcount_inc_not_zero(&clp->cl_count)) return -EIO; data = kmalloc(sizeof(*data), GFP_NOFS); if (data == NULL) { @@ -5643,18 +5773,20 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) data->args.lr_args = NULL; data->res.lr_res = NULL; break; + case -NFS4ERR_OLD_STATEID: + if (nfs4_refresh_layout_stateid(&data->args.lr_args->stateid, + data->inode)) + goto lr_restart; + /* Fallthrough */ case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_DELEG_REVOKED: case -NFS4ERR_EXPIRED: case -NFS4ERR_BAD_STATEID: - case -NFS4ERR_OLD_STATEID: case -NFS4ERR_UNKNOWN_LAYOUTTYPE: case -NFS4ERR_WRONG_CRED: data->args.lr_args = NULL; data->res.lr_res = NULL; - data->res.lr_ret = 0; - rpc_restart_call_prepare(task); - return; + goto lr_restart; } } @@ -5668,27 +5800,36 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) nfs4_free_revoked_stateid(data->res.server, data->args.stateid, task->tk_msg.rpc_cred); + /* Fallthrough */ case -NFS4ERR_BAD_STATEID: - case -NFS4ERR_OLD_STATEID: case -NFS4ERR_STALE_STATEID: task->tk_status = 0; break; + case -NFS4ERR_OLD_STATEID: + if (nfs4_refresh_delegation_stateid(&data->stateid, data->inode)) + goto out_restart; + task->tk_status = 0; + break; case -NFS4ERR_ACCESS: if (data->args.bitmask) { data->args.bitmask = NULL; data->res.fattr = NULL; - task->tk_status = 0; - rpc_restart_call_prepare(task); - return; + goto out_restart; } + /* Fallthrough */ default: if (nfs4_async_handle_error(task, data->res.server, NULL, NULL) == -EAGAIN) { - rpc_restart_call_prepare(task); - return; + goto out_restart; } } data->rpc_status = task->tk_status; + return; +lr_restart: + data->res.lr_ret = 0; +out_restart: + task->tk_status = 0; + rpc_restart_call_prepare(task); } static void nfs4_delegreturn_release(void *calldata) @@ -5896,7 +6037,7 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, p->arg.seqid = seqid; p->res.seqid = seqid; p->lsp = lsp; - atomic_inc(&lsp->ls_count); + refcount_inc(&lsp->ls_count); /* Ensure we don't close file until we're done freeing locks! */ p->ctx = get_nfs_open_context(ctx); p->l_ctx = nfs_get_lock_context(ctx); @@ -6112,7 +6253,7 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl, p->res.lock_seqid = p->arg.lock_seqid; p->lsp = lsp; p->server = server; - atomic_inc(&lsp->ls_count); + refcount_inc(&lsp->ls_count); p->ctx = get_nfs_open_context(ctx); memcpy(&p->fl, fl, sizeof(p->fl)); return p; @@ -6568,6 +6709,20 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) !test_bit(NFS_STATE_POSIX_LOCKS, &state->flags)) return -ENOLCK; + /* + * Don't rely on the VFS having checked the file open mode, + * since it won't do this for flock() locks. + */ + switch (request->fl_type) { + case F_RDLCK: + if (!(filp->f_mode & FMODE_READ)) + return -EBADF; + break; + case F_WRLCK: + if (!(filp->f_mode & FMODE_WRITE)) + return -EBADF; + } + status = nfs4_set_lock_state(state, request); if (status != 0) return status; @@ -6763,9 +6918,7 @@ static int _nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir, struct page *page) { struct nfs_server *server = NFS_SERVER(dir); - u32 bitmask[3] = { - [0] = FATTR4_WORD0_FSID | FATTR4_WORD0_FS_LOCATIONS, - }; + u32 bitmask[3]; struct nfs4_fs_locations_arg args = { .dir_fh = NFS_FH(dir), .name = name, @@ -6784,12 +6937,15 @@ static int _nfs4_proc_fs_locations(struct rpc_clnt *client, struct inode *dir, dprintk("%s: start\n", __func__); + bitmask[0] = nfs4_fattr_bitmap[0] | FATTR4_WORD0_FS_LOCATIONS; + bitmask[1] = nfs4_fattr_bitmap[1]; + /* Ask for the fileid of the absent filesystem if mounted_on_fileid * is not supported */ if (NFS_SERVER(dir)->attr_bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID) - bitmask[1] |= FATTR4_WORD1_MOUNTED_ON_FILEID; + bitmask[0] &= ~FATTR4_WORD0_FILEID; else - bitmask[0] |= FATTR4_WORD0_FILEID; + bitmask[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; nfs_fattr_init(&fs_locations->fattr); fs_locations->server = server; @@ -7472,7 +7628,7 @@ nfs4_run_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, struct nfs41_exchange_id_data *calldata; int status; - if (!atomic_inc_not_zero(&clp->cl_count)) + if (!refcount_inc_not_zero(&clp->cl_count)) return ERR_PTR(-EIO); status = -ENOMEM; @@ -8072,7 +8228,7 @@ static void nfs41_sequence_release(void *data) struct nfs4_sequence_data *calldata = data; struct nfs_client *clp = calldata->clp; - if (atomic_read(&clp->cl_count) > 1) + if (refcount_read(&clp->cl_count) > 1) nfs4_schedule_state_renewal(clp); nfs_put_client(clp); kfree(calldata); @@ -8101,7 +8257,7 @@ static void nfs41_sequence_call_done(struct rpc_task *task, void *data) trace_nfs4_sequence(clp, task->tk_status); if (task->tk_status < 0) { dprintk("%s ERROR %d\n", __func__, task->tk_status); - if (atomic_read(&clp->cl_count) == 1) + if (refcount_read(&clp->cl_count) == 1) goto out; if (nfs41_sequence_handle_errors(task, clp) == -EAGAIN) { @@ -8135,6 +8291,7 @@ static const struct rpc_call_ops nfs41_sequence_ops = { static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred, + struct nfs4_slot *slot, bool is_privileged) { struct nfs4_sequence_data *calldata; @@ -8148,15 +8305,18 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, .callback_ops = &nfs41_sequence_ops, .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT, }; + struct rpc_task *ret; - if (!atomic_inc_not_zero(&clp->cl_count)) - return ERR_PTR(-EIO); + ret = ERR_PTR(-EIO); + if (!refcount_inc_not_zero(&clp->cl_count)) + goto out_err; + + ret = ERR_PTR(-ENOMEM); calldata = kzalloc(sizeof(*calldata), GFP_NOFS); - if (calldata == NULL) { - nfs_put_client(clp); - return ERR_PTR(-ENOMEM); - } + if (calldata == NULL) + goto out_put_clp; nfs4_init_sequence(&calldata->args, &calldata->res, 0); + nfs4_sequence_attach_slot(&calldata->args, &calldata->res, slot); if (is_privileged) nfs4_set_sequence_privileged(&calldata->args); msg.rpc_argp = &calldata->args; @@ -8164,7 +8324,15 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, calldata->clp = clp; task_setup_data.callback_data = calldata; - return rpc_run_task(&task_setup_data); + ret = rpc_run_task(&task_setup_data); + if (IS_ERR(ret)) + goto out_err; + return ret; +out_put_clp: + nfs_put_client(clp); +out_err: + nfs41_release_slot(slot); + return ret; } static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cred, unsigned renew_flags) @@ -8174,7 +8342,7 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cr if ((renew_flags & NFS4_RENEW_TIMEOUT) == 0) return -EAGAIN; - task = _nfs41_proc_sequence(clp, cred, false); + task = _nfs41_proc_sequence(clp, cred, NULL, false); if (IS_ERR(task)) ret = PTR_ERR(task); else @@ -8188,7 +8356,7 @@ static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) struct rpc_task *task; int ret; - task = _nfs41_proc_sequence(clp, cred, true); + task = _nfs41_proc_sequence(clp, cred, NULL, true); if (IS_ERR(task)) { ret = PTR_ERR(task); goto out; @@ -8588,18 +8756,27 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) server = NFS_SERVER(lrp->args.inode); switch (task->tk_status) { + case -NFS4ERR_OLD_STATEID: + if (nfs4_refresh_layout_stateid(&lrp->args.stateid, + lrp->args.inode)) + goto out_restart; + /* Fallthrough */ default: task->tk_status = 0; + /* Fallthrough */ case 0: break; case -NFS4ERR_DELAY: if (nfs4_async_handle_error(task, server, NULL, NULL) != -EAGAIN) break; - nfs4_sequence_free_slot(&lrp->res.seq_res); - rpc_restart_call_prepare(task); - return; + goto out_restart; } dprintk("<-- %s\n", __func__); + return; +out_restart: + task->tk_status = 0; + nfs4_sequence_free_slot(&lrp->res.seq_res); + rpc_restart_call_prepare(task); } static void nfs4_layoutreturn_release(void *calldata) diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 0378e2257ca7..54fd56d715a8 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -69,6 +69,14 @@ const nfs4_stateid zero_stateid = { { .data = { 0 } }, .type = NFS4_SPECIAL_STATEID_TYPE, }; +const nfs4_stateid invalid_stateid = { + { + .seqid = cpu_to_be32(0xffffffffU), + .other = { 0 }, + }, + .type = NFS4_INVALID_STATEID_TYPE, +}; + static DEFINE_MUTEX(nfs_clid_init_mutex); int nfs4_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) @@ -645,6 +653,7 @@ nfs4_alloc_open_state(void) INIT_LIST_HEAD(&state->lock_states); spin_lock_init(&state->state_lock); seqlock_init(&state->seqlock); + init_waitqueue_head(&state->waitq); return state; } @@ -825,7 +834,7 @@ __nfs4_find_lock_state(struct nfs4_state *state, ret = pos; } if (ret) - atomic_inc(&ret->ls_count); + refcount_inc(&ret->ls_count); return ret; } @@ -843,7 +852,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f if (lsp == NULL) return NULL; nfs4_init_seqid_counter(&lsp->ls_seqid); - atomic_set(&lsp->ls_count, 1); + refcount_set(&lsp->ls_count, 1); lsp->ls_state = state; lsp->ls_owner = fl_owner; lsp->ls_seqid.owner_id = ida_simple_get(&server->lockowner_id, 0, 0, GFP_NOFS); @@ -907,7 +916,7 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp) if (lsp == NULL) return; state = lsp->ls_state; - if (!atomic_dec_and_lock(&lsp->ls_count, &state->state_lock)) + if (!refcount_dec_and_lock(&lsp->ls_count, &state->state_lock)) return; list_del(&lsp->ls_locks); if (list_empty(&state->lock_states)) @@ -927,7 +936,7 @@ static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src) struct nfs4_lock_state *lsp = src->fl_u.nfs4_fl.owner; dst->fl_u.nfs4_fl.owner = lsp; - atomic_inc(&lsp->ls_count); + refcount_inc(&lsp->ls_count); } static void nfs4_fl_release_lock(struct file_lock *fl) @@ -985,18 +994,39 @@ out: return ret; } -static void nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) +bool nfs4_refresh_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) +{ + bool ret; + int seq; + + do { + ret = false; + seq = read_seqbegin(&state->seqlock); + if (nfs4_state_match_open_stateid_other(state, dst)) { + dst->seqid = state->open_stateid.seqid; + ret = true; + } + } while (read_seqretry(&state->seqlock, seq)); + return ret; +} + +bool nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) { + bool ret; const nfs4_stateid *src; int seq; do { + ret = false; src = &zero_stateid; seq = read_seqbegin(&state->seqlock); - if (test_bit(NFS_OPEN_STATE, &state->flags)) + if (test_bit(NFS_OPEN_STATE, &state->flags)) { src = &state->open_stateid; + ret = true; + } nfs4_stateid_copy(dst, src); } while (read_seqretry(&state->seqlock, seq)); + return ret; } /* @@ -1177,7 +1207,7 @@ void nfs4_schedule_state_manager(struct nfs_client *clp) if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) return; __module_get(THIS_MODULE); - atomic_inc(&clp->cl_count); + refcount_inc(&clp->cl_count); /* The rcu_read_lock() is not strictly necessary, as the state * manager is the only thread that ever changes the rpc_xprt @@ -1269,7 +1299,7 @@ int nfs4_wait_clnt_recover(struct nfs_client *clp) might_sleep(); - atomic_inc(&clp->cl_count); + refcount_inc(&clp->cl_count); res = wait_on_bit_action(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING, nfs_wait_bit_killable, TASK_KILLABLE); if (res) @@ -1409,6 +1439,11 @@ void nfs_inode_find_state_and_recover(struct inode *inode, found = true; continue; } + if (nfs4_stateid_match_other(&state->open_stateid, stateid) && + nfs4_state_mark_reclaim_nograce(clp, state)) { + found = true; + continue; + } if (nfs_state_lock_state_matches_stateid(state, stateid) && nfs4_state_mark_reclaim_nograce(clp, state)) found = true; @@ -2510,7 +2545,7 @@ static void nfs4_state_manager(struct nfs_client *clp) break; if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0) break; - } while (atomic_read(&clp->cl_count) > 1); + } while (refcount_read(&clp->cl_count) > 1); return; out_error: if (strlen(section)) diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index e7c6275519b0..a275fba93170 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -202,17 +202,13 @@ DECLARE_EVENT_CLASS(nfs4_clientid_event, TP_ARGS(clp, error), TP_STRUCT__entry( - __string(dstaddr, - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_ADDR)) + __string(dstaddr, clp->cl_hostname) __field(int, error) ), TP_fast_assign( __entry->error = error; - __assign_str(dstaddr, - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_ADDR)); + __assign_str(dstaddr, clp->cl_hostname); ), TP_printk( @@ -1066,6 +1062,8 @@ DECLARE_EVENT_CLASS(nfs4_inode_stateid_event, DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_setattr); DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_delegreturn); +DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_open_stateid_update); +DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_open_stateid_update_wait); DECLARE_EVENT_CLASS(nfs4_getattr_event, TP_PROTO( @@ -1133,9 +1131,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_callback_event, __field(dev_t, dev) __field(u32, fhandle) __field(u64, fileid) - __string(dstaddr, clp ? - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_ADDR) : "unknown") + __string(dstaddr, clp ? clp->cl_hostname : "unknown") ), TP_fast_assign( @@ -1148,9 +1144,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_callback_event, __entry->fileid = 0; __entry->dev = 0; } - __assign_str(dstaddr, clp ? - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_ADDR) : "unknown") + __assign_str(dstaddr, clp ? clp->cl_hostname : "unknown") ), TP_printk( @@ -1192,9 +1186,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_stateid_callback_event, __field(dev_t, dev) __field(u32, fhandle) __field(u64, fileid) - __string(dstaddr, clp ? - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_ADDR) : "unknown") + __string(dstaddr, clp ? clp->cl_hostname : "unknown") __field(int, stateid_seq) __field(u32, stateid_hash) ), @@ -1209,9 +1201,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_stateid_callback_event, __entry->fileid = 0; __entry->dev = 0; } - __assign_str(dstaddr, clp ? - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_ADDR) : "unknown") + __assign_str(dstaddr, clp ? clp->cl_hostname : "unknown") __entry->stateid_seq = be32_to_cpu(stateid->seqid); __entry->stateid_hash = diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 14ed9791ec9c..77c6729e57f0 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -4385,6 +4385,14 @@ static int decode_delegation_stateid(struct xdr_stream *xdr, nfs4_stateid *state return decode_stateid(xdr, stateid); } +static int decode_invalid_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + nfs4_stateid dummy; + + nfs4_stateid_copy(stateid, &invalid_stateid); + return decode_stateid(xdr, &dummy); +} + static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) { int status; @@ -4393,7 +4401,7 @@ static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) if (status != -EIO) nfs_increment_open_seqid(status, res->seqid); if (!status) - status = decode_open_stateid(xdr, &res->stateid); + status = decode_invalid_stateid(xdr, &res->stateid); return status; } @@ -6108,6 +6116,8 @@ static int decode_layoutreturn(struct xdr_stream *xdr, res->lrs_present = be32_to_cpup(p); if (res->lrs_present) status = decode_layout_stateid(xdr, &res->stateid); + else + nfs4_stateid_copy(&res->stateid, &invalid_stateid); return status; out_overflow: print_overflow_msg(__func__, xdr); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 3bcd669a3152..d602fe9e1ac8 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -251,7 +251,7 @@ EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver); void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo) { - atomic_inc(&lo->plh_refcount); + refcount_inc(&lo->plh_refcount); } static struct pnfs_layout_hdr * @@ -296,7 +296,7 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) pnfs_layoutreturn_before_put_layout_hdr(lo); - if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) { + if (refcount_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) { if (!list_empty(&lo->plh_segs)) WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n"); pnfs_detach_layout_hdr(lo); @@ -355,6 +355,24 @@ pnfs_clear_lseg_state(struct pnfs_layout_segment *lseg, } /* + * Update the seqid of a layout stateid + */ +bool nfs4_refresh_layout_stateid(nfs4_stateid *dst, struct inode *inode) +{ + struct pnfs_layout_hdr *lo; + bool ret = false; + + spin_lock(&inode->i_lock); + lo = NFS_I(inode)->layout; + if (lo && nfs4_stateid_match_other(dst, &lo->plh_stateid)) { + dst->seqid = lo->plh_stateid.seqid; + ret = true; + } + spin_unlock(&inode->i_lock); + return ret; +} + +/* * Mark a pnfs_layout_hdr and all associated layout segments as invalid * * In order to continue using the pnfs_layout_hdr, a full recovery @@ -395,14 +413,14 @@ pnfs_layout_set_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit) { lo->plh_retry_timestamp = jiffies; if (!test_and_set_bit(fail_bit, &lo->plh_flags)) - atomic_inc(&lo->plh_refcount); + refcount_inc(&lo->plh_refcount); } static void pnfs_layout_clear_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit) { if (test_and_clear_bit(fail_bit, &lo->plh_flags)) - atomic_dec(&lo->plh_refcount); + refcount_dec(&lo->plh_refcount); } static void @@ -450,7 +468,7 @@ pnfs_init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg, { INIT_LIST_HEAD(&lseg->pls_list); INIT_LIST_HEAD(&lseg->pls_lc_list); - atomic_set(&lseg->pls_refcount, 1); + refcount_set(&lseg->pls_refcount, 1); set_bit(NFS_LSEG_VALID, &lseg->pls_flags); lseg->pls_layout = lo; lseg->pls_range = *range; @@ -472,7 +490,7 @@ pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo, WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); list_del_init(&lseg->pls_list); /* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */ - atomic_dec(&lo->plh_refcount); + refcount_dec(&lo->plh_refcount); if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)) return; if (list_empty(&lo->plh_segs) && @@ -507,13 +525,13 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg) return; dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, - atomic_read(&lseg->pls_refcount), + refcount_read(&lseg->pls_refcount), test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); lo = lseg->pls_layout; inode = lo->plh_inode; - if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) { + if (refcount_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) { if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) { spin_unlock(&inode->i_lock); return; @@ -551,7 +569,7 @@ pnfs_lseg_range_contained(const struct pnfs_layout_range *l1, static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg, struct list_head *tmp_list) { - if (!atomic_dec_and_test(&lseg->pls_refcount)) + if (!refcount_dec_and_test(&lseg->pls_refcount)) return false; pnfs_layout_remove_lseg(lseg->pls_layout, lseg); list_add(&lseg->pls_list, tmp_list); @@ -570,7 +588,7 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, * outstanding io is finished. */ dprintk("%s: lseg %p ref %d\n", __func__, lseg, - atomic_read(&lseg->pls_refcount)); + refcount_read(&lseg->pls_refcount)); if (pnfs_lseg_dec_and_remove_zero(lseg, tmp_list)) rv = 1; } @@ -1451,7 +1469,7 @@ alloc_init_layout_hdr(struct inode *ino, lo = pnfs_alloc_layout_hdr(ino, gfp_flags); if (!lo) return NULL; - atomic_set(&lo->plh_refcount, 1); + refcount_set(&lo->plh_refcount, 1); INIT_LIST_HEAD(&lo->plh_layouts); INIT_LIST_HEAD(&lo->plh_segs); INIT_LIST_HEAD(&lo->plh_return_segs); @@ -1513,7 +1531,7 @@ pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range, if ((range->iomode == IOMODE_RW && ls_range->iomode != IOMODE_RW) || (range->iomode != ls_range->iomode && - strict_iomode == true) || + strict_iomode) || !pnfs_lseg_range_intersecting(ls_range, range)) return 0; @@ -1546,7 +1564,7 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo, } dprintk("%s:Return lseg %p ref %d\n", - __func__, ret, ret ? atomic_read(&ret->pls_refcount) : 0); + __func__, ret, ret ? refcount_read(&ret->pls_refcount) : 0); return ret; } diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 87f144f14d1e..8d507c361d98 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -30,6 +30,7 @@ #ifndef FS_NFS_PNFS_H #define FS_NFS_PNFS_H +#include <linux/refcount.h> #include <linux/nfs_fs.h> #include <linux/nfs_page.h> #include <linux/workqueue.h> @@ -54,7 +55,7 @@ struct nfs4_pnfs_ds { char *ds_remotestr; /* comma sep list of addrs */ struct list_head ds_addrs; struct nfs_client *ds_clp; - atomic_t ds_count; + refcount_t ds_count; unsigned long ds_state; #define NFS4DS_CONNECTING 0 /* ds is establishing connection */ }; @@ -63,7 +64,7 @@ struct pnfs_layout_segment { struct list_head pls_list; struct list_head pls_lc_list; struct pnfs_layout_range pls_range; - atomic_t pls_refcount; + refcount_t pls_refcount; u32 pls_seq; unsigned long pls_flags; struct pnfs_layout_hdr *pls_layout; @@ -179,7 +180,7 @@ struct pnfs_layoutdriver_type { }; struct pnfs_layout_hdr { - atomic_t plh_refcount; + refcount_t plh_refcount; atomic_t plh_outstanding; /* number of RPCs out */ struct list_head plh_layouts; /* other client layouts */ struct list_head plh_bulk_destroy; @@ -251,6 +252,7 @@ int pnfs_destroy_layouts_byfsid(struct nfs_client *clp, bool is_recall); int pnfs_destroy_layouts_byclid(struct nfs_client *clp, bool is_recall); +bool nfs4_refresh_layout_stateid(nfs4_stateid *dst, struct inode *inode); void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo); void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, @@ -393,7 +395,7 @@ static inline struct pnfs_layout_segment * pnfs_get_lseg(struct pnfs_layout_segment *lseg) { if (lseg) { - atomic_inc(&lseg->pls_refcount); + refcount_inc(&lseg->pls_refcount); smp_mb__after_atomic(); } return lseg; @@ -764,6 +766,11 @@ static inline void nfs4_pnfs_v3_ds_connect_unload(void) { } +static inline bool nfs4_refresh_layout_stateid(nfs4_stateid *dst, + struct inode *inode) +{ + return false; +} #endif /* CONFIG_NFS_V4_1 */ #if IS_ENABLED(CONFIG_NFS_V4_2) diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 60da59be83b6..03aaa60c7768 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -338,7 +338,7 @@ print_ds(struct nfs4_pnfs_ds *ds) " client %p\n" " cl_exchange_flags %x\n", ds->ds_remotestr, - atomic_read(&ds->ds_count), ds->ds_clp, + refcount_read(&ds->ds_count), ds->ds_clp, ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0); } @@ -451,7 +451,7 @@ static void destroy_ds(struct nfs4_pnfs_ds *ds) void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds) { - if (atomic_dec_and_lock(&ds->ds_count, + if (refcount_dec_and_lock(&ds->ds_count, &nfs4_ds_cache_lock)) { list_del_init(&ds->ds_node); spin_unlock(&nfs4_ds_cache_lock); @@ -537,7 +537,7 @@ nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags) INIT_LIST_HEAD(&ds->ds_addrs); list_splice_init(dsaddrs, &ds->ds_addrs); ds->ds_remotestr = remotestr; - atomic_set(&ds->ds_count, 1); + refcount_set(&ds->ds_count, 1); INIT_LIST_HEAD(&ds->ds_node); ds->ds_clp = NULL; list_add(&ds->ds_node, &nfs4_data_server_cache); @@ -546,10 +546,10 @@ nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags) } else { kfree(remotestr); kfree(ds); - atomic_inc(&tmp_ds->ds_count); + refcount_inc(&tmp_ds->ds_count); dprintk("%s data server %s found, inc'ed ds_count to %d\n", __func__, tmp_ds->ds_remotestr, - atomic_read(&tmp_ds->ds_count)); + refcount_read(&tmp_ds->ds_count)); ds = tmp_ds; } spin_unlock(&nfs4_ds_cache_lock); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index c9d24bae3025..29bacdc56f6a 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -813,9 +813,9 @@ int nfs_show_stats(struct seq_file *m, struct dentry *root) */ seq_printf(m, "\n\topts:\t"); seq_puts(m, sb_rdonly(root->d_sb) ? "ro" : "rw"); - seq_puts(m, root->d_sb->s_flags & MS_SYNCHRONOUS ? ",sync" : ""); - seq_puts(m, root->d_sb->s_flags & MS_NOATIME ? ",noatime" : ""); - seq_puts(m, root->d_sb->s_flags & MS_NODIRATIME ? ",nodiratime" : ""); + seq_puts(m, root->d_sb->s_flags & SB_SYNCHRONOUS ? ",sync" : ""); + seq_puts(m, root->d_sb->s_flags & SB_NOATIME ? ",noatime" : ""); + seq_puts(m, root->d_sb->s_flags & SB_NODIRATIME ? ",nodiratime" : ""); nfs_show_mount_options(m, nfss, 1); seq_printf(m, "\n\tage:\t%lu", (jiffies - nfss->mount_time) / HZ); @@ -1332,7 +1332,7 @@ static int nfs_parse_mount_options(char *raw, mnt->options |= NFS_OPTION_MIGRATION; break; case Opt_nomigration: - mnt->options &= NFS_OPTION_MIGRATION; + mnt->options &= ~NFS_OPTION_MIGRATION; break; /* @@ -1456,18 +1456,21 @@ static int nfs_parse_mount_options(char *raw, switch (token) { case Opt_xprt_udp6: protofamily = AF_INET6; + /* fall through */ case Opt_xprt_udp: mnt->flags &= ~NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; break; case Opt_xprt_tcp6: protofamily = AF_INET6; + /* fall through */ case Opt_xprt_tcp: mnt->flags |= NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; break; case Opt_xprt_rdma6: protofamily = AF_INET6; + /* fall through */ case Opt_xprt_rdma: /* vector side protocols to TCP */ mnt->flags |= NFS_MOUNT_TCP; @@ -1494,11 +1497,13 @@ static int nfs_parse_mount_options(char *raw, switch (token) { case Opt_xprt_udp6: mountfamily = AF_INET6; + /* fall through */ case Opt_xprt_udp: mnt->mount_server.protocol = XPRT_TRANSPORT_UDP; break; case Opt_xprt_tcp6: mountfamily = AF_INET6; + /* fall through */ case Opt_xprt_tcp: mnt->mount_server.protocol = XPRT_TRANSPORT_TCP; break; @@ -1988,9 +1993,9 @@ static int nfs23_validate_mount_data(void *options, args->version = NFS_DEFAULT_VERSION; switch (data->version) { case 1: - data->namlen = 0; + data->namlen = 0; /* fall through */ case 2: - data->bsize = 0; + data->bsize = 0; /* fall through */ case 3: if (data->flags & NFS_MOUNT_VER3) goto out_no_v3; @@ -1998,11 +2003,14 @@ static int nfs23_validate_mount_data(void *options, memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE); /* Turn off security negotiation */ extra_flags |= NFS_MOUNT_SECFLAVOUR; + /* fall through */ case 4: if (data->flags & NFS_MOUNT_SECFLAVOUR) goto out_no_sec; + /* fall through */ case 5: memset(data->context, 0, sizeof(data->context)); + /* fall through */ case 6: if (data->flags & NFS_MOUNT_VER3) { if (data->root.size > NFS3_FHSIZE || data->root.size == 0) @@ -2288,11 +2296,11 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) /* * noac is a special case. It implies -o sync, but that's not * necessarily reflected in the mtab options. do_remount_sb - * will clear MS_SYNCHRONOUS if -o sync wasn't specified in the + * will clear SB_SYNCHRONOUS if -o sync wasn't specified in the * remount options, so we have to explicitly reset it. */ if (data->flags & NFS_MOUNT_NOAC) - *flags |= MS_SYNCHRONOUS; + *flags |= SB_SYNCHRONOUS; /* compare new mount options with old ones */ error = nfs_compare_remount_data(nfss, data); @@ -2341,7 +2349,7 @@ void nfs_fill_super(struct super_block *sb, struct nfs_mount_info *mount_info) /* The VFS shouldn't apply the umask to mode bits. We will do * so ourselves when necessary. */ - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; sb->s_time_gran = 1; sb->s_export_op = &nfs_export_ops; } @@ -2371,7 +2379,7 @@ static void nfs_clone_super(struct super_block *sb, /* The VFS shouldn't apply the umask to mode bits. We will do * so ourselves when necessary. */ - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; } nfs_initialise_sb(sb); @@ -2592,11 +2600,11 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server, /* -o noac implies -o sync */ if (server->flags & NFS_MOUNT_NOAC) - sb_mntdata.mntflags |= MS_SYNCHRONOUS; + sb_mntdata.mntflags |= SB_SYNCHRONOUS; if (mount_info->cloned != NULL && mount_info->cloned->sb != NULL) - if (mount_info->cloned->sb->s_flags & MS_SYNCHRONOUS) - sb_mntdata.mntflags |= MS_SYNCHRONOUS; + if (mount_info->cloned->sb->s_flags & SB_SYNCHRONOUS) + sb_mntdata.mntflags |= SB_SYNCHRONOUS; /* Get a superblock - note that we may end up sharing one that already exists */ s = sget(nfs_mod->nfs_fs, compare_super, nfs_set_super, flags, &sb_mntdata); @@ -2633,7 +2641,7 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server, if (error) goto error_splat_root; - s->s_flags |= MS_ACTIVE; + s->s_flags |= SB_ACTIVE; out: return mntroot; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index babebbccae2a..5b5f464f6f2a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -487,10 +487,8 @@ try_again: } ret = nfs_page_group_lock(head); - if (ret < 0) { - nfs_unlock_and_release_request(head); - return ERR_PTR(ret); - } + if (ret < 0) + goto release_request; /* lock each request in the page group */ total_bytes = head->wb_bytes; @@ -515,8 +513,7 @@ try_again: if (ret < 0) { nfs_unroll_locks(inode, head, subreq); nfs_release_request(subreq); - nfs_unlock_and_release_request(head); - return ERR_PTR(ret); + goto release_request; } } /* @@ -532,8 +529,8 @@ try_again: nfs_page_group_unlock(head); nfs_unroll_locks(inode, head, subreq); nfs_unlock_and_release_request(subreq); - nfs_unlock_and_release_request(head); - return ERR_PTR(-EIO); + ret = -EIO; + goto release_request; } } @@ -576,6 +573,10 @@ try_again: /* still holds ref on head from nfs_page_find_head_request * and still has lock on head from lock loop */ return head; + +release_request: + nfs_unlock_and_release_request(head); + return ERR_PTR(ret); } static void nfs_write_error_remove_page(struct nfs_page *req) diff --git a/fs/nfs_common/grace.c b/fs/nfs_common/grace.c index 420d3a0ab258..5be08f02a76b 100644 --- a/fs/nfs_common/grace.c +++ b/fs/nfs_common/grace.c @@ -30,7 +30,11 @@ locks_start_grace(struct net *net, struct lock_manager *lm) struct list_head *grace_list = net_generic(net, grace_net_id); spin_lock(&grace_lock); - list_add(&lm->list, grace_list); + if (list_empty(&lm->list)) + list_add(&lm->list, grace_list); + else + WARN(1, "double list_add attempt detected in net %x %s\n", + net->ns.inum, (net == &init_net) ? "(init_net)" : ""); spin_unlock(&grace_lock); } EXPORT_SYMBOL_GPL(locks_start_grace); @@ -55,14 +59,7 @@ locks_end_grace(struct lock_manager *lm) } EXPORT_SYMBOL_GPL(locks_end_grace); -/** - * locks_in_grace - * - * Lock managers call this function to determine when it is OK for them - * to answer ordinary lock requests, and when they should accept only - * lock reclaims. - */ -int +static bool __state_in_grace(struct net *net, bool open) { struct list_head *grace_list = net_generic(net, grace_net_id); @@ -78,15 +75,22 @@ __state_in_grace(struct net *net, bool open) return false; } -int locks_in_grace(struct net *net) +/** + * locks_in_grace + * + * Lock managers call this function to determine when it is OK for them + * to answer ordinary lock requests, and when they should accept only + * lock reclaims. + */ +bool locks_in_grace(struct net *net) { - return __state_in_grace(net, 0); + return __state_in_grace(net, false); } EXPORT_SYMBOL_GPL(locks_in_grace); -int opens_in_grace(struct net *net) +bool opens_in_grace(struct net *net) { - return __state_in_grace(net, 1); + return __state_in_grace(net, true); } EXPORT_SYMBOL_GPL(opens_in_grace); @@ -104,7 +108,9 @@ grace_exit_net(struct net *net) { struct list_head *grace_list = net_generic(net, grace_net_id); - BUG_ON(!list_empty(grace_list)); + WARN_ONCE(!list_empty(grace_list), + "net %x %s: grace_list is not empty\n", + net->ns.inum, __func__); } static struct pernet_operations grace_net_ops = { diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 46b48dbbdd32..8ceb25a10ea0 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -232,7 +232,7 @@ static struct cache_head *expkey_alloc(void) return NULL; } -static struct cache_detail svc_expkey_cache_template = { +static const struct cache_detail svc_expkey_cache_template = { .owner = THIS_MODULE, .hash_size = EXPKEY_HASHMAX, .name = "nfsd.fh", @@ -748,7 +748,7 @@ static struct cache_head *svc_export_alloc(void) return NULL; } -static struct cache_detail svc_export_cache_template = { +static const struct cache_detail svc_export_cache_template = { .owner = THIS_MODULE, .hash_size = EXPORT_HASHMAX, .name = "nfsd.export", @@ -1230,7 +1230,7 @@ nfsd_export_init(struct net *net) int rv; struct nfsd_net *nn = net_generic(net, nfsd_net_id); - dprintk("nfsd: initializing export module (net: %p).\n", net); + dprintk("nfsd: initializing export module (net: %x).\n", net->ns.inum); nn->svc_export_cache = cache_create_net(&svc_export_cache_template, net); if (IS_ERR(nn->svc_export_cache)) @@ -1278,7 +1278,7 @@ nfsd_export_shutdown(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); - dprintk("nfsd: shutting down export module (net: %p).\n", net); + dprintk("nfsd: shutting down export module (net: %x).\n", net->ns.inum); cache_unregister_net(nn->svc_expkey_cache, net); cache_unregister_net(nn->svc_export_cache, net); @@ -1286,5 +1286,5 @@ nfsd_export_shutdown(struct net *net) cache_destroy_net(nn->svc_export_cache, net); svcauth_unix_purge(net); - dprintk("nfsd: export shutdown complete (net: %p).\n", net); + dprintk("nfsd: export shutdown complete (net: %x).\n", net->ns.inum); } diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c index 6dfede6d172a..84831253203d 100644 --- a/fs/nfsd/fault_inject.c +++ b/fs/nfsd/fault_inject.c @@ -12,6 +12,7 @@ #include <linux/nsproxy.h> #include <linux/sunrpc/addr.h> #include <linux/uaccess.h> +#include <linux/kernel.h> #include "state.h" #include "netns.h" @@ -126,8 +127,6 @@ static struct nfsd_fault_inject_op inject_ops[] = { }, }; -#define NUM_INJECT_OPS (sizeof(inject_ops)/sizeof(struct nfsd_fault_inject_op)) - int nfsd_fault_inject_init(void) { unsigned int i; @@ -138,7 +137,7 @@ int nfsd_fault_inject_init(void) if (!debug_dir) goto fail; - for (i = 0; i < NUM_INJECT_OPS; i++) { + for (i = 0; i < ARRAY_SIZE(inject_ops); i++) { op = &inject_ops[i]; if (!debugfs_create_file(op->file, mode, debug_dir, op, &fops_nfsd)) goto fail; diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 3714231a9d0f..36358d435cb0 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -107,7 +107,7 @@ struct nfsd_net { bool lockd_up; /* Time of server startup */ - struct timeval nfssvc_boot; + struct timespec64 nfssvc_boot; /* * Max number of connections this nfsd container will allow. Defaults @@ -119,6 +119,9 @@ struct nfsd_net { u32 clverifier_counter; struct svc_serv *nfsd_serv; + + wait_queue_head_t ntf_wq; + atomic_t ntf_refcnt; }; /* Simple check to find out if a given net was properly initialized */ diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index f38acd905441..2758480555fa 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -748,8 +748,9 @@ nfs3svc_encode_writeres(struct svc_rqst *rqstp, __be32 *p) if (resp->status == 0) { *p++ = htonl(resp->count); *p++ = htonl(resp->committed); - *p++ = htonl(nn->nfssvc_boot.tv_sec); - *p++ = htonl(nn->nfssvc_boot.tv_usec); + /* unique identifier, y2038 overflow can be ignored */ + *p++ = htonl((u32)nn->nfssvc_boot.tv_sec); + *p++ = htonl(nn->nfssvc_boot.tv_nsec); } return xdr_ressize_check(rqstp, p); } @@ -1119,8 +1120,9 @@ nfs3svc_encode_commitres(struct svc_rqst *rqstp, __be32 *p) p = encode_wcc_data(rqstp, p, &resp->fh); /* Write verifier */ if (resp->status == 0) { - *p++ = htonl(nn->nfssvc_boot.tv_sec); - *p++ = htonl(nn->nfssvc_boot.tv_usec); + /* unique identifier, y2038 overflow can be ignored */ + *p++ = htonl((u32)nn->nfssvc_boot.tv_sec); + *p++ = htonl(nn->nfssvc_boot.tv_nsec); } return xdr_ressize_check(rqstp, p); } diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index 6b9b6cca469f..a5bb76593ce7 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -178,7 +178,7 @@ static struct ent *idtoname_lookup(struct cache_detail *, struct ent *); static struct ent *idtoname_update(struct cache_detail *, struct ent *, struct ent *); -static struct cache_detail idtoname_cache_template = { +static const struct cache_detail idtoname_cache_template = { .owner = THIS_MODULE, .hash_size = ENT_HASHMAX, .name = "nfs4.idtoname", @@ -341,7 +341,7 @@ static struct ent *nametoid_update(struct cache_detail *, struct ent *, struct ent *); static int nametoid_parse(struct cache_detail *, char *, int); -static struct cache_detail nametoid_cache_template = { +static const struct cache_detail nametoid_cache_template = { .owner = THIS_MODULE, .hash_size = ENT_HASHMAX, .name = "nfs4.nametoid", diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index ea45d954e8d7..7d888369f85a 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -336,7 +336,7 @@ nfsd4_recall_file_layout(struct nfs4_layout_stateid *ls) trace_layout_recall(&ls->ls_stid.sc_stateid); - atomic_inc(&ls->ls_stid.sc_count); + refcount_inc(&ls->ls_stid.sc_count); nfsd4_run_cb(&ls->ls_recall); out_unlock: @@ -441,7 +441,7 @@ nfsd4_insert_layout(struct nfsd4_layoutget *lgp, struct nfs4_layout_stateid *ls) goto done; } - atomic_inc(&ls->ls_stid.sc_count); + refcount_inc(&ls->ls_stid.sc_count); list_add_tail(&new->lo_perstate, &ls->ls_layouts); new = NULL; done: diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 8487486ec496..008ea0b627d0 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -485,9 +485,6 @@ static __be32 nfsd4_getfh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { - if (!cstate->current_fh.fh_dentry) - return nfserr_nofilehandle; - u->getfh = &cstate->current_fh; return nfs_ok; } @@ -535,9 +532,6 @@ static __be32 nfsd4_savefh(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { - if (!cstate->current_fh.fh_dentry) - return nfserr_nofilehandle; - fh_dup2(&cstate->save_fh, &cstate->current_fh); if (HAS_STATE_ID(cstate, CURRENT_STATE_ID_FLAG)) { memcpy(&cstate->save_stateid, &cstate->current_stateid, sizeof(stateid_t)); @@ -570,10 +564,11 @@ static void gen_boot_verifier(nfs4_verifier *verifier, struct net *net) /* * This is opaque to client, so no need to byte-swap. Use - * __force to keep sparse happy + * __force to keep sparse happy. y2038 time_t overflow is + * irrelevant in this usage. */ verf[0] = (__force __be32)nn->nfssvc_boot.tv_sec; - verf[1] = (__force __be32)nn->nfssvc_boot.tv_usec; + verf[1] = (__force __be32)nn->nfssvc_boot.tv_nsec; memcpy(verifier->data, verf, sizeof(verifier->data)); } @@ -703,10 +698,8 @@ nfsd4_link(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_link *link = &u->link; - __be32 status = nfserr_nofilehandle; + __be32 status; - if (!cstate->save_fh.fh_dentry) - return status; status = nfsd_link(rqstp, &cstate->current_fh, link->li_name, link->li_namelen, &cstate->save_fh); if (!status) @@ -850,10 +843,8 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_rename *rename = &u->rename; - __be32 status = nfserr_nofilehandle; + __be32 status; - if (!cstate->save_fh.fh_dentry) - return status; if (opens_in_grace(SVC_NET(rqstp)) && !(cstate->save_fh.fh_export->ex_flags & NFSEXP_NOSUBTREECHECK)) return nfserr_grace; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 0c04f81aa63b..b29b5a185a2c 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -63,12 +63,16 @@ static const stateid_t zero_stateid = { static const stateid_t currentstateid = { .si_generation = 1, }; +static const stateid_t close_stateid = { + .si_generation = 0xffffffffU, +}; static u64 current_sessionid = 1; #define ZERO_STATEID(stateid) (!memcmp((stateid), &zero_stateid, sizeof(stateid_t))) #define ONE_STATEID(stateid) (!memcmp((stateid), &one_stateid, sizeof(stateid_t))) #define CURRENT_STATEID(stateid) (!memcmp((stateid), ¤tstateid, sizeof(stateid_t))) +#define CLOSE_STATEID(stateid) (!memcmp((stateid), &close_stateid, sizeof(stateid_t))) /* forward declarations */ static bool check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner); @@ -83,6 +87,11 @@ static void nfs4_free_ol_stateid(struct nfs4_stid *stid); */ static DEFINE_SPINLOCK(state_lock); +enum nfsd4_st_mutex_lock_subclass { + OPEN_STATEID_MUTEX = 0, + LOCK_STATEID_MUTEX = 1, +}; + /* * A waitqueue for all in-progress 4.0 CLOSE operations that are waiting for * the refcount on the open stateid to drop. @@ -359,7 +368,7 @@ put_nfs4_file(struct nfs4_file *fi) { might_lock(&state_lock); - if (atomic_dec_and_lock(&fi->fi_ref, &state_lock)) { + if (refcount_dec_and_lock(&fi->fi_ref, &state_lock)) { hlist_del_rcu(&fi->fi_hash); spin_unlock(&state_lock); WARN_ON_ONCE(!list_empty(&fi->fi_clnt_odstate)); @@ -568,7 +577,7 @@ alloc_clnt_odstate(struct nfs4_client *clp) co = kmem_cache_zalloc(odstate_slab, GFP_KERNEL); if (co) { co->co_client = clp; - atomic_set(&co->co_odcount, 1); + refcount_set(&co->co_odcount, 1); } return co; } @@ -586,7 +595,7 @@ static inline void get_clnt_odstate(struct nfs4_clnt_odstate *co) { if (co) - atomic_inc(&co->co_odcount); + refcount_inc(&co->co_odcount); } static void @@ -598,7 +607,7 @@ put_clnt_odstate(struct nfs4_clnt_odstate *co) return; fp = co->co_file; - if (atomic_dec_and_lock(&co->co_odcount, &fp->fi_lock)) { + if (refcount_dec_and_lock(&co->co_odcount, &fp->fi_lock)) { list_del(&co->co_perfile); spin_unlock(&fp->fi_lock); @@ -656,7 +665,7 @@ struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *sla stid->sc_stateid.si_opaque.so_id = new_id; stid->sc_stateid.si_opaque.so_clid = cl->cl_clientid; /* Will be incremented before return to client: */ - atomic_set(&stid->sc_count, 1); + refcount_set(&stid->sc_count, 1); spin_lock_init(&stid->sc_lock); /* @@ -813,7 +822,7 @@ nfs4_put_stid(struct nfs4_stid *s) might_lock(&clp->cl_lock); - if (!atomic_dec_and_lock(&s->sc_count, &clp->cl_lock)) { + if (!refcount_dec_and_lock(&s->sc_count, &clp->cl_lock)) { wake_up_all(&close_wq); return; } @@ -913,7 +922,7 @@ hash_delegation_locked(struct nfs4_delegation *dp, struct nfs4_file *fp) if (status) return status; ++fp->fi_delegees; - atomic_inc(&dp->dl_stid.sc_count); + refcount_inc(&dp->dl_stid.sc_count); dp->dl_stid.sc_type = NFS4_DELEG_STID; list_add(&dp->dl_perfile, &fp->fi_delegations); list_add(&dp->dl_perclnt, &clp->cl_delegations); @@ -1214,7 +1223,7 @@ static void put_ol_stateid_locked(struct nfs4_ol_stateid *stp, WARN_ON_ONCE(!list_empty(&stp->st_locks)); - if (!atomic_dec_and_test(&s->sc_count)) { + if (!refcount_dec_and_test(&s->sc_count)) { wake_up_all(&close_wq); return; } @@ -1439,8 +1448,10 @@ free_session_slots(struct nfsd4_session *ses) { int i; - for (i = 0; i < ses->se_fchannel.maxreqs; i++) + for (i = 0; i < ses->se_fchannel.maxreqs; i++) { + free_svc_cred(&ses->se_slots[i]->sl_cred); kfree(ses->se_slots[i]); + } } /* @@ -1472,6 +1483,11 @@ static u32 nfsd4_get_drc_mem(struct nfsd4_channel_attrs *ca) spin_lock(&nfsd_drc_lock); avail = min((unsigned long)NFSD_MAX_MEM_PER_SESSION, nfsd_drc_max_mem - nfsd_drc_mem_used); + /* + * Never use more than a third of the remaining memory, + * unless it's the only way to give this client a slot: + */ + avail = clamp_t(int, avail, slotsize, avail/3); num = min_t(int, num, avail / slotsize); nfsd_drc_mem_used += num * slotsize; spin_unlock(&nfsd_drc_lock); @@ -2072,7 +2088,7 @@ find_stateid_by_type(struct nfs4_client *cl, stateid_t *t, char typemask) s = find_stateid_locked(cl, t); if (s != NULL) { if (typemask & s->sc_type) - atomic_inc(&s->sc_count); + refcount_inc(&s->sc_count); else s = NULL; } @@ -2287,14 +2303,18 @@ nfsd4_store_cache_entry(struct nfsd4_compoundres *resp) dprintk("--> %s slot %p\n", __func__, slot); + slot->sl_flags |= NFSD4_SLOT_INITIALIZED; slot->sl_opcnt = resp->opcnt; slot->sl_status = resp->cstate.status; + free_svc_cred(&slot->sl_cred); + copy_cred(&slot->sl_cred, &resp->rqstp->rq_cred); - slot->sl_flags |= NFSD4_SLOT_INITIALIZED; - if (nfsd4_not_cached(resp)) { - slot->sl_datalen = 0; + if (!nfsd4_cache_this(resp)) { + slot->sl_flags &= ~NFSD4_SLOT_CACHED; return; } + slot->sl_flags |= NFSD4_SLOT_CACHED; + base = resp->cstate.data_offset; slot->sl_datalen = buf->len - base; if (read_bytes_from_xdr_buf(buf, base, slot->sl_data, slot->sl_datalen)) @@ -2321,8 +2341,16 @@ nfsd4_enc_sequence_replay(struct nfsd4_compoundargs *args, op = &args->ops[resp->opcnt - 1]; nfsd4_encode_operation(resp, op); - /* Return nfserr_retry_uncached_rep in next operation. */ - if (args->opcnt > 1 && !(slot->sl_flags & NFSD4_SLOT_CACHETHIS)) { + if (slot->sl_flags & NFSD4_SLOT_CACHED) + return op->status; + if (args->opcnt == 1) { + /* + * The original operation wasn't a solo sequence--we + * always cache those--so this retry must not match the + * original: + */ + op->status = nfserr_seq_false_retry; + } else { op = &args->ops[resp->opcnt++]; op->status = nfserr_retry_uncached_rep; nfsd4_encode_operation(resp, op); @@ -2986,6 +3014,34 @@ static bool nfsd4_request_too_big(struct svc_rqst *rqstp, return xb->len > session->se_fchannel.maxreq_sz; } +static bool replay_matches_cache(struct svc_rqst *rqstp, + struct nfsd4_sequence *seq, struct nfsd4_slot *slot) +{ + struct nfsd4_compoundargs *argp = rqstp->rq_argp; + + if ((bool)(slot->sl_flags & NFSD4_SLOT_CACHETHIS) != + (bool)seq->cachethis) + return false; + /* + * If there's an error than the reply can have fewer ops than + * the call. But if we cached a reply with *more* ops than the + * call you're sending us now, then this new call is clearly not + * really a replay of the old one: + */ + if (slot->sl_opcnt < argp->opcnt) + return false; + /* This is the only check explicitly called by spec: */ + if (!same_creds(&rqstp->rq_cred, &slot->sl_cred)) + return false; + /* + * There may be more comparisons we could actually do, but the + * spec doesn't require us to catch every case where the calls + * don't match (that would require caching the call as well as + * the reply), so we don't bother. + */ + return true; +} + __be32 nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) @@ -3045,6 +3101,9 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfserr_seq_misordered; if (!(slot->sl_flags & NFSD4_SLOT_INITIALIZED)) goto out_put_session; + status = nfserr_seq_false_retry; + if (!replay_matches_cache(rqstp, seq, slot)) + goto out_put_session; cstate->slot = slot; cstate->session = session; cstate->clp = clp; @@ -3351,7 +3410,7 @@ static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval, { lockdep_assert_held(&state_lock); - atomic_set(&fp->fi_ref, 1); + refcount_set(&fp->fi_ref, 1); spin_lock_init(&fp->fi_lock); INIT_LIST_HEAD(&fp->fi_stateids); INIT_LIST_HEAD(&fp->fi_delegations); @@ -3512,15 +3571,63 @@ nfsd4_find_existing_open(struct nfs4_file *fp, struct nfsd4_open *open) /* ignore lock owners */ if (local->st_stateowner->so_is_open_owner == 0) continue; - if (local->st_stateowner == &oo->oo_owner) { + if (local->st_stateowner != &oo->oo_owner) + continue; + if (local->st_stid.sc_type == NFS4_OPEN_STID) { ret = local; - atomic_inc(&ret->st_stid.sc_count); + refcount_inc(&ret->st_stid.sc_count); break; } } return ret; } +static __be32 +nfsd4_verify_open_stid(struct nfs4_stid *s) +{ + __be32 ret = nfs_ok; + + switch (s->sc_type) { + default: + break; + case NFS4_CLOSED_STID: + case NFS4_CLOSED_DELEG_STID: + ret = nfserr_bad_stateid; + break; + case NFS4_REVOKED_DELEG_STID: + ret = nfserr_deleg_revoked; + } + return ret; +} + +/* Lock the stateid st_mutex, and deal with races with CLOSE */ +static __be32 +nfsd4_lock_ol_stateid(struct nfs4_ol_stateid *stp) +{ + __be32 ret; + + mutex_lock_nested(&stp->st_mutex, LOCK_STATEID_MUTEX); + ret = nfsd4_verify_open_stid(&stp->st_stid); + if (ret != nfs_ok) + mutex_unlock(&stp->st_mutex); + return ret; +} + +static struct nfs4_ol_stateid * +nfsd4_find_and_lock_existing_open(struct nfs4_file *fp, struct nfsd4_open *open) +{ + struct nfs4_ol_stateid *stp; + for (;;) { + spin_lock(&fp->fi_lock); + stp = nfsd4_find_existing_open(fp, open); + spin_unlock(&fp->fi_lock); + if (!stp || nfsd4_lock_ol_stateid(stp) == nfs_ok) + break; + nfs4_put_stid(&stp->st_stid); + } + return stp; +} + static struct nfs4_openowner * alloc_init_open_stateowner(unsigned int strhashval, struct nfsd4_open *open, struct nfsd4_compound_state *cstate) @@ -3563,8 +3670,9 @@ init_open_stateid(struct nfs4_file *fp, struct nfsd4_open *open) stp = open->op_stp; /* We are moving these outside of the spinlocks to avoid the warnings */ mutex_init(&stp->st_mutex); - mutex_lock(&stp->st_mutex); + mutex_lock_nested(&stp->st_mutex, OPEN_STATEID_MUTEX); +retry: spin_lock(&oo->oo_owner.so_client->cl_lock); spin_lock(&fp->fi_lock); @@ -3573,7 +3681,7 @@ init_open_stateid(struct nfs4_file *fp, struct nfsd4_open *open) goto out_unlock; open->op_stp = NULL; - atomic_inc(&stp->st_stid.sc_count); + refcount_inc(&stp->st_stid.sc_count); stp->st_stid.sc_type = NFS4_OPEN_STID; INIT_LIST_HEAD(&stp->st_locks); stp->st_stateowner = nfs4_get_stateowner(&oo->oo_owner); @@ -3589,7 +3697,11 @@ out_unlock: spin_unlock(&fp->fi_lock); spin_unlock(&oo->oo_owner.so_client->cl_lock); if (retstp) { - mutex_lock(&retstp->st_mutex); + /* Handle races with CLOSE */ + if (nfsd4_lock_ol_stateid(retstp) != nfs_ok) { + nfs4_put_stid(&retstp->st_stid); + goto retry; + } /* To keep mutex tracking happy */ mutex_unlock(&stp->st_mutex); stp = retstp; @@ -3621,7 +3733,7 @@ move_to_close_lru(struct nfs4_ol_stateid *s, struct net *net) * there should be no danger of the refcount going back up again at * this point. */ - wait_event(close_wq, atomic_read(&s->st_stid.sc_count) == 2); + wait_event(close_wq, refcount_read(&s->st_stid.sc_count) == 2); release_all_access(s); if (s->st_stid.sc_file) { @@ -3647,7 +3759,7 @@ find_file_locked(struct knfsd_fh *fh, unsigned int hashval) hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash) { if (fh_match(&fp->fi_fhandle, fh)) { - if (atomic_inc_not_zero(&fp->fi_ref)) + if (refcount_inc_not_zero(&fp->fi_ref)) return fp; } } @@ -3783,7 +3895,7 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp) * lock) we know the server hasn't removed the lease yet, we know * it's safe to take a reference. */ - atomic_inc(&dp->dl_stid.sc_count); + refcount_inc(&dp->dl_stid.sc_count); nfsd4_run_cb(&dp->dl_recall); } @@ -3966,7 +4078,8 @@ static struct nfs4_delegation *find_deleg_stateid(struct nfs4_client *cl, statei { struct nfs4_stid *ret; - ret = find_stateid_by_type(cl, s, NFS4_DELEG_STID); + ret = find_stateid_by_type(cl, s, + NFS4_DELEG_STID|NFS4_REVOKED_DELEG_STID); if (!ret) return NULL; return delegstateid(ret); @@ -3989,6 +4102,12 @@ nfs4_check_deleg(struct nfs4_client *cl, struct nfsd4_open *open, deleg = find_deleg_stateid(cl, &open->op_delegate_stateid); if (deleg == NULL) goto out; + if (deleg->dl_stid.sc_type == NFS4_REVOKED_DELEG_STID) { + nfs4_put_stid(&deleg->dl_stid); + if (cl->cl_minorversion) + status = nfserr_deleg_revoked; + goto out; + } flags = share_access_to_flags(open->op_share_access); status = nfs4_check_delegmode(deleg, flags); if (status) { @@ -4392,6 +4511,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf struct nfs4_ol_stateid *stp = NULL; struct nfs4_delegation *dp = NULL; __be32 status; + bool new_stp = false; /* * Lookup file; if found, lookup stateid and check open request, @@ -4403,9 +4523,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf status = nfs4_check_deleg(cl, open, &dp); if (status) goto out; - spin_lock(&fp->fi_lock); - stp = nfsd4_find_existing_open(fp, open); - spin_unlock(&fp->fi_lock); + stp = nfsd4_find_and_lock_existing_open(fp, open); } else { open->op_file = NULL; status = nfserr_bad_stateid; @@ -4413,35 +4531,31 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf goto out; } + if (!stp) { + stp = init_open_stateid(fp, open); + if (!open->op_stp) + new_stp = true; + } + /* * OPEN the file, or upgrade an existing OPEN. * If truncate fails, the OPEN fails. + * + * stp is already locked. */ - if (stp) { + if (!new_stp) { /* Stateid was found, this is an OPEN upgrade */ - mutex_lock(&stp->st_mutex); status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open); if (status) { mutex_unlock(&stp->st_mutex); goto out; } } else { - /* stp is returned locked. */ - stp = init_open_stateid(fp, open); - /* See if we lost the race to some other thread */ - if (stp->st_access_bmap != 0) { - status = nfs4_upgrade_open(rqstp, fp, current_fh, - stp, open); - if (status) { - mutex_unlock(&stp->st_mutex); - goto out; - } - goto upgrade_out; - } status = nfs4_get_vfs_file(rqstp, fp, current_fh, stp, open); if (status) { - mutex_unlock(&stp->st_mutex); + stp->st_stid.sc_type = NFS4_CLOSED_STID; release_open_stateid(stp); + mutex_unlock(&stp->st_mutex); goto out; } @@ -4450,7 +4564,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf if (stp->st_clnt_odstate == open->op_odstate) open->op_odstate = NULL; } -upgrade_out: + nfs4_inc_and_copy_stateid(&open->op_stateid, &stp->st_stid); mutex_unlock(&stp->st_mutex); @@ -4677,7 +4791,7 @@ nfs4_laundromat(struct nfsd_net *nn) spin_unlock(&nn->blocked_locks_lock); while (!list_empty(&reaplist)) { - nbl = list_first_entry(&nn->blocked_locks_lru, + nbl = list_first_entry(&reaplist, struct nfsd4_blocked_lock, nbl_lru); list_del_init(&nbl->nbl_lru); posix_unblock_lock(&nbl->nbl_lock); @@ -4798,6 +4912,18 @@ static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_s return nfserr_old_stateid; } +static __be32 nfsd4_stid_check_stateid_generation(stateid_t *in, struct nfs4_stid *s, bool has_session) +{ + __be32 ret; + + spin_lock(&s->sc_lock); + ret = nfsd4_verify_open_stid(s); + if (ret == nfs_ok) + ret = check_stateid_generation(in, &s->sc_stateid, has_session); + spin_unlock(&s->sc_lock); + return ret; +} + static __be32 nfsd4_check_openowner_confirmed(struct nfs4_ol_stateid *ols) { if (ols->st_stateowner->so_is_open_owner && @@ -4811,7 +4937,8 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) struct nfs4_stid *s; __be32 status = nfserr_bad_stateid; - if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) + if (ZERO_STATEID(stateid) || ONE_STATEID(stateid) || + CLOSE_STATEID(stateid)) return status; /* Client debugging aid. */ if (!same_clid(&stateid->si_opaque.so_clid, &cl->cl_clientid)) { @@ -4826,7 +4953,7 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid) s = find_stateid_locked(cl, stateid); if (!s) goto out_unlock; - status = check_stateid_generation(stateid, &s->sc_stateid, 1); + status = nfsd4_stid_check_stateid_generation(stateid, s, 1); if (status) goto out_unlock; switch (s->sc_type) { @@ -4858,8 +4985,19 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, struct nfs4_stid **s, struct nfsd_net *nn) { __be32 status; + bool return_revoked = false; - if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) + /* + * only return revoked delegations if explicitly asked. + * otherwise we report revoked or bad_stateid status. + */ + if (typemask & NFS4_REVOKED_DELEG_STID) + return_revoked = true; + else if (typemask & NFS4_DELEG_STID) + typemask |= NFS4_REVOKED_DELEG_STID; + + if (ZERO_STATEID(stateid) || ONE_STATEID(stateid) || + CLOSE_STATEID(stateid)) return nfserr_bad_stateid; status = lookup_clientid(&stateid->si_opaque.so_clid, cstate, nn); if (status == nfserr_stale_clientid) { @@ -4872,6 +5010,12 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, *s = find_stateid_by_type(cstate->clp, stateid, typemask); if (!*s) return nfserr_bad_stateid; + if (((*s)->sc_type == NFS4_REVOKED_DELEG_STID) && !return_revoked) { + nfs4_put_stid(*s); + if (cstate->minorversion) + return nfserr_deleg_revoked; + return nfserr_bad_stateid; + } return nfs_ok; } @@ -4971,7 +5115,7 @@ nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, &s, nn); if (status) return status; - status = check_stateid_generation(stateid, &s->sc_stateid, + status = nfsd4_stid_check_stateid_generation(stateid, s, nfsd4_has_session(cstate)); if (status) goto out; @@ -5025,7 +5169,9 @@ nfsd4_free_lock_stateid(stateid_t *stateid, struct nfs4_stid *s) struct nfs4_ol_stateid *stp = openlockstateid(s); __be32 ret; - mutex_lock(&stp->st_mutex); + ret = nfsd4_lock_ol_stateid(stp); + if (ret) + goto out_put_stid; ret = check_stateid_generation(stateid, &s->sc_stateid, 1); if (ret) @@ -5036,11 +5182,13 @@ nfsd4_free_lock_stateid(stateid_t *stateid, struct nfs4_stid *s) lockowner(stp->st_stateowner))) goto out; + stp->st_stid.sc_type = NFS4_CLOSED_STID; release_lock_stateid(stp); ret = nfs_ok; out: mutex_unlock(&stp->st_mutex); +out_put_stid: nfs4_put_stid(s); return ret; } @@ -5060,6 +5208,7 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, s = find_stateid_locked(cl, stateid); if (!s) goto out_unlock; + spin_lock(&s->sc_lock); switch (s->sc_type) { case NFS4_DELEG_STID: ret = nfserr_locks_held; @@ -5071,11 +5220,13 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, ret = nfserr_locks_held; break; case NFS4_LOCK_STID: - atomic_inc(&s->sc_count); + spin_unlock(&s->sc_lock); + refcount_inc(&s->sc_count); spin_unlock(&cl->cl_lock); ret = nfsd4_free_lock_stateid(stateid, s); goto out; case NFS4_REVOKED_DELEG_STID: + spin_unlock(&s->sc_lock); dp = delegstateid(s); list_del_init(&dp->dl_recall_lru); spin_unlock(&cl->cl_lock); @@ -5084,6 +5235,7 @@ nfsd4_free_stateid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out; /* Default falls through and returns nfserr_bad_stateid */ } + spin_unlock(&s->sc_lock); out_unlock: spin_unlock(&cl->cl_lock); out: @@ -5106,15 +5258,9 @@ static __be32 nfs4_seqid_op_checks(struct nfsd4_compound_state *cstate, stateid_ status = nfsd4_check_seqid(cstate, sop, seqid); if (status) return status; - if (stp->st_stid.sc_type == NFS4_CLOSED_STID - || stp->st_stid.sc_type == NFS4_REVOKED_DELEG_STID) - /* - * "Closed" stateid's exist *only* to return - * nfserr_replay_me from the previous step, and - * revoked delegations are kept only for free_stateid. - */ - return nfserr_bad_stateid; - mutex_lock(&stp->st_mutex); + status = nfsd4_lock_ol_stateid(stp); + if (status != nfs_ok) + return status; status = check_stateid_generation(stateid, &stp->st_stid.sc_stateid, nfsd4_has_session(cstate)); if (status == nfs_ok) status = nfs4_check_fh(current_fh, &stp->st_stid); @@ -5294,7 +5440,6 @@ static void nfsd4_close_open_stateid(struct nfs4_ol_stateid *s) bool unhashed; LIST_HEAD(reaplist); - s->st_stid.sc_type = NFS4_CLOSED_STID; spin_lock(&clp->cl_lock); unhashed = unhash_open_stateid(s, &reaplist); @@ -5334,10 +5479,17 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfsd4_bump_seqid(cstate, status); if (status) goto out; + + stp->st_stid.sc_type = NFS4_CLOSED_STID; nfs4_inc_and_copy_stateid(&close->cl_stateid, &stp->st_stid); - mutex_unlock(&stp->st_mutex); nfsd4_close_open_stateid(stp); + mutex_unlock(&stp->st_mutex); + + /* See RFC5661 sectionm 18.2.4 */ + if (stp->st_stid.sc_client->cl_minorversion) + memcpy(&close->cl_stateid, &close_stateid, + sizeof(close->cl_stateid)); /* put reference from nfs4_preprocess_seqid_op */ nfs4_put_stid(&stp->st_stid); @@ -5363,7 +5515,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) goto out; dp = delegstateid(s); - status = check_stateid_generation(stateid, &dp->dl_stid.sc_stateid, nfsd4_has_session(cstate)); + status = nfsd4_stid_check_stateid_generation(stateid, &dp->dl_stid, nfsd4_has_session(cstate)); if (status) goto put_stateid; @@ -5569,16 +5721,43 @@ alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp, return ret; } -static void +static struct nfs4_ol_stateid * +find_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp) +{ + struct nfs4_ol_stateid *lst; + struct nfs4_client *clp = lo->lo_owner.so_client; + + lockdep_assert_held(&clp->cl_lock); + + list_for_each_entry(lst, &lo->lo_owner.so_stateids, st_perstateowner) { + if (lst->st_stid.sc_type != NFS4_LOCK_STID) + continue; + if (lst->st_stid.sc_file == fp) { + refcount_inc(&lst->st_stid.sc_count); + return lst; + } + } + return NULL; +} + +static struct nfs4_ol_stateid * init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo, struct nfs4_file *fp, struct inode *inode, struct nfs4_ol_stateid *open_stp) { struct nfs4_client *clp = lo->lo_owner.so_client; + struct nfs4_ol_stateid *retstp; - lockdep_assert_held(&clp->cl_lock); + mutex_init(&stp->st_mutex); + mutex_lock_nested(&stp->st_mutex, OPEN_STATEID_MUTEX); +retry: + spin_lock(&clp->cl_lock); + spin_lock(&fp->fi_lock); + retstp = find_lock_stateid(lo, fp); + if (retstp) + goto out_unlock; - atomic_inc(&stp->st_stid.sc_count); + refcount_inc(&stp->st_stid.sc_count); stp->st_stid.sc_type = NFS4_LOCK_STID; stp->st_stateowner = nfs4_get_stateowner(&lo->lo_owner); get_nfs4_file(fp); @@ -5586,29 +5765,22 @@ init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo, stp->st_access_bmap = 0; stp->st_deny_bmap = open_stp->st_deny_bmap; stp->st_openstp = open_stp; - mutex_init(&stp->st_mutex); list_add(&stp->st_locks, &open_stp->st_locks); list_add(&stp->st_perstateowner, &lo->lo_owner.so_stateids); - spin_lock(&fp->fi_lock); list_add(&stp->st_perfile, &fp->fi_stateids); +out_unlock: spin_unlock(&fp->fi_lock); -} - -static struct nfs4_ol_stateid * -find_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fp) -{ - struct nfs4_ol_stateid *lst; - struct nfs4_client *clp = lo->lo_owner.so_client; - - lockdep_assert_held(&clp->cl_lock); - - list_for_each_entry(lst, &lo->lo_owner.so_stateids, st_perstateowner) { - if (lst->st_stid.sc_file == fp) { - atomic_inc(&lst->st_stid.sc_count); - return lst; + spin_unlock(&clp->cl_lock); + if (retstp) { + if (nfsd4_lock_ol_stateid(retstp) != nfs_ok) { + nfs4_put_stid(&retstp->st_stid); + goto retry; } + /* To keep mutex tracking happy */ + mutex_unlock(&stp->st_mutex); + stp = retstp; } - return NULL; + return stp; } static struct nfs4_ol_stateid * @@ -5621,26 +5793,25 @@ find_or_create_lock_stateid(struct nfs4_lockowner *lo, struct nfs4_file *fi, struct nfs4_openowner *oo = openowner(ost->st_stateowner); struct nfs4_client *clp = oo->oo_owner.so_client; + *new = false; spin_lock(&clp->cl_lock); lst = find_lock_stateid(lo, fi); - if (lst == NULL) { - spin_unlock(&clp->cl_lock); - ns = nfs4_alloc_stid(clp, stateid_slab, nfs4_free_lock_stateid); - if (ns == NULL) - return NULL; - - spin_lock(&clp->cl_lock); - lst = find_lock_stateid(lo, fi); - if (likely(!lst)) { - lst = openlockstateid(ns); - init_lock_stateid(lst, lo, fi, inode, ost); - ns = NULL; - *new = true; - } - } spin_unlock(&clp->cl_lock); - if (ns) + if (lst != NULL) { + if (nfsd4_lock_ol_stateid(lst) == nfs_ok) + goto out; + nfs4_put_stid(&lst->st_stid); + } + ns = nfs4_alloc_stid(clp, stateid_slab, nfs4_free_lock_stateid); + if (ns == NULL) + return NULL; + + lst = init_lock_stateid(openlockstateid(ns), lo, fi, inode, ost); + if (lst == openlockstateid(ns)) + *new = true; + else nfs4_put_stid(ns); +out: return lst; } @@ -5677,7 +5848,6 @@ lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, struct nfs4_lockowner *lo; struct nfs4_ol_stateid *lst; unsigned int strhashval; - bool hashed; lo = find_lockowner_str(cl, &lock->lk_new_owner); if (!lo) { @@ -5693,25 +5863,12 @@ lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, goto out; } -retry: lst = find_or_create_lock_stateid(lo, fi, inode, ost, new); if (lst == NULL) { status = nfserr_jukebox; goto out; } - mutex_lock(&lst->st_mutex); - - /* See if it's still hashed to avoid race with FREE_STATEID */ - spin_lock(&cl->cl_lock); - hashed = !list_empty(&lst->st_perfile); - spin_unlock(&cl->cl_lock); - - if (!hashed) { - mutex_unlock(&lst->st_mutex); - nfs4_put_stid(&lst->st_stid); - goto retry; - } status = nfs_ok; *plst = lst; out: @@ -5917,14 +6074,16 @@ out: seqid_mutating_err(ntohl(status))) lock_sop->lo_owner.so_seqid++; - mutex_unlock(&lock_stp->st_mutex); - /* * If this is a new, never-before-used stateid, and we are * returning an error, then just go ahead and release it. */ - if (status && new) + if (status && new) { + lock_stp->st_stid.sc_type = NFS4_CLOSED_STID; release_lock_stateid(lock_stp); + } + + mutex_unlock(&lock_stp->st_mutex); nfs4_put_stid(&lock_stp->st_stid); } @@ -6944,6 +7103,10 @@ static int nfs4_state_create_net(struct net *net) INIT_LIST_HEAD(&nn->sessionid_hashtbl[i]); nn->conf_name_tree = RB_ROOT; nn->unconf_name_tree = RB_ROOT; + nn->boot_time = get_seconds(); + nn->grace_ended = false; + nn->nfsd4_manager.block_opens = true; + INIT_LIST_HEAD(&nn->nfsd4_manager.list); INIT_LIST_HEAD(&nn->client_lru); INIT_LIST_HEAD(&nn->close_lru); INIT_LIST_HEAD(&nn->del_recall_lru); @@ -7001,13 +7164,10 @@ nfs4_state_start_net(struct net *net) ret = nfs4_state_create_net(net); if (ret) return ret; - nn->boot_time = get_seconds(); - nn->grace_ended = false; - nn->nfsd4_manager.block_opens = true; locks_start_grace(net, &nn->nfsd4_manager); nfsd4_client_tracking_init(net); - printk(KERN_INFO "NFSD: starting %ld-second grace period (net %p)\n", - nn->nfsd4_grace, net); + printk(KERN_INFO "NFSD: starting %ld-second grace period (net %x)\n", + nn->nfsd4_grace, net->ns.inum); queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_grace * HZ); return 0; } @@ -7080,7 +7240,7 @@ nfs4_state_shutdown_net(struct net *net) spin_unlock(&nn->blocked_locks_lock); while (!list_empty(&reaplist)) { - nbl = list_first_entry(&nn->blocked_locks_lru, + nbl = list_first_entry(&reaplist, struct nfsd4_blocked_lock, nbl_lru); list_del_init(&nbl->nbl_lru); posix_unblock_lock(&nbl->nbl_lock); diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 6493df6b1bd5..d107b4426f7e 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1241,6 +1241,9 @@ static __net_init int nfsd_init_net(struct net *net) nn->nfsd4_grace = 90; nn->clverifier_counter = prandom_u32(); nn->clientid_counter = prandom_u32(); + + atomic_set(&nn->ntf_refcnt, 0); + init_waitqueue_head(&nn->ntf_wq); return 0; out_idmap_error: diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index e02bd2783124..89cb484f1cfb 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -335,7 +335,8 @@ static int nfsd_inetaddr_event(struct notifier_block *this, unsigned long event, struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct sockaddr_in sin; - if (event != NETDEV_DOWN) + if ((event != NETDEV_DOWN) || + !atomic_inc_not_zero(&nn->ntf_refcnt)) goto out; if (nn->nfsd_serv) { @@ -344,6 +345,8 @@ static int nfsd_inetaddr_event(struct notifier_block *this, unsigned long event, sin.sin_addr.s_addr = ifa->ifa_local; svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin); } + atomic_dec(&nn->ntf_refcnt); + wake_up(&nn->ntf_wq); out: return NOTIFY_DONE; @@ -363,7 +366,8 @@ static int nfsd_inet6addr_event(struct notifier_block *this, struct nfsd_net *nn = net_generic(net, nfsd_net_id); struct sockaddr_in6 sin6; - if (event != NETDEV_DOWN) + if ((event != NETDEV_DOWN) || + !atomic_inc_not_zero(&nn->ntf_refcnt)) goto out; if (nn->nfsd_serv) { @@ -374,7 +378,8 @@ static int nfsd_inet6addr_event(struct notifier_block *this, sin6.sin6_scope_id = ifa->idev->dev->ifindex; svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin6); } - + atomic_dec(&nn->ntf_refcnt); + wake_up(&nn->ntf_wq); out: return NOTIFY_DONE; } @@ -391,6 +396,7 @@ static void nfsd_last_thread(struct svc_serv *serv, struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); + atomic_dec(&nn->ntf_refcnt); /* check if the notifier still has clients */ if (atomic_dec_return(&nfsd_notifier_refcount) == 0) { unregister_inetaddr_notifier(&nfsd_inetaddr_notifier); @@ -398,6 +404,7 @@ static void nfsd_last_thread(struct svc_serv *serv, struct net *net) unregister_inet6addr_notifier(&nfsd_inet6addr_notifier); #endif } + wait_event(nn->ntf_wq, atomic_read(&nn->ntf_refcnt) == 0); /* * write_ports can create the server without actually starting @@ -447,7 +454,7 @@ void nfsd_reset_versions(void) */ static void set_max_drc(void) { - #define NFSD_DRC_SIZE_SHIFT 10 + #define NFSD_DRC_SIZE_SHIFT 7 nfsd_drc_max_mem = (nr_free_buffer_pages() >> NFSD_DRC_SIZE_SHIFT) * PAGE_SIZE; nfsd_drc_mem_used = 0; @@ -517,7 +524,8 @@ int nfsd_create_serv(struct net *net) register_inet6addr_notifier(&nfsd_inet6addr_notifier); #endif } - do_gettimeofday(&nn->nfssvc_boot); /* record boot time */ + atomic_inc(&nn->ntf_refcnt); + ktime_get_real_ts64(&nn->nfssvc_boot); /* record boot time */ return 0; } diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 005c911b34ac..f3772ea8ba0d 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -36,6 +36,7 @@ #define _NFSD4_STATE_H #include <linux/idr.h> +#include <linux/refcount.h> #include <linux/sunrpc/svc_xprt.h> #include "nfsfh.h" @@ -83,7 +84,7 @@ struct nfsd4_callback_ops { * fields that are of general use to any stateid. */ struct nfs4_stid { - atomic_t sc_count; + refcount_t sc_count; #define NFS4_OPEN_STID 1 #define NFS4_LOCK_STID 2 #define NFS4_DELEG_STID 4 @@ -169,11 +170,13 @@ static inline struct nfs4_delegation *delegstateid(struct nfs4_stid *s) struct nfsd4_slot { u32 sl_seqid; __be32 sl_status; + struct svc_cred sl_cred; u32 sl_datalen; u16 sl_opcnt; #define NFSD4_SLOT_INUSE (1 << 0) #define NFSD4_SLOT_CACHETHIS (1 << 1) #define NFSD4_SLOT_INITIALIZED (1 << 2) +#define NFSD4_SLOT_CACHED (1 << 3) u8 sl_flags; char sl_data[]; }; @@ -465,7 +468,7 @@ struct nfs4_clnt_odstate { struct nfs4_client *co_client; struct nfs4_file *co_file; struct list_head co_perfile; - atomic_t co_odcount; + refcount_t co_odcount; }; /* @@ -481,7 +484,7 @@ struct nfs4_clnt_odstate { * the global state_lock spinlock. */ struct nfs4_file { - atomic_t fi_ref; + refcount_t fi_ref; spinlock_t fi_lock; struct hlist_node fi_hash; /* hash on fi_fhandle */ struct list_head fi_stateids; @@ -634,7 +637,7 @@ struct nfs4_file *find_file(struct knfsd_fh *fh); void put_nfs4_file(struct nfs4_file *fi); static inline void get_nfs4_file(struct nfs4_file *fi) { - atomic_inc(&fi->fi_ref); + refcount_inc(&fi->fi_ref); } struct file *find_any_file(struct nfs4_file *f); diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 1e4edbf70052..bc29511b6405 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -649,9 +649,18 @@ static inline bool nfsd4_is_solo_sequence(struct nfsd4_compoundres *resp) return resp->opcnt == 1 && args->ops[0].opnum == OP_SEQUENCE; } -static inline bool nfsd4_not_cached(struct nfsd4_compoundres *resp) +/* + * The session reply cache only needs to cache replies that the client + * actually asked us to. But it's almost free for us to cache compounds + * consisting of only a SEQUENCE op, so we may as well cache those too. + * Also, the protocol doesn't give us a convenient response in the case + * of a replay of a solo SEQUENCE op that wasn't cached + * (RETRY_UNCACHED_REP can only be returned in the second op of a + * compound). + */ +static inline bool nfsd4_cache_this(struct nfsd4_compoundres *resp) { - return !(resp->cstate.slot->sl_flags & NFSD4_SLOT_CACHETHIS) + return (resp->cstate.slot->sl_flags & NFSD4_SLOT_CACHETHIS) || nfsd4_is_solo_sequence(resp); } diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 515d13c196da..1a2894aa0194 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -150,7 +150,7 @@ static int nilfs_symlink(struct inode *dir, struct dentry *dentry, if (err) return err; - inode = nilfs_new_inode(dir, S_IFLNK | S_IRWXUGO); + inode = nilfs_new_inode(dir, S_IFLNK | 0777); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out; diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index f65392fecb5c..9f3ffba41533 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1954,8 +1954,6 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci, err, ii->vfs_inode.i_ino); return err; } - mark_buffer_dirty(ibh); - nilfs_mdt_mark_dirty(ifile); spin_lock(&nilfs->ns_inode_lock); if (likely(!ii->i_bh)) ii->i_bh = ibh; @@ -1964,6 +1962,10 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci, goto retry; } + // Always redirty the buffer to avoid race condition + mark_buffer_dirty(ii->i_bh); + nilfs_mdt_mark_dirty(ifile); + clear_bit(NILFS_I_QUEUED, &ii->i_state); set_bit(NILFS_I_BUSY, &ii->i_state); list_move_tail(&ii->i_dirty, &sci->sc_dirty_files); @@ -1977,7 +1979,7 @@ static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci, struct the_nilfs *nilfs) { struct nilfs_inode_info *ii, *n; - int during_mount = !(sci->sc_super->s_flags & MS_ACTIVE); + int during_mount = !(sci->sc_super->s_flags & SB_ACTIVE); int defer_iput = false; spin_lock(&nilfs->ns_inode_lock); @@ -2400,11 +2402,11 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode) return err; } -static void nilfs_construction_timeout(unsigned long data) +static void nilfs_construction_timeout(struct timer_list *t) { - struct task_struct *p = (struct task_struct *)data; + struct nilfs_sc_info *sci = from_timer(sci, t, sc_timer); - wake_up_process(p); + wake_up_process(sci->sc_timer_task); } static void @@ -2542,8 +2544,7 @@ static int nilfs_segctor_thread(void *arg) struct the_nilfs *nilfs = sci->sc_super->s_fs_info; int timeout = 0; - sci->sc_timer.data = (unsigned long)current; - sci->sc_timer.function = nilfs_construction_timeout; + sci->sc_timer_task = current; /* start sync. */ sci->sc_task = current; @@ -2674,7 +2675,7 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct super_block *sb, INIT_LIST_HEAD(&sci->sc_gc_inodes); INIT_LIST_HEAD(&sci->sc_iput_queue); INIT_WORK(&sci->sc_iput_work, nilfs_iput_work_func); - init_timer(&sci->sc_timer); + timer_setup(&sci->sc_timer, nilfs_construction_timeout, 0); sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT; sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ; diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h index 1060949d7dd2..84084a4d9b3e 100644 --- a/fs/nilfs2/segment.h +++ b/fs/nilfs2/segment.h @@ -180,6 +180,7 @@ struct nilfs_sc_info { unsigned long sc_watermark; struct timer_list sc_timer; + struct task_struct *sc_timer_task; struct task_struct *sc_task; }; diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c index 1541a1e9221a..1341a41e7b43 100644 --- a/fs/nilfs2/sufile.c +++ b/fs/nilfs2/sufile.c @@ -630,22 +630,22 @@ void nilfs_sufile_do_set_error(struct inode *sufile, __u64 segnum, } /** - * nilfs_sufile_truncate_range - truncate range of segment array - * @sufile: inode of segment usage file - * @start: start segment number (inclusive) - * @end: end segment number (inclusive) - * - * Return Value: On success, 0 is returned. On error, one of the - * following negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-EINVAL - Invalid number of segments specified - * - * %-EBUSY - Dirty or active segments are present in the range - */ + * nilfs_sufile_truncate_range - truncate range of segment array + * @sufile: inode of segment usage file + * @start: start segment number (inclusive) + * @end: end segment number (inclusive) + * + * Return Value: On success, 0 is returned. On error, one of the + * following negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-EINVAL - Invalid number of segments specified + * + * %-EBUSY - Dirty or active segments are present in the range + */ static int nilfs_sufile_truncate_range(struct inode *sufile, __u64 start, __u64 end) { diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 4fc018dfcfae..3073b646e1ba 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -141,7 +141,7 @@ void __nilfs_error(struct super_block *sb, const char *function, if (nilfs_test_opt(nilfs, ERRORS_RO)) { printk(KERN_CRIT "Remounting filesystem read-only\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } } @@ -160,7 +160,6 @@ struct inode *nilfs_alloc_inode(struct super_block *sb) ii->i_bh = NULL; ii->i_state = 0; ii->i_cno = 0; - ii->vfs_inode.i_version = 1; nilfs_mapping_init(&ii->i_btnode_cache, &ii->vfs_inode); return &ii->vfs_inode; } @@ -870,7 +869,7 @@ int nilfs_store_magic_and_option(struct super_block *sb, /* FS independent flags */ #ifdef NILFS_ATIME_DISABLE - sb->s_flags |= MS_NOATIME; + sb->s_flags |= SB_NOATIME; #endif nilfs_set_default_options(sb, sbp); @@ -1134,7 +1133,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) err = -EINVAL; goto restore_opts; } - sb->s_flags = (sb->s_flags & ~MS_POSIXACL); + sb->s_flags = (sb->s_flags & ~SB_POSIXACL); err = -EINVAL; @@ -1144,12 +1143,12 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } - if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb)) + if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) goto out; - if (*flags & MS_RDONLY) { + if (*flags & SB_RDONLY) { /* Shutting down log writer */ nilfs_detach_log_writer(sb); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; /* * Remounting a valid RW partition RDONLY, so set @@ -1179,7 +1178,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } - sb->s_flags &= ~MS_RDONLY; + sb->s_flags &= ~SB_RDONLY; root = NILFS_I(d_inode(sb->s_root))->i_root; err = nilfs_attach_log_writer(sb, root); @@ -1213,7 +1212,7 @@ static int nilfs_parse_snapshot_option(const char *option, const char *msg = NULL; int err; - if (!(sd->flags & MS_RDONLY)) { + if (!(sd->flags & SB_RDONLY)) { msg = "read-only option is not specified"; goto parse_error; } @@ -1287,7 +1286,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags, struct dentry *root_dentry; int err, s_new = false; - if (!(flags & MS_RDONLY)) + if (!(flags & SB_RDONLY)) mode |= FMODE_WRITE; sd.bdev = blkdev_get_by_path(dev_name, mode, fs_type); @@ -1328,14 +1327,14 @@ nilfs_mount(struct file_system_type *fs_type, int flags, snprintf(s->s_id, sizeof(s->s_id), "%pg", sd.bdev); sb_set_blocksize(s, block_size(sd.bdev)); - err = nilfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0); + err = nilfs_fill_super(s, data, flags & SB_SILENT ? 1 : 0); if (err) goto failed_super; - s->s_flags |= MS_ACTIVE; + s->s_flags |= SB_ACTIVE; } else if (!sd.cno) { if (nilfs_tree_is_busy(s->s_root)) { - if ((flags ^ s->s_flags) & MS_RDONLY) { + if ((flags ^ s->s_flags) & SB_RDONLY) { nilfs_msg(s, KERN_ERR, "the device already has a %s mount.", sb_rdonly(s) ? "read-only" : "read/write"); diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 2dd75bf619ad..1a85317e83f0 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -220,7 +220,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb) if (!valid_fs) { nilfs_msg(sb, KERN_WARNING, "mounting unchecked fs"); - if (s_flags & MS_RDONLY) { + if (s_flags & SB_RDONLY) { nilfs_msg(sb, KERN_INFO, "recovery required for readonly filesystem"); nilfs_msg(sb, KERN_INFO, @@ -286,7 +286,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb) if (valid_fs) goto skip_recovery; - if (s_flags & MS_RDONLY) { + if (s_flags & SB_RDONLY) { __u64 features; if (nilfs_test_opt(nilfs, NORECOVERY)) { @@ -309,7 +309,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb) err = -EROFS; goto failed_unload; } - sb->s_flags &= ~MS_RDONLY; + sb->s_flags &= ~SB_RDONLY; } else if (nilfs_test_opt(nilfs, NORECOVERY)) { nilfs_msg(sb, KERN_ERR, "recovery cancelled because norecovery option was specified for a read/write mount"); @@ -737,7 +737,7 @@ struct nilfs_root *nilfs_lookup_root(struct the_nilfs *nilfs, __u64 cno) } else if (cno > root->cno) { n = n->rb_right; } else { - atomic_inc(&root->count); + refcount_inc(&root->count); spin_unlock(&nilfs->ns_cptree_lock); return root; } @@ -776,7 +776,7 @@ nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno) } else if (cno > root->cno) { p = &(*p)->rb_right; } else { - atomic_inc(&root->count); + refcount_inc(&root->count); spin_unlock(&nilfs->ns_cptree_lock); kfree(new); return root; @@ -786,7 +786,7 @@ nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno) new->cno = cno; new->ifile = NULL; new->nilfs = nilfs; - atomic_set(&new->count, 1); + refcount_set(&new->count, 1); atomic64_set(&new->inodes_count, 0); atomic64_set(&new->blocks_count, 0); @@ -806,7 +806,7 @@ nilfs_find_or_create_root(struct the_nilfs *nilfs, __u64 cno) void nilfs_put_root(struct nilfs_root *root) { - if (atomic_dec_and_test(&root->count)) { + if (refcount_dec_and_test(&root->count)) { struct the_nilfs *nilfs = root->nilfs; nilfs_sysfs_delete_snapshot_group(root); diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index b305c6f033e7..883d732b0259 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -27,6 +27,7 @@ #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/slab.h> +#include <linux/refcount.h> struct nilfs_sc_info; struct nilfs_sysfs_dev_subgroups; @@ -246,7 +247,7 @@ struct nilfs_root { __u64 cno; struct rb_node rb_node; - atomic_t count; + refcount_t count; struct the_nilfs *nilfs; struct inode *ifile; @@ -299,7 +300,7 @@ void nilfs_swap_super_block(struct the_nilfs *); static inline void nilfs_get_root(struct nilfs_root *root) { - atomic_inc(&root->count); + refcount_inc(&root->count); } static inline int nilfs_valid_fs(struct the_nilfs *nilfs) diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 81d8959b6aef..219b269c737e 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -67,7 +67,7 @@ void fsnotify_unmount_inodes(struct super_block *sb) /* * If i_count is zero, the inode cannot have any watches and - * doing an __iget/iput with MS_ACTIVE clear would actually + * doing an __iget/iput with SB_ACTIVE clear would actually * evict all inodes with zero i_count from icache which is * unnecessarily violent and may in fact be illegal to do. */ diff --git a/fs/nsfs.c b/fs/nsfs.c index ef243e14b6eb..7c6f76d29f56 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -255,5 +255,5 @@ void __init nsfs_init(void) nsfs_mnt = kern_mount(&nsfs); if (IS_ERR(nsfs_mnt)) panic("can't set nsfs up\n"); - nsfs_mnt->mnt_sb->s_flags &= ~MS_NOUSER; + nsfs_mnt->mnt_sb->s_flags &= ~SB_NOUSER; } diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 3f70f041dbe9..bb7159f697f2 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -473,7 +473,7 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt) #ifndef NTFS_RW /* For read-only compiled driver, enforce read-only flag. */ - *flags |= MS_RDONLY; + *flags |= SB_RDONLY; #else /* NTFS_RW */ /* * For the read-write compiled driver, if we are remounting read-write, @@ -487,7 +487,7 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt) * When remounting read-only, mark the volume clean if no volume errors * have occurred. */ - if (sb_rdonly(sb) && !(*flags & MS_RDONLY)) { + if (sb_rdonly(sb) && !(*flags & SB_RDONLY)) { static const char *es = ". Cannot remount read-write."; /* Remounting read-write. */ @@ -548,7 +548,7 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt) NVolSetErrors(vol); return -EROFS; } - } else if (!sb_rdonly(sb) && (*flags & MS_RDONLY)) { + } else if (!sb_rdonly(sb) && (*flags & SB_RDONLY)) { /* Remounting read-only. */ if (!NVolErrors(vol)) { if (ntfs_clear_volume_flags(vol, VOLUME_IS_DIRTY)) @@ -1799,7 +1799,7 @@ static bool load_system_files(ntfs_volume *vol) es3); goto iput_mirr_err_out; } - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; ntfs_error(sb, "%s. Mounting read-only%s", !vol->mftmirr_ino ? es1 : es2, es3); } else @@ -1937,7 +1937,7 @@ get_ctx_vol_failed: es1, es2); goto iput_vol_err_out; } - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); } else ntfs_warning(sb, "%s. Will not be able to remount " @@ -1974,7 +1974,7 @@ get_ctx_vol_failed: } goto iput_logfile_err_out; } - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); } else ntfs_warning(sb, "%s. Will not be able to remount " @@ -2019,7 +2019,7 @@ get_ctx_vol_failed: es1, es2); goto iput_root_err_out; } - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); } else ntfs_warning(sb, "%s. Will not be able to remount " @@ -2042,7 +2042,7 @@ get_ctx_vol_failed: goto iput_root_err_out; } ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; /* * Do not set NVolErrors() because ntfs_remount() might manage * to set the dirty flag in which case all would be well. @@ -2055,7 +2055,7 @@ get_ctx_vol_failed: * If (still) a read-write mount, set the NT4 compatibility flag on * newer NTFS version volumes. */ - if (!(sb->s_flags & MS_RDONLY) && (vol->major_ver > 1) && + if (!(sb->s_flags & SB_RDONLY) && (vol->major_ver > 1) && ntfs_set_volume_flags(vol, VOLUME_MOUNTED_ON_NT4)) { static const char *es1 = "Failed to set NT4 compatibility flag"; static const char *es2 = ". Run chkdsk."; @@ -2069,7 +2069,7 @@ get_ctx_vol_failed: goto iput_root_err_out; } ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; NVolSetErrors(vol); } #endif @@ -2087,7 +2087,7 @@ get_ctx_vol_failed: goto iput_root_err_out; } ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; NVolSetErrors(vol); } #endif /* NTFS_RW */ @@ -2128,7 +2128,7 @@ get_ctx_vol_failed: es1, es2); goto iput_quota_err_out; } - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); } else ntfs_warning(sb, "%s. Will not be able to remount " @@ -2150,7 +2150,7 @@ get_ctx_vol_failed: goto iput_quota_err_out; } ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; NVolSetErrors(vol); } /* @@ -2171,7 +2171,7 @@ get_ctx_vol_failed: es1, es2); goto iput_usnjrnl_err_out; } - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); } else ntfs_warning(sb, "%s. Will not be able to remount " @@ -2194,7 +2194,7 @@ get_ctx_vol_failed: goto iput_usnjrnl_err_out; } ntfs_error(sb, "%s. Mounting read-only%s", es1, es2); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; NVolSetErrors(vol); } #endif /* NTFS_RW */ @@ -2728,7 +2728,7 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) lockdep_off(); ntfs_debug("Entering."); #ifndef NTFS_RW - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; #endif /* ! NTFS_RW */ /* Allocate a new ntfs_volume and place it in sb->s_fs_info. */ sb->s_fs_info = kmalloc(sizeof(ntfs_volume), GFP_NOFS); diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 8d779227370a..bebe59feca58 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -140,7 +140,7 @@ static void o2net_rx_until_empty(struct work_struct *work); static void o2net_shutdown_sc(struct work_struct *work); static void o2net_listen_data_ready(struct sock *sk); static void o2net_sc_send_keep_req(struct work_struct *work); -static void o2net_idle_timer(unsigned long data); +static void o2net_idle_timer(struct timer_list *t); static void o2net_sc_postpone_idle(struct o2net_sock_container *sc); static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc); @@ -450,8 +450,7 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node) INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc); INIT_DELAYED_WORK(&sc->sc_keepalive_work, o2net_sc_send_keep_req); - setup_timer(&sc->sc_idle_timeout, o2net_idle_timer, - (unsigned long)sc); + timer_setup(&sc->sc_idle_timeout, o2net_idle_timer, 0); sclog(sc, "alloced\n"); @@ -1517,9 +1516,9 @@ static void o2net_sc_send_keep_req(struct work_struct *work) /* socket shutdown does a del_timer_sync against this as it tears down. * we can't start this timer until we've got to the point in sc buildup * where shutdown is going to be involved */ -static void o2net_idle_timer(unsigned long data) +static void o2net_idle_timer(struct timer_list *t) { - struct o2net_sock_container *sc = (struct o2net_sock_container *)data; + struct o2net_sock_container *sc = from_timer(sc, t, sc_idle_timeout); struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); #ifdef CONFIG_DEBUG_FS unsigned long msecs = ktime_to_ms(ktime_get()) - diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index dc455d45a66a..a1d051055472 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -227,7 +227,7 @@ int ocfs2_should_update_atime(struct inode *inode, return 0; if ((inode->i_flags & S_NOATIME) || - ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))) + ((inode->i_sb->s_flags & SB_NODIRATIME) && S_ISDIR(inode->i_mode))) return 0; /* diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 040bbb6a6e4b..80efa5699fb0 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -675,9 +675,9 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) } /* We're going to/from readonly mode. */ - if ((bool)(*flags & MS_RDONLY) != sb_rdonly(sb)) { + if ((bool)(*flags & SB_RDONLY) != sb_rdonly(sb)) { /* Disable quota accounting before remounting RO */ - if (*flags & MS_RDONLY) { + if (*flags & SB_RDONLY) { ret = ocfs2_susp_quotas(osb, 0); if (ret < 0) goto out; @@ -691,8 +691,8 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) goto unlock_osb; } - if (*flags & MS_RDONLY) { - sb->s_flags |= MS_RDONLY; + if (*flags & SB_RDONLY) { + sb->s_flags |= SB_RDONLY; osb->osb_flags |= OCFS2_OSB_SOFT_RO; } else { if (osb->osb_flags & OCFS2_OSB_ERROR_FS) { @@ -709,14 +709,14 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) ret = -EINVAL; goto unlock_osb; } - sb->s_flags &= ~MS_RDONLY; + sb->s_flags &= ~SB_RDONLY; osb->osb_flags &= ~OCFS2_OSB_SOFT_RO; } trace_ocfs2_remount(sb->s_flags, osb->osb_flags, *flags); unlock_osb: spin_unlock(&osb->osb_lock); /* Enable quota accounting after remounting RW */ - if (!ret && !(*flags & MS_RDONLY)) { + if (!ret && !(*flags & SB_RDONLY)) { if (sb_any_quota_suspended(sb)) ret = ocfs2_susp_quotas(osb, 1); else @@ -724,7 +724,7 @@ unlock_osb: if (ret < 0) { /* Return back changes... */ spin_lock(&osb->osb_lock); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; osb->osb_flags |= OCFS2_OSB_SOFT_RO; spin_unlock(&osb->osb_lock); goto out; @@ -744,9 +744,9 @@ unlock_osb: if (!ocfs2_is_hard_readonly(osb)) ocfs2_set_journal_params(osb); - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? - MS_POSIXACL : 0); + SB_POSIXACL : 0); } out: return ret; @@ -1057,10 +1057,10 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) sb->s_magic = OCFS2_SUPER_MAGIC; - sb->s_flags = (sb->s_flags & ~(MS_POSIXACL | MS_NOSEC)) | - ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); + sb->s_flags = (sb->s_flags & ~(SB_POSIXACL | SB_NOSEC)) | + ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? SB_POSIXACL : 0); - /* Hard readonly mode only if: bdev_read_only, MS_RDONLY, + /* Hard readonly mode only if: bdev_read_only, SB_RDONLY, * heartbeat=none */ if (bdev_read_only(sb->s_bdev)) { if (!sb_rdonly(sb)) { @@ -2057,7 +2057,7 @@ static int ocfs2_initialize_super(struct super_block *sb, sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; sb->s_xattr = ocfs2_xattr_handlers; sb->s_time_gran = 1; - sb->s_flags |= MS_NOATIME; + sb->s_flags |= SB_NOATIME; /* this is needed to support O_LARGEFILE */ cbits = le32_to_cpu(di->id2.i_super.s_clustersize_bits); bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits); @@ -2568,7 +2568,7 @@ static int ocfs2_handle_error(struct super_block *sb) return rv; pr_crit("OCFS2: File system is now read-only.\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; ocfs2_set_ro_flag(osb, 0); } diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 5fdf269ba82e..c5898c59d411 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -901,7 +901,7 @@ static int ocfs2_xattr_list_entry(struct super_block *sb, case OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS: case OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT: - if (!(sb->s_flags & MS_POSIXACL)) + if (!(sb->s_flags & SB_POSIXACL)) return 0; break; diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index 13215f26e321..2200662a9bf1 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -369,7 +369,7 @@ static struct inode *openprom_iget(struct super_block *sb, ino_t ino) static int openprom_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - *flags |= MS_NOATIME; + *flags |= SB_NOATIME; return 0; } @@ -386,7 +386,7 @@ static int openprom_fill_super(struct super_block *s, void *data, int silent) struct op_inode_info *oi; int ret; - s->s_flags |= MS_NOATIME; + s->s_flags |= SB_NOATIME; s->s_blocksize = 1024; s->s_blocksize_bits = 10; s->s_magic = OPENPROM_SUPER_MAGIC; diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c index c2d8233b1e82..480ea059a680 100644 --- a/fs/orangefs/acl.c +++ b/fs/orangefs/acl.c @@ -155,13 +155,11 @@ int orangefs_set_acl(struct inode *inode, struct posix_acl *acl, int type) int orangefs_init_acl(struct inode *inode, struct inode *dir) { - struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); struct posix_acl *default_acl, *acl; umode_t mode = inode->i_mode; + struct iattr iattr; int error = 0; - ClearModeFlag(orangefs_inode); - error = posix_acl_create(dir, &mode, &default_acl, &acl); if (error) return error; @@ -180,9 +178,11 @@ int orangefs_init_acl(struct inode *inode, struct inode *dir) /* If mode of the inode was changed, then do a forcible ->setattr */ if (mode != inode->i_mode) { - SetModeFlag(orangefs_inode); + memset(&iattr, 0, sizeof iattr); inode->i_mode = mode; - orangefs_flush_inode(inode); + iattr.ia_mode = mode; + iattr.ia_valid |= ATTR_MODE; + orangefs_inode_setattr(inode, &iattr); } return error; diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c index a8cc588d6224..e2c2699d8016 100644 --- a/fs/orangefs/dir.c +++ b/fs/orangefs/dir.c @@ -386,7 +386,6 @@ static int orangefs_dir_release(struct inode *inode, struct file *file) { struct orangefs_dir *od = file->private_data; struct orangefs_dir_part *part = od->part; - orangefs_flush_inode(inode); while (part) { struct orangefs_dir_part *next = part->next; vfree(part); diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index e4a8e6a7eb17..1668fd645c45 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -383,9 +383,15 @@ out: if (type == ORANGEFS_IO_READ) { file_accessed(file); } else { - SetMtimeFlag(orangefs_inode); - inode->i_mtime = current_time(inode); - mark_inode_dirty_sync(inode); + file_update_time(file); + /* + * Must invalidate to ensure write loop doesn't + * prevent kernel from reading updated + * attribute. Size probably changed because of + * the write, and other clients could update + * any other attribute. + */ + orangefs_inode->getattr_time = jiffies - 1; } } @@ -615,8 +621,6 @@ static int orangefs_file_release(struct inode *inode, struct file *file) "orangefs_file_release: called on %pD\n", file); - orangefs_flush_inode(inode); - /* * remove all associated inode pages from the page cache and * readahead cache (if any); this forces an expensive refresh of @@ -666,8 +670,6 @@ static int orangefs_fsync(struct file *file, ret); op_release(new_op); - - orangefs_flush_inode(file_inode(file)); return ret; } diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index 28825a5b6d09..fe1d705ad91f 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -290,6 +290,22 @@ int orangefs_permission(struct inode *inode, int mask) return generic_permission(inode, mask); } +int orangefs_update_time(struct inode *inode, struct timespec *time, int flags) +{ + struct iattr iattr; + gossip_debug(GOSSIP_INODE_DEBUG, "orangefs_update_time: %pU\n", + get_khandle_from_ino(inode)); + generic_update_time(inode, time, flags); + memset(&iattr, 0, sizeof iattr); + if (flags & S_ATIME) + iattr.ia_valid |= ATTR_ATIME; + if (flags & S_CTIME) + iattr.ia_valid |= ATTR_CTIME; + if (flags & S_MTIME) + iattr.ia_valid |= ATTR_MTIME; + return orangefs_inode_setattr(inode, &iattr); +} + /* ORANGEDS2 implementation of VFS inode operations for files */ const struct inode_operations orangefs_file_inode_operations = { .get_acl = orangefs_get_acl, @@ -298,6 +314,7 @@ const struct inode_operations orangefs_file_inode_operations = { .getattr = orangefs_getattr, .listxattr = orangefs_listxattr, .permission = orangefs_permission, + .update_time = orangefs_update_time, }; static int orangefs_init_iops(struct inode *inode) diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c index 7e9e5d0ea3bc..c98bba2dbc94 100644 --- a/fs/orangefs/namei.c +++ b/fs/orangefs/namei.c @@ -22,7 +22,9 @@ static int orangefs_create(struct inode *dir, { struct orangefs_inode_s *parent = ORANGEFS_I(dir); struct orangefs_kernel_op_s *new_op; + struct orangefs_object_kref ref; struct inode *inode; + struct iattr iattr; int ret; gossip_debug(GOSSIP_NAME_DEBUG, "%s: %pd\n", @@ -55,8 +57,10 @@ static int orangefs_create(struct inode *dir, if (ret < 0) goto out; - inode = orangefs_new_inode(dir->i_sb, dir, S_IFREG | mode, 0, - &new_op->downcall.resp.create.refn); + ref = new_op->downcall.resp.create.refn; + op_release(new_op); + + inode = orangefs_new_inode(dir->i_sb, dir, S_IFREG | mode, 0, &ref); if (IS_ERR(inode)) { gossip_err("%s: Failed to allocate inode for file :%pd:\n", __func__, @@ -82,12 +86,13 @@ static int orangefs_create(struct inode *dir, __func__, dentry); - SetMtimeFlag(parent); dir->i_mtime = dir->i_ctime = current_time(dir); + memset(&iattr, 0, sizeof iattr); + iattr.ia_valid |= ATTR_MTIME; + orangefs_inode_setattr(dir, &iattr); mark_inode_dirty_sync(dir); ret = 0; out: - op_release(new_op); gossip_debug(GOSSIP_NAME_DEBUG, "%s: %pd: returning %d\n", __func__, @@ -221,6 +226,7 @@ static int orangefs_unlink(struct inode *dir, struct dentry *dentry) struct inode *inode = dentry->d_inode; struct orangefs_inode_s *parent = ORANGEFS_I(dir); struct orangefs_kernel_op_s *new_op; + struct iattr iattr; int ret; gossip_debug(GOSSIP_NAME_DEBUG, @@ -253,8 +259,10 @@ static int orangefs_unlink(struct inode *dir, struct dentry *dentry) if (!ret) { drop_nlink(inode); - SetMtimeFlag(parent); dir->i_mtime = dir->i_ctime = current_time(dir); + memset(&iattr, 0, sizeof iattr); + iattr.ia_valid |= ATTR_MTIME; + orangefs_inode_setattr(dir, &iattr); mark_inode_dirty_sync(dir); } return ret; @@ -266,7 +274,9 @@ static int orangefs_symlink(struct inode *dir, { struct orangefs_inode_s *parent = ORANGEFS_I(dir); struct orangefs_kernel_op_s *new_op; + struct orangefs_object_kref ref; struct inode *inode; + struct iattr iattr; int mode = 755; int ret; @@ -307,8 +317,10 @@ static int orangefs_symlink(struct inode *dir, goto out; } - inode = orangefs_new_inode(dir->i_sb, dir, S_IFLNK | mode, 0, - &new_op->downcall.resp.sym.refn); + ref = new_op->downcall.resp.sym.refn; + op_release(new_op); + + inode = orangefs_new_inode(dir->i_sb, dir, S_IFLNK | mode, 0, &ref); if (IS_ERR(inode)) { gossip_err ("*** Failed to allocate orangefs symlink inode\n"); @@ -331,12 +343,13 @@ static int orangefs_symlink(struct inode *dir, get_khandle_from_ino(inode), dentry); - SetMtimeFlag(parent); dir->i_mtime = dir->i_ctime = current_time(dir); + memset(&iattr, 0, sizeof iattr); + iattr.ia_valid |= ATTR_MTIME; + orangefs_inode_setattr(dir, &iattr); mark_inode_dirty_sync(dir); ret = 0; out: - op_release(new_op); return ret; } @@ -344,7 +357,9 @@ static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode { struct orangefs_inode_s *parent = ORANGEFS_I(dir); struct orangefs_kernel_op_s *new_op; + struct orangefs_object_kref ref; struct inode *inode; + struct iattr iattr; int ret; new_op = op_alloc(ORANGEFS_VFS_OP_MKDIR); @@ -373,8 +388,10 @@ static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode goto out; } - inode = orangefs_new_inode(dir->i_sb, dir, S_IFDIR | mode, 0, - &new_op->downcall.resp.mkdir.refn); + ref = new_op->downcall.resp.mkdir.refn; + op_release(new_op); + + inode = orangefs_new_inode(dir->i_sb, dir, S_IFDIR | mode, 0, &ref); if (IS_ERR(inode)) { gossip_err("*** Failed to allocate orangefs dir inode\n"); ret = PTR_ERR(inode); @@ -400,11 +417,12 @@ static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode * NOTE: we have no good way to keep nlink consistent for directories * across clients; keep constant at 1. */ - SetMtimeFlag(parent); dir->i_mtime = dir->i_ctime = current_time(dir); + memset(&iattr, 0, sizeof iattr); + iattr.ia_valid |= ATTR_MTIME; + orangefs_inode_setattr(dir, &iattr); mark_inode_dirty_sync(dir); out: - op_release(new_op); return ret; } @@ -470,4 +488,5 @@ const struct inode_operations orangefs_dir_inode_operations = { .getattr = orangefs_getattr, .listxattr = orangefs_listxattr, .permission = orangefs_permission, + .update_time = orangefs_update_time, }; diff --git a/fs/orangefs/orangefs-debug.h b/fs/orangefs/orangefs-debug.h index b6001bb28f5a..c7db56a31b92 100644 --- a/fs/orangefs/orangefs-debug.h +++ b/fs/orangefs/orangefs-debug.h @@ -15,8 +15,10 @@ #ifdef __KERNEL__ #include <linux/types.h> +#include <linux/kernel.h> #else #include <stdint.h> +#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) #endif #define GOSSIP_NO_DEBUG (__u64)0 @@ -88,6 +90,6 @@ static struct __keyword_mask_s s_kmod_keyword_mask_map[] = { }; static const int num_kmod_keyword_mask_map = (int) - (sizeof(s_kmod_keyword_mask_map) / sizeof(struct __keyword_mask_s)); + (ARRAY_SIZE(s_kmod_keyword_mask_map)); #endif /* __ORANGEFS_DEBUG_H */ diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 004af348fb80..97adf7d100b5 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -209,37 +209,10 @@ struct orangefs_inode_s { struct inode vfs_inode; sector_t last_failed_block_index_read; - /* - * State of in-memory attributes not yet flushed to disk associated - * with this object - */ - unsigned long pinode_flags; - unsigned long getattr_time; u32 getattr_mask; }; -#define P_ATIME_FLAG 0 -#define P_MTIME_FLAG 1 -#define P_CTIME_FLAG 2 -#define P_MODE_FLAG 3 - -#define ClearAtimeFlag(pinode) clear_bit(P_ATIME_FLAG, &(pinode)->pinode_flags) -#define SetAtimeFlag(pinode) set_bit(P_ATIME_FLAG, &(pinode)->pinode_flags) -#define AtimeFlag(pinode) test_bit(P_ATIME_FLAG, &(pinode)->pinode_flags) - -#define ClearMtimeFlag(pinode) clear_bit(P_MTIME_FLAG, &(pinode)->pinode_flags) -#define SetMtimeFlag(pinode) set_bit(P_MTIME_FLAG, &(pinode)->pinode_flags) -#define MtimeFlag(pinode) test_bit(P_MTIME_FLAG, &(pinode)->pinode_flags) - -#define ClearCtimeFlag(pinode) clear_bit(P_CTIME_FLAG, &(pinode)->pinode_flags) -#define SetCtimeFlag(pinode) set_bit(P_CTIME_FLAG, &(pinode)->pinode_flags) -#define CtimeFlag(pinode) test_bit(P_CTIME_FLAG, &(pinode)->pinode_flags) - -#define ClearModeFlag(pinode) clear_bit(P_MODE_FLAG, &(pinode)->pinode_flags) -#define SetModeFlag(pinode) set_bit(P_MODE_FLAG, &(pinode)->pinode_flags) -#define ModeFlag(pinode) test_bit(P_MODE_FLAG, &(pinode)->pinode_flags) - /* per superblock private orangefs info */ struct orangefs_sb_info_s { struct orangefs_khandle root_khandle; @@ -275,12 +248,6 @@ struct orangefs_kiocb_s { /* orangefs kernel operation type */ struct orangefs_kernel_op_s *op; - /* The user space buffers from/to which I/O is being staged */ - struct iovec *iov; - - /* number of elements in the iovector */ - unsigned long nr_segs; - /* set to indicate the type of the operation */ int rw; @@ -442,6 +409,8 @@ int orangefs_getattr(const struct path *path, struct kstat *stat, int orangefs_permission(struct inode *inode, int mask); +int orangefs_update_time(struct inode *, struct timespec *, int); + /* * defined in xattr.c */ @@ -484,8 +453,6 @@ bool __is_daemon_in_service(void); */ __s32 fsid_of_op(struct orangefs_kernel_op_s *op); -int orangefs_flush_inode(struct inode *inode); - ssize_t orangefs_inode_getxattr(struct inode *inode, const char *name, void *buffer, diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c index f82336496311..97fe93129f38 100644 --- a/fs/orangefs/orangefs-utils.c +++ b/fs/orangefs/orangefs-utils.c @@ -4,6 +4,7 @@ * * See COPYING in top-level directory. */ +#include <linux/kernel.h> #include "protocol.h" #include "orangefs-kernel.h" #include "orangefs-dev-proto.h" @@ -437,89 +438,8 @@ int orangefs_inode_setattr(struct inode *inode, struct iattr *iattr) op_release(new_op); - /* - * successful setattr should clear the atime, mtime and - * ctime flags. - */ - if (ret == 0) { - ClearAtimeFlag(orangefs_inode); - ClearMtimeFlag(orangefs_inode); - ClearCtimeFlag(orangefs_inode); - ClearModeFlag(orangefs_inode); + if (ret == 0) orangefs_inode->getattr_time = jiffies - 1; - } - - return ret; -} - -int orangefs_flush_inode(struct inode *inode) -{ - /* - * If it is a dirty inode, this function gets called. - * Gather all the information that needs to be setattr'ed - * Right now, this will only be used for mode, atime, mtime - * and/or ctime. - */ - struct iattr wbattr; - int ret; - int mtime_flag; - int ctime_flag; - int atime_flag; - int mode_flag; - struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); - - memset(&wbattr, 0, sizeof(wbattr)); - - /* - * check inode flags up front, and clear them if they are set. This - * will prevent multiple processes from all trying to flush the same - * inode if they call close() simultaneously - */ - mtime_flag = MtimeFlag(orangefs_inode); - ClearMtimeFlag(orangefs_inode); - ctime_flag = CtimeFlag(orangefs_inode); - ClearCtimeFlag(orangefs_inode); - atime_flag = AtimeFlag(orangefs_inode); - ClearAtimeFlag(orangefs_inode); - mode_flag = ModeFlag(orangefs_inode); - ClearModeFlag(orangefs_inode); - - /* -- Lazy atime,mtime and ctime update -- - * Note: all times are dictated by server in the new scheme - * and not by the clients - * - * Also mode updates are being handled now.. - */ - - if (mtime_flag) - wbattr.ia_valid |= ATTR_MTIME; - if (ctime_flag) - wbattr.ia_valid |= ATTR_CTIME; - if (atime_flag) - wbattr.ia_valid |= ATTR_ATIME; - - if (mode_flag) { - wbattr.ia_mode = inode->i_mode; - wbattr.ia_valid |= ATTR_MODE; - } - - gossip_debug(GOSSIP_UTILS_DEBUG, - "*********** orangefs_flush_inode: %pU " - "(ia_valid %d)\n", - get_khandle_from_ino(inode), - wbattr.ia_valid); - if (wbattr.ia_valid == 0) { - gossip_debug(GOSSIP_UTILS_DEBUG, - "orangefs_flush_inode skipping setattr()\n"); - return 0; - } - - gossip_debug(GOSSIP_UTILS_DEBUG, - "orangefs_flush_inode (%pU) writing mode %o\n", - get_khandle_from_ino(inode), - inode->i_mode); - - ret = orangefs_inode_setattr(inode, &wbattr); return ret; } @@ -606,7 +526,7 @@ int orangefs_normalize_to_errno(__s32 error_code) /* Convert ORANGEFS encoded errno values into regular errno values. */ } else if ((-error_code) & ORANGEFS_ERROR_BIT) { i = (-error_code) & ~(ORANGEFS_ERROR_BIT|ORANGEFS_ERROR_CLASS_BITS); - if (i < sizeof(PINT_errno_mapping)/sizeof(*PINT_errno_mapping)) + if (i < ARRAY_SIZE(PINT_errno_mapping)) error_code = -PINT_errno_mapping[i]; else error_code = -EINVAL; diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c index 47ebd9bfd1a1..36f1390b5ed7 100644 --- a/fs/orangefs/super.c +++ b/fs/orangefs/super.c @@ -40,7 +40,7 @@ static int orangefs_show_options(struct seq_file *m, struct dentry *root) { struct orangefs_sb_info_s *orangefs_sb = ORANGEFS_SB(root->d_sb); - if (root->d_sb->s_flags & MS_POSIXACL) + if (root->d_sb->s_flags & SB_POSIXACL) seq_puts(m, ",acl"); if (orangefs_sb->flags & ORANGEFS_OPT_INTR) seq_puts(m, ",intr"); @@ -60,7 +60,7 @@ static int parse_mount_options(struct super_block *sb, char *options, * Force any potential flags that might be set from the mount * to zero, ie, initialize to unset. */ - sb->s_flags &= ~MS_POSIXACL; + sb->s_flags &= ~SB_POSIXACL; orangefs_sb->flags &= ~ORANGEFS_OPT_INTR; orangefs_sb->flags &= ~ORANGEFS_OPT_LOCAL_LOCK; @@ -73,7 +73,7 @@ static int parse_mount_options(struct super_block *sb, char *options, token = match_token(p, tokens, args); switch (token) { case Opt_acl: - sb->s_flags |= MS_POSIXACL; + sb->s_flags |= SB_POSIXACL; break; case Opt_intr: orangefs_sb->flags |= ORANGEFS_OPT_INTR; @@ -99,8 +99,6 @@ static void orangefs_inode_cache_ctor(void *req) inode_init_once(&orangefs_inode->vfs_inode); init_rwsem(&orangefs_inode->xattr_sem); - - orangefs_inode->vfs_inode.i_version = 1; } static struct inode *orangefs_alloc_inode(struct super_block *sb) @@ -119,7 +117,6 @@ static struct inode *orangefs_alloc_inode(struct super_block *sb) orangefs_inode->refn.fs_id = ORANGEFS_FS_ID_NULL; orangefs_inode->last_failed_block_index_read = 0; memset(orangefs_inode->link_target, 0, sizeof(orangefs_inode->link_target)); - orangefs_inode->pinode_flags = 0; gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_alloc_inode: allocated %p\n", @@ -299,21 +296,9 @@ void fsid_key_table_finalize(void) { } -/* Called whenever the VFS dirties the inode in response to atime updates */ -static void orangefs_dirty_inode(struct inode *inode, int flags) -{ - struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); - - gossip_debug(GOSSIP_SUPER_DEBUG, - "orangefs_dirty_inode: %pU\n", - get_khandle_from_ino(inode)); - SetAtimeFlag(orangefs_inode); -} - static const struct super_operations orangefs_s_ops = { .alloc_inode = orangefs_alloc_inode, .destroy_inode = orangefs_destroy_inode, - .dirty_inode = orangefs_dirty_inode, .drop_inode = generic_delete_inode, .statfs = orangefs_statfs, .remount_fs = orangefs_remount_fs, @@ -522,7 +507,7 @@ struct dentry *orangefs_mount(struct file_system_type *fst, ret = orangefs_fill_sb(sb, &new_op->downcall.resp.fs_mount, data, - flags & MS_SILENT ? 1 : 0); + flags & SB_SILENT ? 1 : 0); if (ret) { d = ERR_PTR(ret); diff --git a/fs/orangefs/symlink.c b/fs/orangefs/symlink.c index d856cdf91763..db107fe91ab3 100644 --- a/fs/orangefs/symlink.c +++ b/fs/orangefs/symlink.c @@ -15,4 +15,5 @@ const struct inode_operations orangefs_symlink_inode_operations = { .getattr = orangefs_getattr, .listxattr = orangefs_listxattr, .permission = orangefs_permission, + .update_time = orangefs_update_time, }; diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index c441f9387a1b..eb3b8d39fb61 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -22,7 +22,6 @@ #include <linux/ratelimit.h> #include <linux/exportfs.h> #include "overlayfs.h" -#include "ovl_entry.h" #define OVL_COPY_UP_CHUNK_SIZE (1 << 20) @@ -486,6 +485,7 @@ static int ovl_copy_up_inode(struct ovl_copy_up_ctx *c, struct dentry *temp) static int ovl_copy_up_locked(struct ovl_copy_up_ctx *c) { struct inode *udir = c->destdir->d_inode; + struct inode *inode; struct dentry *newdentry = NULL; struct dentry *temp = NULL; int err; @@ -508,7 +508,11 @@ static int ovl_copy_up_locked(struct ovl_copy_up_ctx *c) if (err) goto out_cleanup; - ovl_inode_update(d_inode(c->dentry), newdentry); + inode = d_inode(c->dentry); + ovl_inode_update(inode, newdentry); + if (S_ISDIR(inode->i_mode)) + ovl_set_flag(OVL_WHITEOUTS, inode); + out: dput(temp); return err; diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index cc961a3bd3bd..e13921824c70 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -181,6 +181,11 @@ static bool ovl_type_origin(struct dentry *dentry) return OVL_TYPE_ORIGIN(ovl_path_type(dentry)); } +static bool ovl_may_have_whiteouts(struct dentry *dentry) +{ + return ovl_test_flag(OVL_WHITEOUTS, d_inode(dentry)); +} + static int ovl_create_upper(struct dentry *dentry, struct inode *inode, struct cattr *attr, struct dentry *hardlink) { @@ -300,7 +305,6 @@ static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry) { int err; struct dentry *ret = NULL; - enum ovl_path_type type = ovl_path_type(dentry); LIST_HEAD(list); err = ovl_check_empty_dir(dentry, &list); @@ -313,13 +317,13 @@ static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry) * When removing an empty opaque directory, then it makes no sense to * replace it with an exact replica of itself. * - * If no upperdentry then skip clearing whiteouts. + * If upperdentry has whiteouts, clear them. * * Can race with copy-up, since we don't hold the upperdir mutex. * Doesn't matter, since copy-up can't create a non-empty directory * from an empty one. */ - if (OVL_TYPE_UPPER(type) && OVL_TYPE_MERGE(type)) + if (!list_empty(&list)) ret = ovl_clear_empty(dentry, &list); out_free: @@ -698,8 +702,9 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir) struct dentry *opaquedir = NULL; int err; - /* Redirect dir can be !ovl_lower_positive && OVL_TYPE_MERGE */ - if (is_dir && ovl_dentry_get_redirect(dentry)) { + /* Redirect/origin dir can be !ovl_lower_positive && not clean */ + if (is_dir && (ovl_dentry_get_redirect(dentry) || + ovl_may_have_whiteouts(dentry))) { opaquedir = ovl_check_empty_and_clear(dentry); err = PTR_ERR(opaquedir); if (IS_ERR(opaquedir)) @@ -946,7 +951,8 @@ static int ovl_rename(struct inode *olddir, struct dentry *old, old_cred = ovl_override_creds(old->d_sb); - if (overwrite && new_is_dir && ovl_type_merge_or_lower(new)) { + if (overwrite && new_is_dir && (ovl_type_merge_or_lower(new) || + ovl_may_have_whiteouts(new))) { opaquedir = ovl_check_empty_and_clear(new); err = PTR_ERR(opaquedir); if (IS_ERR(opaquedir)) { @@ -1069,9 +1075,10 @@ static int ovl_rename(struct inode *olddir, struct dentry *old, drop_nlink(d_inode(new)); } - ovl_dentry_version_inc(old->d_parent, - !overwrite && ovl_type_origin(new)); - ovl_dentry_version_inc(new->d_parent, ovl_type_origin(old)); + ovl_dentry_version_inc(old->d_parent, ovl_type_origin(old) || + (!overwrite && ovl_type_origin(new))); + ovl_dentry_version_inc(new->d_parent, ovl_type_origin(old) || + (d_inode(new) && ovl_type_origin(new))); out_dput: dput(newdentry); diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 321511ed8c42..00b6b294272a 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -15,6 +15,14 @@ #include <linux/ratelimit.h> #include "overlayfs.h" + +static dev_t ovl_get_pseudo_dev(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + return oe->lowerstack[0].layer->pseudo_dev; +} + int ovl_setattr(struct dentry *dentry, struct iattr *attr) { int err; @@ -66,6 +74,7 @@ int ovl_getattr(const struct path *path, struct kstat *stat, struct path realpath; const struct cred *old_cred; bool is_dir = S_ISDIR(dentry->d_inode->i_mode); + bool samefs = ovl_same_sb(dentry->d_sb); int err; type = ovl_path_real(dentry, &realpath); @@ -75,16 +84,13 @@ int ovl_getattr(const struct path *path, struct kstat *stat, goto out; /* - * When all layers are on the same fs, all real inode number are - * unique, so we use the overlay st_dev, which is friendly to du -x. - * - * We also use st_ino of the copy up origin, if we know it. - * This guaranties constant st_dev/st_ino across copy up. + * For non-dir or same fs, we use st_ino of the copy up origin, if we + * know it. This guaranties constant st_dev/st_ino across copy up. * * If filesystem supports NFS export ops, this also guaranties * persistent st_ino across mount cycle. */ - if (ovl_same_sb(dentry->d_sb)) { + if (!is_dir || samefs) { if (OVL_TYPE_ORIGIN(type)) { struct kstat lowerstat; u32 lowermask = STATX_INO | (!is_dir ? STATX_NLINK : 0); @@ -95,7 +101,6 @@ int ovl_getattr(const struct path *path, struct kstat *stat, if (err) goto out; - WARN_ON_ONCE(stat->dev != lowerstat.dev); /* * Lower hardlinks may be broken on copy up to different * upper files, so we cannot use the lower origin st_ino @@ -107,17 +112,36 @@ int ovl_getattr(const struct path *path, struct kstat *stat, if (is_dir || lowerstat.nlink == 1 || ovl_test_flag(OVL_INDEX, d_inode(dentry))) stat->ino = lowerstat.ino; + + if (samefs) + WARN_ON_ONCE(stat->dev != lowerstat.dev); + else + stat->dev = ovl_get_pseudo_dev(dentry); } - stat->dev = dentry->d_sb->s_dev; - } else if (is_dir) { + if (samefs) { + /* + * When all layers are on the same fs, all real inode + * number are unique, so we use the overlay st_dev, + * which is friendly to du -x. + */ + stat->dev = dentry->d_sb->s_dev; + } else if (!OVL_TYPE_UPPER(type)) { + /* + * For non-samefs setup, to make sure that st_dev/st_ino + * pair is unique across the system, we use a unique + * anonymous st_dev for lower layer inode. + */ + stat->dev = ovl_get_pseudo_dev(dentry); + } + } else { /* - * If not all layers are on the same fs the pair {real st_ino; - * overlay st_dev} is not unique, so use the non persistent - * overlay st_ino. - * * Always use the overlay st_dev for directories, so 'find * -xdev' will scan the entire overlay mount and won't cross the * overlay mount boundaries. + * + * If not all layers are on the same fs the pair {real st_ino; + * overlay st_dev} is not unique, so use the non persistent + * overlay st_ino for directories. */ stat->dev = dentry->d_sb->s_dev; stat->ino = dentry->d_inode->i_ino; @@ -409,6 +433,7 @@ static inline void ovl_lockdep_annotate_inode_mutex_key(struct inode *inode) #ifdef CONFIG_LOCKDEP static struct lock_class_key ovl_i_mutex_key[OVL_MAX_NESTING]; static struct lock_class_key ovl_i_mutex_dir_key[OVL_MAX_NESTING]; + static struct lock_class_key ovl_i_lock_key[OVL_MAX_NESTING]; int depth = inode->i_sb->s_stack_depth - 1; @@ -419,6 +444,8 @@ static inline void ovl_lockdep_annotate_inode_mutex_key(struct inode *inode) lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_dir_key[depth]); else lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_key[depth]); + + lockdep_set_class(&OVL_I(inode)->lock, &ovl_i_lock_key[depth]); #endif } @@ -657,6 +684,16 @@ struct inode *ovl_get_inode(struct dentry *dentry, struct dentry *upperdentry, if (upperdentry && ovl_is_impuredir(upperdentry)) ovl_set_flag(OVL_IMPURE, inode); + /* Check for non-merge dir that may have whiteouts */ + if (S_ISDIR(realinode->i_mode)) { + struct ovl_entry *oe = dentry->d_fsdata; + + if (((upperdentry && lowerdentry) || oe->numlower > 1) || + ovl_check_origin_xattr(upperdentry ?: lowerdentry)) { + ovl_set_flag(OVL_WHITEOUTS, inode); + } + } + if (inode->i_state & I_NEW) unlock_new_inode(inode); out: diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index a12dc10bf726..625ed8066570 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -15,7 +15,6 @@ #include <linux/mount.h> #include <linux/exportfs.h> #include "overlayfs.h" -#include "ovl_entry.h" struct ovl_lookup_data { struct qstr name; @@ -286,16 +285,15 @@ static int ovl_lookup_layer(struct dentry *base, struct ovl_lookup_data *d, static int ovl_check_origin(struct dentry *upperdentry, - struct path *lowerstack, unsigned int numlower, - struct path **stackp, unsigned int *ctrp) + struct ovl_path *lower, unsigned int numlower, + struct ovl_path **stackp, unsigned int *ctrp) { struct vfsmount *mnt; struct dentry *origin = NULL; int i; - for (i = 0; i < numlower; i++) { - mnt = lowerstack[i].mnt; + mnt = lower[i].layer->mnt; origin = ovl_get_origin(upperdentry, mnt); if (IS_ERR(origin)) return PTR_ERR(origin); @@ -309,12 +307,12 @@ static int ovl_check_origin(struct dentry *upperdentry, BUG_ON(*ctrp); if (!*stackp) - *stackp = kmalloc(sizeof(struct path), GFP_KERNEL); + *stackp = kmalloc(sizeof(struct ovl_path), GFP_KERNEL); if (!*stackp) { dput(origin); return -ENOMEM; } - **stackp = (struct path) { .dentry = origin, .mnt = mnt }; + **stackp = (struct ovl_path){.dentry = origin, .layer = lower[i].layer}; *ctrp = 1; return 0; @@ -350,8 +348,8 @@ static int ovl_verify_origin_fh(struct dentry *dentry, const struct ovl_fh *fh) * * Return 0 on match, -ESTALE on mismatch, < 0 on error. */ -int ovl_verify_origin(struct dentry *dentry, struct vfsmount *mnt, - struct dentry *origin, bool is_upper, bool set) +int ovl_verify_origin(struct dentry *dentry, struct dentry *origin, + bool is_upper, bool set) { struct inode *inode; struct ovl_fh *fh; @@ -384,13 +382,13 @@ fail: * OVL_XATTR_ORIGIN and that origin file handle can be decoded to lower path. * Return 0 on match, -ESTALE on mismatch or stale origin, < 0 on error. */ -int ovl_verify_index(struct dentry *index, struct path *lowerstack, +int ovl_verify_index(struct dentry *index, struct ovl_path *lower, unsigned int numlower) { struct ovl_fh *fh = NULL; size_t len; - struct path origin = { }; - struct path *stack = &origin; + struct ovl_path origin = { }; + struct ovl_path *stack = &origin; unsigned int ctr = 0; int err; @@ -429,7 +427,7 @@ int ovl_verify_index(struct dentry *index, struct path *lowerstack, if (err) goto fail; - err = ovl_check_origin(index, lowerstack, numlower, &stack, &ctr); + err = ovl_check_origin(index, lower, numlower, &stack, &ctr); if (!err && !ctr) err = -ESTALE; if (err) @@ -568,11 +566,24 @@ int ovl_path_next(int idx, struct dentry *dentry, struct path *path) idx++; } BUG_ON(idx > oe->numlower); - *path = oe->lowerstack[idx - 1]; + path->dentry = oe->lowerstack[idx - 1].dentry; + path->mnt = oe->lowerstack[idx - 1].layer->mnt; return (idx < oe->numlower) ? idx + 1 : -1; } +static int ovl_find_layer(struct ovl_fs *ofs, struct ovl_path *path) +{ + int i; + + for (i = 0; i < ofs->numlower; i++) { + if (ofs->lower_layers[i].mnt == path->layer->mnt) + break; + } + + return i; +} + struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { @@ -581,7 +592,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, struct ovl_fs *ofs = dentry->d_sb->s_fs_info; struct ovl_entry *poe = dentry->d_parent->d_fsdata; struct ovl_entry *roe = dentry->d_sb->s_root->d_fsdata; - struct path *stack = NULL; + struct ovl_path *stack = NULL; struct dentry *upperdir, *upperdentry = NULL; struct dentry *index = NULL; unsigned int ctr = 0; @@ -630,7 +641,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, err = ovl_check_origin(upperdentry, roe->lowerstack, roe->numlower, &stack, &ctr); if (err) - goto out; + goto out_put_upper; } if (d.redirect) { @@ -646,17 +657,17 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, if (!d.stop && poe->numlower) { err = -ENOMEM; - stack = kcalloc(ofs->numlower, sizeof(struct path), + stack = kcalloc(ofs->numlower, sizeof(struct ovl_path), GFP_KERNEL); if (!stack) goto out_put_upper; } for (i = 0; !d.stop && i < poe->numlower; i++) { - struct path lowerpath = poe->lowerstack[i]; + struct ovl_path lower = poe->lowerstack[i]; d.last = i == poe->numlower - 1; - err = ovl_lookup_layer(lowerpath.dentry, &d, &this); + err = ovl_lookup_layer(lower.dentry, &d, &this); if (err) goto out_put; @@ -664,7 +675,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, continue; stack[ctr].dentry = this; - stack[ctr].mnt = lowerpath.mnt; + stack[ctr].layer = lower.layer; ctr++; if (d.stop) @@ -674,10 +685,8 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, poe = roe; /* Find the current layer on the root dentry */ - for (i = 0; i < poe->numlower; i++) - if (poe->lowerstack[i].mnt == lowerpath.mnt) - break; - if (WARN_ON(i == poe->numlower)) + i = ovl_find_layer(ofs, &lower); + if (WARN_ON(i == ofs->numlower)) break; } } @@ -700,7 +709,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, goto out_put; oe->opaque = upperopaque; - memcpy(oe->lowerstack, stack, sizeof(struct path) * ctr); + memcpy(oe->lowerstack, stack, sizeof(struct ovl_path) * ctr); dentry->d_fsdata = oe; if (upperdentry) diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index d9a0edd4e57e..13eab09a6b6f 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -9,6 +9,7 @@ #include <linux/kernel.h> #include <linux/uuid.h> +#include "ovl_entry.h" enum ovl_path_type { __OVL_PATH_UPPER = (1 << 0), @@ -28,7 +29,10 @@ enum ovl_path_type { #define OVL_XATTR_NLINK OVL_XATTR_PREFIX "nlink" enum ovl_flag { + /* Pure upper dir that may contain non pure upper entries */ OVL_IMPURE, + /* Non-merge dir that may contain whiteout entries */ + OVL_WHITEOUTS, OVL_INDEX, }; @@ -223,6 +227,7 @@ bool ovl_is_whiteout(struct dentry *dentry); struct file *ovl_path_open(struct path *path, int flags); int ovl_copy_up_start(struct dentry *dentry); void ovl_copy_up_end(struct dentry *dentry); +bool ovl_check_origin_xattr(struct dentry *dentry); bool ovl_check_dir_xattr(struct dentry *dentry, const char *name); int ovl_check_setxattr(struct dentry *dentry, struct dentry *upperdentry, const char *name, const void *value, size_t size, @@ -244,9 +249,9 @@ static inline bool ovl_is_impuredir(struct dentry *dentry) /* namei.c */ -int ovl_verify_origin(struct dentry *dentry, struct vfsmount *mnt, - struct dentry *origin, bool is_upper, bool set); -int ovl_verify_index(struct dentry *index, struct path *lowerstack, +int ovl_verify_origin(struct dentry *dentry, struct dentry *origin, + bool is_upper, bool set); +int ovl_verify_index(struct dentry *index, struct ovl_path *lower, unsigned int numlower); int ovl_get_index_name(struct dentry *origin, struct qstr *name); int ovl_path_next(int idx, struct dentry *dentry, struct path *path); @@ -263,7 +268,7 @@ int ovl_check_d_type_supported(struct path *realpath); void ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt, struct dentry *dentry, int level); int ovl_indexdir_cleanup(struct dentry *dentry, struct vfsmount *mnt, - struct path *lowerstack, unsigned int numlower); + struct ovl_path *lower, unsigned int numlower); /* inode.c */ int ovl_set_nlink_upper(struct dentry *dentry); diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index 36b49bd09264..752bab645879 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -17,11 +17,21 @@ struct ovl_config { bool index; }; +struct ovl_layer { + struct vfsmount *mnt; + dev_t pseudo_dev; +}; + +struct ovl_path { + struct ovl_layer *layer; + struct dentry *dentry; +}; + /* private information held for overlayfs's superblock */ struct ovl_fs { struct vfsmount *upper_mnt; unsigned numlower; - struct vfsmount **lower_mnt; + struct ovl_layer *lower_layers; /* workbasedir is the path at workdir= mount option */ struct dentry *workbasedir; /* workdir is the 'work' directory under workbasedir */ @@ -52,7 +62,7 @@ struct ovl_entry { struct rcu_head rcu; }; unsigned numlower; - struct path lowerstack[]; + struct ovl_path lowerstack[]; }; struct ovl_entry *ovl_alloc_entry(unsigned int numlower); diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index c310e3ff7f3f..0daa4354fec4 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -26,6 +26,7 @@ struct ovl_cache_entry { struct list_head l_node; struct rb_node node; struct ovl_cache_entry *next_maybe_whiteout; + bool is_upper; bool is_whiteout; char name[]; }; @@ -158,6 +159,7 @@ static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd, /* Defer setting d_ino for upper entry to ovl_iterate() */ if (ovl_calc_d_ino(rdd, p)) p->ino = 0; + p->is_upper = rdd->is_upper; p->is_whiteout = false; if (d_type == DT_CHR) { @@ -316,21 +318,37 @@ static inline int ovl_dir_read(struct path *realpath, return err; } +/* + * Can we iterate real dir directly? + * + * Non-merge dir may contain whiteouts from a time it was a merge upper, before + * lower dir was removed under it and possibly before it was rotated from upper + * to lower layer. + */ +static bool ovl_dir_is_real(struct dentry *dir) +{ + return !ovl_test_flag(OVL_WHITEOUTS, d_inode(dir)); +} + static void ovl_dir_reset(struct file *file) { struct ovl_dir_file *od = file->private_data; struct ovl_dir_cache *cache = od->cache; struct dentry *dentry = file->f_path.dentry; - enum ovl_path_type type = ovl_path_type(dentry); + bool is_real; if (cache && ovl_dentry_version_get(dentry) != cache->version) { ovl_cache_put(od, dentry); od->cache = NULL; od->cursor = NULL; } - WARN_ON(!od->is_real && !OVL_TYPE_MERGE(type)); - if (od->is_real && OVL_TYPE_MERGE(type)) + is_real = ovl_dir_is_real(dentry); + if (od->is_real != is_real) { + /* is_real can only become false when dir is copied up */ + if (WARN_ON(is_real)) + return; od->is_real = false; + } } static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list, @@ -816,7 +834,7 @@ static int ovl_dir_open(struct inode *inode, struct file *file) return PTR_ERR(realfile); } od->realfile = realfile; - od->is_real = !OVL_TYPE_MERGE(type); + od->is_real = ovl_dir_is_real(file->f_path.dentry); od->is_upper = OVL_TYPE_UPPER(type); file->private_data = od; @@ -835,7 +853,7 @@ const struct file_operations ovl_dir_operations = { int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list) { int err; - struct ovl_cache_entry *p; + struct ovl_cache_entry *p, *n; struct rb_root root = RB_ROOT; err = ovl_dir_read_merged(dentry, list, &root); @@ -844,18 +862,29 @@ int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list) err = 0; - list_for_each_entry(p, list, l_node) { - if (p->is_whiteout) - continue; + list_for_each_entry_safe(p, n, list, l_node) { + /* + * Select whiteouts in upperdir, they should + * be cleared when deleting this directory. + */ + if (p->is_whiteout) { + if (p->is_upper) + continue; + goto del_entry; + } if (p->name[0] == '.') { if (p->len == 1) - continue; + goto del_entry; if (p->len == 2 && p->name[1] == '.') - continue; + goto del_entry; } err = -ENOTEMPTY; break; + +del_entry: + list_del(&p->l_node); + kfree(p); } return err; @@ -869,7 +898,7 @@ void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list) list_for_each_entry(p, list, l_node) { struct dentry *dentry; - if (!p->is_whiteout) + if (WARN_ON(!p->is_whiteout || !p->is_upper)) continue; dentry = lookup_one_len(p->name, upper, p->len); @@ -985,7 +1014,7 @@ void ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt, } int ovl_indexdir_cleanup(struct dentry *dentry, struct vfsmount *mnt, - struct path *lowerstack, unsigned int numlower) + struct ovl_path *lower, unsigned int numlower) { int err; struct dentry *index = NULL; @@ -1020,7 +1049,7 @@ int ovl_indexdir_cleanup(struct dentry *dentry, struct vfsmount *mnt, index = NULL; break; } - err = ovl_verify_index(index, lowerstack, numlower); + err = ovl_verify_index(index, lower, numlower); /* Cleanup stale and orphan index entries */ if (err && (err == -ESTALE || err == -ENOENT)) err = ovl_cleanup(dir, index); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index f5738e96a052..288d20f9a55a 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -18,7 +18,6 @@ #include <linux/seq_file.h> #include <linux/posix_acl_xattr.h> #include "overlayfs.h" -#include "ovl_entry.h" MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); MODULE_DESCRIPTION("Overlay filesystem"); @@ -39,15 +38,20 @@ module_param_named(index, ovl_index_def, bool, 0644); MODULE_PARM_DESC(ovl_index_def, "Default to on or off for the inodes index feature"); +static void ovl_entry_stack_free(struct ovl_entry *oe) +{ + unsigned int i; + + for (i = 0; i < oe->numlower; i++) + dput(oe->lowerstack[i].dentry); +} + static void ovl_dentry_release(struct dentry *dentry) { struct ovl_entry *oe = dentry->d_fsdata; if (oe) { - unsigned int i; - - for (i = 0; i < oe->numlower; i++) - dput(oe->lowerstack[i].dentry); + ovl_entry_stack_free(oe); kfree_rcu(oe, rcu); } } @@ -207,39 +211,48 @@ static void ovl_destroy_inode(struct inode *inode) call_rcu(&inode->i_rcu, ovl_i_callback); } -static void ovl_put_super(struct super_block *sb) +static void ovl_free_fs(struct ovl_fs *ofs) { - struct ovl_fs *ufs = sb->s_fs_info; unsigned i; - dput(ufs->indexdir); - dput(ufs->workdir); - if (ufs->workdir_locked) - ovl_inuse_unlock(ufs->workbasedir); - dput(ufs->workbasedir); - if (ufs->upper_mnt && ufs->upperdir_locked) - ovl_inuse_unlock(ufs->upper_mnt->mnt_root); - mntput(ufs->upper_mnt); - for (i = 0; i < ufs->numlower; i++) - mntput(ufs->lower_mnt[i]); - kfree(ufs->lower_mnt); - - kfree(ufs->config.lowerdir); - kfree(ufs->config.upperdir); - kfree(ufs->config.workdir); - put_cred(ufs->creator_cred); - kfree(ufs); + dput(ofs->indexdir); + dput(ofs->workdir); + if (ofs->workdir_locked) + ovl_inuse_unlock(ofs->workbasedir); + dput(ofs->workbasedir); + if (ofs->upperdir_locked) + ovl_inuse_unlock(ofs->upper_mnt->mnt_root); + mntput(ofs->upper_mnt); + for (i = 0; i < ofs->numlower; i++) { + mntput(ofs->lower_layers[i].mnt); + free_anon_bdev(ofs->lower_layers[i].pseudo_dev); + } + kfree(ofs->lower_layers); + + kfree(ofs->config.lowerdir); + kfree(ofs->config.upperdir); + kfree(ofs->config.workdir); + if (ofs->creator_cred) + put_cred(ofs->creator_cred); + kfree(ofs); +} + +static void ovl_put_super(struct super_block *sb) +{ + struct ovl_fs *ofs = sb->s_fs_info; + + ovl_free_fs(ofs); } static int ovl_sync_fs(struct super_block *sb, int wait) { - struct ovl_fs *ufs = sb->s_fs_info; + struct ovl_fs *ofs = sb->s_fs_info; struct super_block *upper_sb; int ret; - if (!ufs->upper_mnt) + if (!ofs->upper_mnt) return 0; - upper_sb = ufs->upper_mnt->mnt_sb; + upper_sb = ofs->upper_mnt->mnt_sb; if (!upper_sb->s_op->sync_fs) return 0; @@ -277,9 +290,9 @@ static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf) } /* Will this overlay be forced to mount/remount ro? */ -static bool ovl_force_readonly(struct ovl_fs *ufs) +static bool ovl_force_readonly(struct ovl_fs *ofs) { - return (!ufs->upper_mnt || !ufs->workdir); + return (!ofs->upper_mnt || !ofs->workdir); } /** @@ -291,29 +304,29 @@ static bool ovl_force_readonly(struct ovl_fs *ufs) static int ovl_show_options(struct seq_file *m, struct dentry *dentry) { struct super_block *sb = dentry->d_sb; - struct ovl_fs *ufs = sb->s_fs_info; + struct ovl_fs *ofs = sb->s_fs_info; - seq_show_option(m, "lowerdir", ufs->config.lowerdir); - if (ufs->config.upperdir) { - seq_show_option(m, "upperdir", ufs->config.upperdir); - seq_show_option(m, "workdir", ufs->config.workdir); + seq_show_option(m, "lowerdir", ofs->config.lowerdir); + if (ofs->config.upperdir) { + seq_show_option(m, "upperdir", ofs->config.upperdir); + seq_show_option(m, "workdir", ofs->config.workdir); } - if (ufs->config.default_permissions) + if (ofs->config.default_permissions) seq_puts(m, ",default_permissions"); - if (ufs->config.redirect_dir != ovl_redirect_dir_def) + if (ofs->config.redirect_dir != ovl_redirect_dir_def) seq_printf(m, ",redirect_dir=%s", - ufs->config.redirect_dir ? "on" : "off"); - if (ufs->config.index != ovl_index_def) + ofs->config.redirect_dir ? "on" : "off"); + if (ofs->config.index != ovl_index_def) seq_printf(m, ",index=%s", - ufs->config.index ? "on" : "off"); + ofs->config.index ? "on" : "off"); return 0; } static int ovl_remount(struct super_block *sb, int *flags, char *data) { - struct ovl_fs *ufs = sb->s_fs_info; + struct ovl_fs *ofs = sb->s_fs_info; - if (!(*flags & MS_RDONLY) && ovl_force_readonly(ufs)) + if (!(*flags & SB_RDONLY) && ovl_force_readonly(ofs)) return -EROFS; return 0; @@ -451,13 +464,11 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) #define OVL_WORKDIR_NAME "work" #define OVL_INDEXDIR_NAME "index" -static struct dentry *ovl_workdir_create(struct super_block *sb, - struct ovl_fs *ufs, - struct dentry *dentry, +static struct dentry *ovl_workdir_create(struct ovl_fs *ofs, const char *name, bool persist) { - struct inode *dir = dentry->d_inode; - struct vfsmount *mnt = ufs->upper_mnt; + struct inode *dir = ofs->workbasedir->d_inode; + struct vfsmount *mnt = ofs->upper_mnt; struct dentry *work; int err; bool retried = false; @@ -471,7 +482,7 @@ static struct dentry *ovl_workdir_create(struct super_block *sb, locked = true; retry: - work = lookup_one_len(name, dentry, strlen(name)); + work = lookup_one_len(name, ofs->workbasedir, strlen(name)); if (!IS_ERR(work)) { struct iattr attr = { @@ -541,8 +552,7 @@ out_dput: dput(work); out_err: pr_warn("overlayfs: failed to create directory %s/%s (errno: %i); mounting read-only\n", - ufs->config.workdir, name, -err); - sb->s_flags |= MS_RDONLY; + ofs->config.workdir, name, -err); work = NULL; goto out_unlock; } @@ -585,7 +595,7 @@ static int ovl_mount_dir_noesc(const char *name, struct path *path) return 0; out_put: - path_put(path); + path_put_init(path); out: return err; } @@ -603,7 +613,7 @@ static int ovl_mount_dir(const char *name, struct path *path) if (ovl_dentry_remote(path->dentry)) { pr_err("overlayfs: filesystem on '%s' not supported as upperdir\n", tmp); - path_put(path); + path_put_init(path); err = -EINVAL; } kfree(tmp); @@ -655,7 +665,7 @@ static int ovl_lower_dir(const char *name, struct path *path, return 0; out_put: - path_put(path); + path_put_init(path); out: return err; } @@ -826,129 +836,269 @@ static const struct xattr_handler *ovl_xattr_handlers[] = { NULL }; -static int ovl_fill_super(struct super_block *sb, void *data, int silent) +static int ovl_get_upper(struct ovl_fs *ofs, struct path *upperpath) { - struct path upperpath = { }; - struct path workpath = { }; - struct dentry *root_dentry; - struct ovl_entry *oe; - struct ovl_fs *ufs; - struct path *stack = NULL; - char *lowertmp; - char *lower; - unsigned int numlower; - unsigned int stacklen = 0; - unsigned int i; - bool remote = false; - struct cred *cred; + struct vfsmount *upper_mnt; int err; - err = -ENOMEM; - ufs = kzalloc(sizeof(struct ovl_fs), GFP_KERNEL); - if (!ufs) + err = ovl_mount_dir(ofs->config.upperdir, upperpath); + if (err) + goto out; + + /* Upper fs should not be r/o */ + if (sb_rdonly(upperpath->mnt->mnt_sb)) { + pr_err("overlayfs: upper fs is r/o, try multi-lower layers mount\n"); + err = -EINVAL; + goto out; + } + + err = ovl_check_namelen(upperpath, ofs, ofs->config.upperdir); + if (err) goto out; - ufs->config.redirect_dir = ovl_redirect_dir_def; - ufs->config.index = ovl_index_def; - err = ovl_parse_opt((char *) data, &ufs->config); + err = -EBUSY; + if (ovl_inuse_trylock(upperpath->dentry)) { + ofs->upperdir_locked = true; + } else if (ofs->config.index) { + pr_err("overlayfs: upperdir is in-use by another mount, mount with '-o index=off' to override exclusive upperdir protection.\n"); + goto out; + } else { + pr_warn("overlayfs: upperdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n"); + } + + upper_mnt = clone_private_mount(upperpath); + err = PTR_ERR(upper_mnt); + if (IS_ERR(upper_mnt)) { + pr_err("overlayfs: failed to clone upperpath\n"); + goto out; + } + + /* Don't inherit atime flags */ + upper_mnt->mnt_flags &= ~(MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME); + ofs->upper_mnt = upper_mnt; + err = 0; +out: + return err; +} + +static int ovl_make_workdir(struct ovl_fs *ofs, struct path *workpath) +{ + struct dentry *temp; + int err; + + ofs->workdir = ovl_workdir_create(ofs, OVL_WORKDIR_NAME, false); + if (!ofs->workdir) + return 0; + + /* + * Upper should support d_type, else whiteouts are visible. Given + * workdir and upper are on same fs, we can do iterate_dir() on + * workdir. This check requires successful creation of workdir in + * previous step. + */ + err = ovl_check_d_type_supported(workpath); + if (err < 0) + return err; + + /* + * We allowed this configuration and don't want to break users over + * kernel upgrade. So warn instead of erroring out. + */ + if (!err) + pr_warn("overlayfs: upper fs needs to support d_type.\n"); + + /* Check if upper/work fs supports O_TMPFILE */ + temp = ovl_do_tmpfile(ofs->workdir, S_IFREG | 0); + ofs->tmpfile = !IS_ERR(temp); + if (ofs->tmpfile) + dput(temp); + else + pr_warn("overlayfs: upper fs does not support tmpfile.\n"); + + /* + * Check if upper/work fs supports trusted.overlay.* xattr + */ + err = ovl_do_setxattr(ofs->workdir, OVL_XATTR_OPAQUE, "0", 1, 0); + if (err) { + ofs->noxattr = true; + pr_warn("overlayfs: upper fs does not support xattr.\n"); + } else { + vfs_removexattr(ofs->workdir, OVL_XATTR_OPAQUE); + } + + /* Check if upper/work fs supports file handles */ + if (ofs->config.index && + !ovl_can_decode_fh(ofs->workdir->d_sb)) { + ofs->config.index = false; + pr_warn("overlayfs: upper fs does not support file handles, falling back to index=off.\n"); + } + + return 0; +} + +static int ovl_get_workdir(struct ovl_fs *ofs, struct path *upperpath) +{ + int err; + struct path workpath = { }; + + err = ovl_mount_dir(ofs->config.workdir, &workpath); if (err) - goto out_free_config; + goto out; err = -EINVAL; - if (!ufs->config.lowerdir) { - if (!silent) - pr_err("overlayfs: missing 'lowerdir'\n"); - goto out_free_config; + if (upperpath->mnt != workpath.mnt) { + pr_err("overlayfs: workdir and upperdir must reside under the same mount\n"); + goto out; + } + if (!ovl_workdir_ok(workpath.dentry, upperpath->dentry)) { + pr_err("overlayfs: workdir and upperdir must be separate subtrees\n"); + goto out; } - sb->s_stack_depth = 0; - sb->s_maxbytes = MAX_LFS_FILESIZE; - if (ufs->config.upperdir) { - if (!ufs->config.workdir) { - pr_err("overlayfs: missing 'workdir'\n"); - goto out_free_config; - } + err = -EBUSY; + if (ovl_inuse_trylock(workpath.dentry)) { + ofs->workdir_locked = true; + } else if (ofs->config.index) { + pr_err("overlayfs: workdir is in-use by another mount, mount with '-o index=off' to override exclusive workdir protection.\n"); + goto out; + } else { + pr_warn("overlayfs: workdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n"); + } - err = ovl_mount_dir(ufs->config.upperdir, &upperpath); - if (err) - goto out_free_config; + ofs->workbasedir = dget(workpath.dentry); + err = ovl_make_workdir(ofs, &workpath); + if (err) + goto out; - /* Upper fs should not be r/o */ - if (sb_rdonly(upperpath.mnt->mnt_sb)) { - pr_err("overlayfs: upper fs is r/o, try multi-lower layers mount\n"); - err = -EINVAL; - goto out_put_upperpath; - } + err = 0; +out: + path_put(&workpath); - err = ovl_check_namelen(&upperpath, ufs, ufs->config.upperdir); - if (err) - goto out_put_upperpath; - - err = -EBUSY; - if (ovl_inuse_trylock(upperpath.dentry)) { - ufs->upperdir_locked = true; - } else if (ufs->config.index) { - pr_err("overlayfs: upperdir is in-use by another mount, mount with '-o index=off' to override exclusive upperdir protection.\n"); - goto out_put_upperpath; - } else { - pr_warn("overlayfs: upperdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n"); - } + return err; +} + +static int ovl_get_indexdir(struct ovl_fs *ofs, struct ovl_entry *oe, + struct path *upperpath) +{ + int err; - err = ovl_mount_dir(ufs->config.workdir, &workpath); + /* Verify lower root is upper root origin */ + err = ovl_verify_origin(upperpath->dentry, oe->lowerstack[0].dentry, + false, true); + if (err) { + pr_err("overlayfs: failed to verify upper root origin\n"); + goto out; + } + + ofs->indexdir = ovl_workdir_create(ofs, OVL_INDEXDIR_NAME, true); + if (ofs->indexdir) { + /* Verify upper root is index dir origin */ + err = ovl_verify_origin(ofs->indexdir, upperpath->dentry, + true, true); if (err) - goto out_unlock_upperdentry; + pr_err("overlayfs: failed to verify index dir origin\n"); - err = -EINVAL; - if (upperpath.mnt != workpath.mnt) { - pr_err("overlayfs: workdir and upperdir must reside under the same mount\n"); - goto out_put_workpath; - } - if (!ovl_workdir_ok(workpath.dentry, upperpath.dentry)) { - pr_err("overlayfs: workdir and upperdir must be separate subtrees\n"); - goto out_put_workpath; + /* Cleanup bad/stale/orphan index entries */ + if (!err) + err = ovl_indexdir_cleanup(ofs->indexdir, + ofs->upper_mnt, + oe->lowerstack, + oe->numlower); + } + if (err || !ofs->indexdir) + pr_warn("overlayfs: try deleting index dir or mounting with '-o index=off' to disable inodes index.\n"); + +out: + return err; +} + +static int ovl_get_lower_layers(struct ovl_fs *ofs, struct path *stack, + unsigned int numlower) +{ + int err; + unsigned int i; + + err = -ENOMEM; + ofs->lower_layers = kcalloc(numlower, sizeof(struct ovl_layer), + GFP_KERNEL); + if (ofs->lower_layers == NULL) + goto out; + for (i = 0; i < numlower; i++) { + struct vfsmount *mnt; + dev_t dev; + + err = get_anon_bdev(&dev); + if (err) { + pr_err("overlayfs: failed to get anonymous bdev for lowerpath\n"); + goto out; } - err = -EBUSY; - if (ovl_inuse_trylock(workpath.dentry)) { - ufs->workdir_locked = true; - } else if (ufs->config.index) { - pr_err("overlayfs: workdir is in-use by another mount, mount with '-o index=off' to override exclusive workdir protection.\n"); - goto out_put_workpath; - } else { - pr_warn("overlayfs: workdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n"); + mnt = clone_private_mount(&stack[i]); + err = PTR_ERR(mnt); + if (IS_ERR(mnt)) { + pr_err("overlayfs: failed to clone lowerpath\n"); + free_anon_bdev(dev); + goto out; } + /* + * Make lower layers R/O. That way fchmod/fchown on lower file + * will fail instead of modifying lower fs. + */ + mnt->mnt_flags |= MNT_READONLY | MNT_NOATIME; + + ofs->lower_layers[ofs->numlower].mnt = mnt; + ofs->lower_layers[ofs->numlower].pseudo_dev = dev; + ofs->numlower++; - ufs->workbasedir = workpath.dentry; - sb->s_stack_depth = upperpath.mnt->mnt_sb->s_stack_depth; + /* Check if all lower layers are on same sb */ + if (i == 0) + ofs->same_sb = mnt->mnt_sb; + else if (ofs->same_sb != mnt->mnt_sb) + ofs->same_sb = NULL; } + err = 0; +out: + return err; +} + +static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb, + struct ovl_fs *ofs) +{ + int err; + char *lowertmp, *lower; + struct path *stack = NULL; + unsigned int stacklen, numlower = 0, i; + bool remote = false; + struct ovl_entry *oe; + err = -ENOMEM; - lowertmp = kstrdup(ufs->config.lowerdir, GFP_KERNEL); + lowertmp = kstrdup(ofs->config.lowerdir, GFP_KERNEL); if (!lowertmp) - goto out_unlock_workdentry; + goto out_err; err = -EINVAL; stacklen = ovl_split_lowerdirs(lowertmp); if (stacklen > OVL_MAX_STACK) { pr_err("overlayfs: too many lower directories, limit is %d\n", OVL_MAX_STACK); - goto out_free_lowertmp; - } else if (!ufs->config.upperdir && stacklen == 1) { + goto out_err; + } else if (!ofs->config.upperdir && stacklen == 1) { pr_err("overlayfs: at least 2 lowerdir are needed while upperdir nonexistent\n"); - goto out_free_lowertmp; + goto out_err; } err = -ENOMEM; stack = kcalloc(stacklen, sizeof(struct path), GFP_KERNEL); if (!stack) - goto out_free_lowertmp; + goto out_err; err = -EINVAL; lower = lowertmp; for (numlower = 0; numlower < stacklen; numlower++) { - err = ovl_lower_dir(lower, &stack[numlower], ufs, + err = ovl_lower_dir(lower, &stack[numlower], ofs, &sb->s_stack_depth, &remote); if (err) - goto out_put_lowerpath; + goto out_err; lower = strchr(lower, '\0') + 1; } @@ -957,190 +1107,144 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) sb->s_stack_depth++; if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) { pr_err("overlayfs: maximum fs stacking depth exceeded\n"); - goto out_put_lowerpath; + goto out_err; } - if (ufs->config.upperdir) { - ufs->upper_mnt = clone_private_mount(&upperpath); - err = PTR_ERR(ufs->upper_mnt); - if (IS_ERR(ufs->upper_mnt)) { - pr_err("overlayfs: failed to clone upperpath\n"); - goto out_put_lowerpath; - } + err = ovl_get_lower_layers(ofs, stack, numlower); + if (err) + goto out_err; - /* Don't inherit atime flags */ - ufs->upper_mnt->mnt_flags &= ~(MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME); + err = -ENOMEM; + oe = ovl_alloc_entry(numlower); + if (!oe) + goto out_err; - sb->s_time_gran = ufs->upper_mnt->mnt_sb->s_time_gran; + for (i = 0; i < numlower; i++) { + oe->lowerstack[i].dentry = dget(stack[i].dentry); + oe->lowerstack[i].layer = &ofs->lower_layers[i]; + } - ufs->workdir = ovl_workdir_create(sb, ufs, workpath.dentry, - OVL_WORKDIR_NAME, false); - /* - * Upper should support d_type, else whiteouts are visible. - * Given workdir and upper are on same fs, we can do - * iterate_dir() on workdir. This check requires successful - * creation of workdir in previous step. - */ - if (ufs->workdir) { - struct dentry *temp; - - err = ovl_check_d_type_supported(&workpath); - if (err < 0) - goto out_put_workdir; - - /* - * We allowed this configuration and don't want to - * break users over kernel upgrade. So warn instead - * of erroring out. - */ - if (!err) - pr_warn("overlayfs: upper fs needs to support d_type.\n"); - - /* Check if upper/work fs supports O_TMPFILE */ - temp = ovl_do_tmpfile(ufs->workdir, S_IFREG | 0); - ufs->tmpfile = !IS_ERR(temp); - if (ufs->tmpfile) - dput(temp); - else - pr_warn("overlayfs: upper fs does not support tmpfile.\n"); - - /* - * Check if upper/work fs supports trusted.overlay.* - * xattr - */ - err = ovl_do_setxattr(ufs->workdir, OVL_XATTR_OPAQUE, - "0", 1, 0); - if (err) { - ufs->noxattr = true; - pr_warn("overlayfs: upper fs does not support xattr.\n"); - } else { - vfs_removexattr(ufs->workdir, OVL_XATTR_OPAQUE); - } + if (remote) + sb->s_d_op = &ovl_reval_dentry_operations; + else + sb->s_d_op = &ovl_dentry_operations; - /* Check if upper/work fs supports file handles */ - if (ufs->config.index && - !ovl_can_decode_fh(ufs->workdir->d_sb)) { - ufs->config.index = false; - pr_warn("overlayfs: upper fs does not support file handles, falling back to index=off.\n"); - } - } - } +out: + for (i = 0; i < numlower; i++) + path_put(&stack[i]); + kfree(stack); + kfree(lowertmp); + + return oe; + +out_err: + oe = ERR_PTR(err); + goto out; +} + +static int ovl_fill_super(struct super_block *sb, void *data, int silent) +{ + struct path upperpath = { }; + struct dentry *root_dentry; + struct ovl_entry *oe; + struct ovl_fs *ofs; + struct cred *cred; + int err; err = -ENOMEM; - ufs->lower_mnt = kcalloc(numlower, sizeof(struct vfsmount *), GFP_KERNEL); - if (ufs->lower_mnt == NULL) - goto out_put_workdir; - for (i = 0; i < numlower; i++) { - struct vfsmount *mnt = clone_private_mount(&stack[i]); + ofs = kzalloc(sizeof(struct ovl_fs), GFP_KERNEL); + if (!ofs) + goto out; - err = PTR_ERR(mnt); - if (IS_ERR(mnt)) { - pr_err("overlayfs: failed to clone lowerpath\n"); - goto out_put_lower_mnt; - } - /* - * Make lower_mnt R/O. That way fchmod/fchown on lower file - * will fail instead of modifying lower fs. - */ - mnt->mnt_flags |= MNT_READONLY | MNT_NOATIME; + ofs->creator_cred = cred = prepare_creds(); + if (!cred) + goto out_err; - ufs->lower_mnt[ufs->numlower] = mnt; - ufs->numlower++; + ofs->config.redirect_dir = ovl_redirect_dir_def; + ofs->config.index = ovl_index_def; + err = ovl_parse_opt((char *) data, &ofs->config); + if (err) + goto out_err; - /* Check if all lower layers are on same sb */ - if (i == 0) - ufs->same_sb = mnt->mnt_sb; - else if (ufs->same_sb != mnt->mnt_sb) - ufs->same_sb = NULL; + err = -EINVAL; + if (!ofs->config.lowerdir) { + if (!silent) + pr_err("overlayfs: missing 'lowerdir'\n"); + goto out_err; } - /* If the upper fs is nonexistent, we mark overlayfs r/o too */ - if (!ufs->upper_mnt) - sb->s_flags |= MS_RDONLY; - else if (ufs->upper_mnt->mnt_sb != ufs->same_sb) - ufs->same_sb = NULL; - - if (!(ovl_force_readonly(ufs)) && ufs->config.index) { - /* Verify lower root is upper root origin */ - err = ovl_verify_origin(upperpath.dentry, ufs->lower_mnt[0], - stack[0].dentry, false, true); - if (err) { - pr_err("overlayfs: failed to verify upper root origin\n"); - goto out_put_lower_mnt; + sb->s_stack_depth = 0; + sb->s_maxbytes = MAX_LFS_FILESIZE; + if (ofs->config.upperdir) { + if (!ofs->config.workdir) { + pr_err("overlayfs: missing 'workdir'\n"); + goto out_err; } - ufs->indexdir = ovl_workdir_create(sb, ufs, workpath.dentry, - OVL_INDEXDIR_NAME, true); - if (ufs->indexdir) { - /* Verify upper root is index dir origin */ - err = ovl_verify_origin(ufs->indexdir, ufs->upper_mnt, - upperpath.dentry, true, true); - if (err) - pr_err("overlayfs: failed to verify index dir origin\n"); + err = ovl_get_upper(ofs, &upperpath); + if (err) + goto out_err; - /* Cleanup bad/stale/orphan index entries */ - if (!err) - err = ovl_indexdir_cleanup(ufs->indexdir, - ufs->upper_mnt, - stack, numlower); - } - if (err || !ufs->indexdir) - pr_warn("overlayfs: try deleting index dir or mounting with '-o index=off' to disable inodes index.\n"); + err = ovl_get_workdir(ofs, &upperpath); if (err) - goto out_put_indexdir; + goto out_err; + + if (!ofs->workdir) + sb->s_flags |= SB_RDONLY; + + sb->s_stack_depth = ofs->upper_mnt->mnt_sb->s_stack_depth; + sb->s_time_gran = ofs->upper_mnt->mnt_sb->s_time_gran; + } + oe = ovl_get_lowerstack(sb, ofs); + err = PTR_ERR(oe); + if (IS_ERR(oe)) + goto out_err; - /* Show index=off/on in /proc/mounts for any of the reasons above */ - if (!ufs->indexdir) - ufs->config.index = false; + /* If the upper fs is nonexistent, we mark overlayfs r/o too */ + if (!ofs->upper_mnt) + sb->s_flags |= SB_RDONLY; + else if (ofs->upper_mnt->mnt_sb != ofs->same_sb) + ofs->same_sb = NULL; - if (remote) - sb->s_d_op = &ovl_reval_dentry_operations; - else - sb->s_d_op = &ovl_dentry_operations; + if (!(ovl_force_readonly(ofs)) && ofs->config.index) { + err = ovl_get_indexdir(ofs, oe, &upperpath); + if (err) + goto out_free_oe; - err = -ENOMEM; - ufs->creator_cred = cred = prepare_creds(); - if (!cred) - goto out_put_indexdir; + if (!ofs->indexdir) + sb->s_flags |= SB_RDONLY; + } + + /* Show index=off/on in /proc/mounts for any of the reasons above */ + if (!ofs->indexdir) + ofs->config.index = false; /* Never override disk quota limits or use reserved space */ cap_lower(cred->cap_effective, CAP_SYS_RESOURCE); - err = -ENOMEM; - oe = ovl_alloc_entry(numlower); - if (!oe) - goto out_put_cred; - sb->s_magic = OVERLAYFS_SUPER_MAGIC; sb->s_op = &ovl_super_operations; sb->s_xattr = ovl_xattr_handlers; - sb->s_fs_info = ufs; - sb->s_flags |= MS_POSIXACL | MS_NOREMOTELOCK; + sb->s_fs_info = ofs; + sb->s_flags |= SB_POSIXACL | SB_NOREMOTELOCK; + err = -ENOMEM; root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR, 0)); if (!root_dentry) goto out_free_oe; mntput(upperpath.mnt); - for (i = 0; i < numlower; i++) - mntput(stack[i].mnt); - mntput(workpath.mnt); - kfree(lowertmp); - if (upperpath.dentry) { oe->has_upper = true; if (ovl_is_impuredir(upperpath.dentry)) ovl_set_flag(OVL_IMPURE, d_inode(root_dentry)); } - for (i = 0; i < numlower; i++) { - oe->lowerstack[i].dentry = stack[i].dentry; - oe->lowerstack[i].mnt = ufs->lower_mnt[i]; - } - kfree(stack); root_dentry->d_fsdata = oe; + /* Root is always merge -> can have whiteouts */ + ovl_set_flag(OVL_WHITEOUTS, d_inode(root_dentry)); ovl_inode_init(d_inode(root_dentry), upperpath.dentry, ovl_dentry_lower(root_dentry)); @@ -1149,39 +1253,11 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) return 0; out_free_oe: + ovl_entry_stack_free(oe); kfree(oe); -out_put_cred: - put_cred(ufs->creator_cred); -out_put_indexdir: - dput(ufs->indexdir); -out_put_lower_mnt: - for (i = 0; i < ufs->numlower; i++) - mntput(ufs->lower_mnt[i]); - kfree(ufs->lower_mnt); -out_put_workdir: - dput(ufs->workdir); - mntput(ufs->upper_mnt); -out_put_lowerpath: - for (i = 0; i < numlower; i++) - path_put(&stack[i]); - kfree(stack); -out_free_lowertmp: - kfree(lowertmp); -out_unlock_workdentry: - if (ufs->workdir_locked) - ovl_inuse_unlock(workpath.dentry); -out_put_workpath: - path_put(&workpath); -out_unlock_upperdentry: - if (ufs->upperdir_locked) - ovl_inuse_unlock(upperpath.dentry); -out_put_upperpath: +out_err: path_put(&upperpath); -out_free_config: - kfree(ufs->config.lowerdir); - kfree(ufs->config.upperdir); - kfree(ufs->config.workdir); - kfree(ufs); + ovl_free_fs(ofs); out: return err; } diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index b9b239fa5cfd..d6bb1c9f5e7a 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -17,7 +17,6 @@ #include <linux/namei.h> #include <linux/ratelimit.h> #include "overlayfs.h" -#include "ovl_entry.h" int ovl_want_write(struct dentry *dentry) { @@ -125,7 +124,12 @@ void ovl_path_lower(struct dentry *dentry, struct path *path) { struct ovl_entry *oe = dentry->d_fsdata; - *path = oe->numlower ? oe->lowerstack[0] : (struct path) { }; + if (oe->numlower) { + path->mnt = oe->lowerstack[0].layer->mnt; + path->dentry = oe->lowerstack[0].dentry; + } else { + *path = (struct path) { }; + } } enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path) @@ -329,6 +333,19 @@ void ovl_copy_up_end(struct dentry *dentry) mutex_unlock(&OVL_I(d_inode(dentry))->lock); } +bool ovl_check_origin_xattr(struct dentry *dentry) +{ + int res; + + res = vfs_getxattr(dentry, OVL_XATTR_ORIGIN, NULL, 0); + + /* Zero size value means "copied up but origin unknown" */ + if (res >= 0) + return true; + + return false; +} + bool ovl_check_dir_xattr(struct dentry *dentry, const char *name) { int res; diff --git a/fs/pipe.c b/fs/pipe.c index 349c9d56d4b3..6d98566201ef 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1018,13 +1018,19 @@ const struct file_operations pipefifo_fops = { /* * Currently we rely on the pipe array holding a power-of-2 number - * of pages. + * of pages. Returns 0 on error. */ -static inline unsigned int round_pipe_size(unsigned int size) +unsigned int round_pipe_size(unsigned int size) { unsigned long nr_pages; + if (size < pipe_min_size) + size = pipe_min_size; + nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (nr_pages == 0) + return 0; + return roundup_pow_of_two(nr_pages) << PAGE_SHIFT; } @@ -1040,6 +1046,8 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg) long ret = 0; size = round_pipe_size(arg); + if (size == 0) + return -EINVAL; nr_pages = size >> PAGE_SHIFT; if (!nr_pages) @@ -1117,20 +1125,13 @@ out_revert_acct: } /* - * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax + * This should work even if CONFIG_PROC_FS isn't set, as proc_dopipe_max_size * will return an error. */ int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf, size_t *lenp, loff_t *ppos) { - int ret; - - ret = proc_dointvec_minmax(table, write, buf, lenp, ppos); - if (ret < 0 || !write) - return ret; - - pipe_max_size = round_pipe_size(pipe_max_size); - return ret; + return proc_dopipe_max_size(table, write, buf, lenp, ppos); } /* diff --git a/fs/proc/Makefile b/fs/proc/Makefile index f7456c4e7d0f..ead487e80510 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile @@ -21,6 +21,7 @@ proc-y += loadavg.o proc-y += meminfo.o proc-y += stat.o proc-y += uptime.o +proc-y += util.o proc-y += version.o proc-y += softirqs.o proc-y += namespaces.o diff --git a/fs/proc/array.c b/fs/proc/array.c index 6f6fc1672ad1..79375fc115d2 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -366,6 +366,11 @@ static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) cpumask_pr_args(&task->cpus_allowed)); } +static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm) +{ + seq_printf(m, "CoreDumping:\t%d\n", !!mm->core_state); +} + int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { @@ -376,6 +381,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, if (mm) { task_mem(m, mm); + task_core_dumping(m, mm); mmput(mm); } task_sig(m, task); diff --git a/fs/proc/base.c b/fs/proc/base.c index 9d357b2ea6cb..28fa85276eec 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -443,8 +443,7 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, save_stack_trace_tsk(task, &trace); for (i = 0; i < trace.nr_entries; i++) { - seq_printf(m, "[<%pK>] %pB\n", - (void *)entries[i], (void *)entries[i]); + seq_printf(m, "[<0>] %pB\n", (void *)entries[i]); } unlock_trace(task); } @@ -1682,7 +1681,7 @@ const struct inode_operations proc_pid_link_inode_operations = { /* building an inode */ -void task_dump_owner(struct task_struct *task, mode_t mode, +void task_dump_owner(struct task_struct *task, umode_t mode, kuid_t *ruid, kgid_t *rgid) { /* Depending on the state of dumpable compute who should own a diff --git a/fs/proc/cpuinfo.c b/fs/proc/cpuinfo.c index e0f867cd8553..96f1087e372c 100644 --- a/fs/proc/cpuinfo.c +++ b/fs/proc/cpuinfo.c @@ -1,12 +1,18 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/cpufreq.h> #include <linux/fs.h> #include <linux/init.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +__weak void arch_freq_prepare_all(void) +{ +} + extern const struct seq_operations cpuinfo_op; static int cpuinfo_open(struct inode *inode, struct file *file) { + arch_freq_prepare_all(); return seq_open(file, &cpuinfo_op); } diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 225f541f7078..dd0f82622427 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -483,7 +483,7 @@ int proc_fill_super(struct super_block *s, void *data, int silent) /* User space would break if executables or devices appear on proc */ s->s_iflags |= SB_I_USERNS_VISIBLE | SB_I_NOEXEC | SB_I_NODEV; - s->s_flags |= MS_NODIRATIME | MS_NOSUID | MS_NOEXEC; + s->s_flags |= SB_NODIRATIME | SB_NOSUID | SB_NOEXEC; s->s_blocksize = 1024; s->s_blocksize_bits = 10; s->s_magic = PROC_SUPER_MAGIC; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index a34195e92b20..4a67188c8d74 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -100,31 +100,10 @@ static inline struct task_struct *get_proc_task(struct inode *inode) return get_pid_task(proc_pid(inode), PIDTYPE_PID); } -void task_dump_owner(struct task_struct *task, mode_t mode, +void task_dump_owner(struct task_struct *task, umode_t mode, kuid_t *ruid, kgid_t *rgid); -static inline unsigned name_to_int(const struct qstr *qstr) -{ - const char *name = qstr->name; - int len = qstr->len; - unsigned n = 0; - - if (len > 1 && *name == '0') - goto out; - while (len-- > 0) { - unsigned c = *name++ - '0'; - if (c > 9) - goto out; - if (n >= (~0U-9)/10) - goto out; - n *= 10; - n += c; - } - return n; -out: - return ~0U; -} - +unsigned name_to_int(const struct qstr *qstr); /* * Offset of the first process in the /proc root directory.. */ diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c index 9bc5c58c00ee..a000d7547479 100644 --- a/fs/proc/loadavg.c +++ b/fs/proc/loadavg.c @@ -24,7 +24,7 @@ static int loadavg_proc_show(struct seq_file *m, void *v) LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]), LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]), nr_running(), nr_threads, - task_active_pid_ns(current)->last_pid); + idr_get_cursor(&task_active_pid_ns(current)->idr)); return 0; } diff --git a/fs/proc/root.c b/fs/proc/root.c index 4e42aba97f2e..ede8e64974be 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -91,7 +91,7 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, { struct pid_namespace *ns; - if (flags & MS_KERNMOUNT) { + if (flags & SB_KERNMOUNT) { ns = data; data = NULL; } else { diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 875231c36cb3..339e4c1c044d 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -661,6 +661,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_ACCOUNT)] = "ac", [ilog2(VM_NORESERVE)] = "nr", [ilog2(VM_HUGETLB)] = "ht", + [ilog2(VM_SYNC)] = "sf", [ilog2(VM_ARCH_1)] = "ar", [ilog2(VM_WIPEONFORK)] = "wf", [ilog2(VM_DONTDUMP)] = "dd", diff --git a/fs/proc/util.c b/fs/proc/util.c new file mode 100644 index 000000000000..b161cfa0f9fa --- /dev/null +++ b/fs/proc/util.c @@ -0,0 +1,23 @@ +#include <linux/dcache.h> + +unsigned name_to_int(const struct qstr *qstr) +{ + const char *name = qstr->name; + int len = qstr->len; + unsigned n = 0; + + if (len > 1 && *name == '0') + goto out; + do { + unsigned c = *name++ - '0'; + if (c > 9) + goto out; + if (n >= (~0U-9)/10) + goto out; + n *= 10; + n += c; + } while (--len > 0); + return n; +out: + return ~0U; +} diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 7b635d173213..b786840facd9 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -45,10 +45,10 @@ struct proc_fs_info { static int show_sb_opts(struct seq_file *m, struct super_block *sb) { static const struct proc_fs_info fs_info[] = { - { MS_SYNCHRONOUS, ",sync" }, - { MS_DIRSYNC, ",dirsync" }, - { MS_MANDLOCK, ",mand" }, - { MS_LAZYTIME, ",lazytime" }, + { SB_SYNCHRONOUS, ",sync" }, + { SB_DIRSYNC, ",dirsync" }, + { SB_MANDLOCK, ",mand" }, + { SB_LAZYTIME, ",lazytime" }, { 0, NULL } }; const struct proc_fs_info *fs_infop; diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 086e491faf04..691032107f8c 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -61,7 +61,7 @@ MODULE_PARM_DESC(update_ms, "milliseconds before pstore updates its content " static int pstore_new_entry; -static void pstore_timefunc(unsigned long); +static void pstore_timefunc(struct timer_list *); static DEFINE_TIMER(pstore_timer, pstore_timefunc); static void pstore_dowork(struct work_struct *); @@ -651,7 +651,7 @@ static int pstore_write_user_compat(struct pstore_record *record, return -EINVAL; record->buf = memdup_user(buf, record->size); - if (unlikely(IS_ERR(record->buf))) { + if (IS_ERR(record->buf)) { ret = PTR_ERR(record->buf); goto out; } @@ -890,7 +890,7 @@ static void pstore_dowork(struct work_struct *work) pstore_get_records(1); } -static void pstore_timefunc(unsigned long dummy) +static void pstore_timefunc(struct timer_list *unused) { if (pstore_new_entry) { pstore_new_entry = 0; diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index 3a67cfb142d8..3d46fe302fcb 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -47,7 +47,7 @@ static int qnx4_remount(struct super_block *sb, int *flags, char *data) sync_filesystem(sb); qs = qnx4_sb(sb); qs->Version = QNX4_VERSION; - *flags |= MS_RDONLY; + *flags |= SB_RDONLY; return 0; } @@ -199,7 +199,7 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent) s->s_op = &qnx4_sops; s->s_magic = QNX4_SUPER_MAGIC; - s->s_flags |= MS_RDONLY; /* Yup, read-only yet */ + s->s_flags |= SB_RDONLY; /* Yup, read-only yet */ /* Check the superblock signature. Since the qnx4 code is dangerous, we should leave as quickly as possible diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 1192422a1c56..4aeb26bcb4d0 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -56,7 +56,7 @@ static int qnx6_show_options(struct seq_file *seq, struct dentry *root) static int qnx6_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - *flags |= MS_RDONLY; + *flags |= SB_RDONLY; return 0; } @@ -427,7 +427,7 @@ mmi_success: } s->s_op = &qnx6_sops; s->s_magic = QNX6_SUPER_MAGIC; - s->s_flags |= MS_RDONLY; /* Yup, read-only yet */ + s->s_flags |= SB_RDONLY; /* Yup, read-only yet */ /* ease the later tree level calculations */ sbi = QNX6_SB(s); diff --git a/fs/read_write.c b/fs/read_write.c index 0046d72efe94..f8547b82dfb3 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -635,27 +635,6 @@ SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf, return ret; } -/* - * Reduce an iovec's length in-place. Return the resulting number of segments - */ -unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to) -{ - unsigned long seg = 0; - size_t len = 0; - - while (seg < nr_segs) { - seg++; - if (len + iov->iov_len >= to) { - iov->iov_len = to - len; - break; - } - len += iov->iov_len; - iov++; - } - return seg; -} -EXPORT_SYMBOL(iov_shorten); - static ssize_t do_iter_readv_writev(struct file *filp, struct iov_iter *iter, loff_t *ppos, int type, rwf_t flags) { diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 11a48affa882..b13fc024d2ee 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2106,7 +2106,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, journal_end(th); goto out_inserted_sd; } - } else if (inode->i_sb->s_flags & MS_POSIXACL) { + } else if (inode->i_sb->s_flags & SB_POSIXACL) { reiserfs_warning(inode->i_sb, "jdm-13090", "ACLs aren't enabled in the fs, " "but vfs thinks they are!"); diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 69ff280bdfe8..70057359fbaf 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -1960,7 +1960,7 @@ static int do_journal_release(struct reiserfs_transaction_handle *th, /* * Cancel flushing of old commits. Note that neither of these works * will be requeued because superblock is being shutdown and doesn't - * have MS_ACTIVE set. + * have SB_ACTIVE set. */ reiserfs_cancel_old_flush(sb); /* wait for all commits to finish */ @@ -4302,7 +4302,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, int flags) * Avoid queueing work when sb is being shut down. Transaction * will be flushed on journal shutdown. */ - if (sb->s_flags & MS_ACTIVE) + if (sb->s_flags & SB_ACTIVE) queue_delayed_work(REISERFS_SB(sb)->commit_wq, &journal->j_work, HZ / 10); } @@ -4393,7 +4393,7 @@ void reiserfs_abort_journal(struct super_block *sb, int errno) if (!journal->j_errno) journal->j_errno = errno; - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; set_bit(J_ABORTED, &journal->j_state); #ifdef CONFIG_REISERFS_CHECK diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c index 64f49cafbc5b..7e288d97adcb 100644 --- a/fs/reiserfs/prints.c +++ b/fs/reiserfs/prints.c @@ -390,7 +390,7 @@ void __reiserfs_error(struct super_block *sb, const char *id, return; reiserfs_info(sb, "Remounting filesystem read-only\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; reiserfs_abort_journal(sb, -EIO); } @@ -409,7 +409,7 @@ void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...) printk(KERN_CRIT "REISERFS abort (device %s): %s\n", sb->s_id, error_buf); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; reiserfs_abort_journal(sb, errno); } diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 5464ec517702..020c9cacbb2f 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -121,7 +121,7 @@ void reiserfs_schedule_old_flush(struct super_block *s) * Avoid scheduling flush when sb is being shut down. It can race * with journal shutdown and free still queued delayed work. */ - if (sb_rdonly(s) || !(s->s_flags & MS_ACTIVE)) + if (sb_rdonly(s) || !(s->s_flags & SB_ACTIVE)) return; spin_lock(&sbi->old_work_lock); @@ -252,11 +252,11 @@ static int finish_unfinished(struct super_block *s) #ifdef CONFIG_QUOTA /* Needed for iput() to work correctly and not trash data */ - if (s->s_flags & MS_ACTIVE) { + if (s->s_flags & SB_ACTIVE) { ms_active_set = 0; } else { ms_active_set = 1; - s->s_flags |= MS_ACTIVE; + s->s_flags |= SB_ACTIVE; } /* Turn on quotas so that they are updated correctly */ for (i = 0; i < REISERFS_MAXQUOTAS; i++) { @@ -411,7 +411,7 @@ static int finish_unfinished(struct super_block *s) reiserfs_write_lock(s); if (ms_active_set) /* Restore the flag back */ - s->s_flags &= ~MS_ACTIVE; + s->s_flags &= ~SB_ACTIVE; #endif pathrelse(&path); if (done) @@ -1521,7 +1521,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) goto out_err_unlock; } - if (*mount_flags & MS_RDONLY) { + if (*mount_flags & SB_RDONLY) { reiserfs_write_unlock(s); reiserfs_xattr_init(s, *mount_flags); /* remount read-only */ @@ -1567,7 +1567,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) REISERFS_SB(s)->s_mount_state = sb_umount_state(rs); /* now it is safe to call journal_begin */ - s->s_flags &= ~MS_RDONLY; + s->s_flags &= ~SB_RDONLY; err = journal_begin(&th, s, 10); if (err) goto out_err_unlock; @@ -1575,7 +1575,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) /* Mount a partition which is read-only, read-write */ reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); REISERFS_SB(s)->s_mount_state = sb_umount_state(rs); - s->s_flags &= ~MS_RDONLY; + s->s_flags &= ~SB_RDONLY; set_sb_umount_state(rs, REISERFS_ERROR_FS); if (!old_format_only(s)) set_sb_mnt_count(rs, sb_mnt_count(rs) + 1); @@ -1590,7 +1590,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) goto out_err_unlock; reiserfs_write_unlock(s); - if (!(*mount_flags & MS_RDONLY)) { + if (!(*mount_flags & SB_RDONLY)) { dquot_resume(s, -1); reiserfs_write_lock(s); finish_unfinished(s); @@ -2055,7 +2055,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) if (bdev_read_only(s->s_bdev) && !sb_rdonly(s)) { SWARN(silent, s, "clm-7000", "Detected readonly device, marking FS readonly"); - s->s_flags |= MS_RDONLY; + s->s_flags |= SB_RDONLY; } args.objectid = REISERFS_ROOT_OBJECTID; args.dirid = REISERFS_ROOT_PARENT_OBJECTID; diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 46492fb37a4c..5dbf5324bdda 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -959,7 +959,7 @@ int reiserfs_lookup_privroot(struct super_block *s) /* * We need to take a copy of the mount flags since things like - * MS_RDONLY don't get set until *after* we're called. + * SB_RDONLY don't get set until *after* we're called. * mount_flags != mount_options */ int reiserfs_xattr_init(struct super_block *s, int mount_flags) @@ -971,7 +971,7 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags) if (err) goto error; - if (d_really_is_negative(privroot) && !(mount_flags & MS_RDONLY)) { + if (d_really_is_negative(privroot) && !(mount_flags & SB_RDONLY)) { inode_lock(d_inode(s->s_root)); err = create_privroot(REISERFS_SB(s)->priv_root); inode_unlock(d_inode(s->s_root)); @@ -999,11 +999,11 @@ error: clear_bit(REISERFS_POSIXACL, &REISERFS_SB(s)->s_mount_opt); } - /* The super_block MS_POSIXACL must mirror the (no)acl mount option. */ + /* The super_block SB_POSIXACL must mirror the (no)acl mount option. */ if (reiserfs_posixacl(s)) - s->s_flags |= MS_POSIXACL; + s->s_flags |= SB_POSIXACL; else - s->s_flags &= ~MS_POSIXACL; + s->s_flags &= ~SB_POSIXACL; return err; } diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 0186fe6d39f3..8f06fd1f3d69 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -451,7 +451,7 @@ static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf) static int romfs_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - *flags |= MS_RDONLY; + *flags |= SB_RDONLY; return 0; } @@ -502,7 +502,7 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_maxbytes = 0xFFFFFFFF; sb->s_magic = ROMFS_MAGIC; - sb->s_flags |= MS_RDONLY | MS_NOATIME; + sb->s_flags |= SB_RDONLY | SB_NOATIME; sb->s_op = &romfs_super_ops; #ifdef CONFIG_ROMFS_ON_MTD diff --git a/fs/select.c b/fs/select.c index 063067e606ca..6de493bb42a4 100644 --- a/fs/select.c +++ b/fs/select.c @@ -292,8 +292,7 @@ static int poll_select_copy_remaining(struct timespec64 *end_time, void __user *p, int timeval, int ret) { - struct timespec64 rts64; - struct timespec rts; + struct timespec64 rts; struct timeval rtv; if (!p) @@ -306,23 +305,22 @@ static int poll_select_copy_remaining(struct timespec64 *end_time, if (!end_time->tv_sec && !end_time->tv_nsec) return ret; - ktime_get_ts64(&rts64); - rts64 = timespec64_sub(*end_time, rts64); - if (rts64.tv_sec < 0) - rts64.tv_sec = rts64.tv_nsec = 0; + ktime_get_ts64(&rts); + rts = timespec64_sub(*end_time, rts); + if (rts.tv_sec < 0) + rts.tv_sec = rts.tv_nsec = 0; - rts = timespec64_to_timespec(rts64); if (timeval) { if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec)) memset(&rtv, 0, sizeof(rtv)); - rtv.tv_sec = rts64.tv_sec; - rtv.tv_usec = rts64.tv_nsec / NSEC_PER_USEC; + rtv.tv_sec = rts.tv_sec; + rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC; if (!copy_to_user(p, &rtv, sizeof(rtv))) return ret; - } else if (!copy_to_user(p, &rts, sizeof(rts))) + } else if (!put_timespec64(&rts, p)) return ret; /* @@ -705,17 +703,15 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp, const sigset_t __user *sigmask, size_t sigsetsize) { sigset_t ksigmask, sigsaved; - struct timespec ts; - struct timespec64 ts64, end_time, *to = NULL; + struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { - if (copy_from_user(&ts, tsp, sizeof(ts))) + if (get_timespec64(&ts, tsp)) return -EFAULT; - ts64 = timespec_to_timespec64(ts); to = &end_time; - if (poll_select_set_timeout(to, ts64.tv_sec, ts64.tv_nsec)) + if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) return -EINVAL; } @@ -1052,12 +1048,11 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, size_t, sigsetsize) { sigset_t ksigmask, sigsaved; - struct timespec ts; - struct timespec64 end_time, *to = NULL; + struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { - if (copy_from_user(&ts, tsp, sizeof(ts))) + if (get_timespec64(&ts, tsp)) return -EFAULT; to = &end_time; @@ -1103,10 +1098,10 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, #define __COMPAT_NFDBITS (8 * sizeof(compat_ulong_t)) static -int compat_poll_select_copy_remaining(struct timespec *end_time, void __user *p, +int compat_poll_select_copy_remaining(struct timespec64 *end_time, void __user *p, int timeval, int ret) { - struct timespec ts; + struct timespec64 ts; if (!p) return ret; @@ -1118,8 +1113,8 @@ int compat_poll_select_copy_remaining(struct timespec *end_time, void __user *p, if (!end_time->tv_sec && !end_time->tv_nsec) return ret; - ktime_get_ts(&ts); - ts = timespec_sub(*end_time, ts); + ktime_get_ts64(&ts); + ts = timespec64_sub(*end_time, ts); if (ts.tv_sec < 0) ts.tv_sec = ts.tv_nsec = 0; @@ -1132,12 +1127,7 @@ int compat_poll_select_copy_remaining(struct timespec *end_time, void __user *p, if (!copy_to_user(p, &rtv, sizeof(rtv))) return ret; } else { - struct compat_timespec rts; - - rts.tv_sec = ts.tv_sec; - rts.tv_nsec = ts.tv_nsec; - - if (!copy_to_user(p, &rts, sizeof(rts))) + if (!compat_put_timespec64(&ts, p)) return ret; } /* @@ -1195,7 +1185,7 @@ int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, */ static int compat_core_sys_select(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, - struct timespec *end_time) + struct timespec64 *end_time) { fd_set_bits fds; void *bits; @@ -1268,7 +1258,7 @@ COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp, compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, struct compat_timeval __user *, tvp) { - struct timespec end_time, *to = NULL; + struct timespec64 end_time, *to = NULL; struct compat_timeval tv; int ret; @@ -1312,14 +1302,12 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp, struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask, compat_size_t sigsetsize) { - compat_sigset_t ss32; sigset_t ksigmask, sigsaved; - struct compat_timespec ts; - struct timespec end_time, *to = NULL; + struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { - if (copy_from_user(&ts, tsp, sizeof(ts))) + if (compat_get_timespec64(&ts, tsp)) return -EFAULT; to = &end_time; @@ -1330,9 +1318,8 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp, if (sigmask) { if (sigsetsize != sizeof(compat_sigset_t)) return -EINVAL; - if (copy_from_user(&ss32, sigmask, sizeof(ss32))) + if (get_compat_sigset(&ksigmask, sigmask)) return -EFAULT; - sigset_from_compat(&ksigmask, &ss32); sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); @@ -1381,14 +1368,12 @@ COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, struct compat_timespec __user *, tsp, const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) { - compat_sigset_t ss32; sigset_t ksigmask, sigsaved; - struct compat_timespec ts; - struct timespec end_time, *to = NULL; + struct timespec64 ts, end_time, *to = NULL; int ret; if (tsp) { - if (copy_from_user(&ts, tsp, sizeof(ts))) + if (compat_get_timespec64(&ts, tsp)) return -EFAULT; to = &end_time; @@ -1399,9 +1384,8 @@ COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, if (sigmask) { if (sigsetsize != sizeof(compat_sigset_t)) return -EINVAL; - if (copy_from_user(&ss32, sigmask, sizeof(ss32))) + if (get_compat_sigset(&ksigmask, sigmask)) return -EFAULT; - sigset_from_compat(&ksigmask, &ss32); sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); diff --git a/fs/signalfd.c b/fs/signalfd.c index 1c667af86da5..5f1ff8756595 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -313,15 +313,13 @@ COMPAT_SYSCALL_DEFINE4(signalfd4, int, ufd, compat_size_t, sigsetsize, int, flags) { - compat_sigset_t ss32; sigset_t tmp; sigset_t __user *ksigmask; if (sigsetsize != sizeof(compat_sigset_t)) return -EINVAL; - if (copy_from_user(&ss32, sigmask, sizeof(ss32))) + if (get_compat_sigset(&tmp, sigmask)) return -EFAULT; - sigset_from_compat(&tmp, &ss32); ksigmask = compat_alloc_user_space(sizeof(sigset_t)); if (copy_to_user(ksigmask, &tmp, sizeof(sigset_t))) return -EFAULT; diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index cf01e15a7b16..8a73b97217c8 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -195,7 +195,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) (u64) le64_to_cpu(sblk->id_table_start)); sb->s_maxbytes = MAX_LFS_FILESIZE; - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; sb->s_op = &squashfs_super_ops; err = -ENOMEM; @@ -373,7 +373,7 @@ static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf) static int squashfs_remount(struct super_block *sb, int *flags, char *data) { sync_filesystem(sb); - *flags |= MS_RDONLY; + *flags |= SB_RDONLY; return 0; } diff --git a/fs/statfs.c b/fs/statfs.c index c25dd9a26cc1..5b2a24f0f263 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -35,11 +35,11 @@ static int flags_by_mnt(int mnt_flags) static int flags_by_sb(int s_flags) { int flags = 0; - if (s_flags & MS_SYNCHRONOUS) + if (s_flags & SB_SYNCHRONOUS) flags |= ST_SYNCHRONOUS; - if (s_flags & MS_MANDLOCK) + if (s_flags & SB_MANDLOCK) flags |= ST_MANDLOCK; - if (s_flags & MS_RDONLY) + if (s_flags & SB_RDONLY) flags |= ST_RDONLY; return flags; } @@ -217,7 +217,7 @@ SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user return error; } -int vfs_ustat(dev_t dev, struct kstatfs *sbuf) +static int vfs_ustat(dev_t dev, struct kstatfs *sbuf) { struct super_block *s = user_get_super(dev); int err; diff --git a/fs/super.c b/fs/super.c index 994db21f59bf..d4e33e8f1e6f 100644 --- a/fs/super.c +++ b/fs/super.c @@ -155,21 +155,19 @@ static void destroy_super_rcu(struct rcu_head *head) schedule_work(&s->destroy_work); } -/** - * destroy_super - frees a superblock - * @s: superblock to free - * - * Frees a superblock. - */ -static void destroy_super(struct super_block *s) +/* Free a superblock that has never been seen by anyone */ +static void destroy_unused_super(struct super_block *s) { + if (!s) + return; + up_write(&s->s_umount); list_lru_destroy(&s->s_dentry_lru); list_lru_destroy(&s->s_inode_lru); security_sb_free(s); - WARN_ON(!list_empty(&s->s_mounts)); put_user_ns(s->s_user_ns); kfree(s->s_subtype); - call_rcu(&s->rcu, destroy_super_rcu); + /* no delays needed */ + destroy_super_work(&s->destroy_work); } /** @@ -257,7 +255,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, return s; fail: - destroy_super(s); + destroy_unused_super(s); return NULL; } @@ -266,11 +264,17 @@ fail: /* * Drop a superblock's refcount. The caller must hold sb_lock. */ -static void __put_super(struct super_block *sb) +static void __put_super(struct super_block *s) { - if (!--sb->s_count) { - list_del_init(&sb->s_list); - destroy_super(sb); + if (!--s->s_count) { + list_del_init(&s->s_list); + WARN_ON(s->s_dentry_lru.node); + WARN_ON(s->s_inode_lru.node); + WARN_ON(!list_empty(&s->s_mounts)); + security_sb_free(s); + put_user_ns(s->s_user_ns); + kfree(s->s_subtype); + call_rcu(&s->rcu, destroy_super_rcu); } } @@ -485,19 +489,12 @@ retry: continue; if (user_ns != old->s_user_ns) { spin_unlock(&sb_lock); - if (s) { - up_write(&s->s_umount); - destroy_super(s); - } + destroy_unused_super(s); return ERR_PTR(-EBUSY); } if (!grab_super(old)) goto retry; - if (s) { - up_write(&s->s_umount); - destroy_super(s); - s = NULL; - } + destroy_unused_super(s); return old; } } @@ -512,8 +509,7 @@ retry: err = set(s, data); if (err) { spin_unlock(&sb_lock); - up_write(&s->s_umount); - destroy_super(s); + destroy_unused_super(s); return ERR_PTR(err); } s->s_type = type; diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 20b8f82e115b..fb49510c5dcf 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -30,7 +30,7 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, void *ns; bool new_sb; - if (!(flags & MS_KERNMOUNT)) { + if (!(flags & SB_KERNMOUNT)) { if (!kobj_ns_current_may_mount(KOBJ_NS_TYPE_NET)) return ERR_PTR(-EPERM); } diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 3c47b7d5d4cf..bec9f79adb25 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -63,7 +63,7 @@ static int sysv_remount(struct super_block *sb, int *flags, char *data) sync_filesystem(sb); if (sbi->s_forced_ro) - *flags |= MS_RDONLY; + *flags |= SB_RDONLY; return 0; } diff --git a/fs/sysv/super.c b/fs/sysv/super.c index 0d56e486b392..89765ddfb738 100644 --- a/fs/sysv/super.c +++ b/fs/sysv/super.c @@ -333,7 +333,7 @@ static int complete_read_super(struct super_block *sb, int silent, int size) /* set up enough so that it can read an inode */ sb->s_op = &sysv_sops; if (sbi->s_forced_ro) - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; if (sbi->s_truncate) sb->s_d_op = &sysv_dentry_operations; root_inode = sysv_iget(sb, SYSV_ROOT_INO); diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index a02aa59d1e24..dfe85069586e 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1406,7 +1406,7 @@ int ubifs_update_time(struct inode *inode, struct timespec *time, if (flags & S_MTIME) inode->i_mtime = *time; - if (!(inode->i_sb->s_flags & MS_LAZYTIME)) + if (!(inode->i_sb->s_flags & SB_LAZYTIME)) iflags |= I_DIRTY_SYNC; release = ui->dirty; diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index 3be28900bf37..fe77e9625e84 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -84,7 +84,7 @@ void ubifs_ro_mode(struct ubifs_info *c, int err) if (!c->ro_error) { c->ro_error = 1; c->no_chk_data_crc = 0; - c->vfs_sb->s_flags |= MS_RDONLY; + c->vfs_sb->s_flags |= SB_RDONLY; ubifs_warn(c, "switched to read-only mode, error %d", err); dump_stack(); } diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 7503e7cdf870..0beb285b143d 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -968,7 +968,7 @@ static int parse_standard_option(const char *option) pr_notice("UBIFS: parse %s\n", option); if (!strcmp(option, "sync")) - return MS_SYNCHRONOUS; + return SB_SYNCHRONOUS; return 0; } @@ -1160,8 +1160,8 @@ static int mount_ubifs(struct ubifs_info *c) size_t sz; c->ro_mount = !!sb_rdonly(c->vfs_sb); - /* Suppress error messages while probing if MS_SILENT is set */ - c->probing = !!(c->vfs_sb->s_flags & MS_SILENT); + /* Suppress error messages while probing if SB_SILENT is set */ + c->probing = !!(c->vfs_sb->s_flags & SB_SILENT); err = init_constants_early(c); if (err) @@ -1852,7 +1852,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) return err; } - if (c->ro_mount && !(*flags & MS_RDONLY)) { + if (c->ro_mount && !(*flags & SB_RDONLY)) { if (c->ro_error) { ubifs_msg(c, "cannot re-mount R/W due to prior errors"); return -EROFS; @@ -1864,7 +1864,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data) err = ubifs_remount_rw(c); if (err) return err; - } else if (!c->ro_mount && (*flags & MS_RDONLY)) { + } else if (!c->ro_mount && (*flags & SB_RDONLY)) { if (c->ro_error) { ubifs_msg(c, "cannot re-mount R/O due to prior errors"); return -EROFS; @@ -2117,7 +2117,7 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, */ ubi = open_ubi(name, UBI_READONLY); if (IS_ERR(ubi)) { - if (!(flags & MS_SILENT)) + if (!(flags & SB_SILENT)) pr_err("UBIFS error (pid: %d): cannot open \"%s\", error %d", current->pid, name, (int)PTR_ERR(ubi)); return ERR_CAST(ubi); @@ -2143,18 +2143,18 @@ static struct dentry *ubifs_mount(struct file_system_type *fs_type, int flags, kfree(c); /* A new mount point for already mounted UBIFS */ dbg_gen("this ubi volume is already mounted"); - if (!!(flags & MS_RDONLY) != c1->ro_mount) { + if (!!(flags & SB_RDONLY) != c1->ro_mount) { err = -EBUSY; goto out_deact; } } else { - err = ubifs_fill_super(sb, data, flags & MS_SILENT ? 1 : 0); + err = ubifs_fill_super(sb, data, flags & SB_SILENT ? 1 : 0); if (err) goto out_deact; /* We do not support atime */ - sb->s_flags |= MS_ACTIVE; + sb->s_flags |= SB_ACTIVE; #ifndef CONFIG_UBIFS_ATIME_SUPPORT - sb->s_flags |= MS_NOATIME; + sb->s_flags |= SB_NOATIME; #else ubifs_msg(c, "full atime support is enabled."); #endif diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 63c7468147eb..5ee7af879cc4 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1201,7 +1201,7 @@ struct ubifs_debug_info; * @need_recovery: %1 if the file-system needs recovery * @replaying: %1 during journal replay * @mounting: %1 while mounting - * @probing: %1 while attempting to mount if MS_SILENT mount flag is set + * @probing: %1 while attempting to mount if SB_SILENT mount flag is set * @remounting_rw: %1 while re-mounting from R/O mode to R/W mode * @replay_list: temporary list used during journal replay * @replay_buds: list of buds to replay @@ -1850,7 +1850,7 @@ __printf(2, 3) void ubifs_warn(const struct ubifs_info *c, const char *fmt, ...); /* * A conditional variant of 'ubifs_err()' which doesn't output anything - * if probing (ie. MS_SILENT set). + * if probing (ie. SB_SILENT set). */ #define ubifs_errc(c, fmt, ...) \ do { \ diff --git a/fs/udf/super.c b/fs/udf/super.c index f80e0a0f24d3..f73239a9a97d 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -650,7 +650,7 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options) sync_filesystem(sb); if (lvidiu) { int write_rev = le16_to_cpu(lvidiu->minUDFWriteRev); - if (write_rev > UDF_MAX_WRITE_VERSION && !(*flags & MS_RDONLY)) + if (write_rev > UDF_MAX_WRITE_VERSION && !(*flags & SB_RDONLY)) return -EACCES; } @@ -673,10 +673,10 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options) sbi->s_dmode = uopt.dmode; write_unlock(&sbi->s_cred_lock); - if ((bool)(*flags & MS_RDONLY) == sb_rdonly(sb)) + if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) goto out_unlock; - if (*flags & MS_RDONLY) + if (*flags & SB_RDONLY) udf_close_lvid(sb); else udf_open_lvid(sb); diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index b5cd79065ef9..e727ee07dbe4 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -115,7 +115,7 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) ubh_mark_buffer_dirty (USPI_UBH(uspi)); ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) ubh_sync_block(UCPI_UBH(ucpi)); ufs_mark_sb_dirty(sb); @@ -205,7 +205,7 @@ do_more: ubh_mark_buffer_dirty (USPI_UBH(uspi)); ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) ubh_sync_block(UCPI_UBH(ucpi)); if (overflow) { @@ -567,7 +567,7 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment, ubh_mark_buffer_dirty (USPI_UBH(uspi)); ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) ubh_sync_block(UCPI_UBH(ucpi)); ufs_mark_sb_dirty(sb); @@ -688,7 +688,7 @@ cg_found: succed: ubh_mark_buffer_dirty (USPI_UBH(uspi)); ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) ubh_sync_block(UCPI_UBH(ucpi)); ufs_mark_sb_dirty(sb); diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index 916b4a428933..e1ef0f0a1353 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -112,7 +112,7 @@ void ufs_free_inode (struct inode * inode) ubh_mark_buffer_dirty (USPI_UBH(uspi)); ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) ubh_sync_block(UCPI_UBH(ucpi)); ufs_mark_sb_dirty(sb); @@ -146,14 +146,14 @@ static void ufs2_init_inodes_chunk(struct super_block *sb, set_buffer_uptodate(bh); mark_buffer_dirty(bh); unlock_buffer(bh); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) sync_dirty_buffer(bh); brelse(bh); } fs32_add(sb, &ucg->cg_u.cg_u2.cg_initediblk, uspi->s_inopb); ubh_mark_buffer_dirty(UCPI_UBH(ucpi)); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) ubh_sync_block(UCPI_UBH(ucpi)); UFSD("EXIT\n"); @@ -284,7 +284,7 @@ cg_found: } ubh_mark_buffer_dirty (USPI_UBH(uspi)); ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) ubh_sync_block(UCPI_UBH(ucpi)); ufs_mark_sb_dirty(sb); @@ -330,7 +330,7 @@ cg_found: ufs2_inode->ui_birthnsec = cpu_to_fs32(sb, ts.tv_nsec); mark_buffer_dirty(bh); unlock_buffer(bh); - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) sync_dirty_buffer(bh); brelse(bh); } diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 6440003f8ddc..4d497e9c6883 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -282,7 +282,7 @@ void ufs_error (struct super_block * sb, const char * function, usb1->fs_clean = UFS_FSBAD; ubh_mark_buffer_dirty(USPI_UBH(uspi)); ufs_mark_sb_dirty(sb); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } va_start(args, fmt); vaf.fmt = fmt; @@ -320,7 +320,7 @@ void ufs_panic (struct super_block * sb, const char * function, va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; pr_crit("panic (device %s): %s: %pV\n", sb->s_id, function, &vaf); va_end(args); @@ -905,7 +905,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) if (!sb_rdonly(sb)) { if (!silent) pr_info("ufstype=old is supported read-only\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } break; @@ -921,7 +921,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) if (!sb_rdonly(sb)) { if (!silent) pr_info("ufstype=nextstep is supported read-only\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } break; @@ -937,7 +937,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) if (!sb_rdonly(sb)) { if (!silent) pr_info("ufstype=nextstep-cd is supported read-only\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } break; @@ -953,7 +953,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) if (!sb_rdonly(sb)) { if (!silent) pr_info("ufstype=openstep is supported read-only\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } break; @@ -968,7 +968,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) if (!sb_rdonly(sb)) { if (!silent) pr_info("ufstype=hp is supported read-only\n"); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } break; default: @@ -1125,21 +1125,21 @@ magic_found: break; case UFS_FSACTIVE: pr_err("%s(): fs is active\n", __func__); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; break; case UFS_FSBAD: pr_err("%s(): fs is bad\n", __func__); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; break; default: pr_err("%s(): can't grok fs_clean 0x%x\n", __func__, usb1->fs_clean); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; break; } } else { pr_err("%s(): fs needs fsck\n", __func__); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } /* @@ -1328,7 +1328,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) return -EINVAL; } - if ((bool)(*mount_flags & MS_RDONLY) == sb_rdonly(sb)) { + if ((bool)(*mount_flags & SB_RDONLY) == sb_rdonly(sb)) { UFS_SB(sb)->s_mount_opt = new_mount_opt; mutex_unlock(&UFS_SB(sb)->s_lock); return 0; @@ -1337,7 +1337,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) /* * fs was mouted as rw, remounting ro */ - if (*mount_flags & MS_RDONLY) { + if (*mount_flags & SB_RDONLY) { ufs_put_super_internal(sb); usb1->fs_time = cpu_to_fs32(sb, get_seconds()); if ((flags & UFS_ST_MASK) == UFS_ST_SUN @@ -1346,7 +1346,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) ufs_set_fs_state(sb, usb1, usb3, UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time)); ubh_mark_buffer_dirty (USPI_UBH(uspi)); - sb->s_flags |= MS_RDONLY; + sb->s_flags |= SB_RDONLY; } else { /* * fs was mounted as ro, remounting rw @@ -1370,7 +1370,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) mutex_unlock(&UFS_SB(sb)->s_lock); return -EPERM; } - sb->s_flags &= ~MS_RDONLY; + sb->s_flags &= ~SB_RDONLY; #endif } UFS_SB(sb)->s_mount_opt = new_mount_opt; diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c index 343a94246f5b..89bf16b4d937 100644 --- a/fs/xfs/libxfs/xfs_iext_tree.c +++ b/fs/xfs/libxfs/xfs_iext_tree.c @@ -302,7 +302,7 @@ xfs_iext_rec_cmp( xfs_fileoff_t offset) { uint64_t rec_offset = rec->lo & XFS_IEXT_STARTOFF_MASK; - u32 rec_len = rec->hi & XFS_IEXT_LENGTH_MASK; + uint32_t rec_len = rec->hi & XFS_IEXT_LENGTH_MASK; if (rec_offset > offset) return 1; @@ -850,9 +850,9 @@ static void xfs_iext_free_last_leaf( struct xfs_ifork *ifp) { - ifp->if_u1.if_root = NULL; ifp->if_height--; kmem_free(ifp->if_u1.if_root); + ifp->if_u1.if_root = NULL; } void diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 1c90ec41e9df..c79a1616b79d 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -42,11 +42,6 @@ STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int); STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int); STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int); -static inline dev_t xfs_to_linux_dev_t(xfs_dev_t dev) -{ - return MKDEV(sysv_major(dev) & 0x1ff, sysv_minor(dev)); -} - /* * Copy inode type and data and attr format specific information from the * on-disk inode to the in-core inode and fork structures. For fifos, devices, @@ -792,7 +787,8 @@ xfs_iflush_fork( case XFS_DINODE_FMT_DEV: if (iip->ili_fields & XFS_ILOG_DEV) { ASSERT(whichfork == XFS_DATA_FORK); - xfs_dinode_put_rdev(dip, sysv_encode_dev(VFS_I(ip)->i_rdev)); + xfs_dinode_put_rdev(dip, + linux_to_xfs_dev_t(VFS_I(ip)->i_rdev)); } break; diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 996f035ee205..349d9f8edb89 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -274,7 +274,7 @@ struct xfs_inode_log_format { uint64_t ilf_ino; /* inode number */ union { uint32_t ilfu_rdev; /* rdev value for dev inode*/ - u8 __pad[16]; /* unused */ + uint8_t __pad[16]; /* unused */ } ilf_u; int64_t ilf_blkno; /* blkno of inode buffer */ int32_t ilf_len; /* len of inode buffer */ @@ -295,7 +295,7 @@ struct xfs_inode_log_format_32 { uint64_t ilf_ino; /* inode number */ union { uint32_t ilfu_rdev; /* rdev value for dev inode*/ - u8 __pad[16]; /* unused */ + uint8_t __pad[16]; /* unused */ } ilf_u; int64_t ilf_blkno; /* blkno of inode buffer */ int32_t ilf_len; /* len of inode buffer */ diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 18146873a8b3..8601275cc5e6 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -44,6 +44,7 @@ #include <linux/falloc.h> #include <linux/pagevec.h> #include <linux/backing-dev.h> +#include <linux/mman.h> static const struct vm_operations_struct xfs_file_vm_ops; @@ -1045,7 +1046,11 @@ __xfs_filemap_fault( xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) { - ret = dax_iomap_fault(vmf, pe_size, &xfs_iomap_ops); + pfn_t pfn; + + ret = dax_iomap_fault(vmf, pe_size, &pfn, &xfs_iomap_ops); + if (ret & VM_FAULT_NEEDDSYNC) + ret = dax_finish_sync_fault(vmf, pe_size, pfn); } else { if (write_fault) ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops); @@ -1090,37 +1095,16 @@ xfs_filemap_page_mkwrite( } /* - * pfn_mkwrite was originally inteneded to ensure we capture time stamp - * updates on write faults. In reality, it's need to serialise against - * truncate similar to page_mkwrite. Hence we cycle the XFS_MMAPLOCK_SHARED - * to ensure we serialise the fault barrier in place. + * pfn_mkwrite was originally intended to ensure we capture time stamp updates + * on write faults. In reality, it needs to serialise against truncate and + * prepare memory for writing so handle is as standard write fault. */ static int xfs_filemap_pfn_mkwrite( struct vm_fault *vmf) { - struct inode *inode = file_inode(vmf->vma->vm_file); - struct xfs_inode *ip = XFS_I(inode); - int ret = VM_FAULT_NOPAGE; - loff_t size; - - trace_xfs_filemap_pfn_mkwrite(ip); - - sb_start_pagefault(inode->i_sb); - file_update_time(vmf->vma->vm_file); - - /* check if the faulting page hasn't raced with truncate */ - xfs_ilock(ip, XFS_MMAPLOCK_SHARED); - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (vmf->pgoff >= size) - ret = VM_FAULT_SIGBUS; - else if (IS_DAX(inode)) - ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops); - xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); - sb_end_pagefault(inode->i_sb); - return ret; - + return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true); } static const struct vm_operations_struct xfs_file_vm_ops = { @@ -1136,6 +1120,13 @@ xfs_file_mmap( struct file *filp, struct vm_area_struct *vma) { + /* + * We don't support synchronous mappings for non-DAX files. At least + * until someone comes with a sensible use case. + */ + if (!IS_DAX(file_inode(filp)) && (vma->vm_flags & VM_SYNC)) + return -EOPNOTSUPP; + file_accessed(filp); vma->vm_ops = &xfs_file_vm_ops; if (IS_DAX(file_inode(filp))) @@ -1154,6 +1145,7 @@ const struct file_operations xfs_file_operations = { .compat_ioctl = xfs_file_compat_ioctl, #endif .mmap = xfs_file_mmap, + .mmap_supported_flags = MAP_SYNC, .open = xfs_file_open, .release = xfs_file_release, .fsync = xfs_file_fsync, diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index d8226f7a5dde..61d1cb7dc10d 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2357,6 +2357,7 @@ retry: */ if (ip->i_ino != inum + i) { xfs_iunlock(ip, XFS_ILOCK_EXCL); + rcu_read_unlock(); continue; } } diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 18077e2189a9..33eb4fb2e3fd 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -34,6 +34,7 @@ #include "xfs_error.h" #include "xfs_trans.h" #include "xfs_trans_space.h" +#include "xfs_inode_item.h" #include "xfs_iomap.h" #include "xfs_trace.h" #include "xfs_icache.h" @@ -1089,6 +1090,10 @@ xfs_file_iomap_begin( trace_xfs_iomap_found(ip, offset, length, 0, &imap); } + if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields + & ~XFS_ILOG_TIMESTAMP)) + iomap->flags |= IOMAP_F_DIRTY; + xfs_bmbt_to_iomap(ip, iomap, &imap); if (shared) diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 6282bfc1afa9..99562ec0de56 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -204,6 +204,16 @@ static inline kgid_t xfs_gid_to_kgid(uint32_t gid) return make_kgid(&init_user_ns, gid); } +static inline dev_t xfs_to_linux_dev_t(xfs_dev_t dev) +{ + return MKDEV(sysv_major(dev) & 0x1ff, sysv_minor(dev)); +} + +static inline xfs_dev_t linux_to_xfs_dev_t(dev_t dev) +{ + return sysv_encode_dev(dev); +} + /* * Various platform dependent calls that don't fit anywhere else */ diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 38d4227895ae..a503af96d780 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -781,17 +781,17 @@ xfs_log_mount_finish( * something to an unlinked inode, the irele won't cause * premature truncation and freeing of the inode, which results * in log recovery failure. We have to evict the unreferenced - * lru inodes after clearing MS_ACTIVE because we don't + * lru inodes after clearing SB_ACTIVE because we don't * otherwise clean up the lru if there's a subsequent failure in * xfs_mountfs, which leads to us leaking the inodes if nothing * else (e.g. quotacheck) references the inodes before the * mount failure occurs. */ - mp->m_super->s_flags |= MS_ACTIVE; + mp->m_super->s_flags |= SB_ACTIVE; error = xlog_recover_finish(mp->m_log); if (!error) xfs_log_work_queue(mp); - mp->m_super->s_flags &= ~MS_ACTIVE; + mp->m_super->s_flags &= ~SB_ACTIVE; evict_inodes(mp->m_super); /* diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index f663022353c0..5122d3021117 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -212,9 +212,9 @@ xfs_parseargs( */ if (sb_rdonly(sb)) mp->m_flags |= XFS_MOUNT_RDONLY; - if (sb->s_flags & MS_DIRSYNC) + if (sb->s_flags & SB_DIRSYNC) mp->m_flags |= XFS_MOUNT_DIRSYNC; - if (sb->s_flags & MS_SYNCHRONOUS) + if (sb->s_flags & SB_SYNCHRONOUS) mp->m_flags |= XFS_MOUNT_WSYNC; /* @@ -1312,7 +1312,7 @@ xfs_fs_remount( } /* ro -> rw */ - if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & MS_RDONLY)) { + if ((mp->m_flags & XFS_MOUNT_RDONLY) && !(*flags & SB_RDONLY)) { if (mp->m_flags & XFS_MOUNT_NORECOVERY) { xfs_warn(mp, "ro->rw transition prohibited on norecovery mount"); @@ -1368,7 +1368,7 @@ xfs_fs_remount( } /* rw -> ro */ - if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) { + if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & SB_RDONLY)) { /* Free the per-AG metadata reservation pool. */ error = xfs_fs_unreserve_ag_blocks(mp); if (error) { diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index 5f2f32408011..fcc5dfc70aa0 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h @@ -30,7 +30,7 @@ extern void xfs_qm_exit(void); #ifdef CONFIG_XFS_POSIX_ACL # define XFS_ACL_STRING "ACLs, " -# define set_posix_acl_flag(sb) ((sb)->s_flags |= MS_POSIXACL) +# define set_posix_acl_flag(sb) ((sb)->s_flags |= SB_POSIXACL) #else # define XFS_ACL_STRING # define set_posix_acl_flag(sb) do { } while (0) diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 515ba042d75c..d718a10c2271 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -654,8 +654,6 @@ DEFINE_INODE_EVENT(xfs_inode_set_cowblocks_tag); DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag); DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid); -DEFINE_INODE_EVENT(xfs_filemap_pfn_mkwrite); - TRACE_EVENT(xfs_filemap_fault, TP_PROTO(struct xfs_inode *ip, enum page_entry_size pe_size, bool write_fault), |