From ce23e640133484eebc20ca7b7668388213e11327 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 11 Apr 2016 00:48:00 -0400 Subject: ->getxattr(): pass dentry and inode as separate arguments Signed-off-by: Al Viro --- Documentation/filesystems/porting | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'Documentation') diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index f1b87d8aa2da..57bb3754a027 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -525,3 +525,9 @@ in your dentry operations instead. set_delayed_call() where it used to set *cookie. ->put_link() is gone - just give the destructor to set_delayed_call() in ->get_link(). +-- +[mandatory] + ->getxattr() and xattr_handler.get() get dentry and inode passed separately. + dentry might be yet to be attached to inode, so do _not_ use its ->d_inode + in the instances. Rationale: !@#!@# security_d_instantiate() needs to be + called before we attach dentry to inode. -- cgit v1.2.3 From 84e710da2a1dfacfc87f604869a4d22df91ce6cd Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 15 Apr 2016 00:58:55 -0400 Subject: parallel lookups machinery, part 2 We'll need to verify that there's neither a hashed nor in-lookup dentry with desired parent/name before adding to in-lookup set. One possible solution would be to hold the parent's ->d_lock through both checks, but while the in-lookup set is relatively small at any time, dcache is not. And holding the parent's ->d_lock through something like __d_lookup_rcu() would suck too badly. So we leave the parent's ->d_lock alone, which means that we watch out for the following scenario: * we verify that there's no hashed match * existing in-lookup match gets hashed by another process * we verify that there's no in-lookup matches and decide that everything's fine. Solution: per-directory kinda-sorta seqlock, bumped around the times we hash something that used to be in-lookup or move (and hash) something in place of in-lookup. Then the above would turn into * read the counter * do dcache lookup * if no matches found, check for in-lookup matches * if there had been none of those either, check if the counter has changed; repeat if it has. The "kinda-sorta" part is due to the fact that we don't have much spare space in inode. There is a spare word (shared with i_bdev/i_cdev/i_pipe), so the counter part is not a problem, but spinlock is a different story. We could use the parent's ->d_lock, and it would be less painful in terms of contention, for __d_add() it would be rather inconvenient to grab; we could do that (using lock_parent()), but... Fortunately, we can get serialization on the counter itself, and it might be a good idea in general; we can use cmpxchg() in a loop to get from even to odd and smp_store_release() from odd to even. This commit adds the counter and updating logics; the readers will be added in the next commit. Signed-off-by: Al Viro --- Documentation/filesystems/porting | 8 ++++++++ fs/dcache.c | 34 ++++++++++++++++++++++++++++++++-- fs/inode.c | 1 + include/linux/fs.h | 1 + mm/shmem.c | 3 ++- 5 files changed, 44 insertions(+), 3 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index 57bb3754a027..8810e2367fe6 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -531,3 +531,11 @@ in your dentry operations instead. dentry might be yet to be attached to inode, so do _not_ use its ->d_inode in the instances. Rationale: !@#!@# security_d_instantiate() needs to be called before we attach dentry to inode. +-- +[mandatory] + symlinks are no longer the only inodes that do *not* have i_bdev/i_cdev/ + i_pipe/i_link union zeroed out at inode eviction. As the result, you can't + assume that non-NULL value in ->i_nlink at ->destroy_inode() implies that + it's a symlink. Checking ->i_mode is really needed now. In-tree we had + to fix shmem_destroy_callback() that used to take that kind of shortcut; + watch out, since that shortcut is no longer valid. diff --git a/fs/dcache.c b/fs/dcache.c index 0f1d93866e69..10988f7e5a23 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2364,6 +2364,22 @@ void d_rehash(struct dentry * entry) } EXPORT_SYMBOL(d_rehash); +static inline unsigned start_dir_add(struct inode *dir) +{ + + for (;;) { + unsigned n = dir->i_dir_seq; + if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n) + return n; + cpu_relax(); + } +} + +static inline void end_dir_add(struct inode *dir, unsigned n) +{ + smp_store_release(&dir->i_dir_seq, n + 2); +} + void __d_lookup_done(struct dentry *dentry) { dentry->d_flags &= ~DCACHE_PAR_LOOKUP; @@ -2375,9 +2391,14 @@ EXPORT_SYMBOL(__d_lookup_done); static inline void __d_add(struct dentry *dentry, struct inode *inode) { + struct inode *dir = NULL; + unsigned n; spin_lock(&dentry->d_lock); - if (unlikely(d_in_lookup(dentry))) + if (unlikely(d_in_lookup(dentry))) { + dir = dentry->d_parent->d_inode; + n = start_dir_add(dir); __d_lookup_done(dentry); + } if (inode) { unsigned add_flags = d_flags_for_inode(inode); hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); @@ -2387,6 +2408,8 @@ static inline void __d_add(struct dentry *dentry, struct inode *inode) __fsnotify_d_instantiate(dentry); } _d_rehash(dentry); + if (dir) + end_dir_add(dir, n); spin_unlock(&dentry->d_lock); if (inode) spin_unlock(&inode->i_lock); @@ -2616,6 +2639,8 @@ static void dentry_unlock_for_move(struct dentry *dentry, struct dentry *target) static void __d_move(struct dentry *dentry, struct dentry *target, bool exchange) { + struct inode *dir = NULL; + unsigned n; if (!dentry->d_inode) printk(KERN_WARNING "VFS: moving negative dcache entry\n"); @@ -2623,8 +2648,11 @@ static void __d_move(struct dentry *dentry, struct dentry *target, BUG_ON(d_ancestor(target, dentry)); dentry_lock_for_move(dentry, target); - if (unlikely(d_in_lookup(target))) + if (unlikely(d_in_lookup(target))) { + dir = target->d_parent->d_inode; + n = start_dir_add(dir); __d_lookup_done(target); + } write_seqcount_begin(&dentry->d_seq); write_seqcount_begin_nested(&target->d_seq, DENTRY_D_LOCK_NESTED); @@ -2674,6 +2702,8 @@ static void __d_move(struct dentry *dentry, struct dentry *target, write_seqcount_end(&target->d_seq); write_seqcount_end(&dentry->d_seq); + if (dir) + end_dir_add(dir, n); dentry_unlock_for_move(dentry, target); } diff --git a/fs/inode.c b/fs/inode.c index 4202aac99464..4b884f73214e 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -151,6 +151,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) inode->i_bdev = NULL; inode->i_cdev = NULL; inode->i_link = NULL; + inode->i_dir_seq = 0; inode->i_rdev = 0; inode->dirtied_when = 0; diff --git a/include/linux/fs.h b/include/linux/fs.h index 6d0fa9174a24..00cecc5a2f75 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -684,6 +684,7 @@ struct inode { struct block_device *i_bdev; struct cdev *i_cdev; char *i_link; + unsigned i_dir_seq; }; __u32 i_generation; diff --git a/mm/shmem.c b/mm/shmem.c index 4640699b209b..e684a9140228 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3123,7 +3123,8 @@ static struct inode *shmem_alloc_inode(struct super_block *sb) static void shmem_destroy_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - kfree(inode->i_link); + if (S_ISLNK(inode->i_mode)) + kfree(inode->i_link); kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); } -- cgit v1.2.3 From 9902af79c01a8e39bb99b922fa3eef6d4ea23d69 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 15 Apr 2016 15:08:36 -0400 Subject: parallel lookups: actual switch to rwsem ta-da! The main issue is the lack of down_write_killable(), so the places like readdir.c switched to plain inode_lock(); once killable variants of rwsem primitives appear, that'll be dealt with. lockdep side also might need more work Signed-off-by: Al Viro --- Documentation/filesystems/porting | 18 ++++++++++++++++++ fs/btrfs/ioctl.c | 18 +++++++++++------- fs/configfs/inode.c | 2 +- fs/dcache.c | 9 +++++---- fs/gfs2/ops_fstype.c | 2 +- fs/inode.c | 12 ++++++------ fs/namei.c | 4 ++-- fs/ocfs2/inode.c | 2 +- fs/overlayfs/readdir.c | 4 +++- fs/readdir.c | 7 ++++--- include/linux/fs.h | 27 +++++++++++++++++++++------ 11 files changed, 73 insertions(+), 32 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index 8810e2367fe6..1567a53857bd 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -539,3 +539,21 @@ in your dentry operations instead. it's a symlink. Checking ->i_mode is really needed now. In-tree we had to fix shmem_destroy_callback() that used to take that kind of shortcut; watch out, since that shortcut is no longer valid. +-- +[mandatory] + ->i_mutex is replaced with ->i_rwsem now. inode_lock() et.al. work as + they used to - they just take it exclusive. However, ->lookup() may be + called with parent locked shared. Its instances must not + * use d_instantiate) and d_rehash() separately - use d_add() or + d_splice_alias() instead. + * use d_rehash() alone - call d_add(new_dentry, NULL) instead. + * in the unlikely case when (read-only) access to filesystem + data structures needs exclusion for some reason, arrange it + yourself. None of the in-tree filesystems needed that. + * rely on ->d_parent and ->d_name not changing after dentry has + been fed to d_add() or d_splice_alias(). Again, none of the + in-tree instances relied upon that. + We are guaranteed that lookups of the same name in the same directory + will not happen in parallel ("same" in the sense of your ->d_compare()). + Lookups on different names in the same directory can and do happen in + parallel now. diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 5a23806ae418..0b8ba717175b 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -837,9 +837,11 @@ static noinline int btrfs_mksubvol(struct path *parent, struct dentry *dentry; int error; - error = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT); - if (error == -EINTR) - return error; + inode_lock_nested(dir, I_MUTEX_PARENT); + // XXX: should've been + // mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT); + // if (error == -EINTR) + // return error; dentry = lookup_one_len(name, parent->dentry, namelen); error = PTR_ERR(dentry); @@ -2366,9 +2368,11 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, goto out; - err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT); - if (err == -EINTR) - goto out_drop_write; + inode_lock_nested(dir, I_MUTEX_PARENT); + // XXX: should've been + // err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT); + // if (err == -EINTR) + // goto out_drop_write; dentry = lookup_one_len(vol_args->name, parent, namelen); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); @@ -2558,7 +2562,7 @@ out_dput: dput(dentry); out_unlock_dir: inode_unlock(dir); -out_drop_write: +//out_drop_write: mnt_drop_write_file(file); out: kfree(vol_args); diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 03d124ae27d7..0387968e6f47 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -156,7 +156,7 @@ static void configfs_set_inode_lock_class(struct configfs_dirent *sd, if (depth > 0) { if (depth <= ARRAY_SIZE(default_group_class)) { - lockdep_set_class(&inode->i_mutex, + lockdep_set_class(&inode->i_rwsem, &default_group_class[depth - 1]); } else { /* diff --git a/fs/dcache.c b/fs/dcache.c index 59fcffcbf096..e49ba7d1b957 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2932,7 +2932,8 @@ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2) static int __d_unalias(struct inode *inode, struct dentry *dentry, struct dentry *alias) { - struct mutex *m1 = NULL, *m2 = NULL; + struct mutex *m1 = NULL; + struct rw_semaphore *m2 = NULL; int ret = -ESTALE; /* If alias and dentry share a parent, then no extra locks required */ @@ -2943,15 +2944,15 @@ static int __d_unalias(struct inode *inode, if (!mutex_trylock(&dentry->d_sb->s_vfs_rename_mutex)) goto out_err; m1 = &dentry->d_sb->s_vfs_rename_mutex; - if (!inode_trylock(alias->d_parent->d_inode)) + if (!inode_trylock_shared(alias->d_parent->d_inode)) goto out_err; - m2 = &alias->d_parent->d_inode->i_mutex; + m2 = &alias->d_parent->d_inode->i_rwsem; out_unalias: __d_move(alias, dentry, false); ret = 0; out_err: if (m2) - mutex_unlock(m2); + up_read(m2); if (m1) mutex_unlock(m1); return ret; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index c09c63dcd7a2..45463600fb81 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -824,7 +824,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo) * i_mutex on quota files is special. Since this inode is hidden system * file, we are safe to define locking ourselves. */ - lockdep_set_class(&sdp->sd_quota_inode->i_mutex, + lockdep_set_class(&sdp->sd_quota_inode->i_rwsem, &gfs2_quota_imutex_key); error = gfs2_rindex_update(sdp); diff --git a/fs/inode.c b/fs/inode.c index 4b884f73214e..4ccbc21b30ce 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -166,8 +166,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode) spin_lock_init(&inode->i_lock); lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key); - mutex_init(&inode->i_mutex); - lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key); + init_rwsem(&inode->i_rwsem); + lockdep_set_class(&inode->i_rwsem, &sb->s_type->i_mutex_key); atomic_set(&inode->i_dio_count, 0); @@ -925,13 +925,13 @@ void lockdep_annotate_inode_mutex_key(struct inode *inode) struct file_system_type *type = inode->i_sb->s_type; /* Set new key only if filesystem hasn't already changed it */ - if (lockdep_match_class(&inode->i_mutex, &type->i_mutex_key)) { + if (lockdep_match_class(&inode->i_rwsem, &type->i_mutex_key)) { /* * ensure nobody is actually holding i_mutex */ - mutex_destroy(&inode->i_mutex); - mutex_init(&inode->i_mutex); - lockdep_set_class(&inode->i_mutex, + // mutex_destroy(&inode->i_mutex); + init_rwsem(&inode->i_rwsem); + lockdep_set_class(&inode->i_rwsem, &type->i_mutex_dir_key); } } diff --git a/fs/namei.c b/fs/namei.c index 7babb5e5f276..8249852b5fc6 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1607,7 +1607,7 @@ static struct dentry *lookup_slow(const struct qstr *name, struct inode *inode = dir->d_inode; DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); - inode_lock(inode); + inode_lock_shared(inode); /* Don't go there if it's already dead */ if (unlikely(IS_DEADDIR(inode))) goto out; @@ -1638,7 +1638,7 @@ again: } } out: - inode_unlock(inode); + inode_unlock_shared(inode); return dentry; } diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 12f4a9e9800f..0748777f2e2a 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -262,7 +262,7 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque) inode->i_ino = args->fi_ino; OCFS2_I(inode)->ip_blkno = args->fi_blkno; if (args->fi_sysfile_type != 0) - lockdep_set_class(&inode->i_mutex, + lockdep_set_class(&inode->i_rwsem, &ocfs2_sysfile_lock_key[args->fi_sysfile_type]); if (args->fi_sysfile_type == USER_QUOTA_SYSTEM_INODE || args->fi_sysfile_type == GROUP_QUOTA_SYSTEM_INODE || diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 6ec1e43a9a54..da186ee4f846 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -218,7 +218,9 @@ static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd) cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); old_cred = override_creds(override_cred); - err = mutex_lock_killable(&dir->d_inode->i_mutex); + inode_lock(dir->d_inode); + err = 0; + // XXX: err = mutex_lock_killable(&dir->d_inode->i_mutex); if (!err) { while (rdd->first_maybe_whiteout) { p = rdd->first_maybe_whiteout; diff --git a/fs/readdir.c b/fs/readdir.c index e69ef3b79787..bf583e848a1a 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -32,9 +32,10 @@ int iterate_dir(struct file *file, struct dir_context *ctx) if (res) goto out; - res = mutex_lock_killable(&inode->i_mutex); - if (res) - goto out; + inode_lock(inode); + // res = mutex_lock_killable(&inode->i_mutex); + // if (res) + // goto out; res = -ENOENT; if (!IS_DEADDIR(inode)) { diff --git a/include/linux/fs.h b/include/linux/fs.h index 00cecc5a2f75..3018f31f7aa0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -647,7 +647,7 @@ struct inode { /* Misc */ unsigned long i_state; - struct mutex i_mutex; + struct rw_semaphore i_rwsem; unsigned long dirtied_when; /* jiffies of first dirtying */ unsigned long dirtied_time_when; @@ -734,27 +734,42 @@ enum inode_i_mutex_lock_class static inline void inode_lock(struct inode *inode) { - mutex_lock(&inode->i_mutex); + down_write(&inode->i_rwsem); } static inline void inode_unlock(struct inode *inode) { - mutex_unlock(&inode->i_mutex); + up_write(&inode->i_rwsem); +} + +static inline void inode_lock_shared(struct inode *inode) +{ + down_read(&inode->i_rwsem); +} + +static inline void inode_unlock_shared(struct inode *inode) +{ + up_read(&inode->i_rwsem); } static inline int inode_trylock(struct inode *inode) { - return mutex_trylock(&inode->i_mutex); + return down_write_trylock(&inode->i_rwsem); +} + +static inline int inode_trylock_shared(struct inode *inode) +{ + return down_read_trylock(&inode->i_rwsem); } static inline int inode_is_locked(struct inode *inode) { - return mutex_is_locked(&inode->i_mutex); + return rwsem_is_locked(&inode->i_rwsem); } static inline void inode_lock_nested(struct inode *inode, unsigned subclass) { - mutex_lock_nested(&inode->i_mutex, subclass); + down_write_nested(&inode->i_rwsem, subclass); } void lock_two_nondirectories(struct inode *, struct inode*); -- cgit v1.2.3 From 6192269444ebfbfb42e23c7a6a93c76ffe4b5e51 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 20 Apr 2016 23:08:32 -0400 Subject: introduce a parallel variant of ->iterate() New method: ->iterate_shared(). Same arguments as in ->iterate(), called with the directory locked only shared. Once all filesystems switch, the old one will be gone. Signed-off-by: Al Viro --- Documentation/filesystems/porting | 18 ++++++++++++++++++ fs/coda/dir.c | 18 ++++++++++++------ fs/exportfs/expfs.c | 2 +- fs/readdir.c | 20 ++++++++++++++++---- include/linux/fs.h | 1 + 5 files changed, 48 insertions(+), 11 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index 1567a53857bd..12c57abdaac9 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -557,3 +557,21 @@ in your dentry operations instead. will not happen in parallel ("same" in the sense of your ->d_compare()). Lookups on different names in the same directory can and do happen in parallel now. +-- +[recommended] + ->iterate_shared() is added; it's a parallel variant of ->iterate(). + Exclusion on struct file level is still provided (as well as that + between it and lseek on the same struct file), but if your directory + has been opened several times, you can get these called in parallel. + Exclusion between that method and all directory-modifying ones is + still provided, of course. + + Often enough ->iterate() can serve as ->iterate_shared() without any + changes - it is a read-only operation, after all. If you have any + per-inode or per-dentry in-core data structures modified by ->iterate(), + you might need something to serialize the access to them. If you + do dcache pre-seeding, you'll need to switch to d_alloc_parallel() for + that; look for in-tree examples. + + Old method is only used if the new one is absent; eventually it will + be removed. Switch while you still can; the old one won't stay. diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 42e731b8c80a..6fb8672c0892 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -424,16 +424,22 @@ static int coda_readdir(struct file *coda_file, struct dir_context *ctx) BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); host_file = cfi->cfi_container; - if (host_file->f_op->iterate) { + if (host_file->f_op->iterate || host_file->f_op->iterate_shared) { struct inode *host_inode = file_inode(host_file); - - inode_lock(host_inode); ret = -ENOENT; if (!IS_DEADDIR(host_inode)) { - ret = host_file->f_op->iterate(host_file, ctx); - file_accessed(host_file); + if (host_file->f_op->iterate_shared) { + inode_lock_shared(host_inode); + ret = host_file->f_op->iterate_shared(host_file, ctx); + file_accessed(host_file); + inode_unlock_shared(host_inode); + } else { + inode_lock(host_inode); + ret = host_file->f_op->iterate(host_file, ctx); + file_accessed(host_file); + inode_unlock(host_inode); + } } - inode_unlock(host_inode); return ret; } /* Venus: we must read Venus dirents from a file */ diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 402c5caab5ca..207ba8d627ca 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -312,7 +312,7 @@ static int get_name(const struct path *path, char *name, struct dentry *child) goto out; error = -EINVAL; - if (!file->f_op->iterate) + if (!file->f_op->iterate && !file->f_op->iterate_shared) goto out_close; buffer.sequence = 0; diff --git a/fs/readdir.c b/fs/readdir.c index d7308b8f6cf7..a86c6c04b9bc 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -24,15 +24,21 @@ int iterate_dir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); + bool shared = false; int res = -ENOTDIR; - if (!file->f_op->iterate) + if (file->f_op->iterate_shared) + shared = true; + else if (!file->f_op->iterate) goto out; res = security_file_permission(file, MAY_READ); if (res) goto out; - inode_lock(inode); + if (shared) + inode_lock_shared(inode); + else + inode_lock(inode); // res = mutex_lock_killable(&inode->i_mutex); // if (res) // goto out; @@ -40,12 +46,18 @@ int iterate_dir(struct file *file, struct dir_context *ctx) res = -ENOENT; if (!IS_DEADDIR(inode)) { ctx->pos = file->f_pos; - res = file->f_op->iterate(file, ctx); + if (shared) + res = file->f_op->iterate_shared(file, ctx); + else + res = file->f_op->iterate(file, ctx); file->f_pos = ctx->pos; fsnotify_access(file); file_accessed(file); } - inode_unlock(inode); + if (shared) + inode_unlock_shared(inode); + else + inode_unlock(inode); out: return res; } diff --git a/include/linux/fs.h b/include/linux/fs.h index 3018f31f7aa0..3dc0258a2b64 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1674,6 +1674,7 @@ struct file_operations { ssize_t (*read_iter) (struct kiocb *, struct iov_iter *); ssize_t (*write_iter) (struct kiocb *, struct iov_iter *); int (*iterate) (struct file *, struct dir_context *); + int (*iterate_shared) (struct file *, struct dir_context *); unsigned int (*poll) (struct file *, struct poll_table_struct *); long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long); -- cgit v1.2.3 From 9cf843e3f47c41440367062e92ab32e59ecb6a87 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 28 Apr 2016 19:35:16 -0400 Subject: lookup_open(): lock the parent shared unless O_CREAT is given Signed-off-by: Al Viro --- Documentation/filesystems/porting | 3 +++ fs/namei.c | 12 +++++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index 12c57abdaac9..46f3bb7a02f5 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -575,3 +575,6 @@ in your dentry operations instead. Old method is only used if the new one is absent; eventually it will be removed. Switch while you still can; the old one won't stay. +-- +[mandatory] + ->atomic_open() calls without O_CREAT may happen in parallel. diff --git a/fs/namei.c b/fs/namei.c index b84e6b2e19e9..01069ddda3e3 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3084,7 +3084,7 @@ static int do_last(struct nameidata *nd, } retry_lookup: - if (op->open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) { + if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) { error = mnt_want_write(nd->path.mnt); if (!error) got_write = true; @@ -3094,9 +3094,15 @@ retry_lookup: * dropping this one anyway. */ } - inode_lock(dir->d_inode); + if (open_flag & O_CREAT) + inode_lock(dir->d_inode); + else + inode_lock_shared(dir->d_inode); error = lookup_open(nd, &path, file, op, got_write, opened); - inode_unlock(dir->d_inode); + if (open_flag & O_CREAT) + inode_unlock(dir->d_inode); + else + inode_unlock_shared(dir->d_inode); if (error <= 0) { if (error) -- cgit v1.2.3