From ce23e640133484eebc20ca7b7668388213e11327 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 11 Apr 2016 00:48:00 -0400
Subject: ->getxattr(): pass dentry and inode as separate arguments

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/porting | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'Documentation')

diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index f1b87d8aa2da..57bb3754a027 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -525,3 +525,9 @@ in your dentry operations instead.
 	set_delayed_call() where it used to set *cookie.
 	->put_link() is gone - just give the destructor to set_delayed_call()
 	in ->get_link().
+--
+[mandatory]
+	->getxattr() and xattr_handler.get() get dentry and inode passed separately.
+	dentry might be yet to be attached to inode, so do _not_ use its ->d_inode
+	in the instances.  Rationale: !@#!@# security_d_instantiate() needs to be
+	called before we attach dentry to inode.
-- 
cgit v1.2.3


From 84e710da2a1dfacfc87f604869a4d22df91ce6cd Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 15 Apr 2016 00:58:55 -0400
Subject: parallel lookups machinery, part 2

We'll need to verify that there's neither a hashed nor in-lookup
dentry with desired parent/name before adding to in-lookup set.

One possible solution would be to hold the parent's ->d_lock through
both checks, but while the in-lookup set is relatively small at any
time, dcache is not.  And holding the parent's ->d_lock through
something like __d_lookup_rcu() would suck too badly.

So we leave the parent's ->d_lock alone, which means that we watch
out for the following scenario:
	* we verify that there's no hashed match
	* existing in-lookup match gets hashed by another process
	* we verify that there's no in-lookup matches and decide
that everything's fine.

Solution: per-directory kinda-sorta seqlock, bumped around the times
we hash something that used to be in-lookup or move (and hash)
something in place of in-lookup.  Then the above would turn into
	* read the counter
	* do dcache lookup
	* if no matches found, check for in-lookup matches
	* if there had been none of those either, check if the
counter has changed; repeat if it has.

The "kinda-sorta" part is due to the fact that we don't have much spare
space in inode.  There is a spare word (shared with i_bdev/i_cdev/i_pipe),
so the counter part is not a problem, but spinlock is a different story.

We could use the parent's ->d_lock, and it would be less painful in
terms of contention, but for __d_add() it would be rather inconvenient to
grab; we could do that (using lock_parent()), but...

Fortunately, we can get serialization on the counter itself, and it
might be a good idea in general; we can use cmpxchg() in a loop to
get from even to odd and smp_store_release() from odd to even.

This commit adds the counter and the update logic; the readers will be
added in the next commit.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/porting |  8 ++++++++
 fs/dcache.c                       | 34 ++++++++++++++++++++++++++++++++--
 fs/inode.c                        |  1 +
 include/linux/fs.h                |  1 +
 mm/shmem.c                        |  3 ++-
 5 files changed, 44 insertions(+), 3 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index 57bb3754a027..8810e2367fe6 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -531,3 +531,11 @@ in your dentry operations instead.
 	dentry might be yet to be attached to inode, so do _not_ use its ->d_inode
 	in the instances.  Rationale: !@#!@# security_d_instantiate() needs to be
 	called before we attach dentry to inode.
+--
+[mandatory]
+	symlinks are no longer the only inodes that do *not* have i_bdev/i_cdev/
+	i_pipe/i_link union zeroed out at inode eviction.  As the result, you can't
+	assume that non-NULL value in ->i_link at ->destroy_inode() implies that
+	it's a symlink.  Checking ->i_mode is really needed now.  In-tree we had
+	to fix shmem_destroy_callback() that used to take that kind of shortcut;
+	watch out, since that shortcut is no longer valid.
diff --git a/fs/dcache.c b/fs/dcache.c
index 0f1d93866e69..10988f7e5a23 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2364,6 +2364,22 @@ void d_rehash(struct dentry * entry)
 }
 EXPORT_SYMBOL(d_rehash);
 
+static inline unsigned start_dir_add(struct inode *dir)
+{
+
+	for (;;) {
+		unsigned n = dir->i_dir_seq;
+		if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n)
+			return n;
+		cpu_relax();
+	}
+}
+
+static inline void end_dir_add(struct inode *dir, unsigned n)
+{
+	smp_store_release(&dir->i_dir_seq, n + 2);
+}
+
 void __d_lookup_done(struct dentry *dentry)
 {
 	dentry->d_flags &= ~DCACHE_PAR_LOOKUP;
@@ -2375,9 +2391,14 @@ EXPORT_SYMBOL(__d_lookup_done);
 
 static inline void __d_add(struct dentry *dentry, struct inode *inode)
 {
+	struct inode *dir = NULL;
+	unsigned n;
 	spin_lock(&dentry->d_lock);
-	if (unlikely(d_in_lookup(dentry)))
+	if (unlikely(d_in_lookup(dentry))) {
+		dir = dentry->d_parent->d_inode;
+		n = start_dir_add(dir);
 		__d_lookup_done(dentry);
+	}
 	if (inode) {
 		unsigned add_flags = d_flags_for_inode(inode);
 		hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
@@ -2387,6 +2408,8 @@ static inline void __d_add(struct dentry *dentry, struct inode *inode)
 		__fsnotify_d_instantiate(dentry);
 	}
 	_d_rehash(dentry);
+	if (dir)
+		end_dir_add(dir, n);
 	spin_unlock(&dentry->d_lock);
 	if (inode)
 		spin_unlock(&inode->i_lock);
@@ -2616,6 +2639,8 @@ static void dentry_unlock_for_move(struct dentry *dentry, struct dentry *target)
 static void __d_move(struct dentry *dentry, struct dentry *target,
 		     bool exchange)
 {
+	struct inode *dir = NULL;
+	unsigned n;
 	if (!dentry->d_inode)
 		printk(KERN_WARNING "VFS: moving negative dcache entry\n");
 
@@ -2623,8 +2648,11 @@ static void __d_move(struct dentry *dentry, struct dentry *target,
 	BUG_ON(d_ancestor(target, dentry));
 
 	dentry_lock_for_move(dentry, target);
-	if (unlikely(d_in_lookup(target)))
+	if (unlikely(d_in_lookup(target))) {
+		dir = target->d_parent->d_inode;
+		n = start_dir_add(dir);
 		__d_lookup_done(target);
+	}
 
 	write_seqcount_begin(&dentry->d_seq);
 	write_seqcount_begin_nested(&target->d_seq, DENTRY_D_LOCK_NESTED);
@@ -2674,6 +2702,8 @@ static void __d_move(struct dentry *dentry, struct dentry *target,
 	write_seqcount_end(&target->d_seq);
 	write_seqcount_end(&dentry->d_seq);
 
+	if (dir)
+		end_dir_add(dir, n);
 	dentry_unlock_for_move(dentry, target);
 }
 
diff --git a/fs/inode.c b/fs/inode.c
index 4202aac99464..4b884f73214e 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -151,6 +151,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
 	inode->i_bdev = NULL;
 	inode->i_cdev = NULL;
 	inode->i_link = NULL;
+	inode->i_dir_seq = 0;
 	inode->i_rdev = 0;
 	inode->dirtied_when = 0;
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6d0fa9174a24..00cecc5a2f75 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -684,6 +684,7 @@ struct inode {
 		struct block_device	*i_bdev;
 		struct cdev		*i_cdev;
 		char			*i_link;
+		unsigned		i_dir_seq;
 	};
 
 	__u32			i_generation;
diff --git a/mm/shmem.c b/mm/shmem.c
index 4640699b209b..e684a9140228 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3123,7 +3123,8 @@ static struct inode *shmem_alloc_inode(struct super_block *sb)
 static void shmem_destroy_callback(struct rcu_head *head)
 {
 	struct inode *inode = container_of(head, struct inode, i_rcu);
-	kfree(inode->i_link);
+	if (S_ISLNK(inode->i_mode))
+		kfree(inode->i_link);
 	kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
 }
 
-- 
cgit v1.2.3


From 9902af79c01a8e39bb99b922fa3eef6d4ea23d69 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 15 Apr 2016 15:08:36 -0400
Subject: parallel lookups: actual switch to rwsem

ta-da!

The main issue is the lack of down_write_killable(), so the places
like readdir.c switched to plain inode_lock(); once killable
variants of rwsem primitives appear, that'll be dealt with.

lockdep side also might need more work

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/porting | 18 ++++++++++++++++++
 fs/btrfs/ioctl.c                  | 18 +++++++++++-------
 fs/configfs/inode.c               |  2 +-
 fs/dcache.c                       |  9 +++++----
 fs/gfs2/ops_fstype.c              |  2 +-
 fs/inode.c                        | 12 ++++++------
 fs/namei.c                        |  4 ++--
 fs/ocfs2/inode.c                  |  2 +-
 fs/overlayfs/readdir.c            |  4 +++-
 fs/readdir.c                      |  7 ++++---
 include/linux/fs.h                | 27 +++++++++++++++++++++------
 11 files changed, 73 insertions(+), 32 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index 8810e2367fe6..1567a53857bd 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -539,3 +539,21 @@ in your dentry operations instead.
 	it's a symlink.  Checking ->i_mode is really needed now.  In-tree we had
 	to fix shmem_destroy_callback() that used to take that kind of shortcut;
 	watch out, since that shortcut is no longer valid.
+--
+[mandatory]
+	->i_mutex is replaced with ->i_rwsem now.  inode_lock() et al. work as
+	they used to - they just take it exclusive.  However, ->lookup() may be
+	called with parent locked shared.  Its instances must not
+		* use d_instantiate() and d_rehash() separately - use d_add() or
+		  d_splice_alias() instead.
+		* use d_rehash() alone - call d_add(new_dentry, NULL) instead.
+		* in the unlikely case when (read-only) access to filesystem
+		  data structures needs exclusion for some reason, arrange it
+		  yourself.  None of the in-tree filesystems needed that.
+		* rely on ->d_parent and ->d_name not changing after dentry has
+		  been fed to d_add() or d_splice_alias().  Again, none of the
+		  in-tree instances relied upon that.
+	We are guaranteed that lookups of the same name in the same directory
+	will not happen in parallel ("same" in the sense of your ->d_compare()).
+	Lookups on different names in the same directory can and do happen in
+	parallel now.
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 5a23806ae418..0b8ba717175b 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -837,9 +837,11 @@ static noinline int btrfs_mksubvol(struct path *parent,
 	struct dentry *dentry;
 	int error;
 
-	error = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT);
-	if (error == -EINTR)
-		return error;
+	inode_lock_nested(dir, I_MUTEX_PARENT);
+	// XXX: should've been
+	// mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT);
+	// if (error == -EINTR)
+	//	return error;
 
 	dentry = lookup_one_len(name, parent->dentry, namelen);
 	error = PTR_ERR(dentry);
@@ -2366,9 +2368,11 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 		goto out;
 
 
-	err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT);
-	if (err == -EINTR)
-		goto out_drop_write;
+	inode_lock_nested(dir, I_MUTEX_PARENT);
+	// XXX: should've been
+	// err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT);
+	// if (err == -EINTR)
+	//	goto out_drop_write;
 	dentry = lookup_one_len(vol_args->name, parent, namelen);
 	if (IS_ERR(dentry)) {
 		err = PTR_ERR(dentry);
@@ -2558,7 +2562,7 @@ out_dput:
 	dput(dentry);
 out_unlock_dir:
 	inode_unlock(dir);
-out_drop_write:
+//out_drop_write:
 	mnt_drop_write_file(file);
 out:
 	kfree(vol_args);
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 03d124ae27d7..0387968e6f47 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -156,7 +156,7 @@ static void configfs_set_inode_lock_class(struct configfs_dirent *sd,
 
 	if (depth > 0) {
 		if (depth <= ARRAY_SIZE(default_group_class)) {
-			lockdep_set_class(&inode->i_mutex,
+			lockdep_set_class(&inode->i_rwsem,
 					  &default_group_class[depth - 1]);
 		} else {
 			/*
diff --git a/fs/dcache.c b/fs/dcache.c
index 59fcffcbf096..e49ba7d1b957 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2932,7 +2932,8 @@ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2)
 static int __d_unalias(struct inode *inode,
 		struct dentry *dentry, struct dentry *alias)
 {
-	struct mutex *m1 = NULL, *m2 = NULL;
+	struct mutex *m1 = NULL;
+	struct rw_semaphore *m2 = NULL;
 	int ret = -ESTALE;
 
 	/* If alias and dentry share a parent, then no extra locks required */
@@ -2943,15 +2944,15 @@ static int __d_unalias(struct inode *inode,
 	if (!mutex_trylock(&dentry->d_sb->s_vfs_rename_mutex))
 		goto out_err;
 	m1 = &dentry->d_sb->s_vfs_rename_mutex;
-	if (!inode_trylock(alias->d_parent->d_inode))
+	if (!inode_trylock_shared(alias->d_parent->d_inode))
 		goto out_err;
-	m2 = &alias->d_parent->d_inode->i_mutex;
+	m2 = &alias->d_parent->d_inode->i_rwsem;
 out_unalias:
 	__d_move(alias, dentry, false);
 	ret = 0;
 out_err:
 	if (m2)
-		mutex_unlock(m2);
+		up_read(m2);
 	if (m1)
 		mutex_unlock(m1);
 	return ret;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index c09c63dcd7a2..45463600fb81 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -824,7 +824,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
 	 * i_mutex on quota files is special. Since this inode is hidden system
 	 * file, we are safe to define locking ourselves.
 	 */
-	lockdep_set_class(&sdp->sd_quota_inode->i_mutex,
+	lockdep_set_class(&sdp->sd_quota_inode->i_rwsem,
 			  &gfs2_quota_imutex_key);
 
 	error = gfs2_rindex_update(sdp);
diff --git a/fs/inode.c b/fs/inode.c
index 4b884f73214e..4ccbc21b30ce 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -166,8 +166,8 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
 	spin_lock_init(&inode->i_lock);
 	lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
 
-	mutex_init(&inode->i_mutex);
-	lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key);
+	init_rwsem(&inode->i_rwsem);
+	lockdep_set_class(&inode->i_rwsem, &sb->s_type->i_mutex_key);
 
 	atomic_set(&inode->i_dio_count, 0);
 
@@ -925,13 +925,13 @@ void lockdep_annotate_inode_mutex_key(struct inode *inode)
 		struct file_system_type *type = inode->i_sb->s_type;
 
 		/* Set new key only if filesystem hasn't already changed it */
-		if (lockdep_match_class(&inode->i_mutex, &type->i_mutex_key)) {
+		if (lockdep_match_class(&inode->i_rwsem, &type->i_mutex_key)) {
 			/*
 			 * ensure nobody is actually holding i_mutex
 			 */
-			mutex_destroy(&inode->i_mutex);
-			mutex_init(&inode->i_mutex);
-			lockdep_set_class(&inode->i_mutex,
+			// mutex_destroy(&inode->i_mutex);
+			init_rwsem(&inode->i_rwsem);
+			lockdep_set_class(&inode->i_rwsem,
 					  &type->i_mutex_dir_key);
 		}
 	}
diff --git a/fs/namei.c b/fs/namei.c
index 7babb5e5f276..8249852b5fc6 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1607,7 +1607,7 @@ static struct dentry *lookup_slow(const struct qstr *name,
 	struct inode *inode = dir->d_inode;
 	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
 
-	inode_lock(inode);
+	inode_lock_shared(inode);
 	/* Don't go there if it's already dead */
 	if (unlikely(IS_DEADDIR(inode)))
 		goto out;
@@ -1638,7 +1638,7 @@ again:
 		}
 	}
 out:
-	inode_unlock(inode);
+	inode_unlock_shared(inode);
 	return dentry;
 }
 
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 12f4a9e9800f..0748777f2e2a 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -262,7 +262,7 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
 	inode->i_ino = args->fi_ino;
 	OCFS2_I(inode)->ip_blkno = args->fi_blkno;
 	if (args->fi_sysfile_type != 0)
-		lockdep_set_class(&inode->i_mutex,
+		lockdep_set_class(&inode->i_rwsem,
 			&ocfs2_sysfile_lock_key[args->fi_sysfile_type]);
 	if (args->fi_sysfile_type == USER_QUOTA_SYSTEM_INODE ||
 	    args->fi_sysfile_type == GROUP_QUOTA_SYSTEM_INODE ||
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 6ec1e43a9a54..da186ee4f846 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -218,7 +218,9 @@ static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd)
 	cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE);
 	old_cred = override_creds(override_cred);
 
-	err = mutex_lock_killable(&dir->d_inode->i_mutex);
+	inode_lock(dir->d_inode);
+	err = 0;
+	// XXX: err = mutex_lock_killable(&dir->d_inode->i_mutex);
 	if (!err) {
 		while (rdd->first_maybe_whiteout) {
 			p = rdd->first_maybe_whiteout;
diff --git a/fs/readdir.c b/fs/readdir.c
index e69ef3b79787..bf583e848a1a 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -32,9 +32,10 @@ int iterate_dir(struct file *file, struct dir_context *ctx)
 	if (res)
 		goto out;
 
-	res = mutex_lock_killable(&inode->i_mutex);
-	if (res)
-		goto out;
+	inode_lock(inode);
+	// res = mutex_lock_killable(&inode->i_mutex);
+	// if (res)
+	//	goto out;
 
 	res = -ENOENT;
 	if (!IS_DEADDIR(inode)) {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 00cecc5a2f75..3018f31f7aa0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -647,7 +647,7 @@ struct inode {
 
 	/* Misc */
 	unsigned long		i_state;
-	struct mutex		i_mutex;
+	struct rw_semaphore	i_rwsem;
 
 	unsigned long		dirtied_when;	/* jiffies of first dirtying */
 	unsigned long		dirtied_time_when;
@@ -734,27 +734,42 @@ enum inode_i_mutex_lock_class
 
 static inline void inode_lock(struct inode *inode)
 {
-	mutex_lock(&inode->i_mutex);
+	down_write(&inode->i_rwsem);
 }
 
 static inline void inode_unlock(struct inode *inode)
 {
-	mutex_unlock(&inode->i_mutex);
+	up_write(&inode->i_rwsem);
+}
+
+static inline void inode_lock_shared(struct inode *inode)
+{
+	down_read(&inode->i_rwsem);
+}
+
+static inline void inode_unlock_shared(struct inode *inode)
+{
+	up_read(&inode->i_rwsem);
 }
 
 static inline int inode_trylock(struct inode *inode)
 {
-	return mutex_trylock(&inode->i_mutex);
+	return down_write_trylock(&inode->i_rwsem);
+}
+
+static inline int inode_trylock_shared(struct inode *inode)
+{
+	return down_read_trylock(&inode->i_rwsem);
 }
 
 static inline int inode_is_locked(struct inode *inode)
 {
-	return mutex_is_locked(&inode->i_mutex);
+	return rwsem_is_locked(&inode->i_rwsem);
 }
 
 static inline void inode_lock_nested(struct inode *inode, unsigned subclass)
 {
-	mutex_lock_nested(&inode->i_mutex, subclass);
+	down_write_nested(&inode->i_rwsem, subclass);
 }
 
 void lock_two_nondirectories(struct inode *, struct inode*);
-- 
cgit v1.2.3


From 6192269444ebfbfb42e23c7a6a93c76ffe4b5e51 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 20 Apr 2016 23:08:32 -0400
Subject: introduce a parallel variant of ->iterate()

New method: ->iterate_shared().  Same arguments as in ->iterate(),
called with the directory locked only shared.  Once all filesystems
switch, the old one will be gone.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/porting | 18 ++++++++++++++++++
 fs/coda/dir.c                     | 18 ++++++++++++------
 fs/exportfs/expfs.c               |  2 +-
 fs/readdir.c                      | 20 ++++++++++++++++----
 include/linux/fs.h                |  1 +
 5 files changed, 48 insertions(+), 11 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index 1567a53857bd..12c57abdaac9 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -557,3 +557,21 @@ in your dentry operations instead.
 	will not happen in parallel ("same" in the sense of your ->d_compare()).
 	Lookups on different names in the same directory can and do happen in
 	parallel now.
+--
+[recommended]
+	->iterate_shared() is added; it's a parallel variant of ->iterate().
+	Exclusion on struct file level is still provided (as well as that
+	between it and lseek on the same struct file), but if your directory
+	has been opened several times, you can get these called in parallel.
+	Exclusion between that method and all directory-modifying ones is
+	still provided, of course.
+
+	Often enough ->iterate() can serve as ->iterate_shared() without any
+	changes - it is a read-only operation, after all.  If you have any
+	per-inode or per-dentry in-core data structures modified by ->iterate(),
+	you might need something to serialize the access to them.  If you
+	do dcache pre-seeding, you'll need to switch to d_alloc_parallel() for
+	that; look for in-tree examples.
+
+	Old method is only used if the new one is absent; eventually it will
+	be removed.  Switch while you still can; the old one won't stay.
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 42e731b8c80a..6fb8672c0892 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -424,16 +424,22 @@ static int coda_readdir(struct file *coda_file, struct dir_context *ctx)
 	BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
 	host_file = cfi->cfi_container;
 
-	if (host_file->f_op->iterate) {
+	if (host_file->f_op->iterate || host_file->f_op->iterate_shared) {
 		struct inode *host_inode = file_inode(host_file);
-
-		inode_lock(host_inode);
 		ret = -ENOENT;
 		if (!IS_DEADDIR(host_inode)) {
-			ret = host_file->f_op->iterate(host_file, ctx);
-			file_accessed(host_file);
+			if (host_file->f_op->iterate_shared) {
+				inode_lock_shared(host_inode);
+				ret = host_file->f_op->iterate_shared(host_file, ctx);
+				file_accessed(host_file);
+				inode_unlock_shared(host_inode);
+			} else {
+				inode_lock(host_inode);
+				ret = host_file->f_op->iterate(host_file, ctx);
+				file_accessed(host_file);
+				inode_unlock(host_inode);
+			}
 		}
-		inode_unlock(host_inode);
 		return ret;
 	}
 	/* Venus: we must read Venus dirents from a file */
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 402c5caab5ca..207ba8d627ca 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -312,7 +312,7 @@ static int get_name(const struct path *path, char *name, struct dentry *child)
 		goto out;
 
 	error = -EINVAL;
-	if (!file->f_op->iterate)
+	if (!file->f_op->iterate && !file->f_op->iterate_shared)
 		goto out_close;
 
 	buffer.sequence = 0;
diff --git a/fs/readdir.c b/fs/readdir.c
index d7308b8f6cf7..a86c6c04b9bc 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -24,15 +24,21 @@
 int iterate_dir(struct file *file, struct dir_context *ctx)
 {
 	struct inode *inode = file_inode(file);
+	bool shared = false;
 	int res = -ENOTDIR;
-	if (!file->f_op->iterate)
+	if (file->f_op->iterate_shared)
+		shared = true;
+	else if (!file->f_op->iterate)
 		goto out;
 
 	res = security_file_permission(file, MAY_READ);
 	if (res)
 		goto out;
 
-	inode_lock(inode);
+	if (shared)
+		inode_lock_shared(inode);
+	else
+		inode_lock(inode);
 	// res = mutex_lock_killable(&inode->i_mutex);
 	// if (res)
 	//	goto out;
@@ -40,12 +46,18 @@ int iterate_dir(struct file *file, struct dir_context *ctx)
 	res = -ENOENT;
 	if (!IS_DEADDIR(inode)) {
 		ctx->pos = file->f_pos;
-		res = file->f_op->iterate(file, ctx);
+		if (shared)
+			res = file->f_op->iterate_shared(file, ctx);
+		else
+			res = file->f_op->iterate(file, ctx);
 		file->f_pos = ctx->pos;
 		fsnotify_access(file);
 		file_accessed(file);
 	}
-	inode_unlock(inode);
+	if (shared)
+		inode_unlock_shared(inode);
+	else
+		inode_unlock(inode);
 out:
 	return res;
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3018f31f7aa0..3dc0258a2b64 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1674,6 +1674,7 @@ struct file_operations {
 	ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
 	ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
 	int (*iterate) (struct file *, struct dir_context *);
+	int (*iterate_shared) (struct file *, struct dir_context *);
 	unsigned int (*poll) (struct file *, struct poll_table_struct *);
 	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
 	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
-- 
cgit v1.2.3


From 9cf843e3f47c41440367062e92ab32e59ecb6a87 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 28 Apr 2016 19:35:16 -0400
Subject: lookup_open(): lock the parent shared unless O_CREAT is given

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/porting |  3 +++
 fs/namei.c                        | 12 +++++++++---
 2 files changed, 12 insertions(+), 3 deletions(-)

(limited to 'Documentation')

diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index 12c57abdaac9..46f3bb7a02f5 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -575,3 +575,6 @@ in your dentry operations instead.
 
 	Old method is only used if the new one is absent; eventually it will
 	be removed.  Switch while you still can; the old one won't stay.
+--
+[mandatory]
+	->atomic_open() calls without O_CREAT may happen in parallel.
diff --git a/fs/namei.c b/fs/namei.c
index b84e6b2e19e9..01069ddda3e3 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3084,7 +3084,7 @@ static int do_last(struct nameidata *nd,
 	}
 
 retry_lookup:
-	if (op->open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
+	if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
 		error = mnt_want_write(nd->path.mnt);
 		if (!error)
 			got_write = true;
@@ -3094,9 +3094,15 @@ retry_lookup:
 		 * dropping this one anyway.
 		 */
 	}
-	inode_lock(dir->d_inode);
+	if (open_flag & O_CREAT)
+		inode_lock(dir->d_inode);
+	else
+		inode_lock_shared(dir->d_inode);
 	error = lookup_open(nd, &path, file, op, got_write, opened);
-	inode_unlock(dir->d_inode);
+	if (open_flag & O_CREAT)
+		inode_unlock(dir->d_inode);
+	else
+		inode_unlock_shared(dir->d_inode);
 
 	if (error <= 0) {
 		if (error)
-- 
cgit v1.2.3