From e8e1ba96b207deba1339b09983f8b29f92cb1497 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Fri, 4 Feb 2011 20:45:58 -0800
Subject: ceph: queue cap_snaps once per realm

We were forming a dirty list, and then queueing cap_snaps for each realm
_and_ its children, regardless of whether the children were already in the
dirty list.  This meant we did it twice for some realms.  Which in turn
meant we corrupted mdsc->snap_flush_list when the cap_snap was re-added to
the list it was already on, and could trigger an infinite loop.

We were also using recursion to do reach all the children, a no-no when
stack is limited.

Instead, (re)queue any children on the dirty list, avoiding processing
anything twice and avoiding any recursion.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/snap.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 39c243acd062..f40b9139e437 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -584,10 +584,14 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
 	if (lastinode)
 		iput(lastinode);
 
-	dout("queue_realm_cap_snaps %p %llx children\n", realm, realm->ino);
-	list_for_each_entry(child, &realm->children, child_item)
-		queue_realm_cap_snaps(child);
+	list_for_each_entry(child, &realm->children, child_item) {
+		dout("queue_realm_cap_snaps %p %llx queue child %p %llx\n",
+		     realm, realm->ino, child, child->ino);
+		list_del_init(&child->dirty_item);
+		list_add(&child->dirty_item, &realm->dirty_item);
+	}
 
+	list_del_init(&realm->dirty_item);
 	dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino);
 }
 
@@ -683,7 +687,9 @@ more:
 	 * queue cap snaps _after_ we've built the new snap contexts,
 	 * so that i_head_snapc can be set appropriately.
 	 */
-	list_for_each_entry(realm, &dirty_realms, dirty_item) {
+	while (!list_empty(&dirty_realms)) {
+		realm = list_first_entry(&dirty_realms, struct ceph_snap_realm,
+					 dirty_item);
 		queue_realm_cap_snaps(realm);
 	}
 
-- 
cgit v1.2.3


From 91435650c233b93e0da389db74f4b2c11c5ad2d4 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 16 Feb 2011 13:10:41 -0500
Subject: Btrfs: put ENOSPC debugging under a mount option

ENOSPC in btrfs is getting to the point where the extra debugging isn't
required.  I've put it under mount -o enospc_debug just in case someone
is having difficult problems.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       | 1 +
 fs/btrfs/extent-tree.c | 2 +-
 fs/btrfs/super.c       | 7 ++++++-
 3 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 72195378bef9..6297701bc19c 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1254,6 +1254,7 @@ struct btrfs_root {
 #define BTRFS_MOUNT_SPACE_CACHE		(1 << 12)
 #define BTRFS_MOUNT_CLEAR_CACHE		(1 << 13)
 #define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14)
+#define BTRFS_MOUNT_ENOSPC_DEBUG	 (1 << 15)
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a7aaa10c5302..d375fc04a065 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5377,7 +5377,7 @@ again:
 			       num_bytes, data, 1);
 		goto again;
 	}
-	if (ret == -ENOSPC) {
+	if (ret == -ENOSPC && btrfs_test_opt(root, ENOSPC_DEBUG)) {
 		struct btrfs_space_info *sinfo;
 
 		sinfo = __find_space_info(root->fs_info, data);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 0209b5fc772c..db0a827252bd 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -155,7 +155,8 @@ enum {
 	Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
 	Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
 	Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
-	Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, Opt_err,
+	Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
+	Opt_enospc_debug, Opt_err,
 };
 
 static match_table_t tokens = {
@@ -184,6 +185,7 @@ static match_table_t tokens = {
 	{Opt_space_cache, "space_cache"},
 	{Opt_clear_cache, "clear_cache"},
 	{Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
+	{Opt_enospc_debug, "enospc_debug"},
 	{Opt_err, NULL},
 };
 
@@ -358,6 +360,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 		case Opt_user_subvol_rm_allowed:
 			btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED);
 			break;
+		case Opt_enospc_debug:
+			btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG);
+			break;
 		case Opt_err:
 			printk(KERN_INFO "btrfs: unrecognized mount option "
 			       "'%s'\n", p);
-- 
cgit v1.2.3


From c87f08ca44e83b2c8d28f63f9c33f3a270a04bbe Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 16 Feb 2011 13:57:04 -0500
Subject: Btrfs: allow balance to explicitly allocate chunks as it relocates

Btrfs device shrinking and balancing ends up reallocating all the blocks
in order to allow COW to move them to new destinations.  It is somewhat
awkward in terms of ENOSPC because most of the enospc code is built
around the idea that some operation on a reference counted tree triggers
allocations in the non-reference counted trees.

This commit changes the balancing code to deal with enospc by trying to
allocate a new chunk.  If that allocation succeeds, we go ahead and
retry whatever failed due to enospc.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  2 ++
 fs/btrfs/extent-tree.c |  7 +++++++
 fs/btrfs/relocation.c  | 13 ++++++++++++-
 3 files changed, 21 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 6297701bc19c..28188a786da0 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2219,6 +2219,8 @@ int btrfs_error_unpin_extent_range(struct btrfs_root *root,
 				   u64 start, u64 end);
 int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
 			       u64 num_bytes);
+int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root, u64 type);
 
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d375fc04a065..100e409e9053 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -8066,6 +8066,13 @@ out:
 	return ret;
 }
 
+int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root, u64 type)
+{
+	u64 alloc_flags = get_alloc_profile(root, type);
+	return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
+}
+
 /*
  * helper to account the unused space of all the readonly block group in the
  * list. takes mirrors into account.
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 0825e4ed9447..31ade5802ae8 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3654,6 +3654,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
 	u32 item_size;
 	int ret;
 	int err = 0;
+	int progress = 0;
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -3666,9 +3667,10 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
 	}
 
 	while (1) {
+		progress++;
 		trans = btrfs_start_transaction(rc->extent_root, 0);
 		BUG_ON(IS_ERR(trans));
-
+restart:
 		if (update_backref_cache(trans, &rc->backref_cache)) {
 			btrfs_end_transaction(trans, rc->extent_root);
 			continue;
@@ -3781,6 +3783,15 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
 			}
 		}
 	}
+	if (trans && progress && err == -ENOSPC) {
+		ret = btrfs_force_chunk_alloc(trans, rc->extent_root,
+					      rc->block_group->flags);
+		if (ret == 0) {
+			err = 0;
+			progress = 0;
+			goto restart;
+		}
+	}
 
 	btrfs_release_path(rc->extent_root, path);
 	clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
-- 
cgit v1.2.3


From b4dc2b8c694ead005b828f5fb7fa1134db5b6275 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 16 Feb 2011 06:06:34 +0000
Subject: Btrfs: Fix BTRFS_IOC_SUBVOL_SETFLAGS ioctl

- Check user-specified flags correctly
- Check the inode owership
- Search root item in root tree but not fs tree

Reported-by: Dan Rosenberg <drosenberg@vsecurity.com>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ioctl.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index be2d4f6aaa5e..5fdb2abc4fa7 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1071,12 +1071,15 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
 	if (copy_from_user(&flags, arg, sizeof(flags)))
 		return -EFAULT;
 
-	if (flags & ~BTRFS_SUBVOL_CREATE_ASYNC)
+	if (flags & BTRFS_SUBVOL_CREATE_ASYNC)
 		return -EINVAL;
 
 	if (flags & ~BTRFS_SUBVOL_RDONLY)
 		return -EOPNOTSUPP;
 
+	if (!is_owner_or_cap(inode))
+		return -EACCES;
+
 	down_write(&root->fs_info->subvol_sem);
 
 	/* nothing to do */
@@ -1097,7 +1100,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
 		goto out_reset;
 	}
 
-	ret = btrfs_update_root(trans, root,
+	ret = btrfs_update_root(trans, root->fs_info->tree_root,
 				&root->root_key, &root->root_item);
 
 	btrfs_commit_transaction(trans, root);
-- 
cgit v1.2.3


From ca9b688c1c9a21635cfc8af8b68565b154185196 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 16 Feb 2011 06:06:41 +0000
Subject: Btrfs: Avoid accessing unmapped kernel address

When decompressing a chunk of data, we'll copy the data out to
a working buffer if the data is stored in more than one page,
otherwise we'll use the mapped page directly to avoid memory
copy.

In the latter case, we'll end up accessing the kernel address
after we've unmapped the page in a corner case.

Reported-by: Juan Francisco Cantero Hurtado <iam@juanfra.info>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/lzo.c | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index cc9b450399df..a178f5ebea78 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -280,6 +280,7 @@ static int lzo_decompress_biovec(struct list_head *ws,
 	unsigned long tot_out;
 	unsigned long tot_len;
 	char *buf;
+	bool may_late_unmap, need_unmap;
 
 	data_in = kmap(pages_in[0]);
 	tot_len = read_compress_length(data_in);
@@ -300,11 +301,13 @@ static int lzo_decompress_biovec(struct list_head *ws,
 
 		tot_in += in_len;
 		working_bytes = in_len;
+		may_late_unmap = need_unmap = false;
 
 		/* fast path: avoid using the working buffer */
 		if (in_page_bytes_left >= in_len) {
 			buf = data_in + in_offset;
 			bytes = in_len;
+			may_late_unmap = true;
 			goto cont;
 		}
 
@@ -329,14 +332,17 @@ cont:
 				if (working_bytes == 0 && tot_in >= tot_len)
 					break;
 
-				kunmap(pages_in[page_in_index]);
-				page_in_index++;
-				if (page_in_index >= total_pages_in) {
+				if (page_in_index + 1 >= total_pages_in) {
 					ret = -1;
-					data_in = NULL;
 					goto done;
 				}
-				data_in = kmap(pages_in[page_in_index]);
+
+				if (may_late_unmap)
+					need_unmap = true;
+				else
+					kunmap(pages_in[page_in_index]);
+
+				data_in = kmap(pages_in[++page_in_index]);
 
 				in_page_bytes_left = PAGE_CACHE_SIZE;
 				in_offset = 0;
@@ -346,6 +352,8 @@ cont:
 		out_len = lzo1x_worst_compress(PAGE_CACHE_SIZE);
 		ret = lzo1x_decompress_safe(buf, in_len, workspace->buf,
 					    &out_len);
+		if (need_unmap)
+			kunmap(pages_in[page_in_index - 1]);
 		if (ret != LZO_E_OK) {
 			printk(KERN_WARNING "btrfs decompress failed\n");
 			ret = -1;
@@ -363,8 +371,7 @@ cont:
 			break;
 	}
 done:
-	if (data_in)
-		kunmap(pages_in[page_in_index]);
+	kunmap(pages_in[page_in_index]);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 9b3517e9136824346227b7b04f8f7ea1f3a726cc Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Tue, 15 Feb 2011 18:14:25 +0000
Subject: Btrfs: make btrfs_rm_device() fail gracefully

If shrinking done as part of the online device removal fails add that
device back to the allocation list and increment the rw_devices counter.
This fixes two bugs:

1) we could have a perfectly good device out of alloc list for no good
reason;

2) in the btrfs consisting of two devices, failure in btrfs_rm_device()
could lead to a situation where it was impossible to remove any of the
devices because of the "unable to remove the only writeable device"
error.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index dadaaa8005c8..f31c33119bb6 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1337,11 +1337,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 
 	ret = btrfs_shrink_device(device, 0);
 	if (ret)
-		goto error_brelse;
+		goto error_undo;
 
 	ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
 	if (ret)
-		goto error_brelse;
+		goto error_undo;
 
 	device->in_fs_metadata = 0;
 
@@ -1415,6 +1415,13 @@ out:
 	mutex_unlock(&root->fs_info->volume_mutex);
 	mutex_unlock(&uuid_mutex);
 	return ret;
+error_undo:
+	if (device->writeable) {
+		list_add(&device->dev_alloc_list,
+			 &root->fs_info->fs_devices->alloc_list);
+		root->fs_info->fs_devices->rw_devices++;
+	}
+	goto error_brelse;
 }
 
 /*
-- 
cgit v1.2.3


From fb01aa85b8b29c1a4e1f4a28ea54175de6bf7559 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Tue, 15 Feb 2011 18:12:57 +0000
Subject: Btrfs: set FMODE_EXCL in btrfs_device->mode

This fixes a bug introduced in d4d77629, where the device added online
(and therefore initialized via btrfs_init_new_device()) would be left
with the positive bdev->bd_holders after unmount.  Since d4d77629 we no
longer OR FMODE_EXCL explicitly on blkdev_put(), set it in
btrfs_device->mode.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f31c33119bb6..94334d952280 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1639,7 +1639,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	device->dev_root = root->fs_info->dev_root;
 	device->bdev = bdev;
 	device->in_fs_metadata = 1;
-	device->mode = 0;
+	device->mode = FMODE_EXCL;
 	set_blocksize(device->bdev, 4096);
 
 	if (seeding_dev) {
-- 
cgit v1.2.3


From 9616125611ee47693186533d76e403856a36b3c8 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 16 Feb 2011 09:34:16 -0500
Subject: cifs: fix handling of scopeid in cifs_convert_address
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The code finds, the '%' sign in an ipv6 address and copies that to a
buffer allocated on the stack. It then ignores that buffer, and passes
'pct' to simple_strtoul(), which doesn't work right because we're
comparing 'endp' against a completely different string.

Fix it by passing the correct pointer. While we're at it, this is a
good candidate for conversion to strict_strtoul as well.

Cc: stable@kernel.org
Cc: David Howells <dhowells@redhat.com>
Reported-by: BjÃ¶rn JACKE <bj@sernet.de>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/netmisc.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 8d9189f64477..79f641eeda30 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -170,7 +170,7 @@ cifs_convert_address(struct sockaddr *dst, const char *src, int len)
 {
 	int rc, alen, slen;
 	const char *pct;
-	char *endp, scope_id[13];
+	char scope_id[13];
 	struct sockaddr_in *s4 = (struct sockaddr_in *) dst;
 	struct sockaddr_in6 *s6 = (struct sockaddr_in6 *) dst;
 
@@ -197,9 +197,9 @@ cifs_convert_address(struct sockaddr *dst, const char *src, int len)
 		memcpy(scope_id, pct + 1, slen);
 		scope_id[slen] = '\0';
 
-		s6->sin6_scope_id = (u32) simple_strtoul(pct, &endp, 0);
-		if (endp != scope_id + slen)
-			return 0;
+		rc = strict_strtoul(scope_id, 0,
+					(unsigned long *)&s6->sin6_scope_id);
+		rc = (rc == 0) ? 1 : 0;
 	}
 
 	return rc;
-- 
cgit v1.2.3


From 8787c7a3e0e3f1aa21856d6b6cd6880cc93497e9 Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
Date: Thu, 17 Feb 2011 18:51:24 -0600
Subject: eCryptfs: Revert "dont call lookup_one_len to avoid NULL nameidata"

This reverts commit 21edad32205e97dc7ccb81a85234c77e760364c8 and commit
93c3fe40c279f002906ad14584c30671097d4394, which fixed a regression by
the former.

Al Viro pointed out bypassed dcache lookups in
ecryptfs_new_lower_dentry(), misuse of vfs_path_lookup() in
ecryptfs_lookup_one_lower() and a dislike of passing nameidata to the
lower filesystem.

Reported-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
---
 fs/ecryptfs/inode.c | 106 ++++++----------------------------------------------
 1 file changed, 12 insertions(+), 94 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index bd33f87a1907..fc44823fea3a 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -348,75 +348,6 @@ out:
 	return rc;
 }
 
-/**
- * ecryptfs_new_lower_dentry
- * @name: The name of the new dentry.
- * @lower_dir_dentry: Parent directory of the new dentry.
- * @nd: nameidata from last lookup.
- *
- * Create a new dentry or get it from lower parent dir.
- */
-static struct dentry *
-ecryptfs_new_lower_dentry(struct qstr *name, struct dentry *lower_dir_dentry,
-			  struct nameidata *nd)
-{
-	struct dentry *new_dentry;
-	struct dentry *tmp;
-	struct inode *lower_dir_inode;
-
-	lower_dir_inode = lower_dir_dentry->d_inode;
-
-	tmp = d_alloc(lower_dir_dentry, name);
-	if (!tmp)
-		return ERR_PTR(-ENOMEM);
-
-	mutex_lock(&lower_dir_inode->i_mutex);
-	new_dentry = lower_dir_inode->i_op->lookup(lower_dir_inode, tmp, nd);
-	mutex_unlock(&lower_dir_inode->i_mutex);
-
-	if (!new_dentry)
-		new_dentry = tmp;
-	else
-		dput(tmp);
-
-	return new_dentry;
-}
-
-
-/**
- * ecryptfs_lookup_one_lower
- * @ecryptfs_dentry: The eCryptfs dentry that we are looking up
- * @lower_dir_dentry: lower parent directory
- * @name: lower file name
- *
- * Get the lower dentry from vfs. If lower dentry does not exist yet,
- * create it.
- */
-static struct dentry *
-ecryptfs_lookup_one_lower(struct dentry *ecryptfs_dentry,
-			  struct dentry *lower_dir_dentry, struct qstr *name)
-{
-	struct nameidata nd;
-	struct vfsmount *lower_mnt;
-	int err;
-
-	lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(
-				    ecryptfs_dentry->d_parent));
-	err = vfs_path_lookup(lower_dir_dentry, lower_mnt, name->name , 0, &nd);
-	mntput(lower_mnt);
-
-	if (!err) {
-		/* we dont need the mount */
-		mntput(nd.path.mnt);
-		return nd.path.dentry;
-	}
-	if (err != -ENOENT)
-		return ERR_PTR(err);
-
-	/* create a new lower dentry */
-	return ecryptfs_new_lower_dentry(name, lower_dir_dentry, &nd);
-}
-
 /**
  * ecryptfs_lookup
  * @ecryptfs_dir_inode: The eCryptfs directory inode
@@ -434,7 +365,6 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
 	size_t encrypted_and_encoded_name_size;
 	struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL;
 	struct dentry *lower_dir_dentry, *lower_dentry;
-	struct qstr lower_name;
 	int rc = 0;
 
 	if ((ecryptfs_dentry->d_name.len == 1
@@ -444,20 +374,14 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
 		goto out_d_drop;
 	}
 	lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent);
-	lower_name.name = ecryptfs_dentry->d_name.name;
-	lower_name.len = ecryptfs_dentry->d_name.len;
-	lower_name.hash = ecryptfs_dentry->d_name.hash;
-	if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) {
-		rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry,
-				lower_dir_dentry->d_inode, &lower_name);
-		if (rc < 0)
-			goto out_d_drop;
-	}
-	lower_dentry = ecryptfs_lookup_one_lower(ecryptfs_dentry,
-						 lower_dir_dentry, &lower_name);
+	mutex_lock(&lower_dir_dentry->d_inode->i_mutex);
+	lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name,
+				      lower_dir_dentry,
+				      ecryptfs_dentry->d_name.len);
+	mutex_unlock(&lower_dir_dentry->d_inode->i_mutex);
 	if (IS_ERR(lower_dentry)) {
 		rc = PTR_ERR(lower_dentry);
-		ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_lower() returned "
+		ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
 				"[%d] on lower_dentry = [%s]\n", __func__, rc,
 				encrypted_and_encoded_name);
 		goto out_d_drop;
@@ -479,20 +403,14 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
 		       "filename; rc = [%d]\n", __func__, rc);
 		goto out_d_drop;
 	}
-	lower_name.name = encrypted_and_encoded_name;
-	lower_name.len = encrypted_and_encoded_name_size;
-	lower_name.hash = full_name_hash(lower_name.name, lower_name.len);
-	if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) {
-		rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry,
-				lower_dir_dentry->d_inode, &lower_name);
-		if (rc < 0)
-			goto out_d_drop;
-	}
-	lower_dentry = ecryptfs_lookup_one_lower(ecryptfs_dentry,
-						 lower_dir_dentry, &lower_name);
+	mutex_lock(&lower_dir_dentry->d_inode->i_mutex);
+	lower_dentry = lookup_one_len(encrypted_and_encoded_name,
+				      lower_dir_dentry,
+				      encrypted_and_encoded_name_size);
+	mutex_unlock(&lower_dir_dentry->d_inode->i_mutex);
 	if (IS_ERR(lower_dentry)) {
 		rc = PTR_ERR(lower_dentry);
-		ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_lower() returned "
+		ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
 				"[%d] on lower_dentry = [%s]\n", __func__, rc,
 				encrypted_and_encoded_name);
 		goto out_d_drop;
-- 
cgit v1.2.3


From 97d79b403ef03f729883246208ef5d8a2ebc4d68 Mon Sep 17 00:00:00 2001
From: Yehuda Sadeh <yehuda@hq.newdream.net>
Date: Tue, 18 Jan 2011 13:37:28 -0800
Subject: ceph: keep reference to parent inode on ceph_dentry

When creating a new dentry we now hold a reference to the parent
inode in the ceph_dentry.  This is required due to the new RCU
changes from 949854d0, which set dentry->d_parent to NULL in d_kill before
calling the ->release() callback.  If/when that behavior is changed, we can
revert this hack.

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/dir.c   | 5 ++++-
 fs/ceph/super.h | 1 +
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 562f9884a4d9..6bfaa6a4ec47 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -60,6 +60,7 @@ int ceph_init_dentry(struct dentry *dentry)
 	}
 	di->dentry = dentry;
 	di->lease_session = NULL;
+	di->parent_inode = igrab(dentry->d_parent->d_inode);
 	dentry->d_fsdata = di;
 	dentry->d_time = jiffies;
 	ceph_dentry_lru_add(dentry);
@@ -1025,7 +1026,7 @@ static void ceph_dentry_release(struct dentry *dentry)
 	u64 snapid = CEPH_NOSNAP;
 
 	if (!IS_ROOT(dentry)) {
-		parent_inode = dentry->d_parent->d_inode;
+		parent_inode = di->parent_inode;
 		if (parent_inode)
 			snapid = ceph_snap(parent_inode);
 	}
@@ -1050,6 +1051,8 @@ static void ceph_dentry_release(struct dentry *dentry)
 		kmem_cache_free(ceph_dentry_cachep, di);
 		dentry->d_fsdata = NULL;
 	}
+	if (parent_inode)
+		iput(parent_inode);
 }
 
 static int ceph_snapdir_d_revalidate(struct dentry *dentry,
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 6e0826695112..c01aa646b407 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -207,6 +207,7 @@ struct ceph_dentry_info {
 	struct dentry *dentry;
 	u64 time;
 	u64 offset;
+	struct inode *parent_inode;
 };
 
 struct ceph_inode_xattrs_info {
-- 
cgit v1.2.3


From 705773a6656bba66f2a80a44ddaacf9620df8a59 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 3 Feb 2011 14:16:19 +0100
Subject: ocfs2: Fix estimate of necessary credits for mkdir

In the rare case that INLINE_DATA, INDEX_DIR, QUOTA, XATTR features are
disabled and both the allocation of the directory inode and the allocation
of the first directory block need to relink allocation group, there need
not be enough credits reserved in a transaction. Fix the estimate.

CC: Mark Fasheh <mfasheh@suse.de>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Joel Becker <jlbec@evilplan.org>
---
 fs/ocfs2/journal.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 43e56b97f9c0..6180da1e37e6 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -405,9 +405,9 @@ static inline int ocfs2_remove_extent_credits(struct super_block *sb)
 	       ocfs2_quota_trans_credits(sb);
 }
 
-/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe +
- * bitmap block for the new bit) dx_root update for free list */
-#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2 + 1)
+/* data block for new dir/symlink, allocation of directory block, dx_root
+ * update for free list */
+#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + OCFS2_SUBALLOC_ALLOC + 1)
 
 static inline int ocfs2_add_dir_index_credits(struct super_block *sb)
 {
-- 
cgit v1.2.3


From acf3bb007e5636ef4c17505affb0974175108553 Mon Sep 17 00:00:00 2001
From: Tristan Ye <tristan.ye@oracle.com>
Date: Fri, 21 Jan 2011 18:20:18 +0800
Subject: Ocfs2/refcounttree: Fix a bug for refcounttree to writeback clusters
 in a right number.

Current refcounttree codes actually didn't writeback the new pages out in
write-back mode, due to a bug of always passing a ZERO number of clusters
to 'ocfs2_cow_sync_writeback', the patch tries to pass a proper one in.

Signed-off-by: Tristan Ye <tristan.ye@oracle.com>
Cc: stable@kernel.org
Signed-off-by: Joel Becker <jlbec@evilplan.org>
---
 fs/ocfs2/refcounttree.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index b5f9160e93e9..19ebc5aad391 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -3228,7 +3228,7 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
 					u32 num_clusters, unsigned int e_flags)
 {
 	int ret, delete, index, credits =  0;
-	u32 new_bit, new_len;
+	u32 new_bit, new_len, orig_num_clusters;
 	unsigned int set_len;
 	struct ocfs2_super *osb = OCFS2_SB(sb);
 	handle_t *handle;
@@ -3261,6 +3261,8 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
 		goto out;
 	}
 
+	orig_num_clusters = num_clusters;
+
 	while (num_clusters) {
 		ret = ocfs2_get_refcount_rec(ref_ci, context->ref_root_bh,
 					     p_cluster, num_clusters,
@@ -3348,7 +3350,8 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
 	 * in write-back mode.
 	 */
 	if (context->get_clusters == ocfs2_di_get_clusters) {
-		ret = ocfs2_cow_sync_writeback(sb, context, cpos, num_clusters);
+		ret = ocfs2_cow_sync_writeback(sb, context, cpos,
+					       orig_num_clusters);
 		if (ret)
 			mlog_errno(ret);
 	}
-- 
cgit v1.2.3


From 52c303c56c3638944b5f733e3961dc58eb8c7270 Mon Sep 17 00:00:00 2001
From: Mark Fasheh <mfasheh@suse.com>
Date: Mon, 31 Jan 2011 11:31:04 -0800
Subject: ocfs2: Check heartbeat mode for kernel stacks only

Commit 2c442719e90a44a6982c033d69df4aae4b167cfa added some checks for proper
heartbeat mode when the o2cb stack is running.  Unfortunately, it didn't
take into account that a userpsace stack could be running. Fix this by only
doing the check if o2cb is in use. This patch allows userspace stacks to
mount the fs again.

Cc: stable@kernel.org
Signed-off-by: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Joel Becker <jlbec@evilplan.org>
---
 fs/ocfs2/super.c | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 38f986d2447e..36c423fb0635 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1316,7 +1316,7 @@ static int ocfs2_parse_options(struct super_block *sb,
 			       struct mount_options *mopt,
 			       int is_remount)
 {
-	int status;
+	int status, user_stack = 0;
 	char *p;
 	u32 tmp;
 
@@ -1459,6 +1459,15 @@ static int ocfs2_parse_options(struct super_block *sb,
 			memcpy(mopt->cluster_stack, args[0].from,
 			       OCFS2_STACK_LABEL_LEN);
 			mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
+			/*
+			 * Open code the memcmp here as we don't have
+			 * an osb to pass to
+			 * ocfs2_userspace_stack().
+			 */
+			if (memcmp(mopt->cluster_stack,
+				   OCFS2_CLASSIC_CLUSTER_STACK,
+				   OCFS2_STACK_LABEL_LEN))
+				user_stack = 1;
 			break;
 		case Opt_inode64:
 			mopt->mount_opt |= OCFS2_MOUNT_INODE64;
@@ -1514,13 +1523,16 @@ static int ocfs2_parse_options(struct super_block *sb,
 		}
 	}
 
-	/* Ensure only one heartbeat mode */
-	tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL |
-				 OCFS2_MOUNT_HB_NONE);
-	if (hweight32(tmp) != 1) {
-		mlog(ML_ERROR, "Invalid heartbeat mount options\n");
-		status = 0;
-		goto bail;
+	if (user_stack == 0) {
+		/* Ensure only one heartbeat mode */
+		tmp = mopt->mount_opt & (OCFS2_MOUNT_HB_LOCAL |
+					 OCFS2_MOUNT_HB_GLOBAL |
+					 OCFS2_MOUNT_HB_NONE);
+		if (hweight32(tmp) != 1) {
+			mlog(ML_ERROR, "Invalid heartbeat mount options\n");
+			status = 0;
+			goto bail;
+		}
 	}
 
 	status = 1;
-- 
cgit v1.2.3


From 70b8902199003b098fde86d1db02e7465115a02c Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
Date: Thu, 17 Feb 2011 17:35:20 -0600
Subject: eCryptfs: Handle NULL nameidata pointers

Allow for NULL nameidata pointers in eCryptfs create, lookup, and
d_revalidate functions.

Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
---
 fs/ecryptfs/dentry.c          | 22 +++++++++++++---------
 fs/ecryptfs/ecryptfs_kernel.h |  3 +--
 fs/ecryptfs/inode.c           | 30 +++++++++++++++---------------
 3 files changed, 29 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c
index 6fc4f319b550..534c1d46e69e 100644
--- a/fs/ecryptfs/dentry.c
+++ b/fs/ecryptfs/dentry.c
@@ -46,24 +46,28 @@ static int ecryptfs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
 	struct dentry *lower_dentry;
 	struct vfsmount *lower_mnt;
-	struct dentry *dentry_save;
-	struct vfsmount *vfsmount_save;
+	struct dentry *dentry_save = NULL;
+	struct vfsmount *vfsmount_save = NULL;
 	int rc = 1;
 
-	if (nd->flags & LOOKUP_RCU)
+	if (nd && nd->flags & LOOKUP_RCU)
 		return -ECHILD;
 
 	lower_dentry = ecryptfs_dentry_to_lower(dentry);
 	lower_mnt = ecryptfs_dentry_to_lower_mnt(dentry);
 	if (!lower_dentry->d_op || !lower_dentry->d_op->d_revalidate)
 		goto out;
-	dentry_save = nd->path.dentry;
-	vfsmount_save = nd->path.mnt;
-	nd->path.dentry = lower_dentry;
-	nd->path.mnt = lower_mnt;
+	if (nd) {
+		dentry_save = nd->path.dentry;
+		vfsmount_save = nd->path.mnt;
+		nd->path.dentry = lower_dentry;
+		nd->path.mnt = lower_mnt;
+	}
 	rc = lower_dentry->d_op->d_revalidate(lower_dentry, nd);
-	nd->path.dentry = dentry_save;
-	nd->path.mnt = vfsmount_save;
+	if (nd) {
+		nd->path.dentry = dentry_save;
+		nd->path.mnt = vfsmount_save;
+	}
 	if (dentry->d_inode) {
 		struct inode *lower_inode =
 			ecryptfs_inode_to_lower(dentry->d_inode);
diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
index dbc84ed96336..e00753496e3e 100644
--- a/fs/ecryptfs/ecryptfs_kernel.h
+++ b/fs/ecryptfs/ecryptfs_kernel.h
@@ -632,8 +632,7 @@ int ecryptfs_interpose(struct dentry *hidden_dentry,
 		       u32 flags);
 int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
 					struct dentry *lower_dentry,
-					struct inode *ecryptfs_dir_inode,
-					struct nameidata *ecryptfs_nd);
+					struct inode *ecryptfs_dir_inode);
 int ecryptfs_decode_and_decrypt_filename(char **decrypted_name,
 					 size_t *decrypted_name_size,
 					 struct dentry *ecryptfs_dentry,
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index fc44823fea3a..eb0d267ee715 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -74,16 +74,20 @@ ecryptfs_create_underlying_file(struct inode *lower_dir_inode,
 	unsigned int flags_save;
 	int rc;
 
-	dentry_save = nd->path.dentry;
-	vfsmount_save = nd->path.mnt;
-	flags_save = nd->flags;
-	nd->path.dentry = lower_dentry;
-	nd->path.mnt = lower_mnt;
-	nd->flags &= ~LOOKUP_OPEN;
+	if (nd) {
+		dentry_save = nd->path.dentry;
+		vfsmount_save = nd->path.mnt;
+		flags_save = nd->flags;
+		nd->path.dentry = lower_dentry;
+		nd->path.mnt = lower_mnt;
+		nd->flags &= ~LOOKUP_OPEN;
+	}
 	rc = vfs_create(lower_dir_inode, lower_dentry, mode, nd);
-	nd->path.dentry = dentry_save;
-	nd->path.mnt = vfsmount_save;
-	nd->flags = flags_save;
+	if (nd) {
+		nd->path.dentry = dentry_save;
+		nd->path.mnt = vfsmount_save;
+		nd->flags = flags_save;
+	}
 	return rc;
 }
 
@@ -241,8 +245,7 @@ out:
  */
 int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
 					struct dentry *lower_dentry,
-					struct inode *ecryptfs_dir_inode,
-					struct nameidata *ecryptfs_nd)
+					struct inode *ecryptfs_dir_inode)
 {
 	struct dentry *lower_dir_dentry;
 	struct vfsmount *lower_mnt;
@@ -290,8 +293,6 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry,
 		goto out;
 	if (special_file(lower_inode->i_mode))
 		goto out;
-	if (!ecryptfs_nd)
-		goto out;
 	/* Released in this function */
 	page_virt = kmem_cache_zalloc(ecryptfs_header_cache_2, GFP_USER);
 	if (!page_virt) {
@@ -417,8 +418,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
 	}
 lookup_and_interpose:
 	rc = ecryptfs_lookup_and_interpose_lower(ecryptfs_dentry, lower_dentry,
-						 ecryptfs_dir_inode,
-						 ecryptfs_nd);
+						 ecryptfs_dir_inode);
 	goto out;
 out_d_drop:
 	d_drop(ecryptfs_dentry);
-- 
cgit v1.2.3


From 323ef68faf1bbd9b1e66aea268fd09d358d7e8ab Mon Sep 17 00:00:00 2001
From: Andy Whitcroft <apw@canonical.com>
Date: Wed, 16 Feb 2011 04:49:59 +0000
Subject: ecryptfs: read on a directory should return EISDIR if not supported

read() calls against a file descriptor connected to a directory are
incorrectly returning EINVAL rather than EISDIR:

  [EISDIR]
    [XSI] [Option Start] The fildes argument refers to a directory and the
    implementation does not allow the directory to be read using read()
    or pread(). The readdir() function should be used instead. [Option End]

This occurs because we do not have a .read operation defined for
ecryptfs directories.  Connect this up to generic_read_dir().

BugLink: http://bugs.launchpad.net/bugs/719691
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
---
 fs/ecryptfs/file.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 81e10e6a9443..7d1050e254f9 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -317,6 +317,7 @@ ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 
 const struct file_operations ecryptfs_dir_fops = {
 	.readdir = ecryptfs_readdir,
+	.read = generic_read_dir,
 	.unlocked_ioctl = ecryptfs_unlocked_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl = ecryptfs_compat_ioctl,
-- 
cgit v1.2.3


From 55f9cf6bbaa682958a7dd2755f883b768270c3ce Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
Date: Tue, 11 Jan 2011 12:43:42 -0600
Subject: eCryptfs: Copy up lower inode attrs in getattr

The lower filesystem may do some type of inode revalidation during a
getattr call. eCryptfs should take advantage of that by copying the
lower inode attributes to the eCryptfs inode after a call to
vfs_getattr() on the lower inode.

I originally wrote this fix while working on eCryptfs on nfsv3 support,
but discovered it also fixed an eCryptfs on ext4 nanosecond timestamp
bug that was reported.

https://bugs.launchpad.net/bugs/613873

Cc: <stable@kernel.org>
Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
---
 fs/ecryptfs/inode.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index eb0d267ee715..b592938a84bc 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -1010,6 +1010,8 @@ int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 	rc = vfs_getattr(ecryptfs_dentry_to_lower_mnt(dentry),
 			 ecryptfs_dentry_to_lower(dentry), &lower_stat);
 	if (!rc) {
+		fsstack_copy_attr_all(dentry->d_inode,
+				      ecryptfs_inode_to_lower(dentry->d_inode));
 		generic_fillattr(dentry->d_inode, stat);
 		stat->blocks = lower_stat.blocks;
 	}
-- 
cgit v1.2.3


From 5e640927a597a7c3e72b61e8bce74c22e906de65 Mon Sep 17 00:00:00 2001
From: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Date: Thu, 17 Feb 2011 14:38:31 -0600
Subject: cifs: Fix regression in LANMAN (LM) auth code

LANMAN response length was changed to 16 bytes instead of 24 bytes.
Revert it back to 24 bytes.

Signed-off-by: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
CC: stable@kernel.org
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/sess.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 1adc9625a344..16765703131b 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -656,13 +656,13 @@ ssetup_ntlmssp_authenticate:
 
 	if (type == LANMAN) {
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
-		char lnm_session_key[CIFS_SESS_KEY_SIZE];
+		char lnm_session_key[CIFS_AUTH_RESP_SIZE];
 
 		pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE;
 
 		/* no capabilities flags in old lanman negotiation */
 
-		pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE);
+		pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_AUTH_RESP_SIZE);
 
 		/* Calculate hash with password and copy into bcc_ptr.
 		 * Encryption Key (stored as in cryptkey) gets used if the
@@ -675,8 +675,8 @@ ssetup_ntlmssp_authenticate:
 					true : false, lnm_session_key);
 
 		ses->flags |= CIFS_SES_LANMAN;
-		memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_SESS_KEY_SIZE);
-		bcc_ptr += CIFS_SESS_KEY_SIZE;
+		memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_AUTH_RESP_SIZE);
+		bcc_ptr += CIFS_AUTH_RESP_SIZE;
 
 		/* can not sign if LANMAN negotiated so no need
 		to calculate signing key? but what if server
-- 
cgit v1.2.3


From eed9e8307e01d6d8d6170afcb2f00e1a471b87d4 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Mon, 21 Feb 2011 22:31:47 +0000
Subject: [CIFS] update cifs version

Update version to 1.71 so we can more easily spot modules with the last two fixes

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsfs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 4a3330235d55..a9371b6578c0 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -127,5 +127,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 extern const struct export_operations cifs_export_ops;
 #endif /* EXPERIMENTAL */
 
-#define CIFS_VERSION   "1.70"
+#define CIFS_VERSION   "1.71"
 #endif				/* _CIFSFS_H */
-- 
cgit v1.2.3


From 361821854b71fc3a53c9e17701538247bddbd4ba Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Sun, 20 Feb 2011 20:08:35 -0800
Subject: Docbook: add fs/eventfd.c and fix typos in it

Add fs/eventfd.c to filesystems docbook.
Make typo corrections in fs/eventfd.c.

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Davide Libenzi <davidel@xmailserver.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/DocBook/filesystems.tmpl |  5 +++++
 fs/eventfd.c                           | 12 ++++++------
 2 files changed, 11 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/Documentation/DocBook/filesystems.tmpl b/Documentation/DocBook/filesystems.tmpl
index 5e87ad58c0b5..f51f28531b8d 100644
--- a/Documentation/DocBook/filesystems.tmpl
+++ b/Documentation/DocBook/filesystems.tmpl
@@ -82,6 +82,11 @@
      </sect1>
   </chapter>
 
+  <chapter id="fs_events">
+     <title>Events based on file descriptors</title>
+!Efs/eventfd.c
+  </chapter>
+
   <chapter id="sysfs">
      <title>The Filesystem for Exporting Kernel Objects</title>
 !Efs/sysfs/file.c
diff --git a/fs/eventfd.c b/fs/eventfd.c
index e0194b3e14d6..d9a591773919 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -99,7 +99,7 @@ EXPORT_SYMBOL_GPL(eventfd_ctx_get);
  * @ctx: [in] Pointer to eventfd context.
  *
  * The eventfd context reference must have been previously acquired either
- * with eventfd_ctx_get() or eventfd_ctx_fdget()).
+ * with eventfd_ctx_get() or eventfd_ctx_fdget().
  */
 void eventfd_ctx_put(struct eventfd_ctx *ctx)
 {
@@ -146,9 +146,9 @@ static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
  * eventfd_ctx_remove_wait_queue - Read the current counter and removes wait queue.
  * @ctx: [in] Pointer to eventfd context.
  * @wait: [in] Wait queue to be removed.
- * @cnt: [out] Pointer to the 64bit conter value.
+ * @cnt: [out] Pointer to the 64-bit counter value.
  *
- * Returns zero if successful, or the following error codes:
+ * Returns %0 if successful, or the following error codes:
  *
  * -EAGAIN      : The operation would have blocked.
  *
@@ -175,11 +175,11 @@ EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue);
  * eventfd_ctx_read - Reads the eventfd counter or wait if it is zero.
  * @ctx: [in] Pointer to eventfd context.
  * @no_wait: [in] Different from zero if the operation should not block.
- * @cnt: [out] Pointer to the 64bit conter value.
+ * @cnt: [out] Pointer to the 64-bit counter value.
  *
- * Returns zero if successful, or the following error codes:
+ * Returns %0 if successful, or the following error codes:
  *
- * -EAGAIN      : The operation would have blocked but @no_wait was nonzero.
+ * -EAGAIN      : The operation would have blocked but @no_wait was non-zero.
  * -ERESTARTSYS : A signal interrupted the wait operation.
  *
  * If @no_wait is zero, the function might sleep until the eventfd internal
-- 
cgit v1.2.3


From 3a3675b7f23f83ca8c67c9c2b6edf707fd28d1ba Mon Sep 17 00:00:00 2001
From: Dan Rosenberg <drosenberg@vsecurity.com>
Date: Mon, 14 Feb 2011 13:45:28 +0000
Subject: xfs: prevent leaking uninitialized stack memory in FSGEOMETRY_V1

The FSGEOMETRY_V1 ioctl (and its compat equivalent) calls out to
xfs_fs_geometry() with a version number of 3.  This code path does not
fill in the logsunit member of the passed xfs_fsop_geom_t, leading to
the leaking of four bytes of uninitialized stack data to potentially
unprivileged callers.

v2 switches to memset() to avoid future issues if structure members
change, on suggestion of Dave Chinner.

Signed-off-by: Dan Rosenberg <drosenberg@vsecurity.com>
Reviewed-by: Eugene Teo <eugeneteo@kernel.org>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_fsops.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index cec89dd5d7d2..85668efb3e3e 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -53,6 +53,9 @@ xfs_fs_geometry(
 	xfs_fsop_geom_t		*geo,
 	int			new_version)
 {
+
+	memset(geo, 0, sizeof(*geo));
+
 	geo->blocksize = mp->m_sb.sb_blocksize;
 	geo->rtextsize = mp->m_sb.sb_rextsize;
 	geo->agblocks = mp->m_sb.sb_agblocks;
-- 
cgit v1.2.3


From be715140b5c3baf8ab6708060cfab80bef279d18 Mon Sep 17 00:00:00 2001
From: Lukas Czerner <lczerner@redhat.com>
Date: Tue, 15 Feb 2011 17:07:36 +0000
Subject: xfs: check if device support discard in xfs_ioc_trim()

Right now we, are relying on the fact that when we attempt to
actually do the discard, blkdev_issue_discar() returns -EOPNOTSUPP
and the user is informed that the device does not support discard.

However, in the case where the we do not hit any suitable free
extent to trim in FITRIM code, it will finish without any error.
This is very confusing, because it seems that FITRIM was successful
even though the device does not actually supports discard.

Solution: Check for the discard support before attempt to search for
free extents.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_discard.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_discard.c b/fs/xfs/linux-2.6/xfs_discard.c
index 05201ae719e5..d61611c88012 100644
--- a/fs/xfs/linux-2.6/xfs_discard.c
+++ b/fs/xfs/linux-2.6/xfs_discard.c
@@ -152,6 +152,8 @@ xfs_ioc_trim(
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -XFS_ERROR(EPERM);
+	if (!blk_queue_discard(q))
+		return -XFS_ERROR(EOPNOTSUPP);
 	if (copy_from_user(&range, urange, sizeof(range)))
 		return -XFS_ERROR(EFAULT);
 
-- 
cgit v1.2.3


From 2c9c8f36c34e1defcaa7e4c298651998b47f5282 Mon Sep 17 00:00:00 2001
From: Benny Halevy <bhalevy@panasas.com>
Date: Tue, 22 Feb 2011 14:43:22 -0800
Subject: NFSD: fix decode_cb_sequence4resok

Fix bug introduced in patch
85a56480 NFSD: Update XDR decoders in NFSv4 callback client

Although decode_cb_sequence4resok ignores highest slotid and target highest slotid
it must account for their space in their xdr stream when calling xdr_inline_decode

Cc: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Benny Halevy <bhalevy@panasas.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4callback.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index cde36cb0f348..02eb4edf0ece 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -432,7 +432,7 @@ static int decode_cb_sequence4resok(struct xdr_stream *xdr,
 	 * If the server returns different values for sessionID, slotID or
 	 * sequence number, the server is looney tunes.
 	 */
-	p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN + 4 + 4);
+	p = xdr_inline_decode(xdr, NFS4_MAX_SESSIONID_LEN + 4 + 4 + 4 + 4);
 	if (unlikely(p == NULL))
 		goto out_overflow;
 	memcpy(id.data, p, NFS4_MAX_SESSIONID_LEN);
-- 
cgit v1.2.3


From ec29ed5b407d618a8128f5942aade9e1758aa14b Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 23 Feb 2011 16:23:20 -0500
Subject: Btrfs: fix fiemap bugs with delalloc

The Btrfs fiemap code wasn't properly returning delalloc extents,
so applications that trust fiemap to decide if there are holes in the
file see holes instead of delalloc.

This reworks the btrfs fiemap code, adding a get_extent helper that
searches for delalloc ranges and also adding a helper for extent_fiemap
that skips past holes in the file.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c | 138 ++++++++++++++++++++++++++++++++++++---------------
 fs/btrfs/extent_io.h |   2 +-
 fs/btrfs/inode.c     | 126 +++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 224 insertions(+), 42 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index e7aeba242701..ff45b80d90f0 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1433,12 +1433,13 @@ int extent_clear_unlock_delalloc(struct inode *inode,
  */
 u64 count_range_bits(struct extent_io_tree *tree,
 		     u64 *start, u64 search_end, u64 max_bytes,
-		     unsigned long bits)
+		     unsigned long bits, int contig)
 {
 	struct rb_node *node;
 	struct extent_state *state;
 	u64 cur_start = *start;
 	u64 total_bytes = 0;
+	u64 last = 0;
 	int found = 0;
 
 	if (search_end <= cur_start) {
@@ -1463,7 +1464,9 @@ u64 count_range_bits(struct extent_io_tree *tree,
 		state = rb_entry(node, struct extent_state, rb_node);
 		if (state->start > search_end)
 			break;
-		if (state->end >= cur_start && (state->state & bits)) {
+		if (contig && found && state->start > last + 1)
+			break;
+		if (state->end >= cur_start && (state->state & bits) == bits) {
 			total_bytes += min(search_end, state->end) + 1 -
 				       max(cur_start, state->start);
 			if (total_bytes >= max_bytes)
@@ -1472,6 +1475,9 @@ u64 count_range_bits(struct extent_io_tree *tree,
 				*start = state->start;
 				found = 1;
 			}
+			last = state->end;
+		} else if (contig && found) {
+			break;
 		}
 		node = rb_next(node);
 		if (!node)
@@ -2912,6 +2918,46 @@ out:
 	return sector;
 }
 
+/*
+ * helper function for fiemap, which doesn't want to see any holes.
+ * This maps until we find something past 'last'
+ */
+static struct extent_map *get_extent_skip_holes(struct inode *inode,
+						u64 offset,
+						u64 last,
+						get_extent_t *get_extent)
+{
+	u64 sectorsize = BTRFS_I(inode)->root->sectorsize;
+	struct extent_map *em;
+	u64 len;
+
+	if (offset >= last)
+		return NULL;
+
+	while(1) {
+		len = last - offset;
+		if (len == 0)
+			break;
+		len = (len + sectorsize - 1) & ~(sectorsize - 1);
+		em = get_extent(inode, NULL, 0, offset, len, 0);
+		if (!em || IS_ERR(em))
+			return em;
+
+		/* if this isn't a hole return it */
+		if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) &&
+		    em->block_start != EXTENT_MAP_HOLE) {
+			return em;
+		}
+
+		/* this is a hole, advance to the next extent */
+		offset = extent_map_end(em);
+		free_extent_map(em);
+		if (offset >= last)
+			break;
+	}
+	return NULL;
+}
+
 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		__u64 start, __u64 len, get_extent_t *get_extent)
 {
@@ -2921,16 +2967,19 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	u32 flags = 0;
 	u32 found_type;
 	u64 last;
+	u64 last_for_get_extent = 0;
 	u64 disko = 0;
+	u64 isize = i_size_read(inode);
 	struct btrfs_key found_key;
 	struct extent_map *em = NULL;
 	struct extent_state *cached_state = NULL;
 	struct btrfs_path *path;
 	struct btrfs_file_extent_item *item;
 	int end = 0;
-	u64 em_start = 0, em_len = 0;
+	u64 em_start = 0;
+	u64 em_len = 0;
+	u64 em_end = 0;
 	unsigned long emflags;
-	int hole = 0;
 
 	if (len == 0)
 		return -EINVAL;
@@ -2940,6 +2989,10 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		return -ENOMEM;
 	path->leave_spinning = 1;
 
+	/*
+	 * lookup the last file extent.  We're not using i_size here
+	 * because there might be preallocation past i_size
+	 */
 	ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
 				       path, inode->i_ino, -1, 0);
 	if (ret < 0) {
@@ -2953,18 +3006,38 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
 	found_type = btrfs_key_type(&found_key);
 
-	/* No extents, just return */
+	/* No extents, but there might be delalloc bits */
 	if (found_key.objectid != inode->i_ino ||
 	    found_type != BTRFS_EXTENT_DATA_KEY) {
-		btrfs_free_path(path);
-		return 0;
+		/* have to trust i_size as the end */
+		last = (u64)-1;
+		last_for_get_extent = isize;
+	} else {
+		/*
+		 * remember the start of the last extent.  There are a
+		 * bunch of different factors that go into the length of the
+		 * extent, so its much less complex to remember where it started
+		 */
+		last = found_key.offset;
+		last_for_get_extent = last + 1;
 	}
-	last = found_key.offset;
 	btrfs_free_path(path);
 
+	/*
+	 * we might have some extents allocated but more delalloc past those
+	 * extents.  so, we trust isize unless the start of the last extent is
+	 * beyond isize
+	 */
+	if (last < isize) {
+		last = (u64)-1;
+		last_for_get_extent = isize;
+	}
+
 	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
 			 &cached_state, GFP_NOFS);
-	em = get_extent(inode, NULL, 0, off, max - off, 0);
+
+	em = get_extent_skip_holes(inode, off, last_for_get_extent,
+				   get_extent);
 	if (!em)
 		goto out;
 	if (IS_ERR(em)) {
@@ -2973,19 +3046,14 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	}
 
 	while (!end) {
-		hole = 0;
-		off = em->start + em->len;
+		off = extent_map_end(em);
 		if (off >= max)
 			end = 1;
 
-		if (em->block_start == EXTENT_MAP_HOLE) {
-			hole = 1;
-			goto next;
-		}
-
 		em_start = em->start;
 		em_len = em->len;
-
+		em_end = extent_map_end(em);
+		emflags = em->flags;
 		disko = 0;
 		flags = 0;
 
@@ -3004,37 +3072,29 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
 			flags |= FIEMAP_EXTENT_ENCODED;
 
-next:
-		emflags = em->flags;
 		free_extent_map(em);
 		em = NULL;
-		if (!end) {
-			em = get_extent(inode, NULL, 0, off, max - off, 0);
-			if (!em)
-				goto out;
-			if (IS_ERR(em)) {
-				ret = PTR_ERR(em);
-				goto out;
-			}
-			emflags = em->flags;
-		}
-
-		if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) {
+		if ((em_start >= last) || em_len == (u64)-1 ||
+		   (last == (u64)-1 && isize <= em_end)) {
 			flags |= FIEMAP_EXTENT_LAST;
 			end = 1;
 		}
 
-		if (em_start == last) {
+		/* now scan forward to see if this is really the last extent. */
+		em = get_extent_skip_holes(inode, off, last_for_get_extent,
+					   get_extent);
+		if (IS_ERR(em)) {
+			ret = PTR_ERR(em);
+			goto out;
+		}
+		if (!em) {
 			flags |= FIEMAP_EXTENT_LAST;
 			end = 1;
 		}
-
-		if (!hole) {
-			ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
-						em_len, flags);
-			if (ret)
-				goto out_free;
-		}
+		ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
+					      em_len, flags);
+		if (ret)
+			goto out_free;
 	}
 out_free:
 	free_extent_map(em);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 7083cfafd061..9318dfefd59c 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -191,7 +191,7 @@ void extent_io_exit(void);
 
 u64 count_range_bits(struct extent_io_tree *tree,
 		     u64 *start, u64 search_end,
-		     u64 max_bytes, unsigned long bits);
+		     u64 max_bytes, unsigned long bits, int contig);
 
 void free_extent_state(struct extent_state *state);
 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8d392ed73d57..44b926646e33 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1913,7 +1913,7 @@ static int btrfs_clean_io_failures(struct inode *inode, u64 start)
 
 	private = 0;
 	if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
-			     (u64)-1, 1, EXTENT_DIRTY)) {
+			     (u64)-1, 1, EXTENT_DIRTY, 0)) {
 		ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
 					start, &private_failure);
 		if (ret == 0) {
@@ -5282,6 +5282,128 @@ out:
 	return em;
 }
 
+struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
+					   size_t pg_offset, u64 start, u64 len,
+					   int create)
+{
+	struct extent_map *em;
+	struct extent_map *hole_em = NULL;
+	u64 range_start = start;
+	u64 end;
+	u64 found;
+	u64 found_end;
+	int err = 0;
+
+	em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
+	if (IS_ERR(em))
+		return em;
+	if (em) {
+		/*
+		 * if our em maps to a hole, there might
+		 * actually be delalloc bytes behind it
+		 */
+		if (em->block_start != EXTENT_MAP_HOLE)
+			return em;
+		else
+			hole_em = em;
+	}
+
+	/* check to see if we've wrapped (len == -1 or similar) */
+	end = start + len;
+	if (end < start)
+		end = (u64)-1;
+	else
+		end -= 1;
+
+	em = NULL;
+
+	/* ok, we didn't find anything, lets look for delalloc */
+	found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start,
+				 end, len, EXTENT_DELALLOC, 1);
+	found_end = range_start + found;
+	if (found_end < range_start)
+		found_end = (u64)-1;
+
+	/*
+	 * we didn't find anything useful, return
+	 * the original results from get_extent()
+	 */
+	if (range_start > end || found_end <= start) {
+		em = hole_em;
+		hole_em = NULL;
+		goto out;
+	}
+
+	/* adjust the range_start to make sure it doesn't
+	 * go backwards from the start they passed in
+	 */
+	range_start = max(start,range_start);
+	found = found_end - range_start;
+
+	if (found > 0) {
+		u64 hole_start = start;
+		u64 hole_len = len;
+
+		em = alloc_extent_map(GFP_NOFS);
+		if (!em) {
+			err = -ENOMEM;
+			goto out;
+		}
+		/*
+		 * when btrfs_get_extent can't find anything it
+		 * returns one huge hole
+		 *
+		 * make sure what it found really fits our range, and
+		 * adjust to make sure it is based on the start from
+		 * the caller
+		 */
+		if (hole_em) {
+			u64 calc_end = extent_map_end(hole_em);
+
+			if (calc_end <= start || (hole_em->start > end)) {
+				free_extent_map(hole_em);
+				hole_em = NULL;
+			} else {
+				hole_start = max(hole_em->start, start);
+				hole_len = calc_end - hole_start;
+			}
+		}
+		em->bdev = NULL;
+		if (hole_em && range_start > hole_start) {
+			/* our hole starts before our delalloc, so we
+			 * have to return just the parts of the hole
+			 * that go until  the delalloc starts
+			 */
+			em->len = min(hole_len,
+				      range_start - hole_start);
+			em->start = hole_start;
+			em->orig_start = hole_start;
+			/*
+			 * don't adjust block start at all,
+			 * it is fixed at EXTENT_MAP_HOLE
+			 */
+			em->block_start = hole_em->block_start;
+			em->block_len = hole_len;
+		} else {
+			em->start = range_start;
+			em->len = found;
+			em->orig_start = range_start;
+			em->block_start = EXTENT_MAP_DELALLOC;
+			em->block_len = found;
+		}
+	} else if (hole_em) {
+		return hole_em;
+	}
+out:
+
+	free_extent_map(hole_em);
+	if (err) {
+		free_extent_map(em);
+		return ERR_PTR(err);
+	}
+	return em;
+}
+
 static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
 						  u64 start, u64 len)
 {
@@ -6104,7 +6226,7 @@ out:
 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		__u64 start, __u64 len)
 {
-	return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent);
+	return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
 }
 
 int btrfs_readpage(struct file *file, struct page *page)
-- 
cgit v1.2.3


From 2aa15890f3c191326678f1bd68af61ec6b8753ec Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Wed, 23 Feb 2011 13:49:47 +0100
Subject: mm: prevent concurrent unmap_mapping_range() on the same inode

Michael Leun reported that running parallel opens on a fuse filesystem
can trigger a "kernel BUG at mm/truncate.c:475"

Gurudas Pai reported the same bug on NFS.

The reason is, unmap_mapping_range() is not prepared for more than
one concurrent invocation per inode.  For example:

  thread1: going through a big range, stops in the middle of a vma and
     stores the restart address in vm_truncate_count.

  thread2: comes in with a small (e.g. single page) unmap request on
     the same vma, somewhere before restart_address, finds that the
     vma was already unmapped up to the restart address and happily
     returns without doing anything.

Another scenario would be two big unmap requests, both having to
restart the unmapping and each one setting vm_truncate_count to its
own value.  This could go on forever without any of them being able to
finish.

Truncate and hole punching already serialize with i_mutex.  Other
callers of unmap_mapping_range() do not, and it's difficult to get
i_mutex protection for all callers.  In particular ->d_revalidate(),
which calls invalidate_inode_pages2_range() in fuse, may be called
with or without i_mutex.

This patch adds a new mutex to 'struct address_space' to prevent
running multiple concurrent unmap_mapping_range() on the same mapping.

[ We'll hopefully get rid of all this with the upcoming mm
  preemptibility series by Peter Zijlstra, the "mm: Remove i_mmap_mutex
  lockbreak" patch in particular.  But that is for 2.6.39 ]

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Reported-by: Michael Leun <lkml20101129@newton.leun.net>
Reported-by: Gurudas Pai <gurudas.pai@oracle.com>
Tested-by: Gurudas Pai <gurudas.pai@oracle.com>
Acked-by: Hugh Dickins <hughd@google.com>
Cc: stable@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/gfs2/main.c     |  9 +--------
 fs/inode.c         | 22 +++++++++++++++-------
 fs/nilfs2/btnode.c |  5 -----
 fs/nilfs2/btnode.h |  1 -
 fs/nilfs2/mdt.c    |  4 ++--
 fs/nilfs2/page.c   | 13 -------------
 fs/nilfs2/page.h   |  1 -
 fs/nilfs2/super.c  |  2 +-
 include/linux/fs.h |  2 ++
 mm/memory.c        |  2 ++
 10 files changed, 23 insertions(+), 38 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 85ba027d1c4d..72c31a315d96 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -59,14 +59,7 @@ static void gfs2_init_gl_aspace_once(void *foo)
 	struct address_space *mapping = (struct address_space *)(gl + 1);
 
 	gfs2_init_glock_once(gl);
-	memset(mapping, 0, sizeof(*mapping));
-	INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
-	spin_lock_init(&mapping->tree_lock);
-	spin_lock_init(&mapping->i_mmap_lock);
-	INIT_LIST_HEAD(&mapping->private_list);
-	spin_lock_init(&mapping->private_lock);
-	INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
-	INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
+	address_space_init_once(mapping);
 }
 
 /**
diff --git a/fs/inode.c b/fs/inode.c
index da85e56378f3..9c2b795ccc93 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -295,6 +295,20 @@ static void destroy_inode(struct inode *inode)
 		call_rcu(&inode->i_rcu, i_callback);
 }
 
+void address_space_init_once(struct address_space *mapping)
+{
+	memset(mapping, 0, sizeof(*mapping));
+	INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
+	spin_lock_init(&mapping->tree_lock);
+	spin_lock_init(&mapping->i_mmap_lock);
+	INIT_LIST_HEAD(&mapping->private_list);
+	spin_lock_init(&mapping->private_lock);
+	INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
+	INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
+	mutex_init(&mapping->unmap_mutex);
+}
+EXPORT_SYMBOL(address_space_init_once);
+
 /*
  * These are initializations that only need to be done
  * once, because the fields are idempotent across use
@@ -308,13 +322,7 @@ void inode_init_once(struct inode *inode)
 	INIT_LIST_HEAD(&inode->i_devices);
 	INIT_LIST_HEAD(&inode->i_wb_list);
 	INIT_LIST_HEAD(&inode->i_lru);
-	INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
-	spin_lock_init(&inode->i_data.tree_lock);
-	spin_lock_init(&inode->i_data.i_mmap_lock);
-	INIT_LIST_HEAD(&inode->i_data.private_list);
-	spin_lock_init(&inode->i_data.private_lock);
-	INIT_RAW_PRIO_TREE_ROOT(&inode->i_data.i_mmap);
-	INIT_LIST_HEAD(&inode->i_data.i_mmap_nonlinear);
+	address_space_init_once(&inode->i_data);
 	i_size_ordered_init(inode);
 #ifdef CONFIG_FSNOTIFY
 	INIT_HLIST_HEAD(&inode->i_fsnotify_marks);
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 388e9e8f5286..85f7baa15f5d 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -35,11 +35,6 @@
 #include "btnode.h"
 
 
-void nilfs_btnode_cache_init_once(struct address_space *btnc)
-{
-	nilfs_mapping_init_once(btnc);
-}
-
 static const struct address_space_operations def_btnode_aops = {
 	.sync_page		= block_sync_page,
 };
diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h
index 79037494f1e0..1b8ebd888c28 100644
--- a/fs/nilfs2/btnode.h
+++ b/fs/nilfs2/btnode.h
@@ -37,7 +37,6 @@ struct nilfs_btnode_chkey_ctxt {
 	struct buffer_head *newbh;
 };
 
-void nilfs_btnode_cache_init_once(struct address_space *);
 void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *);
 void nilfs_btnode_cache_clear(struct address_space *);
 struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc,
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 6a0e2a189f60..a0babd2bff6a 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -454,9 +454,9 @@ int nilfs_mdt_setup_shadow_map(struct inode *inode,
 	struct backing_dev_info *bdi = inode->i_sb->s_bdi;
 
 	INIT_LIST_HEAD(&shadow->frozen_buffers);
-	nilfs_mapping_init_once(&shadow->frozen_data);
+	address_space_init_once(&shadow->frozen_data);
 	nilfs_mapping_init(&shadow->frozen_data, bdi, &shadow_map_aops);
-	nilfs_mapping_init_once(&shadow->frozen_btnodes);
+	address_space_init_once(&shadow->frozen_btnodes);
 	nilfs_mapping_init(&shadow->frozen_btnodes, bdi, &shadow_map_aops);
 	mi->mi_shadow = shadow;
 	return 0;
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 0c432416cfef..a585b35fd6bc 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -492,19 +492,6 @@ unsigned nilfs_page_count_clean_buffers(struct page *page,
 	return nc;
 }
 
-void nilfs_mapping_init_once(struct address_space *mapping)
-{
-	memset(mapping, 0, sizeof(*mapping));
-	INIT_RADIX_TREE(&mapping->page_tree, GFP_ATOMIC);
-	spin_lock_init(&mapping->tree_lock);
-	INIT_LIST_HEAD(&mapping->private_list);
-	spin_lock_init(&mapping->private_lock);
-
-	spin_lock_init(&mapping->i_mmap_lock);
-	INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
-	INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
-}
-
 void nilfs_mapping_init(struct address_space *mapping,
 			struct backing_dev_info *bdi,
 			const struct address_space_operations *aops)
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index 622df27cd891..2a00953ebd5f 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -61,7 +61,6 @@ void nilfs_free_private_page(struct page *);
 int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
 void nilfs_copy_back_pages(struct address_space *, struct address_space *);
 void nilfs_clear_dirty_pages(struct address_space *);
-void nilfs_mapping_init_once(struct address_space *mapping);
 void nilfs_mapping_init(struct address_space *mapping,
 			struct backing_dev_info *bdi,
 			const struct address_space_operations *aops);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 58fd707174e1..1673b3d99842 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1279,7 +1279,7 @@ static void nilfs_inode_init_once(void *obj)
 #ifdef CONFIG_NILFS_XATTR
 	init_rwsem(&ii->xattr_sem);
 #endif
-	nilfs_btnode_cache_init_once(&ii->i_btnode_cache);
+	address_space_init_once(&ii->i_btnode_cache);
 	ii->i_bmap = &ii->i_bmap_data;
 	inode_init_once(&ii->vfs_inode);
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index bd3215940c37..97d08d8a7de8 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -649,6 +649,7 @@ struct address_space {
 	spinlock_t		private_lock;	/* for use by the address_space */
 	struct list_head	private_list;	/* ditto */
 	struct address_space	*assoc_mapping;	/* ditto */
+	struct mutex		unmap_mutex;    /* to protect unmapping */
 } __attribute__((aligned(sizeof(long))));
 	/*
 	 * On most architectures that alignment is already the case; but
@@ -2225,6 +2226,7 @@ extern loff_t vfs_llseek(struct file *file, loff_t offset, int origin);
 
 extern int inode_init_always(struct super_block *, struct inode *);
 extern void inode_init_once(struct inode *);
+extern void address_space_init_once(struct address_space *mapping);
 extern void ihold(struct inode * inode);
 extern void iput(struct inode *);
 extern struct inode * igrab(struct inode *);
diff --git a/mm/memory.c b/mm/memory.c
index 8e8c18324863..5823698c2b71 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2648,6 +2648,7 @@ void unmap_mapping_range(struct address_space *mapping,
 		details.last_index = ULONG_MAX;
 	details.i_mmap_lock = &mapping->i_mmap_lock;
 
+	mutex_lock(&mapping->unmap_mutex);
 	spin_lock(&mapping->i_mmap_lock);
 
 	/* Protect against endless unmapping loops */
@@ -2664,6 +2665,7 @@ void unmap_mapping_range(struct address_space *mapping,
 	if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
 		unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
 	spin_unlock(&mapping->i_mmap_lock);
+	mutex_unlock(&mapping->unmap_mutex);
 }
 EXPORT_SYMBOL(unmap_mapping_range);
 
-- 
cgit v1.2.3


From 93b270f76e7ef3b81001576860c2701931cdc78b Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 24 Feb 2011 17:25:47 +1100
Subject: Fix over-zealous flush_disk when changing device size.

There are two cases when we call flush_disk.
In one, the device has disappeared (check_disk_change) so any
data will hold becomes irrelevant.
In the oter, the device has changed size (check_disk_size_change)
so data we hold may be irrelevant.

In both cases it makes sense to discard any 'clean' buffers,
so they will be read back from the device if needed.

In the former case it makes sense to discard 'dirty' buffers
as there will never be anywhere safe to write the data.  In the
second case it *does*not* make sense to discard dirty buffers
as that will lead to file system corruption when you simply enlarge
the containing devices.

flush_disk calls __invalidate_devices.
__invalidate_device calls both invalidate_inodes and invalidate_bdev.

invalidate_inodes *does* discard I_DIRTY inodes and this does lead
to fs corruption.

invalidate_bev *does*not* discard dirty pages, but I don't really care
about that at present.

So this patch adds a flag to __invalidate_device (calling it
__invalidate_device2) to indicate whether dirty buffers should be
killed, and this is passed to invalidate_inodes which can choose to
skip dirty inodes.

flusk_disk then passes true from check_disk_change and false from
check_disk_size_change.

dm avoids tripping over this problem by calling i_size_write directly
rathher than using check_disk_size_change.

md does use check_disk_size_change and so is affected.

This regression was introduced by commit 608aeef17a which causes
check_disk_size_change to call flush_disk, so it is suitable for any
kernel since 2.6.27.

Cc: stable@kernel.org
Acked-by: Jeff Moyer <jmoyer@redhat.com>
Cc: Andrew Patterson <andrew.patterson@hp.com>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 block/genhd.c          |  2 +-
 drivers/block/floppy.c |  2 +-
 fs/block_dev.c         | 12 ++++++------
 fs/inode.c             |  9 ++++++++-
 fs/internal.h          |  2 +-
 include/linux/fs.h     |  2 +-
 6 files changed, 18 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/block/genhd.c b/block/genhd.c
index 6a5b772aa201..cbf1112a885c 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1355,7 +1355,7 @@ int invalidate_partition(struct gendisk *disk, int partno)
 	struct block_device *bdev = bdget_disk(disk, partno);
 	if (bdev) {
 		fsync_bdev(bdev);
-		res = __invalidate_device(bdev);
+		res = __invalidate_device(bdev, true);
 		bdput(bdev);
 	}
 	return res;
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index b9ba04fc2b34..77fc76f8aea9 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -3281,7 +3281,7 @@ static int set_geometry(unsigned int cmd, struct floppy_struct *g,
 			struct block_device *bdev = opened_bdev[cnt];
 			if (!bdev || ITYPE(drive_state[cnt].fd_device) != type)
 				continue;
-			__invalidate_device(bdev);
+			__invalidate_device(bdev, true);
 		}
 		mutex_unlock(&open_lock);
 	} else {
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 333a7bb4cb9c..5e23152d04ad 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -927,9 +927,9 @@ EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
  * when a disk has been changed -- either by a media change or online
  * resize.
  */
-static void flush_disk(struct block_device *bdev)
+static void flush_disk(struct block_device *bdev, bool kill_dirty)
 {
-	if (__invalidate_device(bdev)) {
+	if (__invalidate_device(bdev, kill_dirty)) {
 		char name[BDEVNAME_SIZE] = "";
 
 		if (bdev->bd_disk)
@@ -966,7 +966,7 @@ void check_disk_size_change(struct gendisk *disk, struct block_device *bdev)
 		       "%s: detected capacity change from %lld to %lld\n",
 		       name, bdev_size, disk_size);
 		i_size_write(bdev->bd_inode, disk_size);
-		flush_disk(bdev);
+		flush_disk(bdev, false);
 	}
 }
 EXPORT_SYMBOL(check_disk_size_change);
@@ -1019,7 +1019,7 @@ int check_disk_change(struct block_device *bdev)
 	if (!(events & DISK_EVENT_MEDIA_CHANGE))
 		return 0;
 
-	flush_disk(bdev);
+	flush_disk(bdev, true);
 	if (bdops->revalidate_disk)
 		bdops->revalidate_disk(bdev->bd_disk);
 	return 1;
@@ -1601,7 +1601,7 @@ fail:
 }
 EXPORT_SYMBOL(lookup_bdev);
 
-int __invalidate_device(struct block_device *bdev)
+int __invalidate_device(struct block_device *bdev, bool kill_dirty)
 {
 	struct super_block *sb = get_super(bdev);
 	int res = 0;
@@ -1614,7 +1614,7 @@ int __invalidate_device(struct block_device *bdev)
 		 * hold).
 		 */
 		shrink_dcache_sb(sb);
-		res = invalidate_inodes(sb);
+		res = invalidate_inodes(sb, kill_dirty);
 		drop_super(sb);
 	}
 	invalidate_bdev(bdev);
diff --git a/fs/inode.c b/fs/inode.c
index da85e56378f3..c50d7feb87b1 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -540,11 +540,14 @@ void evict_inodes(struct super_block *sb)
 /**
  * invalidate_inodes	- attempt to free all inodes on a superblock
  * @sb:		superblock to operate on
+ * @kill_dirty: flag to guide handling of dirty inodes
  *
  * Attempts to free all inodes for a given superblock.  If there were any
  * busy inodes return a non-zero value, else zero.
+ * If @kill_dirty is set, discard dirty inodes too, otherwise treat
+ * them as busy.
  */
-int invalidate_inodes(struct super_block *sb)
+int invalidate_inodes(struct super_block *sb, bool kill_dirty)
 {
 	int busy = 0;
 	struct inode *inode, *next;
@@ -556,6 +559,10 @@ int invalidate_inodes(struct super_block *sb)
 	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
 		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE))
 			continue;
+		if (inode->i_state & I_DIRTY && !kill_dirty) {
+			busy = 1;
+			continue;
+		}
 		if (atomic_read(&inode->i_count)) {
 			busy = 1;
 			continue;
diff --git a/fs/internal.h b/fs/internal.h
index 0663568b1247..9b976b57d7fe 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -112,4 +112,4 @@ extern void release_open_intent(struct nameidata *);
  */
 extern int get_nr_dirty_inodes(void);
 extern void evict_inodes(struct super_block *);
-extern int invalidate_inodes(struct super_block *);
+extern int invalidate_inodes(struct super_block *, bool);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 32b38cd829d3..683f4c566c82 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2139,7 +2139,7 @@ extern void check_disk_size_change(struct gendisk *disk,
 				   struct block_device *bdev);
 extern int revalidate_disk(struct gendisk *);
 extern int check_disk_change(struct block_device *);
-extern int __invalidate_device(struct block_device *);
+extern int __invalidate_device(struct block_device *, bool);
 extern int invalidate_partition(struct gendisk *, int);
 #endif
 unsigned long invalidate_mapping_pages(struct address_space *mapping,
-- 
cgit v1.2.3


From bf9faa2aa30e2ebf30287536712ed2717bb47002 Mon Sep 17 00:00:00 2001
From: "J. R. Okajima" <hooanon05@yahoo.co.jp>
Date: Wed, 23 Feb 2011 16:59:49 +0900
Subject: Unlock vfsmount_lock in do_umount

By the commit
	b3e19d9 2011-01-07 fs: scale mntget/mntput
vfsmount_lock was introduced around testing mnt_count.
Fix the mis-typed 'unlock'

Signed-off-by: J. R. Okajima <hooanon05@yahoo.co.jp>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index 7b0b95371696..d1edf26025dc 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1244,7 +1244,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
 		 */
 		br_write_lock(vfsmount_lock);
 		if (mnt_get_count(mnt) != 2) {
-			br_write_lock(vfsmount_lock);
+			br_write_unlock(vfsmount_lock);
 			return -EBUSY;
 		}
 		br_write_unlock(vfsmount_lock);
-- 
cgit v1.2.3


From e7407d1619713f4b1fdff3a485e1bd8e77bd480d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 24 Feb 2011 09:56:32 +0100
Subject: block: bd_link_disk_holder() should hold on to holder_dir

The new implementation of bd_link_disk_holder() added by 49731baa41d
(block: restore multiple bd_link_disk_holder() support) didn't get an
extra reference for the holder_dir kobject of the slave bdev; however,
bdev kills holder_dir on removal, not release, so if the slave bdev is
removed while there are holder links, the holder_dir will be destroyed
while there still are holder links, which leads to oops later when
bd_unlink_disk_order() tries to remove those links.

Make bd_link_disk_holder() grab an extra reference for the slave's
holder_dir and put it in bd_unlink_disk_holder().

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: "Hawrylewicz Czarnowski, Przemyslaw" <przemyslaw.hawrylewicz.czarnowski@intel.com>
Tested-by: "Hawrylewicz Czarnowski, Przemyslaw" <przemyslaw.hawrylewicz.czarnowski@intel.com>
Cc: Neil Brown <neilb@suse.de>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/block_dev.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 4fb8a3431531..94d41db62004 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -873,6 +873,11 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
 	ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj);
 	if (ret)
 		goto out_del;
+	/*
+	 * bdev could be deleted beneath us which would implicitly destroy
+	 * the holder directory.  Hold on to it.
+	 */
+	kobject_get(bdev->bd_part->holder_dir);
 
 	list_add(&holder->list, &bdev->bd_holder_disks);
 	goto out_unlock;
@@ -909,6 +914,7 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
 		del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj);
 		del_symlink(bdev->bd_part->holder_dir,
 			    &disk_to_dev(disk)->kobj);
+		kobject_put(bdev->bd_part->holder_dir);
 		list_del_init(&holder->list);
 		kfree(holder);
 	}
-- 
cgit v1.2.3


From 5a18ec176c934ca1bc9dc61580a5e0e90a9b5733 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Fri, 25 Feb 2011 14:44:58 +0100
Subject: fuse: fix hang of single threaded fuseblk filesystem

Single threaded NTFS-3G could get stuck if a delayed RELEASE reply
triggered a DESTROY request via path_put().

Fix this by

 a) making RELEASE requests synchronous, whenever possible, on fuseblk
 filesystems

 b) if not possible (triggered by an asynchronous read/write) then do
 the path_put() in a separate thread with schedule_work().

Reported-by: Oliver Neukum <oneukum@suse.de>
Cc: stable@kernel.org
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/file.c   | 52 +++++++++++++++++++++++++++++++++++++++++++++-------
 fs/fuse/fuse_i.h |  6 +++++-
 2 files changed, 50 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 95da1bc1c826..9e0832dbb1e3 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -86,18 +86,52 @@ struct fuse_file *fuse_file_get(struct fuse_file *ff)
 	return ff;
 }
 
+static void fuse_release_async(struct work_struct *work)
+{
+	struct fuse_req *req;
+	struct fuse_conn *fc;
+	struct path path;
+
+	req = container_of(work, struct fuse_req, misc.release.work);
+	path = req->misc.release.path;
+	fc = get_fuse_conn(path.dentry->d_inode);
+
+	fuse_put_request(fc, req);
+	path_put(&path);
+}
+
 static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req)
 {
-	path_put(&req->misc.release.path);
+	if (fc->destroy_req) {
+		/*
+		 * If this is a fuseblk mount, then it's possible that
+		 * releasing the path will result in releasing the
+		 * super block and sending the DESTROY request.  If
+		 * the server is single threaded, this would hang.
+		 * For this reason do the path_put() in a separate
+		 * thread.
+		 */
+		atomic_inc(&req->count);
+		INIT_WORK(&req->misc.release.work, fuse_release_async);
+		schedule_work(&req->misc.release.work);
+	} else {
+		path_put(&req->misc.release.path);
+	}
 }
 
-static void fuse_file_put(struct fuse_file *ff)
+static void fuse_file_put(struct fuse_file *ff, bool sync)
 {
 	if (atomic_dec_and_test(&ff->count)) {
 		struct fuse_req *req = ff->reserved_req;
 
-		req->end = fuse_release_end;
-		fuse_request_send_background(ff->fc, req);
+		if (sync) {
+			fuse_request_send(ff->fc, req);
+			path_put(&req->misc.release.path);
+			fuse_put_request(ff->fc, req);
+		} else {
+			req->end = fuse_release_end;
+			fuse_request_send_background(ff->fc, req);
+		}
 		kfree(ff);
 	}
 }
@@ -219,8 +253,12 @@ void fuse_release_common(struct file *file, int opcode)
 	 * Normally this will send the RELEASE request, however if
 	 * some asynchronous READ or WRITE requests are outstanding,
 	 * the sending will be delayed.
+	 *
+	 * Make the release synchronous if this is a fuseblk mount,
+	 * synchronous RELEASE is allowed (and desirable) in this case
+	 * because the server can be trusted not to screw up.
 	 */
-	fuse_file_put(ff);
+	fuse_file_put(ff, ff->fc->destroy_req != NULL);
 }
 
 static int fuse_open(struct inode *inode, struct file *file)
@@ -558,7 +596,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
 		page_cache_release(page);
 	}
 	if (req->ff)
-		fuse_file_put(req->ff);
+		fuse_file_put(req->ff, false);
 }
 
 static void fuse_send_readpages(struct fuse_req *req, struct file *file)
@@ -1137,7 +1175,7 @@ static ssize_t fuse_direct_write(struct file *file, const char __user *buf,
 static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req)
 {
 	__free_page(req->pages[0]);
-	fuse_file_put(req->ff);
+	fuse_file_put(req->ff, false);
 }
 
 static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index ae5744a2f9e9..d4286947bc2c 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -21,6 +21,7 @@
 #include <linux/rwsem.h>
 #include <linux/rbtree.h>
 #include <linux/poll.h>
+#include <linux/workqueue.h>
 
 /** Max number of pages that can be used in a single read request */
 #define FUSE_MAX_PAGES_PER_REQ 32
@@ -262,7 +263,10 @@ struct fuse_req {
 	/** Data for asynchronous requests */
 	union {
 		struct {
-			struct fuse_release_in in;
+			union {
+				struct fuse_release_in in;
+				struct work_struct work;
+			};
 			struct path path;
 		} release;
 		struct fuse_init_in init_in;
-- 
cgit v1.2.3


From 8d56addd70c7c0626502569e22cc8fce49ae39f5 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Fri, 25 Feb 2011 14:44:58 +0100
Subject: fuse: fix truncate after open

Commit e1181ee6 "vfs: pass struct file to do_truncate on O_TRUNC
opens" broke the behavior of open(O_TRUNC|O_RDONLY) in fuse.  Fuse
assumed that when called from open, a truncate() will be done, not an
ftruncate().

Fix by restoring the old behavior, based on the ATTR_OPEN flag.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
---
 fs/fuse/dir.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index bfed8447ed80..83543b5ff941 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1283,8 +1283,11 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
 	if (err)
 		return err;
 
-	if ((attr->ia_valid & ATTR_OPEN) && fc->atomic_o_trunc)
-		return 0;
+	if (attr->ia_valid & ATTR_OPEN) {
+		if (fc->atomic_o_trunc)
+			return 0;
+		file = NULL;
+	}
 
 	if (attr->ia_valid & ATTR_SIZE)
 		is_truncate = true;
-- 
cgit v1.2.3


From f129ccc9231c95513a1227ca9da876beeb03e577 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@au1.ibm.com>
Date: Fri, 25 Feb 2011 15:33:02 +0000
Subject: afs: Fix oops in afs_unlink_writeback

I'm seeing the following oops when testing afs:

  Unable to handle kernel paging request for data at address 0x00000008
  ...
  NIP [c0000000003393b0] .afs_unlink_writeback+0x38/0xc0
  LR [c00000000033987c] .afs_put_writeback+0x98/0xec
  Call Trace:
  [c00000000345f600] [c00000000033987c] .afs_put_writeback+0x98/0xec
  [c00000000345f690] [c00000000033ae80] .afs_write_begin+0x6a4/0x75c
  [c00000000345f790] [c00000000012b77c] .generic_file_buffered_write+0x148/0x320
  [c00000000345f8d0] [c00000000012e1b8] .__generic_file_aio_write+0x37c/0x3e4
  [c00000000345f9d0] [c00000000012e2a8] .generic_file_aio_write+0x88/0xfc
  [c00000000345fa90] [c0000000003390a8] .afs_file_write+0x10c/0x178
  [c00000000345fb40] [c000000000188788] .do_sync_write+0xc4/0x128
  [c00000000345fcc0] [c000000000189658] .vfs_write+0xe8/0x1d8
  [c00000000345fd70] [c000000000189884] .SyS_write+0x68/0xb0
  [c00000000345fe30] [c000000000008564] syscall_exit+0x0/0x40

afs_write_begin hits an error and calls afs_unlink_writeback. In there
we do list_del_init on an uninitialised list.

The patch below initialises ->link when creating the afs_writeback struct.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/afs/write.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/afs/write.c b/fs/afs/write.c
index 15690bb1d3b5..789b3afb3423 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -140,6 +140,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping,
 	candidate->first = candidate->last = index;
 	candidate->offset_first = from;
 	candidate->to_last = to;
+	INIT_LIST_HEAD(&candidate->link);
 	candidate->usage = 1;
 	candidate->state = AFS_WBACK_PENDING;
 	init_waitqueue_head(&candidate->waitq);
-- 
cgit v1.2.3


From 22bacca48a1755f79b7e0f192ddb9fbb7fc6e64e Mon Sep 17 00:00:00 2001
From: Davide Libenzi <davidel@xmailserver.org>
Date: Fri, 25 Feb 2011 14:44:12 -0800
Subject: epoll: prevent creating circular epoll structures

In several places, an epoll fd can call another file's ->f_op->poll()
method with ep->mtx held.  This is in general unsafe, because that other
file could itself be an epoll fd that contains the original epoll fd.

The code defends against this possibility in its own ->poll() method using
ep_call_nested, but there are several other unsafe calls to ->poll
elsewhere that can be made to deadlock.  For example, the following simple
program causes the call in ep_insert recursively call the original fd's
->poll, leading to deadlock:

 #include <unistd.h>
 #include <sys/epoll.h>

 int main(void) {
     int e1, e2, p[2];
     struct epoll_event evt = {
         .events = EPOLLIN
     };

     e1 = epoll_create(1);
     e2 = epoll_create(2);
     pipe(p);

     epoll_ctl(e2, EPOLL_CTL_ADD, e1, &evt);
     epoll_ctl(e1, EPOLL_CTL_ADD, p[0], &evt);
     write(p[1], p, sizeof p);
     epoll_ctl(e1, EPOLL_CTL_ADD, e2, &evt);

     return 0;
 }

On insertion, check whether the inserted file is itself a struct epoll,
and if so, do a recursive walk to detect whether inserting this file would
create a loop of epoll structures, which could lead to deadlock.

[nelhage@ksplice.com: Use epmutex to serialize concurrent inserts]
Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Signed-off-by: Nelson Elhage <nelhage@ksplice.com>
Reported-by: Nelson Elhage <nelhage@ksplice.com>
Tested-by: Nelson Elhage <nelhage@ksplice.com>
Cc: <stable@kernel.org>		[2.6.34+, possibly earlier]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/eventpoll.c | 95 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 95 insertions(+)

(limited to 'fs')

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 267d0ada4541..4a09af9e9a63 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -63,6 +63,13 @@
  * cleanup path and it is also acquired by eventpoll_release_file()
  * if a file has been pushed inside an epoll set and it is then
  * close()d without a previous call toepoll_ctl(EPOLL_CTL_DEL).
+ * It is also acquired when inserting an epoll fd onto another epoll
+ * fd. We do this so that we walk the epoll tree and ensure that this
+ * insertion does not create a cycle of epoll file descriptors, which
+ * could lead to deadlock. We need a global mutex to prevent two
+ * simultaneous inserts (A into B and B into A) from racing and
+ * constructing a cycle without either insert observing that it is
+ * going to.
  * It is possible to drop the "ep->mtx" and to use the global
  * mutex "epmutex" (together with "ep->lock") to have it working,
  * but having "ep->mtx" will make the interface more scalable.
@@ -224,6 +231,9 @@ static long max_user_watches __read_mostly;
  */
 static DEFINE_MUTEX(epmutex);
 
+/* Used to check for epoll file descriptor inclusion loops */
+static struct nested_calls poll_loop_ncalls;
+
 /* Used for safe wake up implementation */
 static struct nested_calls poll_safewake_ncalls;
 
@@ -1198,6 +1208,62 @@ retry:
 	return res;
 }
 
+/**
+ * ep_loop_check_proc - Callback function to be passed to the @ep_call_nested()
+ *                      API, to verify that adding an epoll file inside another
+ *                      epoll structure, does not violate the constraints, in
+ *                      terms of closed loops, or too deep chains (which can
+ *                      result in excessive stack usage).
+ *
+ * @priv: Pointer to the epoll file to be currently checked.
+ * @cookie: Original cookie for this call. This is the top-of-the-chain epoll
+ *          data structure pointer.
+ * @call_nests: Current dept of the @ep_call_nested() call stack.
+ *
+ * Returns: Returns zero if adding the epoll @file inside current epoll
+ *          structure @ep does not violate the constraints, or -1 otherwise.
+ */
+static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
+{
+	int error = 0;
+	struct file *file = priv;
+	struct eventpoll *ep = file->private_data;
+	struct rb_node *rbp;
+	struct epitem *epi;
+
+	mutex_lock(&ep->mtx);
+	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
+		epi = rb_entry(rbp, struct epitem, rbn);
+		if (unlikely(is_file_epoll(epi->ffd.file))) {
+			error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+					       ep_loop_check_proc, epi->ffd.file,
+					       epi->ffd.file->private_data, current);
+			if (error != 0)
+				break;
+		}
+	}
+	mutex_unlock(&ep->mtx);
+
+	return error;
+}
+
+/**
+ * ep_loop_check - Performs a check to verify that adding an epoll file (@file)
+ *                 another epoll file (represented by @ep) does not create
+ *                 closed loops or too deep chains.
+ *
+ * @ep: Pointer to the epoll private data structure.
+ * @file: Pointer to the epoll file to be checked.
+ *
+ * Returns: Returns zero if adding the epoll @file inside current epoll
+ *          structure @ep does not violate the constraints, or -1 otherwise.
+ */
+static int ep_loop_check(struct eventpoll *ep, struct file *file)
+{
+	return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
+			      ep_loop_check_proc, file, ep, current);
+}
+
 /*
  * Open an eventpoll file descriptor.
  */
@@ -1246,6 +1312,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 		struct epoll_event __user *, event)
 {
 	int error;
+	int did_lock_epmutex = 0;
 	struct file *file, *tfile;
 	struct eventpoll *ep;
 	struct epitem *epi;
@@ -1287,6 +1354,25 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	 */
 	ep = file->private_data;
 
+	/*
+	 * When we insert an epoll file descriptor, inside another epoll file
+	 * descriptor, there is the change of creating closed loops, which are
+	 * better be handled here, than in more critical paths.
+	 *
+	 * We hold epmutex across the loop check and the insert in this case, in
+	 * order to prevent two separate inserts from racing and each doing the
+	 * insert "at the same time" such that ep_loop_check passes on both
+	 * before either one does the insert, thereby creating a cycle.
+	 */
+	if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) {
+		mutex_lock(&epmutex);
+		did_lock_epmutex = 1;
+		error = -ELOOP;
+		if (ep_loop_check(ep, tfile) != 0)
+			goto error_tgt_fput;
+	}
+
+
 	mutex_lock(&ep->mtx);
 
 	/*
@@ -1322,6 +1408,9 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	mutex_unlock(&ep->mtx);
 
 error_tgt_fput:
+	if (unlikely(did_lock_epmutex))
+		mutex_unlock(&epmutex);
+
 	fput(tfile);
 error_fput:
 	fput(file);
@@ -1441,6 +1530,12 @@ static int __init eventpoll_init(void)
 		EP_ITEM_COST;
 	BUG_ON(max_user_watches < 0);
 
+	/*
+	 * Initialize the structure used to perform epoll file descriptor
+	 * inclusion loops checks.
+	 */
+	ep_nested_calls_init(&poll_loop_ncalls);
+
 	/* Initialize the structure used to perform safe poll wait head wake ups */
 	ep_nested_calls_init(&poll_safewake_ncalls);
 
-- 
cgit v1.2.3


From 294f6cf48666825d23c9372ef37631232746e40d Mon Sep 17 00:00:00 2001
From: Timo Warns <Warns@pre-sense.de>
Date: Fri, 25 Feb 2011 14:44:21 -0800
Subject: ldm: corrupted partition table can cause kernel oops

The kernel automatically evaluates partition tables of storage devices.
The code for evaluating LDM partitions (in fs/partitions/ldm.c) contains
a bug that causes a kernel oops on certain corrupted LDM partitions.  A
kernel subsystem seems to crash, because, after the oops, the kernel no
longer recognizes newly connected storage devices.

The patch changes ldm_parse_vmdb() to Validate the value of vblk_size.

Signed-off-by: Timo Warns <warns@pre-sense.de>
Cc: Eugene Teo <eugeneteo@kernel.sg>
Acked-by: Richard Russon <ldm@flatcap.org>
Cc: Harvey Harrison <harvey.harrison@gmail.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/partitions/ldm.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'fs')

diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c
index 789c625c7aa5..b10e3540d5b7 100644
--- a/fs/partitions/ldm.c
+++ b/fs/partitions/ldm.c
@@ -251,6 +251,11 @@ static bool ldm_parse_vmdb (const u8 *data, struct vmdb *vm)
 	}
 
 	vm->vblk_size     = get_unaligned_be32(data + 0x08);
+	if (vm->vblk_size == 0) {
+		ldm_error ("Illegal VBLK size");
+		return false;
+	}
+
 	vm->vblk_offset   = get_unaligned_be32(data + 0x0C);
 	vm->last_vblk_seq = get_unaligned_be32(data + 0x04);
 
-- 
cgit v1.2.3


From 3bd9a5d734c7cc7533b27abf451416c7f50095a7 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@gmail.com>
Date: Fri, 25 Feb 2011 14:44:26 -0800
Subject: aio: fix rcu ioctx lookup

aio-dio-invalidate-failure GPFs in aio_put_req from io_submit.

lookup_ioctx doesn't implement the rcu lookup pattern properly.
rcu_read_lock does not prevent refcount going to zero, so we might take
a refcount on a zero count ioctx.

Fix the bug by atomically testing for zero refcount before incrementing.

[jack@suse.cz: added comment into the code]
Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/aio.c | 35 ++++++++++++++++++++++++-----------
 1 file changed, 24 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/aio.c b/fs/aio.c
index fc557a3be0a9..b4dd668fbccc 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -239,15 +239,23 @@ static void __put_ioctx(struct kioctx *ctx)
 	call_rcu(&ctx->rcu_head, ctx_rcu_free);
 }
 
-#define get_ioctx(kioctx) do {						\
-	BUG_ON(atomic_read(&(kioctx)->users) <= 0);			\
-	atomic_inc(&(kioctx)->users);					\
-} while (0)
-#define put_ioctx(kioctx) do {						\
-	BUG_ON(atomic_read(&(kioctx)->users) <= 0);			\
-	if (unlikely(atomic_dec_and_test(&(kioctx)->users))) 		\
-		__put_ioctx(kioctx);					\
-} while (0)
+static inline void get_ioctx(struct kioctx *kioctx)
+{
+	BUG_ON(atomic_read(&kioctx->users) <= 0);
+	atomic_inc(&kioctx->users);
+}
+
+static inline int try_get_ioctx(struct kioctx *kioctx)
+{
+	return atomic_inc_not_zero(&kioctx->users);
+}
+
+static inline void put_ioctx(struct kioctx *kioctx)
+{
+	BUG_ON(atomic_read(&kioctx->users) <= 0);
+	if (unlikely(atomic_dec_and_test(&kioctx->users)))
+		__put_ioctx(kioctx);
+}
 
 /* ioctx_alloc
  *	Allocates and initializes an ioctx.  Returns an ERR_PTR if it failed.
@@ -601,8 +609,13 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
 	rcu_read_lock();
 
 	hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) {
-		if (ctx->user_id == ctx_id && !ctx->dead) {
-			get_ioctx(ctx);
+		/*
+		 * RCU protects us against accessing freed memory but
+		 * we have to be careful not to get a reference when the
+		 * reference count already dropped to 0 (ctx->dead test
+		 * is unreliable because of races).
+		 */
+		if (ctx->user_id == ctx_id && !ctx->dead && try_get_ioctx(ctx)){
 			ret = ctx;
 			break;
 		}
-- 
cgit v1.2.3


From 7137c6bd455234bcb7560fd829e6ee49cae5fed6 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Fri, 25 Feb 2011 14:44:27 -0800
Subject: aio: fix race between io_destroy() and io_submit()

A race can occur when io_submit() races with io_destroy():

 CPU1						CPU2
io_submit()
  do_io_submit()
    ...
    ctx = lookup_ioctx(ctx_id);
						io_destroy()
    Now do_io_submit() holds the last reference to ctx.
    ...
    queue new AIO
    put_ioctx(ctx) - frees ctx with active AIOs

We solve this issue by checking whether ctx is being destroyed in AIO
submission path after adding new AIO to ctx.  Then we are guaranteed that
either io_destroy() waits for new AIO or we see that ctx is being
destroyed and bail out.

Cc: Nick Piggin <npiggin@kernel.dk>
Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/aio.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'fs')

diff --git a/fs/aio.c b/fs/aio.c
index b4dd668fbccc..26869cde3953 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1642,6 +1642,23 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 		goto out_put_req;
 
 	spin_lock_irq(&ctx->ctx_lock);
+	/*
+	 * We could have raced with io_destroy() and are currently holding a
+	 * reference to ctx which should be destroyed. We cannot submit IO
+	 * since ctx gets freed as soon as io_submit() puts its reference.  The
+	 * check here is reliable: io_destroy() sets ctx->dead before waiting
+	 * for outstanding IO and the barrier between these two is realized by
+	 * unlock of mm->ioctx_lock and lock of ctx->ctx_lock.  Analogously we
+	 * increment ctx->reqs_active before checking for ctx->dead and the
+	 * barrier is realized by unlock and lock of ctx->ctx_lock. Thus if we
+	 * don't see ctx->dead set here, io_destroy() waits for our IO to
+	 * finish.
+	 */
+	if (ctx->dead) {
+		spin_unlock_irq(&ctx->ctx_lock);
+		ret = -EINVAL;
+		goto out_put_req;
+	}
 	aio_run_iocb(req);
 	if (!list_empty(&ctx->run_list)) {
 		/* drain the run list */
-- 
cgit v1.2.3


From e6eb5ce1b202ac9cdcfda5be559c9b9d8ec7542c Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Sat, 26 Feb 2011 10:54:00 -0800
Subject: fs/block_dev.c: fix new kernel-doc warning

Fix new kernel-doc warning in fs/block_dev.c:

Warning(fs/block_dev.c:937): No description found for parameter 'kill_dirty'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/block_dev.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/block_dev.c b/fs/block_dev.c
index f05bf16cd979..889287019599 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -928,6 +928,7 @@ EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
  * flush_disk - invalidates all buffer-cache entries on a disk
  *
  * @bdev:      struct block device to be flushed
+ * @kill_dirty: flag to guide handling of dirty inodes
  *
  * Invalidates all buffer-cache entries on a disk. It should be called
  * when a disk has been changed -- either by a media change or online
-- 
cgit v1.2.3


From 72746ac643928f6c3113b5aa783d8ea1b13949d2 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Mon, 28 Feb 2011 13:41:11 +0900
Subject: nilfs2: fix regression that i-flag is not set on changeless
 checkpoints

According to the report from Jiro SEKIBA titled "regression in
2.6.37?"  (Message-Id: <8739n8vs1f.wl%jir@sekiba.com>), on 2.6.37 and
later kernels, lscp command no longer displays "i" flag on checkpoints
that snapshot operations or garbage collection created.

This is a regression of nilfs2 checkpointing function, and it's
critical since it broke behavior of a part of nilfs2 applications.
For instance, snapshot manager of TimeBrowse gets to create
meaningless snapshots continuously; snapshot creation triggers another
checkpoint, but applications cannot distinguish whether the new
checkpoint contains meaningful changes or not without the i-flag.

This patch fixes the regression and brings that application behavior
back to normal.

Reported-by: Jiro SEKIBA <jir@unicus.jp>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Tested-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Tested-by: Jiro SEKIBA <jir@unicus.jp>
Cc: stable <stable@kernel.org>  [2.6.37]
---
 fs/nilfs2/segment.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 55ebae5c7f39..2de9f636792a 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -430,7 +430,8 @@ static void nilfs_segctor_begin_finfo(struct nilfs_sc_info *sci,
 	nilfs_segctor_map_segsum_entry(
 		sci, &sci->sc_binfo_ptr, sizeof(struct nilfs_finfo));
 
-	if (inode->i_sb && !test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags))
+	if (NILFS_I(inode)->i_root &&
+	    !test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags))
 		set_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags);
 	/* skip finfo */
 }
-- 
cgit v1.2.3


From af24ee9ea8d532e16883251a6684dfa1be8eec29 Mon Sep 17 00:00:00 2001
From: Alex Elder <aelder@sgi.com>
Date: Tue, 1 Mar 2011 17:50:00 +0000
Subject: xfs: zero proper structure size for geometry calls

Commit 493f3358cb289ccf716c5a14fa5bb52ab75943e5 added this call to
xfs_fs_geometry() in order to avoid passing kernel stack data back
to user space:

+       memset(geo, 0, sizeof(*geo));

Unfortunately, one of the callers of that function passes the
address of a smaller data type, cast to fit the type that
xfs_fs_geometry() requires.  As a result, this can happen:

Kernel panic - not syncing: stack-protector: Kernel stack is corrupted
in: f87aca93

Pid: 262, comm: xfs_fsr Not tainted 2.6.38-rc6-493f3358cb2+ #1
Call Trace:

[<c12991ac>] ? panic+0x50/0x150
[<c102ed71>] ? __stack_chk_fail+0x10/0x18
[<f87aca93>] ? xfs_ioc_fsgeometry_v1+0x56/0x5d [xfs]

Fix this by fixing that one caller to pass the right type and then
copy out the subset it is interested in.

Note: This patch is an alternative to one originally proposed by
Eric Sandeen.

Reported-by: Jeffrey Hundstad <jeffrey.hundstad@mnsu.edu>
Signed-off-by: Alex Elder <aelder@sgi.com>
Reviewed-by: Eric Sandeen <sandeen@redhat.com>
Tested-by: Jeffrey Hundstad <jeffrey.hundstad@mnsu.edu>
---
 fs/xfs/linux-2.6/xfs_ioctl.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index f5e2a19e0f8e..0ca0e3c024d7 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -695,14 +695,19 @@ xfs_ioc_fsgeometry_v1(
 	xfs_mount_t		*mp,
 	void			__user *arg)
 {
-	xfs_fsop_geom_v1_t	fsgeo;
+	xfs_fsop_geom_t         fsgeo;
 	int			error;
 
-	error = xfs_fs_geometry(mp, (xfs_fsop_geom_t *)&fsgeo, 3);
+	error = xfs_fs_geometry(mp, &fsgeo, 3);
 	if (error)
 		return -error;
 
-	if (copy_to_user(arg, &fsgeo, sizeof(fsgeo)))
+	/*
+	 * Caller should have passed an argument of type
+	 * xfs_fsop_geom_v1_t.  This is a proper subset of the
+	 * xfs_fsop_geom_t that xfs_fs_geometry() fills in.
+	 */
+	if (copy_to_user(arg, &fsgeo, sizeof(xfs_fsop_geom_v1_t)))
 		return -XFS_ERROR(EFAULT);
 	return 0;
 }
-- 
cgit v1.2.3


From e8a80c6f769dd4622d8b211b398452158ee60c0b Mon Sep 17 00:00:00 2001
From: Josh Hunt <johunt@akamai.com>
Date: Thu, 24 Feb 2011 11:48:22 +0100
Subject: ext2: Fix link count corruption under heavy link+rename load

vfs_rename_other() does not lock renamed inode with i_mutex. Thus changing
i_nlink in a non-atomic manner (which happens in ext2_rename()) can corrupt
it as reported and analyzed by Josh.

In fact, there is no good reason to mess with i_nlink of the moved file.
We did it presumably to simulate linking into the new directory and unlinking
from an old one. But the practical effect of this is disputable because fsck
can possibly treat file as being properly linked into both directories without
writing any error which is confusing. So we just stop increment-decrement
games with i_nlink which also fixes the corruption.

CC: stable@kernel.org
CC: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Josh Hunt <johunt@akamai.com>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/ext2/namei.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 2e1d8341d827..adb91855ccd0 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -344,7 +344,6 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
 		new_de = ext2_find_entry (new_dir, &new_dentry->d_name, &new_page);
 		if (!new_de)
 			goto out_dir;
-		inode_inc_link_count(old_inode);
 		ext2_set_link(new_dir, new_de, new_page, old_inode, 1);
 		new_inode->i_ctime = CURRENT_TIME_SEC;
 		if (dir_de)
@@ -356,12 +355,9 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
 			if (new_dir->i_nlink >= EXT2_LINK_MAX)
 				goto out_dir;
 		}
-		inode_inc_link_count(old_inode);
 		err = ext2_add_link(new_dentry, old_inode);
-		if (err) {
-			inode_dec_link_count(old_inode);
+		if (err)
 			goto out_dir;
-		}
 		if (dir_de)
 			inode_inc_link_count(new_dir);
 	}
@@ -369,12 +365,11 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry,
 	/*
 	 * Like most other Unix systems, set the ctime for inodes on a
  	 * rename.
-	 * inode_dec_link_count() will mark the inode dirty.
 	 */
 	old_inode->i_ctime = CURRENT_TIME_SEC;
+	mark_inode_dirty(old_inode);
 
 	ext2_delete_entry (old_de, old_page);
-	inode_dec_link_count(old_inode);
 
 	if (dir_de) {
 		if (old_dir != new_dir)
-- 
cgit v1.2.3


From 8aaccf7fa2a2f148db1edbe7b09e3119c3f910cf Mon Sep 17 00:00:00 2001
From: Paul Bolle <pebolle@tiscali.nl>
Date: Mon, 14 Feb 2011 22:34:22 +0100
Subject: of/flattree: Drop an uninteresting message to pr_debug level

This message looks like an error (which it isn't) when booting with a
flattened device tree.  Remove the message from normal kernel builds.

Signed-off-by: Paul Bolle <pebolle@tiscali.nl>
Signed-off-by: Grant Likely <grant.likely@secretlab.ca>
---
 fs/proc/proc_devtree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index d9396a4fc7ff..927cbd115e53 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -233,7 +233,7 @@ void __init proc_device_tree_init(void)
 		return;
 	root = of_find_node_by_path("/");
 	if (root == NULL) {
-		printk(KERN_ERR "/proc/device-tree: can't find root\n");
+		pr_debug("/proc/device-tree: can't find root\n");
 		return;
 	}
 	proc_device_tree_add_node(root, proc_device_tree);
-- 
cgit v1.2.3


From 4787d45fa76b97e224a8299086bb5fb496275796 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 2 Mar 2011 09:38:45 -0500
Subject: sysv: i_nlink races in rename()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/sysv/namei.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c
index b427b1208c26..e474fbcf8bde 100644
--- a/fs/sysv/namei.c
+++ b/fs/sysv/namei.c
@@ -245,7 +245,6 @@ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry,
 		new_de = sysv_find_entry(new_dentry, &new_page);
 		if (!new_de)
 			goto out_dir;
-		inode_inc_link_count(old_inode);
 		sysv_set_link(new_de, new_page, old_inode);
 		new_inode->i_ctime = CURRENT_TIME_SEC;
 		if (dir_de)
@@ -257,18 +256,15 @@ static int sysv_rename(struct inode * old_dir, struct dentry * old_dentry,
 			if (new_dir->i_nlink >= SYSV_SB(new_dir->i_sb)->s_link_max)
 				goto out_dir;
 		}
-		inode_inc_link_count(old_inode);
 		err = sysv_add_link(new_dentry, old_inode);
-		if (err) {
-			inode_dec_link_count(old_inode);
+		if (err)
 			goto out_dir;
-		}
 		if (dir_de)
 			inode_inc_link_count(new_dir);
 	}
 
 	sysv_delete_entry(old_de, old_page);
-	inode_dec_link_count(old_inode);
+	mark_inode_dirty(old_inode);
 
 	if (dir_de) {
 		sysv_set_link(dir_de, dir_page, new_dir);
-- 
cgit v1.2.3


From 37750cdda36721fa7fa816f5f58258d2c013b248 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 2 Mar 2011 09:40:21 -0500
Subject: ufs: i_nlink races in rename()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ufs/namei.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 12f39b9e4437..d6f681535eb8 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -306,7 +306,6 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		new_de = ufs_find_entry(new_dir, &new_dentry->d_name, &new_page);
 		if (!new_de)
 			goto out_dir;
-		inode_inc_link_count(old_inode);
 		ufs_set_link(new_dir, new_de, new_page, old_inode);
 		new_inode->i_ctime = CURRENT_TIME_SEC;
 		if (dir_de)
@@ -318,12 +317,9 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
 			if (new_dir->i_nlink >= UFS_LINK_MAX)
 				goto out_dir;
 		}
-		inode_inc_link_count(old_inode);
 		err = ufs_add_link(new_dentry, old_inode);
-		if (err) {
-			inode_dec_link_count(old_inode);
+		if (err)
 			goto out_dir;
-		}
 		if (dir_de)
 			inode_inc_link_count(new_dir);
 	}
@@ -331,12 +327,11 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	/*
 	 * Like most other Unix systems, set the ctime for inodes on a
  	 * rename.
-	 * inode_dec_link_count() will mark the inode dirty.
 	 */
 	old_inode->i_ctime = CURRENT_TIME_SEC;
 
 	ufs_delete_entry(old_dir, old_de, old_page);
-	inode_dec_link_count(old_inode);
+	mark_inode_dirty(old_inode);
 
 	if (dir_de) {
 		ufs_set_link(old_inode, dir_de, dir_page, new_dir);
-- 
cgit v1.2.3


From 6f88049caf56022c773272e03ddfa8cf79867059 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 2 Mar 2011 09:41:38 -0500
Subject: minix: i_nlink races in rename()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/minix/namei.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index ce7337ddfdbf..6e6777f1b4b2 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -213,7 +213,6 @@ static int minix_rename(struct inode * old_dir, struct dentry *old_dentry,
 		new_de = minix_find_entry(new_dentry, &new_page);
 		if (!new_de)
 			goto out_dir;
-		inode_inc_link_count(old_inode);
 		minix_set_link(new_de, new_page, old_inode);
 		new_inode->i_ctime = CURRENT_TIME_SEC;
 		if (dir_de)
@@ -225,18 +224,15 @@ static int minix_rename(struct inode * old_dir, struct dentry *old_dentry,
 			if (new_dir->i_nlink >= info->s_link_max)
 				goto out_dir;
 		}
-		inode_inc_link_count(old_inode);
 		err = minix_add_link(new_dentry, old_inode);
-		if (err) {
-			inode_dec_link_count(old_inode);
+		if (err)
 			goto out_dir;
-		}
 		if (dir_de)
 			inode_inc_link_count(new_dir);
 	}
 
 	minix_delete_entry(old_de, old_page);
-	inode_dec_link_count(old_inode);
+	mark_inode_dirty(old_inode);
 
 	if (dir_de) {
 		minix_set_link(dir_de, dir_page, new_dir);
-- 
cgit v1.2.3


From 30eb43d31478f0fca28423623f3ec6af13f845fa Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 2 Mar 2011 12:01:13 -0500
Subject: nilfs2: i_nlink races in rename()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nilfs2/namei.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 98034271cd02..161791d26458 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -397,7 +397,6 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		new_de = nilfs_find_entry(new_dir, &new_dentry->d_name, &new_page);
 		if (!new_de)
 			goto out_dir;
-		inc_nlink(old_inode);
 		nilfs_set_link(new_dir, new_de, new_page, old_inode);
 		nilfs_mark_inode_dirty(new_dir);
 		new_inode->i_ctime = CURRENT_TIME;
@@ -411,13 +410,9 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 			if (new_dir->i_nlink >= NILFS_LINK_MAX)
 				goto out_dir;
 		}
-		inc_nlink(old_inode);
 		err = nilfs_add_link(new_dentry, old_inode);
-		if (err) {
-			drop_nlink(old_inode);
-			nilfs_mark_inode_dirty(old_inode);
+		if (err)
 			goto out_dir;
-		}
 		if (dir_de) {
 			inc_nlink(new_dir);
 			nilfs_mark_inode_dirty(new_dir);
@@ -431,7 +426,6 @@ static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	old_inode->i_ctime = CURRENT_TIME;
 
 	nilfs_delete_entry(old_de, old_page);
-	drop_nlink(old_inode);
 
 	if (dir_de) {
 		nilfs_set_link(old_inode, dir_de, dir_page, new_dir);
-- 
cgit v1.2.3


From babfe56046885749b6a90a3c4409219a1f16cf48 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 2 Mar 2011 16:42:38 -0500
Subject: exofs: i_nlink races in rename()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/exofs/namei.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
index 264e95d02830..4d70db110cfc 100644
--- a/fs/exofs/namei.c
+++ b/fs/exofs/namei.c
@@ -272,7 +272,6 @@ static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		new_de = exofs_find_entry(new_dir, new_dentry, &new_page);
 		if (!new_de)
 			goto out_dir;
-		inode_inc_link_count(old_inode);
 		err = exofs_set_link(new_dir, new_de, new_page, old_inode);
 		new_inode->i_ctime = CURRENT_TIME;
 		if (dir_de)
@@ -286,12 +285,9 @@ static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
 			if (new_dir->i_nlink >= EXOFS_LINK_MAX)
 				goto out_dir;
 		}
-		inode_inc_link_count(old_inode);
 		err = exofs_add_link(new_dentry, old_inode);
-		if (err) {
-			inode_dec_link_count(old_inode);
+		if (err)
 			goto out_dir;
-		}
 		if (dir_de)
 			inode_inc_link_count(new_dir);
 	}
@@ -299,7 +295,7 @@ static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	old_inode->i_ctime = CURRENT_TIME;
 
 	exofs_delete_entry(old_de, old_page);
-	inode_dec_link_count(old_inode);
+	mark_inode_dirty(old_inode);
 
 	if (dir_de) {
 		err = exofs_set_link(old_inode, dir_de, dir_page, new_dir);
-- 
cgit v1.2.3


From 99890a3be1ee67346300f1e0a873006588760f2a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 2 Mar 2011 09:35:13 -0500
Subject: fix reiserfs mkdir() breakage

if directory has so many subdirectories that its link count is set
to 1 (i.e. "can't tell accurately") and reiserfs_new_inode() fails,
we shouldn't decrement the parent's link count in cleanup path;
that's what DEC_DIR_INODE_NLINK() is for.  As it is, we end up
with parent suddenly getting zero i_nlink, with very unpleasant
effects.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/reiserfs/namei.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index ba5f51ec3458..68fdf45cc6c9 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -771,7 +771,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 					EMPTY_DIR_SIZE_V1 : EMPTY_DIR_SIZE,
 					dentry, inode, &security);
 	if (retval) {
-		dir->i_nlink--;
+		DEC_DIR_INODE_NLINK(dir)
 		goto out_failed;
 	}
 
-- 
cgit v1.2.3


From 810c1b2e48d32a8605928c3609262d94853c3a76 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 2 Mar 2011 10:15:26 -0500
Subject: udf: fix i_nlink limit

(256 << sizeof(x)) - 1 is not the maximal possible value of x...
In reality, the maximal allowed value for UDF FileLinkCount is
65535.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/udf/namei.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 2be0f9eb86d2..b7c338d5e9df 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -32,6 +32,8 @@
 #include <linux/crc-itu-t.h>
 #include <linux/exportfs.h>
 
+enum { UDF_MAX_LINKS = 0xffff };
+
 static inline int udf_match(int len1, const unsigned char *name1, int len2,
 			    const unsigned char *name2)
 {
@@ -650,7 +652,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	struct udf_inode_info *iinfo;
 
 	err = -EMLINK;
-	if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1)
+	if (dir->i_nlink >= UDF_MAX_LINKS)
 		goto out;
 
 	err = -EIO;
@@ -1034,9 +1036,8 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
 	struct fileIdentDesc cfi, *fi;
 	int err;
 
-	if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) {
+	if (inode->i_nlink >= UDF_MAX_LINKS)
 		return -EMLINK;
-	}
 
 	fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
 	if (!fi) {
@@ -1131,9 +1132,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
 			goto end_rename;
 
 		retval = -EMLINK;
-		if (!new_inode &&
-			new_dir->i_nlink >=
-				(256 << sizeof(new_dir->i_nlink)) - 1)
+		if (!new_inode && new_dir->i_nlink >= UDF_MAX_LINKS)
 			goto end_rename;
 	}
 	if (!nfi) {
-- 
cgit v1.2.3


From 69102e9b4b61f56a26717659ec2e572a6b18458d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 2 Mar 2011 23:46:51 -0500
Subject: hfs: fix rename() over non-empty directory

merge hfs_unlink() and hfs_rmdir(), while we are at it.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/hfs/dir.c | 50 +++++++++++++-------------------------------------
 1 file changed, 13 insertions(+), 37 deletions(-)

(limited to 'fs')

diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index afa66aaa2237..b4d70b13be92 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -238,46 +238,22 @@ static int hfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 }
 
 /*
- * hfs_unlink()
+ * hfs_remove()
  *
- * This is the unlink() entry in the inode_operations structure for
- * regular HFS directories.  The purpose is to delete an existing
- * file, given the inode for the parent directory and the name
- * (and its length) of the existing file.
- */
-static int hfs_unlink(struct inode *dir, struct dentry *dentry)
-{
-	struct inode *inode;
-	int res;
-
-	inode = dentry->d_inode;
-	res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name);
-	if (res)
-		return res;
-
-	drop_nlink(inode);
-	hfs_delete_inode(inode);
-	inode->i_ctime = CURRENT_TIME_SEC;
-	mark_inode_dirty(inode);
-
-	return res;
-}
-
-/*
- * hfs_rmdir()
+ * This serves as both unlink() and rmdir() in the inode_operations
+ * structure for regular HFS directories.  The purpose is to delete
+ * an existing child, given the inode for the parent directory and
+ * the name (and its length) of the existing directory.
  *
- * This is the rmdir() entry in the inode_operations structure for
- * regular HFS directories.  The purpose is to delete an existing
- * directory, given the inode for the parent directory and the name
- * (and its length) of the existing directory.
+ * HFS does not have hardlinks, so both rmdir and unlink set the
+ * link count to 0.  The only difference is the emptiness check.
  */
-static int hfs_rmdir(struct inode *dir, struct dentry *dentry)
+static int hfs_remove(struct inode *dir, struct dentry *dentry)
 {
-	struct inode *inode;
+	struct inode *inode = dentry->d_inode;
 	int res;
 
-	inode = dentry->d_inode;
-	if (inode->i_size != 2)
+	if (S_ISDIR(inode->i_mode) && inode->i_size != 2)
 		return -ENOTEMPTY;
 	res = hfs_cat_delete(inode->i_ino, dir, &dentry->d_name);
 	if (res)
@@ -307,7 +283,7 @@ static int hfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 
 	/* Unlink destination if it already exists */
 	if (new_dentry->d_inode) {
-		res = hfs_unlink(new_dir, new_dentry);
+		res = hfs_remove(new_dir, new_dentry);
 		if (res)
 			return res;
 	}
@@ -332,9 +308,9 @@ const struct file_operations hfs_dir_operations = {
 const struct inode_operations hfs_dir_inode_operations = {
 	.create		= hfs_create,
 	.lookup		= hfs_lookup,
-	.unlink		= hfs_unlink,
+	.unlink		= hfs_remove,
 	.mkdir		= hfs_mkdir,
-	.rmdir		= hfs_rmdir,
+	.rmdir		= hfs_remove,
 	.rename		= hfs_rename,
 	.setattr	= hfs_inode_setattr,
 };
-- 
cgit v1.2.3


From 9bde178d052418af0b8e0f12932cf02ab4764c9d Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 28 Feb 2011 09:47:37 -0800
Subject: Revert "ceph: keep reference to parent inode on ceph_dentry"

This reverts commit 97d79b403ef03f729883246208ef5d8a2ebc4d68.

This fails to account for d_parent changes due to rename or disconnected
dentries due to submounts or NFS reexports.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/dir.c   | 5 +----
 fs/ceph/super.h | 1 -
 2 files changed, 1 insertion(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index f0aef787a102..0bc68de8edd7 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -60,7 +60,6 @@ int ceph_init_dentry(struct dentry *dentry)
 	}
 	di->dentry = dentry;
 	di->lease_session = NULL;
-	di->parent_inode = igrab(dentry->d_parent->d_inode);
 	dentry->d_fsdata = di;
 	dentry->d_time = jiffies;
 	ceph_dentry_lru_add(dentry);
@@ -1034,7 +1033,7 @@ static void ceph_dentry_release(struct dentry *dentry)
 	u64 snapid = CEPH_NOSNAP;
 
 	if (!IS_ROOT(dentry)) {
-		parent_inode = di->parent_inode;
+		parent_inode = dentry->d_parent->d_inode;
 		if (parent_inode)
 			snapid = ceph_snap(parent_inode);
 	}
@@ -1059,8 +1058,6 @@ static void ceph_dentry_release(struct dentry *dentry)
 		kmem_cache_free(ceph_dentry_cachep, di);
 		dentry->d_fsdata = NULL;
 	}
-	if (parent_inode)
-		iput(parent_inode);
 }
 
 static int ceph_snapdir_d_revalidate(struct dentry *dentry,
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 88fcaa21b801..20b907d76ae2 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -207,7 +207,6 @@ struct ceph_dentry_info {
 	struct dentry *dentry;
 	u64 time;
 	u64 offset;
-	struct inode *parent_inode;
 };
 
 struct ceph_inode_xattrs_info {
-- 
cgit v1.2.3


From b545cc1505eb49247071ce9f4092665de788ca00 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 28 Feb 2011 12:46:46 -0800
Subject: ceph: do not set I_COMPLETE

Do not set the I_COMPLETE flag on directories until we resolve races with
dcache pruning.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/dir.c   | 2 +-
 fs/ceph/inode.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 0bc68de8edd7..9b4f9d9947b3 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -409,7 +409,7 @@ more:
 	spin_lock(&inode->i_lock);
 	if (ci->i_release_count == fi->dir_release_count) {
 		dout(" marking %p complete\n", inode);
-		ci->i_ceph_flags |= CEPH_I_COMPLETE;
+		/* ci->i_ceph_flags |= CEPH_I_COMPLETE; */
 		ci->i_max_offset = filp->f_pos;
 	}
 	spin_unlock(&inode->i_lock);
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 5625463aa479..193bfa5e9cbd 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -707,7 +707,7 @@ static int fill_inode(struct inode *inode,
 		    (issued & CEPH_CAP_FILE_EXCL) == 0 &&
 		    (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
 			dout(" marking %p complete (empty)\n", inode);
-			ci->i_ceph_flags |= CEPH_I_COMPLETE;
+			/* ci->i_ceph_flags |= CEPH_I_COMPLETE; */
 			ci->i_max_offset = 2;
 		}
 		break;
-- 
cgit v1.2.3


From 16a8b70a5a757db513f036bbcc73309f6c507d81 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 28 Feb 2011 12:49:15 -0800
Subject: ceph: do not clear I_COMPLETE from d_release

First, this was racy anyway: d_release isn't called until well after the
dentry is unhashed.  Second, this runs afoul of the recent dcache change
that clears d_parent prior to calling d_release (949854d0), causing a NULL
pointer dereference.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/dir.c | 22 +---------------------
 1 file changed, 1 insertion(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 9b4f9d9947b3..196fd4c62db7 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1029,28 +1029,8 @@ out_touch:
 static void ceph_dentry_release(struct dentry *dentry)
 {
 	struct ceph_dentry_info *di = ceph_dentry(dentry);
-	struct inode *parent_inode = NULL;
-	u64 snapid = CEPH_NOSNAP;
 
-	if (!IS_ROOT(dentry)) {
-		parent_inode = dentry->d_parent->d_inode;
-		if (parent_inode)
-			snapid = ceph_snap(parent_inode);
-	}
-	dout("dentry_release %p parent %p\n", dentry, parent_inode);
-	if (parent_inode && snapid != CEPH_SNAPDIR) {
-		struct ceph_inode_info *ci = ceph_inode(parent_inode);
-
-		spin_lock(&parent_inode->i_lock);
-		if (ci->i_shared_gen == di->lease_shared_gen ||
-		    snapid <= CEPH_MAXSNAP) {
-			dout(" clearing %p complete (d_release)\n",
-			     parent_inode);
-			ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
-			ci->i_release_count++;
-		}
-		spin_unlock(&parent_inode->i_lock);
-	}
+	dout("dentry_release %p\n", dentry);
 	if (di) {
 		ceph_dentry_lru_del(dentry);
 		if (di->lease_session)
-- 
cgit v1.2.3


From 1858efd471624ecb37e6b5462cab8076f47d1cee Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 4 Mar 2011 13:14:21 -0500
Subject: minimal fix for do_filp_open() race

failure exits on the no-O_CREAT side of do_filp_open() merge with
those of O_CREAT one; unfortunately, if do_path_lookup() returns
-ESTALE, we'll get out_filp:, notice that we are about to return
-ESTALE without having trying to create the sucker with LOOKUP_REVAL
and jump right into the O_CREAT side of code.  And proceed to try
and create a file.  Usually that'll fail with -ESTALE again, but
we can race and get that attempt of pathname resolution to succeed.

open() without O_CREAT really shouldn't end up creating files, races
or not.  The real fix is to rearchitect the whole do_filp_open(),
but for now splitting the failure exits will do.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 0087cf9c2c6b..a5e844fe4b28 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2455,22 +2455,29 @@ struct file *do_filp_open(int dfd, const char *pathname,
 	/* !O_CREAT, simple open */
 	error = do_path_lookup(dfd, pathname, flags, &nd);
 	if (unlikely(error))
-		goto out_filp;
+		goto out_filp2;
 	error = -ELOOP;
 	if (!(nd.flags & LOOKUP_FOLLOW)) {
 		if (nd.inode->i_op->follow_link)
-			goto out_path;
+			goto out_path2;
 	}
 	error = -ENOTDIR;
 	if (nd.flags & LOOKUP_DIRECTORY) {
 		if (!nd.inode->i_op->lookup)
-			goto out_path;
+			goto out_path2;
 	}
 	audit_inode(pathname, nd.path.dentry);
 	filp = finish_open(&nd, open_flag, acc_mode);
+out2:
 	release_open_intent(&nd);
 	return filp;
 
+out_path2:
+	path_put(&nd.path);
+out_filp2:
+	filp = ERR_PTR(error);
+	goto out2;
+
 creat:
 	/* OK, have to create the file. Find the parent. */
 	error = path_init_rcu(dfd, pathname,
-- 
cgit v1.2.3


From 455cec0abff563574cca432ced49f734117ca113 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 3 Mar 2011 13:44:35 -0800
Subject: ceph: no .snap inside of snapped namespace

Otherwise you can do things like

# mkdir .snap/foo
# cd .snap/foo/.snap
# ls
<badness>

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/dir.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 196fd4c62db7..099a58615b90 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -496,6 +496,7 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
 
 	/* .snap dir? */
 	if (err == -ENOENT &&
+	    ceph_snap(parent) == CEPH_NOSNAP &&
 	    strcmp(dentry->d_name.name,
 		   fsc->mount_options->snapdir_name) == 0) {
 		struct inode *inode = ceph_get_snapdir(parent);
-- 
cgit v1.2.3


From e9e3d724e2145f5039b423c290ce2b2c3d8f94bc Mon Sep 17 00:00:00 2001
From: Neil Horman <nhorman@tuxdriver.com>
Date: Fri, 4 Mar 2011 19:26:03 -0500
Subject: nfs4: Ensure that ACL pages sent over NFS were not allocated from the
 slab (v3)

The "bad_page()" page allocator sanity check was reported recently (call
chain as follows):

  bad_page+0x69/0x91
  free_hot_cold_page+0x81/0x144
  skb_release_data+0x5f/0x98
  __kfree_skb+0x11/0x1a
  tcp_ack+0x6a3/0x1868
  tcp_rcv_established+0x7a6/0x8b9
  tcp_v4_do_rcv+0x2a/0x2fa
  tcp_v4_rcv+0x9a2/0x9f6
  do_timer+0x2df/0x52c
  ip_local_deliver+0x19d/0x263
  ip_rcv+0x539/0x57c
  netif_receive_skb+0x470/0x49f
  :virtio_net:virtnet_poll+0x46b/0x5c5
  net_rx_action+0xac/0x1b3
  __do_softirq+0x89/0x133
  call_softirq+0x1c/0x28
  do_softirq+0x2c/0x7d
  do_IRQ+0xec/0xf5
  default_idle+0x0/0x50
  ret_from_intr+0x0/0xa
  default_idle+0x29/0x50
  cpu_idle+0x95/0xb8
  start_kernel+0x220/0x225
  _sinittext+0x22f/0x236

It occurs because an skb with a fraglist was freed from the tcp
retransmit queue when it was acked, but a page on that fraglist had
PG_Slab set (indicating it was allocated from the Slab allocator (which
means the free path above can't safely free it via put_page.

We tracked this back to an nfsv4 setacl operation, in which the nfs code
attempted to fill convert the passed in buffer to an array of pages in
__nfs4_proc_set_acl, which gets used by the skb->frags list in
xs_sendpages.  __nfs4_proc_set_acl just converts each page in the buffer
to a page struct via virt_to_page, but the vfs allocates the buffer via
kmalloc, meaning the PG_slab bit is set.  We can't create a buffer with
kmalloc and free it later in the tcp ack path with put_page, so we need
to either:

1) ensure that when we create the list of pages, no page struct has
   PG_Slab set

 or

2) not use a page list to send this data

Given that these buffers can be multiple pages and arbitrarily sized, I
think (1) is the right way to go.  I've written the below patch to
allocate a page from the buddy allocator directly and copy the data over
to it.  This ensures that we have a put_page free-able page for every
entry that winds up on an skb frag list, so it can be safely freed when
the frame is acked.  We do a put page on each entry after the
rpc_call_sync call so as to drop our own reference count to the page,
leaving only the ref count taken by tcp_sendpages.  This way the data
will be properly freed when the ack comes in

Successfully tested by myself to solve the above oops.

Note, as this is the result of a setacl operation that exceeded a page
of data, I think this amounts to a local DOS triggerable by an
uprivlidged user, so I'm CCing security on this as well.

Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
CC: Trond Myklebust <Trond.Myklebust@netapp.com>
CC: security@kernel.org
CC: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/nfs/nfs4proc.c | 44 ++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 42 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 78936a8f40ab..1ff76acc7e98 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -51,6 +51,7 @@
 #include <linux/sunrpc/bc_xprt.h>
 #include <linux/xattr.h>
 #include <linux/utsname.h>
+#include <linux/mm.h>
 
 #include "nfs4_fs.h"
 #include "delegation.h"
@@ -3252,6 +3253,35 @@ static void buf_to_pages(const void *buf, size_t buflen,
 	}
 }
 
+static int buf_to_pages_noslab(const void *buf, size_t buflen,
+		struct page **pages, unsigned int *pgbase)
+{
+	struct page *newpage, **spages;
+	int rc = 0;
+	size_t len;
+	spages = pages;
+
+	do {
+		len = min(PAGE_CACHE_SIZE, buflen);
+		newpage = alloc_page(GFP_KERNEL);
+
+		if (newpage == NULL)
+			goto unwind;
+		memcpy(page_address(newpage), buf, len);
+                buf += len;
+                buflen -= len;
+		*pages++ = newpage;
+		rc++;
+	} while (buflen != 0);
+
+	return rc;
+
+unwind:
+	for(; rc > 0; rc--)
+		__free_page(spages[rc-1]);
+	return -ENOMEM;
+}
+
 struct nfs4_cached_acl {
 	int cached;
 	size_t len;
@@ -3420,13 +3450,23 @@ static int __nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t bufl
 		.rpc_argp	= &arg,
 		.rpc_resp	= &res,
 	};
-	int ret;
+	int ret, i;
 
 	if (!nfs4_server_supports_acls(server))
 		return -EOPNOTSUPP;
+	i = buf_to_pages_noslab(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
+	if (i < 0)
+		return i;
 	nfs_inode_return_delegation(inode);
-	buf_to_pages(buf, buflen, arg.acl_pages, &arg.acl_pgbase);
 	ret = nfs4_call_sync(server, &msg, &arg, &res, 1);
+
+	/*
+	 * Free each page after tx, so the only ref left is
+	 * held by the network stack
+	 */
+	for (; i > 0; i--)
+		put_page(pages[i-1]);
+
 	/*
 	 * Acl update can result in inode attribute update.
 	 * so mark the attribute cache invalid.
-- 
cgit v1.2.3


From b1bf862e9dad431175a1174379476299dbfdc017 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 28 Feb 2011 09:52:08 -0500
Subject: Btrfs: fix regressions in copy_from_user handling

Commit 914ee295af418e936ec20a08c1663eaabe4cd07a fixed deadlocks in
btrfs_file_write where we would catch page faults on pages we had
locked.

But, there were a few problems:

1) The x86-32 iov_iter_copy_from_user_atomic code always fails to copy
data when the amount to copy is more than 4K and the offset to start
copying from is not page aligned.  The result was btrfs_file_write
looping forever retrying the iov_iter_copy_from_user_atomic

We deal with this by changing btrfs_file_write to drop down to single
page copies when iov_iter_copy_from_user_atomic starts returning failure.

2) The btrfs_file_write code was leaking delalloc reservations when
iov_iter_copy_from_user_atomic returned zero.  The looping above would
result in the entire filesystem running out of delalloc reservations and
constantly trying to flush things to disk.

3) btrfs_file_write will lock down page cache pages, make sure
any writeback is finished, do the copy_from_user and then release them.
Before the loop runs we check the first and last pages in the write to
see if they are only being partially modified.  If the start or end of
the write isn't aligned, we make sure the corresponding pages are
up to date so that we don't introduce garbage into the file.

With the copy_from_user changes, we're allowing the VM to reclaim the
pages after a partial update from copy_from_user, but we're not
making sure the page cache page is up to date when we loop around to
resume the write.

We deal with this by pushing the up to date checks down into the page
prep code.  This fits better with how the rest of file_write works.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
Reported-by: Mitch Harder <mitch.harder@sabayonlinux.org>
cc: stable@kernel.org
---
 fs/btrfs/file.c | 101 +++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 59 insertions(+), 42 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 65338a1d14ad..13664b315fe2 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -761,6 +761,27 @@ out:
 	return 0;
 }
 
+/*
+ * on error we return an unlocked page and the error value
+ * on success we return a locked page and 0
+ */
+static int prepare_uptodate_page(struct page *page, u64 pos)
+{
+	int ret = 0;
+
+	if ((pos & (PAGE_CACHE_SIZE - 1)) && !PageUptodate(page)) {
+		ret = btrfs_readpage(NULL, page);
+		if (ret)
+			return ret;
+		lock_page(page);
+		if (!PageUptodate(page)) {
+			unlock_page(page);
+			return -EIO;
+		}
+	}
+	return 0;
+}
+
 /*
  * this gets pages into the page cache and locks them down, it also properly
  * waits for data=ordered extents to finish before allowing the pages to be
@@ -776,6 +797,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
 	unsigned long index = pos >> PAGE_CACHE_SHIFT;
 	struct inode *inode = fdentry(file)->d_inode;
 	int err = 0;
+	int faili = 0;
 	u64 start_pos;
 	u64 last_pos;
 
@@ -793,15 +815,24 @@ again:
 	for (i = 0; i < num_pages; i++) {
 		pages[i] = grab_cache_page(inode->i_mapping, index + i);
 		if (!pages[i]) {
-			int c;
-			for (c = i - 1; c >= 0; c--) {
-				unlock_page(pages[c]);
-				page_cache_release(pages[c]);
-			}
-			return -ENOMEM;
+			faili = i - 1;
+			err = -ENOMEM;
+			goto fail;
+		}
+
+		if (i == 0)
+			err = prepare_uptodate_page(pages[i], pos);
+		if (i == num_pages - 1)
+			err = prepare_uptodate_page(pages[i],
+						    pos + write_bytes);
+		if (err) {
+			page_cache_release(pages[i]);
+			faili = i - 1;
+			goto fail;
 		}
 		wait_on_page_writeback(pages[i]);
 	}
+	err = 0;
 	if (start_pos < inode->i_size) {
 		struct btrfs_ordered_extent *ordered;
 		lock_extent_bits(&BTRFS_I(inode)->io_tree,
@@ -841,6 +872,14 @@ again:
 		WARN_ON(!PageLocked(pages[i]));
 	}
 	return 0;
+fail:
+	while (faili >= 0) {
+		unlock_page(pages[faili]);
+		page_cache_release(pages[faili]);
+		faili--;
+	}
+	return err;
+
 }
 
 static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
@@ -850,7 +889,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = fdentry(file)->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct page *pinned[2];
 	struct page **pages = NULL;
 	struct iov_iter i;
 	loff_t *ppos = &iocb->ki_pos;
@@ -871,9 +909,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 	will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
 		      (file->f_flags & O_DIRECT));
 
-	pinned[0] = NULL;
-	pinned[1] = NULL;
-
 	start_pos = pos;
 
 	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
@@ -961,32 +996,6 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 	first_index = pos >> PAGE_CACHE_SHIFT;
 	last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT;
 
-	/*
-	 * there are lots of better ways to do this, but this code
-	 * makes sure the first and last page in the file range are
-	 * up to date and ready for cow
-	 */
-	if ((pos & (PAGE_CACHE_SIZE - 1))) {
-		pinned[0] = grab_cache_page(inode->i_mapping, first_index);
-		if (!PageUptodate(pinned[0])) {
-			ret = btrfs_readpage(NULL, pinned[0]);
-			BUG_ON(ret);
-			wait_on_page_locked(pinned[0]);
-		} else {
-			unlock_page(pinned[0]);
-		}
-	}
-	if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) {
-		pinned[1] = grab_cache_page(inode->i_mapping, last_index);
-		if (!PageUptodate(pinned[1])) {
-			ret = btrfs_readpage(NULL, pinned[1]);
-			BUG_ON(ret);
-			wait_on_page_locked(pinned[1]);
-		} else {
-			unlock_page(pinned[1]);
-		}
-	}
-
 	while (iov_iter_count(&i) > 0) {
 		size_t offset = pos & (PAGE_CACHE_SIZE - 1);
 		size_t write_bytes = min(iov_iter_count(&i),
@@ -1023,8 +1032,20 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 
 		copied = btrfs_copy_from_user(pos, num_pages,
 					   write_bytes, pages, &i);
-		dirty_pages = (copied + offset + PAGE_CACHE_SIZE - 1) >>
-				PAGE_CACHE_SHIFT;
+
+		/*
+		 * if we have trouble faulting in the pages, fall
+		 * back to one page at a time
+		 */
+		if (copied < write_bytes)
+			nrptrs = 1;
+
+		if (copied == 0)
+			dirty_pages = 0;
+		else
+			dirty_pages = (copied + offset +
+				       PAGE_CACHE_SIZE - 1) >>
+				       PAGE_CACHE_SHIFT;
 
 		if (num_pages > dirty_pages) {
 			if (copied > 0)
@@ -1068,10 +1089,6 @@ out:
 		err = ret;
 
 	kfree(pages);
-	if (pinned[0])
-		page_cache_release(pinned[0]);
-	if (pinned[1])
-		page_cache_release(pinned[1]);
 	*ppos = pos;
 
 	/*
-- 
cgit v1.2.3


From 31339acd07b4ba687906702085127895a56eb920 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 7 Mar 2011 11:10:24 -0500
Subject: Btrfs: deal with short returns from copy_from_user

When copy_from_user is only able to copy some of the bytes we requested,
we may end up creating a partially up to date page.  To avoid garbage in
the page, we need to treat a partial copy as a zero length copy.

This makes the rest of the file_write code drop the page and
retry the whole copy instead of marking the partially up to
date page as dirty.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
cc: stable@kernel.org
---
 fs/btrfs/file.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 13664b315fe2..ab22ca4f237f 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -69,6 +69,19 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
 
 		/* Flush processor's dcache for this page */
 		flush_dcache_page(page);
+
+		/*
+		 * if we get a partial write, we can end up with
+		 * partially up to date pages.  These add
+		 * a lot of complexity, so make sure they don't
+		 * happen by forcing this copy to be retried.
+		 *
+		 * The rest of the btrfs_file_write code will fall
+		 * back to page at a time copies after we return 0.
+		 */
+		if (!PageUptodate(page) && copied < count)
+			copied = 0;
+
 		iov_iter_advance(i, copied);
 		write_bytes -= copied;
 		total_copied += copied;
-- 
cgit v1.2.3


From 32b007b4e19b50ff4d27ea8b69cd6d744cfec86b Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Sun, 6 Mar 2011 19:11:03 -0500
Subject: nfsd4: fix bad pointer on failure to find delegation

In case of a nonempty list, the return on error here is obviously bogus;
it ends up being a pointer to the list head instead of to any valid
delegation on the list.

In particular, if nfsd4_delegreturn() hits this case, and you're quite unlucky,
then renew_client may oops, and it may take an embarassingly long time to
figure out why.  Facepalm.

BUG: unable to handle kernel NULL pointer dereference at 0000000000000090
IP: [<ffffffff81292965>] nfsd4_delegreturn+0x125/0x200
...

Cc: stable@kernel.org
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 54b60bfceb8d..7b566ec14e18 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2445,15 +2445,16 @@ nfs4_check_delegmode(struct nfs4_delegation *dp, int flags)
 static struct nfs4_delegation *
 find_delegation_file(struct nfs4_file *fp, stateid_t *stid)
 {
-	struct nfs4_delegation *dp = NULL;
+	struct nfs4_delegation *dp;
 
 	spin_lock(&recall_lock);
-	list_for_each_entry(dp, &fp->fi_delegations, dl_perfile) {
-		if (dp->dl_stateid.si_stateownerid == stid->si_stateownerid)
-			break;
-	}
+	list_for_each_entry(dp, &fp->fi_delegations, dl_perfile)
+		if (dp->dl_stateid.si_stateownerid == stid->si_stateownerid) {
+			spin_unlock(&recall_lock);
+			return dp;
+		}
 	spin_unlock(&recall_lock);
-	return dp;
+	return NULL;
 }
 
 int share_access_to_flags(u32 share_access)
-- 
cgit v1.2.3


From dfef6dcd35cb4a251f6322ca9b2c06f0bb1aa1f4 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 8 Mar 2011 01:25:28 -0500
Subject: unfuck proc_sysctl ->d_compare()

a) struct inode is not going to be freed under ->d_compare();
however, the thing PROC_I(inode)->sysctl points to just might.
Fortunately, it's enough to make freeing that sucker delayed,
provided that we don't step on its ->unregistering, clear
the pointer to it in PROC_I(inode) before dropping the reference
and check if it's NULL in ->d_compare().

b) I'm not sure that we *can* walk into NULL inode here (we recheck
dentry->seq between verifying that it's still hashed / fetching
dentry->d_inode and passing it to ->d_compare() and there's no
negative hashed dentries in /proc/sys/*), but if we can walk into
that, we really should not have ->d_compare() return 0 on it!
Said that, I really suspect that this check can be simply killed.
Nick?

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/proc/inode.c        |  8 ++++++--
 fs/proc/proc_sysctl.c  |  7 +++++--
 include/linux/sysctl.h | 14 ++++++++++----
 kernel/sysctl.c        | 15 ++++++++++-----
 4 files changed, 31 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 176ce4cda68a..d6a7ca1fdac5 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -27,6 +27,7 @@
 static void proc_evict_inode(struct inode *inode)
 {
 	struct proc_dir_entry *de;
+	struct ctl_table_header *head;
 
 	truncate_inode_pages(&inode->i_data, 0);
 	end_writeback(inode);
@@ -38,8 +39,11 @@ static void proc_evict_inode(struct inode *inode)
 	de = PROC_I(inode)->pde;
 	if (de)
 		pde_put(de);
-	if (PROC_I(inode)->sysctl)
-		sysctl_head_put(PROC_I(inode)->sysctl);
+	head = PROC_I(inode)->sysctl;
+	if (head) {
+		rcu_assign_pointer(PROC_I(inode)->sysctl, NULL);
+		sysctl_head_put(head);
+	}
 }
 
 struct vfsmount *proc_mnt;
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 09a1f92a34ef..8eb2522111c5 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -408,15 +408,18 @@ static int proc_sys_compare(const struct dentry *parent,
 		const struct dentry *dentry, const struct inode *inode,
 		unsigned int len, const char *str, const struct qstr *name)
 {
+	struct ctl_table_header *head;
 	/* Although proc doesn't have negative dentries, rcu-walk means
 	 * that inode here can be NULL */
+	/* AV: can it, indeed? */
 	if (!inode)
-		return 0;
+		return 1;
 	if (name->len != len)
 		return 1;
 	if (memcmp(name->name, str, len))
 		return 1;
-	return !sysctl_is_seen(PROC_I(inode)->sysctl);
+	head = rcu_dereference(PROC_I(inode)->sysctl);
+	return !head || !sysctl_is_seen(head);
 }
 
 static const struct dentry_operations proc_sys_dentry_operations = {
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 7bb5cb64f3b8..bb7c2b086fa4 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -25,6 +25,7 @@
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/compiler.h>
+#include <linux/rcupdate.h>
 
 struct completion;
 
@@ -1037,10 +1038,15 @@ struct ctl_table_root {
    struct ctl_table trees. */
 struct ctl_table_header
 {
-	struct ctl_table *ctl_table;
-	struct list_head ctl_entry;
-	int used;
-	int count;
+	union {
+		struct {
+			struct ctl_table *ctl_table;
+			struct list_head ctl_entry;
+			int used;
+			int count;
+		};
+		struct rcu_head rcu;
+	};
 	struct completion *unregistering;
 	struct ctl_table *ctl_table_arg;
 	struct ctl_table_root *root;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0f1bd83db985..4eed0af5d144 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -194,9 +194,9 @@ static int sysrq_sysctl_handler(ctl_table *table, int write,
 static struct ctl_table root_table[];
 static struct ctl_table_root sysctl_table_root;
 static struct ctl_table_header root_table_header = {
-	.count = 1,
+	{{.count = 1,
 	.ctl_table = root_table,
-	.ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),
+	.ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),}},
 	.root = &sysctl_table_root,
 	.set = &sysctl_table_root.default_set,
 };
@@ -1567,11 +1567,16 @@ void sysctl_head_get(struct ctl_table_header *head)
 	spin_unlock(&sysctl_lock);
 }
 
+static void free_head(struct rcu_head *rcu)
+{
+	kfree(container_of(rcu, struct ctl_table_header, rcu));
+}
+
 void sysctl_head_put(struct ctl_table_header *head)
 {
 	spin_lock(&sysctl_lock);
 	if (!--head->count)
-		kfree(head);
+		call_rcu(&head->rcu, free_head);
 	spin_unlock(&sysctl_lock);
 }
 
@@ -1948,10 +1953,10 @@ void unregister_sysctl_table(struct ctl_table_header * header)
 	start_unregistering(header);
 	if (!--header->parent->count) {
 		WARN_ON(1);
-		kfree(header->parent);
+		call_rcu(&header->parent->rcu, free_head);
 	}
 	if (!--header->count)
-		kfree(header);
+		call_rcu(&header->rcu, free_head);
 	spin_unlock(&sysctl_lock);
 }
 
-- 
cgit v1.2.3


From ea8efc74bd0402b4d5f663d007b4e25fa29ea778 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 8 Mar 2011 11:54:40 -0500
Subject: Btrfs: make sure not to return overlapping extents to fiemap

The btrfs fiemap code was incorrectly returning duplicate or overlapping
extents in some cases.  cp was blindly trusting this result and we would
end up with a destination file that was bigger than the original because
some bytes were copied twice.

The fix here adjusts our offsets to make sure we're always moving
forward in the fiemap results.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c | 33 +++++++++++++++++++++++++++------
 1 file changed, 27 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index ff45b80d90f0..9fcb5ede6b72 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3046,17 +3046,38 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	}
 
 	while (!end) {
-		off = extent_map_end(em);
-		if (off >= max)
-			end = 1;
+		u64 offset_in_extent;
+
+		/* break if the extent we found is outside the range */
+		if (em->start >= max || extent_map_end(em) < off)
+			break;
+
+		/*
+		 * get_extent may return an extent that starts before our
+		 * requested range.  We have to make sure the ranges
+		 * we return to fiemap always move forward and don't
+		 * overlap, so adjust the offsets here
+		 */
+		em_start = max(em->start, off);
 
-		em_start = em->start;
-		em_len = em->len;
+		/*
+		 * record the offset from the start of the extent
+		 * for adjusting the disk offset below
+		 */
+		offset_in_extent = em_start - em->start;
 		em_end = extent_map_end(em);
+		em_len = em_end - em_start;
 		emflags = em->flags;
 		disko = 0;
 		flags = 0;
 
+		/*
+		 * bump off for our next call to get_extent
+		 */
+		off = extent_map_end(em);
+		if (off >= max)
+			end = 1;
+
 		if (em->block_start == EXTENT_MAP_LAST_BYTE) {
 			end = 1;
 			flags |= FIEMAP_EXTENT_LAST;
@@ -3067,7 +3088,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 			flags |= (FIEMAP_EXTENT_DELALLOC |
 				  FIEMAP_EXTENT_UNKNOWN);
 		} else {
-			disko = em->block_start;
+			disko = em->block_start + offset_in_extent;
 		}
 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
 			flags |= FIEMAP_EXTENT_ENCODED;
-- 
cgit v1.2.3


From 3ec07aa9522e3d5e9d5ede7bef946756e623a0a0 Mon Sep 17 00:00:00 2001
From: roel <roel.kluin@gmail.com>
Date: Tue, 8 Mar 2011 22:32:26 +0100
Subject: nfsd: wrong index used in inner loop

Index i was already used in the outer loop

Cc: stable@kernel.org
Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4xdr.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 1275b8655070..615f0a9f0600 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1142,7 +1142,7 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
 
 	u32 dummy;
 	char *machine_name;
-	int i;
+	int i, j;
 	int nr_secflavs;
 
 	READ_BUF(16);
@@ -1215,7 +1215,7 @@ nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
 			READ_BUF(4);
 			READ32(dummy);
 			READ_BUF(dummy * 4);
-			for (i = 0; i < dummy; ++i)
+			for (j = 0; j < dummy; ++j)
 				READ32(dummy);
 			break;
 		case RPC_AUTH_GSS:
-- 
cgit v1.2.3


From b306419ae08d9def53f2142a37cc0a58622307a8 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 8 Mar 2011 21:16:28 -0500
Subject: nd->inode is not set on the second attempt in path_walk()

We leave it at whatever it had been pointing to after the
first link_path_walk() had failed with -ESTALE.  Things
do not work well after that...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index a5e844fe4b28..a4689eb2df28 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1546,6 +1546,7 @@ static int path_walk(const char *name, struct nameidata *nd)
 		/* nd->path had been dropped */
 		current->total_link_count = 0;
 		nd->path = save;
+		nd->inode = save.dentry->d_inode;
 		path_get(&nd->path);
 		nd->flags |= LOOKUP_REVAL;
 		result = link_path_walk(name, nd);
-- 
cgit v1.2.3


From ae50adcb0ac4cde67a7aec8ae67249d1b2be2948 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 16 Feb 2011 10:04:50 -0500
Subject: /proc/self is never going to be invalidated...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/proc/base.c | 30 ------------------------------
 1 file changed, 30 deletions(-)

(limited to 'fs')

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 9d096e82b201..d49c4b5d2c3e 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2620,35 +2620,6 @@ static const struct pid_entry proc_base_stuff[] = {
 		&proc_self_inode_operations, NULL, {}),
 };
 
-/*
- *	Exceptional case: normally we are not allowed to unhash a busy
- * directory. In this case, however, we can do it - no aliasing problems
- * due to the way we treat inodes.
- */
-static int proc_base_revalidate(struct dentry *dentry, struct nameidata *nd)
-{
-	struct inode *inode;
-	struct task_struct *task;
-
-	if (nd->flags & LOOKUP_RCU)
-		return -ECHILD;
-
-	inode = dentry->d_inode;
-	task = get_proc_task(inode);
-	if (task) {
-		put_task_struct(task);
-		return 1;
-	}
-	d_drop(dentry);
-	return 0;
-}
-
-static const struct dentry_operations proc_base_dentry_operations =
-{
-	.d_revalidate	= proc_base_revalidate,
-	.d_delete	= pid_delete_dentry,
-};
-
 static struct dentry *proc_base_instantiate(struct inode *dir,
 	struct dentry *dentry, struct task_struct *task, const void *ptr)
 {
@@ -2685,7 +2656,6 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
 	if (p->fop)
 		inode->i_fop = p->fop;
 	ei->op = p->op;
-	d_set_d_op(dentry, &proc_base_dentry_operations);
 	d_add(dentry, inode);
 	error = NULL;
 out:
-- 
cgit v1.2.3


From c78f4cc5e7d642c7009089817c12d8984e7ba872 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 16 Feb 2011 10:14:56 -0500
Subject: reiserfs xattr ->d_revalidate() shouldn't care about RCU

... it returns an error unconditionally

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/reiserfs/xattr.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 3cfb2e933644..5c11ca82b782 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -978,8 +978,6 @@ int reiserfs_permission(struct inode *inode, int mask, unsigned int flags)
 
 static int xattr_hide_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-	if (nd->flags & LOOKUP_RCU)
-		return -ECHILD;
 	return -EPERM;
 }
 
-- 
cgit v1.2.3


From 0eb980e31770cfeff6e27760b4692d595b8dbf28 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 10 Mar 2011 03:44:05 -0500
Subject: ceph: fix d_revalidate oopsen on NFS exports

can't blindly check nd->flags in ->d_revalidate()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ceph/dir.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 099a58615b90..ebafa65a29b6 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -993,7 +993,7 @@ static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
 	struct inode *dir;
 
-	if (nd->flags & LOOKUP_RCU)
+	if (nd && nd->flags & LOOKUP_RCU)
 		return -ECHILD;
 
 	dir = dentry->d_parent->d_inode;
-- 
cgit v1.2.3


From 529c5f958f9e60abaa7407986034b17d17536bf2 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 10 Mar 2011 03:44:31 -0500
Subject: fuse: fix d_revalidate oopsen on NFS exports

can't blindly check nd->flags in ->d_revalidate()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/fuse/dir.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 83543b5ff941..8bd0ef9286c3 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -158,7 +158,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
 {
 	struct inode *inode;
 
-	if (nd->flags & LOOKUP_RCU)
+	if (nd && nd->flags & LOOKUP_RCU)
 		return -ECHILD;
 
 	inode = entry->d_inode;
-- 
cgit v1.2.3


From 53fe924161ff18d24c5c1c256549e9c1b9874827 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 10 Mar 2011 03:44:48 -0500
Subject: gfs2: fix d_revalidate oopsen on NFS exports

can't blindly check nd->flags in ->d_revalidate()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/gfs2/dentry.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c
index 4a456338b873..0da8da2c991d 100644
--- a/fs/gfs2/dentry.c
+++ b/fs/gfs2/dentry.c
@@ -44,7 +44,7 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
 	int error;
 	int had_lock = 0;
 
-	if (nd->flags & LOOKUP_RCU)
+	if (nd && nd->flags & LOOKUP_RCU)
 		return -ECHILD;
 
 	parent = dget_parent(dentry);
-- 
cgit v1.2.3


From 4714e63731a8a641b5e0ed5e2e2191c13bf2d71a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 10 Mar 2011 03:45:07 -0500
Subject: ocfs2: fix d_revalidate oopsen on NFS exports

can't blindly check nd->flags in ->d_revalidate()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ocfs2/dcache.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 6d80ecc7834f..7eb90403fc8a 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -56,7 +56,7 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
 	int ret = 0;    /* if all else fails, just return false */
 	struct ocfs2_super *osb;
 
-	if (nd->flags & LOOKUP_RCU)
+	if (nd && nd->flags & LOOKUP_RCU)
 		return -ECHILD;
 
 	inode = dentry->d_inode;
-- 
cgit v1.2.3


From 8ce84eeb5b40da21f20174dd25891a8409534237 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 10 Mar 2011 03:45:28 -0500
Subject: jfs: fix d_revalidate oopsen on NFS exports

can't blindly check nd->flags in ->d_revalidate()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/jfs/namei.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 81ead850ddb6..5a2b269428a6 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -1600,7 +1600,7 @@ out:
 
 static int jfs_ci_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-	if (nd->flags & LOOKUP_RCU)
+	if (nd && nd->flags & LOOKUP_RCU)
 		return -ECHILD;
 	/*
 	 * This is not negative dentry. Always valid.
-- 
cgit v1.2.3


From 9177ada99d5e69fe91950b3ef5c23f2bcd109987 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 10 Mar 2011 03:45:49 -0500
Subject: fat: fix d_revalidate oopsen on NFS exports

can't blindly check nd->flags in ->d_revalidate()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/fat/namei_vfat.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index f88f752babd9..adae3fb7451a 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -43,7 +43,7 @@ static int vfat_revalidate_shortname(struct dentry *dentry)
 
 static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-	if (nd->flags & LOOKUP_RCU)
+	if (nd && nd->flags & LOOKUP_RCU)
 		return -ECHILD;
 
 	/* This is not negative dentry. Always valid. */
@@ -54,7 +54,7 @@ static int vfat_revalidate(struct dentry *dentry, struct nameidata *nd)
 
 static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd)
 {
-	if (nd->flags & LOOKUP_RCU)
+	if (nd && nd->flags & LOOKUP_RCU)
 		return -ECHILD;
 
 	/*
-- 
cgit v1.2.3


From 1ca551c6caae7b52178555cdedea6ca26444be46 Mon Sep 17 00:00:00 2001
From: Marco Stornelli <marco.stornelli@gmail.com>
Date: Sat, 5 Mar 2011 11:10:19 +0100
Subject: Check for immutable/append flag in fallocate path

In the fallocate path the kernel doesn't check for the immutable/append
flag. It's possible to have a race condition in this scenario: an
application open a file in read/write and it does something, meanwhile
root set the immutable flag on the file, the application at that point
can call fallocate with success. In addition, we don't allow to do any
unreserve operation on an append only file but only the reserve one.

Signed-off-by: Marco Stornelli <marco.stornelli@gmail.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/open.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'fs')

diff --git a/fs/open.c b/fs/open.c
index 5a2c6ebc22b5..b47aab39c057 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -233,6 +233,14 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 
 	if (!(file->f_mode & FMODE_WRITE))
 		return -EBADF;
+
+	/* It's not possible punch hole on append only file */
+	if (mode & FALLOC_FL_PUNCH_HOLE && IS_APPEND(inode))
+		return -EPERM;
+
+	if (IS_IMMUTABLE(inode))
+		return -EPERM;
+
 	/*
 	 * Revalidate the write permissions, in case security policy has
 	 * changed since the files were opened.
-- 
cgit v1.2.3


From d891eedbc3b1b0fade8a9ce60cc0eba1cccb59e5 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@fieldses.org>
Date: Tue, 18 Jan 2011 15:45:09 -0500
Subject: fs/dcache: allow d_obtain_alias() to return unhashed dentries
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Without this patch, inodes are not promptly freed on last close of an
unlinked file by an nfs client:

	client$ mount -tnfs4 server:/export/ /mnt/
	client$ tail -f /mnt/FOO
	...
	server$ df -i /export
	server$ rm /export/FOO
	(^C the tail -f)
	server$ df -i /export
	server$ echo 2 >/proc/sys/vm/drop_caches
	server$ df -i /export

the df's will show that the inode is not freed on the filesystem until
the last step, when it could have been freed after killing the client's
tail -f. On-disk data won't be deallocated either, leading to possible
spurious ENOSPC.

This occurs because when the client does the close, it arrives in a
compound with a putfh and a close, processed like:

	- putfh: look up the filehandle.  The only alias found for the
	  inode will be DCACHE_UNHASHED alias referenced by the filp
	  this, so it creates a new DCACHE_DISCONECTED dentry and
	  returns that instead.
	- close: closes the existing filp, which is destroyed
	  immediately by dput() since it's DCACHE_UNHASHED.
	- end of the compound: release the reference
	  to the current filehandle, and dput() the new
	  DCACHE_DISCONECTED dentry, which gets put on the
	  unused list instead of being destroyed immediately.

Nick Piggin suggested fixing this by allowing d_obtain_alias to return
the unhashed dentry that is referenced by the filp, instead of making it
create a new dentry.

Leave __d_find_alias() alone to avoid changing behavior of other
callers.

Also nfsd doesn't need all the checks of __d_find_alias(); any dentry,
hashed or unhashed, disconnected or not, should work.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 2a6bd9a4ae97..611ffe928c03 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1523,6 +1523,28 @@ struct dentry * d_alloc_root(struct inode * root_inode)
 }
 EXPORT_SYMBOL(d_alloc_root);
 
+static struct dentry * __d_find_any_alias(struct inode *inode)
+{
+	struct dentry *alias;
+
+	if (list_empty(&inode->i_dentry))
+		return NULL;
+	alias = list_first_entry(&inode->i_dentry, struct dentry, d_alias);
+	__dget(alias);
+	return alias;
+}
+
+static struct dentry * d_find_any_alias(struct inode *inode)
+{
+	struct dentry *de;
+
+	spin_lock(&inode->i_lock);
+	de = __d_find_any_alias(inode);
+	spin_unlock(&inode->i_lock);
+	return de;
+}
+
+
 /**
  * d_obtain_alias - find or allocate a dentry for a given inode
  * @inode: inode to allocate the dentry for
@@ -1552,7 +1574,7 @@ struct dentry *d_obtain_alias(struct inode *inode)
 	if (IS_ERR(inode))
 		return ERR_CAST(inode);
 
-	res = d_find_alias(inode);
+	res = d_find_any_alias(inode);
 	if (res)
 		goto out_iput;
 
@@ -1565,7 +1587,7 @@ struct dentry *d_obtain_alias(struct inode *inode)
 
 
 	spin_lock(&inode->i_lock);
-	res = __d_find_alias(inode, 0);
+	res = __d_find_any_alias(inode);
 	if (res) {
 		spin_unlock(&inode->i_lock);
 		dput(tmp);
-- 
cgit v1.2.3


From b4966b7770349deb05e3dd2bd2c65d2d044abbbb Mon Sep 17 00:00:00 2001
From: Daniel J Blueman <daniel.blueman@gmail.com>
Date: Wed, 9 Mar 2011 16:46:42 +0000
Subject: btrfs: fix dip leak

The btrfs DIO code leaks dip structs when dip->csums allocation
fails; bio->bi_end_io isn't set at the point where the free_ordered
branch is consequently taken, thus bio_endio doesn't call the function
which would free it in the normal case. Fix.

Signed-off-by: Daniel J Blueman <daniel.blueman@gmail.com>
Acked-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 44b926646e33..e7a8303328b2 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6058,6 +6058,7 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
 	if (!skip_sum) {
 		dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
 		if (!dip->csums) {
+			kfree(dip);
 			ret = -ENOMEM;
 			goto free_ordered;
 		}
-- 
cgit v1.2.3


From 7e6b6465e6efbca3985258996be9c189da96c8bf Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Fri, 18 Feb 2011 09:21:17 +0000
Subject: btrfs: fix not enough reserved space

btrfs_link() will insert 3 items(inode ref, dir name item and dir index item)
into the b+ tree and update 2 items(its inode, and parent's inode) in the b+
tree. So we should reserve space for these 5 items, not 3 items.

Reported-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e7a8303328b2..db67821ccac2 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4823,10 +4823,11 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 		goto fail;
 
 	/*
-	 * 1 item for inode ref
+	 * 2 items for inode and inode ref
 	 * 2 items for dir items
+	 * 1 item for parent inode
 	 */
-	trans = btrfs_start_transaction(root, 3);
+	trans = btrfs_start_transaction(root, 5);
 	if (IS_ERR(trans)) {
 		err = PTR_ERR(trans);
 		goto fail;
-- 
cgit v1.2.3


From bf294b41cefcb22fc3139e0f42c5b3f06728bd5e Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Mon, 21 Feb 2011 11:05:41 -0800
Subject: SUNRPC: Close a race in __rpc_wait_for_completion_task()

Although they run as rpciod background tasks, under normal operation
(i.e. no SIGKILL), functions like nfs_sillyrename(), nfs4_proc_unlck()
and nfs4_do_close() want to be fully synchronous. This means that when we
exit, we want all references to the rpc_task to be gone, and we want
any dentry references etc. held by that task to be released.

For this reason these functions call __rpc_wait_for_completion_task(),
followed by rpc_put_task() in the expectation that the latter will be
releasing the last reference to the rpc_task, and thus ensuring that the
callback_ops->rpc_release() has been called synchronously.

This patch fixes a race which exists due to the fact that
rpciod calls rpc_complete_task() (in order to wake up the callers of
__rpc_wait_for_completion_task()) and then subsequently calls
rpc_put_task() without ensuring that these two steps are done atomically.

In order to avoid adding new spin locks, the patch uses the existing
waitqueue spin lock to order the rpc_task reference count releases between
the waiting process and rpciod.
The common case where nobody is waiting for completion is optimised for by
checking if the RPC_TASK_ASYNC flag is cleared and/or if the rpc_task
reference count is 1: in those cases we drop trying to grab the spin lock,
and immediately free up the rpc_task.

Those few processes that need to put the rpc_task from inside an
asynchronous context and that do not care about ordering are given a new
helper: rpc_put_task_async().

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c            |  4 +--
 fs/nfs/unlink.c              |  2 +-
 include/linux/sunrpc/sched.h |  1 +
 kernel/sched.c               |  1 +
 net/sunrpc/sched.c           | 75 +++++++++++++++++++++++++++++++++++---------
 5 files changed, 66 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 1ff76acc7e98..d1ed67145cf3 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -4150,7 +4150,7 @@ static void nfs4_lock_release(void *calldata)
 		task = nfs4_do_unlck(&data->fl, data->ctx, data->lsp,
 				data->arg.lock_seqid);
 		if (!IS_ERR(task))
-			rpc_put_task(task);
+			rpc_put_task_async(task);
 		dprintk("%s: cancelling lock!\n", __func__);
 	} else
 		nfs_free_seqid(data->arg.lock_seqid);
@@ -5227,7 +5227,7 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cr
 	if (IS_ERR(task))
 		ret = PTR_ERR(task);
 	else
-		rpc_put_task(task);
+		rpc_put_task_async(task);
 	dprintk("<-- %s status=%d\n", __func__, ret);
 	return ret;
 }
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index e313a51acdd1..6481d537d69d 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -180,7 +180,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n
 	task_setup_data.rpc_client = NFS_CLIENT(dir);
 	task = rpc_run_task(&task_setup_data);
 	if (!IS_ERR(task))
-		rpc_put_task(task);
+		rpc_put_task_async(task);
 	return 1;
 }
 
diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index 88513fd8e208..d81db8012c63 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -212,6 +212,7 @@ struct rpc_task *rpc_run_task(const struct rpc_task_setup *);
 struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req,
 				const struct rpc_call_ops *ops);
 void		rpc_put_task(struct rpc_task *);
+void		rpc_put_task_async(struct rpc_task *);
 void		rpc_exit_task(struct rpc_task *);
 void		rpc_exit(struct rpc_task *, int);
 void		rpc_release_calldata(const struct rpc_call_ops *, void *);
diff --git a/kernel/sched.c b/kernel/sched.c
index 18d38e4ec7ba..42eab5a8437d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4213,6 +4213,7 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
 {
 	__wake_up_common(q, mode, 1, 0, key);
 }
+EXPORT_SYMBOL_GPL(__wake_up_locked_key);
 
 /**
  * __wake_up_sync_key - wake up threads blocked on a waitqueue.
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 243fc09b164e..59e599498e37 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -252,23 +252,37 @@ static void rpc_set_active(struct rpc_task *task)
 
 /*
  * Mark an RPC call as having completed by clearing the 'active' bit
+ * and then waking up all tasks that were sleeping.
  */
-static void rpc_mark_complete_task(struct rpc_task *task)
+static int rpc_complete_task(struct rpc_task *task)
 {
-	smp_mb__before_clear_bit();
+	void *m = &task->tk_runstate;
+	wait_queue_head_t *wq = bit_waitqueue(m, RPC_TASK_ACTIVE);
+	struct wait_bit_key k = __WAIT_BIT_KEY_INITIALIZER(m, RPC_TASK_ACTIVE);
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&wq->lock, flags);
 	clear_bit(RPC_TASK_ACTIVE, &task->tk_runstate);
-	smp_mb__after_clear_bit();
-	wake_up_bit(&task->tk_runstate, RPC_TASK_ACTIVE);
+	ret = atomic_dec_and_test(&task->tk_count);
+	if (waitqueue_active(wq))
+		__wake_up_locked_key(wq, TASK_NORMAL, &k);
+	spin_unlock_irqrestore(&wq->lock, flags);
+	return ret;
 }
 
 /*
  * Allow callers to wait for completion of an RPC call
+ *
+ * Note the use of out_of_line_wait_on_bit() rather than wait_on_bit()
+ * to enforce taking of the wq->lock and hence avoid races with
+ * rpc_complete_task().
  */
 int __rpc_wait_for_completion_task(struct rpc_task *task, int (*action)(void *))
 {
 	if (action == NULL)
 		action = rpc_wait_bit_killable;
-	return wait_on_bit(&task->tk_runstate, RPC_TASK_ACTIVE,
+	return out_of_line_wait_on_bit(&task->tk_runstate, RPC_TASK_ACTIVE,
 			action, TASK_KILLABLE);
 }
 EXPORT_SYMBOL_GPL(__rpc_wait_for_completion_task);
@@ -857,34 +871,67 @@ static void rpc_async_release(struct work_struct *work)
 	rpc_free_task(container_of(work, struct rpc_task, u.tk_work));
 }
 
-void rpc_put_task(struct rpc_task *task)
+static void rpc_release_resources_task(struct rpc_task *task)
 {
-	if (!atomic_dec_and_test(&task->tk_count))
-		return;
-	/* Release resources */
 	if (task->tk_rqstp)
 		xprt_release(task);
 	if (task->tk_msg.rpc_cred)
 		put_rpccred(task->tk_msg.rpc_cred);
 	rpc_task_release_client(task);
-	if (task->tk_workqueue != NULL) {
+}
+
+static void rpc_final_put_task(struct rpc_task *task,
+		struct workqueue_struct *q)
+{
+	if (q != NULL) {
 		INIT_WORK(&task->u.tk_work, rpc_async_release);
-		queue_work(task->tk_workqueue, &task->u.tk_work);
+		queue_work(q, &task->u.tk_work);
 	} else
 		rpc_free_task(task);
 }
+
+static void rpc_do_put_task(struct rpc_task *task, struct workqueue_struct *q)
+{
+	if (atomic_dec_and_test(&task->tk_count)) {
+		rpc_release_resources_task(task);
+		rpc_final_put_task(task, q);
+	}
+}
+
+void rpc_put_task(struct rpc_task *task)
+{
+	rpc_do_put_task(task, NULL);
+}
 EXPORT_SYMBOL_GPL(rpc_put_task);
 
+void rpc_put_task_async(struct rpc_task *task)
+{
+	rpc_do_put_task(task, task->tk_workqueue);
+}
+EXPORT_SYMBOL_GPL(rpc_put_task_async);
+
 static void rpc_release_task(struct rpc_task *task)
 {
 	dprintk("RPC: %5u release task\n", task->tk_pid);
 
 	BUG_ON (RPC_IS_QUEUED(task));
 
-	/* Wake up anyone who is waiting for task completion */
-	rpc_mark_complete_task(task);
+	rpc_release_resources_task(task);
 
-	rpc_put_task(task);
+	/*
+	 * Note: at this point we have been removed from rpc_clnt->cl_tasks,
+	 * so it should be safe to use task->tk_count as a test for whether
+	 * or not any other processes still hold references to our rpc_task.
+	 */
+	if (atomic_read(&task->tk_count) != 1 + !RPC_IS_ASYNC(task)) {
+		/* Wake up anyone who may be waiting for task completion */
+		if (!rpc_complete_task(task))
+			return;
+	} else {
+		if (!atomic_dec_and_test(&task->tk_count))
+			return;
+	}
+	rpc_final_put_task(task, task->tk_workqueue);
 }
 
 int rpciod_up(void)
-- 
cgit v1.2.3


From d2224e7afbf2a6556f4f8f25bc0e96d99ec4d2bd Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Sun, 6 Mar 2011 17:14:13 +0000
Subject: nfs: close NFSv4 COMMIT vs. CLOSE race

I've been adding in more artificial delays in the NFSv4 commit and close
codepaths to uncover races. The kernel I'm testing has the patch to
close the race in __rpc_wait_for_completion_task that's in Trond's
cthon2011 branch. The reproducer I've been using does this in a loop:

	mkdir("DIR");
	fd = open("DIR/FILE", O_WRONLY|O_CREAT|O_EXCL, 0644);
	write(fd, "abcdefg", 7);
	close(fd);
	unlink("DIR/FILE");
	rmdir("DIR");

The above reproducer shouldn't result in any silly-renaming. However,
when I add a "msleep(100)" just after the nfs_commit_clear_lock call in
nfs_commit_release, I can almost always force one to occur. If I can
force it to occur with that, then it can happen without that delay
given the right timing.

nfs_commit_inode waits for the NFS_INO_COMMIT bit to clear when called
with FLUSH_SYNC set. nfs_commit_rpcsetup on the other hand does not wait
for the task to complete before putting its reference to it, so the last
reference get put in rpc_release task and gets queued to a workqueue.

In this situation, the last open context reference may be put by the
COMMIT release instead of the close() syscall. The close() syscall
returns too quickly and the unlink runs while the d_count is still
high since the COMMIT release hasn't put its dentry reference yet.

Fix this by having rpc_commit_rpcsetup wait for the RPC call to complete
before putting the task reference when FLUSH_SYNC is set. With this, the
last reference is put by the process that's initiating the FLUSH_SYNC
commit and the race is closed.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/write.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index c8278f4046cb..42b92d7a9cc4 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1292,6 +1292,8 @@ static int nfs_commit_rpcsetup(struct list_head *head,
 	task = rpc_run_task(&task_setup_data);
 	if (IS_ERR(task))
 		return PTR_ERR(task);
+	if (how & FLUSH_SYNC)
+		rpc_wait_for_completion_task(task);
 	rpc_put_task(task);
 	return 0;
 }
-- 
cgit v1.2.3


From b9f810570d9cc13177128e11a74e22d37aa68a1a Mon Sep 17 00:00:00 2001
From: Stanislav Fomichev <kernel@fomichev.me>
Date: Sat, 5 Feb 2011 23:13:01 +0000
Subject: nfs: add kmalloc return value check in decode_and_add_ds

add kmalloc return value check in decode_and_add_ds

Signed-off-by: Stanislav Fomichev <kernel@fomichev.me>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4filelayoutdev.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index f5c9b125e8cc..b73c34375f60 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -219,6 +219,10 @@ decode_and_add_ds(__be32 **pp, struct inode *inode)
 		goto out_err;
 	}
 	buf = kmalloc(rlen + 1, GFP_KERNEL);
+	if (!buf) {
+		dprintk("%s: Not enough memory\n", __func__);
+		goto out_err;
+	}
 	buf[rlen] = '\0';
 	memcpy(buf, r_addr, rlen);
 
-- 
cgit v1.2.3


From 43b7c3f051dea504afccc39bcb56d8e26c2e0b77 Mon Sep 17 00:00:00 2001
From: Jovi Zhang <bookjovi@gmail.com>
Date: Wed, 2 Mar 2011 23:19:37 +0000
Subject: nfs: fix compilation warning

this commit fix compilation warning as following:
linux-2.6/fs/nfs/nfs4proc.c:3265: warning: comparison of distinct pointer types lacks a cast

Signed-off-by: Jovi Zhang <bookjovi@gmail.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d1ed67145cf3..b07d4e23b876 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3262,7 +3262,7 @@ static int buf_to_pages_noslab(const void *buf, size_t buflen,
 	spages = pages;
 
 	do {
-		len = min(PAGE_CACHE_SIZE, buflen);
+		len = min_t(size_t, PAGE_CACHE_SIZE, buflen);
 		newpage = alloc_page(GFP_KERNEL);
 
 		if (newpage == NULL)
-- 
cgit v1.2.3


From 3fa0b4e201d254b52a251fa348bd53e53000cff6 Mon Sep 17 00:00:00 2001
From: Frank Filz <ffilzlnx@us.ibm.com>
Date: Thu, 2 Dec 2010 19:31:23 +0000
Subject: (try3-resend) Fix nfs_compat_user_ino64 so it doesn't cause problems
 if bit 31 or 63 are set in fileid

The problem was use of an int32, which when converted to a uint64
is sign extended resulting in a fileid that doesn't fit in 32 bits
even though the intent of the function is to fit the fileid into
32 bits.

Signed-off-by: Frank Filz <ffilzlnx@us.ibm.com>
Reviewed-by: Jeff Layton <jlayton@redhat.com>
[Trond: Added an include for compat.h]
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/inode.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 1cc600e77bb4..2f8e61816d75 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -37,6 +37,7 @@
 #include <linux/inet.h>
 #include <linux/nfs_xdr.h>
 #include <linux/slab.h>
+#include <linux/compat.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -89,7 +90,11 @@ int nfs_wait_bit_killable(void *word)
  */
 u64 nfs_compat_user_ino64(u64 fileid)
 {
-	int ino;
+#ifdef CONFIG_COMPAT
+	compat_ulong_t ino;
+#else	
+	unsigned long ino;
+#endif
 
 	if (enable_ino64)
 		return fileid;
-- 
cgit v1.2.3


From 7d6d63d6427090cbb1d282364b65b12634ca59bd Mon Sep 17 00:00:00 2001
From: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
Date: Wed, 9 Mar 2011 13:13:44 -0500
Subject: NFSv4.1: Retry CREATE_SESSION on NFS4ERR_DELAY

Fix bug where we currently retry the EXCHANGEID call again, eventhough
we already have a valid clientid.  Instead, delay and retry the CREATE_SESSION
call.

Signed-off-by: Ricardo Labiaga <Ricardo.Labiaga@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index b07d4e23b876..d3c705aa71f6 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5028,10 +5028,20 @@ int nfs4_proc_create_session(struct nfs_client *clp)
 	int status;
 	unsigned *ptr;
 	struct nfs4_session *session = clp->cl_session;
+	long timeout = 0;
+	int err;
 
 	dprintk("--> %s clp=%p session=%p\n", __func__, clp, session);
 
-	status = _nfs4_proc_create_session(clp);
+	do {
+		status = _nfs4_proc_create_session(clp);
+		if (status == -NFS4ERR_DELAY) {
+			err = nfs4_delay(clp->cl_rpcclient, &timeout);
+			if (err)
+				status = err;
+		}
+	} while (status == -NFS4ERR_DELAY);
+
 	if (status)
 		goto out;
 
-- 
cgit v1.2.3


From 114f64b5f24abac33a42f4f1856eb3a9766d497e Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Wed, 9 Mar 2011 13:13:45 -0500
Subject: NFSv4: remove duplicate clientid in struct nfs_client

Signed-off-by: Andy Adamson <andros@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4xdr.c          | 4 ++--
 include/linux/nfs_fs_sb.h | 2 --
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 4e2c168b6ee9..94d50e86a124 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1660,7 +1660,7 @@ static void encode_create_session(struct xdr_stream *xdr,
 
 	p = reserve_space(xdr, 20 + 2*28 + 20 + len + 12);
 	*p++ = cpu_to_be32(OP_CREATE_SESSION);
-	p = xdr_encode_hyper(p, clp->cl_ex_clid);
+	p = xdr_encode_hyper(p, clp->cl_clientid);
 	*p++ = cpu_to_be32(clp->cl_seqid);			/*Sequence id */
 	*p++ = cpu_to_be32(args->flags);			/*flags */
 
@@ -4694,7 +4694,7 @@ static int decode_exchange_id(struct xdr_stream *xdr,
 	p = xdr_inline_decode(xdr, 8);
 	if (unlikely(!p))
 		goto out_overflow;
-	xdr_decode_hyper(p, &clp->cl_ex_clid);
+	xdr_decode_hyper(p, &clp->cl_clientid);
 	p = xdr_inline_decode(xdr, 12);
 	if (unlikely(!p))
 		goto out_overflow;
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index b197563913bf..6beb33e834ba 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -71,8 +71,6 @@ struct nfs_client {
 #endif /* CONFIG_NFS_V4 */
 
 #ifdef CONFIG_NFS_V4_1
-	/* clientid returned from EXCHANGE_ID, used by session operations */
-	u64			cl_ex_clid;
 	/* The sequence id to use for the next CREATE_SESSION */
 	u32			cl_seqid;
 	/* The flags used for obtaining the clientid during EXCHANGE_ID */
-- 
cgit v1.2.3


From c34c32ea97718bb24fc06158733580003ba89211 Mon Sep 17 00:00:00 2001
From: Andy Adamson <andros@netapp.com>
Date: Wed, 9 Mar 2011 13:13:46 -0500
Subject: NFSv4.1 reclaim complete must wait for completion

Signed-off-by: Andy Adamson <andros@netapp.com>
[Trond: fix whitespace errors]
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d3c705aa71f6..0a13ae720dd0 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5359,6 +5359,9 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp)
 		status = PTR_ERR(task);
 		goto out;
 	}
+	status = nfs4_wait_for_completion_rpc_task(task);
+	if (status == 0)
+		status = task->tk_status;
 	rpc_put_task(task);
 	return 0;
 out:
-- 
cgit v1.2.3


From 0400a6b0cb756f976bae32ae8db47bfa9853897c Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 9 Mar 2011 16:00:53 -0500
Subject: NFSv4/4.1: Fix nfs4_schedule_state_recovery abuses

nfs4_schedule_state_recovery() should only be used when we need to force
the state manager to check the lease. If we just want to start the
state manager in order to handle a state recovery situation, we should be
using nfs4_schedule_state_manager().

This patch fixes the abuses of nfs4_schedule_state_recovery() by replacing
its use with a set of helper functions that do the right thing.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4_fs.h          |  8 +++++++-
 fs/nfs/nfs4proc.c         | 45 +++++++++++++++++++++++----------------------
 fs/nfs/nfs4state.c        | 25 +++++++++++++++++++------
 include/linux/nfs_fs_sb.h |  8 +++-----
 4 files changed, 52 insertions(+), 34 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 7a7474073148..54ff900cb8f0 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -298,6 +298,11 @@ struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp);
 #if defined(CONFIG_NFS_V4_1)
 struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp);
 struct rpc_cred *nfs4_get_exchange_id_cred(struct nfs_client *clp);
+extern void nfs4_schedule_session_recovery(struct nfs4_session *);
+#else
+static inline void nfs4_schedule_session_recovery(struct nfs4_session *session)
+{
+}
 #endif /* CONFIG_NFS_V4_1 */
 
 extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *);
@@ -307,8 +312,9 @@ extern void nfs4_put_open_state(struct nfs4_state *);
 extern void nfs4_close_state(struct path *, struct nfs4_state *, fmode_t);
 extern void nfs4_close_sync(struct path *, struct nfs4_state *, fmode_t);
 extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t);
-extern void nfs4_schedule_state_recovery(struct nfs_client *);
+extern void nfs4_schedule_lease_recovery(struct nfs_client *);
 extern void nfs4_schedule_state_manager(struct nfs_client *);
+extern void nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *);
 extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state);
 extern int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state);
 extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 0a13ae720dd0..411dc80d0656 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -257,12 +257,13 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
 		case -NFS4ERR_OPENMODE:
 			if (state == NULL)
 				break;
-			nfs4_state_mark_reclaim_nograce(clp, state);
-			goto do_state_recovery;
+			nfs4_schedule_stateid_recovery(server, state);
+			goto wait_on_recovery;
 		case -NFS4ERR_STALE_STATEID:
 		case -NFS4ERR_STALE_CLIENTID:
 		case -NFS4ERR_EXPIRED:
-			goto do_state_recovery;
+			nfs4_schedule_lease_recovery(clp);
+			goto wait_on_recovery;
 #if defined(CONFIG_NFS_V4_1)
 		case -NFS4ERR_BADSESSION:
 		case -NFS4ERR_BADSLOT:
@@ -273,7 +274,7 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
 		case -NFS4ERR_SEQ_MISORDERED:
 			dprintk("%s ERROR: %d Reset session\n", __func__,
 				errorcode);
-			nfs4_schedule_state_recovery(clp);
+			nfs4_schedule_session_recovery(clp->cl_session);
 			exception->retry = 1;
 			break;
 #endif /* defined(CONFIG_NFS_V4_1) */
@@ -296,8 +297,7 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode,
 	}
 	/* We failed to handle the error */
 	return nfs4_map_errors(ret);
-do_state_recovery:
-	nfs4_schedule_state_recovery(clp);
+wait_on_recovery:
 	ret = nfs4_wait_clnt_recover(clp);
 	if (ret == 0)
 		exception->retry = 1;
@@ -1256,14 +1256,13 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
 			case -NFS4ERR_BAD_HIGH_SLOT:
 			case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
 			case -NFS4ERR_DEADSESSION:
-				nfs4_schedule_state_recovery(
-					server->nfs_client);
+				nfs4_schedule_session_recovery(server->nfs_client->cl_session);
 				goto out;
 			case -NFS4ERR_STALE_CLIENTID:
 			case -NFS4ERR_STALE_STATEID:
 			case -NFS4ERR_EXPIRED:
 				/* Don't recall a delegation if it was lost */
-				nfs4_schedule_state_recovery(server->nfs_client);
+				nfs4_schedule_lease_recovery(server->nfs_client);
 				goto out;
 			case -ERESTARTSYS:
 				/*
@@ -1272,7 +1271,7 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
 				 */
 			case -NFS4ERR_ADMIN_REVOKED:
 			case -NFS4ERR_BAD_STATEID:
-				nfs4_state_mark_reclaim_nograce(server->nfs_client, state);
+				nfs4_schedule_stateid_recovery(server, state);
 			case -EKEYEXPIRED:
 				/*
 				 * User RPCSEC_GSS context has expired.
@@ -1588,7 +1587,7 @@ static int nfs4_recover_expired_lease(struct nfs_server *server)
 		if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) &&
 		    !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state))
 			break;
-		nfs4_schedule_state_recovery(clp);
+		nfs4_schedule_state_manager(clp);
 		ret = -EIO;
 	}
 	return ret;
@@ -3179,7 +3178,7 @@ static void nfs4_renew_done(struct rpc_task *task, void *calldata)
 	if (task->tk_status < 0) {
 		/* Unless we're shutting down, schedule state recovery! */
 		if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) != 0)
-			nfs4_schedule_state_recovery(clp);
+			nfs4_schedule_lease_recovery(clp);
 		return;
 	}
 	do_renew_lease(clp, timestamp);
@@ -3504,12 +3503,13 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 		case -NFS4ERR_OPENMODE:
 			if (state == NULL)
 				break;
-			nfs4_state_mark_reclaim_nograce(clp, state);
-			goto do_state_recovery;
+			nfs4_schedule_stateid_recovery(server, state);
+			goto wait_on_recovery;
 		case -NFS4ERR_STALE_STATEID:
 		case -NFS4ERR_STALE_CLIENTID:
 		case -NFS4ERR_EXPIRED:
-			goto do_state_recovery;
+			nfs4_schedule_lease_recovery(clp);
+			goto wait_on_recovery;
 #if defined(CONFIG_NFS_V4_1)
 		case -NFS4ERR_BADSESSION:
 		case -NFS4ERR_BADSLOT:
@@ -3520,7 +3520,7 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 		case -NFS4ERR_SEQ_MISORDERED:
 			dprintk("%s ERROR %d, Reset session\n", __func__,
 				task->tk_status);
-			nfs4_schedule_state_recovery(clp);
+			nfs4_schedule_session_recovery(clp->cl_session);
 			task->tk_status = 0;
 			return -EAGAIN;
 #endif /* CONFIG_NFS_V4_1 */
@@ -3537,9 +3537,8 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server,
 	}
 	task->tk_status = nfs4_map_errors(task->tk_status);
 	return 0;
-do_state_recovery:
+wait_on_recovery:
 	rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
-	nfs4_schedule_state_recovery(clp);
 	if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
 		rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task);
 	task->tk_status = 0;
@@ -4406,12 +4405,14 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
 			case -NFS4ERR_EXPIRED:
 			case -NFS4ERR_STALE_CLIENTID:
 			case -NFS4ERR_STALE_STATEID:
+				nfs4_schedule_lease_recovery(server->nfs_client);
+				goto out;
 			case -NFS4ERR_BADSESSION:
 			case -NFS4ERR_BADSLOT:
 			case -NFS4ERR_BAD_HIGH_SLOT:
 			case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
 			case -NFS4ERR_DEADSESSION:
-				nfs4_schedule_state_recovery(server->nfs_client);
+				nfs4_schedule_session_recovery(server->nfs_client->cl_session);
 				goto out;
 			case -ERESTARTSYS:
 				/*
@@ -4421,7 +4422,7 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl)
 			case -NFS4ERR_ADMIN_REVOKED:
 			case -NFS4ERR_BAD_STATEID:
 			case -NFS4ERR_OPENMODE:
-				nfs4_state_mark_reclaim_nograce(server->nfs_client, state);
+				nfs4_schedule_stateid_recovery(server, state);
 				err = 0;
 				goto out;
 			case -EKEYEXPIRED:
@@ -5150,7 +5151,7 @@ static int nfs41_sequence_handle_errors(struct rpc_task *task, struct nfs_client
 		rpc_delay(task, NFS4_POLL_RETRY_MAX);
 		return -EAGAIN;
 	default:
-		nfs4_schedule_state_recovery(clp);
+		nfs4_schedule_lease_recovery(clp);
 	}
 	return 0;
 }
@@ -5291,7 +5292,7 @@ static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nf
 		rpc_delay(task, NFS4_POLL_RETRY_MAX);
 		return -EAGAIN;
 	default:
-		nfs4_schedule_state_recovery(clp);
+		nfs4_schedule_lease_recovery(clp);
 	}
 	return 0;
 }
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index e6742b57a04c..47c8dcdada8a 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1007,9 +1007,9 @@ void nfs4_schedule_state_manager(struct nfs_client *clp)
 }
 
 /*
- * Schedule a state recovery attempt
+ * Schedule a lease recovery attempt
  */
-void nfs4_schedule_state_recovery(struct nfs_client *clp)
+void nfs4_schedule_lease_recovery(struct nfs_client *clp)
 {
 	if (!clp)
 		return;
@@ -1041,6 +1041,14 @@ int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *s
 	return 1;
 }
 
+void nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_state *state)
+{
+	struct nfs_client *clp = server->nfs_client;
+
+	nfs4_state_mark_reclaim_nograce(clp, state);
+	nfs4_schedule_state_manager(clp);
+}
+
 static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_recovery_ops *ops)
 {
 	struct inode *inode = state->inode;
@@ -1436,10 +1444,15 @@ static int nfs4_reclaim_lease(struct nfs_client *clp)
 }
 
 #ifdef CONFIG_NFS_V4_1
+void nfs4_schedule_session_recovery(struct nfs4_session *session)
+{
+	nfs4_schedule_lease_recovery(session->clp);
+}
+
 void nfs41_handle_recall_slot(struct nfs_client *clp)
 {
 	set_bit(NFS4CLNT_RECALL_SLOT, &clp->cl_state);
-	nfs4_schedule_state_recovery(clp);
+	nfs4_schedule_state_manager(clp);
 }
 
 static void nfs4_reset_all_state(struct nfs_client *clp)
@@ -1447,7 +1460,7 @@ static void nfs4_reset_all_state(struct nfs_client *clp)
 	if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {
 		clp->cl_boot_time = CURRENT_TIME;
 		nfs4_state_start_reclaim_nograce(clp);
-		nfs4_schedule_state_recovery(clp);
+		nfs4_schedule_state_manager(clp);
 	}
 }
 
@@ -1455,7 +1468,7 @@ static void nfs41_handle_server_reboot(struct nfs_client *clp)
 {
 	if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {
 		nfs4_state_start_reclaim_reboot(clp);
-		nfs4_schedule_state_recovery(clp);
+		nfs4_schedule_state_manager(clp);
 	}
 }
 
@@ -1475,7 +1488,7 @@ static void nfs41_handle_cb_path_down(struct nfs_client *clp)
 {
 	nfs_expire_all_delegations(clp);
 	if (test_and_set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) == 0)
-		nfs4_schedule_state_recovery(clp);
+		nfs4_schedule_state_manager(clp);
 }
 
 void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags)
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 6beb33e834ba..3e112de12d8d 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -68,9 +68,7 @@ struct nfs_client {
 	unsigned char		cl_id_uniquifier;
 	u32			cl_cb_ident;	/* v4.0 callback identifier */
 	const struct nfs4_minor_version_ops *cl_mvops;
-#endif /* CONFIG_NFS_V4 */
 
-#ifdef CONFIG_NFS_V4_1
 	/* The sequence id to use for the next CREATE_SESSION */
 	u32			cl_seqid;
 	/* The flags used for obtaining the clientid during EXCHANGE_ID */
@@ -78,7 +76,7 @@ struct nfs_client {
 	struct nfs4_session	*cl_session; 	/* sharred session */
 	struct list_head	cl_layouts;
 	struct pnfs_deviceid_cache *cl_devid_cache; /* pNFS deviceid cache */
-#endif /* CONFIG_NFS_V4_1 */
+#endif /* CONFIG_NFS_V4 */
 
 #ifdef CONFIG_NFS_FSCACHE
 	struct fscache_cookie	*fscache;	/* client index cache cookie */
@@ -183,7 +181,7 @@ struct nfs_server {
 /* maximum number of slots to use */
 #define NFS4_MAX_SLOT_TABLE RPC_MAX_SLOT_TABLE
 
-#if defined(CONFIG_NFS_V4_1)
+#if defined(CONFIG_NFS_V4)
 
 /* Sessions */
 #define SLOT_TABLE_SZ (NFS4_MAX_SLOT_TABLE/(8*sizeof(long)))
@@ -223,5 +221,5 @@ struct nfs4_session {
 	struct nfs_client		*clp;
 };
 
-#endif /* CONFIG_NFS_V4_1 */
+#endif /* CONFIG_NFS_V4 */
 #endif
-- 
cgit v1.2.3


From b4410c2f7f775b03da31566c05bb8d2383c7dc27 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 9 Mar 2011 16:00:55 -0500
Subject: NFSv4.1: Fix the handling of the SEQUENCE status bits

We want SEQUENCE status bits to be handled by the state manager in order
to avoid threading issues.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 411dc80d0656..b0b1a5568529 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -436,8 +436,8 @@ static int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *
 		clp = res->sr_session->clp;
 		do_renew_lease(clp, timestamp);
 		/* Check sequence flags */
-		if (atomic_read(&clp->cl_count) > 1)
-			nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags);
+		if (res->sr_status_flags != 0)
+			nfs4_schedule_lease_recovery(clp);
 		break;
 	case -NFS4ERR_DELAY:
 		/* The server detected a resend of the RPC call and
@@ -5254,8 +5254,13 @@ static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred)
 		goto out;
 	}
 	ret = rpc_wait_for_completion_task(task);
-	if (!ret)
+	if (!ret) {
+		struct nfs4_sequence_res *res = task->tk_msg.rpc_resp;
+
+		if (task->tk_status == 0)
+			nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags);
 		ret = task->tk_status;
+	}
 	rpc_put_task(task);
 out:
 	dprintk("<-- %s status=%d\n", __func__, ret);
-- 
cgit v1.2.3


From ecac799a5ecc364006f0db6f2db15e77ed4d63e2 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 9 Mar 2011 16:00:56 -0500
Subject: NFSv4: Fix the setlk error handler

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index b0b1a5568529..bf4e6d4b5bf5 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -4173,23 +4173,18 @@ static const struct rpc_call_ops nfs4_recover_lock_ops = {
 
 static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_state *lsp, int new_lock_owner, int error)
 {
-	struct nfs_client *clp = server->nfs_client;
-	struct nfs4_state *state = lsp->ls_state;
-
 	switch (error) {
 	case -NFS4ERR_ADMIN_REVOKED:
 	case -NFS4ERR_BAD_STATEID:
-	case -NFS4ERR_EXPIRED:
+		lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
 		if (new_lock_owner != 0 ||
 		   (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0)
-			nfs4_state_mark_reclaim_nograce(clp, state);
-		lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
+			nfs4_schedule_stateid_recovery(server, lsp->ls_state);
 		break;
 	case -NFS4ERR_STALE_STATEID:
-		if (new_lock_owner != 0 ||
-		    (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0)
-			nfs4_state_mark_reclaim_reboot(clp, state);
 		lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
+	case -NFS4ERR_EXPIRED:
+		nfs4_schedule_lease_recovery(server->nfs_client);
 	};
 }
 
-- 
cgit v1.2.3


From f9feab1e180d1392f2f59d692826c6da2e57adf4 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 9 Mar 2011 16:12:46 -0500
Subject: NFSv4: nfs4_state_mark_reclaim_nograce() should be static

There are no more external users of nfs4_state_mark_reclaim_nograce() or
nfs4_state_mark_reclaim_reboot(), so mark them as static.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4_fs.h   | 2 --
 fs/nfs/nfs4state.c | 4 ++--
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 54ff900cb8f0..1be36cf65bfc 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -315,8 +315,6 @@ extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t);
 extern void nfs4_schedule_lease_recovery(struct nfs_client *);
 extern void nfs4_schedule_state_manager(struct nfs_client *);
 extern void nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *);
-extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state);
-extern int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state);
 extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags);
 extern void nfs41_handle_recall_slot(struct nfs_client *clp);
 extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 47c8dcdada8a..0592288f9f06 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1018,7 +1018,7 @@ void nfs4_schedule_lease_recovery(struct nfs_client *clp)
 	nfs4_schedule_state_manager(clp);
 }
 
-int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state)
+static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state)
 {
 
 	set_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
@@ -1032,7 +1032,7 @@ int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *st
 	return 1;
 }
 
-int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state)
+static int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state)
 {
 	set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags);
 	clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
-- 
cgit v1.2.3


From 57df216bd8c8813a79a6a618e3d2ec937d532b86 Mon Sep 17 00:00:00 2001
From: Huang Weiyi <weiyi.huang@gmail.com>
Date: Tue, 8 Mar 2011 23:11:30 +0000
Subject: nfs4: remove duplicated #include

Remove duplicated #include('s) in
  fs/nfs/nfs4proc.c

Signed-off-by: Huang Weiyi <weiyi.huang@gmail.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index bf4e6d4b5bf5..0a07e353a961 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -51,7 +51,6 @@
 #include <linux/sunrpc/bc_xprt.h>
 #include <linux/xattr.h>
 #include <linux/utsname.h>
-#include <linux/mm.h>
 
 #include "nfs4_fs.h"
 #include "delegation.h"
-- 
cgit v1.2.3


From 53d4737580535e073963b91ce87d4216e434fab5 Mon Sep 17 00:00:00 2001
From: Chuck Lever <chuck.lever@oracle.com>
Date: Fri, 11 Mar 2011 15:31:06 -0500
Subject: NFS: NFSROOT should default to "proto=udp"

There have been a number of recent reports that NFSROOT is no longer
working with default mount options, but fails only with certain NICs.

Brian Downing <bdowning@lavos.net> bisected to commit 56463e50 "NFS:
Use super.c for NFSROOT mount option parsing".  Among other things,
this commit changes the default mount options for NFSROOT to use TCP
instead of UDP as the underlying transport.

TCP seems less able to deal with NICs that are slow to initialize.
The system logs that have accompanied reports of problems all show
that NFSROOT attempts to establish a TCP connection before the NIC is
fully initialized, and thus the TCP connection attempt fails.

When a TCP connection attempt fails during a mount operation, the
NFS stack needs to fail the operation.  Usually user space knows how
and when to retry it.  The network layer does not report a distinct
error code for this particular failure mode.  Thus, there isn't a
clean way for the RPC client to see that it needs to retry in this
case, but not in others.

Because NFSROOT is used in some environments where it is not possible
to update the kernel command line to specify "udp", the proper thing
to do is change NFSROOT to use UDP by default, as it did before commit
56463e50.

To make it easier to see how to change default mount options for
NFSROOT and to distinguish default settings from mandatory settings,
I've adjusted a couple of areas to document the specifics.

root_nfs_cat() is also modified to deal with commas properly when
concatenating strings containing mount option lists.  This keeps
root_nfs_cat() call sites simpler, now that we may be concatenating
multiple mount option strings.

Tested-by: Brian Downing <bdowning@lavos.net>
Tested-by: Mark Brown <broonie@opensource.wolfsonmicro.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Cc: <stable@kernel.org> # 2.6.37
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfsroot.c | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index 903908a20023..c541093a5bf2 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -86,11 +86,14 @@
 /* Default path we try to mount. "%s" gets replaced by our IP address */
 #define NFS_ROOT		"/tftpboot/%s"
 
+/* Default NFSROOT mount options. */
+#define NFS_DEF_OPTIONS		"udp"
+
 /* Parameters passed from the kernel command line */
 static char nfs_root_parms[256] __initdata = "";
 
 /* Text-based mount options passed to super.c */
-static char nfs_root_options[256] __initdata = "";
+static char nfs_root_options[256] __initdata = NFS_DEF_OPTIONS;
 
 /* Address of NFS server */
 static __be32 servaddr __initdata = htonl(INADDR_NONE);
@@ -160,8 +163,14 @@ static int __init root_nfs_copy(char *dest, const char *src,
 }
 
 static int __init root_nfs_cat(char *dest, const char *src,
-				  const size_t destlen)
+			       const size_t destlen)
 {
+	size_t len = strlen(dest);
+
+	if (len && dest[len - 1] != ',')
+		if (strlcat(dest, ",", destlen) > destlen)
+			return -1;
+
 	if (strlcat(dest, src, destlen) > destlen)
 		return -1;
 	return 0;
@@ -194,16 +203,6 @@ static int __init root_nfs_parse_options(char *incoming, char *exppath,
 		if (root_nfs_cat(nfs_root_options, incoming,
 						sizeof(nfs_root_options)))
 			return -1;
-
-	/*
-	 * Possibly prepare for more options to be appended
-	 */
-	if (nfs_root_options[0] != '\0' &&
-	    nfs_root_options[strlen(nfs_root_options)] != ',')
-		if (root_nfs_cat(nfs_root_options, ",",
-						sizeof(nfs_root_options)))
-			return -1;
-
 	return 0;
 }
 
@@ -217,7 +216,7 @@ static int __init root_nfs_parse_options(char *incoming, char *exppath,
  */
 static int __init root_nfs_data(char *cmdline)
 {
-	char addr_option[sizeof("nolock,addr=") + INET_ADDRSTRLEN + 1];
+	char mand_options[sizeof("nolock,addr=") + INET_ADDRSTRLEN + 1];
 	int len, retval = -1;
 	char *tmp = NULL;
 	const size_t tmplen = sizeof(nfs_export_path);
@@ -244,9 +243,9 @@ static int __init root_nfs_data(char *cmdline)
 	 * Append mandatory options for nfsroot so they override
 	 * what has come before
 	 */
-	snprintf(addr_option, sizeof(addr_option), "nolock,addr=%pI4",
+	snprintf(mand_options, sizeof(mand_options), "nolock,addr=%pI4",
 			&servaddr);
-	if (root_nfs_cat(nfs_root_options, addr_option,
+	if (root_nfs_cat(nfs_root_options, mand_options,
 						sizeof(nfs_root_options)))
 		goto out_optionstoolong;
 
-- 
cgit v1.2.3


From 36e39c40b3facc9b489a13f1d301fc53ff6960a3 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Sat, 12 Mar 2011 07:08:42 -0500
Subject: Btrfs: break out of shrink_delalloc earlier

Josef had changed shrink_delalloc to exit after three shrink
attempts, which wasn't quite enough because new writers could
race in and steal free space.

But it also fixed deadlocks and stalls as we tried to recover
delalloc reservations.  The code was tweaked to loop 1024
times, and would reset the counter any time a small amount
of progress was made.  This was too drastic, and with a
lot of writers we can end up stuck in shrink_delalloc forever.

The shrink_delalloc loop is fairly complex because the caller is looping
too, and the caller will go ahead and force a transaction commit to make
sure we reclaim space.

This reworks things to exit shrink_delalloc when we've forced some
writeback and the delalloc reservations have gone down.  This means
the writeback has not just started but has also finished at
least some of the metadata changes required to reclaim delalloc
space.

If we've got this wrong, we're returning ENOSPC too early, which
is a big improvement over the current behavior of hanging the machine.

Test 224 in xfstests hammers on this nicely, and with 1000 writers
trying to fill a 1GB drive we get our first ENOSPC at 93% full.  The
other writers are able to continue until we get 100%.

This is a worst case test for btrfs because the 1000 writers are doing
small IO, and the small FS size means we don't have a lot of room
for metadata chunks.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h       |  9 +++++++++
 fs/btrfs/extent-tree.c | 35 +++++++++++++++++++++++------------
 2 files changed, 32 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 28188a786da0..8b4b9d158a0a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -729,6 +729,15 @@ struct btrfs_space_info {
 	u64 disk_total;		/* total bytes on disk, takes mirrors into
 				   account */
 
+	/*
+	 * we bump reservation progress every time we decrement
+	 * bytes_reserved.  This way people waiting for reservations
+	 * know something good has happened and they can check
+	 * for progress.  The number here isn't to be trusted, it
+	 * just shows reclaim activity
+	 */
+	unsigned long reservation_progress;
+
 	int full;		/* indicates that we cannot allocate any more
 				   chunks for this space */
 	int force_alloc;	/* set if we need to force a chunk alloc for
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 100e409e9053..f1db57d4a016 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3343,15 +3343,16 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 	u64 max_reclaim;
 	u64 reclaimed = 0;
 	long time_left;
-	int pause = 1;
 	int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
 	int loops = 0;
+	unsigned long progress;
 
 	block_rsv = &root->fs_info->delalloc_block_rsv;
 	space_info = block_rsv->space_info;
 
 	smp_mb();
 	reserved = space_info->bytes_reserved;
+	progress = space_info->reservation_progress;
 
 	if (reserved == 0)
 		return 0;
@@ -3366,31 +3367,36 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 		writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages);
 
 		spin_lock(&space_info->lock);
-		if (reserved > space_info->bytes_reserved) {
-			loops = 0;
+		if (reserved > space_info->bytes_reserved)
 			reclaimed += reserved - space_info->bytes_reserved;
-		} else {
-			loops++;
-		}
 		reserved = space_info->bytes_reserved;
 		spin_unlock(&space_info->lock);
 
+		loops++;
+
 		if (reserved == 0 || reclaimed >= max_reclaim)
 			break;
 
 		if (trans && trans->transaction->blocked)
 			return -EAGAIN;
 
-		__set_current_state(TASK_INTERRUPTIBLE);
-		time_left = schedule_timeout(pause);
+		time_left = schedule_timeout_interruptible(1);
 
 		/* We were interrupted, exit */
 		if (time_left)
 			break;
 
-		pause <<= 1;
-		if (pause > HZ / 10)
-			pause = HZ / 10;
+		/* we've kicked the IO a few times, if anything has been freed,
+		 * exit.  There is no sense in looping here for a long time
+		 * when we really need to commit the transaction, or there are
+		 * just too many writers without enough free space
+		 */
+
+		if (loops > 3) {
+			smp_mb();
+			if (progress != space_info->reservation_progress)
+				break;
+		}
 
 	}
 	return reclaimed >= to_reclaim;
@@ -3613,6 +3619,7 @@ void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
 		if (num_bytes) {
 			spin_lock(&space_info->lock);
 			space_info->bytes_reserved -= num_bytes;
+			space_info->reservation_progress++;
 			spin_unlock(&space_info->lock);
 		}
 	}
@@ -3845,6 +3852,7 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
 	if (block_rsv->reserved >= block_rsv->size) {
 		num_bytes = block_rsv->reserved - block_rsv->size;
 		sinfo->bytes_reserved -= num_bytes;
+		sinfo->reservation_progress++;
 		block_rsv->reserved = block_rsv->size;
 		block_rsv->full = 1;
 	}
@@ -4006,7 +4014,6 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 		to_reserve = 0;
 	}
 	spin_unlock(&BTRFS_I(inode)->accounting_lock);
-
 	to_reserve += calc_csum_metadata_size(inode, num_bytes);
 	ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
 	if (ret)
@@ -4134,6 +4141,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 			btrfs_set_block_group_used(&cache->item, old_val);
 			cache->reserved -= num_bytes;
 			cache->space_info->bytes_reserved -= num_bytes;
+			cache->space_info->reservation_progress++;
 			cache->space_info->bytes_used += num_bytes;
 			cache->space_info->disk_used += num_bytes * factor;
 			spin_unlock(&cache->lock);
@@ -4185,6 +4193,7 @@ static int pin_down_extent(struct btrfs_root *root,
 	if (reserved) {
 		cache->reserved -= num_bytes;
 		cache->space_info->bytes_reserved -= num_bytes;
+		cache->space_info->reservation_progress++;
 	}
 	spin_unlock(&cache->lock);
 	spin_unlock(&cache->space_info->lock);
@@ -4235,6 +4244,7 @@ static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
 				space_info->bytes_readonly += num_bytes;
 			cache->reserved -= num_bytes;
 			space_info->bytes_reserved -= num_bytes;
+			space_info->reservation_progress++;
 		}
 		spin_unlock(&cache->lock);
 		spin_unlock(&space_info->lock);
@@ -4713,6 +4723,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
 		if (ret) {
 			spin_lock(&cache->space_info->lock);
 			cache->space_info->bytes_reserved -= buf->len;
+			cache->space_info->reservation_progress++;
 			spin_unlock(&cache->space_info->lock);
 		}
 		goto out;
-- 
cgit v1.2.3


From 586ce098a23b6ab7383df853a84ae3d48dc889aa Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 13 Mar 2011 01:50:58 -0500
Subject: compat breakage in preadv() and pwritev()

Fix for a dumb preadv()/pwritev() compat bug - unlike the native
variants, compat_... ones forget to check FMODE_P{READ,WRITE}, so e.g.
on pipe the native preadv() will fail with -ESPIPE and compat one will
act as readv() and succeed.  Not critical, but it's a clear bug with trivial
fix.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/compat.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/compat.c b/fs/compat.c
index f6fd0a00e6cc..691c3fd8ce1d 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1228,7 +1228,9 @@ compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec,
 	file = fget_light(fd, &fput_needed);
 	if (!file)
 		return -EBADF;
-	ret = compat_readv(file, vec, vlen, &pos);
+	ret = -ESPIPE;
+	if (file->f_mode & FMODE_PREAD)
+		ret = compat_readv(file, vec, vlen, &pos);
 	fput_light(file, fput_needed);
 	return ret;
 }
@@ -1285,7 +1287,9 @@ compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec,
 	file = fget_light(fd, &fput_needed);
 	if (!file)
 		return -EBADF;
-	ret = compat_writev(file, vec, vlen, &pos);
+	ret = -ESPIPE;
+	if (file->f_mode & FMODE_PWRITE)
+		ret = compat_writev(file, vec, vlen, &pos);
 	fput_light(file, fput_needed);
 	return ret;
 }
-- 
cgit v1.2.3


From c44ed965be7a84afaa07543c04eb97a5dfe93422 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Sun, 13 Mar 2011 23:24:46 +0000
Subject: compat breakage in preadv() and pwritev()

Fix for a dumb preadv()/pwritev() compat bug - unlike the native
variants, the compat_...  ones forget to check FMODE_P{READ,WRITE}, so
e.g.  on pipe the native preadv() will fail with -ESPIPE and compat one
will act as readv() and succeed.

Not critical, but it's a clear bug with trivial fix, so IMO it's OK for
-final.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/compat.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/compat.c b/fs/compat.c
index f6fd0a00e6cc..691c3fd8ce1d 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1228,7 +1228,9 @@ compat_sys_preadv(unsigned long fd, const struct compat_iovec __user *vec,
 	file = fget_light(fd, &fput_needed);
 	if (!file)
 		return -EBADF;
-	ret = compat_readv(file, vec, vlen, &pos);
+	ret = -ESPIPE;
+	if (file->f_mode & FMODE_PREAD)
+		ret = compat_readv(file, vec, vlen, &pos);
 	fput_light(file, fput_needed);
 	return ret;
 }
@@ -1285,7 +1287,9 @@ compat_sys_pwritev(unsigned long fd, const struct compat_iovec __user *vec,
 	file = fget_light(fd, &fput_needed);
 	if (!file)
 		return -EBADF;
-	ret = compat_writev(file, vec, vlen, &pos);
+	ret = -ESPIPE;
+	if (file->f_mode & FMODE_PWRITE)
+		ret = compat_writev(file, vec, vlen, &pos);
 	fput_light(file, fput_needed);
 	return ret;
 }
-- 
cgit v1.2.3


From c9c6cac0c2bdbda42e7b804838648d0bc60ddb13 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 16 Feb 2011 15:15:47 -0500
Subject: kill path_lookup()

all remaining callers pass LOOKUP_PARENT to it, so
flags argument can die; renamed to kern_path_parent()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/powerpc/platforms/cell/spufs/syscalls.c | 2 +-
 fs/namei.c                                   | 7 +++----
 fs/ocfs2/refcounttree.c                      | 2 +-
 include/linux/namei.h                        | 2 +-
 kernel/audit_watch.c                         | 2 +-
 net/unix/af_unix.c                           | 2 +-
 6 files changed, 8 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/arch/powerpc/platforms/cell/spufs/syscalls.c b/arch/powerpc/platforms/cell/spufs/syscalls.c
index 187a7d32f86a..a3d2ce54ea2e 100644
--- a/arch/powerpc/platforms/cell/spufs/syscalls.c
+++ b/arch/powerpc/platforms/cell/spufs/syscalls.c
@@ -70,7 +70,7 @@ static long do_spu_create(const char __user *pathname, unsigned int flags,
 	if (!IS_ERR(tmp)) {
 		struct nameidata nd;
 
-		ret = path_lookup(tmp, LOOKUP_PARENT, &nd);
+		ret = kern_path_parent(tmp, &nd);
 		if (!ret) {
 			nd.flags |= LOOKUP_OPEN | LOOKUP_CREATE;
 			ret = spufs_create(&nd, flags, mode, neighbor);
diff --git a/fs/namei.c b/fs/namei.c
index a4689eb2df28..1d6bc8151553 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1747,10 +1747,9 @@ static int do_path_lookup(int dfd, const char *name,
 	return retval;
 }
 
-int path_lookup(const char *name, unsigned int flags,
-			struct nameidata *nd)
+int kern_path_parent(const char *name, struct nameidata *nd)
 {
-	return do_path_lookup(AT_FDCWD, name, flags, nd);
+	return do_path_lookup(AT_FDCWD, name, LOOKUP_PARENT, nd);
 }
 
 int kern_path(const char *name, unsigned int flags, struct path *path)
@@ -3586,7 +3585,7 @@ EXPORT_SYMBOL(page_readlink);
 EXPORT_SYMBOL(__page_symlink);
 EXPORT_SYMBOL(page_symlink);
 EXPORT_SYMBOL(page_symlink_inode_operations);
-EXPORT_SYMBOL(path_lookup);
+EXPORT_SYMBOL(kern_path_parent);
 EXPORT_SYMBOL(kern_path);
 EXPORT_SYMBOL(vfs_path_lookup);
 EXPORT_SYMBOL(inode_permission);
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 19ebc5aad391..29623da133cc 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4379,7 +4379,7 @@ static int ocfs2_user_path_parent(const char __user *path,
 	if (IS_ERR(s))
 		return PTR_ERR(s);
 
-	error = path_lookup(s, LOOKUP_PARENT, nd);
+	error = kern_path_parent(s, nd);
 	if (error)
 		putname(s);
 	else
diff --git a/include/linux/namei.h b/include/linux/namei.h
index f276d4fa01fc..58ce3433d4ec 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -72,7 +72,7 @@ extern int user_path_at(int, const char __user *, unsigned, struct path *);
 
 extern int kern_path(const char *, unsigned, struct path *);
 
-extern int path_lookup(const char *, unsigned, struct nameidata *);
+extern int kern_path_parent(const char *, struct nameidata *);
 extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
 			   const char *, unsigned int, struct nameidata *);
 
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 20b9fe6907d0..e683869365d9 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -359,7 +359,7 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent)
 	struct dentry *d;
 	int err;
 
-	err = path_lookup(watch->path, LOOKUP_PARENT, &nd);
+	err = kern_path_parent(watch->path, &nd);
 	if (err)
 		return err;
 
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index dd419d286204..d8c04a602cf1 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -850,7 +850,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 		 * Get the parent directory, calculate the hash for last
 		 * component.
 		 */
-		err = path_lookup(sunaddr->sun_path, LOOKUP_PARENT, &nd);
+		err = kern_path_parent(sunaddr->sun_path, &nd);
 		if (err)
 			goto out_mknod_parent;
 
-- 
cgit v1.2.3


From 52094c8a0610cf57920ad4c6c57470ae2ccbbd25 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 21 Feb 2011 21:34:47 -0500
Subject: take RCU-dependent stuff around exec_permission() into a new helper

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 1d6bc8151553..8c704465f6ce 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1322,6 +1322,18 @@ fail:
 	return PTR_ERR(dentry);
 }
 
+static inline int may_lookup(struct nameidata *nd)
+{
+	if (nd->flags & LOOKUP_RCU) {
+		int err = exec_permission(nd->inode, IPERM_FLAG_RCU);
+		if (err != -ECHILD)
+			return err;
+		if (nameidata_drop_rcu(nd))
+			return -ECHILD;
+	}
+	return exec_permission(nd->inode, 0);
+}
+
 /*
  * Name resolution.
  * This is the basic name resolution function, turning a pathname into
@@ -1352,17 +1364,8 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		unsigned int c;
 
 		nd->flags |= LOOKUP_CONTINUE;
-		if (nd->flags & LOOKUP_RCU) {
-			err = exec_permission(nd->inode, IPERM_FLAG_RCU);
-			if (err == -ECHILD) {
-				if (nameidata_drop_rcu(nd))
-					return -ECHILD;
-				goto exec_again;
-			}
-		} else {
-exec_again:
-			err = exec_permission(nd->inode, 0);
-		}
+
+		err = may_lookup(nd);
  		if (err)
 			break;
 
-- 
cgit v1.2.3


From ee0827cd6b42b0385dc1a116cd853ac1b739f711 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 21 Feb 2011 23:38:09 -0500
Subject: sanitize path_walk() mess

New helper: path_lookupat().  Basically, what do_path_lookup() boils to
modulo -ECHILD/-ESTALE handler.  path_walk* family is gone; vfs_path_lookup()
is using link_path_walk() directly, do_path_lookup() and do_filp_open()
are using path_lookupat().

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 148 +++++++++++++++++++++++--------------------------------------
 1 file changed, 56 insertions(+), 92 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 8c704465f6ce..f5de5bb1a61f 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1520,59 +1520,6 @@ return_err:
 	return err;
 }
 
-static inline int path_walk_rcu(const char *name, struct nameidata *nd)
-{
-	current->total_link_count = 0;
-
-	return link_path_walk(name, nd);
-}
-
-static inline int path_walk_simple(const char *name, struct nameidata *nd)
-{
-	current->total_link_count = 0;
-
-	return link_path_walk(name, nd);
-}
-
-static int path_walk(const char *name, struct nameidata *nd)
-{
-	struct path save = nd->path;
-	int result;
-
-	current->total_link_count = 0;
-
-	/* make sure the stuff we saved doesn't go away */
-	path_get(&save);
-
-	result = link_path_walk(name, nd);
-	if (result == -ESTALE) {
-		/* nd->path had been dropped */
-		current->total_link_count = 0;
-		nd->path = save;
-		nd->inode = save.dentry->d_inode;
-		path_get(&nd->path);
-		nd->flags |= LOOKUP_REVAL;
-		result = link_path_walk(name, nd);
-	}
-
-	path_put(&save);
-
-	return result;
-}
-
-static void path_finish_rcu(struct nameidata *nd)
-{
-	if (nd->flags & LOOKUP_RCU) {
-		/* RCU dangling. Cancel it. */
-		nd->flags &= ~LOOKUP_RCU;
-		nd->root.mnt = NULL;
-		rcu_read_unlock();
-		br_read_unlock(vfsmount_lock);
-	}
-	if (nd->file)
-		fput(nd->file);
-}
-
 static int path_init_rcu(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
 {
 	int retval = 0;
@@ -1697,7 +1644,7 @@ out_fail:
 }
 
 /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
-static int do_path_lookup(int dfd, const char *name,
+static int path_lookupat(int dfd, const char *name,
 				unsigned int flags, struct nameidata *nd)
 {
 	int retval;
@@ -1716,29 +1663,45 @@ static int do_path_lookup(int dfd, const char *name,
 	 * be handled by restarting a traditional ref-walk (which will always
 	 * be able to complete).
 	 */
-	retval = path_init_rcu(dfd, name, flags, nd);
+	if (flags & LOOKUP_RCU)
+		retval = path_init_rcu(dfd, name, flags, nd);
+	else
+		retval = path_init(dfd, name, flags, nd);
+
 	if (unlikely(retval))
 		return retval;
-	retval = path_walk_rcu(name, nd);
-	path_finish_rcu(nd);
+
+	current->total_link_count = 0;
+	retval = link_path_walk(name, nd);
+
+	if (nd->flags & LOOKUP_RCU) {
+		/* RCU dangling. Cancel it. */
+		nd->flags &= ~LOOKUP_RCU;
+		nd->root.mnt = NULL;
+		rcu_read_unlock();
+		br_read_unlock(vfsmount_lock);
+	}
+
+	if (nd->file) {
+		fput(nd->file);
+		nd->file = NULL;
+	}
+
 	if (nd->root.mnt) {
 		path_put(&nd->root);
 		nd->root.mnt = NULL;
 	}
+	return retval;
+}
 
-	if (unlikely(retval == -ECHILD || retval == -ESTALE)) {
-		/* slower, locked walk */
-		if (retval == -ESTALE)
-			flags |= LOOKUP_REVAL;
-		retval = path_init(dfd, name, flags, nd);
-		if (unlikely(retval))
-			return retval;
-		retval = path_walk(name, nd);
-		if (nd->root.mnt) {
-			path_put(&nd->root);
-			nd->root.mnt = NULL;
-		}
-	}
+static int do_path_lookup(int dfd, const char *name,
+				unsigned int flags, struct nameidata *nd)
+{
+	int retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd);
+	if (unlikely(retval == -ECHILD))
+		retval = path_lookupat(dfd, name, flags, nd);
+	if (unlikely(retval == -ESTALE))
+		retval = path_lookupat(dfd, name, flags | LOOKUP_REVAL, nd);
 
 	if (likely(!retval)) {
 		if (unlikely(!audit_dummy_context())) {
@@ -1746,7 +1709,6 @@ static int do_path_lookup(int dfd, const char *name,
 				audit_inode(name, nd->path.dentry);
 		}
 	}
-
 	return retval;
 }
 
@@ -1776,7 +1738,7 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
 		    const char *name, unsigned int flags,
 		    struct nameidata *nd)
 {
-	int retval;
+	int result;
 
 	/* same as do_path_lookup */
 	nd->last_type = LAST_ROOT;
@@ -1790,15 +1752,27 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
 	path_get(&nd->root);
 	nd->inode = nd->path.dentry->d_inode;
 
-	retval = path_walk(name, nd);
-	if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
+	current->total_link_count = 0;
+
+	result = link_path_walk(name, nd);
+	if (result == -ESTALE) {
+		/* nd->path had been dropped */
+		current->total_link_count = 0;
+		nd->path.dentry = dentry;
+		nd->path.mnt = mnt;
+		nd->inode = dentry->d_inode;
+		path_get(&nd->path);
+		nd->flags |= LOOKUP_REVAL;
+		result = link_path_walk(name, nd);
+	}
+	if (unlikely(!result && !audit_dummy_context() && nd->path.dentry &&
 				nd->inode))
 		audit_inode(name, nd->path.dentry);
 
 	path_put(&nd->root);
 	nd->root.mnt = NULL;
 
-	return retval;
+	return result;
 }
 
 static struct dentry *__lookup_hash(struct qstr *name,
@@ -2483,24 +2457,14 @@ out_filp2:
 
 creat:
 	/* OK, have to create the file. Find the parent. */
-	error = path_init_rcu(dfd, pathname,
-			LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd);
-	if (error)
-		goto out_filp;
-	error = path_walk_rcu(pathname, &nd);
-	path_finish_rcu(&nd);
-	if (unlikely(error == -ECHILD || error == -ESTALE)) {
-		/* slower, locked walk */
-		if (error == -ESTALE) {
+	error = path_lookupat(dfd, pathname, LOOKUP_PARENT | LOOKUP_RCU, &nd);
+	if (unlikely(error == -ECHILD))
+		error = path_lookupat(dfd, pathname, LOOKUP_PARENT, &nd);
+	if (unlikely(error == -ESTALE)) {
 reval:
-			flags |= LOOKUP_REVAL;
-		}
-		error = path_init(dfd, pathname,
-				LOOKUP_PARENT | (flags & LOOKUP_REVAL), &nd);
-		if (error)
-			goto out_filp;
-
-		error = path_walk_simple(pathname, &nd);
+		flags |= LOOKUP_REVAL;
+		error = path_lookupat(dfd, pathname,
+				LOOKUP_PARENT | LOOKUP_REVAL, &nd);
 	}
 	if (unlikely(error))
 		goto out_filp;
-- 
cgit v1.2.3


From e41f7d4ee5bdb00da7d327a00b0ab9c4a2e9eaa3 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 22 Feb 2011 14:02:58 -0500
Subject: merge path_init and path_init_rcu

Actual dependency on whether we want RCU or not is in 3 small areas
(as it ought to be) and everything around those is the same in both
versions.  Since each function has only one caller and those callers
are on two sides of if (flags & LOOKUP_RCU), it's easier and cleaner
to merge them and pull the checks inside.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 118 ++++++++++++++++++-------------------------------------------
 1 file changed, 35 insertions(+), 83 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index f5de5bb1a61f..b9e537980ef5 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1520,45 +1520,44 @@ return_err:
 	return err;
 }
 
-static int path_init_rcu(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
+static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
 {
 	int retval = 0;
 	int fput_needed;
 	struct file *file;
 
 	nd->last_type = LAST_ROOT; /* if there are only slashes... */
-	nd->flags = flags | LOOKUP_RCU;
+	nd->flags = flags;
 	nd->depth = 0;
 	nd->root.mnt = NULL;
 	nd->file = NULL;
 
 	if (*name=='/') {
-		struct fs_struct *fs = current->fs;
-		unsigned seq;
-
-		br_read_lock(vfsmount_lock);
-		rcu_read_lock();
-
-		do {
-			seq = read_seqcount_begin(&fs->seq);
-			nd->root = fs->root;
-			nd->path = nd->root;
-			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
-		} while (read_seqcount_retry(&fs->seq, seq));
-
+		if (flags & LOOKUP_RCU) {
+			br_read_lock(vfsmount_lock);
+			rcu_read_lock();
+			set_root_rcu(nd);
+		} else {
+			set_root(nd);
+			path_get(&nd->root);
+		}
+		nd->path = nd->root;
 	} else if (dfd == AT_FDCWD) {
-		struct fs_struct *fs = current->fs;
-		unsigned seq;
-
-		br_read_lock(vfsmount_lock);
-		rcu_read_lock();
+		if (flags & LOOKUP_RCU) {
+			struct fs_struct *fs = current->fs;
+			unsigned seq;
 
-		do {
-			seq = read_seqcount_begin(&fs->seq);
-			nd->path = fs->pwd;
-			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
-		} while (read_seqcount_retry(&fs->seq, seq));
+			br_read_lock(vfsmount_lock);
+			rcu_read_lock();
 
+			do {
+				seq = read_seqcount_begin(&fs->seq);
+				nd->path = fs->pwd;
+				nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
+			} while (read_seqcount_retry(&fs->seq, seq));
+		} else {
+			get_fs_pwd(current->fs, &nd->path);
+		}
 	} else {
 		struct dentry *dentry;
 
@@ -1578,62 +1577,18 @@ static int path_init_rcu(int dfd, const char *name, unsigned int flags, struct n
 			goto fput_fail;
 
 		nd->path = file->f_path;
-		if (fput_needed)
-			nd->file = file;
-
-		nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
-		br_read_lock(vfsmount_lock);
-		rcu_read_lock();
+		if (flags & LOOKUP_RCU) {
+			if (fput_needed)
+				nd->file = file;
+			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
+			br_read_lock(vfsmount_lock);
+			rcu_read_lock();
+		} else {
+			path_get(&file->f_path);
+			fput_light(file, fput_needed);
+		}
 	}
-	nd->inode = nd->path.dentry->d_inode;
-	return 0;
 
-fput_fail:
-	fput_light(file, fput_needed);
-out_fail:
-	return retval;
-}
-
-static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
-{
-	int retval = 0;
-	int fput_needed;
-	struct file *file;
-
-	nd->last_type = LAST_ROOT; /* if there are only slashes... */
-	nd->flags = flags;
-	nd->depth = 0;
-	nd->root.mnt = NULL;
-
-	if (*name=='/') {
-		set_root(nd);
-		nd->path = nd->root;
-		path_get(&nd->root);
-	} else if (dfd == AT_FDCWD) {
-		get_fs_pwd(current->fs, &nd->path);
-	} else {
-		struct dentry *dentry;
-
-		file = fget_light(dfd, &fput_needed);
-		retval = -EBADF;
-		if (!file)
-			goto out_fail;
-
-		dentry = file->f_path.dentry;
-
-		retval = -ENOTDIR;
-		if (!S_ISDIR(dentry->d_inode->i_mode))
-			goto fput_fail;
-
-		retval = file_permission(file, MAY_EXEC);
-		if (retval)
-			goto fput_fail;
-
-		nd->path = file->f_path;
-		path_get(&file->f_path);
-
-		fput_light(file, fput_needed);
-	}
 	nd->inode = nd->path.dentry->d_inode;
 	return 0;
 
@@ -1663,10 +1618,7 @@ static int path_lookupat(int dfd, const char *name,
 	 * be handled by restarting a traditional ref-walk (which will always
 	 * be able to complete).
 	 */
-	if (flags & LOOKUP_RCU)
-		retval = path_init_rcu(dfd, name, flags, nd);
-	else
-		retval = path_init(dfd, name, flags, nd);
+	retval = path_init(dfd, name, flags, nd);
 
 	if (unlikely(retval))
 		return retval;
-- 
cgit v1.2.3


From fe479a580dc9c737c4eb49ff7fdb31d41d2c7003 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 22 Feb 2011 15:10:03 -0500
Subject: merge component type recognition

no need to do it in three places...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 48 ++++++++++++++++++++++--------------------------
 1 file changed, 22 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index b9e537980ef5..4521b5ff7c93 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1362,6 +1362,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		unsigned long hash;
 		struct qstr this;
 		unsigned int c;
+		int type;
 
 		nd->flags |= LOOKUP_CONTINUE;
 
@@ -1381,6 +1382,16 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		this.len = name - (const char *) this.name;
 		this.hash = end_name_hash(hash);
 
+		type = LAST_NORM;
+		if (this.name[0] == '.') switch (this.len) {
+			case 2:
+				if (this.name[1] == '.')
+					type = LAST_DOTDOT;
+				break;
+			case 1:
+				type = LAST_DOT;
+		}
+
 		/* remove trailing slashes? */
 		if (!c)
 			goto last_component;
@@ -1393,21 +1404,17 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		 * to be able to know about the current root directory and
 		 * parent relationships.
 		 */
-		if (this.name[0] == '.') switch (this.len) {
-			default:
-				break;
-			case 2:
-				if (this.name[1] != '.')
-					break;
+		if (unlikely(type != LAST_NORM)) {
+			if (type == LAST_DOTDOT) {
 				if (nd->flags & LOOKUP_RCU) {
 					if (follow_dotdot_rcu(nd))
 						return -ECHILD;
 				} else
 					follow_dotdot(nd);
-				/* fallthrough */
-			case 1:
-				continue;
+			}
+			continue;
 		}
+
 		/* This does the actual lookups.. */
 		err = do_lookup(nd, &this, &next, &inode);
 		if (err)
@@ -1441,20 +1448,15 @@ last_component:
 		nd->flags &= lookup_flags | ~LOOKUP_CONTINUE;
 		if (lookup_flags & LOOKUP_PARENT)
 			goto lookup_parent;
-		if (this.name[0] == '.') switch (this.len) {
-			default:
-				break;
-			case 2:
-				if (this.name[1] != '.')
-					break;
+		if (unlikely(type != LAST_NORM)) {
+			if (type == LAST_DOTDOT) {
 				if (nd->flags & LOOKUP_RCU) {
 					if (follow_dotdot_rcu(nd))
 						return -ECHILD;
 				} else
 					follow_dotdot(nd);
-				/* fallthrough */
-			case 1:
-				goto return_reval;
+			}
+			goto return_reval;
 		}
 		err = do_lookup(nd, &this, &next, &inode);
 		if (err)
@@ -1480,14 +1482,8 @@ last_component:
 		goto return_base;
 lookup_parent:
 		nd->last = this;
-		nd->last_type = LAST_NORM;
-		if (this.name[0] != '.')
-			goto return_base;
-		if (this.len == 1)
-			nd->last_type = LAST_DOT;
-		else if (this.len == 2 && this.name[1] == '.')
-			nd->last_type = LAST_DOTDOT;
-		else
+		nd->last_type = type;
+		if (type == LAST_NORM)
 			goto return_base;
 return_reval:
 		/*
-- 
cgit v1.2.3


From 16c2cd7179881d5dd87779512ca5a0d657c64f62 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 22 Feb 2011 15:50:10 -0500
Subject: untangle the "need_reval_dot" mess

instead of ad-hackery around need_reval_dot(), do the following:
set a flag (LOOKUP_JUMPED) in the beginning of path, on absolute
symlink traversal, on ".." and on procfs-style symlinks.  Clear on
normal components, leave unchanged on ".".  Non-nested callers of
link_path_walk() call handle_reval_path(), which checks that flag
is set and that fs does want the final revalidate thing, then does
->d_revalidate().  In link_path_walk() all the return_reval stuff
is gone.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c            | 107 +++++++++++++++++++++-----------------------------
 include/linux/namei.h |   2 +
 2 files changed, 46 insertions(+), 63 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 4521b5ff7c93..450b686e9682 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -613,19 +613,8 @@ do_revalidate_rcu(struct dentry *dentry, struct nameidata *nd)
 	return dentry;
 }
 
-static inline int need_reval_dot(struct dentry *dentry)
-{
-	if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE)))
-		return 0;
-
-	if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)))
-		return 0;
-
-	return 1;
-}
-
 /*
- * force_reval_path - force revalidation of a dentry
+ * handle_reval_path - force revalidation of a dentry
  *
  * In some situations the path walking code will trust dentries without
  * revalidating them. This causes problems for filesystems that depend on
@@ -639,27 +628,28 @@ static inline int need_reval_dot(struct dentry *dentry)
  * invalidate the dentry. It's up to the caller to handle putting references
  * to the path if necessary.
  */
-static int
-force_reval_path(struct path *path, struct nameidata *nd)
+static inline int handle_reval_path(struct nameidata *nd)
 {
+	struct dentry *dentry = nd->path.dentry;
 	int status;
-	struct dentry *dentry = path->dentry;
 
-	/*
-	 * only check on filesystems where it's possible for the dentry to
-	 * become stale.
-	 */
-	if (!need_reval_dot(dentry))
+	if (likely(!(nd->flags & LOOKUP_JUMPED)))
+		return 0;
+
+	if (likely(!(dentry->d_flags & DCACHE_OP_REVALIDATE)))
 		return 0;
 
+	if (likely(!(dentry->d_sb->s_type->fs_flags & FS_REVAL_DOT)))
+		return 0;
+
+	/* Note: we do not d_invalidate() */
 	status = d_revalidate(dentry, nd);
 	if (status > 0)
 		return 0;
 
-	if (!status) {
-		d_invalidate(dentry);
+	if (!status)
 		status = -ESTALE;
-	}
+
 	return status;
 }
 
@@ -728,6 +718,7 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l
 		path_put(&nd->path);
 		nd->path = nd->root;
 		path_get(&nd->root);
+		nd->flags |= LOOKUP_JUMPED;
 	}
 	nd->inode = nd->path.dentry->d_inode;
 
@@ -779,11 +770,8 @@ __do_follow_link(const struct path *link, struct nameidata *nd, void **p)
 		error = 0;
 		if (s)
 			error = __vfs_follow_link(nd, s);
-		else if (nd->last_type == LAST_BIND) {
-			error = force_reval_path(&nd->path, nd);
-			if (error)
-				path_put(&nd->path);
-		}
+		else if (nd->last_type == LAST_BIND)
+			nd->flags |= LOOKUP_JUMPED;
 	}
 	return error;
 }
@@ -1351,7 +1339,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 	while (*name=='/')
 		name++;
 	if (!*name)
-		goto return_reval;
+		goto return_base;
 
 	if (nd->depth)
 		lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE);
@@ -1385,12 +1373,16 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		type = LAST_NORM;
 		if (this.name[0] == '.') switch (this.len) {
 			case 2:
-				if (this.name[1] == '.')
+				if (this.name[1] == '.') {
 					type = LAST_DOTDOT;
+					nd->flags |= LOOKUP_JUMPED;
+				}
 				break;
 			case 1:
 				type = LAST_DOT;
 		}
+		if (likely(type == LAST_NORM))
+			nd->flags &= ~LOOKUP_JUMPED;
 
 		/* remove trailing slashes? */
 		if (!c)
@@ -1456,7 +1448,7 @@ last_component:
 				} else
 					follow_dotdot(nd);
 			}
-			goto return_reval;
+			goto return_base;
 		}
 		err = do_lookup(nd, &this, &next, &inode);
 		if (err)
@@ -1483,24 +1475,6 @@ last_component:
 lookup_parent:
 		nd->last = this;
 		nd->last_type = type;
-		if (type == LAST_NORM)
-			goto return_base;
-return_reval:
-		/*
-		 * We bypassed the ordinary revalidation routines.
-		 * We may need to check the cached dentry for staleness.
-		 */
-		if (need_reval_dot(nd->path.dentry)) {
-			if (nameidata_drop_rcu_last_maybe(nd))
-				return -ECHILD;
-			/* Note: we do not d_invalidate() */
-			err = d_revalidate(nd->path.dentry, nd);
-			if (!err)
-				err = -ESTALE;
-			if (err < 0)
-				break;
-			return 0;
-		}
 return_base:
 		if (nameidata_drop_rcu_last_maybe(nd))
 			return -ECHILD;
@@ -1523,7 +1497,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct namei
 	struct file *file;
 
 	nd->last_type = LAST_ROOT; /* if there are only slashes... */
-	nd->flags = flags;
+	nd->flags = flags | LOOKUP_JUMPED;
 	nd->depth = 0;
 	nd->root.mnt = NULL;
 	nd->file = NULL;
@@ -1630,6 +1604,9 @@ static int path_lookupat(int dfd, const char *name,
 		br_read_unlock(vfsmount_lock);
 	}
 
+	if (!retval)
+		retval = handle_reval_path(nd);
+
 	if (nd->file) {
 		fput(nd->file);
 		nd->file = NULL;
@@ -1690,7 +1667,7 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
 
 	/* same as do_path_lookup */
 	nd->last_type = LAST_ROOT;
-	nd->flags = flags;
+	nd->flags = flags | LOOKUP_JUMPED;
 	nd->depth = 0;
 
 	nd->path.dentry = dentry;
@@ -1703,6 +1680,8 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
 	current->total_link_count = 0;
 
 	result = link_path_walk(name, nd);
+	if (!result)
+		result = handle_reval_path(nd);
 	if (result == -ESTALE) {
 		/* nd->path had been dropped */
 		current->total_link_count = 0;
@@ -1710,8 +1689,11 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
 		nd->path.mnt = mnt;
 		nd->inode = dentry->d_inode;
 		path_get(&nd->path);
-		nd->flags |= LOOKUP_REVAL;
+		nd->flags = flags | LOOKUP_JUMPED | LOOKUP_REVAL;
+
 		result = link_path_walk(name, nd);
+		if (!result)
+			result = handle_reval_path(nd);
 	}
 	if (unlikely(!result && !audit_dummy_context() && nd->path.dentry &&
 				nd->inode))
@@ -2198,30 +2180,29 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 {
 	struct dentry *dir = nd->path.dentry;
 	struct file *filp;
-	int error = -EISDIR;
+	int error;
 
 	switch (nd->last_type) {
 	case LAST_DOTDOT:
 		follow_dotdot(nd);
 		dir = nd->path.dentry;
 	case LAST_DOT:
-		if (need_reval_dot(dir)) {
-			int status = d_revalidate(nd->path.dentry, nd);
-			if (!status)
-				status = -ESTALE;
-			if (status < 0) {
-				error = status;
-				goto exit;
-			}
-		}
 		/* fallthrough */
 	case LAST_ROOT:
+		error = handle_reval_path(nd);
+		if (error)
+			goto exit;
+		error = -EISDIR;
 		goto exit;
 	case LAST_BIND:
+		error = handle_reval_path(nd);
+		if (error)
+			goto exit;
 		audit_inode(pathname, dir);
 		goto ok;
 	}
 
+	error = -EISDIR;
 	/* trailing slashes? */
 	if (nd->last.name[nd->last.len])
 		goto exit;
@@ -2422,7 +2403,7 @@ reval:
 	/*
 	 * We have the parent and last component.
 	 */
-	nd.flags = flags;
+	nd.flags = (nd.flags & ~LOOKUP_PARENT) | flags;
 	filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
 	while (unlikely(!filp)) { /* trailing symlink */
 		struct path link = path;
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 58ce3433d4ec..265378a707bd 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -63,6 +63,8 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND};
 #define LOOKUP_EXCL		0x0400
 #define LOOKUP_RENAME_TARGET	0x0800
 
+#define LOOKUP_JUMPED		0x1000
+
 extern int user_path_at(int, const char __user *, unsigned, struct path *);
 
 #define user_path(name, path) user_path_at(AT_FDCWD, name, LOOKUP_FOLLOW, path)
-- 
cgit v1.2.3


From 086e183a641109033420e0b26ddecb6f4abb4c89 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 22 Feb 2011 20:56:27 -0500
Subject: pull dropping RCU on success of link_path_walk() into path_lookupat()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 30 ++++++++++++------------------
 1 file changed, 12 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 450b686e9682..8f10a9ff9f6b 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -539,14 +539,6 @@ err_unlock:
 	return -ECHILD;
 }
 
-/* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing.  */
-static inline int nameidata_drop_rcu_last_maybe(struct nameidata *nd)
-{
-	if (likely(nd->flags & LOOKUP_RCU))
-		return nameidata_drop_rcu_last(nd);
-	return 0;
-}
-
 /**
  * release_open_intent - free up open intent resources
  * @nd: pointer to nameidata
@@ -1339,7 +1331,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 	while (*name=='/')
 		name++;
 	if (!*name)
-		goto return_base;
+		return 0;
 
 	if (nd->depth)
 		lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE);
@@ -1448,7 +1440,7 @@ last_component:
 				} else
 					follow_dotdot(nd);
 			}
-			goto return_base;
+			return 0;
 		}
 		err = do_lookup(nd, &this, &next, &inode);
 		if (err)
@@ -1471,13 +1463,10 @@ last_component:
 			if (!nd->inode->i_op->lookup)
 				break;
 		}
-		goto return_base;
+		return 0;
 lookup_parent:
 		nd->last = this;
 		nd->last_type = type;
-return_base:
-		if (nameidata_drop_rcu_last_maybe(nd))
-			return -ECHILD;
 		return 0;
 out_dput:
 		if (!(nd->flags & LOOKUP_RCU))
@@ -1598,10 +1587,15 @@ static int path_lookupat(int dfd, const char *name,
 
 	if (nd->flags & LOOKUP_RCU) {
 		/* RCU dangling. Cancel it. */
-		nd->flags &= ~LOOKUP_RCU;
-		nd->root.mnt = NULL;
-		rcu_read_unlock();
-		br_read_unlock(vfsmount_lock);
+		if (!retval) {
+			if (nameidata_drop_rcu_last(nd))
+				retval = -ECHILD;
+		} else {
+			nd->flags &= ~LOOKUP_RCU;
+			nd->root.mnt = NULL;
+			rcu_read_unlock();
+			br_read_unlock(vfsmount_lock);
+		}
 	}
 
 	if (!retval)
-- 
cgit v1.2.3


From 36f3b4f69070fee7c647bab5dc4408990bb3606c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 22 Feb 2011 21:24:38 -0500
Subject: pull security_inode_follow_link() into __do_follow_link()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 8f10a9ff9f6b..f956567270bb 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -754,6 +754,13 @@ __do_follow_link(const struct path *link, struct nameidata *nd, void **p)
 	if (link->mnt == nd->path.mnt)
 		mntget(link->mnt);
 
+	error = security_inode_follow_link(link->dentry, nd);
+	if (error) {
+		*p = ERR_PTR(error); /* no ->put_link(), please */
+		path_put(&nd->path);
+		return error;
+	}
+
 	nd->last_type = LAST_BIND;
 	*p = dentry->d_inode->i_op->follow_link(dentry, nd);
 	error = PTR_ERR(*p);
@@ -791,9 +798,6 @@ static inline int do_follow_link(struct inode *inode, struct path *path, struct
 		goto loop;
 	BUG_ON(nd->depth >= MAX_NESTED_LINKS);
 	cond_resched();
-	err = security_inode_follow_link(path->dentry, nd);
-	if (err)
-		goto loop;
 	current->link_count++;
 	current->total_link_count++;
 	nd->depth++;
@@ -2420,9 +2424,6 @@ reval:
 		 * just set LAST_BIND.
 		 */
 		nd.flags |= LOOKUP_PARENT;
-		error = security_inode_follow_link(link.dentry, &nd);
-		if (error)
-			goto exit_dput;
 		error = __do_follow_link(&link, &nd, &cookie);
 		if (unlikely(error)) {
 			if (!IS_ERR(cookie) && linki->i_op->put_link)
-- 
cgit v1.2.3


From f1afe9efc84476ca42fbb7301a441021063eead7 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 22 Feb 2011 22:27:28 -0500
Subject: clean up the failure exits after __do_follow_link() in do_filp_open()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index f956567270bb..e0f59031be87 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2426,15 +2426,12 @@ reval:
 		nd.flags |= LOOKUP_PARENT;
 		error = __do_follow_link(&link, &nd, &cookie);
 		if (unlikely(error)) {
-			if (!IS_ERR(cookie) && linki->i_op->put_link)
-				linki->i_op->put_link(link.dentry, &nd, cookie);
-			/* nd.path had been dropped */
-			nd.path = link;
-			goto out_path;
+			filp = ERR_PTR(error);
+		} else {
+			nd.flags &= ~LOOKUP_PARENT;
+			filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
 		}
-		nd.flags &= ~LOOKUP_PARENT;
-		filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
-		if (linki->i_op->put_link)
+		if (!IS_ERR(cookie) && linki->i_op->put_link)
 			linki->i_op->put_link(link.dentry, &nd, cookie);
 		path_put(&link);
 	}
-- 
cgit v1.2.3


From c3e380b0b3cfa613189fb91513efd88a65e1d9d8 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 23 Feb 2011 13:39:45 -0500
Subject: Collect "operation mode" arguments of do_last() into a structure

No point messing with passing shitloads of "operation mode" arguments
to do_open() one by one, especially since they are not going to change
during do_filp_open().  Collect them into a struct, fill it and pass
to do_last() by reference.

Make sure that lookup intent flags are correctly set and removed - we
want them for do_last(), but they make no sense for __do_follow_link().

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 57 +++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 35 insertions(+), 22 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index e0f59031be87..5e4206f45371 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2169,17 +2169,26 @@ exit:
 	return ERR_PTR(error);
 }
 
+struct open_flags {
+	int open_flag;
+	int mode;
+	int acc_mode;
+	int intent;
+};
+
 /*
  * Handle O_CREAT case for do_filp_open
  */
 static struct file *do_last(struct nameidata *nd, struct path *path,
-			    int open_flag, int acc_mode,
-			    int mode, const char *pathname)
+			    const struct open_flags *op, const char *pathname)
 {
 	struct dentry *dir = nd->path.dentry;
 	struct file *filp;
 	int error;
 
+	nd->flags &= ~LOOKUP_PARENT;
+	nd->flags |= op->intent;
+
 	switch (nd->last_type) {
 	case LAST_DOTDOT:
 		follow_dotdot(nd);
@@ -2233,7 +2242,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 		error = mnt_want_write(nd->path.mnt);
 		if (error)
 			goto exit_mutex_unlock;
-		error = __open_namei_create(nd, path, open_flag, mode);
+		error = __open_namei_create(nd, path, op->open_flag, op->mode);
 		if (error) {
 			mnt_drop_write(nd->path.mnt);
 			goto exit;
@@ -2242,7 +2251,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 		mnt_drop_write(nd->path.mnt);
 		path_put(&nd->path);
 		if (!IS_ERR(filp)) {
-			error = ima_file_check(filp, acc_mode);
+			error = ima_file_check(filp, op->acc_mode);
 			if (error) {
 				fput(filp);
 				filp = ERR_PTR(error);
@@ -2258,7 +2267,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 	audit_inode(pathname, path->dentry);
 
 	error = -EEXIST;
-	if (open_flag & O_EXCL)
+	if (op->open_flag & O_EXCL)
 		goto exit_dput;
 
 	error = follow_managed(path, nd->flags);
@@ -2278,7 +2287,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 	if (S_ISDIR(nd->inode->i_mode))
 		goto exit;
 ok:
-	filp = finish_open(nd, open_flag, acc_mode);
+	filp = finish_open(nd, op->open_flag, op->acc_mode);
 	return filp;
 
 exit_mutex_unlock:
@@ -2304,7 +2313,8 @@ struct file *do_filp_open(int dfd, const char *pathname,
 	struct path path;
 	int count = 0;
 	int flag = open_to_namei_flags(open_flag);
-	int flags;
+	int flags = 0;
+	struct open_flags op;
 
 	if (!(open_flag & O_CREAT))
 		mode = 0;
@@ -2321,6 +2331,8 @@ struct file *do_filp_open(int dfd, const char *pathname,
 	if (open_flag & __O_SYNC)
 		open_flag |= O_DSYNC;
 
+	op.open_flag = open_flag;
+
 	if (!acc_mode)
 		acc_mode = MAY_OPEN | ACC_MODE(open_flag);
 
@@ -2333,12 +2345,15 @@ struct file *do_filp_open(int dfd, const char *pathname,
 	if (open_flag & O_APPEND)
 		acc_mode |= MAY_APPEND;
 
-	flags = LOOKUP_OPEN;
+	op.acc_mode = acc_mode;
+
+	op.intent = LOOKUP_OPEN;
 	if (open_flag & O_CREAT) {
-		flags |= LOOKUP_CREATE;
+		op.intent |= LOOKUP_CREATE;
 		if (open_flag & O_EXCL)
-			flags |= LOOKUP_EXCL;
+			op.intent |= LOOKUP_EXCL;
 	}
+
 	if (open_flag & O_DIRECTORY)
 		flags |= LOOKUP_DIRECTORY;
 	if (!(open_flag & O_NOFOLLOW))
@@ -2357,7 +2372,7 @@ struct file *do_filp_open(int dfd, const char *pathname,
 		goto creat;
 
 	/* !O_CREAT, simple open */
-	error = do_path_lookup(dfd, pathname, flags, &nd);
+	error = do_path_lookup(dfd, pathname, flags | op.intent, &nd);
 	if (unlikely(error))
 		goto out_filp2;
 	error = -ELOOP;
@@ -2384,14 +2399,14 @@ out_filp2:
 
 creat:
 	/* OK, have to create the file. Find the parent. */
-	error = path_lookupat(dfd, pathname, LOOKUP_PARENT | LOOKUP_RCU, &nd);
+	error = path_lookupat(dfd, pathname,
+			LOOKUP_PARENT | LOOKUP_RCU | flags, &nd);
 	if (unlikely(error == -ECHILD))
-		error = path_lookupat(dfd, pathname, LOOKUP_PARENT, &nd);
+		error = path_lookupat(dfd, pathname, LOOKUP_PARENT | flags, &nd);
 	if (unlikely(error == -ESTALE)) {
 reval:
 		flags |= LOOKUP_REVAL;
-		error = path_lookupat(dfd, pathname,
-				LOOKUP_PARENT | LOOKUP_REVAL, &nd);
+		error = path_lookupat(dfd, pathname, LOOKUP_PARENT | flags, &nd);
 	}
 	if (unlikely(error))
 		goto out_filp;
@@ -2401,8 +2416,7 @@ reval:
 	/*
 	 * We have the parent and last component.
 	 */
-	nd.flags = (nd.flags & ~LOOKUP_PARENT) | flags;
-	filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
+	filp = do_last(&nd, &path, &op, pathname);
 	while (unlikely(!filp)) { /* trailing symlink */
 		struct path link = path;
 		struct inode *linki = link.dentry->d_inode;
@@ -2424,13 +2438,12 @@ reval:
 		 * just set LAST_BIND.
 		 */
 		nd.flags |= LOOKUP_PARENT;
+		nd.flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
 		error = __do_follow_link(&link, &nd, &cookie);
-		if (unlikely(error)) {
+		if (unlikely(error))
 			filp = ERR_PTR(error);
-		} else {
-			nd.flags &= ~LOOKUP_PARENT;
-			filp = do_last(&nd, &path, open_flag, acc_mode, mode, pathname);
-		}
+		else
+			filp = do_last(&nd, &path, &op, pathname);
 		if (!IS_ERR(cookie) && linki->i_op->put_link)
 			linki->i_op->put_link(link.dentry, &nd, cookie);
 		path_put(&link);
-- 
cgit v1.2.3


From 47c805dc2d2dff686962f5f0baa6bac2d703ba19 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 23 Feb 2011 17:44:09 -0500
Subject: switch do_filp_open() to struct open_flags

take calculation of open_flags by open(2) arguments into new helper
in fs/open.c, move filp_open() over there, have it and do_sys_open()
use that helper, switch exec.c callers of do_filp_open() to explicit
(and constant) struct open_flags.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/exec.c          | 18 +++++++----
 fs/internal.h      |  8 +++++
 fs/namei.c         | 88 ++++++------------------------------------------------
 fs/open.c          | 73 +++++++++++++++++++++++++++++++++++++++++++-
 include/linux/fs.h |  2 --
 5 files changed, 101 insertions(+), 88 deletions(-)

(limited to 'fs')

diff --git a/fs/exec.c b/fs/exec.c
index 52a447d9b6ab..ba99e1abb1aa 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -115,13 +115,16 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
 	struct file *file;
 	char *tmp = getname(library);
 	int error = PTR_ERR(tmp);
+	static const struct open_flags uselib_flags = {
+		.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
+		.acc_mode = MAY_READ | MAY_EXEC | MAY_OPEN,
+		.intent = LOOKUP_OPEN
+	};
 
 	if (IS_ERR(tmp))
 		goto out;
 
-	file = do_filp_open(AT_FDCWD, tmp,
-				O_LARGEFILE | O_RDONLY | __FMODE_EXEC, 0,
-				MAY_READ | MAY_EXEC | MAY_OPEN);
+	file = do_filp_open(AT_FDCWD, tmp, &uselib_flags, LOOKUP_FOLLOW);
 	putname(tmp);
 	error = PTR_ERR(file);
 	if (IS_ERR(file))
@@ -721,10 +724,13 @@ struct file *open_exec(const char *name)
 {
 	struct file *file;
 	int err;
+	static const struct open_flags open_exec_flags = {
+		.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
+		.acc_mode = MAY_EXEC | MAY_OPEN,
+		.intent = LOOKUP_OPEN
+	};
 
-	file = do_filp_open(AT_FDCWD, name,
-				O_LARGEFILE | O_RDONLY | __FMODE_EXEC, 0,
-				MAY_EXEC | MAY_OPEN);
+	file = do_filp_open(AT_FDCWD, name, &open_exec_flags, LOOKUP_FOLLOW);
 	if (IS_ERR(file))
 		goto out;
 
diff --git a/fs/internal.h b/fs/internal.h
index 9b976b57d7fe..6fdbdf2c6047 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -106,6 +106,14 @@ extern void put_super(struct super_block *sb);
 struct nameidata;
 extern struct file *nameidata_to_filp(struct nameidata *);
 extern void release_open_intent(struct nameidata *);
+struct open_flags {
+	int open_flag;
+	int mode;
+	int acc_mode;
+	int intent;
+};
+extern struct file *do_filp_open(int dfd, const char *pathname,
+		const struct open_flags *op, int lookup_flags);
 
 /*
  * inode.c
diff --git a/fs/namei.c b/fs/namei.c
index 5e4206f45371..9c7fa946abe1 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2169,13 +2169,6 @@ exit:
 	return ERR_PTR(error);
 }
 
-struct open_flags {
-	int open_flag;
-	int mode;
-	int acc_mode;
-	int intent;
-};
-
 /*
  * Handle O_CREAT case for do_filp_open
  */
@@ -2305,74 +2298,28 @@ exit:
  * open_to_namei_flags() for more details.
  */
 struct file *do_filp_open(int dfd, const char *pathname,
-		int open_flag, int mode, int acc_mode)
+		const struct open_flags *op, int flags)
 {
 	struct file *filp;
 	struct nameidata nd;
 	int error;
 	struct path path;
 	int count = 0;
-	int flag = open_to_namei_flags(open_flag);
-	int flags = 0;
-	struct open_flags op;
-
-	if (!(open_flag & O_CREAT))
-		mode = 0;
-
-	/* Must never be set by userspace */
-	open_flag &= ~FMODE_NONOTIFY;
-
-	/*
-	 * O_SYNC is implemented as __O_SYNC|O_DSYNC.  As many places only
-	 * check for O_DSYNC if the need any syncing at all we enforce it's
-	 * always set instead of having to deal with possibly weird behaviour
-	 * for malicious applications setting only __O_SYNC.
-	 */
-	if (open_flag & __O_SYNC)
-		open_flag |= O_DSYNC;
-
-	op.open_flag = open_flag;
-
-	if (!acc_mode)
-		acc_mode = MAY_OPEN | ACC_MODE(open_flag);
-
-	/* O_TRUNC implies we need access checks for write permissions */
-	if (open_flag & O_TRUNC)
-		acc_mode |= MAY_WRITE;
-
-	/* Allow the LSM permission hook to distinguish append 
-	   access from general write access. */
-	if (open_flag & O_APPEND)
-		acc_mode |= MAY_APPEND;
-
-	op.acc_mode = acc_mode;
-
-	op.intent = LOOKUP_OPEN;
-	if (open_flag & O_CREAT) {
-		op.intent |= LOOKUP_CREATE;
-		if (open_flag & O_EXCL)
-			op.intent |= LOOKUP_EXCL;
-	}
-
-	if (open_flag & O_DIRECTORY)
-		flags |= LOOKUP_DIRECTORY;
-	if (!(open_flag & O_NOFOLLOW))
-		flags |= LOOKUP_FOLLOW;
 
 	filp = get_empty_filp();
 	if (!filp)
 		return ERR_PTR(-ENFILE);
 
-	filp->f_flags = open_flag;
+	filp->f_flags = op->open_flag;
 	nd.intent.open.file = filp;
-	nd.intent.open.flags = flag;
-	nd.intent.open.create_mode = mode;
+	nd.intent.open.flags = open_to_namei_flags(op->open_flag);
+	nd.intent.open.create_mode = op->mode;
 
-	if (open_flag & O_CREAT)
+	if (op->open_flag & O_CREAT)
 		goto creat;
 
 	/* !O_CREAT, simple open */
-	error = do_path_lookup(dfd, pathname, flags | op.intent, &nd);
+	error = do_path_lookup(dfd, pathname, flags | op->intent, &nd);
 	if (unlikely(error))
 		goto out_filp2;
 	error = -ELOOP;
@@ -2386,7 +2333,7 @@ struct file *do_filp_open(int dfd, const char *pathname,
 			goto out_path2;
 	}
 	audit_inode(pathname, nd.path.dentry);
-	filp = finish_open(&nd, open_flag, acc_mode);
+	filp = finish_open(&nd, op->open_flag, op->acc_mode);
 out2:
 	release_open_intent(&nd);
 	return filp;
@@ -2416,7 +2363,7 @@ reval:
 	/*
 	 * We have the parent and last component.
 	 */
-	filp = do_last(&nd, &path, &op, pathname);
+	filp = do_last(&nd, &path, op, pathname);
 	while (unlikely(!filp)) { /* trailing symlink */
 		struct path link = path;
 		struct inode *linki = link.dentry->d_inode;
@@ -2443,7 +2390,7 @@ reval:
 		if (unlikely(error))
 			filp = ERR_PTR(error);
 		else
-			filp = do_last(&nd, &path, &op, pathname);
+			filp = do_last(&nd, &path, op, pathname);
 		if (!IS_ERR(cookie) && linki->i_op->put_link)
 			linki->i_op->put_link(link.dentry, &nd, cookie);
 		path_put(&link);
@@ -2465,23 +2412,6 @@ out_filp:
 	goto out;
 }
 
-/**
- * filp_open - open file and return file pointer
- *
- * @filename:	path to open
- * @flags:	open flags as per the open(2) second argument
- * @mode:	mode for the new file if O_CREAT is set, else ignored
- *
- * This is the helper to open a file from kernelspace if you really
- * have to.  But in generally you should not do this, so please move
- * along, nothing to see here..
- */
-struct file *filp_open(const char *filename, int flags, int mode)
-{
-	return do_filp_open(AT_FDCWD, filename, flags, mode, 0);
-}
-EXPORT_SYMBOL(filp_open);
-
 /**
  * lookup_create - lookup a dentry, creating it if it doesn't exist
  * @nd: nameidata info
diff --git a/fs/open.c b/fs/open.c
index b47aab39c057..d05e18c60bae 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -890,15 +890,86 @@ void fd_install(unsigned int fd, struct file *file)
 
 EXPORT_SYMBOL(fd_install);
 
+static inline int build_open_flags(int flags, int mode, struct open_flags *op)
+{
+	int lookup_flags = 0;
+	int acc_mode;
+
+	if (!(flags & O_CREAT))
+		mode = 0;
+	op->mode = mode;
+
+	/* Must never be set by userspace */
+	flags &= ~FMODE_NONOTIFY;
+
+	/*
+	 * O_SYNC is implemented as __O_SYNC|O_DSYNC.  As many places only
+	 * check for O_DSYNC if the need any syncing at all we enforce it's
+	 * always set instead of having to deal with possibly weird behaviour
+	 * for malicious applications setting only __O_SYNC.
+	 */
+	if (flags & __O_SYNC)
+		flags |= O_DSYNC;
+
+	op->open_flag = flags;
+
+	acc_mode = MAY_OPEN | ACC_MODE(flags);
+
+	/* O_TRUNC implies we need access checks for write permissions */
+	if (flags & O_TRUNC)
+		acc_mode |= MAY_WRITE;
+
+	/* Allow the LSM permission hook to distinguish append
+	   access from general write access. */
+	if (flags & O_APPEND)
+		acc_mode |= MAY_APPEND;
+
+	op->acc_mode = acc_mode;
+
+	op->intent = LOOKUP_OPEN;
+	if (flags & O_CREAT) {
+		op->intent |= LOOKUP_CREATE;
+		if (flags & O_EXCL)
+			op->intent |= LOOKUP_EXCL;
+	}
+
+	if (flags & O_DIRECTORY)
+		lookup_flags |= LOOKUP_DIRECTORY;
+	if (!(flags & O_NOFOLLOW))
+		lookup_flags |= LOOKUP_FOLLOW;
+	return lookup_flags;
+}
+
+/**
+ * filp_open - open file and return file pointer
+ *
+ * @filename:	path to open
+ * @flags:	open flags as per the open(2) second argument
+ * @mode:	mode for the new file if O_CREAT is set, else ignored
+ *
+ * This is the helper to open a file from kernelspace if you really
+ * have to.  But in generally you should not do this, so please move
+ * along, nothing to see here..
+ */
+struct file *filp_open(const char *filename, int flags, int mode)
+{
+	struct open_flags op;
+	int lookup = build_open_flags(flags, mode, &op);
+	return do_filp_open(AT_FDCWD, filename, &op, lookup);
+}
+EXPORT_SYMBOL(filp_open);
+
 long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
 {
+	struct open_flags op;
+	int lookup = build_open_flags(flags, mode, &op);
 	char *tmp = getname(filename);
 	int fd = PTR_ERR(tmp);
 
 	if (!IS_ERR(tmp)) {
 		fd = get_unused_fd_flags(flags);
 		if (fd >= 0) {
-			struct file *f = do_filp_open(dfd, tmp, flags, mode, 0);
+			struct file *f = do_filp_open(dfd, tmp, &op, lookup);
 			if (IS_ERR(f)) {
 				put_unused_fd(fd);
 				fd = PTR_ERR(f);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e38b50a4b9d2..9c75714f92c1 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2205,8 +2205,6 @@ extern struct file *create_read_pipe(struct file *f, int flags);
 extern struct file *create_write_pipe(int flags);
 extern void free_write_pipe(struct file *);
 
-extern struct file *do_filp_open(int dfd, const char *pathname,
-		int open_flag, int mode, int acc_mode);
 extern int may_open(struct path *, int, int);
 
 extern int kernel_read(struct file *, loff_t, char *, unsigned long);
-- 
cgit v1.2.3


From 13aab428a73d3200b9283b61b7fdf5713181ac66 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 23 Feb 2011 17:54:08 -0500
Subject: separate -ESTALE/-ECHILD retries in do_filp_open() from real work

new helper: path_openat().  Does what do_filp_open() does, except
that it tries only the walk mode (RCU/normal/force revalidation)
it had been told to.

Both create and non-create branches are using path_lookupat() now.
Fixed the double audit_inode() in non-create branch.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 49 ++++++++++++++++++++-----------------------------
 1 file changed, 20 insertions(+), 29 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 9c7fa946abe1..01a17dd2f151 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2292,19 +2292,14 @@ exit:
 	return ERR_PTR(error);
 }
 
-/*
- * Note that the low bits of the passed in "open_flag"
- * are not the same as in the local variable "flag". See
- * open_to_namei_flags() for more details.
- */
-struct file *do_filp_open(int dfd, const char *pathname,
+static struct file *path_openat(int dfd, const char *pathname,
 		const struct open_flags *op, int flags)
 {
 	struct file *filp;
 	struct nameidata nd;
-	int error;
 	struct path path;
 	int count = 0;
+	int error;
 
 	filp = get_empty_filp();
 	if (!filp)
@@ -2319,42 +2314,27 @@ struct file *do_filp_open(int dfd, const char *pathname,
 		goto creat;
 
 	/* !O_CREAT, simple open */
-	error = do_path_lookup(dfd, pathname, flags | op->intent, &nd);
+	error = path_lookupat(dfd, pathname, flags | op->intent, &nd);
 	if (unlikely(error))
-		goto out_filp2;
+		goto out_filp;
 	error = -ELOOP;
 	if (!(nd.flags & LOOKUP_FOLLOW)) {
 		if (nd.inode->i_op->follow_link)
-			goto out_path2;
+			goto out_path;
 	}
 	error = -ENOTDIR;
 	if (nd.flags & LOOKUP_DIRECTORY) {
 		if (!nd.inode->i_op->lookup)
-			goto out_path2;
+			goto out_path;
 	}
 	audit_inode(pathname, nd.path.dentry);
 	filp = finish_open(&nd, op->open_flag, op->acc_mode);
-out2:
 	release_open_intent(&nd);
 	return filp;
 
-out_path2:
-	path_put(&nd.path);
-out_filp2:
-	filp = ERR_PTR(error);
-	goto out2;
-
 creat:
 	/* OK, have to create the file. Find the parent. */
-	error = path_lookupat(dfd, pathname,
-			LOOKUP_PARENT | LOOKUP_RCU | flags, &nd);
-	if (unlikely(error == -ECHILD))
-		error = path_lookupat(dfd, pathname, LOOKUP_PARENT | flags, &nd);
-	if (unlikely(error == -ESTALE)) {
-reval:
-		flags |= LOOKUP_REVAL;
-		error = path_lookupat(dfd, pathname, LOOKUP_PARENT | flags, &nd);
-	}
+	error = path_lookupat(dfd, pathname, LOOKUP_PARENT | flags, &nd);
 	if (unlikely(error))
 		goto out_filp;
 	if (unlikely(!audit_dummy_context()))
@@ -2398,8 +2378,6 @@ reval:
 out:
 	if (nd.root.mnt)
 		path_put(&nd.root);
-	if (filp == ERR_PTR(-ESTALE) && !(flags & LOOKUP_REVAL))
-		goto reval;
 	release_open_intent(&nd);
 	return filp;
 
@@ -2412,6 +2390,19 @@ out_filp:
 	goto out;
 }
 
+struct file *do_filp_open(int dfd, const char *pathname,
+		const struct open_flags *op, int flags)
+{
+	struct file *filp;
+
+	filp = path_openat(dfd, pathname, op, flags | LOOKUP_RCU);
+	if (unlikely(filp == ERR_PTR(-ECHILD)))
+		filp = path_openat(dfd, pathname, op, flags);
+	if (unlikely(filp == ERR_PTR(-ESTALE)))
+		filp = path_openat(dfd, pathname, op, flags | LOOKUP_REVAL);
+	return filp;
+}
+
 /**
  * lookup_create - lookup a dentry, creating it if it doesn't exist
  * @nd: nameidata info
-- 
cgit v1.2.3


From 7bc055d1d524f209bf49d8b9cb220712dd7df4ed Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 23 Feb 2011 19:41:31 -0500
Subject: kill out_dput: in link_path_walk()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 01a17dd2f151..fea36369dc87 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1407,22 +1407,19 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		err = do_lookup(nd, &this, &next, &inode);
 		if (err)
 			break;
-		err = -ENOENT;
-		if (!inode)
-			goto out_dput;
 
-		if (inode->i_op->follow_link) {
+		if (inode && inode->i_op->follow_link) {
 			err = do_follow_link(inode, &next, nd);
 			if (err)
 				goto return_err;
 			nd->inode = nd->path.dentry->d_inode;
-			err = -ENOENT;
-			if (!nd->inode)
-				break;
 		} else {
 			path_to_nameidata(&next, nd);
 			nd->inode = inode;
 		}
+		err = -ENOENT;
+		if (!nd->inode)
+			break;
 		err = -ENOTDIR; 
 		if (!nd->inode->i_op->lookup)
 			break;
@@ -1472,10 +1469,6 @@ lookup_parent:
 		nd->last = this;
 		nd->last_type = type;
 		return 0;
-out_dput:
-		if (!(nd->flags & LOOKUP_RCU))
-			path_put_conditional(&next, nd);
-		break;
 	}
 	if (!(nd->flags & LOOKUP_RCU))
 		path_put(&nd->path);
-- 
cgit v1.2.3


From 9856fa1b281eccdc9f8d94d716e96818c675e78e Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 4 Mar 2011 14:22:06 -0500
Subject: pull handling of . and .. into inlined helper

getting LOOKUP_RCU checks out of link_path_walk()...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index fea36369dc87..d29f91e8ff3d 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1318,6 +1318,18 @@ static inline int may_lookup(struct nameidata *nd)
 	return exec_permission(nd->inode, 0);
 }
 
+static inline int handle_dots(struct nameidata *nd, int type)
+{
+	if (type == LAST_DOTDOT) {
+		if (nd->flags & LOOKUP_RCU) {
+			if (follow_dotdot_rcu(nd))
+				return -ECHILD;
+		} else
+			follow_dotdot(nd);
+	}
+	return 0;
+}
+
 /*
  * Name resolution.
  * This is the basic name resolution function, turning a pathname into
@@ -1393,13 +1405,8 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		 * parent relationships.
 		 */
 		if (unlikely(type != LAST_NORM)) {
-			if (type == LAST_DOTDOT) {
-				if (nd->flags & LOOKUP_RCU) {
-					if (follow_dotdot_rcu(nd))
-						return -ECHILD;
-				} else
-					follow_dotdot(nd);
-			}
+			if (handle_dots(nd, type))
+				return -ECHILD;
 			continue;
 		}
 
@@ -1434,13 +1441,8 @@ last_component:
 		if (lookup_flags & LOOKUP_PARENT)
 			goto lookup_parent;
 		if (unlikely(type != LAST_NORM)) {
-			if (type == LAST_DOTDOT) {
-				if (nd->flags & LOOKUP_RCU) {
-					if (follow_dotdot_rcu(nd))
-						return -ECHILD;
-				} else
-					follow_dotdot(nd);
-			}
+			if (handle_dots(nd, type))
+				return -ECHILD;
 			return 0;
 		}
 		err = do_lookup(nd, &this, &next, &inode);
-- 
cgit v1.2.3


From 4455ca6223cc59cbc0a75f4be8bce9e84cc0d6b8 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 4 Mar 2011 14:28:10 -0500
Subject: clear RCU on all failure exits from link_path_walk()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index d29f91e8ff3d..f09887a45831 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1405,8 +1405,9 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		 * parent relationships.
 		 */
 		if (unlikely(type != LAST_NORM)) {
-			if (handle_dots(nd, type))
-				return -ECHILD;
+			err = handle_dots(nd, type);
+			if (err)
+				goto return_err;
 			continue;
 		}
 
@@ -1441,8 +1442,9 @@ last_component:
 		if (lookup_flags & LOOKUP_PARENT)
 			goto lookup_parent;
 		if (unlikely(type != LAST_NORM)) {
-			if (handle_dots(nd, type))
-				return -ECHILD;
+			err = handle_dots(nd, type);
+			if (err)
+				goto return_err;
 			return 0;
 		}
 		err = do_lookup(nd, &this, &next, &inode);
@@ -1475,6 +1477,12 @@ lookup_parent:
 	if (!(nd->flags & LOOKUP_RCU))
 		path_put(&nd->path);
 return_err:
+	if (nd->flags & LOOKUP_RCU) {
+		nd->flags &= ~LOOKUP_RCU;
+		nd->root.mnt = NULL;
+		rcu_read_unlock();
+		br_read_unlock(vfsmount_lock);
+	}
 	return err;
 }
 
@@ -1585,16 +1593,10 @@ static int path_lookupat(int dfd, const char *name,
 	retval = link_path_walk(name, nd);
 
 	if (nd->flags & LOOKUP_RCU) {
-		/* RCU dangling. Cancel it. */
-		if (!retval) {
-			if (nameidata_drop_rcu_last(nd))
-				retval = -ECHILD;
-		} else {
-			nd->flags &= ~LOOKUP_RCU;
-			nd->root.mnt = NULL;
-			rcu_read_unlock();
-			br_read_unlock(vfsmount_lock);
-		}
+		/* went all way through without dropping RCU */
+		BUG_ON(retval);
+		if (nameidata_drop_rcu_last(nd))
+			retval = -ECHILD;
 	}
 
 	if (!retval)
-- 
cgit v1.2.3


From ef7562d5283a91da3ba5c14de3221f47b7f08823 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 4 Mar 2011 14:35:59 -0500
Subject: make handle_dots() leave RCU mode on error

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index f09887a45831..ea14bfb04785 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1052,7 +1052,7 @@ static int follow_dotdot_rcu(struct nameidata *nd)
 
 			seq = read_seqcount_begin(&parent->d_seq);
 			if (read_seqcount_retry(&old->d_seq, nd->seq))
-				return -ECHILD;
+				goto failed;
 			inode = parent->d_inode;
 			nd->path.dentry = parent;
 			nd->seq = seq;
@@ -1065,8 +1065,14 @@ static int follow_dotdot_rcu(struct nameidata *nd)
 	}
 	__follow_mount_rcu(nd, &nd->path, &inode, true);
 	nd->inode = inode;
-
 	return 0;
+
+failed:
+	nd->flags &= ~LOOKUP_RCU;
+	nd->root.mnt = NULL;
+	rcu_read_unlock();
+	br_read_unlock(vfsmount_lock);
+	return -ECHILD;
 }
 
 /*
@@ -1405,9 +1411,8 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		 * parent relationships.
 		 */
 		if (unlikely(type != LAST_NORM)) {
-			err = handle_dots(nd, type);
-			if (err)
-				goto return_err;
+			if (handle_dots(nd, type))
+				return -ECHILD;
 			continue;
 		}
 
@@ -1441,12 +1446,8 @@ last_component:
 		nd->flags &= lookup_flags | ~LOOKUP_CONTINUE;
 		if (lookup_flags & LOOKUP_PARENT)
 			goto lookup_parent;
-		if (unlikely(type != LAST_NORM)) {
-			err = handle_dots(nd, type);
-			if (err)
-				goto return_err;
-			return 0;
-		}
+		if (unlikely(type != LAST_NORM))
+			return handle_dots(nd, type);
 		err = do_lookup(nd, &this, &next, &inode);
 		if (err)
 			break;
-- 
cgit v1.2.3


From a7472baba22dd5d68580f528374f93421b33667e Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 4 Mar 2011 14:39:30 -0500
Subject: make nameidata_dentry_drop_rcu_maybe() always leave RCU mode

Now we have do_follow_link() guaranteed to leave without dangling RCU
and the next step will get LOOKUP_RCU logics completely out of
link_path_walk().

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index ea14bfb04785..53bba7c1a520 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -498,8 +498,15 @@ err_root:
 /* Try to drop out of rcu-walk mode if we were in it, otherwise do nothing.  */
 static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct dentry *dentry)
 {
-	if (nd->flags & LOOKUP_RCU)
-		return nameidata_dentry_drop_rcu(nd, dentry);
+	if (nd->flags & LOOKUP_RCU) {
+		if (unlikely(nameidata_dentry_drop_rcu(nd, dentry))) {
+			nd->flags &= ~LOOKUP_RCU;
+			nd->root.mnt = NULL;
+			rcu_read_unlock();
+			br_read_unlock(vfsmount_lock);
+			return -ECHILD;
+		}
+	}
 	return 0;
 }
 
@@ -1424,7 +1431,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		if (inode && inode->i_op->follow_link) {
 			err = do_follow_link(inode, &next, nd);
 			if (err)
-				goto return_err;
+				return err;
 			nd->inode = nd->path.dentry->d_inode;
 		} else {
 			path_to_nameidata(&next, nd);
@@ -1455,7 +1462,7 @@ last_component:
 		    (lookup_flags & LOOKUP_FOLLOW)) {
 			err = do_follow_link(inode, &next, nd);
 			if (err)
-				goto return_err;
+				return err;
 			nd->inode = nd->path.dentry->d_inode;
 		} else {
 			path_to_nameidata(&next, nd);
@@ -1477,7 +1484,6 @@ lookup_parent:
 	}
 	if (!(nd->flags & LOOKUP_RCU))
 		path_put(&nd->path);
-return_err:
 	if (nd->flags & LOOKUP_RCU) {
 		nd->flags &= ~LOOKUP_RCU;
 		nd->root.mnt = NULL;
-- 
cgit v1.2.3


From 951361f954596bd134d4270df834f47d151f98a6 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 4 Mar 2011 14:44:37 -0500
Subject: get rid of the last LOOKUP_RCU dependencies in link_path_walk()

New helper: terminate_walk().  An error has happened during pathname
resolution and we either drop nd->path or terminate RCU, depending
the mode we had been in.  After that, nd is essentially empty.
Switch link_path_walk() to using that for cleanup.

Now the top-level logics in link_path_walk() is back to sanity.  RCU
dependencies are in the lower-level functions.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 53bba7c1a520..85f6e39b4034 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1343,6 +1343,18 @@ static inline int handle_dots(struct nameidata *nd, int type)
 	return 0;
 }
 
+static void terminate_walk(struct nameidata *nd)
+{
+	if (!(nd->flags & LOOKUP_RCU)) {
+		path_put(&nd->path);
+	} else {
+		nd->flags &= ~LOOKUP_RCU;
+		nd->root.mnt = NULL;
+		rcu_read_unlock();
+		br_read_unlock(vfsmount_lock);
+	}
+}
+
 /*
  * Name resolution.
  * This is the basic name resolution function, turning a pathname into
@@ -1482,14 +1494,7 @@ lookup_parent:
 		nd->last_type = type;
 		return 0;
 	}
-	if (!(nd->flags & LOOKUP_RCU))
-		path_put(&nd->path);
-	if (nd->flags & LOOKUP_RCU) {
-		nd->flags &= ~LOOKUP_RCU;
-		nd->root.mnt = NULL;
-		rcu_read_unlock();
-		br_read_unlock(vfsmount_lock);
-	}
+	terminate_walk(nd);
 	return err;
 }
 
-- 
cgit v1.2.3


From 70e9b3571107b88674cd55ae4bed33f76261e7d3 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 5 Mar 2011 21:12:22 -0500
Subject: get rid of nd->file

Don't stash the struct file * used as starting point of walk in nameidata;
pass file ** to path_init() instead.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c            | 15 +++++++--------
 include/linux/namei.h |  1 -
 2 files changed, 7 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 85f6e39b4034..a260a306daf5 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1498,7 +1498,8 @@ lookup_parent:
 	return err;
 }
 
-static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
+static int path_init(int dfd, const char *name, unsigned int flags,
+		     struct nameidata *nd, struct file **fp)
 {
 	int retval = 0;
 	int fput_needed;
@@ -1508,7 +1509,6 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct namei
 	nd->flags = flags | LOOKUP_JUMPED;
 	nd->depth = 0;
 	nd->root.mnt = NULL;
-	nd->file = NULL;
 
 	if (*name=='/') {
 		if (flags & LOOKUP_RCU) {
@@ -1557,7 +1557,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct namei
 		nd->path = file->f_path;
 		if (flags & LOOKUP_RCU) {
 			if (fput_needed)
-				nd->file = file;
+				*fp = file;
 			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
 			br_read_lock(vfsmount_lock);
 			rcu_read_lock();
@@ -1580,6 +1580,7 @@ out_fail:
 static int path_lookupat(int dfd, const char *name,
 				unsigned int flags, struct nameidata *nd)
 {
+	struct file *base = NULL;
 	int retval;
 
 	/*
@@ -1596,7 +1597,7 @@ static int path_lookupat(int dfd, const char *name,
 	 * be handled by restarting a traditional ref-walk (which will always
 	 * be able to complete).
 	 */
-	retval = path_init(dfd, name, flags, nd);
+	retval = path_init(dfd, name, flags, nd, &base);
 
 	if (unlikely(retval))
 		return retval;
@@ -1614,10 +1615,8 @@ static int path_lookupat(int dfd, const char *name,
 	if (!retval)
 		retval = handle_reval_path(nd);
 
-	if (nd->file) {
-		fput(nd->file);
-		nd->file = NULL;
-	}
+	if (base)
+		fput(base);
 
 	if (nd->root.mnt) {
 		path_put(&nd->root);
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 265378a707bd..72ffd62ac736 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -19,7 +19,6 @@ struct nameidata {
 	struct path	path;
 	struct qstr	last;
 	struct path	root;
-	struct file	*file;
 	struct inode	*inode; /* path.dentry.d_inode */
 	unsigned int	flags;
 	unsigned	seq;
-- 
cgit v1.2.3


From fe2d35ff0d18a2c93993b0d7d46f846ff4331b72 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 5 Mar 2011 22:58:25 -0500
Subject: switch non-create side of open() to use of do_last()

Instead of path_lookupat() doing trailing symlink resolution,
use the same scheme as on the O_CREAT side.  Walk with
LOOKUP_PARENT, then (in do_last()) look the final component
up, then either open it or return error or, if it's a symlink,
give the symlink back to path_openat() to be resolved there.

The really messy complication here is RCU.  We don't want to drop
out of RCU mode before the final lookup, since we don't want to
bounce parent directory ->d_count without a good reason.

Result is _not_ pretty; later in the series we'll clean it up.
For now we are roughly back where we'd been before the revert
done by Nick's series - top-level logics of path_openat() is
cleaned up, do_last() does actual opening, symlink resolution is
done uniformly.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 100 +++++++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 67 insertions(+), 33 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index a260a306daf5..9595b4a55c39 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2178,13 +2178,14 @@ exit:
 }
 
 /*
- * Handle O_CREAT case for do_filp_open
+ * Handle the last step of open()
  */
 static struct file *do_last(struct nameidata *nd, struct path *path,
 			    const struct open_flags *op, const char *pathname)
 {
 	struct dentry *dir = nd->path.dentry;
 	struct file *filp;
+	struct inode *inode;
 	int error;
 
 	nd->flags &= ~LOOKUP_PARENT;
@@ -2192,17 +2193,27 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 
 	switch (nd->last_type) {
 	case LAST_DOTDOT:
-		follow_dotdot(nd);
-		dir = nd->path.dentry;
 	case LAST_DOT:
+		error = handle_dots(nd, nd->last_type);
+		if (error)
+			return ERR_PTR(error);
 		/* fallthrough */
 	case LAST_ROOT:
+		if (nd->flags & LOOKUP_RCU) {
+			if (nameidata_drop_rcu_last(nd))
+				return ERR_PTR(-ECHILD);
+		}
 		error = handle_reval_path(nd);
 		if (error)
 			goto exit;
-		error = -EISDIR;
-		goto exit;
+		audit_inode(pathname, nd->path.dentry);
+		if (op->open_flag & O_CREAT) {
+			error = -EISDIR;
+			goto exit;
+		}
+		goto ok;
 	case LAST_BIND:
+		/* can't be RCU mode here */
 		error = handle_reval_path(nd);
 		if (error)
 			goto exit;
@@ -2210,6 +2221,51 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 		goto ok;
 	}
 
+	if (!(op->open_flag & O_CREAT)) {
+		if (nd->last.name[nd->last.len])
+			nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
+		/* we _can_ be in RCU mode here */
+		error = do_lookup(nd, &nd->last, path, &inode);
+		if (error) {
+			terminate_walk(nd);
+			return ERR_PTR(error);
+		}
+		if (!inode) {
+			path_to_nameidata(path, nd);
+			terminate_walk(nd);
+			return ERR_PTR(-ENOENT);
+		}
+		if (unlikely(inode->i_op->follow_link)) {
+			/* We drop rcu-walk here */
+			if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry))
+				return ERR_PTR(-ECHILD);
+			return NULL;
+		}
+		path_to_nameidata(path, nd);
+		nd->inode = inode;
+		/* sayonara */
+		if (nd->flags & LOOKUP_RCU) {
+			if (nameidata_drop_rcu_last(nd))
+				return ERR_PTR(-ECHILD);
+		}
+
+		error = -ENOTDIR;
+		if (nd->flags & LOOKUP_DIRECTORY) {
+			if (!inode->i_op->lookup)
+				goto exit;
+		}
+		audit_inode(pathname, nd->path.dentry);
+		goto ok;
+	}
+
+	/* create side of things */
+
+	if (nd->flags & LOOKUP_RCU) {
+		if (nameidata_drop_rcu_last(nd))
+			return ERR_PTR(-ECHILD);
+	}
+
+	audit_inode(pathname, dir);
 	error = -EISDIR;
 	/* trailing slashes? */
 	if (nd->last.name[nd->last.len])
@@ -2303,6 +2359,7 @@ exit:
 static struct file *path_openat(int dfd, const char *pathname,
 		const struct open_flags *op, int flags)
 {
+	struct file *base = NULL;
 	struct file *filp;
 	struct nameidata nd;
 	struct path path;
@@ -2318,39 +2375,15 @@ static struct file *path_openat(int dfd, const char *pathname,
 	nd.intent.open.flags = open_to_namei_flags(op->open_flag);
 	nd.intent.open.create_mode = op->mode;
 
-	if (op->open_flag & O_CREAT)
-		goto creat;
-
-	/* !O_CREAT, simple open */
-	error = path_lookupat(dfd, pathname, flags | op->intent, &nd);
+	error = path_init(dfd, pathname, flags | LOOKUP_PARENT, &nd, &base);
 	if (unlikely(error))
 		goto out_filp;
-	error = -ELOOP;
-	if (!(nd.flags & LOOKUP_FOLLOW)) {
-		if (nd.inode->i_op->follow_link)
-			goto out_path;
-	}
-	error = -ENOTDIR;
-	if (nd.flags & LOOKUP_DIRECTORY) {
-		if (!nd.inode->i_op->lookup)
-			goto out_path;
-	}
-	audit_inode(pathname, nd.path.dentry);
-	filp = finish_open(&nd, op->open_flag, op->acc_mode);
-	release_open_intent(&nd);
-	return filp;
 
-creat:
-	/* OK, have to create the file. Find the parent. */
-	error = path_lookupat(dfd, pathname, LOOKUP_PARENT | flags, &nd);
+	current->total_link_count = 0;
+	error = link_path_walk(pathname, &nd);
 	if (unlikely(error))
 		goto out_filp;
-	if (unlikely(!audit_dummy_context()))
-		audit_inode(pathname, nd.path.dentry);
 
-	/*
-	 * We have the parent and last component.
-	 */
 	filp = do_last(&nd, &path, op, pathname);
 	while (unlikely(!filp)) { /* trailing symlink */
 		struct path link = path;
@@ -2386,12 +2419,13 @@ creat:
 out:
 	if (nd.root.mnt)
 		path_put(&nd.root);
+	if (base)
+		fput(base);
 	release_open_intent(&nd);
 	return filp;
 
 exit_dput:
 	path_put_conditional(&path, &nd);
-out_path:
 	path_put(&nd.path);
 out_filp:
 	filp = ERR_PTR(error);
-- 
cgit v1.2.3


From 6a96ba54418be740303765c0f52be028573cb99a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 7 Mar 2011 23:49:20 -0500
Subject: kill __lookup_one_len()

only one caller left

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 41 +++++++++++++++--------------------------
 1 file changed, 15 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 9595b4a55c39..f6f3ef47bc74 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1759,28 +1759,6 @@ static struct dentry *lookup_hash(struct nameidata *nd)
 	return __lookup_hash(&nd->last, nd->path.dentry, nd);
 }
 
-static int __lookup_one_len(const char *name, struct qstr *this,
-		struct dentry *base, int len)
-{
-	unsigned long hash;
-	unsigned int c;
-
-	this->name = name;
-	this->len = len;
-	if (!len)
-		return -EACCES;
-
-	hash = init_name_hash();
-	while (len--) {
-		c = *(const unsigned char *)name++;
-		if (c == '/' || c == '\0')
-			return -EACCES;
-		hash = partial_name_hash(c, hash);
-	}
-	this->hash = end_name_hash(hash);
-	return 0;
-}
-
 /**
  * lookup_one_len - filesystem helper to lookup single pathname component
  * @name:	pathname component to lookup
@@ -1794,14 +1772,25 @@ static int __lookup_one_len(const char *name, struct qstr *this,
  */
 struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 {
-	int err;
 	struct qstr this;
+	unsigned long hash;
+	unsigned int c;
 
 	WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
 
-	err = __lookup_one_len(name, &this, base, len);
-	if (err)
-		return ERR_PTR(err);
+	this.name = name;
+	this.len = len;
+	if (!len)
+		return ERR_PTR(-EACCES);
+
+	hash = init_name_hash();
+	while (len--) {
+		c = *(const unsigned char *)name++;
+		if (c == '/' || c == '\0')
+			return ERR_PTR(-EACCES);
+		hash = partial_name_hash(c, hash);
+	}
+	this.hash = end_name_hash(hash);
 
 	return __lookup_hash(&this, base, NULL);
 }
-- 
cgit v1.2.3


From 5a202bcd75bbd2397136397961babbd8463416af Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 8 Mar 2011 14:17:44 -0500
Subject: sanitize pathname component hash calculation

Lift it to lookup_one_len() and link_path_walk() resp. into the
same place where we calculated default hash function of the same
name.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 42 +++++++++++++++++++-----------------------
 1 file changed, 19 insertions(+), 23 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index f6f3ef47bc74..d1a5dfeaf999 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1216,16 +1216,6 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
 	struct inode *dir;
 	int err;
 
-	/*
-	 * See if the low-level filesystem might want
-	 * to use its own hash..
-	 */
-	if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
-		err = parent->d_op->d_hash(parent, nd->inode, name);
-		if (err < 0)
-			return err;
-	}
-
 	/*
 	 * Rename seqlock is not required here because in the off chance
 	 * of a false negative due to a concurrent rename, we're going to
@@ -1414,8 +1404,16 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 			case 1:
 				type = LAST_DOT;
 		}
-		if (likely(type == LAST_NORM))
+		if (likely(type == LAST_NORM)) {
+			struct dentry *parent = nd->path.dentry;
 			nd->flags &= ~LOOKUP_JUMPED;
+			if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
+				err = parent->d_op->d_hash(parent, nd->inode,
+							   &this);
+				if (err < 0)
+					break;
+			}
+		}
 
 		/* remove trailing slashes? */
 		if (!c)
@@ -1722,17 +1720,6 @@ static struct dentry *__lookup_hash(struct qstr *name,
 	if (err)
 		return ERR_PTR(err);
 
-	/*
-	 * See if the low-level filesystem might want
-	 * to use its own hash..
-	 */
-	if (base->d_flags & DCACHE_OP_HASH) {
-		err = base->d_op->d_hash(base, inode, name);
-		dentry = ERR_PTR(err);
-		if (err < 0)
-			goto out;
-	}
-
 	/*
 	 * Don't bother with __d_lookup: callers are for creat as
 	 * well as unlink, so a lot of the time it would cost
@@ -1745,7 +1732,7 @@ static struct dentry *__lookup_hash(struct qstr *name,
 
 	if (!dentry)
 		dentry = d_alloc_and_lookup(base, name, nd);
-out:
+
 	return dentry;
 }
 
@@ -1791,6 +1778,15 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 		hash = partial_name_hash(c, hash);
 	}
 	this.hash = end_name_hash(hash);
+	/*
+	 * See if the low-level filesystem might want
+	 * to use its own hash..
+	 */
+	if (base->d_flags & DCACHE_OP_HASH) {
+		int err = base->d_op->d_hash(base, base->d_inode, &this);
+		if (err < 0)
+			return ERR_PTR(err);
+	}
 
 	return __lookup_hash(&this, base, NULL);
 }
-- 
cgit v1.2.3


From 0f9d1a10c341020617e5b1c7f9c16f6a070438ec Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 9 Mar 2011 00:13:14 -0500
Subject: expand finish_open() in its only caller

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 90 ++++++++++++++++++++++++++------------------------------------
 1 file changed, 38 insertions(+), 52 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index d1a5dfeaf999..1f561dc495a1 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2111,57 +2111,6 @@ static int open_will_truncate(int flag, struct inode *inode)
 	return (flag & O_TRUNC);
 }
 
-static struct file *finish_open(struct nameidata *nd,
-				int open_flag, int acc_mode)
-{
-	struct file *filp;
-	int will_truncate;
-	int error;
-
-	will_truncate = open_will_truncate(open_flag, nd->path.dentry->d_inode);
-	if (will_truncate) {
-		error = mnt_want_write(nd->path.mnt);
-		if (error)
-			goto exit;
-	}
-	error = may_open(&nd->path, acc_mode, open_flag);
-	if (error) {
-		if (will_truncate)
-			mnt_drop_write(nd->path.mnt);
-		goto exit;
-	}
-	filp = nameidata_to_filp(nd);
-	if (!IS_ERR(filp)) {
-		error = ima_file_check(filp, acc_mode);
-		if (error) {
-			fput(filp);
-			filp = ERR_PTR(error);
-		}
-	}
-	if (!IS_ERR(filp)) {
-		if (will_truncate) {
-			error = handle_truncate(filp);
-			if (error) {
-				fput(filp);
-				filp = ERR_PTR(error);
-			}
-		}
-	}
-	/*
-	 * It is now safe to drop the mnt write
-	 * because the filp has had a write taken
-	 * on its behalf.
-	 */
-	if (will_truncate)
-		mnt_drop_write(nd->path.mnt);
-	path_put(&nd->path);
-	return filp;
-
-exit:
-	path_put(&nd->path);
-	return ERR_PTR(error);
-}
-
 /*
  * Handle the last step of open()
  */
@@ -2169,6 +2118,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 			    const struct open_flags *op, const char *pathname)
 {
 	struct dentry *dir = nd->path.dentry;
+	int will_truncate;
 	struct file *filp;
 	struct inode *inode;
 	int error;
@@ -2329,7 +2279,43 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 	if (S_ISDIR(nd->inode->i_mode))
 		goto exit;
 ok:
-	filp = finish_open(nd, op->open_flag, op->acc_mode);
+	will_truncate = open_will_truncate(op->open_flag, nd->path.dentry->d_inode);
+	if (will_truncate) {
+		error = mnt_want_write(nd->path.mnt);
+		if (error)
+			goto exit;
+	}
+	error = may_open(&nd->path, op->acc_mode, op->open_flag);
+	if (error) {
+		if (will_truncate)
+			mnt_drop_write(nd->path.mnt);
+		goto exit;
+	}
+	filp = nameidata_to_filp(nd);
+	if (!IS_ERR(filp)) {
+		error = ima_file_check(filp, op->acc_mode);
+		if (error) {
+			fput(filp);
+			filp = ERR_PTR(error);
+		}
+	}
+	if (!IS_ERR(filp)) {
+		if (will_truncate) {
+			error = handle_truncate(filp);
+			if (error) {
+				fput(filp);
+				filp = ERR_PTR(error);
+			}
+		}
+	}
+	/*
+	 * It is now safe to drop the mnt write
+	 * because the filp has had a write taken
+	 * on its behalf.
+	 */
+	if (will_truncate)
+		mnt_drop_write(nd->path.mnt);
+	path_put(&nd->path);
 	return filp;
 
 exit_mutex_unlock:
-- 
cgit v1.2.3


From 9b44f1b3928b6f41532c9a1dc9a6fc665989ad5b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 9 Mar 2011 00:17:27 -0500
Subject: move may_open() from __open_name_create() to do_last()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 1f561dc495a1..def63e7c058d 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2069,11 +2069,7 @@ out_unlock:
 	mutex_unlock(&dir->d_inode->i_mutex);
 	dput(nd->path.dentry);
 	nd->path.dentry = path->dentry;
-
-	if (error)
-		return error;
-	/* Don't check for write permission, don't truncate */
-	return may_open(&nd->path, 0, open_flag & ~O_TRUNC);
+	return error;
 }
 
 /*
@@ -2239,6 +2235,12 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 			mnt_drop_write(nd->path.mnt);
 			goto exit;
 		}
+		/* Don't check for write permission, don't truncate */
+		error = may_open(&nd->path, 0, op->open_flag & ~O_TRUNC);
+		if (error) {
+			mnt_drop_write(nd->path.mnt);
+			goto exit;
+		}
 		filp = nameidata_to_filp(nd);
 		mnt_drop_write(nd->path.mnt);
 		path_put(&nd->path);
-- 
cgit v1.2.3


From ca344a894b41a133dab07dfbbdf652c053f6658c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 9 Mar 2011 00:36:45 -0500
Subject: do_last: unify may_open() call and everyting after it

We have a bunch of diverging codepaths in do_last(); some of
them converge, but the case of having to create a new file
duplicates large part of common tail of the rest and exits
separately.  Massage them so that they could be merged.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 59 ++++++++++++++++++++++-------------------------------------
 1 file changed, 22 insertions(+), 37 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index def63e7c058d..63844776484c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2114,7 +2114,10 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 			    const struct open_flags *op, const char *pathname)
 {
 	struct dentry *dir = nd->path.dentry;
+	int open_flag = op->open_flag;
 	int will_truncate;
+	int want_write = 0;
+	int skip_perm = 0;
 	struct file *filp;
 	struct inode *inode;
 	int error;
@@ -2138,7 +2141,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 		if (error)
 			goto exit;
 		audit_inode(pathname, nd->path.dentry);
-		if (op->open_flag & O_CREAT) {
+		if (open_flag & O_CREAT) {
 			error = -EISDIR;
 			goto exit;
 		}
@@ -2152,7 +2155,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 		goto ok;
 	}
 
-	if (!(op->open_flag & O_CREAT)) {
+	if (!(open_flag & O_CREAT)) {
 		if (nd->last.name[nd->last.len])
 			nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
 		/* we _can_ be in RCU mode here */
@@ -2230,28 +2233,15 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 		error = mnt_want_write(nd->path.mnt);
 		if (error)
 			goto exit_mutex_unlock;
-		error = __open_namei_create(nd, path, op->open_flag, op->mode);
-		if (error) {
-			mnt_drop_write(nd->path.mnt);
+		want_write = 1;
+		will_truncate = 0;
+		error = __open_namei_create(nd, path, open_flag, op->mode);
+		if (error)
 			goto exit;
-		}
 		/* Don't check for write permission, don't truncate */
-		error = may_open(&nd->path, 0, op->open_flag & ~O_TRUNC);
-		if (error) {
-			mnt_drop_write(nd->path.mnt);
-			goto exit;
-		}
-		filp = nameidata_to_filp(nd);
-		mnt_drop_write(nd->path.mnt);
-		path_put(&nd->path);
-		if (!IS_ERR(filp)) {
-			error = ima_file_check(filp, op->acc_mode);
-			if (error) {
-				fput(filp);
-				filp = ERR_PTR(error);
-			}
-		}
-		return filp;
+		open_flag &= ~O_TRUNC;
+		skip_perm = 1;
+		goto common;
 	}
 
 	/*
@@ -2261,7 +2251,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 	audit_inode(pathname, path->dentry);
 
 	error = -EEXIST;
-	if (op->open_flag & O_EXCL)
+	if (open_flag & O_EXCL)
 		goto exit_dput;
 
 	error = follow_managed(path, nd->flags);
@@ -2281,18 +2271,17 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 	if (S_ISDIR(nd->inode->i_mode))
 		goto exit;
 ok:
-	will_truncate = open_will_truncate(op->open_flag, nd->path.dentry->d_inode);
+	will_truncate = open_will_truncate(open_flag, nd->path.dentry->d_inode);
 	if (will_truncate) {
 		error = mnt_want_write(nd->path.mnt);
 		if (error)
 			goto exit;
+		want_write = 1;
 	}
-	error = may_open(&nd->path, op->acc_mode, op->open_flag);
-	if (error) {
-		if (will_truncate)
-			mnt_drop_write(nd->path.mnt);
+common:
+	error = may_open(&nd->path, skip_perm ? 0 : op->acc_mode, open_flag);
+	if (error)
 		goto exit;
-	}
 	filp = nameidata_to_filp(nd);
 	if (!IS_ERR(filp)) {
 		error = ima_file_check(filp, op->acc_mode);
@@ -2310,12 +2299,8 @@ ok:
 			}
 		}
 	}
-	/*
-	 * It is now safe to drop the mnt write
-	 * because the filp has had a write taken
-	 * on its behalf.
-	 */
-	if (will_truncate)
+out:
+	if (want_write)
 		mnt_drop_write(nd->path.mnt);
 	path_put(&nd->path);
 	return filp;
@@ -2325,8 +2310,8 @@ exit_mutex_unlock:
 exit_dput:
 	path_put_conditional(path, nd);
 exit:
-	path_put(&nd->path);
-	return ERR_PTR(error);
+	filp = ERR_PTR(error);
+	goto out;
 }
 
 static struct file *path_openat(int dfd, const char *pathname,
-- 
cgit v1.2.3


From 6c0d46c493217cf48999b3f8808910ae534aa085 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 9 Mar 2011 00:59:59 -0500
Subject: fold __open_namei_create() and open_will_truncate() into do_last()

... and clean up a bit more

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 74 ++++++++++++++++++++++----------------------------------------
 1 file changed, 26 insertions(+), 48 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 63844776484c..441f1106de08 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2048,30 +2048,6 @@ static int handle_truncate(struct file *filp)
 	return error;
 }
 
-/*
- * Be careful about ever adding any more callers of this
- * function.  Its flags must be in the namei format, not
- * what get passed to sys_open().
- */
-static int __open_namei_create(struct nameidata *nd, struct path *path,
-				int open_flag, int mode)
-{
-	int error;
-	struct dentry *dir = nd->path.dentry;
-
-	if (!IS_POSIXACL(dir->d_inode))
-		mode &= ~current_umask();
-	error = security_path_mknod(&nd->path, path->dentry, mode, 0);
-	if (error)
-		goto out_unlock;
-	error = vfs_create(dir->d_inode, path->dentry, mode, nd);
-out_unlock:
-	mutex_unlock(&dir->d_inode->i_mutex);
-	dput(nd->path.dentry);
-	nd->path.dentry = path->dentry;
-	return error;
-}
-
 /*
  * Note that while the flag value (low two bits) for sys_open means:
  *	00 - read-only
@@ -2096,17 +2072,6 @@ static inline int open_to_namei_flags(int flag)
 	return flag;
 }
 
-static int open_will_truncate(int flag, struct inode *inode)
-{
-	/*
-	 * We'll never write to the fs underlying
-	 * a device file.
-	 */
-	if (special_file(inode->i_mode))
-		return 0;
-	return (flag & O_TRUNC);
-}
-
 /*
  * Handle the last step of open()
  */
@@ -2114,8 +2079,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 			    const struct open_flags *op, const char *pathname)
 {
 	struct dentry *dir = nd->path.dentry;
+	struct dentry *dentry;
 	int open_flag = op->open_flag;
-	int will_truncate;
+	int will_truncate = open_flag & O_TRUNC;
 	int want_write = 0;
 	int skip_perm = 0;
 	struct file *filp;
@@ -2207,25 +2173,29 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 
 	mutex_lock(&dir->d_inode->i_mutex);
 
-	path->dentry = lookup_hash(nd);
-	path->mnt = nd->path.mnt;
-
-	error = PTR_ERR(path->dentry);
-	if (IS_ERR(path->dentry)) {
+	dentry = lookup_hash(nd);
+	error = PTR_ERR(dentry);
+	if (IS_ERR(dentry)) {
 		mutex_unlock(&dir->d_inode->i_mutex);
 		goto exit;
 	}
 
+	path->dentry = dentry;
+	path->mnt = nd->path.mnt;
+
 	if (IS_ERR(nd->intent.open.file)) {
 		error = PTR_ERR(nd->intent.open.file);
 		goto exit_mutex_unlock;
 	}
 
 	/* Negative dentry, just create the file */
-	if (!path->dentry->d_inode) {
+	if (!dentry->d_inode) {
+		int mode = op->mode;
+		if (!IS_POSIXACL(dir->d_inode))
+			mode &= ~current_umask();
 		/*
 		 * This write is needed to ensure that a
-		 * ro->rw transition does not occur between
+		 * rw->ro transition does not occur between
 		 * the time when the file is created and when
 		 * a permanent write count is taken through
 		 * the 'struct file' in nameidata_to_filp().
@@ -2234,13 +2204,19 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 		if (error)
 			goto exit_mutex_unlock;
 		want_write = 1;
-		will_truncate = 0;
-		error = __open_namei_create(nd, path, open_flag, op->mode);
-		if (error)
-			goto exit;
 		/* Don't check for write permission, don't truncate */
 		open_flag &= ~O_TRUNC;
+		will_truncate = 0;
 		skip_perm = 1;
+		error = security_path_mknod(&nd->path, dentry, mode, 0);
+		if (error)
+			goto exit_mutex_unlock;
+		error = vfs_create(dir->d_inode, dentry, mode, nd);
+		if (error)
+			goto exit_mutex_unlock;
+		mutex_unlock(&dir->d_inode->i_mutex);
+		dput(nd->path.dentry);
+		nd->path.dentry = dentry;
 		goto common;
 	}
 
@@ -2271,7 +2247,9 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 	if (S_ISDIR(nd->inode->i_mode))
 		goto exit;
 ok:
-	will_truncate = open_will_truncate(open_flag, nd->path.dentry->d_inode);
+	if (!S_ISREG(nd->inode->i_mode))
+		will_truncate = 0;
+
 	if (will_truncate) {
 		error = mnt_want_write(nd->path.mnt);
 		if (error)
-- 
cgit v1.2.3


From f374ed5fa8afed8590deaae5dc147422e0e1a6d9 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 9 Mar 2011 01:34:45 -0500
Subject: do_last: kill a rudiment of old ->d_revalidate() workaround

There used to be time when ->d_revalidate() couldn't return an error.
So intents code had lookup_instantiate_filp() stash ERR_PTR(error)
in nd->intent.open.filp and had it checked after lookup_hash(), to
catch the otherwise silent failures.  That had been introduced by
commit 4af4c52f34606bdaab6930a845550c6fb02078a4.  These days
->d_revalidate() can and does propagate errors back to callers
explicitly, so this check isn't needed anymore.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 441f1106de08..6972e761286b 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2183,11 +2183,6 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 	path->dentry = dentry;
 	path->mnt = nd->path.mnt;
 
-	if (IS_ERR(nd->intent.open.file)) {
-		error = PTR_ERR(nd->intent.open.file);
-		goto exit_mutex_unlock;
-	}
-
 	/* Negative dentry, just create the file */
 	if (!dentry->d_inode) {
 		int mode = op->mode;
-- 
cgit v1.2.3


From 40b39136f07279fdc868a36cba050f4e84ce0ace Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 9 Mar 2011 16:22:18 -0500
Subject: path_openat: clean ELOOP handling a bit

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 6972e761286b..ca9a06a65704 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2320,11 +2320,12 @@ static struct file *path_openat(int dfd, const char *pathname,
 		struct path link = path;
 		struct inode *linki = link.dentry->d_inode;
 		void *cookie;
-		error = -ELOOP;
-		if (!(nd.flags & LOOKUP_FOLLOW))
-			goto exit_dput;
-		if (count++ == 32)
-			goto exit_dput;
+		if (!(nd.flags & LOOKUP_FOLLOW) || count++ == 32) {
+			path_put_conditional(&path, &nd);
+			path_put(&nd.path);
+			filp = ERR_PTR(-ELOOP);
+			break;
+		}
 		/*
 		 * This is subtle. Instead of calling do_follow_link() we do
 		 * the thing by hands. The reason is that this way we have zero
@@ -2355,9 +2356,6 @@ out:
 	release_open_intent(&nd);
 	return filp;
 
-exit_dput:
-	path_put_conditional(&path, &nd);
-	path_put(&nd.path);
 out_filp:
 	filp = ERR_PTR(error);
 	goto out;
-- 
cgit v1.2.3


From 5a18fff2090c3af830d699c8ccb230498a1e37e5 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 11 Mar 2011 04:44:53 -0500
Subject: untangle do_lookup()

That thing has devolved into rats nest of gotos; sane use of unlikely()
gets rid of that horror and gives much more readable structure:
	* make a fast attempt to find a dentry; false negatives are OK.
In RCU mode if everything went fine, we are done, otherwise just drop
out of RCU.  If we'd done (RCU) ->d_revalidate() and it had not refused
outright (i.e. didn't give us -ECHILD), remember its result.
	* now we are not in RCU mode and hopefully have a dentry.  If we
do not, lock parent, do full d_lookup() and if that has not found anything,
allocate and call ->lookup().  If we'd done that ->lookup(), remember that
dentry is good and we don't need to revalidate it.
	* now we have a dentry.  If it has ->d_revalidate() and we can't
skip it, call it.
	* hopefully dentry is good; if not, either fail (in case of error)
or try to invalidate it.  If d_invalidate() has succeeded, drop it and
retry everything as if original attempt had not found a dentry.
	* now we can finish it up - deal with mountpoint crossing and
automount.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 141 ++++++++++++++++++++++++-------------------------------------
 1 file changed, 56 insertions(+), 85 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index ca9a06a65704..0bebd13e5cb7 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -589,29 +589,6 @@ do_revalidate(struct dentry *dentry, struct nameidata *nd)
 	return dentry;
 }
 
-static inline struct dentry *
-do_revalidate_rcu(struct dentry *dentry, struct nameidata *nd)
-{
-	int status = d_revalidate(dentry, nd);
-	if (likely(status > 0))
-		return dentry;
-	if (status == -ECHILD) {
-		if (nameidata_dentry_drop_rcu(nd, dentry))
-			return ERR_PTR(-ECHILD);
-		return do_revalidate(dentry, nd);
-	}
-	if (status < 0)
-		return ERR_PTR(status);
-	/* Don't d_invalidate in rcu-walk mode */
-	if (nameidata_dentry_drop_rcu(nd, dentry))
-		return ERR_PTR(-ECHILD);
-	if (!d_invalidate(dentry)) {
-		dput(dentry);
-		dentry = NULL;
-	}
-	return dentry;
-}
-
 /*
  * handle_reval_path - force revalidation of a dentry
  *
@@ -1213,7 +1190,8 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
 {
 	struct vfsmount *mnt = nd->path.mnt;
 	struct dentry *dentry, *parent = nd->path.dentry;
-	struct inode *dir;
+	int need_reval = 1;
+	int status = 1;
 	int err;
 
 	/*
@@ -1223,48 +1201,74 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
 	 */
 	if (nd->flags & LOOKUP_RCU) {
 		unsigned seq;
-
 		*inode = nd->inode;
 		dentry = __d_lookup_rcu(parent, name, &seq, inode);
-		if (!dentry) {
-			if (nameidata_drop_rcu(nd))
-				return -ECHILD;
-			goto need_lookup;
-		}
+		if (!dentry)
+			goto unlazy;
+
 		/* Memory barrier in read_seqcount_begin of child is enough */
 		if (__read_seqcount_retry(&parent->d_seq, nd->seq))
 			return -ECHILD;
-
 		nd->seq = seq;
+
 		if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
-			dentry = do_revalidate_rcu(dentry, nd);
-			if (!dentry)
-				goto need_lookup;
-			if (IS_ERR(dentry))
-				goto fail;
-			if (!(nd->flags & LOOKUP_RCU))
-				goto done;
+			status = d_revalidate(dentry, nd);
+			if (unlikely(status <= 0)) {
+				if (status != -ECHILD)
+					need_reval = 0;
+				goto unlazy;
+			}
 		}
 		path->mnt = mnt;
 		path->dentry = dentry;
 		if (likely(__follow_mount_rcu(nd, path, inode, false)))
 			return 0;
-		if (nameidata_drop_rcu(nd))
-			return -ECHILD;
-		/* fallthru */
+unlazy:
+		if (dentry) {
+			if (nameidata_dentry_drop_rcu(nd, dentry))
+				return -ECHILD;
+		} else {
+			if (nameidata_drop_rcu(nd))
+				return -ECHILD;
+		}
+	} else {
+		dentry = __d_lookup(parent, name);
 	}
-	dentry = __d_lookup(parent, name);
-	if (!dentry)
-		goto need_lookup;
-found:
-	if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
-		dentry = do_revalidate(dentry, nd);
-		if (!dentry)
-			goto need_lookup;
-		if (IS_ERR(dentry))
-			goto fail;
+
+retry:
+	if (unlikely(!dentry)) {
+		struct inode *dir = parent->d_inode;
+		BUG_ON(nd->inode != dir);
+
+		mutex_lock(&dir->i_mutex);
+		dentry = d_lookup(parent, name);
+		if (likely(!dentry)) {
+			dentry = d_alloc_and_lookup(parent, name, nd);
+			if (IS_ERR(dentry)) {
+				mutex_unlock(&dir->i_mutex);
+				return PTR_ERR(dentry);
+			}
+			/* known good */
+			need_reval = 0;
+			status = 1;
+		}
+		mutex_unlock(&dir->i_mutex);
+	}
+	if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE) && need_reval)
+		status = d_revalidate(dentry, nd);
+	if (unlikely(status <= 0)) {
+		if (status < 0) {
+			dput(dentry);
+			return status;
+		}
+		if (!d_invalidate(dentry)) {
+			dput(dentry);
+			dentry = NULL;
+			need_reval = 1;
+			goto retry;
+		}
 	}
-done:
+
 	path->mnt = mnt;
 	path->dentry = dentry;
 	err = follow_managed(path, nd->flags);
@@ -1274,39 +1278,6 @@ done:
 	}
 	*inode = path->dentry->d_inode;
 	return 0;
-
-need_lookup:
-	dir = parent->d_inode;
-	BUG_ON(nd->inode != dir);
-
-	mutex_lock(&dir->i_mutex);
-	/*
-	 * First re-do the cached lookup just in case it was created
-	 * while we waited for the directory semaphore, or the first
-	 * lookup failed due to an unrelated rename.
-	 *
-	 * This could use version numbering or similar to avoid unnecessary
-	 * cache lookups, but then we'd have to do the first lookup in the
-	 * non-racy way. However in the common case here, everything should
-	 * be hot in cache, so would it be a big win?
-	 */
-	dentry = d_lookup(parent, name);
-	if (likely(!dentry)) {
-		dentry = d_alloc_and_lookup(parent, name, nd);
-		mutex_unlock(&dir->i_mutex);
-		if (IS_ERR(dentry))
-			goto fail;
-		goto done;
-	}
-	/*
-	 * Uhhuh! Nasty case: the cache was re-populated while
-	 * we waited on the semaphore. Need to revalidate.
-	 */
-	mutex_unlock(&dir->i_mutex);
-	goto found;
-
-fail:
-	return PTR_ERR(dentry);
 }
 
 static inline int may_lookup(struct nameidata *nd)
-- 
cgit v1.2.3


From 5b6ca027d85b7438c84b78a54ccdc2e53f2909cd Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 9 Mar 2011 23:04:47 -0500
Subject: reduce vfs_path_lookup() to do_path_lookup()

New lookup flag: LOOKUP_ROOT.  nd->root is set (and held) by caller,
path_init() starts walking from that place and all pathname resolution
machinery never drops nd->root if that flag is set.  That turns
vfs_path_lookup() into a special case of do_path_lookup() *and*
gets us down to 3 callers of link_path_walk(), making it finally
feasible to rip the handling of trailing symlink out of link_path_walk().
That will not only simply the living hell out of it, but make life
much simpler for unionfs merge.  Trailing symlink handling will
become iterative, which is a good thing for stack footprint in
a lot of situations as well.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c            | 95 +++++++++++++++++++++++----------------------------
 include/linux/namei.h |  1 +
 2 files changed, 44 insertions(+), 52 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 0bebd13e5cb7..8ee7785d5642 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -401,9 +401,11 @@ static int nameidata_drop_rcu(struct nameidata *nd)
 {
 	struct fs_struct *fs = current->fs;
 	struct dentry *dentry = nd->path.dentry;
+	int want_root = 0;
 
 	BUG_ON(!(nd->flags & LOOKUP_RCU));
-	if (nd->root.mnt) {
+	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
+		want_root = 1;
 		spin_lock(&fs->lock);
 		if (nd->root.mnt != fs->root.mnt ||
 				nd->root.dentry != fs->root.dentry)
@@ -414,7 +416,7 @@ static int nameidata_drop_rcu(struct nameidata *nd)
 		goto err;
 	BUG_ON(nd->inode != dentry->d_inode);
 	spin_unlock(&dentry->d_lock);
-	if (nd->root.mnt) {
+	if (want_root) {
 		path_get(&nd->root);
 		spin_unlock(&fs->lock);
 	}
@@ -427,7 +429,7 @@ static int nameidata_drop_rcu(struct nameidata *nd)
 err:
 	spin_unlock(&dentry->d_lock);
 err_root:
-	if (nd->root.mnt)
+	if (want_root)
 		spin_unlock(&fs->lock);
 	return -ECHILD;
 }
@@ -454,9 +456,11 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry
 {
 	struct fs_struct *fs = current->fs;
 	struct dentry *parent = nd->path.dentry;
+	int want_root = 0;
 
 	BUG_ON(!(nd->flags & LOOKUP_RCU));
-	if (nd->root.mnt) {
+	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
+		want_root = 1;
 		spin_lock(&fs->lock);
 		if (nd->root.mnt != fs->root.mnt ||
 				nd->root.dentry != fs->root.dentry)
@@ -476,7 +480,7 @@ static int nameidata_dentry_drop_rcu(struct nameidata *nd, struct dentry *dentry
 	parent->d_count++;
 	spin_unlock(&dentry->d_lock);
 	spin_unlock(&parent->d_lock);
-	if (nd->root.mnt) {
+	if (want_root) {
 		path_get(&nd->root);
 		spin_unlock(&fs->lock);
 	}
@@ -490,7 +494,7 @@ err:
 	spin_unlock(&dentry->d_lock);
 	spin_unlock(&parent->d_lock);
 err_root:
-	if (nd->root.mnt)
+	if (want_root)
 		spin_unlock(&fs->lock);
 	return -ECHILD;
 }
@@ -501,7 +505,8 @@ static inline int nameidata_dentry_drop_rcu_maybe(struct nameidata *nd, struct d
 	if (nd->flags & LOOKUP_RCU) {
 		if (unlikely(nameidata_dentry_drop_rcu(nd, dentry))) {
 			nd->flags &= ~LOOKUP_RCU;
-			nd->root.mnt = NULL;
+			if (!(nd->flags & LOOKUP_ROOT))
+				nd->root.mnt = NULL;
 			rcu_read_unlock();
 			br_read_unlock(vfsmount_lock);
 			return -ECHILD;
@@ -525,7 +530,8 @@ static int nameidata_drop_rcu_last(struct nameidata *nd)
 
 	BUG_ON(!(nd->flags & LOOKUP_RCU));
 	nd->flags &= ~LOOKUP_RCU;
-	nd->root.mnt = NULL;
+	if (!(nd->flags & LOOKUP_ROOT))
+		nd->root.mnt = NULL;
 	spin_lock(&dentry->d_lock);
 	if (!__d_rcu_to_refcount(dentry, nd->seq))
 		goto err_unlock;
@@ -1053,7 +1059,8 @@ static int follow_dotdot_rcu(struct nameidata *nd)
 
 failed:
 	nd->flags &= ~LOOKUP_RCU;
-	nd->root.mnt = NULL;
+	if (!(nd->flags & LOOKUP_ROOT))
+		nd->root.mnt = NULL;
 	rcu_read_unlock();
 	br_read_unlock(vfsmount_lock);
 	return -ECHILD;
@@ -1310,7 +1317,8 @@ static void terminate_walk(struct nameidata *nd)
 		path_put(&nd->path);
 	} else {
 		nd->flags &= ~LOOKUP_RCU;
-		nd->root.mnt = NULL;
+		if (!(nd->flags & LOOKUP_ROOT))
+			nd->root.mnt = NULL;
 		rcu_read_unlock();
 		br_read_unlock(vfsmount_lock);
 	}
@@ -1477,6 +1485,25 @@ static int path_init(int dfd, const char *name, unsigned int flags,
 	nd->last_type = LAST_ROOT; /* if there are only slashes... */
 	nd->flags = flags | LOOKUP_JUMPED;
 	nd->depth = 0;
+	if (flags & LOOKUP_ROOT) {
+		struct inode *inode = nd->root.dentry->d_inode;
+		if (!inode->i_op->lookup)
+			return -ENOTDIR;
+		retval = inode_permission(inode, MAY_EXEC);
+		if (retval)
+			return retval;
+		nd->path = nd->root;
+		nd->inode = inode;
+		if (flags & LOOKUP_RCU) {
+			br_read_lock(vfsmount_lock);
+			rcu_read_lock();
+			nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
+		} else {
+			path_get(&nd->path);
+		}
+		return 0;
+	}
+
 	nd->root.mnt = NULL;
 
 	if (*name=='/') {
@@ -1587,7 +1614,7 @@ static int path_lookupat(int dfd, const char *name,
 	if (base)
 		fput(base);
 
-	if (nd->root.mnt) {
+	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) {
 		path_put(&nd->root);
 		nd->root.mnt = NULL;
 	}
@@ -1638,46 +1665,10 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
 		    const char *name, unsigned int flags,
 		    struct nameidata *nd)
 {
-	int result;
-
-	/* same as do_path_lookup */
-	nd->last_type = LAST_ROOT;
-	nd->flags = flags | LOOKUP_JUMPED;
-	nd->depth = 0;
-
-	nd->path.dentry = dentry;
-	nd->path.mnt = mnt;
-	path_get(&nd->path);
-	nd->root = nd->path;
-	path_get(&nd->root);
-	nd->inode = nd->path.dentry->d_inode;
-
-	current->total_link_count = 0;
-
-	result = link_path_walk(name, nd);
-	if (!result)
-		result = handle_reval_path(nd);
-	if (result == -ESTALE) {
-		/* nd->path had been dropped */
-		current->total_link_count = 0;
-		nd->path.dentry = dentry;
-		nd->path.mnt = mnt;
-		nd->inode = dentry->d_inode;
-		path_get(&nd->path);
-		nd->flags = flags | LOOKUP_JUMPED | LOOKUP_REVAL;
-
-		result = link_path_walk(name, nd);
-		if (!result)
-			result = handle_reval_path(nd);
-	}
-	if (unlikely(!result && !audit_dummy_context() && nd->path.dentry &&
-				nd->inode))
-		audit_inode(name, nd->path.dentry);
-
-	path_put(&nd->root);
-	nd->root.mnt = NULL;
-
-	return result;
+	nd->root.dentry = dentry;
+	nd->root.mnt = mnt;
+	/* the first argument of do_path_lookup() is ignored with LOOKUP_ROOT */
+	return do_path_lookup(AT_FDCWD, name, flags | LOOKUP_ROOT, nd);
 }
 
 static struct dentry *__lookup_hash(struct qstr *name,
@@ -2320,7 +2311,7 @@ static struct file *path_openat(int dfd, const char *pathname,
 		path_put(&link);
 	}
 out:
-	if (nd.root.mnt)
+	if (nd.root.mnt && !(nd.flags & LOOKUP_ROOT))
 		path_put(&nd.root);
 	if (base)
 		fput(base);
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 72ffd62ac736..83cd6e5cd7dc 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -63,6 +63,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND};
 #define LOOKUP_RENAME_TARGET	0x0800
 
 #define LOOKUP_JUMPED		0x1000
+#define LOOKUP_ROOT		0x2000
 
 extern int user_path_at(int, const char __user *, unsigned, struct path *);
 
-- 
cgit v1.2.3


From 73d049a40fc6269189c4e2ba6792cb5dd054883c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 11 Mar 2011 12:08:24 -0500
Subject: open-style analog of vfs_path_lookup()

new function: file_open_root(dentry, mnt, name, flags) opens the file
vfs_path_lookup would arrive to.

Note that name can be empty; in that case the usual requirement that
dentry should be a directory is lifted.

open-coded equivalents switched to it, may_open() got down exactly
one caller and became static.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/um/drivers/mconsole_kern.c | 21 ++---------
 fs/internal.h                   |  2 ++
 fs/namei.c                      | 80 ++++++++++++++++++++++++++---------------
 fs/nfsctl.c                     | 21 +++--------
 fs/open.c                       | 14 ++++++++
 include/linux/fs.h              |  4 +--
 kernel/sysctl_binary.c          | 19 +---------
 7 files changed, 77 insertions(+), 84 deletions(-)

(limited to 'fs')

diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c
index 975613b23dcf..c70e047eed72 100644
--- a/arch/um/drivers/mconsole_kern.c
+++ b/arch/um/drivers/mconsole_kern.c
@@ -124,35 +124,18 @@ void mconsole_log(struct mc_request *req)
 #if 0
 void mconsole_proc(struct mc_request *req)
 {
-	struct nameidata nd;
 	struct vfsmount *mnt = current->nsproxy->pid_ns->proc_mnt;
 	struct file *file;
-	int n, err;
+	int n;
 	char *ptr = req->request.data, *buf;
 	mm_segment_t old_fs = get_fs();
 
 	ptr += strlen("proc");
 	ptr = skip_spaces(ptr);
 
-	err = vfs_path_lookup(mnt->mnt_root, mnt, ptr, LOOKUP_FOLLOW, &nd);
-	if (err) {
-		mconsole_reply(req, "Failed to look up file", 1, 0);
-		goto out;
-	}
-
-	err = may_open(&nd.path, MAY_READ, O_RDONLY);
-	if (result) {
-		mconsole_reply(req, "Failed to open file", 1, 0);
-		path_put(&nd.path);
-		goto out;
-	}
-
-	file = dentry_open(nd.path.dentry, nd.path.mnt, O_RDONLY,
-			   current_cred());
-	err = PTR_ERR(file);
+	file = file_open_root(mnt->mnt_root, mnt, ptr, O_RDONLY);
 	if (IS_ERR(file)) {
 		mconsole_reply(req, "Failed to open file", 1, 0);
-		path_put(&nd.path);
 		goto out;
 	}
 
diff --git a/fs/internal.h b/fs/internal.h
index 6fdbdf2c6047..52abc5287f50 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -114,6 +114,8 @@ struct open_flags {
 };
 extern struct file *do_filp_open(int dfd, const char *pathname,
 		const struct open_flags *op, int lookup_flags);
+extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
+		const char *, const struct open_flags *, int lookup_flags);
 
 /*
  * inode.c
diff --git a/fs/namei.c b/fs/namei.c
index 8ee7785d5642..abc8d2df121c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1487,11 +1487,13 @@ static int path_init(int dfd, const char *name, unsigned int flags,
 	nd->depth = 0;
 	if (flags & LOOKUP_ROOT) {
 		struct inode *inode = nd->root.dentry->d_inode;
-		if (!inode->i_op->lookup)
-			return -ENOTDIR;
-		retval = inode_permission(inode, MAY_EXEC);
-		if (retval)
-			return retval;
+		if (*name) {
+			if (!inode->i_op->lookup)
+				return -ENOTDIR;
+			retval = inode_permission(inode, MAY_EXEC);
+			if (retval)
+				return retval;
+		}
 		nd->path = nd->root;
 		nd->inode = inode;
 		if (flags & LOOKUP_RCU) {
@@ -1937,7 +1939,7 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
 	return error;
 }
 
-int may_open(struct path *path, int acc_mode, int flag)
+static int may_open(struct path *path, int acc_mode, int flag)
 {
 	struct dentry *dentry = path->dentry;
 	struct inode *inode = dentry->d_inode;
@@ -2250,11 +2252,10 @@ exit:
 }
 
 static struct file *path_openat(int dfd, const char *pathname,
-		const struct open_flags *op, int flags)
+		struct nameidata *nd, const struct open_flags *op, int flags)
 {
 	struct file *base = NULL;
 	struct file *filp;
-	struct nameidata nd;
 	struct path path;
 	int count = 0;
 	int error;
@@ -2264,27 +2265,27 @@ static struct file *path_openat(int dfd, const char *pathname,
 		return ERR_PTR(-ENFILE);
 
 	filp->f_flags = op->open_flag;
-	nd.intent.open.file = filp;
-	nd.intent.open.flags = open_to_namei_flags(op->open_flag);
-	nd.intent.open.create_mode = op->mode;
+	nd->intent.open.file = filp;
+	nd->intent.open.flags = open_to_namei_flags(op->open_flag);
+	nd->intent.open.create_mode = op->mode;
 
-	error = path_init(dfd, pathname, flags | LOOKUP_PARENT, &nd, &base);
+	error = path_init(dfd, pathname, flags | LOOKUP_PARENT, nd, &base);
 	if (unlikely(error))
 		goto out_filp;
 
 	current->total_link_count = 0;
-	error = link_path_walk(pathname, &nd);
+	error = link_path_walk(pathname, nd);
 	if (unlikely(error))
 		goto out_filp;
 
-	filp = do_last(&nd, &path, op, pathname);
+	filp = do_last(nd, &path, op, pathname);
 	while (unlikely(!filp)) { /* trailing symlink */
 		struct path link = path;
 		struct inode *linki = link.dentry->d_inode;
 		void *cookie;
-		if (!(nd.flags & LOOKUP_FOLLOW) || count++ == 32) {
-			path_put_conditional(&path, &nd);
-			path_put(&nd.path);
+		if (!(nd->flags & LOOKUP_FOLLOW) || count++ == 32) {
+			path_put_conditional(&path, nd);
+			path_put(&nd->path);
 			filp = ERR_PTR(-ELOOP);
 			break;
 		}
@@ -2299,23 +2300,23 @@ static struct file *path_openat(int dfd, const char *pathname,
 		 * have to putname() it when we are done. Procfs-like symlinks
 		 * just set LAST_BIND.
 		 */
-		nd.flags |= LOOKUP_PARENT;
-		nd.flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
-		error = __do_follow_link(&link, &nd, &cookie);
+		nd->flags |= LOOKUP_PARENT;
+		nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
+		error = __do_follow_link(&link, nd, &cookie);
 		if (unlikely(error))
 			filp = ERR_PTR(error);
 		else
-			filp = do_last(&nd, &path, op, pathname);
+			filp = do_last(nd, &path, op, pathname);
 		if (!IS_ERR(cookie) && linki->i_op->put_link)
-			linki->i_op->put_link(link.dentry, &nd, cookie);
+			linki->i_op->put_link(link.dentry, nd, cookie);
 		path_put(&link);
 	}
 out:
-	if (nd.root.mnt && !(nd.flags & LOOKUP_ROOT))
-		path_put(&nd.root);
+	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT))
+		path_put(&nd->root);
 	if (base)
 		fput(base);
-	release_open_intent(&nd);
+	release_open_intent(nd);
 	return filp;
 
 out_filp:
@@ -2326,16 +2327,39 @@ out_filp:
 struct file *do_filp_open(int dfd, const char *pathname,
 		const struct open_flags *op, int flags)
 {
+	struct nameidata nd;
 	struct file *filp;
 
-	filp = path_openat(dfd, pathname, op, flags | LOOKUP_RCU);
+	filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU);
 	if (unlikely(filp == ERR_PTR(-ECHILD)))
-		filp = path_openat(dfd, pathname, op, flags);
+		filp = path_openat(dfd, pathname, &nd, op, flags);
 	if (unlikely(filp == ERR_PTR(-ESTALE)))
-		filp = path_openat(dfd, pathname, op, flags | LOOKUP_REVAL);
+		filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_REVAL);
 	return filp;
 }
 
+struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
+		const char *name, const struct open_flags *op, int flags)
+{
+	struct nameidata nd;
+	struct file *file;
+
+	nd.root.mnt = mnt;
+	nd.root.dentry = dentry;
+
+	flags |= LOOKUP_ROOT;
+
+	if (dentry->d_inode->i_op->follow_link)
+		return ERR_PTR(-ELOOP);
+
+	file = path_openat(-1, name, &nd, op, flags | LOOKUP_RCU);
+	if (unlikely(file == ERR_PTR(-ECHILD)))
+		file = path_openat(-1, name, &nd, op, flags);
+	if (unlikely(file == ERR_PTR(-ESTALE)))
+		file = path_openat(-1, name, &nd, op, flags | LOOKUP_REVAL);
+	return file;
+}
+
 /**
  * lookup_create - lookup a dentry, creating it if it doesn't exist
  * @nd: nameidata info
diff --git a/fs/nfsctl.c b/fs/nfsctl.c
index bf9cbd242ddd..124e8fcb0dd6 100644
--- a/fs/nfsctl.c
+++ b/fs/nfsctl.c
@@ -22,30 +22,17 @@
 
 static struct file *do_open(char *name, int flags)
 {
-	struct nameidata nd;
 	struct vfsmount *mnt;
-	int error;
+	struct file *file;
 
 	mnt = do_kern_mount("nfsd", 0, "nfsd", NULL);
 	if (IS_ERR(mnt))
 		return (struct file *)mnt;
 
-	error = vfs_path_lookup(mnt->mnt_root, mnt, name, 0, &nd);
-	mntput(mnt);	/* drop do_kern_mount reference */
-	if (error)
-		return ERR_PTR(error);
-
-	if (flags == O_RDWR)
-		error = may_open(&nd.path, MAY_READ|MAY_WRITE, flags);
-	else
-		error = may_open(&nd.path, MAY_WRITE, flags);
+	file = file_open_root(mnt->mnt_root, mnt, name, flags);
 
-	if (!error)
-		return dentry_open(nd.path.dentry, nd.path.mnt, flags,
-				   current_cred());
-
-	path_put(&nd.path);
-	return ERR_PTR(error);
+	mntput(mnt);	/* drop do_kern_mount reference */
+	return file;
 }
 
 static struct {
diff --git a/fs/open.c b/fs/open.c
index d05e18c60bae..48afc5c139d2 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -959,6 +959,20 @@ struct file *filp_open(const char *filename, int flags, int mode)
 }
 EXPORT_SYMBOL(filp_open);
 
+struct file *file_open_root(struct dentry *dentry, struct vfsmount *mnt,
+			    const char *filename, int flags)
+{
+	struct open_flags op;
+	int lookup = build_open_flags(flags, 0, &op);
+	if (flags & O_CREAT)
+		return ERR_PTR(-EINVAL);
+	if (!filename && (flags & O_DIRECTORY))
+		if (!dentry->d_inode->i_op->lookup)
+			return ERR_PTR(-ENOTDIR);
+	return do_file_open_root(dentry, mnt, filename, &op, lookup);
+}
+EXPORT_SYMBOL(file_open_root);
+
 long do_sys_open(int dfd, const char __user *filename, int flags, int mode)
 {
 	struct open_flags op;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9c75714f92c1..bf5c3c896072 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1990,6 +1990,8 @@ extern int do_fallocate(struct file *file, int mode, loff_t offset,
 extern long do_sys_open(int dfd, const char __user *filename, int flags,
 			int mode);
 extern struct file *filp_open(const char *, int, int);
+extern struct file *file_open_root(struct dentry *, struct vfsmount *,
+				   const char *, int);
 extern struct file * dentry_open(struct dentry *, struct vfsmount *, int,
 				 const struct cred *);
 extern int filp_close(struct file *, fl_owner_t id);
@@ -2205,8 +2207,6 @@ extern struct file *create_read_pipe(struct file *f, int flags);
 extern struct file *create_write_pipe(int flags);
 extern void free_write_pipe(struct file *);
 
-extern int may_open(struct path *, int, int);
-
 extern int kernel_read(struct file *, loff_t, char *, unsigned long);
 extern struct file * open_exec(const char *);
  
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index b875bedf7c9a..3b8e028b9601 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1321,13 +1321,11 @@ static ssize_t binary_sysctl(const int *name, int nlen,
 	void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
 {
 	const struct bin_table *table = NULL;
-	struct nameidata nd;
 	struct vfsmount *mnt;
 	struct file *file;
 	ssize_t result;
 	char *pathname;
 	int flags;
-	int acc_mode;
 
 	pathname = sysctl_getname(name, nlen, &table);
 	result = PTR_ERR(pathname);
@@ -1337,28 +1335,17 @@ static ssize_t binary_sysctl(const int *name, int nlen,
 	/* How should the sysctl be accessed? */
 	if (oldval && oldlen && newval && newlen) {
 		flags = O_RDWR;
-		acc_mode = MAY_READ | MAY_WRITE;
 	} else if (newval && newlen) {
 		flags = O_WRONLY;
-		acc_mode = MAY_WRITE;
 	} else if (oldval && oldlen) {
 		flags = O_RDONLY;
-		acc_mode = MAY_READ;
 	} else {
 		result = 0;
 		goto out_putname;
 	}
 
 	mnt = current->nsproxy->pid_ns->proc_mnt;
-	result = vfs_path_lookup(mnt->mnt_root, mnt, pathname, 0, &nd);
-	if (result)
-		goto out_putname;
-
-	result = may_open(&nd.path, acc_mode, flags);
-	if (result)
-		goto out_putpath;
-
-	file = dentry_open(nd.path.dentry, nd.path.mnt, flags, current_cred());
+	file = file_open_root(mnt->mnt_root, mnt, pathname, flags);
 	result = PTR_ERR(file);
 	if (IS_ERR(file))
 		goto out_putname;
@@ -1370,10 +1357,6 @@ out_putname:
 	putname(pathname);
 out:
 	return result;
-
-out_putpath:
-	path_put(&nd.path);
-	goto out_putname;
 }
 
 
-- 
cgit v1.2.3


From c8b91accfa1059d5565443193d89572eca2f5dd6 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 12 Mar 2011 10:41:39 -0500
Subject: clean statfs-like syscalls up

New helpers: user_statfs() and fd_statfs(), taking userland pathname and
descriptor resp. and filling struct kstatfs.  Syscalls of statfs family
(native, compat and foreign - osf and hpux on alpha and parisc resp.)
switched to those.  Removes some boilerplate code, simplifies cleanup
on errors...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/alpha/kernel/osf_sys.c |  36 ++-------
 arch/parisc/hpux/sys_hpux.c |  65 ++++++----------
 fs/compat.c                 |  48 +++---------
 fs/statfs.c                 | 176 ++++++++++++++++++++------------------------
 include/linux/fs.h          |   2 +
 5 files changed, 123 insertions(+), 204 deletions(-)

(limited to 'fs')

diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index fe698b5045e9..376f22130791 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -230,44 +230,24 @@ linux_to_osf_statfs(struct kstatfs *linux_stat, struct osf_statfs __user *osf_st
 	return copy_to_user(osf_stat, &tmp_stat, bufsiz) ? -EFAULT : 0;
 }
 
-static int
-do_osf_statfs(struct path *path, struct osf_statfs __user *buffer,
-	      unsigned long bufsiz)
+SYSCALL_DEFINE3(osf_statfs, const char __user *, pathname,
+		struct osf_statfs __user *, buffer, unsigned long, bufsiz)
 {
 	struct kstatfs linux_stat;
-	int error = vfs_statfs(path, &linux_stat);
+	int error = user_statfs(pathname, &linux_stat);
 	if (!error)
 		error = linux_to_osf_statfs(&linux_stat, buffer, bufsiz);
 	return error;	
 }
 
-SYSCALL_DEFINE3(osf_statfs, const char __user *, pathname,
-		struct osf_statfs __user *, buffer, unsigned long, bufsiz)
-{
-	struct path path;
-	int retval;
-
-	retval = user_path(pathname, &path);
-	if (!retval) {
-		retval = do_osf_statfs(&path, buffer, bufsiz);
-		path_put(&path);
-	}
-	return retval;
-}
-
 SYSCALL_DEFINE3(osf_fstatfs, unsigned long, fd,
 		struct osf_statfs __user *, buffer, unsigned long, bufsiz)
 {
-	struct file *file;
-	int retval;
-
-	retval = -EBADF;
-	file = fget(fd);
-	if (file) {
-		retval = do_osf_statfs(&file->f_path, buffer, bufsiz);
-		fput(file);
-	}
-	return retval;
+	struct kstatfs linux_stat;
+	int error = fd_statfs(fd, &linux_stat);
+	if (!error)
+		error = linux_to_osf_statfs(&linux_stat, buffer, bufsiz);
+	return error;
 }
 
 /*
diff --git a/arch/parisc/hpux/sys_hpux.c b/arch/parisc/hpux/sys_hpux.c
index 30394081d9b6..6ab9580b0b00 100644
--- a/arch/parisc/hpux/sys_hpux.c
+++ b/arch/parisc/hpux/sys_hpux.c
@@ -185,26 +185,21 @@ struct hpux_statfs {
      int16_t f_pad;
 };
 
-static int do_statfs_hpux(struct path *path, struct hpux_statfs *buf)
+static int do_statfs_hpux(struct kstatfs *st, struct hpux_statfs __user *p)
 {
-	struct kstatfs st;
-	int retval;
-	
-	retval = vfs_statfs(path, &st);
-	if (retval)
-		return retval;
-
-	memset(buf, 0, sizeof(*buf));
-	buf->f_type = st.f_type;
-	buf->f_bsize = st.f_bsize;
-	buf->f_blocks = st.f_blocks;
-	buf->f_bfree = st.f_bfree;
-	buf->f_bavail = st.f_bavail;
-	buf->f_files = st.f_files;
-	buf->f_ffree = st.f_ffree;
-	buf->f_fsid[0] = st.f_fsid.val[0];
-	buf->f_fsid[1] = st.f_fsid.val[1];
-
+	struct hpux_statfs buf;
+	memset(&buf, 0, sizeof(buf));
+	buf.f_type = st->f_type;
+	buf.f_bsize = st->f_bsize;
+	buf.f_blocks = st->f_blocks;
+	buf.f_bfree = st->f_bfree;
+	buf.f_bavail = st->f_bavail;
+	buf.f_files = st->f_files;
+	buf.f_ffree = st->f_ffree;
+	buf.f_fsid[0] = st->f_fsid.val[0];
+	buf.f_fsid[1] = st->f_fsid.val[1];
+	if (copy_to_user(p, &buf, sizeof(buf)))
+		return -EFAULT;
 	return 0;
 }
 
@@ -212,35 +207,19 @@ static int do_statfs_hpux(struct path *path, struct hpux_statfs *buf)
 asmlinkage long hpux_statfs(const char __user *pathname,
 						struct hpux_statfs __user *buf)
 {
-	struct path path;
-	int error;
-
-	error = user_path(pathname, &path);
-	if (!error) {
-		struct hpux_statfs tmp;
-		error = do_statfs_hpux(&path, &tmp);
-		if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
-			error = -EFAULT;
-		path_put(&path);
-	}
+	struct kstatfs st;
+	int error = user_statfs(pathname, &st);
+	if (!error)
+		error = do_statfs_hpux(&st, buf);
 	return error;
 }
 
 asmlinkage long hpux_fstatfs(unsigned int fd, struct hpux_statfs __user * buf)
 {
-	struct file *file;
-	struct hpux_statfs tmp;
-	int error;
-
-	error = -EBADF;
-	file = fget(fd);
-	if (!file)
-		goto out;
-	error = do_statfs_hpux(&file->f_path, &tmp);
-	if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
-		error = -EFAULT;
-	fput(file);
- out:
+	struct kstatfs st;
+	int error = fd_statfs(fd, &st);
+	if (!error)
+		error = do_statfs_hpux(&st, buf);
 	return error;
 }
 
diff --git a/fs/compat.c b/fs/compat.c
index 691c3fd8ce1d..a071775f3bb3 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -262,35 +262,19 @@ static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *
  */
 asmlinkage long compat_sys_statfs(const char __user *pathname, struct compat_statfs __user *buf)
 {
-	struct path path;
-	int error;
-
-	error = user_path(pathname, &path);
-	if (!error) {
-		struct kstatfs tmp;
-		error = vfs_statfs(&path, &tmp);
-		if (!error)
-			error = put_compat_statfs(buf, &tmp);
-		path_put(&path);
-	}
+	struct kstatfs tmp;
+	int error = user_statfs(pathname, &tmp);
+	if (!error)
+		error = put_compat_statfs(buf, &tmp);
 	return error;
 }
 
 asmlinkage long compat_sys_fstatfs(unsigned int fd, struct compat_statfs __user *buf)
 {
-	struct file * file;
 	struct kstatfs tmp;
-	int error;
-
-	error = -EBADF;
-	file = fget(fd);
-	if (!file)
-		goto out;
-	error = vfs_statfs(&file->f_path, &tmp);
+	int error = fd_statfs(fd, &tmp);
 	if (!error)
 		error = put_compat_statfs(buf, &tmp);
-	fput(file);
-out:
 	return error;
 }
 
@@ -329,41 +313,29 @@ static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstat
 
 asmlinkage long compat_sys_statfs64(const char __user *pathname, compat_size_t sz, struct compat_statfs64 __user *buf)
 {
-	struct path path;
+	struct kstatfs tmp;
 	int error;
 
 	if (sz != sizeof(*buf))
 		return -EINVAL;
 
-	error = user_path(pathname, &path);
-	if (!error) {
-		struct kstatfs tmp;
-		error = vfs_statfs(&path, &tmp);
-		if (!error)
-			error = put_compat_statfs64(buf, &tmp);
-		path_put(&path);
-	}
+	error = user_statfs(pathname, &tmp);
+	if (!error)
+		error = put_compat_statfs64(buf, &tmp);
 	return error;
 }
 
 asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct compat_statfs64 __user *buf)
 {
-	struct file * file;
 	struct kstatfs tmp;
 	int error;
 
 	if (sz != sizeof(*buf))
 		return -EINVAL;
 
-	error = -EBADF;
-	file = fget(fd);
-	if (!file)
-		goto out;
-	error = vfs_statfs(&file->f_path, &tmp);
+	error = fd_statfs(fd, &tmp);
 	if (!error)
 		error = put_compat_statfs64(buf, &tmp);
-	fput(file);
-out:
 	return error;
 }
 
diff --git a/fs/statfs.c b/fs/statfs.c
index 30ea8c8a996b..8244924dec55 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -73,149 +73,135 @@ int vfs_statfs(struct path *path, struct kstatfs *buf)
 }
 EXPORT_SYMBOL(vfs_statfs);
 
-static int do_statfs_native(struct path *path, struct statfs *buf)
+int user_statfs(const char __user *pathname, struct kstatfs *st)
 {
-	struct kstatfs st;
-	int retval;
+	struct path path;
+	int error = user_path(pathname, &path);
+	if (!error) {
+		error = vfs_statfs(&path, st);
+		path_put(&path);
+	}
+	return error;
+}
 
-	retval = vfs_statfs(path, &st);
-	if (retval)
-		return retval;
+int fd_statfs(int fd, struct kstatfs *st)
+{
+	struct file *file = fget(fd);
+	int error = -EBADF;
+	if (file) {
+		error = vfs_statfs(&file->f_path, st);
+		fput(file);
+	}
+	return error;
+}
 
-	if (sizeof(*buf) == sizeof(st))
-		memcpy(buf, &st, sizeof(st));
+static int do_statfs_native(struct kstatfs *st, struct statfs __user *p)
+{
+	struct statfs buf;
+
+	if (sizeof(buf) == sizeof(*st))
+		memcpy(&buf, st, sizeof(*st));
 	else {
-		if (sizeof buf->f_blocks == 4) {
-			if ((st.f_blocks | st.f_bfree | st.f_bavail |
-			     st.f_bsize | st.f_frsize) &
+		if (sizeof buf.f_blocks == 4) {
+			if ((st->f_blocks | st->f_bfree | st->f_bavail |
+			     st->f_bsize | st->f_frsize) &
 			    0xffffffff00000000ULL)
 				return -EOVERFLOW;
 			/*
 			 * f_files and f_ffree may be -1; it's okay to stuff
 			 * that into 32 bits
 			 */
-			if (st.f_files != -1 &&
-			    (st.f_files & 0xffffffff00000000ULL))
+			if (st->f_files != -1 &&
+			    (st->f_files & 0xffffffff00000000ULL))
 				return -EOVERFLOW;
-			if (st.f_ffree != -1 &&
-			    (st.f_ffree & 0xffffffff00000000ULL))
+			if (st->f_ffree != -1 &&
+			    (st->f_ffree & 0xffffffff00000000ULL))
 				return -EOVERFLOW;
 		}
 
-		buf->f_type = st.f_type;
-		buf->f_bsize = st.f_bsize;
-		buf->f_blocks = st.f_blocks;
-		buf->f_bfree = st.f_bfree;
-		buf->f_bavail = st.f_bavail;
-		buf->f_files = st.f_files;
-		buf->f_ffree = st.f_ffree;
-		buf->f_fsid = st.f_fsid;
-		buf->f_namelen = st.f_namelen;
-		buf->f_frsize = st.f_frsize;
-		buf->f_flags = st.f_flags;
-		memset(buf->f_spare, 0, sizeof(buf->f_spare));
+		buf.f_type = st->f_type;
+		buf.f_bsize = st->f_bsize;
+		buf.f_blocks = st->f_blocks;
+		buf.f_bfree = st->f_bfree;
+		buf.f_bavail = st->f_bavail;
+		buf.f_files = st->f_files;
+		buf.f_ffree = st->f_ffree;
+		buf.f_fsid = st->f_fsid;
+		buf.f_namelen = st->f_namelen;
+		buf.f_frsize = st->f_frsize;
+		buf.f_flags = st->f_flags;
+		memset(buf.f_spare, 0, sizeof(buf.f_spare));
 	}
+	if (copy_to_user(p, &buf, sizeof(buf)))
+		return -EFAULT;
 	return 0;
 }
 
-static int do_statfs64(struct path *path, struct statfs64 *buf)
+static int do_statfs64(struct kstatfs *st, struct statfs64 __user *p)
 {
-	struct kstatfs st;
-	int retval;
-
-	retval = vfs_statfs(path, &st);
-	if (retval)
-		return retval;
-
-	if (sizeof(*buf) == sizeof(st))
-		memcpy(buf, &st, sizeof(st));
+	struct statfs64 buf;
+	if (sizeof(buf) == sizeof(*st))
+		memcpy(&buf, st, sizeof(*st));
 	else {
-		buf->f_type = st.f_type;
-		buf->f_bsize = st.f_bsize;
-		buf->f_blocks = st.f_blocks;
-		buf->f_bfree = st.f_bfree;
-		buf->f_bavail = st.f_bavail;
-		buf->f_files = st.f_files;
-		buf->f_ffree = st.f_ffree;
-		buf->f_fsid = st.f_fsid;
-		buf->f_namelen = st.f_namelen;
-		buf->f_frsize = st.f_frsize;
-		buf->f_flags = st.f_flags;
-		memset(buf->f_spare, 0, sizeof(buf->f_spare));
+		buf.f_type = st->f_type;
+		buf.f_bsize = st->f_bsize;
+		buf.f_blocks = st->f_blocks;
+		buf.f_bfree = st->f_bfree;
+		buf.f_bavail = st->f_bavail;
+		buf.f_files = st->f_files;
+		buf.f_ffree = st->f_ffree;
+		buf.f_fsid = st->f_fsid;
+		buf.f_namelen = st->f_namelen;
+		buf.f_frsize = st->f_frsize;
+		buf.f_flags = st->f_flags;
+		memset(buf.f_spare, 0, sizeof(buf.f_spare));
 	}
+	if (copy_to_user(p, &buf, sizeof(buf)))
+		return -EFAULT;
 	return 0;
 }
 
 SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct statfs __user *, buf)
 {
-	struct path path;
-	int error;
-
-	error = user_path(pathname, &path);
-	if (!error) {
-		struct statfs tmp;
-		error = do_statfs_native(&path, &tmp);
-		if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
-			error = -EFAULT;
-		path_put(&path);
-	}
+	struct kstatfs st;
+	int error = user_statfs(pathname, &st);
+	if (!error)
+		error = do_statfs_native(&st, buf);
 	return error;
 }
 
 SYSCALL_DEFINE3(statfs64, const char __user *, pathname, size_t, sz, struct statfs64 __user *, buf)
 {
-	struct path path;
-	long error;
-
+	struct kstatfs st;
+	int error;
 	if (sz != sizeof(*buf))
 		return -EINVAL;
-	error = user_path(pathname, &path);
-	if (!error) {
-		struct statfs64 tmp;
-		error = do_statfs64(&path, &tmp);
-		if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
-			error = -EFAULT;
-		path_put(&path);
-	}
+	error = user_statfs(pathname, &st);
+	if (!error)
+		error = do_statfs64(&st, buf);
 	return error;
 }
 
 SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct statfs __user *, buf)
 {
-	struct file *file;
-	struct statfs tmp;
-	int error;
-
-	error = -EBADF;
-	file = fget(fd);
-	if (!file)
-		goto out;
-	error = do_statfs_native(&file->f_path, &tmp);
-	if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
-		error = -EFAULT;
-	fput(file);
-out:
+	struct kstatfs st;
+	int error = fd_statfs(fd, &st);
+	if (!error)
+		error = do_statfs_native(&st, buf);
 	return error;
 }
 
 SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, size_t, sz, struct statfs64 __user *, buf)
 {
-	struct file *file;
-	struct statfs64 tmp;
+	struct kstatfs st;
 	int error;
 
 	if (sz != sizeof(*buf))
 		return -EINVAL;
 
-	error = -EBADF;
-	file = fget(fd);
-	if (!file)
-		goto out;
-	error = do_statfs64(&file->f_path, &tmp);
-	if (!error && copy_to_user(buf, &tmp, sizeof(tmp)))
-		error = -EFAULT;
-	fput(file);
-out:
+	error = fd_statfs(fd, &st);
+	if (!error)
+		error = do_statfs64(&st, buf);
 	return error;
 }
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index bf5c3c896072..b7178b05cf3a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1874,6 +1874,8 @@ extern void drop_collected_mounts(struct vfsmount *);
 extern int iterate_mounts(int (*)(struct vfsmount *, void *), void *,
 			  struct vfsmount *);
 extern int vfs_statfs(struct path *, struct kstatfs *);
+extern int user_statfs(const char __user *, struct kstatfs *);
+extern int fd_statfs(int, struct kstatfs *);
 extern int statfs_by_dentry(struct dentry *, struct kstatfs *);
 extern int freeze_super(struct super_block *super);
 extern int thaw_super(struct super_block *super);
-- 
cgit v1.2.3


From 5fe0c2378884e68beb532f5890cc0e3539ac747b Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sat, 29 Jan 2011 18:43:25 +0530
Subject: exportfs: Return the minimum required handle size

The exportfs encode handle function should return the minimum required
handle size. This helps user to find out the handle size by passing 0
handle size in the first step and then redoing to the call again with
the returned handle size value.

Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/export.c             | 8 ++++++--
 fs/exportfs/expfs.c           | 9 +++++++--
 fs/fat/inode.c                | 4 +++-
 fs/fuse/inode.c               | 4 +++-
 fs/gfs2/export.c              | 8 ++++++--
 fs/isofs/export.c             | 8 ++++++--
 fs/ocfs2/export.c             | 8 ++++++--
 fs/reiserfs/inode.c           | 7 ++++++-
 fs/udf/namei.c                | 7 ++++++-
 fs/xfs/linux-2.6/xfs_export.c | 4 +++-
 include/linux/exportfs.h      | 6 ++++--
 mm/shmem.c                    | 4 +++-
 12 files changed, 59 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index ff27d7a477b2..b4ffad859adb 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -21,9 +21,13 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
 	int len = *max_len;
 	int type;
 
-	if ((len < BTRFS_FID_SIZE_NON_CONNECTABLE) ||
-	    (connectable && len < BTRFS_FID_SIZE_CONNECTABLE))
+	if (connectable && (len < BTRFS_FID_SIZE_CONNECTABLE)) {
+		*max_len = BTRFS_FID_SIZE_CONNECTABLE;
 		return 255;
+	} else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) {
+		*max_len = BTRFS_FID_SIZE_NON_CONNECTABLE;
+		return 255;
+	}
 
 	len  = BTRFS_FID_SIZE_NON_CONNECTABLE;
 	type = FILEID_BTRFS_WITHOUT_PARENT;
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 4b6825740dd5..cfe55731b6dc 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -320,9 +320,14 @@ static int export_encode_fh(struct dentry *dentry, struct fid *fid,
 	struct inode * inode = dentry->d_inode;
 	int len = *max_len;
 	int type = FILEID_INO32_GEN;
-	
-	if (len < 2 || (connectable && len < 4))
+
+	if (connectable && (len < 4)) {
+		*max_len = 4;
+		return 255;
+	} else if (len < 2) {
+		*max_len = 2;
 		return 255;
+	}
 
 	len = 2;
 	fid->i32.ino = inode->i_ino;
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 86753fe10bd1..0e277ec4b612 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -757,8 +757,10 @@ fat_encode_fh(struct dentry *de, __u32 *fh, int *lenp, int connectable)
 	struct inode *inode =  de->d_inode;
 	u32 ipos_h, ipos_m, ipos_l;
 
-	if (len < 5)
+	if (len < 5) {
+		*lenp = 5;
 		return 255; /* no room */
+	}
 
 	ipos_h = MSDOS_I(inode)->i_pos >> 8;
 	ipos_m = (MSDOS_I(inode)->i_pos & 0xf0) << 24;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 9e3f68cc1bd1..051b1a084528 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -637,8 +637,10 @@ static int fuse_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
 	u64 nodeid;
 	u32 generation;
 
-	if (*max_len < len)
+	if (*max_len < len) {
+		*max_len = len;
 		return  255;
+	}
 
 	nodeid = get_fuse_inode(inode)->nodeid;
 	generation = inode->i_generation;
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index 9023db8184f9..b5a5e60df0d5 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -36,9 +36,13 @@ static int gfs2_encode_fh(struct dentry *dentry, __u32 *p, int *len,
 	struct super_block *sb = inode->i_sb;
 	struct gfs2_inode *ip = GFS2_I(inode);
 
-	if (*len < GFS2_SMALL_FH_SIZE ||
-	    (connectable && *len < GFS2_LARGE_FH_SIZE))
+	if (connectable && (*len < GFS2_LARGE_FH_SIZE)) {
+		*len = GFS2_LARGE_FH_SIZE;
 		return 255;
+	} else if (*len < GFS2_SMALL_FH_SIZE) {
+		*len = GFS2_SMALL_FH_SIZE;
+		return 255;
+	}
 
 	fh[0] = cpu_to_be32(ip->i_no_formal_ino >> 32);
 	fh[1] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF);
diff --git a/fs/isofs/export.c b/fs/isofs/export.c
index ed752cb38474..dd4687ff30d0 100644
--- a/fs/isofs/export.c
+++ b/fs/isofs/export.c
@@ -124,9 +124,13 @@ isofs_export_encode_fh(struct dentry *dentry,
 	 * offset of the inode and the upper 16 bits of fh32[1] to
 	 * hold the offset of the parent.
 	 */
-
-	if (len < 3 || (connectable && len < 5))
+	if (connectable && (len < 5)) {
+		*max_len = 5;
+		return 255;
+	} else if (len < 3) {
+		*max_len = 3;
 		return 255;
+	}
 
 	len = 3;
 	fh32[0] = ei->i_iget5_block;
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 5dbc3062b4fd..254652a9b542 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -197,8 +197,12 @@ static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len,
 		   dentry->d_name.len, dentry->d_name.name,
 		   fh, len, connectable);
 
-	if (len < 3 || (connectable && len < 6)) {
-		mlog(ML_ERROR, "fh buffer is too small for encoding\n");
+	if (connectable && (len < 6)) {
+		*max_len = 6;
+		type = 255;
+		goto bail;
+	} else if (len < 3) {
+		*max_len = 3;
 		type = 255;
 		goto bail;
 	}
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 0bae036831e2..1bba24bad820 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1593,8 +1593,13 @@ int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp,
 	struct inode *inode = dentry->d_inode;
 	int maxlen = *lenp;
 
-	if (maxlen < 3)
+	if (need_parent && (maxlen < 5)) {
+		*lenp = 5;
 		return 255;
+	} else if (maxlen < 3) {
+		*lenp = 3;
+		return 255;
+	}
 
 	data[0] = inode->i_ino;
 	data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index b7c338d5e9df..f1dce848ef96 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -1286,8 +1286,13 @@ static int udf_encode_fh(struct dentry *de, __u32 *fh, int *lenp,
 	struct fid *fid = (struct fid *)fh;
 	int type = FILEID_UDF_WITHOUT_PARENT;
 
-	if (len < 3 || (connectable && len < 5))
+	if (connectable && (len < 5)) {
+		*lenp = 5;
+		return 255;
+	} else if (len < 3) {
+		*lenp = 3;
 		return 255;
+	}
 
 	*lenp = 3;
 	fid->udf.block = location.logicalBlockNum;
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index fc0114da7fdd..f4f878fc0083 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -89,8 +89,10 @@ xfs_fs_encode_fh(
 	 * seven combinations work.  The real answer is "don't use v2".
 	 */
 	len = xfs_fileid_length(fileid_type);
-	if (*max_len < len)
+	if (*max_len < len) {
+		*max_len = len;
 		return 255;
+	}
 	*max_len = len;
 
 	switch (fileid_type) {
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index 28028988c862..65afdfd31b7b 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -121,8 +121,10 @@ struct fid {
  *    set, the encode_fh() should store sufficient information so that a good
  *    attempt can be made to find not only the file but also it's place in the
  *    filesystem.   This typically means storing a reference to de->d_parent in
- *    the filehandle fragment.  encode_fh() should return the number of bytes
- *    stored or a negative error code such as %-ENOSPC
+ *    the filehandle fragment.  encode_fh() should return the fileid_type on
+ *    success and on error returns 255 (if the space needed to encode fh is
+ *    greater than @max_len*4 bytes). On error @max_len contains the minimum
+ *    size(in 4 byte unit) needed to encode the file handle.
  *
  * fh_to_dentry:
  *    @fh_to_dentry is given a &struct super_block (@sb) and a file handle
diff --git a/mm/shmem.c b/mm/shmem.c
index 5ee67c990602..3437b65d6d6e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2144,8 +2144,10 @@ static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
 {
 	struct inode *inode = dentry->d_inode;
 
-	if (*len < 3)
+	if (*len < 3) {
+		*len = 3;
 		return 255;
+	}
 
 	if (inode_unhashed(inode)) {
 		/* Unfortunately insert_inode_hash is not idempotent,
-- 
cgit v1.2.3


From 1eafbfeb7bdf59cfe173304c76188f3fd5f1fd05 Mon Sep 17 00:00:00 2001
From: Timo Warns <Warns@pre-sense.de>
Date: Mon, 14 Mar 2011 14:59:33 +0100
Subject: Fix corrupted OSF partition table parsing

The kernel automatically evaluates partition tables of storage devices.
The code for evaluating OSF partitions contains a bug that leaks data
from kernel heap memory to userspace for certain corrupted OSF
partitions.

In more detail:

  for (i = 0 ; i < le16_to_cpu(label->d_npartitions); i++, partition++) {

iterates from 0 to d_npartitions - 1, where d_npartitions is read from
the partition table without validation and partition is a pointer to an
array of at most 8 d_partitions.

Add the proper and obvious validation.

Signed-off-by: Timo Warns <warns@pre-sense.de>
Cc: stable@kernel.org
[ Changed the patch trivially to not repeat the whole le16_to_cpu()
  thing, and to use an explicit constant for the magic value '8' ]
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/partitions/osf.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/partitions/osf.c b/fs/partitions/osf.c
index 48cec7cbca17..be03a0b08b47 100644
--- a/fs/partitions/osf.c
+++ b/fs/partitions/osf.c
@@ -10,10 +10,13 @@
 #include "check.h"
 #include "osf.h"
 
+#define MAX_OSF_PARTITIONS 8
+
 int osf_partition(struct parsed_partitions *state)
 {
 	int i;
 	int slot = 1;
+	unsigned int npartitions;
 	Sector sect;
 	unsigned char *data;
 	struct disklabel {
@@ -45,7 +48,7 @@ int osf_partition(struct parsed_partitions *state)
 			u8  p_fstype;
 			u8  p_frag;
 			__le16 p_cpg;
-		} d_partitions[8];
+		} d_partitions[MAX_OSF_PARTITIONS];
 	} * label;
 	struct d_partition * partition;
 
@@ -63,7 +66,12 @@ int osf_partition(struct parsed_partitions *state)
 		put_dev_sector(sect);
 		return 0;
 	}
-	for (i = 0 ; i < le16_to_cpu(label->d_npartitions); i++, partition++) {
+	npartitions = le16_to_cpu(label->d_npartitions);
+	if (npartitions > MAX_OSF_PARTITIONS) {
+		put_dev_sector(sect);
+		return 0;
+	}
+	for (i = 0 ; i < npartitions; i++, partition++) {
 		if (slot == state->limit)
 		        break;
 		if (le32_to_cpu(partition->p_size))
-- 
cgit v1.2.3


From f52e0c11305aa09ed56cad97ffc8f0cdc3d78b5d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 14 Mar 2011 18:56:51 -0400
Subject: New AT_... flag: AT_EMPTY_PATH

For name_to_handle_at(2) we'll want both ...at()-style syscall that
would be usable for non-directory descriptors (with empty relative
pathname).  Introduce new flag (AT_EMPTY_PATH) to deal with that and
corresponding LOOKUP_EMPTY; teach user_path_at() and path_init() to
deal with the latter.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c            | 29 +++++++++++++++++++----------
 include/linux/fcntl.h |  1 +
 include/linux/namei.h |  1 +
 3 files changed, 21 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index abc8d2df121c..83e92bab79a6 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -136,7 +136,7 @@ static int do_getname(const char __user *filename, char *page)
 	return retval;
 }
 
-char * getname(const char __user * filename)
+static char *getname_flags(const char __user * filename, int flags)
 {
 	char *tmp, *result;
 
@@ -147,14 +147,21 @@ char * getname(const char __user * filename)
 
 		result = tmp;
 		if (retval < 0) {
-			__putname(tmp);
-			result = ERR_PTR(retval);
+			if (retval != -ENOENT || !(flags & LOOKUP_EMPTY)) {
+				__putname(tmp);
+				result = ERR_PTR(retval);
+			}
 		}
 	}
 	audit_getname(result);
 	return result;
 }
 
+char *getname(const char __user * filename)
+{
+	return getname_flags(filename, 0);
+}
+
 #ifdef CONFIG_AUDITSYSCALL
 void putname(const char *name)
 {
@@ -1544,13 +1551,15 @@ static int path_init(int dfd, const char *name, unsigned int flags,
 
 		dentry = file->f_path.dentry;
 
-		retval = -ENOTDIR;
-		if (!S_ISDIR(dentry->d_inode->i_mode))
-			goto fput_fail;
+		if (*name) {
+			retval = -ENOTDIR;
+			if (!S_ISDIR(dentry->d_inode->i_mode))
+				goto fput_fail;
 
-		retval = file_permission(file, MAY_EXEC);
-		if (retval)
-			goto fput_fail;
+			retval = file_permission(file, MAY_EXEC);
+			if (retval)
+				goto fput_fail;
+		}
 
 		nd->path = file->f_path;
 		if (flags & LOOKUP_RCU) {
@@ -1759,7 +1768,7 @@ int user_path_at(int dfd, const char __user *name, unsigned flags,
 		 struct path *path)
 {
 	struct nameidata nd;
-	char *tmp = getname(name);
+	char *tmp = getname_flags(name, flags);
 	int err = PTR_ERR(tmp);
 	if (!IS_ERR(tmp)) {
 
diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h
index a562fa5fb4e3..f550f894ba15 100644
--- a/include/linux/fcntl.h
+++ b/include/linux/fcntl.h
@@ -46,6 +46,7 @@
                                            unlinking file.  */
 #define AT_SYMLINK_FOLLOW	0x400   /* Follow symbolic links.  */
 #define AT_NO_AUTOMOUNT		0x800	/* Suppress terminal automount traversal */
+#define AT_EMPTY_PATH		0x1000	/* Allow empty relative pathname */
 
 #ifdef __KERNEL__
 
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 83cd6e5cd7dc..9c8603872c36 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -64,6 +64,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND};
 
 #define LOOKUP_JUMPED		0x1000
 #define LOOKUP_ROOT		0x2000
+#define LOOKUP_EMPTY		0x4000
 
 extern int user_path_at(int, const char __user *, unsigned, struct path *);
 
-- 
cgit v1.2.3


From 990d6c2d7aee921e3bce22b2d6a750fd552262be Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sat, 29 Jan 2011 18:43:26 +0530
Subject: vfs: Add name to file handle conversion support

The syscall also return mount id which can be used
to lookup file system specific information such as uuid
in /proc/<pid>/mountinfo

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/Kconfig               |   2 +-
 fs/Makefile              |   2 +
 fs/fhandle.c             | 107 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/exportfs.h |   3 ++
 include/linux/fs.h       |   7 ++++
 include/linux/syscalls.h |   5 ++-
 init/Kconfig             |  12 ++++++
 kernel/sys_ni.c          |   3 ++
 8 files changed, 139 insertions(+), 2 deletions(-)
 create mode 100644 fs/fhandle.c

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index 3db9caa57edc..7cb53aafac1e 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -47,7 +47,7 @@ config FS_POSIX_ACL
 	def_bool n
 
 config EXPORTFS
-	tristate
+	bool
 
 config FILE_LOCKING
 	bool "Enable POSIX file locking API" if EXPERT
diff --git a/fs/Makefile b/fs/Makefile
index a7f7cef0c0c8..ba01202844c5 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -48,6 +48,8 @@ obj-$(CONFIG_FS_POSIX_ACL)	+= posix_acl.o xattr_acl.o
 obj-$(CONFIG_NFS_COMMON)	+= nfs_common/
 obj-$(CONFIG_GENERIC_ACL)	+= generic_acl.o
 
+obj-$(CONFIG_FHANDLE)		+= fhandle.o
+
 obj-y				+= quota/
 
 obj-$(CONFIG_PROC_FS)		+= proc/
diff --git a/fs/fhandle.c b/fs/fhandle.c
new file mode 100644
index 000000000000..9f79e743a840
--- /dev/null
+++ b/fs/fhandle.c
@@ -0,0 +1,107 @@
+#include <linux/syscalls.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/exportfs.h>
+#include <asm/uaccess.h>
+#include "internal.h"
+
+static long do_sys_name_to_handle(struct path *path,
+				  struct file_handle __user *ufh,
+				  int __user *mnt_id)
+{
+	long retval;
+	struct file_handle f_handle;
+	int handle_dwords, handle_bytes;
+	struct file_handle *handle = NULL;
+
+	/*
+	 * We need t make sure wether the file system
+	 * support decoding of the file handle
+	 */
+	if (!path->mnt->mnt_sb->s_export_op ||
+	    !path->mnt->mnt_sb->s_export_op->fh_to_dentry)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle)))
+		return -EFAULT;
+
+	if (f_handle.handle_bytes > MAX_HANDLE_SZ)
+		return -EINVAL;
+
+	handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes,
+			 GFP_KERNEL);
+	if (!handle)
+		return -ENOMEM;
+
+	/* convert handle size to  multiple of sizeof(u32) */
+	handle_dwords = f_handle.handle_bytes >> 2;
+
+	/* we ask for a non connected handle */
+	retval = exportfs_encode_fh(path->dentry,
+				    (struct fid *)handle->f_handle,
+				    &handle_dwords,  0);
+	handle->handle_type = retval;
+	/* convert handle size to bytes */
+	handle_bytes = handle_dwords * sizeof(u32);
+	handle->handle_bytes = handle_bytes;
+	if ((handle->handle_bytes > f_handle.handle_bytes) ||
+	    (retval == 255) || (retval == -ENOSPC)) {
+		/* As per old exportfs_encode_fh documentation
+		 * we could return ENOSPC to indicate overflow
+		 * But file system returned 255 always. So handle
+		 * both the values
+		 */
+		/*
+		 * set the handle size to zero so we copy only
+		 * non variable part of the file_handle
+		 */
+		handle_bytes = 0;
+		retval = -EOVERFLOW;
+	} else
+		retval = 0;
+	/* copy the mount id */
+	if (copy_to_user(mnt_id, &path->mnt->mnt_id, sizeof(*mnt_id)) ||
+	    copy_to_user(ufh, handle,
+			 sizeof(struct file_handle) + handle_bytes))
+		retval = -EFAULT;
+	kfree(handle);
+	return retval;
+}
+
+/**
+ * sys_name_to_handle_at: convert name to handle
+ * @dfd: directory relative to which name is interpreted if not absolute
+ * @name: name that should be converted to handle.
+ * @handle: resulting file handle
+ * @mnt_id: mount id of the file system containing the file
+ * @flag: flag value to indicate whether to follow symlink or not
+ *
+ * @handle->handle_size indicate the space available to store the
+ * variable part of the file handle in bytes. If there is not
+ * enough space, the field is updated to return the minimum
+ * value required.
+ */
+SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name,
+		struct file_handle __user *, handle, int __user *, mnt_id,
+		int, flag)
+{
+	struct path path;
+	int lookup_flags;
+	int err;
+
+	if ((flag & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
+		return -EINVAL;
+
+	lookup_flags = (flag & AT_SYMLINK_FOLLOW) ? LOOKUP_FOLLOW : 0;
+	if (flag & AT_EMPTY_PATH)
+		lookup_flags |= LOOKUP_EMPTY;
+	err = user_path_at(dfd, name, lookup_flags, &path);
+	if (!err) {
+		err = do_sys_name_to_handle(&path, handle, mnt_id);
+		path_put(&path);
+	}
+	return err;
+}
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index 65afdfd31b7b..33a42f24b275 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -8,6 +8,9 @@ struct inode;
 struct super_block;
 struct vfsmount;
 
+/* limit the handle size to NFSv4 handle size now */
+#define MAX_HANDLE_SZ 128
+
 /*
  * The fileid_type identifies how the file within the filesystem is encoded.
  * In theory this is freely set and parsed by the filesystem, but we try to
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b7178b05cf3a..3f64630c0e10 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -978,6 +978,13 @@ struct file {
 #endif
 };
 
+struct file_handle {
+	__u32 handle_bytes;
+	int handle_type;
+	/* file identifier */
+	unsigned char f_handle[0];
+};
+
 #define get_file(x)	atomic_long_inc(&(x)->f_count)
 #define fput_atomic(x)	atomic_long_add_unless(&(x)->f_count, -1, 1)
 #define file_count(x)	atomic_long_read(&(x)->f_count)
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 98664db1be47..970112613fb4 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -62,6 +62,7 @@ struct robust_list_head;
 struct getcpu_cache;
 struct old_linux_dirent;
 struct perf_event_attr;
+struct file_handle;
 
 #include <linux/types.h>
 #include <linux/aio_abi.h>
@@ -832,5 +833,7 @@ asmlinkage long sys_mmap_pgoff(unsigned long addr, unsigned long len,
 			unsigned long prot, unsigned long flags,
 			unsigned long fd, unsigned long pgoff);
 asmlinkage long sys_old_mmap(struct mmap_arg_struct __user *arg);
-
+asmlinkage long sys_name_to_handle_at(int dfd, const char __user *name,
+				      struct file_handle __user *handle,
+				      int __user *mnt_id, int flag);
 #endif
diff --git a/init/Kconfig b/init/Kconfig
index be788c0957d4..e72fa17fe559 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -287,6 +287,18 @@ config BSD_PROCESS_ACCT_V3
 	  for processing it. A preliminary version of these tools is available
 	  at <http://www.gnu.org/software/acct/>.
 
+config FHANDLE
+	bool "open by fhandle syscalls"
+	select EXPORTFS
+	help
+	  If you say Y here, a user level program will be able to map
+	  file names to handle and then later use the handle for
+	  different file system operations. This is useful in implementing
+	  userspace file servers, which now track files using handles instead
+	  of names. The handle would remain the same even if file names
+	  get renamed. Enables open_by_handle_at(2) and name_to_handle_at(2)
+	  syscalls.
+
 config TASKSTATS
 	bool "Export task/process statistics through netlink (EXPERIMENTAL)"
 	depends on NET
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index c782fe9924c7..4e013439ac28 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -186,3 +186,6 @@ cond_syscall(sys_perf_event_open);
 /* fanotify! */
 cond_syscall(sys_fanotify_init);
 cond_syscall(sys_fanotify_mark);
+
+/* open by handle */
+cond_syscall(sys_name_to_handle_at);
-- 
cgit v1.2.3


From becfd1f37544798cbdfd788f32c827160fab98c1 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sat, 29 Jan 2011 18:43:26 +0530
Subject: vfs: Add open by file handle support

[AV: duplicate of open() guts removed; file_open_root() used instead]

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/compat.c              |  13 ++++
 fs/exportfs/expfs.c      |   2 +
 fs/fhandle.c             | 158 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/internal.h            |   3 +
 include/linux/syscalls.h |   3 +
 kernel/sys_ni.c          |   2 +
 6 files changed, 181 insertions(+)

(limited to 'fs')

diff --git a/fs/compat.c b/fs/compat.c
index a071775f3bb3..c6d31a3bab88 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -2284,3 +2284,16 @@ asmlinkage long compat_sys_timerfd_gettime(int ufd,
 }
 
 #endif /* CONFIG_TIMERFD */
+
+#ifdef CONFIG_FHANDLE
+/*
+ * Exactly like fs/open.c:sys_open_by_handle_at(), except that it
+ * doesn't set the O_LARGEFILE flag.
+ */
+asmlinkage long
+compat_sys_open_by_handle_at(int mountdirfd,
+			     struct file_handle __user *handle, int flags)
+{
+	return do_handle_open(mountdirfd, handle, flags);
+}
+#endif
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index cfe55731b6dc..b05acb796135 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -374,6 +374,8 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
 	/*
 	 * Try to get any dentry for the given file handle from the filesystem.
 	 */
+	if (!nop || !nop->fh_to_dentry)
+		return ERR_PTR(-ESTALE);
 	result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type);
 	if (!result)
 		result = ERR_PTR(-ESTALE);
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 9f79e743a840..bf93ad2bee07 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -5,6 +5,8 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/exportfs.h>
+#include <linux/fs_struct.h>
+#include <linux/fsnotify.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
@@ -105,3 +107,159 @@ SYSCALL_DEFINE5(name_to_handle_at, int, dfd, const char __user *, name,
 	}
 	return err;
 }
+
+static struct vfsmount *get_vfsmount_from_fd(int fd)
+{
+	struct path path;
+
+	if (fd == AT_FDCWD) {
+		struct fs_struct *fs = current->fs;
+		spin_lock(&fs->lock);
+		path = fs->pwd;
+		mntget(path.mnt);
+		spin_unlock(&fs->lock);
+	} else {
+		int fput_needed;
+		struct file *file = fget_light(fd, &fput_needed);
+		if (!file)
+			return ERR_PTR(-EBADF);
+		path = file->f_path;
+		mntget(path.mnt);
+		fput_light(file, fput_needed);
+	}
+	return path.mnt;
+}
+
+static int vfs_dentry_acceptable(void *context, struct dentry *dentry)
+{
+	return 1;
+}
+
+static int do_handle_to_path(int mountdirfd, struct file_handle *handle,
+			     struct path *path)
+{
+	int retval = 0;
+	int handle_dwords;
+
+	path->mnt = get_vfsmount_from_fd(mountdirfd);
+	if (IS_ERR(path->mnt)) {
+		retval = PTR_ERR(path->mnt);
+		goto out_err;
+	}
+	/* change the handle size to multiple of sizeof(u32) */
+	handle_dwords = handle->handle_bytes >> 2;
+	path->dentry = exportfs_decode_fh(path->mnt,
+					  (struct fid *)handle->f_handle,
+					  handle_dwords, handle->handle_type,
+					  vfs_dentry_acceptable, NULL);
+	if (IS_ERR(path->dentry)) {
+		retval = PTR_ERR(path->dentry);
+		goto out_mnt;
+	}
+	return 0;
+out_mnt:
+	mntput(path->mnt);
+out_err:
+	return retval;
+}
+
+static int handle_to_path(int mountdirfd, struct file_handle __user *ufh,
+		   struct path *path)
+{
+	int retval = 0;
+	struct file_handle f_handle;
+	struct file_handle *handle = NULL;
+
+	/*
+	 * With handle we don't look at the execute bit on the
+	 * the directory. Ideally we would like CAP_DAC_SEARCH.
+	 * But we don't have that
+	 */
+	if (!capable(CAP_DAC_READ_SEARCH)) {
+		retval = -EPERM;
+		goto out_err;
+	}
+	if (copy_from_user(&f_handle, ufh, sizeof(struct file_handle))) {
+		retval = -EFAULT;
+		goto out_err;
+	}
+	if ((f_handle.handle_bytes > MAX_HANDLE_SZ) ||
+	    (f_handle.handle_bytes == 0)) {
+		retval = -EINVAL;
+		goto out_err;
+	}
+	handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes,
+			 GFP_KERNEL);
+	if (!handle) {
+		retval = -ENOMEM;
+		goto out_err;
+	}
+	/* copy the full handle */
+	if (copy_from_user(handle, ufh,
+			   sizeof(struct file_handle) +
+			   f_handle.handle_bytes)) {
+		retval = -EFAULT;
+		goto out_handle;
+	}
+
+	retval = do_handle_to_path(mountdirfd, handle, path);
+
+out_handle:
+	kfree(handle);
+out_err:
+	return retval;
+}
+
+long do_handle_open(int mountdirfd,
+		    struct file_handle __user *ufh, int open_flag)
+{
+	long retval = 0;
+	struct path path;
+	struct file *file;
+	int fd;
+
+	retval = handle_to_path(mountdirfd, ufh, &path);
+	if (retval)
+		return retval;
+
+	fd = get_unused_fd_flags(open_flag);
+	if (fd < 0) {
+		path_put(&path);
+		return fd;
+	}
+	file = file_open_root(path.dentry, path.mnt, "", open_flag);
+	if (IS_ERR(file)) {
+		put_unused_fd(fd);
+		retval =  PTR_ERR(file);
+	} else {
+		retval = fd;
+		fsnotify_open(file);
+		fd_install(fd, file);
+	}
+	path_put(&path);
+	return retval;
+}
+
+/**
+ * sys_open_by_handle_at: Open the file handle
+ * @mountdirfd: directory file descriptor
+ * @handle: file handle to be opened
+ * @flag: open flags.
+ *
+ * @mountdirfd indicate the directory file descriptor
+ * of the mount point. file handle is decoded relative
+ * to the vfsmount pointed by the @mountdirfd. @flags
+ * value is same as the open(2) flags.
+ */
+SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd,
+		struct file_handle __user *, handle,
+		int, flags)
+{
+	long ret;
+
+	if (force_o_largefile())
+		flags |= O_LARGEFILE;
+
+	ret = do_handle_open(mountdirfd, handle, flags);
+	return ret;
+}
diff --git a/fs/internal.h b/fs/internal.h
index 52abc5287f50..f3d15de44b15 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -117,6 +117,9 @@ extern struct file *do_filp_open(int dfd, const char *pathname,
 extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
 		const char *, const struct open_flags *, int lookup_flags);
 
+extern long do_handle_open(int mountdirfd,
+			   struct file_handle __user *ufh, int open_flag);
+
 /*
  * inode.c
  */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 970112613fb4..2d9b79c0f224 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -836,4 +836,7 @@ asmlinkage long sys_old_mmap(struct mmap_arg_struct __user *arg);
 asmlinkage long sys_name_to_handle_at(int dfd, const char __user *name,
 				      struct file_handle __user *handle,
 				      int __user *mnt_id, int flag);
+asmlinkage long sys_open_by_handle_at(int mountdirfd,
+				      struct file_handle __user *handle,
+				      int flags);
 #endif
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 4e013439ac28..25cc41cd8f33 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -189,3 +189,5 @@ cond_syscall(sys_fanotify_mark);
 
 /* open by handle */
 cond_syscall(sys_name_to_handle_at);
+cond_syscall(sys_open_by_handle_at);
+cond_syscall(compat_sys_open_by_handle_at);
-- 
cgit v1.2.3


From aae8a97d3ec30788790d1720b71d76fd8eb44b73 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sat, 29 Jan 2011 18:43:27 +0530
Subject: fs: Don't allow to create hardlink for deleted file

Add inode->i_nlink == 0 check in VFS. Some of the file systems
do this internally. A followup patch will remove those instance.
This is needed to ensure that with link by handle we don't allow
to create hardlink of an unlinked file. The check also prevent a race
between unlink and link

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 83e92bab79a6..33be51a2ddb7 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2906,7 +2906,11 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
 		return error;
 
 	mutex_lock(&inode->i_mutex);
-	error = dir->i_op->link(old_dentry, dir, new_dentry);
+	/* Make sure we don't allow creating hardlink to an unlinked file */
+	if (inode->i_nlink == 0)
+		error =  -ENOENT;
+	else
+		error = dir->i_op->link(old_dentry, dir, new_dentry);
 	mutex_unlock(&inode->i_mutex);
 	if (!error)
 		fsnotify_link(dir, inode, new_dentry);
-- 
cgit v1.2.3


From f17b6042073e7000a90063f7edbca59a5bd1caa2 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sat, 29 Jan 2011 18:43:30 +0530
Subject: fs: Remove i_nlink check from file system link callback

Now that VFS check for inode->i_nlink == 0 and returns proper
error, remove similar check from file system

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/inode.c    |  3 ---
 fs/ext3/namei.c     |  7 -------
 fs/ext4/namei.c     |  7 -------
 fs/jfs/namei.c      |  3 ---
 fs/reiserfs/namei.c |  4 ----
 fs/ubifs/dir.c      | 18 ------------------
 6 files changed, 42 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0efdb65953c5..c23f050f47c2 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4806,9 +4806,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	int err;
 	int drop_inode = 0;
 
-	if (inode->i_nlink == 0)
-		return -ENOENT;
-
 	/* do not allow sys_link's with other subvols of the same device */
 	if (root->objectid != BTRFS_I(inode)->root->objectid)
 		return -EPERM;
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index b27ba71810ec..561f69256266 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -2253,13 +2253,6 @@ static int ext3_link (struct dentry * old_dentry,
 
 	dquot_initialize(dir);
 
-	/*
-	 * Return -ENOENT if we've raced with unlink and i_nlink is 0.  Doing
-	 * otherwise has the potential to corrupt the orphan inode list.
-	 */
-	if (inode->i_nlink == 0)
-		return -ENOENT;
-
 retry:
 	handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) +
 					EXT3_INDEX_EXTRA_TRANS_BLOCKS);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 5485390d32c5..e781b7ea5630 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2304,13 +2304,6 @@ static int ext4_link(struct dentry *old_dentry,
 
 	dquot_initialize(dir);
 
-	/*
-	 * Return -ENOENT if we've raced with unlink and i_nlink is 0.  Doing
-	 * otherwise has the potential to corrupt the orphan inode list.
-	 */
-	if (inode->i_nlink == 0)
-		return -ENOENT;
-
 retry:
 	handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
 					EXT4_INDEX_EXTRA_TRANS_BLOCKS);
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 5a2b269428a6..3f04a1804931 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -809,9 +809,6 @@ static int jfs_link(struct dentry *old_dentry,
 	if (ip->i_nlink == JFS_LINK_MAX)
 		return -EMLINK;
 
-	if (ip->i_nlink == 0)
-		return -ENOENT;
-
 	dquot_initialize(dir);
 
 	tid = txBegin(ip->i_sb, 0);
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 68fdf45cc6c9..4b2eb564fdad 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -1122,10 +1122,6 @@ static int reiserfs_link(struct dentry *old_dentry, struct inode *dir,
 		reiserfs_write_unlock(dir->i_sb);
 		return -EMLINK;
 	}
-	if (inode->i_nlink == 0) {
-		reiserfs_write_unlock(dir->i_sb);
-		return -ENOENT;
-	}
 
 	/* inc before scheduling so reiserfs_unlink knows we are here */
 	inc_nlink(inode);
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 14f64b689d7f..7217d67a80a6 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -522,24 +522,6 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
 	ubifs_assert(mutex_is_locked(&dir->i_mutex));
 	ubifs_assert(mutex_is_locked(&inode->i_mutex));
 
-	/*
-	 * Return -ENOENT if we've raced with unlink and i_nlink is 0.  Doing
-	 * otherwise has the potential to corrupt the orphan inode list.
-	 *
-	 * Indeed, consider a scenario when 'vfs_link(dirA/fileA)' and
-	 * 'vfs_unlink(dirA/fileA, dirB/fileB)' race. 'vfs_link()' does not
-	 * lock 'dirA->i_mutex', so this is possible. Both of the functions
-	 * lock 'fileA->i_mutex' though. Suppose 'vfs_unlink()' wins, and takes
-	 * 'fileA->i_mutex' mutex first. Suppose 'fileA->i_nlink' is 1. In this
-	 * case 'ubifs_unlink()' will drop the last reference, and put 'inodeA'
-	 * to the list of orphans. After this, 'vfs_link()' will link
-	 * 'dirB/fileB' to 'inodeA'. This is a problem because, for example,
-	 * the subsequent 'vfs_unlink(dirB/fileB)' will add the same inode
-	 * to the list of orphans.
-	 */
-	 if (inode->i_nlink == 0)
-		 return -ENOENT;
-
 	err = dbg_check_synced_i_size(inode);
 	if (err)
 		return err;
-- 
cgit v1.2.3


From 93f1c20bc8cdb757be50566eff88d65c3b26881f Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sat, 29 Jan 2011 18:43:38 +0530
Subject: vfs: Export file system uuid via /proc/<pid>/mountinfo

We add a per superblock uuid field. File systems should
update the uuid in the fill_super callback

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c     | 16 ++++++++++++++++
 include/linux/fs.h |  1 +
 2 files changed, 17 insertions(+)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index d1edf26025dc..dffe6f49ab93 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1002,6 +1002,18 @@ const struct seq_operations mounts_op = {
 	.show	= show_vfsmnt
 };
 
+static int uuid_is_nil(u8 *uuid)
+{
+	int i;
+	u8  *cp = (u8 *)uuid;
+
+	for (i = 0; i < 16; i++) {
+		if (*cp++)
+			return 0;
+	}
+	return 1;
+}
+
 static int show_mountinfo(struct seq_file *m, void *v)
 {
 	struct proc_mounts *p = m->private;
@@ -1040,6 +1052,10 @@ static int show_mountinfo(struct seq_file *m, void *v)
 	if (IS_MNT_UNBINDABLE(mnt))
 		seq_puts(m, " unbindable");
 
+	if (!uuid_is_nil(mnt->mnt_sb->s_uuid))
+		/* print the uuid */
+		seq_printf(m, " uuid:%pU", mnt->mnt_sb->s_uuid);
+
 	/* Filesystem specific data */
 	seq_puts(m, " - ");
 	show_type(m, sb);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 3f64630c0e10..f2143e0942c2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1408,6 +1408,7 @@ struct super_block {
 	wait_queue_head_t	s_wait_unfrozen;
 
 	char s_id[32];				/* Informational name */
+	u8 s_uuid[16];				/* UUID */
 
 	void 			*s_fs_info;	/* Filesystem private info */
 	fmode_t			s_mode;
-- 
cgit v1.2.3


From 03cb5f03dcb26846fcad345d8c15aae91579a53d Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sat, 29 Jan 2011 18:43:39 +0530
Subject: ext3: Copy fs UUID to superblock.

File system UUID is made available to application
via  /proc/<pid>/mountinfo

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ext3/super.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 85c8cc8f2473..9cc19a1dea8e 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1936,6 +1936,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	sb->s_qcop = &ext3_qctl_operations;
 	sb->dq_op = &ext3_quota_operations;
 #endif
+	memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
 	INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
 	mutex_init(&sbi->s_orphan_lock);
 	mutex_init(&sbi->s_resize_lock);
-- 
cgit v1.2.3


From f2fa2ffc2046fdc35f96366d1ec8675f4d578522 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sat, 29 Jan 2011 18:43:40 +0530
Subject: ext4: Copy fs UUID to superblock

File system UUID is made available to application
via  /proc/<pid>/mountinfo

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ext4/super.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index f6a318f836b2..5977b356a435 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3415,6 +3415,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_qcop = &ext4_qctl_operations;
 	sb->dq_op = &ext4_quota_operations;
 #endif
+	memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
+
 	INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
 	mutex_init(&sbi->s_orphan_lock);
 	mutex_init(&sbi->s_resize_lock);
-- 
cgit v1.2.3


From 1abf0c718f15a56a0a435588d1b104c7a37dc9bd Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 13 Mar 2011 03:51:11 -0400
Subject: New kind of open files - "location only".

New flag for open(2) - O_PATH.  Semantics:
	* pathname is resolved, but the file itself is _NOT_ opened
as far as filesystem is concerned.
	* almost all operations on the resulting descriptors shall
fail with -EBADF.  Exceptions are:
	1) operations on descriptors themselves (i.e.
		close(), dup(), dup2(), dup3(), fcntl(fd, F_DUPFD),
		fcntl(fd, F_DUPFD_CLOEXEC, ...), fcntl(fd, F_GETFD),
		fcntl(fd, F_SETFD, ...))
	2) fcntl(fd, F_GETFL), for a common non-destructive way to
		check if descriptor is open
	3) "dfd" arguments of ...at(2) syscalls, i.e. the starting
		points of pathname resolution
	* closing such descriptor does *NOT* affect dnotify or
posix locks.
	* permissions are checked as usual along the way to file;
no permission checks are applied to the file itself.  Of course,
giving such thing to syscall will result in permission checks (at
the moment it means checking that starting point of ....at() is
a directory and caller has exec permissions on it).

fget() and fget_light() return NULL on such descriptors; use of
fget_raw() and fget_raw_light() is needed to get them.  That protects
existing code from dealing with those things.

There are two things still missing (they come in the next commits):
one is handling of symlinks (right now we refuse to open them that
way; see the next commit for semantics related to those) and another
is descriptor passing via SCM_RIGHTS datagrams.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/fcntl.c                  | 37 ++++++++++++++++++++++++++-----
 fs/file_table.c             | 53 ++++++++++++++++++++++++++++++++++++++++-----
 fs/namei.c                  |  2 +-
 fs/open.c                   | 35 +++++++++++++++++++++++++-----
 include/asm-generic/fcntl.h |  4 ++++
 include/linux/file.h        |  2 ++
 include/linux/fs.h          |  3 +++
 7 files changed, 119 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/fcntl.c b/fs/fcntl.c
index cb1026181bdc..6c82e5bac039 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -131,7 +131,7 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
 SYSCALL_DEFINE1(dup, unsigned int, fildes)
 {
 	int ret = -EBADF;
-	struct file *file = fget(fildes);
+	struct file *file = fget_raw(fildes);
 
 	if (file) {
 		ret = get_unused_fd();
@@ -426,15 +426,35 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
 	return err;
 }
 
+static int check_fcntl_cmd(unsigned cmd)
+{
+	switch (cmd) {
+	case F_DUPFD:
+	case F_DUPFD_CLOEXEC:
+	case F_GETFD:
+	case F_SETFD:
+	case F_GETFL:
+		return 1;
+	}
+	return 0;
+}
+
 SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
 {	
 	struct file *filp;
 	long err = -EBADF;
 
-	filp = fget(fd);
+	filp = fget_raw(fd);
 	if (!filp)
 		goto out;
 
+	if (unlikely(filp->f_mode & FMODE_PATH)) {
+		if (!check_fcntl_cmd(cmd)) {
+			fput(filp);
+			goto out;
+		}
+	}
+
 	err = security_file_fcntl(filp, cmd, arg);
 	if (err) {
 		fput(filp);
@@ -456,10 +476,17 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
 	long err;
 
 	err = -EBADF;
-	filp = fget(fd);
+	filp = fget_raw(fd);
 	if (!filp)
 		goto out;
 
+	if (unlikely(filp->f_mode & FMODE_PATH)) {
+		if (!check_fcntl_cmd(cmd)) {
+			fput(filp);
+			goto out;
+		}
+	}
+
 	err = security_file_fcntl(filp, cmd, arg);
 	if (err) {
 		fput(filp);
@@ -808,14 +835,14 @@ static int __init fcntl_init(void)
 	 * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
 	 * is defined as O_NONBLOCK on some platforms and not on others.
 	 */
-	BUILD_BUG_ON(18 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
+	BUILD_BUG_ON(19 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
 		O_RDONLY	| O_WRONLY	| O_RDWR	|
 		O_CREAT		| O_EXCL	| O_NOCTTY	|
 		O_TRUNC		| O_APPEND	| /* O_NONBLOCK	| */
 		__O_SYNC	| O_DSYNC	| FASYNC	|
 		O_DIRECT	| O_LARGEFILE	| O_DIRECTORY	|
 		O_NOFOLLOW	| O_NOATIME	| O_CLOEXEC	|
-		__FMODE_EXEC
+		__FMODE_EXEC	| O_PATH
 		));
 
 	fasync_cache = kmem_cache_create("fasync_cache",
diff --git a/fs/file_table.c b/fs/file_table.c
index eb36b6b17e26..3c16e1ca163e 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -276,11 +276,10 @@ struct file *fget(unsigned int fd)
 	rcu_read_lock();
 	file = fcheck_files(files, fd);
 	if (file) {
-		if (!atomic_long_inc_not_zero(&file->f_count)) {
-			/* File object ref couldn't be taken */
-			rcu_read_unlock();
-			return NULL;
-		}
+		/* File object ref couldn't be taken */
+		if (file->f_mode & FMODE_PATH ||
+		    !atomic_long_inc_not_zero(&file->f_count))
+			file = NULL;
 	}
 	rcu_read_unlock();
 
@@ -289,6 +288,23 @@ struct file *fget(unsigned int fd)
 
 EXPORT_SYMBOL(fget);
 
+struct file *fget_raw(unsigned int fd)
+{
+	struct file *file;
+	struct files_struct *files = current->files;
+
+	rcu_read_lock();
+	file = fcheck_files(files, fd);
+	if (file) {
+		/* File object ref couldn't be taken */
+		if (!atomic_long_inc_not_zero(&file->f_count))
+			file = NULL;
+	}
+	rcu_read_unlock();
+
+	return file;
+}
+
 /*
  * Lightweight file lookup - no refcnt increment if fd table isn't shared.
  *
@@ -310,6 +326,33 @@ struct file *fget_light(unsigned int fd, int *fput_needed)
 	struct file *file;
 	struct files_struct *files = current->files;
 
+	*fput_needed = 0;
+	if (atomic_read(&files->count) == 1) {
+		file = fcheck_files(files, fd);
+		if (file && (file->f_mode & FMODE_PATH))
+			file = NULL;
+	} else {
+		rcu_read_lock();
+		file = fcheck_files(files, fd);
+		if (file) {
+			if (!(file->f_mode & FMODE_PATH) &&
+			    atomic_long_inc_not_zero(&file->f_count))
+				*fput_needed = 1;
+			else
+				/* Didn't get the reference, someone's freed */
+				file = NULL;
+		}
+		rcu_read_unlock();
+	}
+
+	return file;
+}
+
+struct file *fget_raw_light(unsigned int fd, int *fput_needed)
+{
+	struct file *file;
+	struct files_struct *files = current->files;
+
 	*fput_needed = 0;
 	if (atomic_read(&files->count) == 1) {
 		file = fcheck_files(files, fd);
diff --git a/fs/namei.c b/fs/namei.c
index 33be51a2ddb7..e1d9f90d9776 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1544,7 +1544,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
 	} else {
 		struct dentry *dentry;
 
-		file = fget_light(dfd, &fput_needed);
+		file = fget_raw_light(dfd, &fput_needed);
 		retval = -EBADF;
 		if (!file)
 			goto out_fail;
diff --git a/fs/open.c b/fs/open.c
index 48afc5c139d2..14a51de01f54 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -669,11 +669,16 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
 					int (*open)(struct inode *, struct file *),
 					const struct cred *cred)
 {
+	static const struct file_operations empty_fops = {};
 	struct inode *inode;
 	int error;
 
 	f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK |
 				FMODE_PREAD | FMODE_PWRITE;
+
+	if (unlikely(f->f_flags & O_PATH))
+		f->f_mode = FMODE_PATH;
+
 	inode = dentry->d_inode;
 	if (f->f_mode & FMODE_WRITE) {
 		error = __get_file_write_access(inode, mnt);
@@ -687,9 +692,15 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
 	f->f_path.dentry = dentry;
 	f->f_path.mnt = mnt;
 	f->f_pos = 0;
-	f->f_op = fops_get(inode->i_fop);
 	file_sb_list_add(f, inode->i_sb);
 
+	if (unlikely(f->f_mode & FMODE_PATH)) {
+		f->f_op = &empty_fops;
+		return f;
+	}
+
+	f->f_op = fops_get(inode->i_fop);
+
 	error = security_dentry_open(f, cred);
 	if (error)
 		goto cleanup_all;
@@ -911,9 +922,18 @@ static inline int build_open_flags(int flags, int mode, struct open_flags *op)
 	if (flags & __O_SYNC)
 		flags |= O_DSYNC;
 
-	op->open_flag = flags;
+	/*
+	 * If we have O_PATH in the open flag. Then we
+	 * cannot have anything other than the below set of flags
+	 */
+	if (flags & O_PATH) {
+		flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
+		acc_mode = 0;
+	} else {
+		acc_mode = MAY_OPEN | ACC_MODE(flags);
+	}
 
-	acc_mode = MAY_OPEN | ACC_MODE(flags);
+	op->open_flag = flags;
 
 	/* O_TRUNC implies we need access checks for write permissions */
 	if (flags & O_TRUNC)
@@ -926,7 +946,8 @@ static inline int build_open_flags(int flags, int mode, struct open_flags *op)
 
 	op->acc_mode = acc_mode;
 
-	op->intent = LOOKUP_OPEN;
+	op->intent = flags & O_PATH ? 0 : LOOKUP_OPEN;
+
 	if (flags & O_CREAT) {
 		op->intent |= LOOKUP_CREATE;
 		if (flags & O_EXCL)
@@ -1053,8 +1074,10 @@ int filp_close(struct file *filp, fl_owner_t id)
 	if (filp->f_op && filp->f_op->flush)
 		retval = filp->f_op->flush(filp, id);
 
-	dnotify_flush(filp, id);
-	locks_remove_posix(filp, id);
+	if (likely(!(filp->f_mode & FMODE_PATH))) {
+		dnotify_flush(filp, id);
+		locks_remove_posix(filp, id);
+	}
 	fput(filp);
 	return retval;
 }
diff --git a/include/asm-generic/fcntl.h b/include/asm-generic/fcntl.h
index 0fc16e3f0bfc..84793c7025e2 100644
--- a/include/asm-generic/fcntl.h
+++ b/include/asm-generic/fcntl.h
@@ -80,6 +80,10 @@
 #define O_SYNC		(__O_SYNC|O_DSYNC)
 #endif
 
+#ifndef O_PATH
+#define O_PATH		010000000
+#endif
+
 #ifndef O_NDELAY
 #define O_NDELAY	O_NONBLOCK
 #endif
diff --git a/include/linux/file.h b/include/linux/file.h
index e85baebf6279..21a79958541c 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -29,6 +29,8 @@ static inline void fput_light(struct file *file, int fput_needed)
 
 extern struct file *fget(unsigned int fd);
 extern struct file *fget_light(unsigned int fd, int *fput_needed);
+extern struct file *fget_raw(unsigned int fd);
+extern struct file *fget_raw_light(unsigned int fd, int *fput_needed);
 extern void set_close_on_exec(unsigned int fd, int flag);
 extern void put_filp(struct file *);
 extern int alloc_fd(unsigned start, unsigned flags);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f2143e0942c2..13df14e2c42e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -102,6 +102,9 @@ struct inodes_stat_t {
 /* File is huge (eg. /dev/kmem): treat loff_t as unsigned */
 #define FMODE_UNSIGNED_OFFSET	((__force fmode_t)0x2000)
 
+/* File is opened with O_PATH; almost nothing can be done with it */
+#define FMODE_PATH		((__force fmode_t)0x4000)
+
 /* File was opened by fanotify and shouldn't generate fanotify events */
 #define FMODE_NONOTIFY		((__force fmode_t)0x1000000)
 
-- 
cgit v1.2.3


From bcda76524cd1fa32af748536f27f674a13e56700 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 13 Mar 2011 16:42:14 -0400
Subject: Allow O_PATH for symlinks

At that point we can't do almost nothing with them.  They can be opened
with O_PATH, we can manipulate such descriptors with dup(), etc. and
we can see them in /proc/*/{fd,fdinfo}/*.

We can't (and won't be able to) follow /proc/*/fd/* symlinks for those;
there's simply not enough information for pathname resolution to go on
from such point - to resolve a symlink we need to know which directory
does it live in.

We will be able to do useful things with them after the next commit, though -
readlinkat() and fchownat() will be possible to use with dfd being an
O_PATH-opened symlink and empty relative pathname.  Combined with
open_by_handle() it'll give us a way to do realink-by-handle and
lchown-by-handle without messing with more redundant syscalls.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index e1d9f90d9776..9d4f32700179 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -766,8 +766,14 @@ __do_follow_link(const struct path *link, struct nameidata *nd, void **p)
 		error = 0;
 		if (s)
 			error = __vfs_follow_link(nd, s);
-		else if (nd->last_type == LAST_BIND)
+		else if (nd->last_type == LAST_BIND) {
 			nd->flags |= LOOKUP_JUMPED;
+			if (nd->path.dentry->d_inode->i_op->follow_link) {
+				/* stepped on a _really_ weird one */
+				path_put(&nd->path);
+				error = -ELOOP;
+			}
+		}
 	}
 	return error;
 }
@@ -1954,6 +1960,10 @@ static int may_open(struct path *path, int acc_mode, int flag)
 	struct inode *inode = dentry->d_inode;
 	int error;
 
+	/* O_PATH? */
+	if (!acc_mode)
+		return 0;
+
 	if (!inode)
 		return -ENOENT;
 
@@ -2056,7 +2066,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 	int open_flag = op->open_flag;
 	int will_truncate = open_flag & O_TRUNC;
 	int want_write = 0;
-	int skip_perm = 0;
+	int acc_mode = op->acc_mode;
 	struct file *filp;
 	struct inode *inode;
 	int error;
@@ -2095,8 +2105,11 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 	}
 
 	if (!(open_flag & O_CREAT)) {
+		int symlink_ok = 0;
 		if (nd->last.name[nd->last.len])
 			nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
+		if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW))
+			symlink_ok = 1;
 		/* we _can_ be in RCU mode here */
 		error = do_lookup(nd, &nd->last, path, &inode);
 		if (error) {
@@ -2108,7 +2121,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 			terminate_walk(nd);
 			return ERR_PTR(-ENOENT);
 		}
-		if (unlikely(inode->i_op->follow_link)) {
+		if (unlikely(inode->i_op->follow_link && !symlink_ok)) {
 			/* We drop rcu-walk here */
 			if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry))
 				return ERR_PTR(-ECHILD);
@@ -2175,7 +2188,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 		/* Don't check for write permission, don't truncate */
 		open_flag &= ~O_TRUNC;
 		will_truncate = 0;
-		skip_perm = 1;
+		acc_mode = MAY_OPEN;
 		error = security_path_mknod(&nd->path, dentry, mode, 0);
 		if (error)
 			goto exit_mutex_unlock;
@@ -2225,7 +2238,7 @@ ok:
 		want_write = 1;
 	}
 common:
-	error = may_open(&nd->path, skip_perm ? 0 : op->acc_mode, open_flag);
+	error = may_open(&nd->path, acc_mode, open_flag);
 	if (error)
 		goto exit;
 	filp = nameidata_to_filp(nd);
@@ -2358,7 +2371,7 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
 
 	flags |= LOOKUP_ROOT;
 
-	if (dentry->d_inode->i_op->follow_link)
+	if (dentry->d_inode->i_op->follow_link && op->intent & LOOKUP_OPEN)
 		return ERR_PTR(-ELOOP);
 
 	file = path_openat(-1, name, &nd, op, flags | LOOKUP_RCU);
-- 
cgit v1.2.3


From 65cfc6722361570bfe255698d9cd4dccaf47570d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 13 Mar 2011 15:56:26 -0400
Subject: readlinkat(), fchownat() and fstatat() with empty relative pathnames

For readlinkat() we simply allow empty pathname; it will fail unless
we have dfd equal to O_PATH-opened symlink, so we are outside of
POSIX scope here.  For fchownat() and fstatat() we allow AT_EMPTY_PATH;
let the caller explicitly ask for such behaviour.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/open.c | 10 ++++++----
 fs/stat.c |  7 +++++--
 2 files changed, 11 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/open.c b/fs/open.c
index 14a51de01f54..3cac0bda46df 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -573,13 +573,15 @@ SYSCALL_DEFINE5(fchownat, int, dfd, const char __user *, filename, uid_t, user,
 {
 	struct path path;
 	int error = -EINVAL;
-	int follow;
+	int lookup_flags;
 
-	if ((flag & ~AT_SYMLINK_NOFOLLOW) != 0)
+	if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_EMPTY_PATH)) != 0)
 		goto out;
 
-	follow = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
-	error = user_path_at(dfd, filename, follow, &path);
+	lookup_flags = (flag & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW;
+	if (flag & AT_EMPTY_PATH)
+		lookup_flags |= LOOKUP_EMPTY;
+	error = user_path_at(dfd, filename, lookup_flags, &path);
 	if (error)
 		goto out;
 	error = mnt_want_write(path.mnt);
diff --git a/fs/stat.c b/fs/stat.c
index d5c61cf2b703..961039121cb8 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -75,13 +75,16 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
 	int error = -EINVAL;
 	int lookup_flags = 0;
 
-	if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT)) != 0)
+	if ((flag & ~(AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT |
+		      AT_EMPTY_PATH)) != 0)
 		goto out;
 
 	if (!(flag & AT_SYMLINK_NOFOLLOW))
 		lookup_flags |= LOOKUP_FOLLOW;
 	if (flag & AT_NO_AUTOMOUNT)
 		lookup_flags |= LOOKUP_NO_AUTOMOUNT;
+	if (flag & AT_EMPTY_PATH)
+		lookup_flags |= LOOKUP_EMPTY;
 
 	error = user_path_at(dfd, filename, lookup_flags, &path);
 	if (error)
@@ -297,7 +300,7 @@ SYSCALL_DEFINE4(readlinkat, int, dfd, const char __user *, pathname,
 	if (bufsiz <= 0)
 		return -EINVAL;
 
-	error = user_path_at(dfd, pathname, 0, &path);
+	error = user_path_at(dfd, pathname, LOOKUP_EMPTY, &path);
 	if (!error) {
 		struct inode *inode = path.dentry->d_inode;
 
-- 
cgit v1.2.3


From 326be7b484843988afe57566b627fb7a70beac56 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 13 Mar 2011 17:08:22 -0400
Subject: Allow passing O_PATH descriptors via SCM_RIGHTS datagrams

Just need to make sure that AF_UNIX garbage collector won't
confuse O_PATHed socket on filesystem for real AF_UNIX opened
socket.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/file_table.c    | 2 ++
 net/core/scm.c     | 2 +-
 net/unix/garbage.c | 2 +-
 3 files changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/file_table.c b/fs/file_table.c
index 3c16e1ca163e..74a9544ac770 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -305,6 +305,8 @@ struct file *fget_raw(unsigned int fd)
 	return file;
 }
 
+EXPORT_SYMBOL(fget_raw);
+
 /*
  * Lightweight file lookup - no refcnt increment if fd table isn't shared.
  *
diff --git a/net/core/scm.c b/net/core/scm.c
index bbe454450801..4c1ef026d695 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -95,7 +95,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
 		int fd = fdp[i];
 		struct file *file;
 
-		if (fd < 0 || !(file = fget(fd)))
+		if (fd < 0 || !(file = fget_raw(fd)))
 			return -EBADF;
 		*fpp++ = file;
 		fpl->count++;
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index f89f83bf828e..b6f4b994eb35 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -104,7 +104,7 @@ struct sock *unix_get_socket(struct file *filp)
 	/*
 	 *	Socket ?
 	 */
-	if (S_ISSOCK(inode->i_mode)) {
+	if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) {
 		struct socket *sock = SOCKET_I(inode);
 		struct sock *s = sock->sk;
 
-- 
cgit v1.2.3


From 11a7b371b64ef39fc5fb1b6f2218eef7c4d035e3 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sat, 29 Jan 2011 18:43:42 +0530
Subject: fs: allow AT_EMPTY_PATH in linkat(), limit that to
 CAP_DAC_READ_SEARCH

We don't want to allow creation of private hardlinks by different application
using the fd passed to them via SCM_RIGHTS. So limit the null relative name
usage in linkat syscall to CAP_DAC_READ_SEARCH

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
---
 fs/namei.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 9d4f32700179..c9b7f5b7e92a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2945,15 +2945,27 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
 	struct dentry *new_dentry;
 	struct nameidata nd;
 	struct path old_path;
+	int how = 0;
 	int error;
 	char *to;
 
-	if ((flags & ~AT_SYMLINK_FOLLOW) != 0)
+	if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
 		return -EINVAL;
+	/*
+	 * To use null names we require CAP_DAC_READ_SEARCH
+	 * This ensures that not everyone will be able to create
+	 * handlink using the passed filedescriptor.
+	 */
+	if (flags & AT_EMPTY_PATH) {
+		if (!capable(CAP_DAC_READ_SEARCH))
+			return -ENOENT;
+		how = LOOKUP_EMPTY;
+	}
+
+	if (flags & AT_SYMLINK_FOLLOW)
+		how |= LOOKUP_FOLLOW;
 
-	error = user_path_at(olddfd, oldname,
-			     flags & AT_SYMLINK_FOLLOW ? LOOKUP_FOLLOW : 0,
-			     &old_path);
+	error = user_path_at(olddfd, oldname, how, &old_path);
 	if (error)
 		return error;
 
-- 
cgit v1.2.3


From ce57dfc1791221ef58b6d6b8f5437fccefc4e187 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 13 Mar 2011 19:58:58 -0400
Subject: pull handling of one pathname component into a helper

new helper: walk_component().  Handles everything except symlinks;
returns negative on error, 0 on success and 1 on symlinks we decided
to follow.  Drops out of RCU mode on such symlinks.

link_path_walk() and do_last() switched to using that.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 123 +++++++++++++++++++++++++++----------------------------------
 1 file changed, 55 insertions(+), 68 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index c9b7f5b7e92a..549bbe2f25c6 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -785,16 +785,11 @@ __do_follow_link(const struct path *link, struct nameidata *nd, void **p)
  * Without that kind of total limit, nasty chains of consecutive
  * symlinks can cause almost arbitrarily long lookups. 
  */
-static inline int do_follow_link(struct inode *inode, struct path *path, struct nameidata *nd)
+static inline int do_follow_link(struct path *path, struct nameidata *nd)
 {
 	void *cookie;
 	int err = -ELOOP;
 
-	/* We drop rcu-walk here */
-	if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry))
-		return -ECHILD;
-	BUG_ON(inode != path->dentry->d_inode);
-
 	if (current->link_count >= MAX_NESTED_LINKS)
 		goto loop;
 	if (current->total_link_count >= 40)
@@ -1337,6 +1332,39 @@ static void terminate_walk(struct nameidata *nd)
 	}
 }
 
+static inline int walk_component(struct nameidata *nd, struct path *path,
+		struct qstr *name, int type, int follow)
+{
+	struct inode *inode;
+	int err;
+	/*
+	 * "." and ".." are special - ".." especially so because it has
+	 * to be able to know about the current root directory and
+	 * parent relationships.
+	 */
+	if (unlikely(type != LAST_NORM))
+		return handle_dots(nd, type);
+	err = do_lookup(nd, name, path, &inode);
+	if (unlikely(err)) {
+		terminate_walk(nd);
+		return err;
+	}
+	if (!inode) {
+		path_to_nameidata(path, nd);
+		terminate_walk(nd);
+		return -ENOENT;
+	}
+	if (unlikely(inode->i_op->follow_link) && follow) {
+		if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry))
+			return -ECHILD;
+		BUG_ON(inode != path->dentry->d_inode);
+		return 1;
+	}
+	path_to_nameidata(path, nd);
+	nd->inode = inode;
+	return 0;
+}
+
 /*
  * Name resolution.
  * This is the basic name resolution function, turning a pathname into
@@ -1361,7 +1389,6 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 
 	/* At this point we know we have a real path component. */
 	for(;;) {
-		struct inode *inode;
 		unsigned long hash;
 		struct qstr this;
 		unsigned int c;
@@ -1414,34 +1441,16 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		if (!*name)
 			goto last_with_slashes;
 
-		/*
-		 * "." and ".." are special - ".." especially so because it has
-		 * to be able to know about the current root directory and
-		 * parent relationships.
-		 */
-		if (unlikely(type != LAST_NORM)) {
-			if (handle_dots(nd, type))
-				return -ECHILD;
-			continue;
-		}
-
-		/* This does the actual lookups.. */
-		err = do_lookup(nd, &this, &next, &inode);
-		if (err)
-			break;
+		err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW);
+		if (err < 0)
+			return err;
 
-		if (inode && inode->i_op->follow_link) {
-			err = do_follow_link(inode, &next, nd);
+		if (err) {
+			err = do_follow_link(&next, nd);
 			if (err)
 				return err;
 			nd->inode = nd->path.dentry->d_inode;
-		} else {
-			path_to_nameidata(&next, nd);
-			nd->inode = inode;
 		}
-		err = -ENOENT;
-		if (!nd->inode)
-			break;
 		err = -ENOTDIR; 
 		if (!nd->inode->i_op->lookup)
 			break;
@@ -1453,36 +1462,27 @@ last_with_slashes:
 last_component:
 		/* Clear LOOKUP_CONTINUE iff it was previously unset */
 		nd->flags &= lookup_flags | ~LOOKUP_CONTINUE;
-		if (lookup_flags & LOOKUP_PARENT)
-			goto lookup_parent;
-		if (unlikely(type != LAST_NORM))
-			return handle_dots(nd, type);
-		err = do_lookup(nd, &this, &next, &inode);
-		if (err)
-			break;
-		if (inode && unlikely(inode->i_op->follow_link) &&
-		    (lookup_flags & LOOKUP_FOLLOW)) {
-			err = do_follow_link(inode, &next, nd);
+		if (lookup_flags & LOOKUP_PARENT) {
+			nd->last = this;
+			nd->last_type = type;
+			return 0;
+		}
+		err = walk_component(nd, &next, &this, type,
+					lookup_flags & LOOKUP_FOLLOW);
+		if (err < 0)
+			return err;
+		if (err) {
+			err = do_follow_link(&next, nd);
 			if (err)
 				return err;
 			nd->inode = nd->path.dentry->d_inode;
-		} else {
-			path_to_nameidata(&next, nd);
-			nd->inode = inode;
 		}
-		err = -ENOENT;
-		if (!nd->inode)
-			break;
 		if (lookup_flags & LOOKUP_DIRECTORY) {
 			err = -ENOTDIR; 
 			if (!nd->inode->i_op->lookup)
 				break;
 		}
 		return 0;
-lookup_parent:
-		nd->last = this;
-		nd->last_type = type;
-		return 0;
 	}
 	terminate_walk(nd);
 	return err;
@@ -2068,7 +2068,6 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 	int want_write = 0;
 	int acc_mode = op->acc_mode;
 	struct file *filp;
-	struct inode *inode;
 	int error;
 
 	nd->flags &= ~LOOKUP_PARENT;
@@ -2111,24 +2110,12 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 		if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW))
 			symlink_ok = 1;
 		/* we _can_ be in RCU mode here */
-		error = do_lookup(nd, &nd->last, path, &inode);
-		if (error) {
-			terminate_walk(nd);
+		error = walk_component(nd, path, &nd->last, LAST_NORM,
+					!symlink_ok);
+		if (error < 0)
 			return ERR_PTR(error);
-		}
-		if (!inode) {
-			path_to_nameidata(path, nd);
-			terminate_walk(nd);
-			return ERR_PTR(-ENOENT);
-		}
-		if (unlikely(inode->i_op->follow_link && !symlink_ok)) {
-			/* We drop rcu-walk here */
-			if (nameidata_dentry_drop_rcu_maybe(nd, path->dentry))
-				return ERR_PTR(-ECHILD);
+		if (error) /* symlink */
 			return NULL;
-		}
-		path_to_nameidata(path, nd);
-		nd->inode = inode;
 		/* sayonara */
 		if (nd->flags & LOOKUP_RCU) {
 			if (nameidata_drop_rcu_last(nd))
@@ -2137,7 +2124,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
 
 		error = -ENOTDIR;
 		if (nd->flags & LOOKUP_DIRECTORY) {
-			if (!inode->i_op->lookup)
+			if (!nd->inode->i_op->lookup)
 				goto exit;
 		}
 		audit_inode(pathname, nd->path.dentry);
-- 
cgit v1.2.3


From b21041d0f72899ed815bd2cbf7275339c74737b6 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 14 Mar 2011 20:01:51 -0400
Subject: update nd->inode in __do_follow_link() instead of after
 do_follow_link()

... and note that we only need to do it for LAST_BIND symlinks

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 549bbe2f25c6..9e7b18a8be66 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -768,7 +768,8 @@ __do_follow_link(const struct path *link, struct nameidata *nd, void **p)
 			error = __vfs_follow_link(nd, s);
 		else if (nd->last_type == LAST_BIND) {
 			nd->flags |= LOOKUP_JUMPED;
-			if (nd->path.dentry->d_inode->i_op->follow_link) {
+			nd->inode = nd->path.dentry->d_inode;
+			if (nd->inode->i_op->follow_link) {
 				/* stepped on a _really_ weird one */
 				path_put(&nd->path);
 				error = -ELOOP;
@@ -1449,7 +1450,6 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 			err = do_follow_link(&next, nd);
 			if (err)
 				return err;
-			nd->inode = nd->path.dentry->d_inode;
 		}
 		err = -ENOTDIR; 
 		if (!nd->inode->i_op->lookup)
@@ -1475,7 +1475,6 @@ last_component:
 			err = do_follow_link(&next, nd);
 			if (err)
 				return err;
-			nd->inode = nd->path.dentry->d_inode;
 		}
 		if (lookup_flags & LOOKUP_DIRECTORY) {
 			err = -ENOTDIR; 
-- 
cgit v1.2.3


From bd92d7fed877ed1e6997e4f3f13dbcd872947653 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 14 Mar 2011 19:54:59 -0400
Subject: Make trailing symlink resolution in path_lookupat() iterative

Now the only caller of link_path_walk() that does *not* pass
LOOKUP_PARENT is do_follow_link()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 53 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 9e7b18a8be66..a3431639e166 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1588,12 +1588,23 @@ out_fail:
 	return retval;
 }
 
+static inline int lookup_last(struct nameidata *nd, struct path *path)
+{
+	if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
+		nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
+
+	nd->flags &= ~LOOKUP_PARENT;
+	return walk_component(nd, path, &nd->last, nd->last_type,
+					nd->flags & LOOKUP_FOLLOW);
+}
+
 /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
 static int path_lookupat(int dfd, const char *name,
 				unsigned int flags, struct nameidata *nd)
 {
 	struct file *base = NULL;
-	int retval;
+	struct path path;
+	int err;
 
 	/*
 	 * Path walking is largely split up into 2 different synchronisation
@@ -1609,23 +1620,55 @@ static int path_lookupat(int dfd, const char *name,
 	 * be handled by restarting a traditional ref-walk (which will always
 	 * be able to complete).
 	 */
-	retval = path_init(dfd, name, flags, nd, &base);
+	err = path_init(dfd, name, flags | LOOKUP_PARENT, nd, &base);
 
-	if (unlikely(retval))
-		return retval;
+	if (unlikely(err))
+		return err;
 
 	current->total_link_count = 0;
-	retval = link_path_walk(name, nd);
+	err = link_path_walk(name, nd);
+
+	if (!err && !(flags & LOOKUP_PARENT)) {
+		int count = 0;
+		err = lookup_last(nd, &path);
+		while (err > 0) {
+			void *cookie;
+			struct path link = path;
+			struct inode *inode = link.dentry->d_inode;
+
+			if (count++ > 32) {
+				path_put_conditional(&path, nd);
+				path_put(&nd->path);
+				err = -ELOOP;
+				break;
+			}
+			cond_resched();
+			nd->flags |= LOOKUP_PARENT;
+			err = __do_follow_link(&link, nd, &cookie);
+			if (!err)
+				err = lookup_last(nd, &path);
+			if (!IS_ERR(cookie) && inode->i_op->put_link)
+				inode->i_op->put_link(link.dentry, nd, cookie);
+			path_put(&link);
+		}
+	}
 
 	if (nd->flags & LOOKUP_RCU) {
 		/* went all way through without dropping RCU */
-		BUG_ON(retval);
+		BUG_ON(err);
 		if (nameidata_drop_rcu_last(nd))
-			retval = -ECHILD;
+			err = -ECHILD;
 	}
 
-	if (!retval)
-		retval = handle_reval_path(nd);
+	if (!err)
+		err = handle_reval_path(nd);
+
+	if (!err && nd->flags & LOOKUP_DIRECTORY) {
+		if (!nd->inode->i_op->lookup) {
+			path_put(&nd->path);
+			return -ENOTDIR;
+		}
+	}
 
 	if (base)
 		fput(base);
@@ -1634,7 +1677,7 @@ static int path_lookupat(int dfd, const char *name,
 		path_put(&nd->root);
 		nd->root.mnt = NULL;
 	}
-	return retval;
+	return err;
 }
 
 static int do_path_lookup(int dfd, const char *name,
-- 
cgit v1.2.3


From ce0525449da56444948c368f52e10f3db0465338 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 14 Mar 2011 21:28:04 -0400
Subject: simplify link_path_walk() tail

Now that link_path_walk() is called without LOOKUP_PARENT
only from do_follow_link(), we can simplify the checks in
last component handling.  First of all, checking if we'd
arrived to a directory is not needed - the caller will check
it anyway.  And LOOKUP_FOLLOW is guaranteed to be there,
since we only get to that place with nd->depth > 0.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index a3431639e166..9575d0039699 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1467,8 +1467,7 @@ last_component:
 			nd->last_type = type;
 			return 0;
 		}
-		err = walk_component(nd, &next, &this, type,
-					lookup_flags & LOOKUP_FOLLOW);
+		err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW);
 		if (err < 0)
 			return err;
 		if (err) {
@@ -1476,11 +1475,6 @@ last_component:
 			if (err)
 				return err;
 		}
-		if (lookup_flags & LOOKUP_DIRECTORY) {
-			err = -ENOTDIR; 
-			if (!nd->inode->i_op->lookup)
-				break;
-		}
 		return 0;
 	}
 	terminate_walk(nd);
-- 
cgit v1.2.3


From b356379a020bb7197603118bb1cbc903963aa198 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 14 Mar 2011 21:54:55 -0400
Subject: Turn resolution of trailing symlinks iterative everywhere

The last remaining place (resolution of nested symlink) converted
to the loop of the same kind we have in path_lookupat() and
path_openat().

Note that we still *do* have a recursion in pathname resolution;
can't avoid it, really.  However, it's strictly for nested symlinks
now - i.e. ones in the middle of a pathname.

link_path_walk() has lost the tail now - it always walks everything
except the last component.

do_follow_link() renamed to nested_symlink() and moved down.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 104 +++++++++++++++++++++++++++++--------------------------------
 1 file changed, 50 insertions(+), 54 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 9575d0039699..017c3fa3a08e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -779,40 +779,6 @@ __do_follow_link(const struct path *link, struct nameidata *nd, void **p)
 	return error;
 }
 
-/*
- * This limits recursive symlink follows to 8, while
- * limiting consecutive symlinks to 40.
- *
- * Without that kind of total limit, nasty chains of consecutive
- * symlinks can cause almost arbitrarily long lookups. 
- */
-static inline int do_follow_link(struct path *path, struct nameidata *nd)
-{
-	void *cookie;
-	int err = -ELOOP;
-
-	if (current->link_count >= MAX_NESTED_LINKS)
-		goto loop;
-	if (current->total_link_count >= 40)
-		goto loop;
-	BUG_ON(nd->depth >= MAX_NESTED_LINKS);
-	cond_resched();
-	current->link_count++;
-	current->total_link_count++;
-	nd->depth++;
-	err = __do_follow_link(path, nd, &cookie);
-	if (!IS_ERR(cookie) && path->dentry->d_inode->i_op->put_link)
-		path->dentry->d_inode->i_op->put_link(path->dentry, nd, cookie);
-	path_put(path);
-	current->link_count--;
-	nd->depth--;
-	return err;
-loop:
-	path_put_conditional(path, nd);
-	path_put(&nd->path);
-	return err;
-}
-
 static int follow_up_rcu(struct path *path)
 {
 	struct vfsmount *parent;
@@ -1366,6 +1332,52 @@ static inline int walk_component(struct nameidata *nd, struct path *path,
 	return 0;
 }
 
+/*
+ * This limits recursive symlink follows to 8, while
+ * limiting consecutive symlinks to 40.
+ *
+ * Without that kind of total limit, nasty chains of consecutive
+ * symlinks can cause almost arbitrarily long lookups.
+ */
+static inline int nested_symlink(struct path *path, struct nameidata *nd)
+{
+	int res;
+
+	BUG_ON(nd->depth >= MAX_NESTED_LINKS);
+	if (unlikely(current->link_count >= MAX_NESTED_LINKS)) {
+		path_put_conditional(path, nd);
+		path_put(&nd->path);
+		return -ELOOP;
+	}
+
+	nd->depth++;
+	current->link_count++;
+
+	do {
+		struct path link = *path;
+		void *cookie;
+		if (unlikely(current->total_link_count >= 40)) {
+			path_put_conditional(path, nd);
+			path_put(&nd->path);
+			res = -ELOOP;
+			break;
+		}
+		cond_resched();
+		current->total_link_count++;
+		res = __do_follow_link(&link, nd, &cookie);
+		if (!res)
+			res = walk_component(nd, path, &nd->last,
+					     nd->last_type, LOOKUP_FOLLOW);
+		if (!IS_ERR(cookie) && link.dentry->d_inode->i_op->put_link)
+			link.dentry->d_inode->i_op->put_link(link.dentry, nd, cookie);
+		path_put(&link);
+	} while (res > 0);
+
+	current->link_count--;
+	nd->depth--;
+	return res;
+}
+
 /*
  * Name resolution.
  * This is the basic name resolution function, turning a pathname into
@@ -1385,9 +1397,6 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 	if (!*name)
 		return 0;
 
-	if (nd->depth)
-		lookup_flags = LOOKUP_FOLLOW | (nd->flags & LOOKUP_CONTINUE);
-
 	/* At this point we know we have a real path component. */
 	for(;;) {
 		unsigned long hash;
@@ -1440,14 +1449,14 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 			goto last_component;
 		while (*++name == '/');
 		if (!*name)
-			goto last_with_slashes;
+			goto last_component;
 
 		err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW);
 		if (err < 0)
 			return err;
 
 		if (err) {
-			err = do_follow_link(&next, nd);
+			err = nested_symlink(&next, nd);
 			if (err)
 				return err;
 		}
@@ -1457,24 +1466,11 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 		continue;
 		/* here ends the main loop */
 
-last_with_slashes:
-		lookup_flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
 last_component:
 		/* Clear LOOKUP_CONTINUE iff it was previously unset */
 		nd->flags &= lookup_flags | ~LOOKUP_CONTINUE;
-		if (lookup_flags & LOOKUP_PARENT) {
-			nd->last = this;
-			nd->last_type = type;
-			return 0;
-		}
-		err = walk_component(nd, &next, &this, type, LOOKUP_FOLLOW);
-		if (err < 0)
-			return err;
-		if (err) {
-			err = do_follow_link(&next, nd);
-			if (err)
-				return err;
-		}
+		nd->last = this;
+		nd->last_type = type;
 		return 0;
 	}
 	terminate_walk(nd);
-- 
cgit v1.2.3


From 574197e0de46a8a4db5c54ef7b65e43ffa8873a7 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 14 Mar 2011 22:20:34 -0400
Subject: tidy the trailing symlinks traversal up

* pull the handling of current->total_link_count into
__do_follow_link()
* put the common "do ->put_link() if needed and path_put() the link"
  stuff into a helper (put_link(nd, link, cookie))
* rename __do_follow_link() to follow_link(), while we are at it

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 71 +++++++++++++++++++++++---------------------------------------
 1 file changed, 26 insertions(+), 45 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 017c3fa3a08e..0a601cae23de 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -737,14 +737,31 @@ static inline void path_to_nameidata(const struct path *path,
 	nd->path.dentry = path->dentry;
 }
 
+static inline void put_link(struct nameidata *nd, struct path *link, void *cookie)
+{
+	struct inode *inode = link->dentry->d_inode;
+	if (!IS_ERR(cookie) && inode->i_op->put_link)
+		inode->i_op->put_link(link->dentry, nd, cookie);
+	path_put(link);
+}
+
 static __always_inline int
-__do_follow_link(const struct path *link, struct nameidata *nd, void **p)
+follow_link(struct path *link, struct nameidata *nd, void **p)
 {
 	int error;
 	struct dentry *dentry = link->dentry;
 
 	BUG_ON(nd->flags & LOOKUP_RCU);
 
+	if (unlikely(current->total_link_count >= 40)) {
+		*p = ERR_PTR(-ELOOP); /* no ->put_link(), please */
+		path_put_conditional(link, nd);
+		path_put(&nd->path);
+		return -ELOOP;
+	}
+	cond_resched();
+	current->total_link_count++;
+
 	touch_atime(link->mnt, dentry);
 	nd_set_link(nd, NULL);
 
@@ -1356,21 +1373,12 @@ static inline int nested_symlink(struct path *path, struct nameidata *nd)
 	do {
 		struct path link = *path;
 		void *cookie;
-		if (unlikely(current->total_link_count >= 40)) {
-			path_put_conditional(path, nd);
-			path_put(&nd->path);
-			res = -ELOOP;
-			break;
-		}
-		cond_resched();
-		current->total_link_count++;
-		res = __do_follow_link(&link, nd, &cookie);
+
+		res = follow_link(&link, nd, &cookie);
 		if (!res)
 			res = walk_component(nd, path, &nd->last,
 					     nd->last_type, LOOKUP_FOLLOW);
-		if (!IS_ERR(cookie) && link.dentry->d_inode->i_op->put_link)
-			link.dentry->d_inode->i_op->put_link(link.dentry, nd, cookie);
-		path_put(&link);
+		put_link(nd, &link, cookie);
 	} while (res > 0);
 
 	current->link_count--;
@@ -1619,27 +1627,15 @@ static int path_lookupat(int dfd, const char *name,
 	err = link_path_walk(name, nd);
 
 	if (!err && !(flags & LOOKUP_PARENT)) {
-		int count = 0;
 		err = lookup_last(nd, &path);
 		while (err > 0) {
 			void *cookie;
 			struct path link = path;
-			struct inode *inode = link.dentry->d_inode;
-
-			if (count++ > 32) {
-				path_put_conditional(&path, nd);
-				path_put(&nd->path);
-				err = -ELOOP;
-				break;
-			}
-			cond_resched();
 			nd->flags |= LOOKUP_PARENT;
-			err = __do_follow_link(&link, nd, &cookie);
+			err = follow_link(&link, nd, &cookie);
 			if (!err)
 				err = lookup_last(nd, &path);
-			if (!IS_ERR(cookie) && inode->i_op->put_link)
-				inode->i_op->put_link(link.dentry, nd, cookie);
-			path_put(&link);
+			put_link(nd, &link, cookie);
 		}
 	}
 
@@ -2298,7 +2294,6 @@ static struct file *path_openat(int dfd, const char *pathname,
 	struct file *base = NULL;
 	struct file *filp;
 	struct path path;
-	int count = 0;
 	int error;
 
 	filp = get_empty_filp();
@@ -2322,35 +2317,21 @@ static struct file *path_openat(int dfd, const char *pathname,
 	filp = do_last(nd, &path, op, pathname);
 	while (unlikely(!filp)) { /* trailing symlink */
 		struct path link = path;
-		struct inode *linki = link.dentry->d_inode;
 		void *cookie;
-		if (!(nd->flags & LOOKUP_FOLLOW) || count++ == 32) {
+		if (!(nd->flags & LOOKUP_FOLLOW)) {
 			path_put_conditional(&path, nd);
 			path_put(&nd->path);
 			filp = ERR_PTR(-ELOOP);
 			break;
 		}
-		/*
-		 * This is subtle. Instead of calling do_follow_link() we do
-		 * the thing by hands. The reason is that this way we have zero
-		 * link_count and path_walk() (called from ->follow_link)
-		 * honoring LOOKUP_PARENT.  After that we have the parent and
-		 * last component, i.e. we are in the same situation as after
-		 * the first path_walk().  Well, almost - if the last component
-		 * is normal we get its copy stored in nd->last.name and we will
-		 * have to putname() it when we are done. Procfs-like symlinks
-		 * just set LAST_BIND.
-		 */
 		nd->flags |= LOOKUP_PARENT;
 		nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
-		error = __do_follow_link(&link, nd, &cookie);
+		error = follow_link(&link, nd, &cookie);
 		if (unlikely(error))
 			filp = ERR_PTR(error);
 		else
 			filp = do_last(nd, &path, op, pathname);
-		if (!IS_ERR(cookie) && linki->i_op->put_link)
-			linki->i_op->put_link(link.dentry, nd, cookie);
-		path_put(&link);
+		put_link(nd, &link, cookie);
 	}
 out:
 	if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT))
-- 
cgit v1.2.3


From c826cb7dfce80512c26c984350077a25046bd215 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 15 Mar 2011 15:29:21 -0700
Subject: dcache.c: create helper function for duplicated functionality

This creates a helper function for he "try to ascend into the parent
directory" case, which was written out in triplicate before.  With all
the locking and subtle sequence number stuff, we really don't want to
duplicate that kind of code.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/dcache.c | 88 ++++++++++++++++++++++++++-----------------------------------
 1 file changed, 37 insertions(+), 51 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 611ffe928c03..361882a14ccb 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1011,6 +1011,34 @@ void shrink_dcache_for_umount(struct super_block *sb)
 	}
 }
 
+/*
+ * This tries to ascend one level of parenthood, but
+ * we can race with renaming, so we need to re-check
+ * the parenthood after dropping the lock and check
+ * that the sequence number still matches.
+ */
+static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq)
+{
+	struct dentry *new = old->d_parent;
+
+	rcu_read_lock();
+	spin_unlock(&old->d_lock);
+	spin_lock(&new->d_lock);
+
+	/*
+	 * might go back up the wrong parent if we have had a rename
+	 * or deletion
+	 */
+	if (new != old->d_parent ||
+		 (!locked && read_seqretry(&rename_lock, seq))) {
+		spin_unlock(&new->d_lock);
+		new = NULL;
+	}
+	rcu_read_unlock();
+	return new;
+}
+
+
 /*
  * Search for at least 1 mount point in the dentry's subdirs.
  * We descend to the next level whenever the d_subdirs
@@ -1066,24 +1094,10 @@ resume:
 	 * All done at this level ... ascend and resume the search.
 	 */
 	if (this_parent != parent) {
-		struct dentry *tmp;
-		struct dentry *child;
-
-		tmp = this_parent->d_parent;
-		rcu_read_lock();
-		spin_unlock(&this_parent->d_lock);
-		child = this_parent;
-		this_parent = tmp;
-		spin_lock(&this_parent->d_lock);
-		/* might go back up the wrong parent if we have had a rename
-		 * or deletion */
-		if (this_parent != child->d_parent ||
-			 (!locked && read_seqretry(&rename_lock, seq))) {
-			spin_unlock(&this_parent->d_lock);
-			rcu_read_unlock();
+		struct dentry *child = this_parent;
+		this_parent = try_to_ascend(this_parent, locked, seq);
+		if (!this_parent)
 			goto rename_retry;
-		}
-		rcu_read_unlock();
 		next = child->d_u.d_child.next;
 		goto resume;
 	}
@@ -1181,24 +1195,10 @@ resume:
 	 * All done at this level ... ascend and resume the search.
 	 */
 	if (this_parent != parent) {
-		struct dentry *tmp;
-		struct dentry *child;
-
-		tmp = this_parent->d_parent;
-		rcu_read_lock();
-		spin_unlock(&this_parent->d_lock);
-		child = this_parent;
-		this_parent = tmp;
-		spin_lock(&this_parent->d_lock);
-		/* might go back up the wrong parent if we have had a rename
-		 * or deletion */
-		if (this_parent != child->d_parent ||
-			(!locked && read_seqretry(&rename_lock, seq))) {
-			spin_unlock(&this_parent->d_lock);
-			rcu_read_unlock();
+		struct dentry *child = this_parent;
+		this_parent = try_to_ascend(this_parent, locked, seq);
+		if (!this_parent)
 			goto rename_retry;
-		}
-		rcu_read_unlock();
 		next = child->d_u.d_child.next;
 		goto resume;
 	}
@@ -2942,28 +2942,14 @@ resume:
 		spin_unlock(&dentry->d_lock);
 	}
 	if (this_parent != root) {
-		struct dentry *tmp;
-		struct dentry *child;
-
-		tmp = this_parent->d_parent;
+		struct dentry *child = this_parent;
 		if (!(this_parent->d_flags & DCACHE_GENOCIDE)) {
 			this_parent->d_flags |= DCACHE_GENOCIDE;
 			this_parent->d_count--;
 		}
-		rcu_read_lock();
-		spin_unlock(&this_parent->d_lock);
-		child = this_parent;
-		this_parent = tmp;
-		spin_lock(&this_parent->d_lock);
-		/* might go back up the wrong parent if we have had a rename
-		 * or deletion */
-		if (this_parent != child->d_parent ||
-			 (!locked && read_seqretry(&rename_lock, seq))) {
-			spin_unlock(&this_parent->d_lock);
-			rcu_read_unlock();
+		this_parent = try_to_ascend(this_parent, locked, seq);
+		if (!this_parent)
 			goto rename_retry;
-		}
-		rcu_read_unlock();
 		next = child->d_u.d_child.next;
 		goto resume;
 	}
-- 
cgit v1.2.3


From c83ce989cb5ff86575821992ea82c4df5c388ebc Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 15 Mar 2011 13:36:43 -0400
Subject: VFS: Fix the nfs sillyrename regression in kernel 2.6.38

The new vfs locking scheme introduced in 2.6.38 breaks NFS sillyrename
because the latter relies on being able to determine the parent
directory of the dentry in the ->iput() callback in order to send the
appropriate unlink rpc call.

Looking at the code that cares about races with dput(), there doesn't
seem to be anything that specifically uses d_parent as a test for
whether or not there is a race:
  - __d_lookup_rcu(), __d_lookup() all test for d_hashed() after d_parent
  - shrink_dcache_for_umount() is safe since nothing else can rearrange
    the dentries in that super block.
  - have_submount(), select_parent() and d_genocide() can test for a
    deletion if we set the DCACHE_DISCONNECTED flag when the dentry
    is removed from the parent's d_subdirs list.

Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Cc: stable@kernel.org (2.6.38, needs commit c826cb7dfce8 "dcache.c:
	create helper function for duplicated functionality" )
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/dcache.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 361882a14ccb..a39fe47c466f 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -296,8 +296,12 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
 	__releases(parent->d_lock)
 	__releases(dentry->d_inode->i_lock)
 {
-	dentry->d_parent = NULL;
 	list_del(&dentry->d_u.d_child);
+	/*
+	 * Inform try_to_ascend() that we are no longer attached to the
+	 * dentry tree
+	 */
+	dentry->d_flags |= DCACHE_DISCONNECTED;
 	if (parent)
 		spin_unlock(&parent->d_lock);
 	dentry_iput(dentry);
@@ -1030,6 +1034,7 @@ static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq
 	 * or deletion
 	 */
 	if (new != old->d_parent ||
+		 (old->d_flags & DCACHE_DISCONNECTED) ||
 		 (!locked && read_seqretry(&rename_lock, seq))) {
 		spin_unlock(&new->d_lock);
 		new = NULL;
-- 
cgit v1.2.3


From ef6b689b63b9f5227ccee6f16dd9ee3faf58a464 Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Mon, 21 Feb 2011 11:10:44 +0800
Subject: ocfs2: Remove ENTRY from masklog.

ENTRY is used to record the entry of a function.
But because it is added in so many functions, if we enable it,
the system logs get filled up quickly and cause too much I/O.
So actually no one can open it for a production system or even
for a test.

So for mlog_entry_void, we just remove it.
for mlog_entry(...), we replace it with mlog(0,...), and they
will be replace by trace event later.

Signed-off-by: Tao Ma <boyu.mt@taobao.com>
---
 fs/ocfs2/alloc.c             | 28 ++--------------
 fs/ocfs2/aops.c              | 16 ++++-----
 fs/ocfs2/buffer_head_io.c    | 10 +++---
 fs/ocfs2/cluster/heartbeat.c |  2 --
 fs/ocfs2/cluster/masklog.c   |  1 -
 fs/ocfs2/cluster/masklog.h   | 13 +-------
 fs/ocfs2/dcache.c            |  4 +--
 fs/ocfs2/dir.c               | 20 +++---------
 fs/ocfs2/dlm/dlmconvert.c    |  6 ++--
 fs/ocfs2/dlm/dlmdomain.c     |  8 ++---
 fs/ocfs2/dlm/dlmlock.c       | 10 ++----
 fs/ocfs2/dlm/dlmmaster.c     |  6 +---
 fs/ocfs2/dlm/dlmrecovery.c   |  4 ---
 fs/ocfs2/dlm/dlmunlock.c     |  4 +--
 fs/ocfs2/dlmglue.c           | 78 --------------------------------------------
 fs/ocfs2/export.c            | 12 +++----
 fs/ocfs2/extent_map.c        |  8 ++---
 fs/ocfs2/file.c              | 73 ++++++++++++++++++-----------------------
 fs/ocfs2/inode.c             | 28 +++++++---------
 fs/ocfs2/journal.c           | 43 +++++++-----------------
 fs/ocfs2/localalloc.c        | 26 +++------------
 fs/ocfs2/mmap.c              |  2 +-
 fs/ocfs2/namei.c             | 74 ++++++++++++++++++++---------------------
 fs/ocfs2/quota_global.c      | 14 +++-----
 fs/ocfs2/quota_local.c       |  2 +-
 fs/ocfs2/resize.c            |  8 ++---
 fs/ocfs2/slot_map.c          |  2 --
 fs/ocfs2/suballoc.c          | 35 ++++----------------
 fs/ocfs2/super.c             | 32 ++++--------------
 fs/ocfs2/symlink.c           |  6 ----
 fs/ocfs2/xattr.c             |  4 +--
 31 files changed, 161 insertions(+), 418 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index e4984e259cb6..47a6ce84e67f 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -965,8 +965,6 @@ int ocfs2_num_free_extents(struct ocfs2_super *osb,
 	struct buffer_head *eb_bh = NULL;
 	u64 last_eb_blk = 0;
 
-	mlog_entry_void();
-
 	el = et->et_root_el;
 	last_eb_blk = ocfs2_et_get_last_eb_blk(et);
 
@@ -1010,8 +1008,6 @@ static int ocfs2_create_new_meta_bhs(handle_t *handle,
 		OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
 	struct ocfs2_extent_block *eb;
 
-	mlog_entry_void();
-
 	count = 0;
 	while (count < wanted) {
 		status = ocfs2_claim_metadata(handle,
@@ -1173,8 +1169,6 @@ static int ocfs2_add_branch(handle_t *handle,
 	struct ocfs2_extent_list  *el;
 	u32 new_cpos, root_end;
 
-	mlog_entry_void();
-
 	BUG_ON(!last_eb_bh || !*last_eb_bh);
 
 	if (eb_bh) {
@@ -1353,8 +1347,6 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
 	struct ocfs2_extent_list  *root_el;
 	struct ocfs2_extent_list  *eb_el;
 
-	mlog_entry_void();
-
 	status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
 					   &new_eb_bh);
 	if (status < 0) {
@@ -1446,8 +1438,6 @@ static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et,
 	struct buffer_head *bh = NULL;
 	struct buffer_head *lowest_bh = NULL;
 
-	mlog_entry_void();
-
 	*target_bh = NULL;
 
 	el = et->et_root_el;
@@ -5795,8 +5785,8 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
 	struct ocfs2_dinode *di;
 	struct ocfs2_truncate_log *tl;
 
-	mlog_entry("start_blk = %llu, num_clusters = %u\n",
-		   (unsigned long long)start_blk, num_clusters);
+	mlog(0, "start_blk = %llu, num_clusters = %u\n",
+	     (unsigned long long)start_blk, num_clusters);
 
 	BUG_ON(mutex_trylock(&tl_inode->i_mutex));
 
@@ -5878,8 +5868,6 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
 	struct inode *tl_inode = osb->osb_tl_inode;
 	struct buffer_head *tl_bh = osb->osb_tl_bh;
 
-	mlog_entry_void();
-
 	di = (struct ocfs2_dinode *) tl_bh->b_data;
 	tl = &di->id2.i_dealloc;
 	i = le16_to_cpu(tl->tl_used) - 1;
@@ -5949,8 +5937,6 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
 	struct ocfs2_dinode *di;
 	struct ocfs2_truncate_log *tl;
 
-	mlog_entry_void();
-
 	BUG_ON(mutex_trylock(&tl_inode->i_mutex));
 
 	di = (struct ocfs2_dinode *) tl_bh->b_data;
@@ -6032,8 +6018,6 @@ static void ocfs2_truncate_log_worker(struct work_struct *work)
 		container_of(work, struct ocfs2_super,
 			     osb_truncate_log_wq.work);
 
-	mlog_entry_void();
-
 	status = ocfs2_flush_truncate_log(osb);
 	if (status < 0)
 		mlog_errno(status);
@@ -6174,8 +6158,6 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
 	struct inode *tl_inode = osb->osb_tl_inode;
 	struct ocfs2_truncate_log *tl;
 
-	mlog_entry_void();
-
 	if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
 		mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
 		return -EINVAL;
@@ -6228,8 +6210,6 @@ void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
 	int status;
 	struct inode *tl_inode = osb->osb_tl_inode;
 
-	mlog_entry_void();
-
 	if (tl_inode) {
 		cancel_delayed_work(&osb->osb_truncate_log_wq);
 		flush_workqueue(ocfs2_wq);
@@ -6251,8 +6231,6 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
 	struct inode *tl_inode = NULL;
 	struct buffer_head *tl_bh = NULL;
 
-	mlog_entry_void();
-
 	status = ocfs2_get_truncate_log_info(osb,
 					     osb->slot_num,
 					     &tl_inode,
@@ -7005,8 +6983,6 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
 	struct ocfs2_extent_tree et;
 	struct ocfs2_cached_dealloc_ctxt dealloc;
 
-	mlog_entry_void();
-
 	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
 	ocfs2_init_dealloc_ctxt(&dealloc);
 
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 1fbb0e20131b..5dbfc9e45195 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -59,8 +59,8 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	void *kaddr;
 
-	mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
-		   (unsigned long long)iblock, bh_result, create);
+	mlog(0, "(0x%p, %llu, 0x%p, %d)\n", inode,
+	     (unsigned long long)iblock, bh_result, create);
 
 	BUG_ON(ocfs2_inode_is_fast_symlink(inode));
 
@@ -136,8 +136,8 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock,
 	u64 p_blkno, count, past_eof;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
-	mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
-		   (unsigned long long)iblock, bh_result, create);
+	mlog(0, "(0x%p, %llu, 0x%p, %d)\n", inode,
+	     (unsigned long long)iblock, bh_result, create);
 
 	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
 		mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n",
@@ -278,7 +278,7 @@ static int ocfs2_readpage(struct file *file, struct page *page)
 	loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
 	int ret, unlock = 1;
 
-	mlog_entry("(0x%p, %lu)\n", file, (page ? page->index : 0));
+	mlog(0, "(0x%p, %lu)\n", file, (page ? page->index : 0));
 
 	ret = ocfs2_inode_lock_with_page(inode, NULL, 0, page);
 	if (ret != 0) {
@@ -398,7 +398,7 @@ static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
 {
 	int ret;
 
-	mlog_entry("(0x%p)\n", page);
+	mlog(0, "(0x%p)\n", page);
 
 	ret = block_write_full_page(page, ocfs2_get_block, wbc);
 
@@ -450,7 +450,7 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
 	int err = 0;
 	struct inode *inode = mapping->host;
 
-	mlog_entry("(block = %llu)\n", (unsigned long long)block);
+	mlog(0, "(block = %llu)\n", (unsigned long long)block);
 
 	/* We don't need to lock journal system files, since they aren't
 	 * accessed concurrently from multiple nodes.
@@ -618,8 +618,6 @@ static ssize_t ocfs2_direct_IO(int rw,
 	struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
 	int ret;
 
-	mlog_entry_void();
-
 	/*
 	 * Fallback to buffered I/O if we see an inode without
 	 * extents.
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index f9d5d3ffc75a..024bd45b7a48 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -55,8 +55,8 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
 {
 	int ret = 0;
 
-	mlog_entry("(bh->b_blocknr = %llu, ci=%p)\n",
-		   (unsigned long long)bh->b_blocknr, ci);
+	mlog(0, "(bh->b_blocknr = %llu, ci=%p)\n",
+	     (unsigned long long)bh->b_blocknr, ci);
 
 	BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO);
 	BUG_ON(buffer_jbd(bh));
@@ -186,8 +186,8 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
 	struct buffer_head *bh;
 	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
 
-	mlog_entry("(ci=%p, block=(%llu), nr=(%d), flags=%d)\n",
-		   ci, (unsigned long long)block, nr, flags);
+	mlog(0, "(ci=%p, block=(%llu), nr=(%d), flags=%d)\n",
+	     ci, (unsigned long long)block, nr, flags);
 
 	BUG_ON(!ci);
 	BUG_ON((flags & OCFS2_BH_READAHEAD) &&
@@ -408,8 +408,6 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
 	int ret = 0;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
 
-	mlog_entry_void();
-
 	BUG_ON(buffer_jbd(bh));
 	ocfs2_check_super_or_backup(osb->sb, bh->b_blocknr);
 
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index b108e863d8f6..929178ce17a2 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1658,8 +1658,6 @@ static int o2hb_populate_slot_data(struct o2hb_region *reg)
 	struct o2hb_disk_slot *slot;
 	struct o2hb_disk_heartbeat_block *hb_block;
 
-	mlog_entry_void();
-
 	ret = o2hb_read_slots(reg, reg->hr_blocks);
 	if (ret) {
 		mlog_errno(ret);
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 6c61771469af..bac64570c525 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -80,7 +80,6 @@ struct mlog_attribute {
 }
 
 static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
-	define_mask(ENTRY),
 	define_mask(EXIT),
 	define_mask(TCP),
 	define_mask(MSG),
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 34d6544357d9..6e20877436b9 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -82,7 +82,6 @@
 
 /* bits that are frequently given and infrequently matched in the low word */
 /* NOTE: If you add a flag, you need to also update masklog.c! */
-#define ML_ENTRY	0x0000000000000001ULL /* func call entry */
 #define ML_EXIT		0x0000000000000002ULL /* func call exit */
 #define ML_TCP		0x0000000000000004ULL /* net cluster/tcp.c */
 #define ML_MSG		0x0000000000000008ULL /* net network messages */
@@ -124,7 +123,7 @@
 #define ML_KTHREAD	0x4000000000000000ULL /* kernel thread activity */
 
 #define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE)
-#define MLOG_INITIAL_NOT_MASK (ML_ENTRY|ML_EXIT)
+#define MLOG_INITIAL_NOT_MASK (ML_EXIT)
 #ifndef MLOG_MASK_PREFIX
 #define MLOG_MASK_PREFIX 0
 #endif
@@ -223,14 +222,6 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
 } while (0)
 
 #if defined(CONFIG_OCFS2_DEBUG_MASKLOG)
-#define mlog_entry(fmt, args...) do {					\
-	mlog(ML_ENTRY, "ENTRY:" fmt , ##args);				\
-} while (0)
-
-#define mlog_entry_void() do {						\
-	mlog(ML_ENTRY, "ENTRY:\n");					\
-} while (0)
-
 /*
  * We disable this for sparse.
  */
@@ -267,8 +258,6 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
 	mlog(ML_EXIT, "EXIT\n");					\
 } while (0)
 #else
-#define mlog_entry(...)  do { } while (0)
-#define mlog_entry_void(...)  do { } while (0)
 #define mlog_exit(...)  do { } while (0)
 #define mlog_exit_ptr(...)  do { } while (0)
 #define mlog_exit_void(...)  do { } while (0)
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 7eb90403fc8a..67d17a45773b 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -62,8 +62,8 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
 	inode = dentry->d_inode;
 	osb = OCFS2_SB(dentry->d_sb);
 
-	mlog_entry("(0x%p, '%.*s')\n", dentry,
-		   dentry->d_name.len, dentry->d_name.name);
+	mlog(0, "(0x%p, '%.*s')\n", dentry,
+	     dentry->d_name.len, dentry->d_name.name);
 
 	/* For a negative dentry -
 	 * check the generation number of the parent and compare with the
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index d417b3f9b0c7..2c983f083f8a 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -367,8 +367,6 @@ static int inline ocfs2_search_dirblock(struct buffer_head *bh,
 	int de_len;
 	int ret = 0;
 
-	mlog_entry_void();
-
 	de_buf = first_de;
 	dlimit = de_buf + bytes;
 
@@ -706,8 +704,6 @@ static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen,
 	int num = 0;
 	int nblocks, i, err;
 
-	mlog_entry_void();
-
 	sb = dir->i_sb;
 
 	nblocks = i_size_read(dir) >> sb->s_blocksize_bits;
@@ -1166,7 +1162,7 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
 	int i, status = -ENOENT;
 	ocfs2_journal_access_func access = ocfs2_journal_access_db;
 
-	mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh);
+	mlog(0, "(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh);
 
 	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
 		access = ocfs2_journal_access_di;
@@ -1632,8 +1628,6 @@ int __ocfs2_add_entry(handle_t *handle,
 	struct buffer_head *insert_bh = lookup->dl_leaf_bh;
 	char *data_start = insert_bh->b_data;
 
-	mlog_entry_void();
-
 	if (!namelen)
 		return -EINVAL;
 
@@ -2028,8 +2022,8 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	int lock_level = 0;
 
-	mlog_entry("dirino=%llu\n",
-		   (unsigned long long)OCFS2_I(inode)->ip_blkno);
+	mlog(0, "dirino=%llu\n",
+	     (unsigned long long)OCFS2_I(inode)->ip_blkno);
 
 	error = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
 	if (lock_level && error >= 0) {
@@ -2114,8 +2108,8 @@ int ocfs2_check_dir_for_entry(struct inode *dir,
 	int ret;
 	struct ocfs2_dir_lookup_result lookup = { NULL, };
 
-	mlog_entry("dir %llu, name '%.*s'\n",
-		   (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name);
+	mlog(0, "dir %llu, name '%.*s'\n",
+	     (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name);
 
 	ret = -EEXIST;
 	if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0)
@@ -2324,8 +2318,6 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
 	struct buffer_head *new_bh = NULL;
 	struct ocfs2_dir_entry *de;
 
-	mlog_entry_void();
-
 	if (ocfs2_new_dir_wants_trailer(inode))
 		size = ocfs2_dir_trailer_blk_off(parent->i_sb);
 
@@ -3270,8 +3262,6 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 	struct ocfs2_extent_tree et;
 	struct buffer_head *dx_root_bh = lookup->dl_dx_root_bh;
 
-	mlog_entry_void();
-
 	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
 		/*
 		 * This would be a code error as an inline directory should
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index 9f30491e5e88..29a886d1e82c 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -128,8 +128,8 @@ static enum dlm_status __dlmconvert_master(struct dlm_ctxt *dlm,
 
 	assert_spin_locked(&res->spinlock);
 
-	mlog_entry("type=%d, convert_type=%d, new convert_type=%d\n",
-		   lock->ml.type, lock->ml.convert_type, type);
+	mlog(0, "type=%d, convert_type=%d, new convert_type=%d\n",
+	     lock->ml.type, lock->ml.convert_type, type);
 
 	spin_lock(&lock->spinlock);
 
@@ -353,7 +353,7 @@ static enum dlm_status dlm_send_remote_convert_request(struct dlm_ctxt *dlm,
 	struct kvec vec[2];
 	size_t veclen = 1;
 
-	mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
+	mlog(0, "%.*s\n", res->lockname.len, res->lockname.name);
 
 	memset(&convert, 0, sizeof(struct dlm_convert_lock));
 	convert.node_idx = dlm->node_num;
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 7e38a072d720..5391f28d940b 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -188,7 +188,7 @@ struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm,
 	struct hlist_head *bucket;
 	struct hlist_node *list;
 
-	mlog_entry("%.*s\n", len, name);
+	mlog(0, "%.*s\n", len, name);
 
 	assert_spin_locked(&dlm->spinlock);
 
@@ -222,7 +222,7 @@ struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
 {
 	struct dlm_lock_resource *res = NULL;
 
-	mlog_entry("%.*s\n", len, name);
+	mlog(0, "%.*s\n", len, name);
 
 	assert_spin_locked(&dlm->spinlock);
 
@@ -531,7 +531,7 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
 	unsigned int node;
 	struct dlm_exit_domain *exit_msg = (struct dlm_exit_domain *) msg->buf;
 
-	mlog_entry("%p %u %p", msg, len, data);
+	mlog(0, "%p %u %p", msg, len, data);
 
 	if (!dlm_grab(dlm))
 		return 0;
@@ -1553,7 +1553,7 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
 	struct domain_join_ctxt *ctxt;
 	enum dlm_query_join_response_code response = JOIN_DISALLOW;
 
-	mlog_entry("%p", dlm);
+	mlog(0, "%p", dlm);
 
 	ctxt = kzalloc(sizeof(*ctxt), GFP_KERNEL);
 	if (!ctxt) {
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 7009292aac5a..8d39e0fd66f7 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -128,7 +128,7 @@ static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm,
 	int call_ast = 0, kick_thread = 0;
 	enum dlm_status status = DLM_NORMAL;
 
-	mlog_entry("type=%d\n", lock->ml.type);
+	mlog(0, "type=%d\n", lock->ml.type);
 
 	spin_lock(&res->spinlock);
 	/* if called from dlm_create_lock_handler, need to
@@ -227,8 +227,8 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
 	enum dlm_status status = DLM_DENIED;
 	int lockres_changed = 1;
 
-	mlog_entry("type=%d\n", lock->ml.type);
-	mlog(0, "lockres %.*s, flags = 0x%x\n", res->lockname.len,
+	mlog(0, "type=%d, lockres %.*s, flags = 0x%x\n",
+	     lock->ml.type, res->lockname.len,
 	     res->lockname.name, flags);
 
 	spin_lock(&res->spinlock);
@@ -308,8 +308,6 @@ static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
 	int tmpret, status = 0;
 	enum dlm_status ret;
 
-	mlog_entry_void();
-
 	memset(&create, 0, sizeof(create));
 	create.node_idx = dlm->node_num;
 	create.requested_type = lock->ml.type;
@@ -477,8 +475,6 @@ int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data,
 
 	BUG_ON(!dlm);
 
-	mlog_entry_void();
-
 	if (!dlm_grab(dlm))
 		return DLM_REJECTED;
 
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 59f0f6bdfc62..9d67610dfc74 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -426,8 +426,6 @@ static void dlm_mle_release(struct kref *kref)
 	struct dlm_master_list_entry *mle;
 	struct dlm_ctxt *dlm;
 
-	mlog_entry_void();
-
 	mle = container_of(kref, struct dlm_master_list_entry, mle_refs);
 	dlm = mle->dlm;
 
@@ -3120,8 +3118,6 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
 
 	*oldmle = NULL;
 
-	mlog_entry_void();
-
 	assert_spin_locked(&dlm->spinlock);
 	assert_spin_locked(&dlm->master_lock);
 
@@ -3261,7 +3257,7 @@ void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
 	struct hlist_node *list;
 	unsigned int i;
 
-	mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node);
+	mlog(0, "dlm=%s, dead node=%u\n", dlm->name, dead_node);
 top:
 	assert_spin_locked(&dlm->spinlock);
 
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index aaaffbcbe916..6602b23c2ab3 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2083,8 +2083,6 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
 	struct hlist_head *bucket;
 	struct dlm_lock_resource *res, *next;
 
-	mlog_entry_void();
-
 	assert_spin_locked(&dlm->spinlock);
 
 	list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) {
@@ -2607,8 +2605,6 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node)
 	int nodenum;
 	int status;
 
-	mlog_entry("%u\n", dead_node);
-
 	mlog(0, "%s: dead node is %u\n", dlm->name, dead_node);
 
 	spin_lock(&dlm->spinlock);
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 817287c6a6db..850aa7e87537 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -317,7 +317,7 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
 	struct kvec vec[2];
 	size_t veclen = 1;
 
-	mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
+	mlog(0, "%.*s\n", res->lockname.len, res->lockname.name);
 
 	if (owner == dlm->node_num) {
 		/* ended up trying to contact ourself.  this means
@@ -588,8 +588,6 @@ enum dlm_status dlmunlock(struct dlm_ctxt *dlm, struct dlm_lockstatus *lksb,
 	struct dlm_lock *lock = NULL;
 	int call_ast, is_master;
 
-	mlog_entry_void();
-
 	if (!lksb) {
 		dlm_error(DLM_BADARGS);
 		return DLM_BADARGS;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index e8d94d722ecb..c2f3fa1c4722 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -397,8 +397,6 @@ static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
 {
 	int len;
 
-	mlog_entry_void();
-
 	BUG_ON(type >= OCFS2_NUM_LOCK_TYPES);
 
 	len = snprintf(name, OCFS2_LOCK_ID_MAX_LEN, "%c%s%016llx%08x",
@@ -729,8 +727,6 @@ void ocfs2_refcount_lock_res_init(struct ocfs2_lock_res *lockres,
 
 void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
 {
-	mlog_entry_void();
-
 	if (!(res->l_flags & OCFS2_LOCK_INITIALIZED))
 		return;
 
@@ -762,8 +758,6 @@ void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
 static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres,
 				     int level)
 {
-	mlog_entry_void();
-
 	BUG_ON(!lockres);
 
 	switch(level) {
@@ -783,8 +777,6 @@ static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres,
 static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres,
 				     int level)
 {
-	mlog_entry_void();
-
 	BUG_ON(!lockres);
 
 	switch(level) {
@@ -846,8 +838,6 @@ static void lockres_clear_flags(struct ocfs2_lock_res *lockres,
 
 static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res *lockres)
 {
-	mlog_entry_void();
-
 	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
 	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
 	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
@@ -866,8 +856,6 @@ static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res
 
 static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres)
 {
-	mlog_entry_void();
-
 	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
 	BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
 
@@ -895,8 +883,6 @@ static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lo
 
 static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres)
 {
-	mlog_entry_void();
-
 	BUG_ON((!(lockres->l_flags & OCFS2_LOCK_BUSY)));
 	BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
 
@@ -916,7 +902,6 @@ static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
 				     int level)
 {
 	int needs_downconvert = 0;
-	mlog_entry_void();
 
 	assert_spin_locked(&lockres->l_lock);
 
@@ -1151,8 +1136,6 @@ static void ocfs2_unlock_ast(struct ocfs2_dlm_lksb *lksb, int error)
 	struct ocfs2_lock_res *lockres = ocfs2_lksb_to_lock_res(lksb);
 	unsigned long flags;
 
-	mlog_entry_void();
-
 	mlog(ML_BASTS, "UNLOCK AST fired for lockres %s, action = %d\n",
 	     lockres->l_name, lockres->l_unlock_action);
 
@@ -1233,7 +1216,6 @@ static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
 {
 	unsigned long flags;
 
-	mlog_entry_void();
 	spin_lock_irqsave(&lockres->l_lock, flags);
 	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
 	lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
@@ -1260,8 +1242,6 @@ static int ocfs2_lock_create(struct ocfs2_super *osb,
 	unsigned long flags;
 	unsigned int gen;
 
-	mlog_entry_void();
-
 	mlog(0, "lock %s, level = %d, flags = %u\n", lockres->l_name, level,
 	     dlm_flags);
 
@@ -1416,8 +1396,6 @@ static int __ocfs2_cluster_lock(struct ocfs2_super *osb,
 	unsigned int gen;
 	int noqueue_attempted = 0;
 
-	mlog_entry_void();
-
 	ocfs2_init_mask_waiter(&mw);
 
 	if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
@@ -1605,7 +1583,6 @@ static void __ocfs2_cluster_unlock(struct ocfs2_super *osb,
 {
 	unsigned long flags;
 
-	mlog_entry_void();
 	spin_lock_irqsave(&lockres->l_lock, flags);
 	ocfs2_dec_holders(lockres, level);
 	ocfs2_downconvert_on_unlock(osb, lockres);
@@ -1648,8 +1625,6 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
 	BUG_ON(!inode);
 	BUG_ON(!ocfs2_inode_is_new(inode));
 
-	mlog_entry_void();
-
 	mlog(0, "Inode %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno);
 
 	/* NOTE: That we don't increment any of the holder counts, nor
@@ -1695,8 +1670,6 @@ int ocfs2_rw_lock(struct inode *inode, int write)
 
 	BUG_ON(!inode);
 
-	mlog_entry_void();
-
 	mlog(0, "inode %llu take %s RW lock\n",
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
 	     write ? "EXMODE" : "PRMODE");
@@ -1725,8 +1698,6 @@ void ocfs2_rw_unlock(struct inode *inode, int write)
 	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
-	mlog_entry_void();
-
 	mlog(0, "inode %llu drop %s RW lock\n",
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
 	     write ? "EXMODE" : "PRMODE");
@@ -1748,8 +1719,6 @@ int ocfs2_open_lock(struct inode *inode)
 
 	BUG_ON(!inode);
 
-	mlog_entry_void();
-
 	mlog(0, "inode %llu take PRMODE open lock\n",
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno);
 
@@ -1776,8 +1745,6 @@ int ocfs2_try_open_lock(struct inode *inode, int write)
 
 	BUG_ON(!inode);
 
-	mlog_entry_void();
-
 	mlog(0, "inode %llu try to take %s open lock\n",
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
 	     write ? "EXMODE" : "PRMODE");
@@ -1811,8 +1778,6 @@ void ocfs2_open_unlock(struct inode *inode)
 	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_open_lockres;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
-	mlog_entry_void();
-
 	mlog(0, "inode %llu drop open lock\n",
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno);
 
@@ -2043,8 +2008,6 @@ static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
 {
 	int kick = 0;
 
-	mlog_entry_void();
-
 	/* If we know that another node is waiting on our lock, kick
 	 * the downconvert thread * pre-emptively when we reach a release
 	 * condition. */
@@ -2095,8 +2058,6 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
 	struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
 	struct ocfs2_meta_lvb *lvb;
 
-	mlog_entry_void();
-
 	lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
 
 	/*
@@ -2145,8 +2106,6 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
 	struct ocfs2_lock_res *lockres = &oi->ip_inode_lockres;
 	struct ocfs2_meta_lvb *lvb;
 
-	mlog_entry_void();
-
 	mlog_meta_lvb(0, lockres);
 
 	lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
@@ -2205,8 +2164,6 @@ static int ocfs2_should_refresh_lock_res(struct ocfs2_lock_res *lockres)
 	unsigned long flags;
 	int status = 0;
 
-	mlog_entry_void();
-
 refresh_check:
 	spin_lock_irqsave(&lockres->l_lock, flags);
 	if (!(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) {
@@ -2237,7 +2194,6 @@ static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockre
 						   int status)
 {
 	unsigned long flags;
-	mlog_entry_void();
 
 	spin_lock_irqsave(&lockres->l_lock, flags);
 	lockres_clear_flags(lockres, OCFS2_LOCK_REFRESHING);
@@ -2260,8 +2216,6 @@ static int ocfs2_inode_lock_update(struct inode *inode,
 	struct ocfs2_dinode *fe;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
-	mlog_entry_void();
-
 	if (ocfs2_mount_local(osb))
 		goto bail;
 
@@ -2374,8 +2328,6 @@ int ocfs2_inode_lock_full_nested(struct inode *inode,
 
 	BUG_ON(!inode);
 
-	mlog_entry_void();
-
 	mlog(0, "inode %llu, take %s META lock\n",
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
 	     ex ? "EXMODE" : "PRMODE");
@@ -2517,7 +2469,6 @@ int ocfs2_inode_lock_atime(struct inode *inode,
 {
 	int ret;
 
-	mlog_entry_void();
 	ret = ocfs2_inode_lock(inode, NULL, 0);
 	if (ret < 0) {
 		mlog_errno(ret);
@@ -2556,8 +2507,6 @@ void ocfs2_inode_unlock(struct inode *inode,
 	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
-	mlog_entry_void();
-
 	mlog(0, "inode %llu drop %s META lock\n",
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
 	     ex ? "EXMODE" : "PRMODE");
@@ -2617,8 +2566,6 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
 	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
 	struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
 
-	mlog_entry_void();
-
 	if (ocfs2_is_hard_readonly(osb))
 		return -EROFS;
 
@@ -3054,8 +3001,6 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
 	int status = 0;
 	struct ocfs2_cluster_connection *conn = NULL;
 
-	mlog_entry_void();
-
 	if (ocfs2_mount_local(osb)) {
 		osb->node_num = 0;
 		goto local;
@@ -3119,8 +3064,6 @@ bail:
 void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
 			int hangup_pending)
 {
-	mlog_entry_void();
-
 	ocfs2_drop_osb_locks(osb);
 
 	/*
@@ -3284,8 +3227,6 @@ int ocfs2_drop_inode_locks(struct inode *inode)
 {
 	int status, err;
 
-	mlog_entry_void();
-
 	/* No need to call ocfs2_mark_lockres_freeing here -
 	 * ocfs2_clear_inode has done it for us. */
 
@@ -3352,8 +3293,6 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
 	int ret;
 	u32 dlm_flags = DLM_LKF_CONVERT;
 
-	mlog_entry_void();
-
 	mlog(ML_BASTS, "lockres %s, level %d => %d\n", lockres->l_name,
 	     lockres->l_level, new_level);
 
@@ -3385,8 +3324,6 @@ static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
 {
 	assert_spin_locked(&lockres->l_lock);
 
-	mlog_entry_void();
-
 	if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
 		/* If we're already trying to cancel a lock conversion
 		 * then just drop the spinlock and allow the caller to
@@ -3416,8 +3353,6 @@ static int ocfs2_cancel_convert(struct ocfs2_super *osb,
 {
 	int ret;
 
-	mlog_entry_void();
-
 	ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb,
 			       DLM_LKF_CANCEL);
 	if (ret) {
@@ -3443,8 +3378,6 @@ static int ocfs2_unblock_lock(struct ocfs2_super *osb,
 	int set_lvb = 0;
 	unsigned int gen;
 
-	mlog_entry_void();
-
 	spin_lock_irqsave(&lockres->l_lock, flags);
 
 recheck:
@@ -3859,8 +3792,6 @@ static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres)
 	struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb,
 					    oinfo->dqi_gi.dqi_type);
 
-	mlog_entry_void();
-
 	lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
 	lvb->lvb_version = OCFS2_QINFO_LVB_VERSION;
 	lvb->lvb_bgrace = cpu_to_be32(info->dqi_bgrace);
@@ -3879,7 +3810,6 @@ void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex)
 	struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb);
 	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
 
-	mlog_entry_void();
 	if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb))
 		ocfs2_cluster_unlock(osb, lockres, level);
 	mlog_exit_void();
@@ -3937,8 +3867,6 @@ int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex)
 	int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
 	int status = 0;
 
-	mlog_entry_void();
-
 	/* On RO devices, locking really isn't needed... */
 	if (ocfs2_is_hard_readonly(osb)) {
 		if (ex)
@@ -4007,8 +3935,6 @@ static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
 	 * considered valid until we remove the OCFS2_LOCK_QUEUED
 	 * flag. */
 
-	mlog_entry_void();
-
 	BUG_ON(!lockres);
 	BUG_ON(!lockres->l_ops);
 
@@ -4049,8 +3975,6 @@ unqueue:
 static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
 					struct ocfs2_lock_res *lockres)
 {
-	mlog_entry_void();
-
 	assert_spin_locked(&lockres->l_lock);
 
 	if (lockres->l_flags & OCFS2_LOCK_FREEING) {
@@ -4080,8 +4004,6 @@ static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb)
 	unsigned long processed;
 	struct ocfs2_lock_res *lockres;
 
-	mlog_entry_void();
-
 	spin_lock(&osb->dc_task_lock);
 	/* grab this early so we know to try again if a state change and
 	 * wake happens part-way through our work  */
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 254652a9b542..b90c751ba938 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -56,7 +56,7 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
 	int status, set;
 	struct dentry *result;
 
-	mlog_entry("(0x%p, 0x%p)\n", sb, handle);
+	mlog(0, "(0x%p, 0x%p)\n", sb, handle);
 
 	if (blkno == 0) {
 		mlog(0, "nfs wants inode with blkno: 0\n");
@@ -152,8 +152,8 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
 	struct dentry *parent;
 	struct inode *dir = child->d_inode;
 
-	mlog_entry("(0x%p, '%.*s')\n", child,
-		   child->d_name.len, child->d_name.name);
+	mlog(0, "(0x%p, '%.*s')\n", child,
+	     child->d_name.len, child->d_name.name);
 
 	mlog(0, "find parent of directory %llu\n",
 	     (unsigned long long)OCFS2_I(dir)->ip_blkno);
@@ -193,9 +193,9 @@ static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len,
 	u32 generation;
 	__le32 *fh = (__force __le32 *) fh_in;
 
-	mlog_entry("(0x%p, '%.*s', 0x%p, %d, %d)\n", dentry,
-		   dentry->d_name.len, dentry->d_name.name,
-		   fh, len, connectable);
+	mlog(0, "(0x%p, '%.*s', 0x%p, %d, %d)\n", dentry,
+	     dentry->d_name.len, dentry->d_name.name,
+	     fh, len, connectable);
 
 	if (connectable && (len < 6)) {
 		*max_len = 6;
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 09e3fdfa6d33..ed9916a9a271 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -841,10 +841,10 @@ int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
 	u64 p_block, p_count;
 	int i, count, done = 0;
 
-	mlog_entry("(inode = %p, v_block = %llu, nr = %d, bhs = %p, "
-		   "flags = %x, validate = %p)\n",
-		   inode, (unsigned long long)v_block, nr, bhs, flags,
-		   validate);
+	mlog(0, "(inode = %p, v_block = %llu, nr = %d, bhs = %p, "
+	     "flags = %x, validate = %p)\n",
+	     inode, (unsigned long long)v_block, nr, bhs, flags,
+	     validate);
 
 	if (((v_block + nr - 1) << inode->i_sb->s_blocksize_bits) >=
 	    i_size_read(inode)) {
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index a6651956482e..71375d7b7bbe 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -99,8 +99,8 @@ static int ocfs2_file_open(struct inode *inode, struct file *file)
 	int mode = file->f_flags;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 
-	mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
-		   file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name);
+	mlog(0, "(0x%p, 0x%p, '%.*s')\n", inode, file,
+	     file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name);
 
 	if (file->f_mode & FMODE_WRITE)
 		dquot_initialize(inode);
@@ -143,9 +143,9 @@ static int ocfs2_file_release(struct inode *inode, struct file *file)
 {
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 
-	mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file,
-		       file->f_path.dentry->d_name.len,
-		       file->f_path.dentry->d_name.name);
+	mlog(0, "(0x%p, 0x%p, '%.*s')\n", inode, file,
+	     file->f_path.dentry->d_name.len,
+	     file->f_path.dentry->d_name.name);
 
 	spin_lock(&oi->ip_lock);
 	if (!--oi->ip_open_count)
@@ -177,9 +177,9 @@ static int ocfs2_sync_file(struct file *file, int datasync)
 	struct inode *inode = file->f_mapping->host;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
-	mlog_entry("(0x%p, %d, 0x%p, '%.*s')\n", file, datasync,
-		   file->f_path.dentry, file->f_path.dentry->d_name.len,
-		   file->f_path.dentry->d_name.name);
+	mlog(0, "(0x%p, %d, 0x%p, '%.*s')\n", file, datasync,
+	     file->f_path.dentry, file->f_path.dentry->d_name.len,
+	     file->f_path.dentry->d_name.name);
 
 	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
 		/*
@@ -251,8 +251,6 @@ int ocfs2_update_inode_atime(struct inode *inode,
 	handle_t *handle;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *) bh->b_data;
 
-	mlog_entry_void();
-
 	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
@@ -291,7 +289,6 @@ static int ocfs2_set_inode_size(handle_t *handle,
 {
 	int status;
 
-	mlog_entry_void();
 	i_size_write(inode, new_i_size);
 	inode->i_blocks = ocfs2_inode_sector_count(inode);
 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
@@ -375,8 +372,6 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
 	struct ocfs2_dinode *di;
 	u64 cluster_bytes;
 
-	mlog_entry_void();
-
 	/*
 	 * We need to CoW the cluster contains the offset if it is reflinked
 	 * since we will call ocfs2_zero_range_for_truncate later which will
@@ -442,9 +437,9 @@ static int ocfs2_truncate_file(struct inode *inode,
 	struct ocfs2_dinode *fe = NULL;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
-	mlog_entry("(inode = %llu, new_i_size = %llu\n",
-		   (unsigned long long)OCFS2_I(inode)->ip_blkno,
-		   (unsigned long long)new_i_size);
+	mlog(0, "(inode = %llu, new_i_size = %llu\n",
+	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
+	     (unsigned long long)new_i_size);
 
 	/* We trust di_bh because it comes from ocfs2_inode_lock(), which
 	 * already validated it */
@@ -578,7 +573,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
 	struct ocfs2_extent_tree et;
 	int did_quota = 0;
 
-	mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
+	mlog(0, "(clusters_to_add = %u)\n", clusters_to_add);
 
 	/*
 	 * This function only exists for file systems which don't
@@ -1113,8 +1108,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 	struct dquot *transfer_to[MAXQUOTAS] = { };
 	int qtype;
 
-	mlog_entry("(0x%p, '%.*s')\n", dentry,
-	           dentry->d_name.len, dentry->d_name.name);
+	mlog(0, "(0x%p, '%.*s')\n", dentry,
+	     dentry->d_name.len, dentry->d_name.name);
 
 	/* ensuring we don't even attempt to truncate a symlink */
 	if (S_ISLNK(inode->i_mode))
@@ -1287,8 +1282,6 @@ int ocfs2_getattr(struct vfsmount *mnt,
 	struct ocfs2_super *osb = sb->s_fs_info;
 	int err;
 
-	mlog_entry_void();
-
 	err = ocfs2_inode_revalidate(dentry);
 	if (err) {
 		if (err != -ENOENT)
@@ -1314,8 +1307,6 @@ int ocfs2_permission(struct inode *inode, int mask, unsigned int flags)
 	if (flags & IPERM_FLAG_RCU)
 		return -ECHILD;
 
-	mlog_entry_void();
-
 	ret = ocfs2_inode_lock(inode, NULL, 0);
 	if (ret) {
 		if (ret != -ENOENT)
@@ -1339,8 +1330,8 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_dinode *di;
 
-	mlog_entry("(Inode %llu, mode 0%o)\n",
-		   (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_mode);
+	mlog(0, "(Inode %llu, mode 0%o)\n",
+	     (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_mode);
 
 	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
 	if (IS_ERR(handle)) {
@@ -2233,10 +2224,10 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
 	int full_coherency = !(osb->s_mount_opt &
 			       OCFS2_MOUNT_COHERENCY_BUFFERED);
 
-	mlog_entry("(0x%p, %u, '%.*s')\n", file,
-		   (unsigned int)nr_segs,
-		   file->f_path.dentry->d_name.len,
-		   file->f_path.dentry->d_name.name);
+	mlog(0, "(0x%p, %u, '%.*s')\n", file,
+	     (unsigned int)nr_segs,
+	     file->f_path.dentry->d_name.len,
+	     file->f_path.dentry->d_name.name);
 
 	if (iocb->ki_left == 0)
 		return 0;
@@ -2438,10 +2429,10 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
 		.u.file = out,
 	};
 
-	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe,
-		   (unsigned int)len,
-		   out->f_path.dentry->d_name.len,
-		   out->f_path.dentry->d_name.name);
+	mlog(0, "(0x%p, 0x%p, %u, '%.*s')\n", out, pipe,
+	     (unsigned int)len,
+	     out->f_path.dentry->d_name.len,
+	     out->f_path.dentry->d_name.name);
 
 	if (pipe->inode)
 		mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT);
@@ -2498,10 +2489,10 @@ static ssize_t ocfs2_file_splice_read(struct file *in,
 	int ret = 0, lock_level = 0;
 	struct inode *inode = in->f_path.dentry->d_inode;
 
-	mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", in, pipe,
-		   (unsigned int)len,
-		   in->f_path.dentry->d_name.len,
-		   in->f_path.dentry->d_name.name);
+	mlog(0, "(0x%p, 0x%p, %u, '%.*s')\n", in, pipe,
+	     (unsigned int)len,
+	     in->f_path.dentry->d_name.len,
+	     in->f_path.dentry->d_name.name);
 
 	/*
 	 * See the comment in ocfs2_file_aio_read()
@@ -2529,10 +2520,10 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
 	struct file *filp = iocb->ki_filp;
 	struct inode *inode = filp->f_path.dentry->d_inode;
 
-	mlog_entry("(0x%p, %u, '%.*s')\n", filp,
-		   (unsigned int)nr_segs,
-		   filp->f_path.dentry->d_name.len,
-		   filp->f_path.dentry->d_name.name);
+	mlog(0, "(0x%p, %u, '%.*s')\n", filp,
+	     (unsigned int)nr_segs,
+	     filp->f_path.dentry->d_name.len,
+	     filp->f_path.dentry->d_name.name);
 
 	if (!inode) {
 		ret = -EINVAL;
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 4068c6c4c6f6..16ee8e96f731 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -131,7 +131,7 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
 	struct super_block *sb = osb->sb;
 	struct ocfs2_find_inode_args args;
 
-	mlog_entry("(blkno = %llu)\n", (unsigned long long)blkno);
+	mlog(0, "(blkno = %llu)\n", (unsigned long long)blkno);
 
 	/* Ok. By now we've either got the offsets passed to us by the
 	 * caller, or we just pulled them off the bh. Lets do some
@@ -192,7 +192,7 @@ static int ocfs2_find_actor(struct inode *inode, void *opaque)
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	int ret = 0;
 
-	mlog_entry("(0x%p, %lu, 0x%p)\n", inode, inode->i_ino, opaque);
+	mlog(0, "(0x%p, %lu, 0x%p)\n", inode, inode->i_ino, opaque);
 
 	args = opaque;
 
@@ -218,7 +218,7 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
 	static struct lock_class_key ocfs2_quota_ip_alloc_sem_key,
 				     ocfs2_file_ip_alloc_sem_key;
 
-	mlog_entry("inode = %p, opaque = %p\n", inode, opaque);
+	mlog(0, "inode = %p, opaque = %p\n", inode, opaque);
 
 	inode->i_ino = args->fi_ino;
 	OCFS2_I(inode)->ip_blkno = args->fi_blkno;
@@ -246,8 +246,8 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 	struct ocfs2_super *osb;
 	int use_plocks = 1;
 
-	mlog_entry("(0x%p, size:%llu)\n", inode,
-		   (unsigned long long)le64_to_cpu(fe->i_size));
+	mlog(0, "(0x%p, size:%llu)\n", inode,
+	     (unsigned long long)le64_to_cpu(fe->i_size));
 
 	sb = inode->i_sb;
 	osb = OCFS2_SB(sb);
@@ -394,7 +394,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
 	int status, can_lock;
 	u32 generation = 0;
 
-	mlog_entry("(0x%p, 0x%p)\n", inode, args);
+	mlog(0, "(0x%p, 0x%p)\n", inode, args);
 
 	status = -EINVAL;
 	if (inode == NULL || inode->i_sb == NULL) {
@@ -551,8 +551,6 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
 	struct ocfs2_dinode *fe;
 	handle_t *handle = NULL;
 
-	mlog_entry_void();
-
 	fe = (struct ocfs2_dinode *) fe_bh->b_data;
 
 	/*
@@ -980,7 +978,7 @@ static void ocfs2_delete_inode(struct inode *inode)
 	sigset_t oldset;
 	struct buffer_head *di_bh = NULL;
 
-	mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
+	mlog(0, "(inode->i_ino = %lu)\n", inode->i_ino);
 
 	/* When we fail in read_inode() we mark inode as bad. The second test
 	 * catches the case when inode allocation fails before allocating
@@ -1088,8 +1086,6 @@ static void ocfs2_clear_inode(struct inode *inode)
 	int status;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 
-	mlog_entry_void();
-
 	end_writeback(inode);
 	mlog(0, "Clearing inode: %llu, nlink = %u\n",
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_nlink);
@@ -1204,8 +1200,6 @@ int ocfs2_drop_inode(struct inode *inode)
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	int res;
 
-	mlog_entry_void();
-
 	mlog(0, "Drop inode %llu, nlink = %u, ip_flags = 0x%x\n",
 	     (unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags);
 
@@ -1226,8 +1220,8 @@ int ocfs2_inode_revalidate(struct dentry *dentry)
 	struct inode *inode = dentry->d_inode;
 	int status = 0;
 
-	mlog_entry("(inode = 0x%p, ino = %llu)\n", inode,
-		   inode ? (unsigned long long)OCFS2_I(inode)->ip_blkno : 0ULL);
+	mlog(0, "(inode = 0x%p, ino = %llu)\n", inode,
+	     inode ? (unsigned long long)OCFS2_I(inode)->ip_blkno : 0ULL);
 
 	if (!inode) {
 		mlog(0, "eep, no inode!\n");
@@ -1271,8 +1265,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
 	int status;
 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
 
-	mlog_entry("(inode %llu)\n",
-		   (unsigned long long)OCFS2_I(inode)->ip_blkno);
+	mlog(0, "(inode %llu)\n",
+	     (unsigned long long)OCFS2_I(inode)->ip_blkno);
 
 	status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
 					 OCFS2_JOURNAL_ACCESS_WRITE);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index faa2303dbf0a..7378277b17a4 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -303,8 +303,6 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
 	unsigned int flushed;
 	struct ocfs2_journal *journal = NULL;
 
-	mlog_entry_void();
-
 	journal = osb->journal;
 
 	/* Flush all pending commits and checkpoint the journal. */
@@ -425,7 +423,6 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
 		return 0;
 
 	old_nblocks = handle->h_buffer_credits;
-	mlog_entry_void();
 
 	mlog(0, "Trying to extend transaction by %d blocks\n", nblocks);
 
@@ -622,12 +619,12 @@ static int __ocfs2_journal_access(handle_t *handle,
 	BUG_ON(!handle);
 	BUG_ON(!bh);
 
-	mlog_entry("bh->b_blocknr=%llu, type=%d (\"%s\"), bh->b_size = %zu\n",
-		   (unsigned long long)bh->b_blocknr, type,
-		   (type == OCFS2_JOURNAL_ACCESS_CREATE) ?
-		   "OCFS2_JOURNAL_ACCESS_CREATE" :
-		   "OCFS2_JOURNAL_ACCESS_WRITE",
-		   bh->b_size);
+	mlog(0, "bh->b_blocknr=%llu, type=%d (\"%s\"), bh->b_size = %zu\n",
+	     (unsigned long long)bh->b_blocknr, type,
+	     (type == OCFS2_JOURNAL_ACCESS_CREATE) ?
+	     "OCFS2_JOURNAL_ACCESS_CREATE" :
+	     "OCFS2_JOURNAL_ACCESS_WRITE",
+	     bh->b_size);
 
 	/* we can safely remove this assertion after testing. */
 	if (!buffer_uptodate(bh)) {
@@ -737,8 +734,8 @@ void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh)
 {
 	int status;
 
-	mlog_entry("(bh->b_blocknr=%llu)\n",
-		   (unsigned long long)bh->b_blocknr);
+	mlog(0, "(bh->b_blocknr=%llu)\n",
+	     (unsigned long long)bh->b_blocknr);
 
 	status = jbd2_journal_dirty_metadata(handle, bh);
 	BUG_ON(status);
@@ -775,8 +772,6 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
 	struct ocfs2_super *osb;
 	int inode_lock = 0;
 
-	mlog_entry_void();
-
 	BUG_ON(!journal);
 
 	osb = journal->j_osb;
@@ -882,8 +877,6 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
 	struct buffer_head *bh = journal->j_bh;
 	struct ocfs2_dinode *fe;
 
-	mlog_entry_void();
-
 	fe = (struct ocfs2_dinode *)bh->b_data;
 
 	/* The journal bh on the osb always comes from ocfs2_journal_init()
@@ -921,8 +914,6 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
 	struct inode *inode = NULL;
 	int num_running_trans = 0;
 
-	mlog_entry_void();
-
 	BUG_ON(!osb);
 
 	journal = osb->journal;
@@ -1024,8 +1015,6 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed)
 	int status = 0;
 	struct ocfs2_super *osb;
 
-	mlog_entry_void();
-
 	BUG_ON(!journal);
 
 	osb = journal->j_osb;
@@ -1070,8 +1059,6 @@ int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full)
 {
 	int status;
 
-	mlog_entry_void();
-
 	BUG_ON(!journal);
 
 	status = jbd2_journal_wipe(journal->j_journal, full);
@@ -1124,8 +1111,6 @@ static int ocfs2_force_read_journal(struct inode *inode)
 #define CONCURRENT_JOURNAL_FILL 32ULL
 	struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL];
 
-	mlog_entry_void();
-
 	memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL);
 
 	num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, inode->i_size);
@@ -1194,8 +1179,6 @@ void ocfs2_complete_recovery(struct work_struct *work)
 	struct ocfs2_quota_recovery *qrec;
 	LIST_HEAD(tmp_la_list);
 
-	mlog_entry_void();
-
 	mlog(0, "completing recovery from keventd\n");
 
 	spin_lock(&journal->j_lock);
@@ -1339,8 +1322,6 @@ static int __ocfs2_recovery_thread(void *arg)
 	int rm_quota_used = 0, i;
 	struct ocfs2_quota_recovery *qrec;
 
-	mlog_entry_void();
-
 	status = ocfs2_wait_on_mount(osb);
 	if (status < 0) {
 		goto bail;
@@ -1461,8 +1442,8 @@ bail:
 
 void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
 {
-	mlog_entry("(node_num=%d, osb->node_num = %d)\n",
-		   node_num, osb->node_num);
+	mlog(0, "(node_num=%d, osb->node_num = %d)\n",
+	     node_num, osb->node_num);
 
 	mutex_lock(&osb->recovery_lock);
 	if (osb->disable_recovery)
@@ -1688,8 +1669,8 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
 	struct ocfs2_dinode *la_copy = NULL;
 	struct ocfs2_dinode *tl_copy = NULL;
 
-	mlog_entry("(node_num=%d, slot_num=%d, osb->node_num = %d)\n",
-		   node_num, slot_num, osb->node_num);
+	mlog(0, "(node_num=%d, slot_num=%d, osb->node_num = %d)\n",
+	     node_num, slot_num, osb->node_num);
 
 	/* Should not ever be called to recover ourselves -- in that
 	 * case we should've called ocfs2_journal_load instead. */
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index ec6adbf8f551..5f08c703f354 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -291,8 +291,6 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
 	struct inode *inode = NULL;
 	struct ocfs2_local_alloc *la;
 
-	mlog_entry_void();
-
 	if (osb->local_alloc_bits == 0)
 		goto bail;
 
@@ -388,8 +386,6 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
 	struct ocfs2_dinode *alloc_copy = NULL;
 	struct ocfs2_dinode *alloc = NULL;
 
-	mlog_entry_void();
-
 	cancel_delayed_work(&osb->la_enable_wq);
 	flush_workqueue(ocfs2_wq);
 
@@ -502,7 +498,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
 	struct inode *inode = NULL;
 	struct ocfs2_dinode *alloc;
 
-	mlog_entry("(slot_num = %d)\n", slot_num);
+	mlog(0, "(slot_num = %d)\n", slot_num);
 
 	*alloc_copy = NULL;
 
@@ -570,8 +566,6 @@ int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
 	struct buffer_head *main_bm_bh = NULL;
 	struct inode *main_bm_inode;
 
-	mlog_entry_void();
-
 	main_bm_inode = ocfs2_get_system_file_inode(osb,
 						    GLOBAL_BITMAP_SYSTEM_INODE,
 						    OCFS2_INVALID_SLOT);
@@ -640,8 +634,6 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
 	struct inode *local_alloc_inode;
 	unsigned int free_bits;
 
-	mlog_entry_void();
-
 	BUG_ON(!ac);
 
 	local_alloc_inode =
@@ -749,7 +741,6 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
 	struct ocfs2_dinode *alloc;
 	struct ocfs2_local_alloc *la;
 
-	mlog_entry_void();
 	BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL);
 
 	local_alloc_inode = ac->ac_inode;
@@ -799,8 +790,6 @@ static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc)
 	u32 count = 0;
 	struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
 
-	mlog_entry_void();
-
 	buffer = la->la_bitmap;
 	for (i = 0; i < le16_to_cpu(la->la_size); i++)
 		count += hweight8(buffer[i]);
@@ -820,7 +809,7 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
 	void *bitmap = NULL;
 	struct ocfs2_reservation_map *resmap = &osb->osb_la_resmap;
 
-	mlog_entry("(numbits wanted = %u)\n", *numbits);
+	mlog(0, "(numbits wanted = %u)\n", *numbits);
 
 	if (!alloc->id1.bitmap1.i_total) {
 		mlog(0, "No bits in my window!\n");
@@ -903,7 +892,6 @@ static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc)
 {
 	struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
 	int i;
-	mlog_entry_void();
 
 	alloc->id1.bitmap1.i_total = 0;
 	alloc->id1.bitmap1.i_used = 0;
@@ -952,9 +940,9 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
 	void *bitmap;
 	struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
 
-	mlog_entry("total = %u, used = %u\n",
-		   le32_to_cpu(alloc->id1.bitmap1.i_total),
-		   le32_to_cpu(alloc->id1.bitmap1.i_used));
+	mlog(0, "total = %u, used = %u\n",
+	     le32_to_cpu(alloc->id1.bitmap1.i_total),
+	     le32_to_cpu(alloc->id1.bitmap1.i_used));
 
 	if (!alloc->id1.bitmap1.i_total) {
 		mlog(0, "nothing to sync!\n");
@@ -1148,8 +1136,6 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
 	struct ocfs2_dinode *alloc = NULL;
 	struct ocfs2_local_alloc *la;
 
-	mlog_entry_void();
-
 	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
 	la = OCFS2_LOCAL_ALLOC(alloc);
 
@@ -1243,8 +1229,6 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
 	struct ocfs2_dinode *alloc_copy = NULL;
 	struct ocfs2_alloc_context *ac = NULL;
 
-	mlog_entry_void();
-
 	ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_SLIDE);
 
 	/* This will lock the main bitmap for us. */
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 7e32db9c2c99..87438e178e11 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -49,7 +49,7 @@ static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
 	sigset_t oldset;
 	int ret;
 
-	mlog_entry("(area=%p, page offset=%lu)\n", area, vmf->pgoff);
+	mlog(0, "(area=%p, page offset=%lu)\n", area, vmf->pgoff);
 
 	ocfs2_block_signals(&oldset);
 	ret = filemap_fault(area, vmf);
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 849fb4a2e814..fdf5c0b75caa 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -106,8 +106,8 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
 	struct dentry *ret;
 	struct ocfs2_inode_info *oi;
 
-	mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
-		   dentry->d_name.len, dentry->d_name.name);
+	mlog(0, "(0x%p, 0x%p, '%.*s')\n", dir, dentry,
+	     dentry->d_name.len, dentry->d_name.name);
 
 	if (dentry->d_name.len > OCFS2_MAX_FILENAME_LEN) {
 		ret = ERR_PTR(-ENAMETOOLONG);
@@ -235,9 +235,9 @@ static int ocfs2_mknod(struct inode *dir,
 	sigset_t oldset;
 	int did_block_signals = 0;
 
-	mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
-		   (unsigned long)dev, dentry->d_name.len,
-		   dentry->d_name.name);
+	mlog(0, "(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
+	     (unsigned long)dev, dentry->d_name.len,
+	     dentry->d_name.name);
 
 	dquot_initialize(dir);
 
@@ -354,9 +354,9 @@ static int ocfs2_mknod(struct inode *dir,
 		goto leave;
 	did_quota_inode = 1;
 
-	mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
-		   inode->i_mode, (unsigned long)dev, dentry->d_name.len,
-		   dentry->d_name.name);
+	mlog(0, "(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
+	     inode->i_mode, (unsigned long)dev, dentry->d_name.len,
+	     dentry->d_name.name);
 
 	/* do the real work now. */
 	status = ocfs2_mknod_locked(osb, dir, inode, dev,
@@ -615,8 +615,8 @@ static int ocfs2_mkdir(struct inode *dir,
 {
 	int ret;
 
-	mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode,
-		   dentry->d_name.len, dentry->d_name.name);
+	mlog(0, "(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode,
+	     dentry->d_name.len, dentry->d_name.name);
 	ret = ocfs2_mknod(dir, dentry, mode | S_IFDIR, 0);
 	mlog_exit(ret);
 
@@ -630,8 +630,8 @@ static int ocfs2_create(struct inode *dir,
 {
 	int ret;
 
-	mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode,
-		   dentry->d_name.len, dentry->d_name.name);
+	mlog(0, "(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode,
+	     dentry->d_name.len, dentry->d_name.name);
 	ret = ocfs2_mknod(dir, dentry, mode | S_IFREG, 0);
 	mlog_exit(ret);
 
@@ -652,9 +652,9 @@ static int ocfs2_link(struct dentry *old_dentry,
 	struct ocfs2_dir_lookup_result lookup = { NULL, };
 	sigset_t oldset;
 
-	mlog_entry("(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino,
-		   old_dentry->d_name.len, old_dentry->d_name.name,
-		   dentry->d_name.len, dentry->d_name.name);
+	mlog(0, "(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino,
+	     old_dentry->d_name.len, old_dentry->d_name.name,
+	     dentry->d_name.len, dentry->d_name.name);
 
 	if (S_ISDIR(inode->i_mode))
 		return -EPERM;
@@ -809,8 +809,8 @@ static int ocfs2_unlink(struct inode *dir,
 	struct ocfs2_dir_lookup_result lookup = { NULL, };
 	struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
 
-	mlog_entry("(0x%p, 0x%p, '%.*s')\n", dir, dentry,
-		   dentry->d_name.len, dentry->d_name.name);
+	mlog(0, "(0x%p, 0x%p, '%.*s')\n", dir, dentry,
+	     dentry->d_name.len, dentry->d_name.name);
 
 	dquot_initialize(dir);
 
@@ -975,9 +975,9 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
 	struct buffer_head **tmpbh;
 	struct inode *tmpinode;
 
-	mlog_entry("(inode1 = %llu, inode2 = %llu)\n",
-		   (unsigned long long)oi1->ip_blkno,
-		   (unsigned long long)oi2->ip_blkno);
+	mlog(0, "(inode1 = %llu, inode2 = %llu)\n",
+	     (unsigned long long)oi1->ip_blkno,
+	     (unsigned long long)oi2->ip_blkno);
 
 	if (*bh1)
 		*bh1 = NULL;
@@ -1067,10 +1067,10 @@ static int ocfs2_rename(struct inode *old_dir,
 	/* At some point it might be nice to break this function up a
 	 * bit. */
 
-	mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p, from='%.*s' to='%.*s')\n",
-		   old_dir, old_dentry, new_dir, new_dentry,
-		   old_dentry->d_name.len, old_dentry->d_name.name,
-		   new_dentry->d_name.len, new_dentry->d_name.name);
+	mlog(0, "(0x%p, 0x%p, 0x%p, 0x%p, from='%.*s' to='%.*s')\n",
+	     old_dir, old_dentry, new_dir, new_dentry,
+	     old_dentry->d_name.len, old_dentry->d_name.name,
+	     new_dentry->d_name.len, new_dentry->d_name.name);
 
 	dquot_initialize(old_dir);
 	dquot_initialize(new_dir);
@@ -1501,9 +1501,9 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
 	 * write i_size + 1 bytes. */
 	blocks = (bytes_left + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
 
-	mlog_entry("i_blocks = %llu, i_size = %llu, blocks = %d\n",
-			(unsigned long long)inode->i_blocks,
-			i_size_read(inode), blocks);
+	mlog(0, "i_blocks = %llu, i_size = %llu, blocks = %d\n",
+	     (unsigned long long)inode->i_blocks,
+	     i_size_read(inode), blocks);
 
 	/* Sanity check -- make sure we're going to fit. */
 	if (bytes_left >
@@ -1610,8 +1610,8 @@ static int ocfs2_symlink(struct inode *dir,
 	sigset_t oldset;
 	int did_block_signals = 0;
 
-	mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
-		   dentry, symname, dentry->d_name.len, dentry->d_name.name);
+	mlog(0, "(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
+	     dentry, symname, dentry->d_name.len, dentry->d_name.name);
 
 	dquot_initialize(dir);
 
@@ -1713,9 +1713,9 @@ static int ocfs2_symlink(struct inode *dir,
 		goto bail;
 	did_quota_inode = 1;
 
-	mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry,
-		   inode->i_mode, dentry->d_name.len,
-		   dentry->d_name.name);
+	mlog(0, "(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry,
+	     inode->i_mode, dentry->d_name.len,
+	     dentry->d_name.name);
 
 	status = ocfs2_mknod_locked(osb, dir, inode,
 				    0, &new_fe_bh, parent_fe_bh, handle,
@@ -1844,8 +1844,6 @@ static int ocfs2_blkno_stringify(u64 blkno, char *name)
 {
 	int status, namelen;
 
-	mlog_entry_void();
-
 	namelen = snprintf(name, OCFS2_ORPHAN_NAMELEN + 1, "%016llx",
 			   (long long)blkno);
 	if (namelen <= 0) {
@@ -1997,7 +1995,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
 	struct ocfs2_dinode *orphan_fe;
 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
 
-	mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
+	mlog(0, "(inode->i_ino = %lu)\n", inode->i_ino);
 
 	status = ocfs2_read_inode_block(orphan_dir_inode, &orphan_dir_bh);
 	if (status < 0) {
@@ -2078,8 +2076,6 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
 	int status = 0;
 	struct ocfs2_dir_lookup_result lookup = { NULL, };
 
-	mlog_entry_void();
-
 	status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name);
 	if (status < 0) {
 		mlog_errno(status);
@@ -2358,8 +2354,8 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
 	struct buffer_head *di_bh = NULL;
 	struct ocfs2_dir_lookup_result lookup = { NULL, };
 
-	mlog_entry("(0x%p, 0x%p, %.*s')\n", dir, dentry,
-		   dentry->d_name.len, dentry->d_name.name);
+	mlog(0, "(0x%p, 0x%p, %.*s')\n", dir, dentry,
+	     dentry->d_name.len, dentry->d_name.name);
 
 	status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
 	if (status < 0) {
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 4607923eb24c..33397bd7e1bc 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -343,8 +343,6 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
 	u64 pcount;
 	int status;
 
-	mlog_entry_void();
-
 	/* Read global header */
 	gqinode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type],
 			OCFS2_INVALID_SLOT);
@@ -596,7 +594,7 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type)
 	struct ocfs2_super *osb = OCFS2_SB(sb);
 	int status = 0;
 
-	mlog_entry("id=%u qtype=%u type=%lu device=%s\n", dquot->dq_id,
+	mlog(0, "id=%u qtype=%u type=%lu device=%s\n", dquot->dq_id,
 		   dquot->dq_type, type, sb->s_id);
 	if (type != dquot->dq_type)
 		goto out;
@@ -649,7 +647,7 @@ static int ocfs2_write_dquot(struct dquot *dquot)
 	struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
 	int status = 0;
 
-	mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
+	mlog(0, "id=%u, type=%d", dquot->dq_id, dquot->dq_type);
 
 	handle = ocfs2_start_trans(osb, OCFS2_QWRITE_CREDITS);
 	if (IS_ERR(handle)) {
@@ -688,7 +686,7 @@ static int ocfs2_release_dquot(struct dquot *dquot)
 	struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
 	int status = 0;
 
-	mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
+	mlog(0, "id=%u, type=%d", dquot->dq_id, dquot->dq_type);
 
 	mutex_lock(&dquot->dq_lock);
 	/* Check whether we are not racing with some other dqget() */
@@ -745,7 +743,7 @@ static int ocfs2_acquire_dquot(struct dquot *dquot)
 	int need_alloc = ocfs2_global_qinit_alloc(sb, type);
 	handle_t *handle;
 
-	mlog_entry("id=%u, type=%d", dquot->dq_id, type);
+	mlog(0, "id=%u, type=%d", dquot->dq_id, type);
 	mutex_lock(&dquot->dq_lock);
 	/*
 	 * We need an exclusive lock, because we're going to update use count
@@ -831,7 +829,7 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
 	handle_t *handle;
 	struct ocfs2_super *osb = OCFS2_SB(sb);
 
-	mlog_entry("id=%u, type=%d", dquot->dq_id, type);
+	mlog(0, "id=%u, type=%d", dquot->dq_id, type);
 
 	/* In case user set some limits, sync dquot immediately to global
 	 * quota file so that information propagates quicker */
@@ -879,8 +877,6 @@ static int ocfs2_write_info(struct super_block *sb, int type)
 	int status = 0;
 	struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
 
-	mlog_entry_void();
-
 	status = ocfs2_lock_global_qf(oinfo, 1);
 	if (status < 0)
 		goto out;
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index dc78764ccc4c..a71d68d88667 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -475,7 +475,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
 	struct ocfs2_recovery_chunk *rchunk, *next;
 	qsize_t spacechange, inodechange;
 
-	mlog_entry("ino=%lu type=%u", (unsigned long)lqinode->i_ino, type);
+	mlog(0, "ino=%lu type=%u", (unsigned long)lqinode->i_ino, type);
 
 	list_for_each_entry_safe(rchunk, next, &(rec->r_list[type]), rc_list) {
 		chunk = rchunk->rc_chunk;
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index dacd553d8617..e4a50e3fcb80 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -103,8 +103,8 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
 	u16 cl_bpc = le16_to_cpu(cl->cl_bpc);
 	u16 cl_cpg = le16_to_cpu(cl->cl_cpg);
 
-	mlog_entry("(new_clusters=%d, first_new_cluster = %u)\n",
-		   new_clusters, first_new_cluster);
+	mlog(0, "(new_clusters=%d, first_new_cluster = %u)\n",
+	     new_clusters, first_new_cluster);
 
 	ret = ocfs2_journal_access_gd(handle, INODE_CACHE(bm_inode),
 				      group_bh, OCFS2_JOURNAL_ACCESS_WRITE);
@@ -281,8 +281,6 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
 	u32 first_new_cluster;
 	u64 lgd_blkno;
 
-	mlog_entry_void();
-
 	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
 		return -EROFS;
 
@@ -472,8 +470,6 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
 	struct ocfs2_chain_rec *cr;
 	u16 cl_bpc;
 
-	mlog_entry_void();
-
 	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
 		return -EROFS;
 
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index ab4e0172cc1d..a6016e6b3ca2 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -475,8 +475,6 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
 	int slot;
 	struct ocfs2_slot_info *si;
 
-	mlog_entry_void();
-
 	si = osb->slot_info;
 
 	spin_lock(&osb->osb_lock);
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 71998d4d61d5..dd618496c2c6 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -389,8 +389,6 @@ static int ocfs2_block_group_fill(handle_t *handle,
 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
 	struct super_block * sb = alloc_inode->i_sb;
 
-	mlog_entry_void();
-
 	if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
 		ocfs2_error(alloc_inode->i_sb, "group block (%llu) != "
 			    "b_blocknr (%llu)",
@@ -707,8 +705,6 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
 
 	BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));
 
-	mlog_entry_void();
-
 	cl = &fe->id2.i_chain;
 	status = ocfs2_reserve_clusters_with_limit(osb,
 						   le16_to_cpu(cl->cl_cpg),
@@ -814,8 +810,6 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
 	struct ocfs2_dinode *fe;
 	u32 free_bits;
 
-	mlog_entry_void();
-
 	alloc_inode = ocfs2_get_system_file_inode(osb, type, slot);
 	if (!alloc_inode) {
 		mlog_errno(-EINVAL);
@@ -1189,8 +1183,6 @@ static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
 {
 	int status;
 
-	mlog_entry_void();
-
 	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
 	if (!(*ac)) {
 		status = -ENOMEM;
@@ -1357,8 +1349,6 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
 	void *bitmap = bg->bg_bitmap;
 	int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
 
-	mlog_entry_void();
-
 	/* All callers get the descriptor via
 	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
 	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
@@ -1888,8 +1878,6 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
 	struct ocfs2_chain_list *cl;
 	struct ocfs2_dinode *fe;
 
-	mlog_entry_void();
-
 	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
 	BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
 	BUG_ON(!ac->ac_bh);
@@ -2201,8 +2189,6 @@ int ocfs2_claim_new_inode(handle_t *handle,
 	int status;
 	struct ocfs2_suballoc_result res;
 
-	mlog_entry_void();
-
 	BUG_ON(!ac);
 	BUG_ON(ac->ac_bits_given != 0);
 	BUG_ON(ac->ac_bits_wanted != 1);
@@ -2307,8 +2293,6 @@ int __ocfs2_claim_clusters(handle_t *handle,
 	struct ocfs2_suballoc_result res = { .sr_blkno = 0, };
 	struct ocfs2_super *osb = OCFS2_SB(ac->ac_inode->i_sb);
 
-	mlog_entry_void();
-
 	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
 
 	BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
@@ -2392,8 +2376,6 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
 	unsigned int tmp;
 	struct ocfs2_group_desc *undo_bg = NULL;
 
-	mlog_entry_void();
-
 	/* The caller got this descriptor from
 	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
 	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
@@ -2463,8 +2445,6 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle,
 	struct buffer_head *group_bh = NULL;
 	struct ocfs2_group_desc *group;
 
-	mlog_entry_void();
-
 	/* The alloc_bh comes from ocfs2_free_dinode() or
 	 * ocfs2_free_clusters().  The callers have all locked the
 	 * allocator and gotten alloc_bh from the lock call.  This
@@ -2556,11 +2536,8 @@ static int _ocfs2_free_clusters(handle_t *handle,
 
 	/* You can't ever have a contiguous set of clusters
 	 * bigger than a block group bitmap so we never have to worry
-	 * about looping on them. */
-
-	mlog_entry_void();
-
-	/* This is expensive. We can safely remove once this stuff has
+	 * about looping on them.
+	 * This is expensive. We can safely remove once this stuff has
 	 * gotten tested really well. */
 	BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk)));
 
@@ -2756,7 +2733,7 @@ static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
 	struct buffer_head *inode_bh = NULL;
 	struct ocfs2_dinode *inode_fe;
 
-	mlog_entry("blkno: %llu\n", (unsigned long long)blkno);
+	mlog(0, "blkno: %llu\n", (unsigned long long)blkno);
 
 	/* dirty read disk */
 	status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh);
@@ -2816,8 +2793,8 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
 	u64 bg_blkno;
 	int status;
 
-	mlog_entry("blkno: %llu bit: %u\n", (unsigned long long)blkno,
-		   (unsigned int)bit);
+	mlog(0, "blkno: %llu bit: %u\n", (unsigned long long)blkno,
+	     (unsigned int)bit);
 
 	alloc_di = (struct ocfs2_dinode *)alloc_bh->b_data;
 	if ((bit + 1) > ocfs2_bits_per_group(&alloc_di->id2.i_chain)) {
@@ -2869,7 +2846,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
 	struct inode *inode_alloc_inode;
 	struct buffer_head *alloc_bh = NULL;
 
-	mlog_entry("blkno: %llu", (unsigned long long)blkno);
+	mlog(0, "blkno: %llu", (unsigned long long)blkno);
 
 	status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
 					     &group_blkno, &suballoc_bit);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 36c423fb0635..21c4bfdfd739 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -441,8 +441,6 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
 	int status = 0;
 	int i;
 
-	mlog_entry_void();
-
 	new = ocfs2_iget(osb, osb->root_blkno, OCFS2_FI_FLAG_SYSFILE, 0);
 	if (IS_ERR(new)) {
 		status = PTR_ERR(new);
@@ -488,8 +486,6 @@ static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb)
 	int status = 0;
 	int i;
 
-	mlog_entry_void();
-
 	for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1;
 	     i < NUM_SYSTEM_INODES;
 	     i++) {
@@ -517,8 +513,6 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
 	int i;
 	struct inode *inode;
 
-	mlog_entry_void();
-
 	for (i = 0; i < NUM_GLOBAL_SYSTEM_INODES; i++) {
 		inode = osb->global_system_inodes[i];
 		if (inode) {
@@ -1032,7 +1026,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 	char nodestr[8];
 	struct ocfs2_blockcheck_stats stats;
 
-	mlog_entry("%p, %p, %i", sb, data, silent);
+	mlog(0, "%p, %p, %i", sb, data, silent);
 
 	if (!ocfs2_parse_options(sb, data, &parsed_options, 0)) {
 		status = -EINVAL;
@@ -1320,8 +1314,8 @@ static int ocfs2_parse_options(struct super_block *sb,
 	char *p;
 	u32 tmp;
 
-	mlog_entry("remount: %d, options: \"%s\"\n", is_remount,
-		   options ? options : "(none)");
+	mlog(0, "remount: %d, options: \"%s\"\n", is_remount,
+	     options ? options : "(none)");
 
 	mopt->commit_interval = 0;
 	mopt->mount_opt = OCFS2_MOUNT_NOINTR;
@@ -1629,8 +1623,6 @@ static int __init ocfs2_init(void)
 {
 	int status;
 
-	mlog_entry_void();
-
 	ocfs2_print_version();
 
 	status = init_ocfs2_uptodate_cache();
@@ -1681,8 +1673,6 @@ leave:
 
 static void __exit ocfs2_exit(void)
 {
-	mlog_entry_void();
-
 	ocfs2_quota_shutdown();
 
 	if (ocfs2_wq) {
@@ -1705,7 +1695,7 @@ static void __exit ocfs2_exit(void)
 
 static void ocfs2_put_super(struct super_block *sb)
 {
-	mlog_entry("(0x%p)\n", sb);
+	mlog(0, "(0x%p)\n", sb);
 
 	ocfs2_sync_blockdev(sb);
 	ocfs2_dismount_volume(sb, 0);
@@ -1722,7 +1712,7 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
 	struct buffer_head *bh = NULL;
 	struct inode *inode = NULL;
 
-	mlog_entry("(%p, %p)\n", dentry->d_sb, buf);
+	mlog(0, "(%p, %p)\n", dentry->d_sb, buf);
 
 	osb = OCFS2_SB(dentry->d_sb);
 
@@ -1889,8 +1879,6 @@ static int ocfs2_mount_volume(struct super_block *sb)
 	int unlock_super = 0;
 	struct ocfs2_super *osb = OCFS2_SB(sb);
 
-	mlog_entry_void();
-
 	if (ocfs2_is_hard_readonly(osb))
 		goto leave;
 
@@ -1945,7 +1933,7 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
 	struct ocfs2_super *osb = NULL;
 	char nodestr[8];
 
-	mlog_entry("(0x%p)\n", sb);
+	mlog(0, "(0x%p)\n", sb);
 
 	BUG_ON(!sb);
 	osb = OCFS2_SB(sb);
@@ -2097,8 +2085,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	struct ocfs2_super *osb;
 	u64 total_blocks;
 
-	mlog_entry_void();
-
 	osb = kzalloc(sizeof(struct ocfs2_super), GFP_KERNEL);
 	if (!osb) {
 		status = -ENOMEM;
@@ -2403,8 +2389,6 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
 {
 	int status = -EAGAIN;
 
-	mlog_entry_void();
-
 	if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE,
 		   strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) {
 		/* We have to do a raw check of the feature here */
@@ -2472,8 +2456,6 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
 						  * recover
 						  * ourselves. */
 
-	mlog_entry_void();
-
 	/* Init our journal object. */
 	status = ocfs2_journal_init(osb->journal, &dirty);
 	if (status < 0) {
@@ -2568,8 +2550,6 @@ finally:
  */
 static void ocfs2_delete_osb(struct ocfs2_super *osb)
 {
-	mlog_entry_void();
-
 	/* This function assumes that the caller has the main osb resource */
 
 	ocfs2_free_slot_info(osb);
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index 9975457c981f..8a81688c57df 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -62,8 +62,6 @@ static char *ocfs2_fast_symlink_getlink(struct inode *inode,
 	char *link = NULL;
 	struct ocfs2_dinode *fe;
 
-	mlog_entry_void();
-
 	status = ocfs2_read_inode_block(inode, bh);
 	if (status < 0) {
 		mlog_errno(status);
@@ -88,8 +86,6 @@ static int ocfs2_readlink(struct dentry *dentry,
 	struct buffer_head *bh = NULL;
 	struct inode *inode = dentry->d_inode;
 
-	mlog_entry_void();
-
 	link = ocfs2_fast_symlink_getlink(inode, &bh);
 	if (IS_ERR(link)) {
 		ret = PTR_ERR(link);
@@ -117,8 +113,6 @@ static void *ocfs2_fast_follow_link(struct dentry *dentry,
 	struct inode *inode = dentry->d_inode;
 	struct buffer_head *bh = NULL;
 
-	mlog_entry_void();
-
 	BUG_ON(!ocfs2_inode_is_fast_symlink(inode));
 	target = ocfs2_fast_symlink_getlink(inode, &bh);
 	if (IS_ERR(target)) {
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 67cd43914641..81f98b7f7502 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -5538,7 +5538,7 @@ static int ocfs2_xattr_set_entry_bucket(struct inode *inode,
 	int ret;
 	struct ocfs2_xa_loc loc;
 
-	mlog_entry("Set xattr %s in xattr bucket\n", xi->xi_name);
+	mlog(0, "Set xattr %s in xattr bucket\n", xi->xi_name);
 
 	ocfs2_init_xattr_bucket_xa_loc(&loc, xs->bucket,
 				       xs->not_found ? NULL : xs->here);
@@ -5581,7 +5581,7 @@ static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
 {
 	int ret;
 
-	mlog_entry("Set xattr %s in xattr index block\n", xi->xi_name);
+	mlog(0, "Set xattr %s in xattr index block\n", xi->xi_name);
 
 	ret = ocfs2_xattr_set_entry_bucket(inode, xi, xs, ctxt);
 	if (!ret)
-- 
cgit v1.2.3


From c1e8d35ef5ffb393b94a192034b5e3541e005d75 Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Mon, 7 Mar 2011 16:43:21 +0800
Subject: ocfs2: Remove EXIT from masklog.

mlog_exit is used to record the exit status of a function.
But because it is added in so many functions, if we enable it,
the system logs get filled up quickly and cause too much I/O.
So actually no one can open it for a production system or even
for a test.

This patch just try to remove it or change it. So:
1. if all the error paths already use mlog_errno, it is just removed.
   Otherwise, it will be replaced by mlog_errno.
2. if it is used to print some return value, it is replaced with
   mlog(0,...).
mlog_exit_ptr is changed to mlog(0.
All those mlog(0,...) will be replaced with trace events later.

Signed-off-by: Tao Ma <boyu.mt@taobao.com>
---
 fs/ocfs2/alloc.c             | 26 ++++------------
 fs/ocfs2/aops.c              | 19 +++---------
 fs/ocfs2/buffer_head_io.c    |  7 +++--
 fs/ocfs2/cluster/heartbeat.c |  1 -
 fs/ocfs2/cluster/masklog.c   |  3 +-
 fs/ocfs2/cluster/masklog.h   | 44 ---------------------------
 fs/ocfs2/dcache.c            |  3 +-
 fs/ocfs2/dir.c               | 20 ++++++-------
 fs/ocfs2/dlm/dlmrecovery.c   |  5 +---
 fs/ocfs2/dlmglue.c           | 71 ++++----------------------------------------
 fs/ocfs2/export.c            |  6 ++--
 fs/ocfs2/extent_map.c        |  1 -
 fs/ocfs2/file.c              | 21 ++-----------
 fs/ocfs2/inode.c             | 15 ++--------
 fs/ocfs2/ioctl.c             |  2 --
 fs/ocfs2/journal.c           | 19 ------------
 fs/ocfs2/localalloc.c        | 35 ++++++++++++----------
 fs/ocfs2/mmap.c              |  2 +-
 fs/ocfs2/namei.c             | 47 +++++++++++++++++++----------
 fs/ocfs2/quota_global.c      | 17 ++++++-----
 fs/ocfs2/quota_local.c       |  3 +-
 fs/ocfs2/resize.c            |  6 ++--
 fs/ocfs2/slot_map.c          |  1 -
 fs/ocfs2/suballoc.c          | 54 ++++++++++++++++++++++-----------
 fs/ocfs2/super.c             | 37 +++++++++--------------
 fs/ocfs2/symlink.c           |  7 +++--
 fs/ocfs2/xattr.c             |  2 --
 27 files changed, 159 insertions(+), 315 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 47a6ce84e67f..80ffbe3dd32f 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -985,7 +985,7 @@ int ocfs2_num_free_extents(struct ocfs2_super *osb,
 bail:
 	brelse(eb_bh);
 
-	mlog_exit(retval);
+	mlog(0, "retval = %d\n", retval);
 	return retval;
 }
 
@@ -1070,8 +1070,8 @@ bail:
 			brelse(bhs[i]);
 			bhs[i] = NULL;
 		}
+		mlog_errno(status);
 	}
-	mlog_exit(status);
 	return status;
 }
 
@@ -1326,7 +1326,6 @@ bail:
 		kfree(new_eb_bhs);
 	}
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -1407,7 +1406,6 @@ static int ocfs2_shift_tree_depth(handle_t *handle,
 bail:
 	brelse(new_eb_bh);
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -1493,7 +1491,6 @@ static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et,
 bail:
 	brelse(bh);
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -4552,7 +4549,7 @@ static int ocfs2_figure_insert_type(struct ocfs2_extent_tree *et,
 					      ocfs2_et_get_last_eb_blk(et),
 					      &bh);
 		if (ret) {
-			mlog_exit(ret);
+			mlog_errno(ret);
 			goto out;
 		}
 		eb = (struct ocfs2_extent_block *) bh->b_data;
@@ -4716,7 +4713,6 @@ int ocfs2_insert_extent(handle_t *handle,
 bail:
 	brelse(last_eb_bh);
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -4818,7 +4814,6 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
 	}
 
 leave:
-	mlog_exit(status);
 	if (reason_ret)
 		*reason_ret = reason;
 	return status;
@@ -5029,7 +5024,7 @@ int ocfs2_split_extent(handle_t *handle,
 					      ocfs2_et_get_last_eb_blk(et),
 					      &last_eb_bh);
 		if (ret) {
-			mlog_exit(ret);
+			mlog_errno(ret);
 			goto out;
 		}
 
@@ -5849,7 +5844,6 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
 
 	osb->truncated_clusters += num_clusters;
 bail:
-	mlog_exit(status);
 	return status;
 }
 
@@ -5920,7 +5914,6 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
 	osb->truncated_clusters = 0;
 
 bail:
-	mlog_exit(status);
 	return status;
 }
 
@@ -5995,7 +5988,6 @@ out_mutex:
 	iput(data_alloc_inode);
 
 out:
-	mlog_exit(status);
 	return status;
 }
 
@@ -6023,8 +6015,6 @@ static void ocfs2_truncate_log_worker(struct work_struct *work)
 		mlog_errno(status);
 	else
 		ocfs2_init_steal_slots(osb);
-
-	mlog_exit(status);
 }
 
 #define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
@@ -6070,7 +6060,6 @@ static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
 	*tl_inode = inode;
 	*tl_bh    = bh;
 bail:
-	mlog_exit(status);
 	return status;
 }
 
@@ -6141,9 +6130,9 @@ bail:
 	if (status < 0 && (*tl_copy)) {
 		kfree(*tl_copy);
 		*tl_copy = NULL;
+		mlog_errno(status);
 	}
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -6201,7 +6190,6 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
 bail_up:
 	mutex_unlock(&tl_inode->i_mutex);
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -6221,8 +6209,6 @@ void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
 		brelse(osb->osb_tl_bh);
 		iput(osb->osb_tl_inode);
 	}
-
-	mlog_exit_void();
 }
 
 int ocfs2_truncate_log_init(struct ocfs2_super *osb)
@@ -6246,7 +6232,6 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
 	osb->osb_tl_bh    = tl_bh;
 	osb->osb_tl_inode = tl_inode;
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -7112,7 +7097,6 @@ bail:
 
 	ocfs2_free_path(path);
 
-	mlog_exit(status);
 	return status;
 }
 
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 5dbfc9e45195..9551518be435 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -123,7 +123,6 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
 bail:
 	brelse(bh);
 
-	mlog_exit(err);
 	return err;
 }
 
@@ -208,7 +207,6 @@ bail:
 	if (err < 0)
 		err = -EIO;
 
-	mlog_exit(err);
 	return err;
 }
 
@@ -323,7 +321,6 @@ out_inode_unlock:
 out:
 	if (unlock)
 		unlock_page(page);
-	mlog_exit(ret);
 	return ret;
 }
 
@@ -402,8 +399,6 @@ static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
 
 	ret = block_write_full_page(page, ocfs2_get_block, wbc);
 
-	mlog_exit(ret);
-
 	return ret;
 }
 
@@ -484,8 +479,6 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
 bail:
 	status = err ? 0 : p_blkno;
 
-	mlog_exit((int)status);
-
 	return status;
 }
 
@@ -616,7 +609,6 @@ static ssize_t ocfs2_direct_IO(int rw,
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_path.dentry->d_inode->i_mapping->host;
-	int ret;
 
 	/*
 	 * Fallback to buffered I/O if we see an inode without
@@ -629,13 +621,10 @@ static ssize_t ocfs2_direct_IO(int rw,
 	if (i_size_read(inode) <= offset)
 		return 0;
 
-	ret = __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
-				   iov, offset, nr_segs,
-				   ocfs2_direct_IO_get_blocks,
-				   ocfs2_dio_end_io, NULL, 0);
-
-	mlog_exit(ret);
-	return ret;
+	return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
+				    iov, offset, nr_segs,
+				    ocfs2_direct_IO_get_blocks,
+				    ocfs2_dio_end_io, NULL, 0);
 }
 
 static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 024bd45b7a48..c23846dbfc8f 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -66,6 +66,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
 	 * can get modified during recovery even if read-only. */
 	if (ocfs2_is_hard_readonly(osb)) {
 		ret = -EROFS;
+		mlog_errno(ret);
 		goto out;
 	}
 
@@ -91,11 +92,11 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
 		 * uptodate. */
 		ret = -EIO;
 		put_bh(bh);
+		mlog_errno(ret);
 	}
 
 	ocfs2_metadata_cache_io_unlock(ci);
 out:
-	mlog_exit(ret);
 	return ret;
 }
 
@@ -374,7 +375,6 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
 
 bail:
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -413,6 +413,7 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
 
 	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) {
 		ret = -EROFS;
+		mlog_errno(ret);
 		goto out;
 	}
 
@@ -432,9 +433,9 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
 	if (!buffer_uptodate(bh)) {
 		ret = -EIO;
 		put_bh(bh);
+		mlog_errno(ret);
 	}
 
 out:
-	mlog_exit(ret);
 	return ret;
 }
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 929178ce17a2..fc9e96a03a6b 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1679,7 +1679,6 @@ static int o2hb_populate_slot_data(struct o2hb_region *reg)
 	}
 
 out:
-	mlog_exit(ret);
 	return ret;
 }
 
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index bac64570c525..fdc302ddafe8 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -30,7 +30,7 @@
 
 struct mlog_bits mlog_and_bits = MLOG_BITS_RHS(MLOG_INITIAL_AND_MASK);
 EXPORT_SYMBOL_GPL(mlog_and_bits);
-struct mlog_bits mlog_not_bits = MLOG_BITS_RHS(MLOG_INITIAL_NOT_MASK);
+struct mlog_bits mlog_not_bits = MLOG_BITS_RHS(0);
 EXPORT_SYMBOL_GPL(mlog_not_bits);
 
 static ssize_t mlog_mask_show(u64 mask, char *buf)
@@ -80,7 +80,6 @@ struct mlog_attribute {
 }
 
 static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
-	define_mask(EXIT),
 	define_mask(TCP),
 	define_mask(MSG),
 	define_mask(SOCKET),
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 6e20877436b9..51ee790c7678 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -82,7 +82,6 @@
 
 /* bits that are frequently given and infrequently matched in the low word */
 /* NOTE: If you add a flag, you need to also update masklog.c! */
-#define ML_EXIT		0x0000000000000002ULL /* func call exit */
 #define ML_TCP		0x0000000000000004ULL /* net cluster/tcp.c */
 #define ML_MSG		0x0000000000000008ULL /* net network messages */
 #define ML_SOCKET	0x0000000000000010ULL /* net socket lifetime */
@@ -123,7 +122,6 @@
 #define ML_KTHREAD	0x4000000000000000ULL /* kernel thread activity */
 
 #define MLOG_INITIAL_AND_MASK (ML_ERROR|ML_NOTICE)
-#define MLOG_INITIAL_NOT_MASK (ML_EXIT)
 #ifndef MLOG_MASK_PREFIX
 #define MLOG_MASK_PREFIX 0
 #endif
@@ -221,48 +219,6 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits;
 		mlog(ML_ERROR, "status = %lld\n", (long long)_st);	\
 } while (0)
 
-#if defined(CONFIG_OCFS2_DEBUG_MASKLOG)
-/*
- * We disable this for sparse.
- */
-#if !defined(__CHECKER__)
-#define mlog_exit(st) do {						     \
-	if (__builtin_types_compatible_p(typeof(st), unsigned long))	     \
-		mlog(ML_EXIT, "EXIT: %lu\n", (unsigned long) (st));	     \
-	else if (__builtin_types_compatible_p(typeof(st), signed long))      \
-		mlog(ML_EXIT, "EXIT: %ld\n", (signed long) (st));	     \
-	else if (__builtin_types_compatible_p(typeof(st), unsigned int)	     \
-		 || __builtin_types_compatible_p(typeof(st), unsigned short) \
-		 || __builtin_types_compatible_p(typeof(st), unsigned char)) \
-		mlog(ML_EXIT, "EXIT: %u\n", (unsigned int) (st));	     \
-	else if (__builtin_types_compatible_p(typeof(st), signed int)	     \
-		 || __builtin_types_compatible_p(typeof(st), signed short)   \
-		 || __builtin_types_compatible_p(typeof(st), signed char))   \
-		mlog(ML_EXIT, "EXIT: %d\n", (signed int) (st));		     \
-	else if (__builtin_types_compatible_p(typeof(st), long long))	     \
-		mlog(ML_EXIT, "EXIT: %lld\n", (long long) (st));	     \
-	else								     \
-		mlog(ML_EXIT, "EXIT: %llu\n", (unsigned long long) (st));    \
-} while (0)
-#else
-#define mlog_exit(st) do {						     \
-	mlog(ML_EXIT, "EXIT: %lld\n", (long long) (st));		     \
-} while (0)
-#endif
-
-#define mlog_exit_ptr(ptr) do {						\
-	mlog(ML_EXIT, "EXIT: %p\n", ptr);				\
-} while (0)
-
-#define mlog_exit_void() do {						\
-	mlog(ML_EXIT, "EXIT\n");					\
-} while (0)
-#else
-#define mlog_exit(...)  do { } while (0)
-#define mlog_exit_ptr(...)  do { } while (0)
-#define mlog_exit_void(...)  do { } while (0)
-#endif  /* defined(CONFIG_OCFS2_DEBUG_MASKLOG) */
-
 #define mlog_bug_on_msg(cond, fmt, args...) do {			\
 	if (cond) {							\
 		mlog(ML_ERROR, "bug expression: " #cond "\n");		\
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 67d17a45773b..c26551c25e8e 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -123,8 +123,7 @@ valid:
 	ret = 1;
 
 bail:
-	mlog_exit(ret);
-
+	mlog(0, "ret = %d\n", ret);
 	return ret;
 }
 
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 2c983f083f8a..2fc6c3673bea 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -400,7 +400,7 @@ static int inline ocfs2_search_dirblock(struct buffer_head *bh,
 	}
 
 bail:
-	mlog_exit(ret);
+	mlog(0, "ret = %d\n", ret);
 	return ret;
 }
 
@@ -784,7 +784,7 @@ cleanup_and_exit:
 	for (; ra_ptr < ra_max; ra_ptr++)
 		brelse(bh_use[ra_ptr]);
 
-	mlog_exit_ptr(ret);
+	mlog(0, "ret = %p\n", ret);
 	return ret;
 }
 
@@ -1198,7 +1198,6 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
 		de = (struct ocfs2_dir_entry *)((char *)de + le16_to_cpu(de->rec_len));
 	}
 bail:
-	mlog_exit(status);
 	return status;
 }
 
@@ -1759,8 +1758,9 @@ int __ocfs2_add_entry(handle_t *handle,
 	 * from ever getting here. */
 	retval = -ENOSPC;
 bail:
+	if (retval)
+		mlog_errno(retval);
 
-	mlog_exit(retval);
 	return retval;
 }
 
@@ -2045,9 +2045,10 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
 				      dirent, filldir, NULL);
 
 	ocfs2_inode_unlock(inode, lock_level);
+	if (error)
+		mlog_errno(error);
 
 bail_nolock:
-	mlog_exit(error);
 
 	return error;
 }
@@ -2119,7 +2120,8 @@ int ocfs2_check_dir_for_entry(struct inode *dir,
 bail:
 	ocfs2_free_dir_lookup_result(&lookup);
 
-	mlog_exit(ret);
+	if (ret)
+		mlog_errno(ret);
 	return ret;
 }
 
@@ -2372,7 +2374,6 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
 bail:
 	brelse(new_bh);
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -3227,7 +3228,6 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
 bail:
 	if (did_quota && status < 0)
 		dquot_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1));
-	mlog_exit(status);
 	return status;
 }
 
@@ -3426,7 +3426,6 @@ bail:
 
 	brelse(new_bh);
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -3573,8 +3572,9 @@ next:
 	status = 0;
 bail:
 	brelse(bh);
+	if (status)
+		mlog_errno(status);
 
-	mlog_exit(status);
 	return status;
 }
 
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 6602b23c2ab3..f1beb6fc254d 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -727,7 +727,6 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
 	if (destroy)
 		dlm_destroy_recovery_area(dlm, dead_node);
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -1496,9 +1495,9 @@ leave:
 			kfree(buf);
 		if (item)
 			kfree(item);
+		mlog_errno(ret);
 	}
 
-	mlog_exit(ret);
 	return ret;
 }
 
@@ -1567,7 +1566,6 @@ leave:
 		dlm_lockres_put(res);
 	}
 	kfree(data);
-	mlog_exit(ret);
 }
 
 
@@ -1986,7 +1984,6 @@ leave:
 			dlm_lock_put(newlock);
 	}
 
-	mlog_exit(ret);
 	return ret;
 }
 
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index c2f3fa1c4722..7eb4b8da5cbc 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -406,8 +406,6 @@ static void ocfs2_build_lock_name(enum ocfs2_lock_type type,
 	BUG_ON(len != (OCFS2_LOCK_ID_MAX_LEN - 1));
 
 	mlog(0, "built lock resource with name: %s\n", name);
-
-	mlog_exit_void();
 }
 
 static DEFINE_SPINLOCK(ocfs2_dlm_tracking_lock);
@@ -752,7 +750,6 @@ void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
 	memset(&res->l_lksb, 0, sizeof(res->l_lksb));
 
 	res->l_flags = 0UL;
-	mlog_exit_void();
 }
 
 static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres,
@@ -770,8 +767,6 @@ static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres,
 	default:
 		BUG();
 	}
-
-	mlog_exit_void();
 }
 
 static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres,
@@ -791,7 +786,6 @@ static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres,
 	default:
 		BUG();
 	}
-	mlog_exit_void();
 }
 
 /* WARNING: This function lives in a world where the only three lock
@@ -850,8 +844,6 @@ static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res
 		lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED);
 	}
 	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
-
-	mlog_exit_void();
 }
 
 static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lockres)
@@ -877,8 +869,6 @@ static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lo
 	lockres_or_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING);
 
 	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
-
-	mlog_exit_void();
 }
 
 static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *lockres)
@@ -894,8 +884,6 @@ static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *loc
 	lockres->l_level = lockres->l_requested;
 	lockres_or_flags(lockres, OCFS2_LOCK_ATTACHED);
 	lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
-
-	mlog_exit_void();
 }
 
 static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
@@ -923,8 +911,7 @@ static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
 
 	if (needs_downconvert)
 		lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
-
-	mlog_exit(needs_downconvert);
+	mlog(0, "needs_downconvert = %d\n", needs_downconvert);
 	return needs_downconvert;
 }
 
@@ -1145,7 +1132,6 @@ static void ocfs2_unlock_ast(struct ocfs2_dlm_lksb *lksb, int error)
 		     "unlock_action %d\n", error, lockres->l_name,
 		     lockres->l_unlock_action);
 		spin_unlock_irqrestore(&lockres->l_lock, flags);
-		mlog_exit_void();
 		return;
 	}
 
@@ -1169,8 +1155,6 @@ static void ocfs2_unlock_ast(struct ocfs2_dlm_lksb *lksb, int error)
 	lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
 	wake_up(&lockres->l_event);
 	spin_unlock_irqrestore(&lockres->l_lock, flags);
-
-	mlog_exit_void();
 }
 
 /*
@@ -1226,7 +1210,6 @@ static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
 	spin_unlock_irqrestore(&lockres->l_lock, flags);
 
 	wake_up(&lockres->l_event);
-	mlog_exit_void();
 }
 
 /* Note: If we detect another process working on the lock (i.e.,
@@ -1273,7 +1256,6 @@ static int ocfs2_lock_create(struct ocfs2_super *osb,
 	mlog(0, "lock %s, return from ocfs2_dlm_lock\n", lockres->l_name);
 
 bail:
-	mlog_exit(ret);
 	return ret;
 }
 
@@ -1561,7 +1543,6 @@ out:
 				caller_ip);
 	}
 #endif
-	mlog_exit(ret);
 	return ret;
 }
 
@@ -1591,7 +1572,6 @@ static void __ocfs2_cluster_unlock(struct ocfs2_super *osb,
 	if (lockres->l_lockdep_map.key != NULL)
 		rwsem_release(&lockres->l_lockdep_map, 1, caller_ip);
 #endif
-	mlog_exit_void();
 }
 
 static int ocfs2_create_new_lock(struct ocfs2_super *osb,
@@ -1658,7 +1638,6 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
 	}
 
 bail:
-	mlog_exit(ret);
 	return ret;
 }
 
@@ -1674,10 +1653,8 @@ int ocfs2_rw_lock(struct inode *inode, int write)
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
 	     write ? "EXMODE" : "PRMODE");
 
-	if (ocfs2_mount_local(osb)) {
-		mlog_exit(0);
+	if (ocfs2_mount_local(osb))
 		return 0;
-	}
 
 	lockres = &OCFS2_I(inode)->ip_rw_lockres;
 
@@ -1688,7 +1665,6 @@ int ocfs2_rw_lock(struct inode *inode, int write)
 	if (status < 0)
 		mlog_errno(status);
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -1704,8 +1680,6 @@ void ocfs2_rw_unlock(struct inode *inode, int write)
 
 	if (!ocfs2_mount_local(osb))
 		ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
-
-	mlog_exit_void();
 }
 
 /*
@@ -1733,7 +1707,6 @@ int ocfs2_open_lock(struct inode *inode)
 		mlog_errno(status);
 
 out:
-	mlog_exit(status);
 	return status;
 }
 
@@ -1766,7 +1739,6 @@ int ocfs2_try_open_lock(struct inode *inode, int write)
 				    level, DLM_LKF_NOQUEUE, 0);
 
 out:
-	mlog_exit(status);
 	return status;
 }
 
@@ -1792,7 +1764,7 @@ void ocfs2_open_unlock(struct inode *inode)
 				     DLM_LOCK_EX);
 
 out:
-	mlog_exit_void();
+	return;
 }
 
 static int ocfs2_flock_handle_signal(struct ocfs2_lock_res *lockres,
@@ -2028,8 +2000,6 @@ static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
 
 	if (kick)
 		ocfs2_wake_downconvert_thread(osb);
-
-	mlog_exit_void();
 }
 
 #define OCFS2_SEC_BITS   34
@@ -2089,8 +2059,6 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
 
 out:
 	mlog_meta_lvb(0, lockres);
-
-	mlog_exit_void();
 }
 
 static void ocfs2_unpack_timespec(struct timespec *spec,
@@ -2136,8 +2104,6 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
 	ocfs2_unpack_timespec(&inode->i_ctime,
 			      be64_to_cpu(lvb->lvb_ictime_packed));
 	spin_unlock(&oi->ip_lock);
-
-	mlog_exit_void();
 }
 
 static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode,
@@ -2184,7 +2150,7 @@ refresh_check:
 
 	status = 1;
 bail:
-	mlog_exit(status);
+	mlog(0, "status %d\n", status);
 	return status;
 }
 
@@ -2202,8 +2168,6 @@ static inline void ocfs2_complete_lock_res_refresh(struct ocfs2_lock_res *lockre
 	spin_unlock_irqrestore(&lockres->l_lock, flags);
 
 	wake_up(&lockres->l_event);
-
-	mlog_exit_void();
 }
 
 /* may or may not return a bh if it went to disk. */
@@ -2284,7 +2248,6 @@ static int ocfs2_inode_lock_update(struct inode *inode,
 bail_refresh:
 	ocfs2_complete_lock_res_refresh(lockres, status);
 bail:
-	mlog_exit(status);
 	return status;
 }
 
@@ -2419,7 +2382,6 @@ bail:
 	if (local_bh)
 		brelse(local_bh);
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -2496,7 +2458,6 @@ int ocfs2_inode_lock_atime(struct inode *inode,
 	} else
 		*level = 0;
 
-	mlog_exit(ret);
 	return ret;
 }
 
@@ -2514,8 +2475,6 @@ void ocfs2_inode_unlock(struct inode *inode,
 	if (!ocfs2_is_hard_readonly(OCFS2_SB(inode->i_sb)) &&
 	    !ocfs2_mount_local(osb))
 		ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, level);
-
-	mlog_exit_void();
 }
 
 int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno)
@@ -2597,7 +2556,6 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
 		ocfs2_track_lock_refresh(lockres);
 	}
 bail:
-	mlog_exit(status);
 	return status;
 }
 
@@ -3057,7 +3015,6 @@ bail:
 			kthread_stop(osb->dc_task);
 	}
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -3086,8 +3043,6 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
 	osb->cconn = NULL;
 
 	ocfs2_dlm_shutdown_debug(osb);
-
-	mlog_exit_void();
 }
 
 static int ocfs2_drop_lock(struct ocfs2_super *osb,
@@ -3169,7 +3124,6 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb,
 
 	ocfs2_wait_on_busy_lock(lockres);
 out:
-	mlog_exit(0);
 	return 0;
 }
 
@@ -3251,7 +3205,6 @@ int ocfs2_drop_inode_locks(struct inode *inode)
 	if (err < 0 && !status)
 		status = err;
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -3314,7 +3267,6 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
 
 	ret = 0;
 bail:
-	mlog_exit(ret);
 	return ret;
 }
 
@@ -3362,7 +3314,6 @@ static int ocfs2_cancel_convert(struct ocfs2_super *osb,
 
 	mlog(ML_BASTS, "lockres %s\n", lockres->l_name);
 
-	mlog_exit(ret);
 	return ret;
 }
 
@@ -3552,14 +3503,14 @@ downconvert:
 				     gen);
 
 leave:
-	mlog_exit(ret);
+	if (ret)
+		mlog_errno(ret);
 	return ret;
 
 leave_requeue:
 	spin_unlock_irqrestore(&lockres->l_lock, flags);
 	ctl->requeue = 1;
 
-	mlog_exit(0);
 	return 0;
 }
 
@@ -3800,8 +3751,6 @@ static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres)
 	lvb->lvb_blocks = cpu_to_be32(oinfo->dqi_gi.dqi_blocks);
 	lvb->lvb_free_blk = cpu_to_be32(oinfo->dqi_gi.dqi_free_blk);
 	lvb->lvb_free_entry = cpu_to_be32(oinfo->dqi_gi.dqi_free_entry);
-
-	mlog_exit_void();
 }
 
 void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex)
@@ -3812,7 +3761,6 @@ void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex)
 
 	if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb))
 		ocfs2_cluster_unlock(osb, lockres, level);
-	mlog_exit_void();
 }
 
 static int ocfs2_refresh_qinfo(struct ocfs2_mem_dqinfo *oinfo)
@@ -3889,7 +3837,6 @@ int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex)
 		ocfs2_qinfo_unlock(oinfo, ex);
 	ocfs2_complete_lock_res_refresh(lockres, status);
 bail:
-	mlog_exit(status);
 	return status;
 }
 
@@ -3968,8 +3915,6 @@ unqueue:
 	if (ctl.unblock_action != UNBLOCK_CONTINUE
 	    && lockres->l_ops->post_unlock)
 		lockres->l_ops->post_unlock(osb, lockres);
-
-	mlog_exit_void();
 }
 
 static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
@@ -3995,8 +3940,6 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
 		osb->blocked_lock_count++;
 	}
 	spin_unlock(&osb->dc_task_lock);
-
-	mlog_exit_void();
 }
 
 static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb)
@@ -4027,8 +3970,6 @@ static void ocfs2_downconvert_thread_do_work(struct ocfs2_super *osb)
 		spin_lock(&osb->dc_task_lock);
 	}
 	spin_unlock(&osb->dc_task_lock);
-
-	mlog_exit_void();
 }
 
 static int ocfs2_downconvert_thread_lists_empty(struct ocfs2_super *osb)
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index b90c751ba938..96d7ed7f8305 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -141,7 +141,7 @@ check_gen:
 		mlog_errno(PTR_ERR(result));
 
 bail:
-	mlog_exit_ptr(result);
+	mlog(0, "result = %p\n", result);
 	return result;
 }
 
@@ -178,7 +178,7 @@ bail_unlock:
 	ocfs2_inode_unlock(dir, 0);
 
 bail:
-	mlog_exit_ptr(parent);
+	mlog(0, "parent %p\n", parent);
 
 	return parent;
 }
@@ -243,7 +243,7 @@ static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len,
 	*max_len = len;
 
 bail:
-	mlog_exit(type);
+	mlog(0, "type = %d\n", type);
 	return type;
 }
 
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index ed9916a9a271..7d74d176706a 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -897,7 +897,6 @@ int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
 	}
 
 out:
-	mlog_exit(rc);
 	return rc;
 }
 
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 71375d7b7bbe..1e1a93aa0526 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -135,7 +135,6 @@ static int ocfs2_file_open(struct inode *inode, struct file *file)
 	}
 
 leave:
-	mlog_exit(status);
 	return status;
 }
 
@@ -154,8 +153,6 @@ static int ocfs2_file_release(struct inode *inode, struct file *file)
 
 	ocfs2_free_file_private(inode, file);
 
-	mlog_exit(0);
-
 	return 0;
 }
 
@@ -195,7 +192,8 @@ static int ocfs2_sync_file(struct file *file, int datasync)
 	err = jbd2_journal_force_commit(journal);
 
 bail:
-	mlog_exit(err);
+	if (err)
+		mlog_errno(err);
 
 	return (err < 0) ? -EIO : 0;
 }
@@ -278,7 +276,6 @@ int ocfs2_update_inode_atime(struct inode *inode,
 out_commit:
 	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
 out:
-	mlog_exit(ret);
 	return ret;
 }
 
@@ -300,7 +297,6 @@ static int ocfs2_set_inode_size(handle_t *handle,
 	}
 
 bail:
-	mlog_exit(status);
 	return status;
 }
 
@@ -424,8 +420,6 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
 out_commit:
 	ocfs2_commit_trans(osb, handle);
 out:
-
-	mlog_exit(status);
 	return status;
 }
 
@@ -520,7 +514,6 @@ bail:
 	if (!status && OCFS2_I(inode)->ip_clusters == 0)
 		status = ocfs2_try_remove_refcount_tree(inode, di_bh);
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -713,7 +706,6 @@ leave:
 	brelse(bh);
 	bh = NULL;
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -1269,7 +1261,6 @@ bail:
 			mlog_errno(status);
 	}
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -1295,8 +1286,6 @@ int ocfs2_getattr(struct vfsmount *mnt,
 	stat->blksize = osb->s_clustersize;
 
 bail:
-	mlog_exit(err);
-
 	return err;
 }
 
@@ -1318,7 +1307,6 @@ int ocfs2_permission(struct inode *inode, int mask, unsigned int flags)
 
 	ocfs2_inode_unlock(inode, 0);
 out:
-	mlog_exit(ret);
 	return ret;
 }
 
@@ -1359,7 +1347,6 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
 out_trans:
 	ocfs2_commit_trans(osb, handle);
 out:
-	mlog_exit(ret);
 	return ret;
 }
 
@@ -2393,7 +2380,6 @@ out_sems:
 
 	if (written)
 		ret = written;
-	mlog_exit(ret);
 	return ret;
 }
 
@@ -2476,7 +2462,6 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
 		balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
 	}
 
-	mlog_exit(ret);
 	return ret;
 }
 
@@ -2507,7 +2492,6 @@ static ssize_t ocfs2_file_splice_read(struct file *in,
 	ret = generic_file_splice_read(in, ppos, pipe, len, flags);
 
 bail:
-	mlog_exit(ret);
 	return ret;
 }
 
@@ -2588,7 +2572,6 @@ bail:
 	}
 	if (rw_level != -1)
 		ocfs2_rw_unlock(inode, rw_level);
-	mlog_exit(ret);
 
 	return ret;
 }
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 16ee8e96f731..a96e56f12728 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -172,7 +172,7 @@ bail:
 	if (!IS_ERR(inode)) {
 		mlog(0, "returning inode with number %llu\n",
 		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
-		mlog_exit_ptr(inode);
+		mlog(0, "inode %p\n", inode);
 	}
 
 	return inode;
@@ -203,7 +203,6 @@ static int ocfs2_find_actor(struct inode *inode, void *opaque)
 
 	ret = 1;
 bail:
-	mlog_exit(ret);
 	return ret;
 }
 
@@ -235,7 +234,6 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
 		lockdep_set_class(&OCFS2_I(inode)->ip_alloc_sem,
 				  &ocfs2_file_ip_alloc_sem_key);
 
-	mlog_exit(0);
 	return 0;
 }
 
@@ -381,7 +379,6 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 	if (S_ISDIR(inode->i_mode))
 		ocfs2_resv_set_type(&OCFS2_I(inode)->ip_la_data_resv,
 				    OCFS2_RESV_FLAG_DIR);
-	mlog_exit_void();
 }
 
 static int ocfs2_read_locked_inode(struct inode *inode,
@@ -534,7 +531,6 @@ bail:
 	if (args && bh)
 		brelse(bh);
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -598,7 +594,6 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
 out:
 	if (handle)
 		ocfs2_commit_trans(osb, handle);
-	mlog_exit(status);
 	return status;
 }
 
@@ -1078,7 +1073,7 @@ bail_unlock_nfs_sync:
 bail_unblock:
 	ocfs2_unblock_signals(&oldset);
 bail:
-	mlog_exit_void();
+	return;
 }
 
 static void ocfs2_clear_inode(struct inode *inode)
@@ -1177,8 +1172,6 @@ static void ocfs2_clear_inode(struct inode *inode)
 	 */
 	jbd2_journal_release_jbd_inode(OCFS2_SB(inode->i_sb)->journal->j_journal,
 				       &oi->ip_jinode);
-
-	mlog_exit_void();
 }
 
 void ocfs2_evict_inode(struct inode *inode)
@@ -1208,7 +1201,6 @@ int ocfs2_drop_inode(struct inode *inode)
 	else
 		res = generic_drop_inode(inode);
 
-	mlog_exit_void();
 	return res;
 }
 
@@ -1248,8 +1240,6 @@ int ocfs2_inode_revalidate(struct dentry *dentry)
 	}
 	ocfs2_inode_unlock(inode, 0);
 bail:
-	mlog_exit(status);
-
 	return status;
 }
 
@@ -1296,7 +1286,6 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
 
 	ocfs2_journal_dirty(handle, bh);
 leave:
-	mlog_exit(status);
 	return status;
 }
 
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 7a4868196152..b4f51e94c8f5 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -59,7 +59,6 @@ static int ocfs2_get_inode_attr(struct inode *inode, unsigned *flags)
 	*flags = OCFS2_I(inode)->ip_attr;
 	ocfs2_inode_unlock(inode, 0);
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -125,7 +124,6 @@ bail:
 
 	brelse(bh);
 
-	mlog_exit(status);
 	return status;
 }
 
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 7378277b17a4..956509167bf5 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -335,7 +335,6 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
 	ocfs2_wake_downconvert_thread(osb);
 	wake_up(&journal->j_checkpointed);
 finally:
-	mlog_exit(status);
 	return status;
 }
 
@@ -450,8 +449,6 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
 
 	status = 0;
 bail:
-
-	mlog_exit(status);
 	return status;
 }
 
@@ -665,7 +662,6 @@ static int __ocfs2_journal_access(handle_t *handle,
 		mlog(ML_ERROR, "Error %d getting %d access to buffer!\n",
 		     status, type);
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -739,8 +735,6 @@ void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh)
 
 	status = jbd2_journal_dirty_metadata(handle, bh);
 	BUG_ON(status);
-
-	mlog_exit_void();
 }
 
 #define OCFS2_DEFAULT_COMMIT_INTERVAL	(HZ * JBD2_DEFAULT_MAX_COMMIT_AGE)
@@ -854,7 +848,6 @@ done:
 		}
 	}
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -899,7 +892,6 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
 	if (status < 0)
 		mlog_errno(status);
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -989,7 +981,6 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
 done:
 	if (inode)
 		iput(inode);
-	mlog_exit_void();
 }
 
 static void ocfs2_clear_journal_error(struct super_block *sb,
@@ -1048,7 +1039,6 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed)
 		osb->commit_task = NULL;
 
 done:
-	mlog_exit(status);
 	return status;
 }
 
@@ -1072,7 +1062,6 @@ int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full)
 		mlog_errno(status);
 
 bail:
-	mlog_exit(status);
 	return status;
 }
 
@@ -1146,7 +1135,6 @@ static int ocfs2_force_read_journal(struct inode *inode)
 bail:
 	for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++)
 		brelse(bhs[i]);
-	mlog_exit(status);
 	return status;
 }
 
@@ -1236,7 +1224,6 @@ void ocfs2_complete_recovery(struct work_struct *work)
 	}
 
 	mlog(0, "Recovery completion\n");
-	mlog_exit_void();
 }
 
 /* NOTE: This function always eats your references to la_dinode and
@@ -1432,7 +1419,6 @@ bail:
 	if (rm_quota)
 		kfree(rm_quota);
 
-	mlog_exit(status);
 	/* no one is callint kthread_stop() for us so the kthread() api
 	 * requires that we call do_exit().  And it isn't exported, but
 	 * complete_and_exit() seems to be a minimal wrapper around it. */
@@ -1469,8 +1455,6 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
 out:
 	mutex_unlock(&osb->recovery_lock);
 	wake_up(&osb->recovery_event);
-
-	mlog_exit_void();
 }
 
 static int ocfs2_read_journal_inode(struct ocfs2_super *osb,
@@ -1646,7 +1630,6 @@ done:
 
 	brelse(bh);
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -1716,7 +1699,6 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
 	status = 0;
 done:
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -1826,7 +1808,6 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
 
 	status = 0;
 bail:
-	mlog_exit(status);
 	return status;
 }
 
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 5f08c703f354..36636e1c84e7 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -364,7 +364,8 @@ bail:
 
 	mlog(0, "Local alloc window bits = %d\n", osb->local_alloc_bits);
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -478,8 +479,6 @@ out:
 
 	if (alloc_copy)
 		kfree(alloc_copy);
-
-	mlog_exit_void();
 }
 
 /*
@@ -548,7 +547,8 @@ bail:
 		iput(inode);
 	}
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -614,7 +614,8 @@ out_mutex:
 out:
 	if (!status)
 		ocfs2_init_steal_slots(osb);
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -724,7 +725,8 @@ bail:
 	mlog(0, "bits=%d, slot=%d, ret=%d\n", bits_wanted, osb->slot_num,
 	     status);
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -779,7 +781,8 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
 	ocfs2_journal_dirty(handle, osb->local_alloc_bh);
 
 bail:
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -794,7 +797,7 @@ static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc)
 	for (i = 0; i < le16_to_cpu(la->la_size); i++)
 		count += hweight8(buffer[i]);
 
-	mlog_exit(count);
+	mlog(0, "count %u\n", count);
 	return count;
 }
 
@@ -884,7 +887,7 @@ bail:
 	if (local_resv)
 		ocfs2_resv_discard(resmap, resv);
 
-	mlog_exit(bitoff);
+	mlog(0, "bitoff %d\n", bitoff);
 	return bitoff;
 }
 
@@ -898,8 +901,6 @@ static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc)
 	la->la_bm_off = 0;
 	for(i = 0; i < le16_to_cpu(la->la_size); i++)
 		la->la_bitmap[i] = 0;
-
-	mlog_exit_void();
 }
 
 #if 0
@@ -995,7 +996,8 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
 	}
 
 bail:
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -1120,7 +1122,8 @@ bail:
 		*ac = NULL;
 	}
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -1212,7 +1215,8 @@ retry_enospc:
 	mlog(0, "window bits = %u\n", le32_to_cpu(alloc->id1.bitmap1.i_total));
 
 bail:
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -1308,7 +1312,8 @@ bail:
 	if (ac)
 		ocfs2_free_alloc_context(ac);
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 87438e178e11..49730b51ae45 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -55,7 +55,7 @@ static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
 	ret = filemap_fault(area, vmf);
 	ocfs2_unblock_signals(&oldset);
 
-	mlog_exit_ptr(vmf->page);
+	mlog(0, "%p\n", vmf->page);
 	return ret;
 }
 
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index fdf5c0b75caa..8a13226eb1a2 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -182,7 +182,7 @@ bail_unlock:
 
 bail:
 
-	mlog_exit_ptr(ret);
+	mlog(0, "%p\n", ret);
 
 	return ret;
 }
@@ -466,7 +466,8 @@ leave:
 		iput(inode);
 	}
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 
 	return status;
 }
@@ -577,7 +578,8 @@ leave:
 		}
 	}
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -618,7 +620,8 @@ static int ocfs2_mkdir(struct inode *dir,
 	mlog(0, "(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode,
 	     dentry->d_name.len, dentry->d_name.name);
 	ret = ocfs2_mknod(dir, dentry, mode | S_IFDIR, 0);
-	mlog_exit(ret);
+	if (ret)
+		mlog_errno(ret);
 
 	return ret;
 }
@@ -633,7 +636,8 @@ static int ocfs2_create(struct inode *dir,
 	mlog(0, "(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode,
 	     dentry->d_name.len, dentry->d_name.name);
 	ret = ocfs2_mknod(dir, dentry, mode | S_IFREG, 0);
-	mlog_exit(ret);
+	if (ret)
+		mlog_errno(ret);
 
 	return ret;
 }
@@ -757,7 +761,8 @@ out:
 
 	ocfs2_free_dir_lookup_result(&lookup);
 
-	mlog_exit(err);
+	if (err)
+		mlog_errno(err);
 
 	return err;
 }
@@ -954,7 +959,8 @@ leave:
 	ocfs2_free_dir_lookup_result(&orphan_insert);
 	ocfs2_free_dir_lookup_result(&lookup);
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 
 	return status;
 }
@@ -1025,7 +1031,8 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
 	}
 
 bail:
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -1476,7 +1483,8 @@ bail:
 	brelse(old_dir_bh);
 	brelse(new_dir_bh);
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 
 	return status;
 }
@@ -1579,7 +1587,8 @@ bail:
 		kfree(bhs);
 	}
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -1835,7 +1844,8 @@ bail:
 		iput(inode);
 	}
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 
 	return status;
 }
@@ -1865,7 +1875,8 @@ static int ocfs2_blkno_stringify(u64 blkno, char *name)
 
 	status = 0;
 bail:
-	mlog_exit(status);
+	if (status < 0)
+		mlog_errno(status);
 	return status;
 }
 
@@ -1978,7 +1989,8 @@ out:
 		iput(orphan_dir_inode);
 	}
 
-	mlog_exit(ret);
+	if (ret)
+		mlog_errno(ret);
 	return ret;
 }
 
@@ -2060,7 +2072,8 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
 leave:
 	brelse(orphan_dir_bh);
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -2120,7 +2133,8 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
 leave:
 	ocfs2_free_dir_lookup_result(&lookup);
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -2472,7 +2486,8 @@ leave:
 
 	ocfs2_free_dir_lookup_result(&lookup);
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 
 	return status;
 }
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 33397bd7e1bc..10a49e8a5f03 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -402,7 +402,8 @@ int ocfs2_global_read_info(struct super_block *sb, int type)
 			   msecs_to_jiffies(oinfo->dqi_syncms));
 
 out_err:
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 out_unlock:
 	ocfs2_unlock_global_qf(oinfo, 0);
@@ -621,7 +622,6 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type)
 out_ilock:
 	ocfs2_unlock_global_qf(oinfo, 1);
 out:
-	mlog_exit(status);
 	return status;
 }
 
@@ -660,7 +660,6 @@ static int ocfs2_write_dquot(struct dquot *dquot)
 	mutex_unlock(&sb_dqopt(dquot->dq_sb)->dqio_mutex);
 	ocfs2_commit_trans(osb, handle);
 out:
-	mlog_exit(status);
 	return status;
 }
 
@@ -722,7 +721,8 @@ out_ilock:
 	ocfs2_unlock_global_qf(oinfo, 1);
 out:
 	mutex_unlock(&dquot->dq_lock);
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -809,7 +809,8 @@ out_dq:
 	set_bit(DQ_ACTIVE_B, &dquot->dq_flags);
 out:
 	mutex_unlock(&dquot->dq_lock);
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -866,7 +867,8 @@ out_dlock:
 out_ilock:
 	ocfs2_unlock_global_qf(oinfo, 1);
 out:
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -891,7 +893,8 @@ static int ocfs2_write_info(struct super_block *sb, int type)
 out_ilock:
 	ocfs2_unlock_global_qf(oinfo, 1);
 out:
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index a71d68d88667..88ec4516637b 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -575,7 +575,8 @@ out_put_bh:
 	}
 	if (status < 0)
 		free_recovery_list(&(rec->r_list[type]));
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index e4a50e3fcb80..76b37909a4a1 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -82,7 +82,6 @@ static u16 ocfs2_calc_new_backup_super(struct inode *inode,
 		backups++;
 	}
 
-	mlog_exit_void();
 	return backups;
 }
 
@@ -176,7 +175,8 @@ out_rollback:
 		le16_add_cpu(&group->bg_free_bits_count, -1 * num_bits);
 	}
 out:
-	mlog_exit(ret);
+	if (ret)
+		mlog_errno(ret);
 	return ret;
 }
 
@@ -375,7 +375,6 @@ out_mutex:
 	iput(main_bm_inode);
 
 out:
-	mlog_exit_void();
 	return ret;
 }
 
@@ -585,6 +584,5 @@ out_mutex:
 	iput(main_bm_inode);
 
 out:
-	mlog_exit_void();
 	return ret;
 }
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index a6016e6b3ca2..93b449ed82f9 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -510,7 +510,6 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
 		mlog_errno(status);
 
 bail:
-	mlog_exit(status);
 	return status;
 }
 
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index dd618496c2c6..2d722f1fbaa0 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -434,7 +434,8 @@ static int ocfs2_block_group_fill(handle_t *handle,
 	 * allocation time. */
 
 bail:
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -792,7 +793,8 @@ bail:
 
 	brelse(bg_bh);
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -884,7 +886,8 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
 bail:
 	brelse(bh);
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -1046,7 +1049,8 @@ bail:
 		*ac = NULL;
 	}
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -1146,7 +1150,8 @@ bail:
 		*ac = NULL;
 	}
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -1221,7 +1226,8 @@ bail:
 		*ac = NULL;
 	}
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -1384,7 +1390,8 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
 	ocfs2_journal_dirty(handle, group_bh);
 
 bail:
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -1474,7 +1481,8 @@ out_rollback:
 		prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
 	}
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -1860,7 +1868,8 @@ bail:
 	brelse(group_bh);
 	brelse(prev_group_bh);
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -1972,7 +1981,8 @@ set_hint:
 	}
 
 bail:
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -2009,7 +2019,8 @@ int ocfs2_claim_metadata(handle_t *handle,
 	*num_bits = res.sr_bits;
 	status = 0;
 bail:
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -2216,7 +2227,8 @@ int ocfs2_claim_new_inode(handle_t *handle,
 	ocfs2_save_inode_ac_group(dir, ac);
 	status = 0;
 bail:
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -2347,7 +2359,8 @@ int __ocfs2_claim_clusters(handle_t *handle,
 	ac->ac_bits_given += *num_clusters;
 
 bail:
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -2491,7 +2504,8 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle,
 bail:
 	brelse(group_bh);
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -2563,7 +2577,8 @@ static int _ocfs2_free_clusters(handle_t *handle,
 					 num_clusters);
 
 out:
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -2770,7 +2785,8 @@ static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
 bail:
 	brelse(inode_bh);
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -2821,7 +2837,8 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
 bail:
 	brelse(group_bh);
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -2887,6 +2904,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
 	iput(inode_alloc_inode);
 	brelse(alloc_bh);
 bail:
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 21c4bfdfd739..b21b702c5603 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -476,7 +476,8 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
 	}
 
 bail:
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -504,7 +505,8 @@ static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb)
 	}
 
 bail:
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -534,7 +536,7 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
 	}
 
 	if (!osb->local_system_inodes)
-		goto out;
+		return;
 
 	for (i = 0; i < NUM_LOCAL_SYSTEM_INODES * osb->max_slots; i++) {
 		if (osb->local_system_inodes[i]) {
@@ -545,9 +547,6 @@ static void ocfs2_release_system_inodes(struct ocfs2_super *osb)
 
 	kfree(osb->local_system_inodes);
 	osb->local_system_inodes = NULL;
-
-out:
-	mlog_exit(0);
 }
 
 /* We're allocating fs objects, use GFP_NOFS */
@@ -1202,7 +1201,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 			mlog_errno(status);
 			atomic_set(&osb->vol_state, VOLUME_DISABLED);
 			wake_up(&osb->osb_mount_event);
-			mlog_exit(status);
 			return status;
 		}
 	}
@@ -1216,7 +1214,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 	/* Start this when the mount is almost sure of being successful */
 	ocfs2_orphan_scan_start(osb);
 
-	mlog_exit(status);
 	return status;
 
 read_super_error:
@@ -1231,7 +1228,8 @@ read_super_error:
 		ocfs2_dismount_volume(sb, 1);
 	}
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -1532,7 +1530,6 @@ static int ocfs2_parse_options(struct super_block *sb,
 	status = 1;
 
 bail:
-	mlog_exit(status);
 	return status;
 }
 
@@ -1661,10 +1658,9 @@ leave:
 		ocfs2_quota_shutdown();
 		ocfs2_free_mem_caches();
 		exit_ocfs2_uptodate_cache();
+		mlog_errno(status);
 	}
 
-	mlog_exit(status);
-
 	if (status >= 0) {
 		return register_filesystem(&ocfs2_fs_type);
 	} else
@@ -1689,8 +1685,6 @@ static void __exit ocfs2_exit(void)
 	unregister_filesystem(&ocfs2_fs_type);
 
 	exit_ocfs2_uptodate_cache();
-
-	mlog_exit_void();
 }
 
 static void ocfs2_put_super(struct super_block *sb)
@@ -1699,8 +1693,6 @@ static void ocfs2_put_super(struct super_block *sb)
 
 	ocfs2_sync_blockdev(sb);
 	ocfs2_dismount_volume(sb, 0);
-
-	mlog_exit_void();
 }
 
 static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -1759,7 +1751,8 @@ bail:
 	if (inode)
 		iput(inode);
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 
 	return status;
 }
@@ -1923,7 +1916,6 @@ leave:
 	if (unlock_super)
 		ocfs2_super_unlock(osb, 1);
 
-	mlog_exit(status);
 	return status;
 }
 
@@ -2373,7 +2365,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	}
 
 bail:
-	mlog_exit(status);
 	return status;
 }
 
@@ -2443,7 +2434,8 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
 	}
 
 out:
-	mlog_exit(status);
+	if (status && status != -EAGAIN)
+		mlog_errno(status);
 	return status;
 }
 
@@ -2538,7 +2530,8 @@ finally:
 	if (local_alloc)
 		kfree(local_alloc);
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return status;
 }
 
@@ -2567,8 +2560,6 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
 	kfree(osb->uuid_str);
 	ocfs2_put_dlm_debug(osb->osb_dlm_debug);
 	memset(osb, 0, sizeof(struct ocfs2_super));
-
-	mlog_exit_void();
 }
 
 /* Put OCFS2 into a readonly state, or (if the user specifies it),
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index 8a81688c57df..5485b19ec8f2 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -72,7 +72,6 @@ static char *ocfs2_fast_symlink_getlink(struct inode *inode,
 	fe = (struct ocfs2_dinode *) (*bh)->b_data;
 	link = (char *) fe->id2.i_symlink;
 bail:
-	mlog_exit(status);
 
 	return link;
 }
@@ -100,7 +99,8 @@ static int ocfs2_readlink(struct dentry *dentry,
 
 	brelse(bh);
 out:
-	mlog_exit(ret);
+	if (ret < 0)
+		mlog_errno(ret);
 	return ret;
 }
 
@@ -136,7 +136,8 @@ bail:
 	nd_set_link(nd, status ? ERR_PTR(status) : link);
 	brelse(bh);
 
-	mlog_exit(status);
+	if (status)
+		mlog_errno(status);
 	return NULL;
 }
 
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 81f98b7f7502..d57c7942e4de 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -5570,7 +5570,6 @@ static int ocfs2_xattr_set_entry_bucket(struct inode *inode,
 
 
 out:
-	mlog_exit(ret);
 	return ret;
 }
 
@@ -5637,7 +5636,6 @@ static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
 		mlog_errno(ret);
 
 out:
-	mlog_exit(ret);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 80a9a84da381087ed89f5fdfc40a513cf9768ac4 Mon Sep 17 00:00:00 2001
From: Wengang Wang <wen.gang.wang@oracle.com>
Date: Mon, 21 Feb 2011 11:13:14 +0800
Subject: ocfs2: Add ocfs2_trace.h.

About one year ago, Wengang Wang tried some first steps
to add tracepoints to ocfs2. Hiss original patch is here:
http://oss.oracle.com/pipermail/ocfs2-devel/2009-November/005512.html

But as Steven Rostedt indicated in his article
http://lwn.net/Articles/383362/, we'd better have our trace
files resides in fs/ocfs2, so I rewrited the patch using the
method Steven mentioned in that article.

Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com>
Signed-off-by: Tao Ma <boyu.mt@taobao.com>
---
 fs/ocfs2/ocfs2_trace.h | 16 ++++++++++++++++
 fs/ocfs2/super.c       |  3 +++
 2 files changed, 19 insertions(+)
 create mode 100644 fs/ocfs2/ocfs2_trace.h

(limited to 'fs')

diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
new file mode 100644
index 000000000000..8d4e49aef491
--- /dev/null
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -0,0 +1,16 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM ocfs2
+
+#if !defined(_TRACE_OCFS2_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_OCFS2_H
+
+#include <linux/tracepoint.h>
+
+
+#endif /* _TRACE_OCFS2_H */
+
+/* This part must be outside protection */
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE ocfs2_trace
+#include <trace/define_trace.h>
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index b21b702c5603..f7e73a029844 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -42,6 +42,9 @@
 #include <linux/seq_file.h>
 #include <linux/quotaops.h>
 
+#define CREATE_TRACE_POINTS
+#include "ocfs2_trace.h"
+
 #define MLOG_MASK_PREFIX ML_SUPER
 #include <cluster/masklog.h>
 
-- 
cgit v1.2.3


From a09d09b8f8d7c8acd46d96e3e9899bd1461fc036 Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Tue, 22 Feb 2011 08:21:10 +0800
Subject: ocfs2: Remove mlog(0) from fs/ocfs2/alloc.c

This is the first try of replacing debug mlog(0,...) to
trace events. Wengang has did some work in his original
patch
http://oss.oracle.com/pipermail/ocfs2-devel/2009-November/005513.html
But he didn't finished it.

So this patch removes all mlog(0,...) from alloc.c and adds
the corresponding trace events. Different mlogs have different
solutions.
1. Some are replaced with trace event directly.
2. Some are replaced and some new parameters are added since
   I think we need to know the btree owner in that case.
3. Some are combined into one trace events.
4. Some redundant mlogs are removed.
What's more, it defines some event classes so that we can use
them later.

Cc: Wengang Wang <wen.gang.wang@oracle.com>
Signed-off-by: Tao Ma <boyu.mt@taobao.com>
---
 fs/ocfs2/alloc.c       | 165 +++++++++++-----------
 fs/ocfs2/ocfs2_trace.h | 364 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 452 insertions(+), 77 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 80ffbe3dd32f..f0e26c4adc94 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -50,6 +50,7 @@
 #include "uptodate.h"
 #include "xattr.h"
 #include "refcounttree.h"
+#include "ocfs2_trace.h"
 
 #include "buffer_head_io.h"
 
@@ -886,8 +887,7 @@ static int ocfs2_validate_extent_block(struct super_block *sb,
 	struct ocfs2_extent_block *eb =
 		(struct ocfs2_extent_block *)bh->b_data;
 
-	mlog(0, "Validating extent block %llu\n",
-	     (unsigned long long)bh->b_blocknr);
+	trace_ocfs2_validate_extent_block((unsigned long long)bh->b_blocknr);
 
 	BUG_ON(!buffer_uptodate(bh));
 
@@ -985,7 +985,7 @@ int ocfs2_num_free_extents(struct ocfs2_super *osb,
 bail:
 	brelse(eb_bh);
 
-	mlog(0, "retval = %d\n", retval);
+	trace_ocfs2_num_free_extents(retval);
 	return retval;
 }
 
@@ -1194,8 +1194,11 @@ static int ocfs2_add_branch(handle_t *handle,
 	 * from new_cpos).
 	 */
 	if (root_end > new_cpos) {
-		mlog(0, "adjust the cluster end from %u to %u\n",
-		     root_end, new_cpos);
+		trace_ocfs2_adjust_rightmost_branch(
+			(unsigned long long)
+			ocfs2_metadata_cache_owner(et->et_ci),
+			root_end, new_cpos);
+
 		status = ocfs2_adjust_rightmost_branch(handle, et);
 		if (status) {
 			mlog_errno(status);
@@ -1527,7 +1530,10 @@ static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et,
 	 * another tree level */
 	if (shift) {
 		BUG_ON(bh);
-		mlog(0, "need to shift tree depth (current = %d)\n", depth);
+		trace_ocfs2_grow_tree(
+			(unsigned long long)
+			ocfs2_metadata_cache_owner(et->et_ci),
+			depth);
 
 		/* ocfs2_shift_tree_depth will return us a buffer with
 		 * the new extent block (so we can pass that to
@@ -1557,7 +1563,6 @@ static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et,
 
 	/* call ocfs2_add_branch to add the final part of the tree with
 	 * the new data. */
-	mlog(0, "add branch. bh = %p\n", bh);
 	ret = ocfs2_add_branch(handle, et, bh, last_eb_bh,
 			       meta_ac);
 	if (ret < 0) {
@@ -1632,8 +1637,9 @@ static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
 	}
 	insert_index = i;
 
-	mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n",
-	     insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count));
+	trace_ocfs2_rotate_leaf(insert_cpos, insert_index,
+				has_empty, next_free,
+				le16_to_cpu(el->l_count));
 
 	BUG_ON(insert_index < 0);
 	BUG_ON(insert_index >= le16_to_cpu(el->l_count));
@@ -2046,7 +2052,7 @@ static void ocfs2_complete_edge_insert(handle_t *handle,
 	left_el = path_leaf_el(left_path);
 	right_el = path_leaf_el(right_path);
 	for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
-		mlog(0, "Adjust records at index %u\n", i);
+		trace_ocfs2_complete_edge_insert(i);
 
 		/*
 		 * One nice property of knowing that all of these
@@ -2376,7 +2382,9 @@ static int ocfs2_rotate_tree_right(handle_t *handle,
 		goto out;
 	}
 
-	mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos);
+	trace_ocfs2_rotate_tree_right(
+		(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+		insert_cpos, cpos);
 
 	/*
 	 * What we want to do here is:
@@ -2405,8 +2413,10 @@ static int ocfs2_rotate_tree_right(handle_t *handle,
 	 * rotating subtrees.
 	 */
 	while (cpos && insert_cpos <= cpos) {
-		mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n",
-		     insert_cpos, cpos);
+		trace_ocfs2_rotate_tree_right(
+			(unsigned long long)
+			ocfs2_metadata_cache_owner(et->et_ci),
+			insert_cpos, cpos);
 
 		ret = ocfs2_find_path(et->et_ci, left_path, cpos);
 		if (ret) {
@@ -2448,10 +2458,10 @@ static int ocfs2_rotate_tree_right(handle_t *handle,
 
 		start = ocfs2_find_subtree_root(et, left_path, right_path);
 
-		mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
-		     start,
-		     (unsigned long long) right_path->p_node[start].bh->b_blocknr,
-		     right_path->p_tree_depth);
+		trace_ocfs2_rotate_subtree(start,
+			(unsigned long long)
+			right_path->p_node[start].bh->b_blocknr,
+			right_path->p_tree_depth);
 
 		ret = ocfs2_extend_rotate_transaction(handle, start,
 						      orig_credits, right_path);
@@ -2951,8 +2961,7 @@ static int __ocfs2_rotate_tree_left(handle_t *handle,
 		subtree_root = ocfs2_find_subtree_root(et, left_path,
 						       right_path);
 
-		mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
-		     subtree_root,
+		trace_ocfs2_rotate_subtree(subtree_root,
 		     (unsigned long long)
 		     right_path->p_node[subtree_root].bh->b_blocknr,
 		     right_path->p_tree_depth);
@@ -3976,9 +3985,11 @@ static int ocfs2_append_rec_to_path(handle_t *handle,
 			goto out;
 		}
 
-		mlog(0, "Append may need a left path update. cpos: %u, "
-		     "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos),
-		     left_cpos);
+		trace_ocfs2_append_rec_to_path(
+			(unsigned long long)
+			ocfs2_metadata_cache_owner(et->et_ci),
+			le32_to_cpu(insert_rec->e_cpos),
+			left_cpos);
 
 		/*
 		 * No need to worry if the append is already in the
@@ -4665,9 +4676,9 @@ int ocfs2_insert_extent(handle_t *handle,
 	struct ocfs2_insert_type insert = {0, };
 	struct ocfs2_extent_rec rec;
 
-	mlog(0, "add %u clusters at position %u to owner %llu\n",
-	     new_clusters, cpos,
-	     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
+	trace_ocfs2_insert_extent_start(
+		(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+		cpos, new_clusters);
 
 	memset(&rec, 0, sizeof(rec));
 	rec.e_cpos = cpu_to_le32(cpos);
@@ -4687,11 +4698,9 @@ int ocfs2_insert_extent(handle_t *handle,
 		goto bail;
 	}
 
-	mlog(0, "Insert.appending: %u, Insert.Contig: %u, "
-	     "Insert.contig_index: %d, Insert.free_records: %d, "
-	     "Insert.tree_depth: %d\n",
-	     insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
-	     free_records, insert.ins_tree_depth);
+	trace_ocfs2_insert_extent(insert.ins_appending, insert.ins_contig,
+				  insert.ins_contig_index, free_records,
+				  insert.ins_tree_depth);
 
 	if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
 		status = ocfs2_grow_tree(handle, et,
@@ -4732,7 +4741,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
 				struct ocfs2_alloc_context *meta_ac,
 				enum ocfs2_alloc_restarted *reason_ret)
 {
-	int status = 0;
+	int status = 0, err = 0;
 	int free_extents;
 	enum ocfs2_alloc_restarted reason = RESTART_NONE;
 	u32 bit_off, num_bits;
@@ -4759,14 +4768,14 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
 	 * 2) we are so fragmented, we've needed to add metadata too
 	 *    many times. */
 	if (!free_extents && !meta_ac) {
-		mlog(0, "we haven't reserved any metadata!\n");
+		err = -1;
 		status = -EAGAIN;
 		reason = RESTART_META;
 		goto leave;
 	} else if ((!free_extents)
 		   && (ocfs2_alloc_context_bits_left(meta_ac)
 		       < ocfs2_extend_meta_needed(et->et_root_el))) {
-		mlog(0, "filesystem is really fragmented...\n");
+		err = -2;
 		status = -EAGAIN;
 		reason = RESTART_META;
 		goto leave;
@@ -4791,9 +4800,9 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
 	}
 
 	block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
-	mlog(0, "Allocating %u clusters at block %u for owner %llu\n",
-	     num_bits, bit_off,
-	     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
+	trace_ocfs2_add_clusters_in_btree(
+	     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+	     bit_off, num_bits);
 	status = ocfs2_insert_extent(handle, et, *logical_offset, block,
 				     num_bits, flags, meta_ac);
 	if (status < 0) {
@@ -4807,8 +4816,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
 	*logical_offset += num_bits;
 
 	if (clusters_to_add) {
-		mlog(0, "need to alloc once more, wanted = %u\n",
-		     clusters_to_add);
+		err = clusters_to_add;
 		status = -EAGAIN;
 		reason = RESTART_TRANS;
 	}
@@ -4816,6 +4824,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
 leave:
 	if (reason_ret)
 		*reason_ret = reason;
+	trace_ocfs2_add_clusters_in_btree_ret(status, reason, err);
 	return status;
 }
 
@@ -5041,9 +5050,9 @@ int ocfs2_split_extent(handle_t *handle,
 
 	ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
 
-	mlog(0, "index: %d, contig: %u, has_empty: %u, split_covers: %u\n",
-	     split_index, ctxt.c_contig_type, ctxt.c_has_empty_extent,
-	     ctxt.c_split_covers_rec);
+	trace_ocfs2_split_extent(split_index, ctxt.c_contig_type,
+				 ctxt.c_has_empty_extent,
+				 ctxt.c_split_covers_rec);
 
 	if (ctxt.c_contig_type == CONTIG_NONE) {
 		if (ctxt.c_split_covers_rec)
@@ -5177,8 +5186,9 @@ int ocfs2_mark_extent_written(struct inode *inode,
 {
 	int ret;
 
-	mlog(0, "Inode %lu cpos %u, len %u, phys clusters %u\n",
-	     inode->i_ino, cpos, len, phys);
+	trace_ocfs2_mark_extent_written(
+		(unsigned long long)OCFS2_I(inode)->ip_blkno,
+		cpos, len, phys);
 
 	if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
 		ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
@@ -5497,11 +5507,10 @@ int ocfs2_remove_extent(handle_t *handle,
 
 	BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
 
-	mlog(0, "Owner %llu, remove (cpos %u, len %u). Existing index %d "
-	     "(cpos %u, len %u)\n",
-	     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
-	     cpos, len, index,
-	     le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec));
+	trace_ocfs2_remove_extent(
+		(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
+		cpos, len, index, le32_to_cpu(rec->e_cpos),
+		ocfs2_rec_clusters(el, rec));
 
 	if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
 		ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
@@ -5780,9 +5789,6 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
 	struct ocfs2_dinode *di;
 	struct ocfs2_truncate_log *tl;
 
-	mlog(0, "start_blk = %llu, num_clusters = %u\n",
-	     (unsigned long long)start_blk, num_clusters);
-
 	BUG_ON(mutex_trylock(&tl_inode->i_mutex));
 
 	start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
@@ -5819,10 +5825,9 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
 		goto bail;
 	}
 
-	mlog(0, "Log truncate of %u clusters starting at cluster %u to "
-	     "%llu (index = %d)\n", num_clusters, start_cluster,
-	     (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index);
-
+	trace_ocfs2_truncate_log_append(
+		(unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index,
+		start_cluster, num_clusters);
 	if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
 		/*
 		 * Move index back to the record we are coalescing with.
@@ -5831,9 +5836,10 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
 		index--;
 
 		num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
-		mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
-		     index, le32_to_cpu(tl->tl_recs[index].t_start),
-		     num_clusters);
+		trace_ocfs2_truncate_log_append(
+			(unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
+			index, le32_to_cpu(tl->tl_recs[index].t_start),
+			num_clusters);
 	} else {
 		tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
 		tl->tl_used = cpu_to_le16(index + 1);
@@ -5897,8 +5903,9 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
 		/* if start_blk is not set, we ignore the record as
 		 * invalid. */
 		if (start_blk) {
-			mlog(0, "free record %d, start = %u, clusters = %u\n",
-			     i, le32_to_cpu(rec.t_start), num_clusters);
+			trace_ocfs2_replay_truncate_records(
+				(unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
+				i, le32_to_cpu(rec.t_start), num_clusters);
 
 			status = ocfs2_free_clusters(handle, data_alloc_inode,
 						     data_alloc_bh, start_blk,
@@ -5941,8 +5948,9 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
 
 	tl = &di->id2.i_dealloc;
 	num_to_flush = le16_to_cpu(tl->tl_used);
-	mlog(0, "Flush %u records from truncate log #%llu\n",
-	     num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
+	trace_ocfs2_flush_truncate_log(
+		(unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
+		num_to_flush);
 	if (!num_to_flush) {
 		status = 0;
 		goto out;
@@ -6079,7 +6087,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
 
 	*tl_copy = NULL;
 
-	mlog(0, "recover truncate log from slot %d\n", slot_num);
+	trace_ocfs2_begin_truncate_log_recovery(slot_num);
 
 	status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
 	if (status < 0) {
@@ -6096,8 +6104,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
 
 	tl = &di->id2.i_dealloc;
 	if (le16_to_cpu(tl->tl_used)) {
-		mlog(0, "We'll have %u logs to recover\n",
-		     le16_to_cpu(tl->tl_used));
+		trace_ocfs2_truncate_log_recovery_num(le16_to_cpu(tl->tl_used));
 
 		*tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
 		if (!(*tl_copy)) {
@@ -6154,8 +6161,9 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
 
 	tl = &tl_copy->id2.i_dealloc;
 	num_recs = le16_to_cpu(tl->tl_used);
-	mlog(0, "cleanup %u records from %llu\n", num_recs,
-	     (unsigned long long)le64_to_cpu(tl_copy->i_blkno));
+	trace_ocfs2_complete_truncate_log_recovery(
+		(unsigned long long)le64_to_cpu(tl_copy->i_blkno),
+		num_recs);
 
 	mutex_lock(&tl_inode->i_mutex);
 	for(i = 0; i < num_recs; i++) {
@@ -6313,8 +6321,8 @@ static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
 		else
 			bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
 							      head->free_bit);
-		mlog(0, "Free bit: (bit %u, blkno %llu)\n",
-		     head->free_bit, (unsigned long long)head->free_blk);
+		trace_ocfs2_free_cached_blocks(
+		     (unsigned long long)head->free_blk, head->free_bit);
 
 		ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
 					       head->free_bit, bg_blkno, 1);
@@ -6367,8 +6375,7 @@ int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
 		return ret;
 	}
 
-	mlog(0, "Insert clusters: (bit %u, blk %llu)\n",
-	     bit, (unsigned long long)blkno);
+	trace_ocfs2_cache_cluster_dealloc((unsigned long long)blkno, bit);
 
 	item->free_blk = blkno;
 	item->free_bit = bit;
@@ -6443,8 +6450,8 @@ int ocfs2_run_deallocs(struct ocfs2_super *osb,
 		fl = ctxt->c_first_suballocator;
 
 		if (fl->f_first) {
-			mlog(0, "Free items: (type %u, slot %d)\n",
-			     fl->f_inode_type, fl->f_slot);
+			trace_ocfs2_run_deallocs(fl->f_inode_type,
+						 fl->f_slot);
 			ret2 = ocfs2_free_cached_blocks(osb,
 							fl->f_inode_type,
 							fl->f_slot,
@@ -6521,8 +6528,9 @@ int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
 		goto out;
 	}
 
-	mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n",
-	     type, slot, bit, (unsigned long long)blkno);
+	trace_ocfs2_cache_block_dealloc(type, slot,
+					(unsigned long long)suballoc,
+					(unsigned long long)blkno, bit);
 
 	item->free_bg = suballoc;
 	item->free_blk = blkno;
@@ -7002,8 +7010,11 @@ start:
 		goto bail;
 	}
 
-	mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n",
-	     OCFS2_I(inode)->ip_clusters, path->p_tree_depth);
+	trace_ocfs2_commit_truncate(
+		(unsigned long long)OCFS2_I(inode)->ip_blkno,
+		new_highest_cpos,
+		OCFS2_I(inode)->ip_clusters,
+		path->p_tree_depth);
 
 	/*
 	 * By now, el will point to the extent list on the bottom most
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 8d4e49aef491..1793b1f71805 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -6,6 +6,370 @@
 
 #include <linux/tracepoint.h>
 
+DECLARE_EVENT_CLASS(ocfs2__int,
+	TP_PROTO(int num),
+	TP_ARGS(num),
+	TP_STRUCT__entry(
+		__field(int, num)
+	),
+	TP_fast_assign(
+		__entry->num = num;
+	),
+	TP_printk("%d", __entry->num)
+);
+
+#define DEFINE_OCFS2_INT_EVENT(name)	\
+DEFINE_EVENT(ocfs2__int, name,	\
+	TP_PROTO(int num),	\
+	TP_ARGS(num))
+
+DECLARE_EVENT_CLASS(ocfs2__int_int,
+	TP_PROTO(int value1, int value2),
+	TP_ARGS(value1, value2),
+	TP_STRUCT__entry(
+		__field(int, value1)
+		__field(int, value2)
+	),
+	TP_fast_assign(
+		__entry->value1	= value1;
+		__entry->value2	= value2;
+	),
+	TP_printk("%d %d", __entry->value1, __entry->value2)
+);
+
+#define DEFINE_OCFS2_INT_INT_EVENT(name)	\
+DEFINE_EVENT(ocfs2__int_int, name,	\
+	TP_PROTO(int val1, int val2),	\
+	TP_ARGS(val1, val2))
+
+DECLARE_EVENT_CLASS(ocfs2__ull_uint,
+	TP_PROTO(unsigned long long value1, unsigned int value2),
+	TP_ARGS(value1, value2),
+	TP_STRUCT__entry(
+		__field(unsigned long long, value1)
+		__field(unsigned int, value2)
+	),
+	TP_fast_assign(
+		__entry->value1 = value1;
+		__entry->value2 = value2;
+	),
+	TP_printk("%llu %u", __entry->value1, __entry->value2)
+);
+
+#define DEFINE_OCFS2_ULL_UINT_EVENT(name)	\
+DEFINE_EVENT(ocfs2__ull_uint, name,	\
+	TP_PROTO(unsigned long long val1, unsigned int val2),	\
+	TP_ARGS(val1, val2))
+
+/* Trace events for fs/ocfs2/alloc.c. */
+DECLARE_EVENT_CLASS(ocfs2__btree_ops,
+	TP_PROTO(unsigned long long owner,\
+		 unsigned int value1, unsigned int value2),
+	TP_ARGS(owner, value1, value2),
+	TP_STRUCT__entry(
+		__field(unsigned long long, owner)
+		__field(unsigned int, value1)
+		__field(unsigned int, value2)
+	),
+	TP_fast_assign(
+		__entry->owner = owner;
+		__entry->value1 = value1;
+		__entry->value2	= value2;
+	),
+	TP_printk("%llu %u %u",
+		  __entry->owner, __entry->value1, __entry->value2)
+);
+
+#define DEFINE_OCFS2_BTREE_EVENT(name)	\
+DEFINE_EVENT(ocfs2__btree_ops, name,	\
+	TP_PROTO(unsigned long long owner,	\
+		 unsigned int value1, unsigned int value2),	\
+	TP_ARGS(owner, value1, value2))
+
+DEFINE_OCFS2_BTREE_EVENT(ocfs2_adjust_rightmost_branch);
+
+DEFINE_OCFS2_BTREE_EVENT(ocfs2_rotate_tree_right);
+
+DEFINE_OCFS2_BTREE_EVENT(ocfs2_append_rec_to_path);
+
+DEFINE_OCFS2_BTREE_EVENT(ocfs2_insert_extent_start);
+
+DEFINE_OCFS2_BTREE_EVENT(ocfs2_add_clusters_in_btree);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_num_free_extents);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_complete_edge_insert);
+
+TRACE_EVENT(ocfs2_grow_tree,
+	TP_PROTO(unsigned long long owner, int depth),
+	TP_ARGS(owner, depth),
+	TP_STRUCT__entry(
+		__field(unsigned long long, owner)
+		__field(int, depth)
+	),
+	TP_fast_assign(
+		__entry->owner = owner;
+		__entry->depth = depth;
+	),
+	TP_printk("%llu %d", __entry->owner, __entry->depth)
+);
+
+TRACE_EVENT(ocfs2_rotate_subtree,
+	TP_PROTO(int subtree_root, unsigned long long blkno,
+		 int depth),
+	TP_ARGS(subtree_root, blkno, depth),
+	TP_STRUCT__entry(
+		__field(int, subtree_root)
+		__field(unsigned long long, blkno)
+		__field(int, depth)
+	),
+	TP_fast_assign(
+		__entry->subtree_root = subtree_root;
+		__entry->blkno = blkno;
+		__entry->depth = depth;
+	),
+	TP_printk("%d %llu %d", __entry->subtree_root,
+		  __entry->blkno, __entry->depth)
+);
+
+TRACE_EVENT(ocfs2_insert_extent,
+	TP_PROTO(unsigned int ins_appending, unsigned int ins_contig,
+		 int ins_contig_index, int free_records, int ins_tree_depth),
+	TP_ARGS(ins_appending, ins_contig, ins_contig_index, free_records,
+		ins_tree_depth),
+	TP_STRUCT__entry(
+		__field(unsigned int, ins_appending)
+		__field(unsigned int, ins_contig)
+		__field(int, ins_contig_index)
+		__field(int, free_records)
+		__field(int, ins_tree_depth)
+	),
+	TP_fast_assign(
+		__entry->ins_appending = ins_appending;
+		__entry->ins_contig = ins_contig;
+		__entry->ins_contig_index = ins_contig_index;
+		__entry->free_records = free_records;
+		__entry->ins_tree_depth = ins_tree_depth;
+	),
+	TP_printk("%u %u %d %d %d",
+		  __entry->ins_appending, __entry->ins_contig,
+		  __entry->ins_contig_index, __entry->free_records,
+		  __entry->ins_tree_depth)
+);
+
+TRACE_EVENT(ocfs2_split_extent,
+	TP_PROTO(int split_index, unsigned int c_contig_type,
+		 unsigned int c_has_empty_extent,
+		 unsigned int c_split_covers_rec),
+	TP_ARGS(split_index, c_contig_type,
+		c_has_empty_extent, c_split_covers_rec),
+	TP_STRUCT__entry(
+		__field(int, split_index)
+		__field(unsigned int, c_contig_type)
+		__field(unsigned int, c_has_empty_extent)
+		__field(unsigned int, c_split_covers_rec)
+	),
+	TP_fast_assign(
+		__entry->split_index = split_index;
+		__entry->c_contig_type = c_contig_type;
+		__entry->c_has_empty_extent = c_has_empty_extent;
+		__entry->c_split_covers_rec = c_split_covers_rec;
+	),
+	TP_printk("%d %u %u %u", __entry->split_index, __entry->c_contig_type,
+		  __entry->c_has_empty_extent, __entry->c_split_covers_rec)
+);
+
+TRACE_EVENT(ocfs2_remove_extent,
+	TP_PROTO(unsigned long long owner, unsigned int cpos,
+		 unsigned int len, int index,
+		 unsigned int e_cpos, unsigned int clusters),
+	TP_ARGS(owner, cpos, len, index, e_cpos, clusters),
+	TP_STRUCT__entry(
+		__field(unsigned long long, owner)
+		__field(unsigned int, cpos)
+		__field(unsigned int, len)
+		__field(int, index)
+		__field(unsigned int, e_cpos)
+		__field(unsigned int, clusters)
+	),
+	TP_fast_assign(
+		__entry->owner = owner;
+		__entry->cpos = cpos;
+		__entry->len = len;
+		__entry->index = index;
+		__entry->e_cpos = e_cpos;
+		__entry->clusters = clusters;
+	),
+	TP_printk("%llu %u %u %d %u %u",
+		  __entry->owner, __entry->cpos, __entry->len, __entry->index,
+		  __entry->e_cpos, __entry->clusters)
+);
+
+TRACE_EVENT(ocfs2_commit_truncate,
+	TP_PROTO(unsigned long long ino, unsigned int new_cpos,
+		 unsigned int clusters, unsigned int depth),
+	TP_ARGS(ino, new_cpos, clusters, depth),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ino)
+		__field(unsigned int, new_cpos)
+		__field(unsigned int, clusters)
+		__field(unsigned int, depth)
+	),
+	TP_fast_assign(
+		__entry->ino = ino;
+		__entry->new_cpos = new_cpos;
+		__entry->clusters = clusters;
+		__entry->depth = depth;
+	),
+	TP_printk("%llu %u %u %u",
+		  __entry->ino, __entry->new_cpos,
+		  __entry->clusters, __entry->depth)
+);
+
+TRACE_EVENT(ocfs2_validate_extent_block,
+	TP_PROTO(unsigned long long blkno),
+	TP_ARGS(blkno),
+	TP_STRUCT__entry(
+		__field(unsigned long long, blkno)
+	),
+	TP_fast_assign(
+		__entry->blkno = blkno;
+	),
+	TP_printk("%llu ", __entry->blkno)
+);
+
+TRACE_EVENT(ocfs2_rotate_leaf,
+	TP_PROTO(unsigned int insert_cpos, int insert_index,
+		 int has_empty, int next_free,
+		 unsigned int l_count),
+	TP_ARGS(insert_cpos, insert_index, has_empty,
+		next_free, l_count),
+	TP_STRUCT__entry(
+		__field(unsigned int, insert_cpos)
+		__field(int, insert_index)
+		__field(int, has_empty)
+		__field(int, next_free)
+		__field(unsigned int, l_count)
+	),
+	TP_fast_assign(
+		__entry->insert_cpos = insert_cpos;
+		__entry->insert_index = insert_index;
+		__entry->has_empty = has_empty;
+		__entry->next_free = next_free;
+		__entry->l_count = l_count;
+	),
+	TP_printk("%u %d %d %d %u", __entry->insert_cpos,
+		  __entry->insert_index, __entry->has_empty,
+		  __entry->next_free, __entry->l_count)
+);
+
+TRACE_EVENT(ocfs2_add_clusters_in_btree_ret,
+	TP_PROTO(int status, int reason, int err),
+	TP_ARGS(status, reason, err),
+	TP_STRUCT__entry(
+		__field(int, status)
+		__field(int, reason)
+		__field(int, err)
+	),
+	TP_fast_assign(
+		__entry->status = status;
+		__entry->reason = reason;
+		__entry->err = err;
+	),
+	TP_printk("%d %d %d", __entry->status,
+		  __entry->reason, __entry->err)
+);
+
+TRACE_EVENT(ocfs2_mark_extent_written,
+	TP_PROTO(unsigned long long owner, unsigned int cpos,
+		 unsigned int len, unsigned int phys),
+	TP_ARGS(owner, cpos, len, phys),
+	TP_STRUCT__entry(
+		__field(unsigned long long, owner)
+		__field(unsigned int, cpos)
+		__field(unsigned int, len)
+		__field(unsigned int, phys)
+	),
+	TP_fast_assign(
+		__entry->owner = owner;
+		__entry->cpos = cpos;
+		__entry->len = len;
+		__entry->phys = phys;
+	),
+	TP_printk("%llu %u %u %u",
+		  __entry->owner, __entry->cpos,
+		  __entry->len, __entry->phys)
+);
+
+DECLARE_EVENT_CLASS(ocfs2__truncate_log_ops,
+	TP_PROTO(unsigned long long blkno, int index,
+		 unsigned int start, unsigned int num),
+	TP_ARGS(blkno, index, start, num),
+	TP_STRUCT__entry(
+		__field(unsigned long long, blkno)
+		__field(int, index)
+		__field(unsigned int, start)
+		__field(unsigned int, num)
+	),
+	TP_fast_assign(
+		__entry->blkno = blkno;
+		__entry->index = index;
+		__entry->start = start;
+		__entry->num = num;
+	),
+	TP_printk("%llu %d %u %u",
+		  __entry->blkno, __entry->index,
+		  __entry->start, __entry->num)
+);
+
+#define DEFINE_OCFS2_TRUNCATE_LOG_OPS_EVENT(name)	\
+DEFINE_EVENT(ocfs2__truncate_log_ops, name,	\
+	TP_PROTO(unsigned long long blkno, int index,	\
+		 unsigned int start, unsigned int num),	\
+	TP_ARGS(blkno, index, start, num))
+
+DEFINE_OCFS2_TRUNCATE_LOG_OPS_EVENT(ocfs2_truncate_log_append);
+
+DEFINE_OCFS2_TRUNCATE_LOG_OPS_EVENT(ocfs2_replay_truncate_records);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_flush_truncate_log);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_begin_truncate_log_recovery);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_truncate_log_recovery_num);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_complete_truncate_log_recovery);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_free_cached_blocks);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_cache_cluster_dealloc);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_run_deallocs);
+
+TRACE_EVENT(ocfs2_cache_block_dealloc,
+	TP_PROTO(int type, int slot, unsigned long long suballoc,
+		 unsigned long long blkno, unsigned int bit),
+	TP_ARGS(type, slot, suballoc, blkno, bit),
+	TP_STRUCT__entry(
+		__field(int, type)
+		__field(int, slot)
+		__field(unsigned long long, suballoc)
+		__field(unsigned long long, blkno)
+		__field(unsigned int, bit)
+	),
+	TP_fast_assign(
+		__entry->type = type;
+		__entry->slot = slot;
+		__entry->suballoc = suballoc;
+		__entry->blkno = blkno;
+		__entry->bit = bit;
+	),
+	TP_printk("%d %d %llu %llu %u",
+		  __entry->type, __entry->slot, __entry->suballoc,
+		  __entry->blkno, __entry->bit)
+);
+
+/* End of trace events for fs/ocfs2/alloc.c. */
 
 #endif /* _TRACE_OCFS2_H */
 
-- 
cgit v1.2.3


From a04733d8afad9cd0bfb8269173e30d28fb96088e Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Tue, 22 Feb 2011 07:56:45 +0800
Subject: ocfs2: Remove mlog(0) from fs/ocfs2/localalloc.c

This is the 2nd step to remove the debug info of DISK_ALLOC.

So this patch removes all mlog(0,...) from localalloc.c and adds
the corresponding tracepoints. Different mlogs have different
solutions.
1. Some are replaced with trace event directly.
2. Some are replaced while some new parameters are added.
3. Some are combined into one trace events.
4. Some redundant mlogs are removed.

Signed-off-by: Tao Ma <boyu.mt@taobao.com>
---
 fs/ocfs2/localalloc.c  |  57 ++++++++++---------------
 fs/ocfs2/ocfs2_trace.h | 112 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 135 insertions(+), 34 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 36636e1c84e7..a311b1f658e2 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -43,6 +43,7 @@
 #include "suballoc.h"
 #include "super.h"
 #include "sysfile.h"
+#include "ocfs2_trace.h"
 
 #include "buffer_head_io.h"
 
@@ -201,8 +202,7 @@ void ocfs2_la_set_sizes(struct ocfs2_super *osb, int requested_mb)
 	la_max_mb = ocfs2_clusters_to_megabytes(sb,
 						ocfs2_local_alloc_size(sb) * 8);
 
-	mlog(0, "requested: %dM, max: %uM, default: %uM\n",
-	     requested_mb, la_max_mb, la_default_mb);
+	trace_ocfs2_la_set_sizes(requested_mb, la_max_mb, la_default_mb);
 
 	if (requested_mb == -1) {
 		/* No user request - use defaults */
@@ -276,8 +276,8 @@ int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits)
 
 	ret = 1;
 bail:
-	mlog(0, "state=%d, bits=%llu, la_bits=%d, ret=%d\n",
-	     osb->local_alloc_state, (unsigned long long)bits, la_bits, ret);
+	trace_ocfs2_alloc_should_use_local(
+	     (unsigned long long)bits, osb->local_alloc_state, la_bits, ret);
 	spin_unlock(&osb->osb_lock);
 	return ret;
 }
@@ -362,7 +362,7 @@ bail:
 	if (inode)
 		iput(inode);
 
-	mlog(0, "Local alloc window bits = %d\n", osb->local_alloc_bits);
+	trace_ocfs2_load_local_alloc(osb->local_alloc_bits);
 
 	if (status)
 		mlog_errno(status);
@@ -497,7 +497,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
 	struct inode *inode = NULL;
 	struct ocfs2_dinode *alloc;
 
-	mlog(0, "(slot_num = %d)\n", slot_num);
+	trace_ocfs2_begin_local_alloc_recovery(slot_num);
 
 	*alloc_copy = NULL;
 
@@ -705,10 +705,6 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
 			goto bail;
 	}
 
-	if (ac->ac_max_block)
-		mlog(0, "Calling in_range for max block %llu\n",
-		     (unsigned long long)ac->ac_max_block);
-
 	ac->ac_inode = local_alloc_inode;
 	/* We should never use localalloc from another slot */
 	ac->ac_alloc_slot = osb->slot_num;
@@ -722,8 +718,9 @@ bail:
 		iput(local_alloc_inode);
 	}
 
-	mlog(0, "bits=%d, slot=%d, ret=%d\n", bits_wanted, osb->slot_num,
-	     status);
+	trace_ocfs2_reserve_local_alloc_bits(
+		(unsigned long long)ac->ac_max_block,
+		bits_wanted, osb->slot_num, status);
 
 	if (status)
 		mlog_errno(status);
@@ -797,7 +794,7 @@ static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc)
 	for (i = 0; i < le16_to_cpu(la->la_size); i++)
 		count += hweight8(buffer[i]);
 
-	mlog(0, "count %u\n", count);
+	trace_ocfs2_local_alloc_count_bits(count);
 	return count;
 }
 
@@ -812,10 +809,7 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
 	void *bitmap = NULL;
 	struct ocfs2_reservation_map *resmap = &osb->osb_la_resmap;
 
-	mlog(0, "(numbits wanted = %u)\n", *numbits);
-
 	if (!alloc->id1.bitmap1.i_total) {
-		mlog(0, "No bits in my window!\n");
 		bitoff = -1;
 		goto bail;
 	}
@@ -875,8 +869,7 @@ static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
 		}
 	}
 
-	mlog(0, "Exiting loop, bitoff = %d, numfound = %d\n", bitoff,
-	     numfound);
+	trace_ocfs2_local_alloc_find_clear_bits_search_bitmap(bitoff, numfound);
 
 	if (numfound == *numbits)
 		bitoff = startoff - numfound;
@@ -887,7 +880,10 @@ bail:
 	if (local_resv)
 		ocfs2_resv_discard(resmap, resv);
 
-	mlog(0, "bitoff %d\n", bitoff);
+	trace_ocfs2_local_alloc_find_clear_bits(*numbits,
+		le32_to_cpu(alloc->id1.bitmap1.i_total),
+		bitoff, numfound);
+
 	return bitoff;
 }
 
@@ -941,18 +937,16 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
 	void *bitmap;
 	struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
 
-	mlog(0, "total = %u, used = %u\n",
+	trace_ocfs2_sync_local_to_main(
 	     le32_to_cpu(alloc->id1.bitmap1.i_total),
 	     le32_to_cpu(alloc->id1.bitmap1.i_used));
 
 	if (!alloc->id1.bitmap1.i_total) {
-		mlog(0, "nothing to sync!\n");
 		goto bail;
 	}
 
 	if (le32_to_cpu(alloc->id1.bitmap1.i_used) ==
 	    le32_to_cpu(alloc->id1.bitmap1.i_total)) {
-		mlog(0, "all bits were taken!\n");
 		goto bail;
 	}
 
@@ -974,8 +968,7 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
 				ocfs2_clusters_to_blocks(osb->sb,
 							 start - count);
 
-			mlog(0, "freeing %u bits starting at local alloc bit "
-			     "%u (la_start_blk = %llu, blkno = %llu)\n",
+			trace_ocfs2_sync_local_to_main_free(
 			     count, start - count,
 			     (unsigned long long)la_start_blk,
 			     (unsigned long long)blkno);
@@ -1142,12 +1135,9 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
 	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
 	la = OCFS2_LOCAL_ALLOC(alloc);
 
-	if (alloc->id1.bitmap1.i_total)
-		mlog(0, "asking me to alloc a new window over a non-empty "
-		     "one\n");
-
-	mlog(0, "Allocating %u clusters for a new window.\n",
-	     osb->local_alloc_bits);
+	trace_ocfs2_local_alloc_new_window(
+		le32_to_cpu(alloc->id1.bitmap1.i_total),
+		osb->local_alloc_bits);
 
 	/* Instruct the allocation code to try the most recently used
 	 * cluster group. We'll re-record the group used this pass
@@ -1209,10 +1199,9 @@ retry_enospc:
 	ocfs2_resmap_restart(&osb->osb_la_resmap, cluster_count,
 			     OCFS2_LOCAL_ALLOC(alloc)->la_bitmap);
 
-	mlog(0, "New window allocated:\n");
-	mlog(0, "window la_bm_off = %u\n",
-	     OCFS2_LOCAL_ALLOC(alloc)->la_bm_off);
-	mlog(0, "window bits = %u\n", le32_to_cpu(alloc->id1.bitmap1.i_total));
+	trace_ocfs2_local_alloc_new_window_result(
+		OCFS2_LOCAL_ALLOC(alloc)->la_bm_off,
+		le32_to_cpu(alloc->id1.bitmap1.i_total));
 
 bail:
 	if (status)
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 1793b1f71805..5bba629766ca 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -23,6 +23,23 @@ DEFINE_EVENT(ocfs2__int, name,	\
 	TP_PROTO(int num),	\
 	TP_ARGS(num))
 
+DECLARE_EVENT_CLASS(ocfs2__uint,
+	TP_PROTO(unsigned int num),
+	TP_ARGS(num),
+	TP_STRUCT__entry(
+		__field(	unsigned int,	num		)
+	),
+	TP_fast_assign(
+		__entry->num	= 	num;
+	),
+	TP_printk("%u", __entry->num)
+);
+
+#define DEFINE_OCFS2_UINT_EVENT(name)	\
+DEFINE_EVENT(ocfs2__uint, name,	\
+	TP_PROTO(unsigned int num),	\
+	TP_ARGS(num))
+
 DECLARE_EVENT_CLASS(ocfs2__int_int,
 	TP_PROTO(int value1, int value2),
 	TP_ARGS(value1, value2),
@@ -61,6 +78,55 @@ DEFINE_EVENT(ocfs2__ull_uint, name,	\
 	TP_PROTO(unsigned long long val1, unsigned int val2),	\
 	TP_ARGS(val1, val2))
 
+DECLARE_EVENT_CLASS(ocfs2__uint_uint_uint,
+	TP_PROTO(unsigned int value1, unsigned int value2,
+		 unsigned int value3),
+	TP_ARGS(value1, value2, value3),
+	TP_STRUCT__entry(
+		__field(	unsigned int,	value1		)
+		__field(	unsigned int,	value2		)
+		__field(	unsigned int,	value3		)
+	),
+	TP_fast_assign(
+		__entry->value1	= 	value1;
+		__entry->value2	= 	value2;
+		__entry->value3	= 	value3;
+	),
+	TP_printk("%u %u %u", __entry->value1, __entry->value2, __entry->value3)
+);
+
+#define DEFINE_OCFS2_UINT_UINT_UINT_EVENT(name)	\
+DEFINE_EVENT(ocfs2__uint_uint_uint, name,	\
+	TP_PROTO(unsigned int value1, unsigned int value2,	\
+		 unsigned int value3),	\
+	TP_ARGS(value1, value2, value3))
+
+DECLARE_EVENT_CLASS(ocfs2__ull_int_int_int,
+	TP_PROTO(unsigned long long ull, int value1, int value2, int value3),
+	TP_ARGS(ull, value1, value2, value3),
+	TP_STRUCT__entry(
+		__field(	unsigned long long,	ull	)
+		__field(	int,	value1			)
+		__field(	int,	value2			)
+		__field(	int,	value3			)
+	),
+	TP_fast_assign(
+		__entry->ull		= ull;
+		__entry->value1		= value1;
+		__entry->value2		= value2;
+		__entry->value3		= value3;
+	),
+	TP_printk("%llu %d %d %d",
+		  __entry->ull, __entry->value1,
+		  __entry->value2, __entry->value3)
+);
+
+#define DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(name)	\
+DEFINE_EVENT(ocfs2__ull_int_int_int, name,	\
+	TP_PROTO(unsigned long long ull, int value1,	\
+		 int value2, int value3),	\
+	TP_ARGS(ull, value1, value2, value3))
+
 /* Trace events for fs/ocfs2/alloc.c. */
 DECLARE_EVENT_CLASS(ocfs2__btree_ops,
 	TP_PROTO(unsigned long long owner,\
@@ -371,6 +437,52 @@ TRACE_EVENT(ocfs2_cache_block_dealloc,
 
 /* End of trace events for fs/ocfs2/alloc.c. */
 
+/* Trace events for fs/ocfs2/localalloc.c. */
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_la_set_sizes);
+
+DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_alloc_should_use_local);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_load_local_alloc);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_begin_local_alloc_recovery);
+
+DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_reserve_local_alloc_bits);
+
+DEFINE_OCFS2_UINT_EVENT(ocfs2_local_alloc_count_bits);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_local_alloc_find_clear_bits_search_bitmap);
+
+DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_local_alloc_find_clear_bits);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_sync_local_to_main);
+
+TRACE_EVENT(ocfs2_sync_local_to_main_free,
+	TP_PROTO(int count, int bit, unsigned long long start_blk,
+		 unsigned long long blkno),
+	TP_ARGS(count, bit, start_blk, blkno),
+	TP_STRUCT__entry(
+		__field(int, count)
+		__field(int, bit)
+		__field(unsigned long long, start_blk)
+		__field(unsigned long long, blkno)
+	),
+	TP_fast_assign(
+		__entry->count = count;
+		__entry->bit = bit;
+		__entry->start_blk = start_blk;
+		__entry->blkno = blkno;
+	),
+	TP_printk("%d %d %llu %llu",
+		  __entry->count, __entry->bit, __entry->start_blk,
+		  __entry->blkno)
+);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_local_alloc_new_window);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_local_alloc_new_window_result);
+
+/* End of trace events for fs/ocfs2/localalloc.c. */
 #endif /* _TRACE_OCFS2_H */
 
 /* This part must be outside protection */
-- 
cgit v1.2.3


From 2f73e135b83c5001dad84cbd1d445d23fd045b83 Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Tue, 22 Feb 2011 08:22:33 +0800
Subject: ocfs2: Remove mlog(0) from fs/ocfs2/suballoc.c

This is the 3rd step to remove the debug info of DISK_ALLOC.

Signed-off-by: Tao Ma <boyu.mt@taobao.com>
---
 fs/ocfs2/ocfs2_trace.h | 212 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/suballoc.c    | 107 +++++++++++++------------
 2 files changed, 265 insertions(+), 54 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 5bba629766ca..11980ecba546 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -40,6 +40,23 @@ DEFINE_EVENT(ocfs2__uint, name,	\
 	TP_PROTO(unsigned int num),	\
 	TP_ARGS(num))
 
+DECLARE_EVENT_CLASS(ocfs2__ull,
+	TP_PROTO(unsigned long long blkno),
+	TP_ARGS(blkno),
+	TP_STRUCT__entry(
+		__field(unsigned long long, blkno)
+	),
+	TP_fast_assign(
+		__entry->blkno = blkno;
+	),
+	TP_printk("%llu", __entry->blkno)
+);
+
+#define DEFINE_OCFS2_ULL_EVENT(name)	\
+DEFINE_EVENT(ocfs2__ull, name,	\
+	TP_PROTO(unsigned long long num),	\
+	TP_ARGS(num))
+
 DECLARE_EVENT_CLASS(ocfs2__int_int,
 	TP_PROTO(int value1, int value2),
 	TP_ARGS(value1, value2),
@@ -59,6 +76,25 @@ DEFINE_EVENT(ocfs2__int_int, name,	\
 	TP_PROTO(int val1, int val2),	\
 	TP_ARGS(val1, val2))
 
+DECLARE_EVENT_CLASS(ocfs2__uint_uint,
+	TP_PROTO(unsigned int value1, unsigned int value2),
+	TP_ARGS(value1, value2),
+	TP_STRUCT__entry(
+		__field(unsigned int, value1)
+		__field(unsigned int, value2)
+	),
+	TP_fast_assign(
+		__entry->value1 = value1;
+		__entry->value2 = value2;
+	),
+	TP_printk("%u %u", __entry->value1, __entry->value2)
+);
+
+#define DEFINE_OCFS2_UINT_UINT_EVENT(name)	\
+DEFINE_EVENT(ocfs2__uint_uint, name,	\
+	TP_PROTO(unsigned int val1, unsigned int val2),	\
+	TP_ARGS(val1, val2))
+
 DECLARE_EVENT_CLASS(ocfs2__ull_uint,
 	TP_PROTO(unsigned long long value1, unsigned int value2),
 	TP_ARGS(value1, value2),
@@ -78,6 +114,49 @@ DEFINE_EVENT(ocfs2__ull_uint, name,	\
 	TP_PROTO(unsigned long long val1, unsigned int val2),	\
 	TP_ARGS(val1, val2))
 
+DECLARE_EVENT_CLASS(ocfs2__ull_ull,
+	TP_PROTO(unsigned long long value1, unsigned long long value2),
+	TP_ARGS(value1, value2),
+	TP_STRUCT__entry(
+		__field(unsigned long long, value1)
+		__field(unsigned long long, value2)
+	),
+	TP_fast_assign(
+		__entry->value1 = value1;
+		__entry->value2 = value2;
+	),
+	TP_printk("%llu %llu", __entry->value1, __entry->value2)
+);
+
+#define DEFINE_OCFS2_ULL_ULL_EVENT(name)	\
+DEFINE_EVENT(ocfs2__ull_ull, name,	\
+	TP_PROTO(unsigned long long val1, unsigned long long val2),	\
+	TP_ARGS(val1, val2))
+
+DECLARE_EVENT_CLASS(ocfs2__ull_uint_uint,
+	TP_PROTO(unsigned long long value1,
+		 unsigned int value2, unsigned int value3),
+	TP_ARGS(value1, value2, value3),
+	TP_STRUCT__entry(
+		__field(unsigned long long, value1)
+		__field(unsigned int, value2)
+		__field(unsigned int, value3)
+	),
+	TP_fast_assign(
+		__entry->value1 = value1;
+		__entry->value2 = value2;
+		__entry->value3	= value3;
+	),
+	TP_printk("%llu %u %u", __entry->value1,
+		  __entry->value2, __entry->value3)
+);
+
+#define DEFINE_OCFS2_ULL_UINT_UINT_EVENT(name)	\
+DEFINE_EVENT(ocfs2__ull_uint_uint, name,	\
+	TP_PROTO(unsigned long long val1,	\
+		 unsigned int val2, unsigned int val3),	\
+	TP_ARGS(val1, val2, val3))
+
 DECLARE_EVENT_CLASS(ocfs2__uint_uint_uint,
 	TP_PROTO(unsigned int value1, unsigned int value2,
 		 unsigned int value3),
@@ -127,6 +206,33 @@ DEFINE_EVENT(ocfs2__ull_int_int_int, name,	\
 		 int value2, int value3),	\
 	TP_ARGS(ull, value1, value2, value3))
 
+DECLARE_EVENT_CLASS(ocfs2__ull_uint_uint_uint,
+	TP_PROTO(unsigned long long ull, unsigned int value1,
+		 unsigned int value2, unsigned int value3),
+	TP_ARGS(ull, value1, value2, value3),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ull)
+		__field(unsigned int, value1)
+		__field(unsigned int, value2)
+		__field(unsigned int, value3)
+	),
+	TP_fast_assign(
+		__entry->ull = ull;
+		__entry->value1 = value1;
+		__entry->value2	= value2;
+		__entry->value3	= value3;
+	),
+	TP_printk("%llu %u %u %u",
+		  __entry->ull, __entry->value1,
+		  __entry->value2, __entry->value3)
+);
+
+#define DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(name)	\
+DEFINE_EVENT(ocfs2__ull_uint_uint_uint, name,	\
+	TP_PROTO(unsigned long long ull, unsigned int value1,	\
+		 unsigned int value2, unsigned int value3),	\
+	TP_ARGS(ull, value1, value2, value3))
+
 /* Trace events for fs/ocfs2/alloc.c. */
 DECLARE_EVENT_CLASS(ocfs2__btree_ops,
 	TP_PROTO(unsigned long long owner,\
@@ -483,6 +589,112 @@ DEFINE_OCFS2_INT_INT_EVENT(ocfs2_local_alloc_new_window);
 DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_local_alloc_new_window_result);
 
 /* End of trace events for fs/ocfs2/localalloc.c. */
+
+/* Trace events for fs/ocfs2/suballoc.c. */
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_group_descriptor);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_block_group_alloc_contig);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_block_group_alloc_discontig);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_block_group_alloc);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_reserve_suballoc_bits_nospc);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_reserve_suballoc_bits_no_new_group);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_reserve_new_inode_new_group);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_block_group_set_bits);
+
+TRACE_EVENT(ocfs2_relink_block_group,
+	TP_PROTO(unsigned long long i_blkno, unsigned int chain,
+		 unsigned long long bg_blkno,
+		 unsigned long long prev_blkno),
+	TP_ARGS(i_blkno, chain, bg_blkno, prev_blkno),
+	TP_STRUCT__entry(
+		__field(unsigned long long, i_blkno)
+		__field(unsigned int, chain)
+		__field(unsigned long long, bg_blkno)
+		__field(unsigned long long, prev_blkno)
+	),
+	TP_fast_assign(
+		__entry->i_blkno = i_blkno;
+		__entry->chain = chain;
+		__entry->bg_blkno = bg_blkno;
+		__entry->prev_blkno = prev_blkno;
+	),
+	TP_printk("%llu %u %llu %llu",
+		  __entry->i_blkno, __entry->chain, __entry->bg_blkno,
+		  __entry->prev_blkno)
+);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_cluster_group_search_wrong_max_bits);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_cluster_group_search_max_block);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_block_group_search_max_block);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_search_chain_begin);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_search_chain_succ);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_search_chain_end);
+
+DEFINE_OCFS2_UINT_EVENT(ocfs2_claim_suballoc_bits);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_claim_new_inode_at_loc);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_block_group_clear_bits);
+
+TRACE_EVENT(ocfs2_free_suballoc_bits,
+	TP_PROTO(unsigned long long inode, unsigned long long group,
+		 unsigned int start_bit, unsigned int count),
+	TP_ARGS(inode, group, start_bit, count),
+	TP_STRUCT__entry(
+		__field(unsigned long long, inode)
+		__field(unsigned long long, group)
+		__field(unsigned int, start_bit)
+		__field(unsigned int, count)
+	),
+	TP_fast_assign(
+		__entry->inode = inode;
+		__entry->group = group;
+		__entry->start_bit = start_bit;
+		__entry->count = count;
+	),
+	TP_printk("%llu %llu %u %u", __entry->inode, __entry->group,
+		  __entry->start_bit, __entry->count)
+);
+
+TRACE_EVENT(ocfs2_free_clusters,
+	TP_PROTO(unsigned long long bg_blkno, unsigned long long start_blk,
+		 unsigned int start_bit, unsigned int count),
+	TP_ARGS(bg_blkno, start_blk, start_bit, count),
+	TP_STRUCT__entry(
+		__field(unsigned long long, bg_blkno)
+		__field(unsigned long long, start_blk)
+		__field(unsigned int, start_bit)
+		__field(unsigned int, count)
+	),
+	TP_fast_assign(
+		__entry->bg_blkno = bg_blkno;
+		__entry->start_blk = start_blk;
+		__entry->start_bit = start_bit;
+		__entry->count = count;
+	),
+	TP_printk("%llu %llu %u %u", __entry->bg_blkno, __entry->start_blk,
+		  __entry->start_bit, __entry->count)
+);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_get_suballoc_slot_bit);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_test_suballoc_bit);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_test_inode_bit);
+
+/* End of trace events for fs/ocfs2/suballoc.c. */
 #endif /* _TRACE_OCFS2_H */
 
 /* This part must be outside protection */
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 2d722f1fbaa0..94342bd9800d 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -44,6 +44,7 @@
 #include "super.h"
 #include "sysfile.h"
 #include "uptodate.h"
+#include "ocfs2_trace.h"
 
 #include "buffer_head_io.h"
 
@@ -308,8 +309,8 @@ static int ocfs2_validate_group_descriptor(struct super_block *sb,
 	int rc;
 	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
 
-	mlog(0, "Validating group descriptor %llu\n",
-	     (unsigned long long)bh->b_blocknr);
+	trace_ocfs2_validate_group_descriptor(
+					(unsigned long long)bh->b_blocknr);
 
 	BUG_ON(!buffer_uptodate(bh));
 
@@ -476,8 +477,8 @@ ocfs2_block_group_alloc_contig(struct ocfs2_super *osb, handle_t *handle,
 
 	/* setup the group */
 	bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
-	mlog(0, "new descriptor, record %u, at block %llu\n",
-	     alloc_rec, (unsigned long long)bg_blkno);
+	trace_ocfs2_block_group_alloc_contig(
+	     (unsigned long long)bg_blkno, alloc_rec);
 
 	bg_bh = sb_getblk(osb->sb, bg_blkno);
 	if (!bg_bh) {
@@ -656,8 +657,8 @@ ocfs2_block_group_alloc_discontig(handle_t *handle,
 
 	/* setup the group */
 	bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
-	mlog(0, "new descriptor, record %u, at block %llu\n",
-	     alloc_rec, (unsigned long long)bg_blkno);
+	trace_ocfs2_block_group_alloc_discontig(
+				(unsigned long long)bg_blkno, alloc_rec);
 
 	bg_bh = sb_getblk(osb->sb, bg_blkno);
 	if (!bg_bh) {
@@ -727,8 +728,8 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
 	}
 
 	if (last_alloc_group && *last_alloc_group != 0) {
-		mlog(0, "use old allocation group %llu for block group alloc\n",
-		     (unsigned long long)*last_alloc_group);
+		trace_ocfs2_block_group_alloc(
+				(unsigned long long)*last_alloc_group);
 		ac->ac_last_group = *last_alloc_group;
 	}
 
@@ -851,16 +852,15 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
 	if (bits_wanted > free_bits) {
 		/* cluster bitmap never grows */
 		if (ocfs2_is_cluster_bitmap(alloc_inode)) {
-			mlog(0, "Disk Full: wanted=%u, free_bits=%u\n",
-			     bits_wanted, free_bits);
+			trace_ocfs2_reserve_suballoc_bits_nospc(bits_wanted,
+								free_bits);
 			status = -ENOSPC;
 			goto bail;
 		}
 
 		if (!(flags & ALLOC_NEW_GROUP)) {
-			mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, "
-			     "and we don't alloc a new group for it.\n",
-			     slot, bits_wanted, free_bits);
+			trace_ocfs2_reserve_suballoc_bits_no_new_group(
+						slot, bits_wanted, free_bits);
 			status = -ENOSPC;
 			goto bail;
 		}
@@ -1117,8 +1117,8 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
 		spin_lock(&osb->osb_lock);
 		osb->osb_inode_alloc_group = alloc_group;
 		spin_unlock(&osb->osb_lock);
-		mlog(0, "after reservation, new allocation group is "
-		     "%llu\n", (unsigned long long)alloc_group);
+		trace_ocfs2_reserve_new_inode_new_group(
+			(unsigned long long)alloc_group);
 
 		/*
 		 * Some inodes must be freed by us, so try to allocate
@@ -1360,8 +1360,7 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
 	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
 	BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
 
-	mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
-	     num_bits);
+	trace_ocfs2_block_group_set_bits(bit_off, num_bits);
 
 	if (ocfs2_is_cluster_bitmap(alloc_inode))
 		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
@@ -1434,10 +1433,10 @@ static int ocfs2_relink_block_group(handle_t *handle,
 	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
 	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg));
 
-	mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n",
-	     (unsigned long long)le64_to_cpu(fe->i_blkno), chain,
-	     (unsigned long long)le64_to_cpu(bg->bg_blkno),
-	     (unsigned long long)le64_to_cpu(prev_bg->bg_blkno));
+	trace_ocfs2_relink_block_group(
+		(unsigned long long)le64_to_cpu(fe->i_blkno), chain,
+		(unsigned long long)le64_to_cpu(bg->bg_blkno),
+		(unsigned long long)le64_to_cpu(prev_bg->bg_blkno));
 
 	fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno);
 	bg_ptr = le64_to_cpu(bg->bg_next_group);
@@ -1523,10 +1522,10 @@ static int ocfs2_cluster_group_search(struct inode *inode,
 		if ((gd_cluster_off + max_bits) >
 		    OCFS2_I(inode)->ip_clusters) {
 			max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off;
-			mlog(0, "Desc %llu, bg_bits %u, clusters %u, use %u\n",
-			     (unsigned long long)le64_to_cpu(gd->bg_blkno),
-			     le16_to_cpu(gd->bg_bits),
-			     OCFS2_I(inode)->ip_clusters, max_bits);
+			trace_ocfs2_cluster_group_search_wrong_max_bits(
+				(unsigned long long)le64_to_cpu(gd->bg_blkno),
+				le16_to_cpu(gd->bg_bits),
+				OCFS2_I(inode)->ip_clusters, max_bits);
 		}
 
 		ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
@@ -1540,9 +1539,9 @@ static int ocfs2_cluster_group_search(struct inode *inode,
 							  gd_cluster_off +
 							  res->sr_bit_offset +
 							  res->sr_bits);
-			mlog(0, "Checking %llu against %llu\n",
-			     (unsigned long long)blkoff,
-			     (unsigned long long)max_block);
+			trace_ocfs2_cluster_group_search_max_block(
+				(unsigned long long)blkoff,
+				(unsigned long long)max_block);
 			if (blkoff > max_block)
 				return -ENOSPC;
 		}
@@ -1586,9 +1585,9 @@ static int ocfs2_block_group_search(struct inode *inode,
 		if (!ret && max_block) {
 			blkoff = le64_to_cpu(bg->bg_blkno) +
 				res->sr_bit_offset + res->sr_bits;
-			mlog(0, "Checking %llu against %llu\n",
-			     (unsigned long long)blkoff,
-			     (unsigned long long)max_block);
+			trace_ocfs2_block_group_search_max_block(
+				(unsigned long long)blkoff,
+				(unsigned long long)max_block);
 			if (blkoff > max_block)
 				ret = -ENOSPC;
 		}
@@ -1754,9 +1753,9 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 	struct ocfs2_group_desc *bg;
 
 	chain = ac->ac_chain;
-	mlog(0, "trying to alloc %u bits from chain %u, inode %llu\n",
-	     bits_wanted, chain,
-	     (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno);
+	trace_ocfs2_search_chain_begin(
+		(unsigned long long)OCFS2_I(alloc_inode)->ip_blkno,
+		bits_wanted, chain);
 
 	status = ocfs2_read_group_descriptor(alloc_inode, fe,
 					     le64_to_cpu(cl->cl_recs[chain].c_blkno),
@@ -1797,8 +1796,8 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 		goto bail;
 	}
 
-	mlog(0, "alloc succeeds: we give %u bits from block group %llu\n",
-	     res->sr_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno));
+	trace_ocfs2_search_chain_succ(
+		(unsigned long long)le64_to_cpu(bg->bg_blkno), res->sr_bits);
 
 	res->sr_bg_blkno = le64_to_cpu(bg->bg_blkno);
 
@@ -1859,8 +1858,9 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
 		goto bail;
 	}
 
-	mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits,
-	     (unsigned long long)le64_to_cpu(fe->i_blkno));
+	trace_ocfs2_search_chain_end(
+			(unsigned long long)le64_to_cpu(fe->i_blkno),
+			res->sr_bits);
 
 out_loc_only:
 	*bits_left = le16_to_cpu(bg->bg_free_bits_count);
@@ -1942,8 +1942,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac,
 		goto bail;
 	}
 
-	mlog(0, "Search of victim chain %u came up with nothing, "
-	     "trying all chains now.\n", victim);
+	trace_ocfs2_claim_suballoc_bits(victim);
 
 	/* If we didn't pick a good victim, then just default to
 	 * searching each chain in order. Don't allow chain relinking
@@ -2171,8 +2170,8 @@ int ocfs2_claim_new_inode_at_loc(handle_t *handle,
 		goto out;
 	}
 
-	mlog(0, "Allocated %u bits from suballocator %llu\n", res->sr_bits,
-	     (unsigned long long)di_blkno);
+	trace_ocfs2_claim_new_inode_at_loc((unsigned long long)di_blkno,
+					   res->sr_bits);
 
 	atomic_inc(&OCFS2_SB(ac->ac_inode->i_sb)->alloc_stats.bg_allocs);
 
@@ -2393,7 +2392,7 @@ static int ocfs2_block_group_clear_bits(handle_t *handle,
 	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
 	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
 
-	mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
+	trace_ocfs2_block_group_clear_bits(bit_off, num_bits);
 
 	BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode));
 	status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode),
@@ -2466,9 +2465,10 @@ static int _ocfs2_free_suballoc_bits(handle_t *handle,
 	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
 	BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
 
-	mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n",
-	     (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count,
-	     (unsigned long long)bg_blkno, start_bit);
+	trace_ocfs2_free_suballoc_bits(
+		(unsigned long long)OCFS2_I(alloc_inode)->ip_blkno,
+		(unsigned long long)bg_blkno,
+		start_bit, count);
 
 	status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno,
 					     &group_bh);
@@ -2560,10 +2560,9 @@ static int _ocfs2_free_clusters(handle_t *handle,
 	ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
 				     &bg_start_bit);
 
-	mlog(0, "want to free %u clusters starting at block %llu\n",
-	     num_clusters, (unsigned long long)start_blk);
-	mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n",
-	     (unsigned long long)bg_blkno, bg_start_bit);
+	trace_ocfs2_free_clusters((unsigned long long)bg_blkno,
+			(unsigned long long)start_blk,
+			bg_start_bit, num_clusters);
 
 	status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
 					   bg_start_bit, bg_blkno,
@@ -2748,7 +2747,7 @@ static int ocfs2_get_suballoc_slot_bit(struct ocfs2_super *osb, u64 blkno,
 	struct buffer_head *inode_bh = NULL;
 	struct ocfs2_dinode *inode_fe;
 
-	mlog(0, "blkno: %llu\n", (unsigned long long)blkno);
+	trace_ocfs2_get_suballoc_slot_bit((unsigned long long)blkno);
 
 	/* dirty read disk */
 	status = ocfs2_read_blocks_sync(osb, blkno, 1, &inode_bh);
@@ -2809,8 +2808,8 @@ static int ocfs2_test_suballoc_bit(struct ocfs2_super *osb,
 	u64 bg_blkno;
 	int status;
 
-	mlog(0, "blkno: %llu bit: %u\n", (unsigned long long)blkno,
-	     (unsigned int)bit);
+	trace_ocfs2_test_suballoc_bit((unsigned long long)blkno,
+				      (unsigned int)bit);
 
 	alloc_di = (struct ocfs2_dinode *)alloc_bh->b_data;
 	if ((bit + 1) > ocfs2_bits_per_group(&alloc_di->id2.i_chain)) {
@@ -2863,7 +2862,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
 	struct inode *inode_alloc_inode;
 	struct buffer_head *alloc_bh = NULL;
 
-	mlog(0, "blkno: %llu", (unsigned long long)blkno);
+	trace_ocfs2_test_inode_bit((unsigned long long)blkno);
 
 	status = ocfs2_get_suballoc_slot_bit(osb, blkno, &suballoc_slot,
 					     &group_blkno, &suballoc_bit);
-- 
cgit v1.2.3


From a5438702ee63b13710c497bb22c7952a7e5d6a52 Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Tue, 22 Feb 2011 08:24:01 +0800
Subject: ocfs2: Remove mlog(0) from fs/ocfs2/suballoc.c

This is the 4th step to remove the debug info of DISK_ALLOC.

Signed-off-by: Tao Ma <boyu.mt@taobao.com>
---
 fs/ocfs2/ocfs2_trace.h | 10 ++++++++++
 fs/ocfs2/resize.c      | 12 +++++++-----
 2 files changed, 17 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 11980ecba546..335edc50f132 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -590,6 +590,16 @@ DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_local_alloc_new_window_result);
 
 /* End of trace events for fs/ocfs2/localalloc.c. */
 
+/* Trace events for fs/ocfs2/resize.c. */
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_update_last_group_and_inode);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_group_extend);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_group_add);
+
+/* End of trace events for fs/ocfs2/resize.c. */
+
 /* Trace events for fs/ocfs2/suballoc.c. */
 
 DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_group_descriptor);
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index 76b37909a4a1..761d20da9dcd 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -39,6 +39,7 @@
 #include "super.h"
 #include "sysfile.h"
 #include "uptodate.h"
+#include "ocfs2_trace.h"
 
 #include "buffer_head_io.h"
 #include "suballoc.h"
@@ -102,8 +103,8 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
 	u16 cl_bpc = le16_to_cpu(cl->cl_bpc);
 	u16 cl_cpg = le16_to_cpu(cl->cl_cpg);
 
-	mlog(0, "(new_clusters=%d, first_new_cluster = %u)\n",
-	     new_clusters, first_new_cluster);
+	trace_ocfs2_update_last_group_and_inode(new_clusters,
+						first_new_cluster);
 
 	ret = ocfs2_journal_access_gd(handle, INODE_CACHE(bm_inode),
 				      group_bh, OCFS2_JOURNAL_ACCESS_WRITE);
@@ -340,7 +341,8 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
 		goto out_unlock;
 	}
 
-	mlog(0, "extend the last group at %llu, new clusters = %d\n",
+
+	trace_ocfs2_group_extend(
 	     (unsigned long long)le64_to_cpu(group->bg_blkno), new_clusters);
 
 	handle = ocfs2_start_trans(osb, OCFS2_GROUP_EXTEND_CREDITS);
@@ -515,8 +517,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
 		goto out_unlock;
 	}
 
-	mlog(0, "Add a new group  %llu in chain = %u, length = %u\n",
-	     (unsigned long long)input->group, input->chain, input->clusters);
+	trace_ocfs2_group_add((unsigned long long)input->group,
+			       input->chain, input->clusters, input->frees);
 
 	handle = ocfs2_start_trans(osb, OCFS2_GROUP_ADD_CREDITS);
 	if (IS_ERR(handle)) {
-- 
cgit v1.2.3


From 81bad69749623062fae2f94e2d98dd43d95a36f4 Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Tue, 22 Feb 2011 08:01:27 +0800
Subject: ocfs2: Remove DISK_ALLOC from masklog.

Since all 4 files, localalloc.c, suballoc.c, alloc.c and
resize.c, which use DISK_ALLOC are changed to trace events,
Remove masklog DISK_ALLOC totally.

Signed-off-by: Tao Ma <boyu.mt@taobao.com>
---
 fs/ocfs2/alloc.c           | 1 -
 fs/ocfs2/cluster/masklog.c | 1 -
 fs/ocfs2/cluster/masklog.h | 1 -
 fs/ocfs2/localalloc.c      | 1 -
 fs/ocfs2/resize.c          | 1 -
 fs/ocfs2/suballoc.c        | 1 -
 6 files changed, 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index f0e26c4adc94..b27a0d86f8c5 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -30,7 +30,6 @@
 #include <linux/swap.h>
 #include <linux/quotaops.h>
 
-#define MLOG_MASK_PREFIX ML_DISK_ALLOC
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index fdc302ddafe8..caba07c2dac6 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -93,7 +93,6 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
 	define_mask(DLM_RECOVERY),
 	define_mask(AIO),
 	define_mask(JOURNAL),
-	define_mask(DISK_ALLOC),
 	define_mask(SUPER),
 	define_mask(FILE_IO),
 	define_mask(EXTENT_MAP),
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 51ee790c7678..155835a6b1c3 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -95,7 +95,6 @@
 #define ML_DLM_RECOVERY	0x0000000000001000ULL /* dlm master functions */
 #define ML_AIO		0x0000000000002000ULL /* ocfs2 aio read and write */
 #define ML_JOURNAL	0x0000000000004000ULL /* ocfs2 journalling functions */
-#define ML_DISK_ALLOC	0x0000000000008000ULL /* ocfs2 disk allocation */
 #define ML_SUPER	0x0000000000010000ULL /* ocfs2 mount / umount */
 #define ML_FILE_IO	0x0000000000020000ULL /* ocfs2 file I/O */
 #define ML_EXTENT_MAP	0x0000000000040000ULL /* ocfs2 extent map caching */
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index a311b1f658e2..210c35237548 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -29,7 +29,6 @@
 #include <linux/highmem.h>
 #include <linux/bitops.h>
 
-#define MLOG_MASK_PREFIX ML_DISK_ALLOC
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index 761d20da9dcd..ec55add7604a 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -27,7 +27,6 @@
 #include <linux/fs.h>
 #include <linux/types.h>
 
-#define MLOG_MASK_PREFIX ML_DISK_ALLOC
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 94342bd9800d..ab6e2061074f 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -29,7 +29,6 @@
 #include <linux/slab.h>
 #include <linux/highmem.h>
 
-#define MLOG_MASK_PREFIX ML_DISK_ALLOC
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
-- 
cgit v1.2.3


From 198aac2838731691ca67e15fd8a56f65307d3677 Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Tue, 22 Feb 2011 08:20:04 +0800
Subject: ocfs2: Remove masklog ML_REFCOUNT.

Change all the "mlog(0," in fs/ocfs2/refcounttree.c to trace events.
And finally remove masklog ML_REFCOUNT.

Signed-off-by: Tao Ma <boyu.mt@taobao.com>
---
 fs/ocfs2/cluster/masklog.c |   1 -
 fs/ocfs2/cluster/masklog.h |   1 -
 fs/ocfs2/ocfs2_trace.h     | 263 +++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/refcounttree.c    | 158 ++++++++++++++-------------
 4 files changed, 341 insertions(+), 82 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index caba07c2dac6..a961cca12529 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -108,7 +108,6 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
 	define_mask(EXPORT),
 	define_mask(XATTR),
 	define_mask(QUOTA),
-	define_mask(REFCOUNT),
 	define_mask(BASTS),
 	define_mask(RESERVATIONS),
 	define_mask(CLUSTER),
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 155835a6b1c3..845ea2b02f0b 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -110,7 +110,6 @@
 #define ML_EXPORT	0x0000000010000000ULL /* ocfs2 export operations */
 #define ML_XATTR	0x0000000020000000ULL /* ocfs2 extended attributes */
 #define ML_QUOTA	0x0000000040000000ULL /* ocfs2 quota operations */
-#define ML_REFCOUNT	0x0000000080000000ULL /* refcount tree operations */
 #define ML_BASTS	0x0000000100000000ULL /* dlmglue asts and basts */
 #define ML_RESERVATIONS	0x0000000200000000ULL /* ocfs2 alloc reservations */
 #define ML_CLUSTER	0x0000000400000000ULL /* cluster stack */
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 335edc50f132..a8b57ca7bdb5 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -133,6 +133,30 @@ DEFINE_EVENT(ocfs2__ull_ull, name,	\
 	TP_PROTO(unsigned long long val1, unsigned long long val2),	\
 	TP_ARGS(val1, val2))
 
+DECLARE_EVENT_CLASS(ocfs2__ull_ull_uint,
+	TP_PROTO(unsigned long long value1,
+		 unsigned long long value2, unsigned int value3),
+	TP_ARGS(value1, value2, value3),
+	TP_STRUCT__entry(
+		__field(unsigned long long, value1)
+		__field(unsigned long long, value2)
+		__field(unsigned int, value3)
+	),
+	TP_fast_assign(
+		__entry->value1 = value1;
+		__entry->value2 = value2;
+		__entry->value3 = value3;
+	),
+	TP_printk("%llu %llu %u",
+		  __entry->value1, __entry->value2, __entry->value3)
+);
+
+#define DEFINE_OCFS2_ULL_ULL_UINT_EVENT(name)	\
+DEFINE_EVENT(ocfs2__ull_ull_uint, name,	\
+	TP_PROTO(unsigned long long val1,	\
+		 unsigned long long val2, unsigned int val3),	\
+	TP_ARGS(val1, val2, val3))
+
 DECLARE_EVENT_CLASS(ocfs2__ull_uint_uint,
 	TP_PROTO(unsigned long long value1,
 		 unsigned int value2, unsigned int value3),
@@ -705,6 +729,245 @@ DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_test_suballoc_bit);
 DEFINE_OCFS2_ULL_EVENT(ocfs2_test_inode_bit);
 
 /* End of trace events for fs/ocfs2/suballoc.c. */
+
+/* Trace events for fs/ocfs2/refcounttree.c. */
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_refcount_block);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_purge_refcount_trees);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_create_refcount_tree);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_create_refcount_tree_blkno);
+
+DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_change_refcount_rec);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_expand_inline_ref_root);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_divide_leaf_refcount_block);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_new_leaf_refcount_block);
+
+DECLARE_EVENT_CLASS(ocfs2__refcount_tree_ops,
+	TP_PROTO(unsigned long long blkno, int index,
+		 unsigned long long cpos,
+		 unsigned int clusters, unsigned int refcount),
+	TP_ARGS(blkno, index, cpos, clusters, refcount),
+	TP_STRUCT__entry(
+		__field(unsigned long long, blkno)
+		__field(int, index)
+		__field(unsigned long long, cpos)
+		__field(unsigned int, clusters)
+		__field(unsigned int, refcount)
+	),
+	TP_fast_assign(
+		__entry->blkno = blkno;
+		__entry->index = index;
+		__entry->cpos = cpos;
+		__entry->clusters = clusters;
+		__entry->refcount = refcount;
+	),
+	TP_printk("%llu %d %llu %u %u", __entry->blkno, __entry->index,
+		  __entry->cpos, __entry->clusters, __entry->refcount)
+);
+
+#define DEFINE_OCFS2_REFCOUNT_TREE_OPS_EVENT(name)	\
+DEFINE_EVENT(ocfs2__refcount_tree_ops, name,		\
+	TP_PROTO(unsigned long long blkno, int index,	\
+		 unsigned long long cpos,		\
+		 unsigned int count, unsigned int refcount),	\
+	TP_ARGS(blkno, index, cpos, count, refcount))
+
+DEFINE_OCFS2_REFCOUNT_TREE_OPS_EVENT(ocfs2_insert_refcount_rec);
+
+TRACE_EVENT(ocfs2_split_refcount_rec,
+	TP_PROTO(unsigned long long cpos,
+		 unsigned int clusters, unsigned int refcount,
+		 unsigned long long split_cpos,
+		 unsigned int split_clusters, unsigned int split_refcount),
+	TP_ARGS(cpos, clusters, refcount,
+		split_cpos, split_clusters, split_refcount),
+	TP_STRUCT__entry(
+		__field(unsigned long long, cpos)
+		__field(unsigned int, clusters)
+		__field(unsigned int, refcount)
+		__field(unsigned long long, split_cpos)
+		__field(unsigned int, split_clusters)
+		__field(unsigned int, split_refcount)
+	),
+	TP_fast_assign(
+		__entry->cpos = cpos;
+		__entry->clusters = clusters;
+		__entry->refcount = refcount;
+		__entry->split_cpos = split_cpos;
+		__entry->split_clusters = split_clusters;
+		__entry->split_refcount	= split_refcount;
+	),
+	TP_printk("%llu %u %u %llu %u %u",
+		  __entry->cpos, __entry->clusters, __entry->refcount,
+		  __entry->split_cpos, __entry->split_clusters,
+		  __entry->split_refcount)
+);
+
+DEFINE_OCFS2_REFCOUNT_TREE_OPS_EVENT(ocfs2_split_refcount_rec_insert);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_increase_refcount_begin);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_increase_refcount_change);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_increase_refcount_insert);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_increase_refcount_split);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_remove_refcount_extent);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_restore_refcount_block);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_decrease_refcount_rec);
+
+TRACE_EVENT(ocfs2_decrease_refcount,
+	TP_PROTO(unsigned long long owner,
+		 unsigned long long cpos,
+		 unsigned int len, int delete),
+	TP_ARGS(owner, cpos, len, delete),
+	TP_STRUCT__entry(
+		__field(unsigned long long, owner)
+		__field(unsigned long long, cpos)
+		__field(unsigned int, len)
+		__field(int, delete)
+	),
+	TP_fast_assign(
+		__entry->owner = owner;
+		__entry->cpos = cpos;
+		__entry->len = len;
+		__entry->delete = delete;
+	),
+	TP_printk("%llu %llu %u %d",
+		  __entry->owner, __entry->cpos, __entry->len, __entry->delete)
+);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_mark_extent_refcounted);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_calc_refcount_meta_credits);
+
+TRACE_EVENT(ocfs2_calc_refcount_meta_credits_iterate,
+	TP_PROTO(int recs_add, unsigned long long cpos,
+		 unsigned int clusters, unsigned long long r_cpos,
+		 unsigned int r_clusters, unsigned int refcount, int index),
+	TP_ARGS(recs_add, cpos, clusters, r_cpos, r_clusters, refcount, index),
+	TP_STRUCT__entry(
+		__field(int, recs_add)
+		__field(unsigned long long, cpos)
+		__field(unsigned int, clusters)
+		__field(unsigned long long, r_cpos)
+		__field(unsigned int, r_clusters)
+		__field(unsigned int, refcount)
+		__field(int, index)
+	),
+	TP_fast_assign(
+		__entry->recs_add = recs_add;
+		__entry->cpos = cpos;
+		__entry->clusters = clusters;
+		__entry->r_cpos = r_cpos;
+		__entry->r_clusters = r_clusters;
+		__entry->refcount = refcount;
+		__entry->index = index;
+	),
+	TP_printk("%d %llu %u %llu %u %u %d",
+		  __entry->recs_add, __entry->cpos, __entry->clusters,
+		  __entry->r_cpos, __entry->r_clusters,
+		  __entry->refcount, __entry->index)
+);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_add_refcount_flag);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_prepare_refcount_change_for_del);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_lock_refcount_allocators);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_duplicate_clusters_by_page);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_duplicate_clusters_by_jbd);
+
+TRACE_EVENT(ocfs2_clear_ext_refcount,
+	TP_PROTO(unsigned long long ino, unsigned int cpos,
+		 unsigned int len, unsigned int p_cluster,
+		 unsigned int ext_flags),
+	TP_ARGS(ino, cpos, len, p_cluster, ext_flags),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ino)
+		__field(unsigned int, cpos)
+		__field(unsigned int, len)
+		__field(unsigned int, p_cluster)
+		__field(unsigned int, ext_flags)
+	),
+	TP_fast_assign(
+		__entry->ino = ino;
+		__entry->cpos = cpos;
+		__entry->len = len;
+		__entry->p_cluster = p_cluster;
+		__entry->ext_flags = ext_flags;
+	),
+	TP_printk("%llu %u %u %u %u",
+		  __entry->ino, __entry->cpos, __entry->len,
+		  __entry->p_cluster, __entry->ext_flags)
+);
+
+TRACE_EVENT(ocfs2_replace_clusters,
+	TP_PROTO(unsigned long long ino, unsigned int cpos,
+		 unsigned int old, unsigned int new, unsigned int len,
+		 unsigned int ext_flags),
+	TP_ARGS(ino, cpos, old, new, len, ext_flags),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ino)
+		__field(unsigned int, cpos)
+		__field(unsigned int, old)
+		__field(unsigned int, new)
+		__field(unsigned int, len)
+		__field(unsigned int, ext_flags)
+	),
+	TP_fast_assign(
+		__entry->ino = ino;
+		__entry->cpos = cpos;
+		__entry->old = old;
+		__entry->new = new;
+		__entry->len = len;
+		__entry->ext_flags = ext_flags;
+	),
+	TP_printk("%llu %u %u %u %u %u",
+		  __entry->ino, __entry->cpos, __entry->old, __entry->new,
+		  __entry->len, __entry->ext_flags)
+);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_make_clusters_writable);
+
+TRACE_EVENT(ocfs2_refcount_cow_hunk,
+	TP_PROTO(unsigned long long ino, unsigned int cpos,
+		 unsigned int write_len, unsigned int max_cpos,
+		 unsigned int cow_start, unsigned int cow_len),
+	TP_ARGS(ino, cpos, write_len, max_cpos, cow_start, cow_len),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ino)
+		__field(unsigned int, cpos)
+		__field(unsigned int, write_len)
+		__field(unsigned int, max_cpos)
+		__field(unsigned int, cow_start)
+		__field(unsigned int, cow_len)
+	),
+	TP_fast_assign(
+		__entry->ino = ino;
+		__entry->cpos = cpos;
+		__entry->write_len = write_len;
+		__entry->max_cpos = max_cpos;
+		__entry->cow_start = cow_start;
+		__entry->cow_len = cow_len;
+	),
+	TP_printk("%llu %u %u %u %u %u",
+		  __entry->ino, __entry->cpos, __entry->write_len,
+		  __entry->max_cpos, __entry->cow_start, __entry->cow_len)
+);
+
+/* End of trace events for fs/ocfs2/refcounttree.c. */
 #endif /* _TRACE_OCFS2_H */
 
 /* This part must be outside protection */
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 29623da133cc..93a81f853e66 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -16,7 +16,6 @@
  */
 
 #include <linux/sort.h>
-#define MLOG_MASK_PREFIX ML_REFCOUNT
 #include <cluster/masklog.h>
 #include "ocfs2.h"
 #include "inode.h"
@@ -34,6 +33,7 @@
 #include "aops.h"
 #include "xattr.h"
 #include "namei.h"
+#include "ocfs2_trace.h"
 
 #include <linux/bio.h>
 #include <linux/blkdev.h>
@@ -84,8 +84,7 @@ static int ocfs2_validate_refcount_block(struct super_block *sb,
 	struct ocfs2_refcount_block *rb =
 		(struct ocfs2_refcount_block *)bh->b_data;
 
-	mlog(0, "Validating refcount block %llu\n",
-	     (unsigned long long)bh->b_blocknr);
+	trace_ocfs2_validate_refcount_block((unsigned long long)bh->b_blocknr);
 
 	BUG_ON(!buffer_uptodate(bh));
 
@@ -545,8 +544,8 @@ void ocfs2_purge_refcount_trees(struct ocfs2_super *osb)
 	while ((node = rb_last(root)) != NULL) {
 		tree = rb_entry(node, struct ocfs2_refcount_tree, rf_node);
 
-		mlog(0, "Purge tree %llu\n",
-		     (unsigned long long) tree->rf_blkno);
+		trace_ocfs2_purge_refcount_trees(
+				(unsigned long long) tree->rf_blkno);
 
 		rb_erase(&tree->rf_node, root);
 		ocfs2_free_refcount_tree(tree);
@@ -575,7 +574,8 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
 
 	BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL);
 
-	mlog(0, "create tree for inode %lu\n", inode->i_ino);
+	trace_ocfs2_create_refcount_tree(
+		(unsigned long long)OCFS2_I(inode)->ip_blkno);
 
 	ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
 	if (ret) {
@@ -646,8 +646,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode,
 	di->i_refcount_loc = cpu_to_le64(first_blkno);
 	spin_unlock(&oi->ip_lock);
 
-	mlog(0, "created tree for inode %lu, refblock %llu\n",
-	     inode->i_ino, (unsigned long long)first_blkno);
+	trace_ocfs2_create_refcount_tree_blkno((unsigned long long)first_blkno);
 
 	ocfs2_journal_dirty(handle, di_bh);
 
@@ -1256,8 +1255,9 @@ static int ocfs2_change_refcount_rec(handle_t *handle,
 		goto out;
 	}
 
-	mlog(0, "change index %d, old count %u, change %d\n", index,
-	     le32_to_cpu(rec->r_refcount), change);
+	trace_ocfs2_change_refcount_rec(
+		(unsigned long long)ocfs2_metadata_cache_owner(ci),
+		index, le32_to_cpu(rec->r_refcount), change);
 	le32_add_cpu(&rec->r_refcount, change);
 
 	if (!rec->r_refcount) {
@@ -1353,8 +1353,8 @@ static int ocfs2_expand_inline_ref_root(handle_t *handle,
 
 	ocfs2_journal_dirty(handle, ref_root_bh);
 
-	mlog(0, "new leaf block %llu, used %u\n", (unsigned long long)blkno,
-	     le16_to_cpu(new_rb->rf_records.rl_used));
+	trace_ocfs2_expand_inline_ref_root((unsigned long long)blkno,
+		le16_to_cpu(new_rb->rf_records.rl_used));
 
 	*ref_leaf_bh = new_bh;
 	new_bh = NULL;
@@ -1466,9 +1466,9 @@ static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh,
 			(struct ocfs2_refcount_block *)new_bh->b_data;
 	struct ocfs2_refcount_list *new_rl = &new_rb->rf_records;
 
-	mlog(0, "split old leaf refcount block %llu, count = %u, used = %u\n",
-	     (unsigned long long)ref_leaf_bh->b_blocknr,
-	     le32_to_cpu(rl->rl_count), le32_to_cpu(rl->rl_used));
+	trace_ocfs2_divide_leaf_refcount_block(
+		(unsigned long long)ref_leaf_bh->b_blocknr,
+		le32_to_cpu(rl->rl_count), le32_to_cpu(rl->rl_used));
 
 	/*
 	 * XXX: Improvement later.
@@ -1601,8 +1601,8 @@ static int ocfs2_new_leaf_refcount_block(handle_t *handle,
 
 	ocfs2_init_refcount_extent_tree(&ref_et, ci, ref_root_bh);
 
-	mlog(0, "insert new leaf block %llu at %u\n",
-	     (unsigned long long)new_bh->b_blocknr, new_cpos);
+	trace_ocfs2_new_leaf_refcount_block(
+			(unsigned long long)new_bh->b_blocknr, new_cpos);
 
 	/* Insert the new leaf block with the specific offset cpos. */
 	ret = ocfs2_insert_extent(handle, &ref_et, new_cpos, new_bh->b_blocknr,
@@ -1794,11 +1794,10 @@ static int ocfs2_insert_refcount_rec(handle_t *handle,
 			(le16_to_cpu(rf_list->rl_used) - index) *
 			 sizeof(struct ocfs2_refcount_rec));
 
-	mlog(0, "insert refcount record start %llu, len %u, count %u "
-	     "to leaf block %llu at index %d\n",
-	     (unsigned long long)le64_to_cpu(rec->r_cpos),
-	     le32_to_cpu(rec->r_clusters), le32_to_cpu(rec->r_refcount),
-	     (unsigned long long)ref_leaf_bh->b_blocknr, index);
+	trace_ocfs2_insert_refcount_rec(
+		(unsigned long long)ref_leaf_bh->b_blocknr, index,
+		(unsigned long long)le64_to_cpu(rec->r_cpos),
+		le32_to_cpu(rec->r_clusters), le32_to_cpu(rec->r_refcount));
 
 	rf_list->rl_recs[index] = *rec;
 
@@ -1850,10 +1849,12 @@ static int ocfs2_split_refcount_rec(handle_t *handle,
 
 	BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL);
 
-	mlog(0, "original r_pos %llu, cluster %u, split %llu, cluster %u\n",
-	     le64_to_cpu(orig_rec->r_cpos), le32_to_cpu(orig_rec->r_clusters),
-	     le64_to_cpu(split_rec->r_cpos),
-	     le32_to_cpu(split_rec->r_clusters));
+	trace_ocfs2_split_refcount_rec(le64_to_cpu(orig_rec->r_cpos),
+		le32_to_cpu(orig_rec->r_clusters),
+		le32_to_cpu(orig_rec->r_refcount),
+		le64_to_cpu(split_rec->r_cpos),
+		le32_to_cpu(split_rec->r_clusters),
+		le32_to_cpu(split_rec->r_refcount));
 
 	/*
 	 * If we just need to split the header or tail clusters,
@@ -1967,12 +1968,11 @@ static int ocfs2_split_refcount_rec(handle_t *handle,
 
 	if (split_rec->r_refcount) {
 		rf_list->rl_recs[index] = *split_rec;
-		mlog(0, "insert refcount record start %llu, len %u, count %u "
-		     "to leaf block %llu at index %d\n",
-		     (unsigned long long)le64_to_cpu(split_rec->r_cpos),
-		     le32_to_cpu(split_rec->r_clusters),
-		     le32_to_cpu(split_rec->r_refcount),
-		     (unsigned long long)ref_leaf_bh->b_blocknr, index);
+		trace_ocfs2_split_refcount_rec_insert(
+			(unsigned long long)ref_leaf_bh->b_blocknr, index,
+			(unsigned long long)le64_to_cpu(split_rec->r_cpos),
+			le32_to_cpu(split_rec->r_clusters),
+			le32_to_cpu(split_rec->r_refcount));
 
 		if (merge)
 			ocfs2_refcount_rec_merge(rb, index);
@@ -1997,7 +1997,7 @@ static int __ocfs2_increase_refcount(handle_t *handle,
 	struct ocfs2_refcount_rec rec;
 	unsigned int set_len = 0;
 
-	mlog(0, "Tree owner %llu, add refcount start %llu, len %u\n",
+	trace_ocfs2_increase_refcount_begin(
 	     (unsigned long long)ocfs2_metadata_cache_owner(ci),
 	     (unsigned long long)cpos, len);
 
@@ -2024,9 +2024,9 @@ static int __ocfs2_increase_refcount(handle_t *handle,
 		 */
 		if (rec.r_refcount && le64_to_cpu(rec.r_cpos) == cpos &&
 		    set_len <= len) {
-			mlog(0, "increase refcount rec, start %llu, len %u, "
-			     "count %u\n", (unsigned long long)cpos, set_len,
-			     le32_to_cpu(rec.r_refcount));
+			trace_ocfs2_increase_refcount_change(
+				(unsigned long long)cpos, set_len,
+				le32_to_cpu(rec.r_refcount));
 			ret = ocfs2_change_refcount_rec(handle, ci,
 							ref_leaf_bh, index,
 							merge, 1);
@@ -2037,7 +2037,7 @@ static int __ocfs2_increase_refcount(handle_t *handle,
 		} else if (!rec.r_refcount) {
 			rec.r_refcount = cpu_to_le32(1);
 
-			mlog(0, "insert refcount rec, start %llu, len %u\n",
+			trace_ocfs2_increase_refcount_insert(
 			     (unsigned long long)le64_to_cpu(rec.r_cpos),
 			     set_len);
 			ret = ocfs2_insert_refcount_rec(handle, ci, ref_root_bh,
@@ -2055,8 +2055,7 @@ static int __ocfs2_increase_refcount(handle_t *handle,
 			rec.r_clusters = cpu_to_le32(set_len);
 			le32_add_cpu(&rec.r_refcount, 1);
 
-			mlog(0, "split refcount rec, start %llu, "
-			     "len %u, count %u\n",
+			trace_ocfs2_increase_refcount_split(
 			     (unsigned long long)le64_to_cpu(rec.r_cpos),
 			     set_len, le32_to_cpu(rec.r_refcount));
 			ret = ocfs2_split_refcount_rec(handle, ci,
@@ -2095,6 +2094,11 @@ static int ocfs2_remove_refcount_extent(handle_t *handle,
 
 	BUG_ON(rb->rf_records.rl_used);
 
+	trace_ocfs2_remove_refcount_extent(
+		(unsigned long long)ocfs2_metadata_cache_owner(ci),
+		(unsigned long long)ref_leaf_bh->b_blocknr,
+		le32_to_cpu(rb->rf_cpos));
+
 	ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
 	ret = ocfs2_remove_extent(handle, &et, le32_to_cpu(rb->rf_cpos),
 				  1, meta_ac, dealloc);
@@ -2137,7 +2141,7 @@ static int ocfs2_remove_refcount_extent(handle_t *handle,
 	if (!rb->rf_list.l_next_free_rec) {
 		BUG_ON(rb->rf_clusters);
 
-		mlog(0, "reset refcount tree root %llu to be a record block.\n",
+		trace_ocfs2_restore_refcount_block(
 		     (unsigned long long)ref_root_bh->b_blocknr);
 
 		rb->rf_flags = 0;
@@ -2184,6 +2188,10 @@ static int ocfs2_decrease_refcount_rec(handle_t *handle,
 	BUG_ON(cpos + len >
 	       le64_to_cpu(rec->r_cpos) + le32_to_cpu(rec->r_clusters));
 
+	trace_ocfs2_decrease_refcount_rec(
+		(unsigned long long)ocfs2_metadata_cache_owner(ci),
+		(unsigned long long)cpos, len);
+
 	if (cpos == le64_to_cpu(rec->r_cpos) &&
 	    len == le32_to_cpu(rec->r_clusters))
 		ret = ocfs2_change_refcount_rec(handle, ci,
@@ -2195,12 +2203,6 @@ static int ocfs2_decrease_refcount_rec(handle_t *handle,
 
 		le32_add_cpu(&split.r_refcount, -1);
 
-		mlog(0, "split refcount rec, start %llu, "
-		     "len %u, count %u, original start %llu, len %u\n",
-		     (unsigned long long)le64_to_cpu(split.r_cpos),
-		     len, le32_to_cpu(split.r_refcount),
-		     (unsigned long long)le64_to_cpu(rec->r_cpos),
-		     le32_to_cpu(rec->r_clusters));
 		ret = ocfs2_split_refcount_rec(handle, ci,
 					       ref_root_bh, ref_leaf_bh,
 					       &split, index, 1,
@@ -2239,10 +2241,9 @@ static int __ocfs2_decrease_refcount(handle_t *handle,
 	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
 	struct buffer_head *ref_leaf_bh = NULL;
 
-	mlog(0, "Tree owner %llu, decrease refcount start %llu, "
-	     "len %u, delete %u\n",
-	     (unsigned long long)ocfs2_metadata_cache_owner(ci),
-	     (unsigned long long)cpos, len, delete);
+	trace_ocfs2_decrease_refcount(
+		(unsigned long long)ocfs2_metadata_cache_owner(ci),
+		(unsigned long long)cpos, len, delete);
 
 	while (len) {
 		ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
@@ -2352,8 +2353,8 @@ static int ocfs2_mark_extent_refcounted(struct inode *inode,
 {
 	int ret;
 
-	mlog(0, "Inode %lu refcount tree cpos %u, len %u, phys cluster %u\n",
-	     inode->i_ino, cpos, len, phys);
+	trace_ocfs2_mark_extent_refcounted(OCFS2_I(inode)->ip_blkno,
+					   cpos, len, phys);
 
 	if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
 		ocfs2_error(inode->i_sb, "Inode %lu want to use refcount "
@@ -2392,8 +2393,6 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
 	struct buffer_head *ref_leaf_bh = NULL, *prev_bh = NULL;
 	u32 len;
 
-	mlog(0, "start_cpos %llu, clusters %u\n",
-	     (unsigned long long)start_cpos, clusters);
 	while (clusters) {
 		ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
 					     cpos, clusters, &rec,
@@ -2427,12 +2426,11 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
 
 		rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
 
-		mlog(0, "recs_add %d,cpos %llu, clusters %u, rec->r_cpos %llu,"
-		     "rec->r_clusters %u, rec->r_refcount %u, index %d\n",
-		     recs_add, (unsigned long long)cpos, clusters,
-		     (unsigned long long)le64_to_cpu(rec.r_cpos),
-		     le32_to_cpu(rec.r_clusters),
-		     le32_to_cpu(rec.r_refcount), index);
+		trace_ocfs2_calc_refcount_meta_credits_iterate(
+				recs_add, (unsigned long long)cpos, clusters,
+				(unsigned long long)le64_to_cpu(rec.r_cpos),
+				le32_to_cpu(rec.r_clusters),
+				le32_to_cpu(rec.r_refcount), index);
 
 		len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) +
 			  le32_to_cpu(rec.r_clusters)) - cpos;
@@ -2488,7 +2486,6 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
 	if (!ref_blocks)
 		goto out;
 
-	mlog(0, "we need ref_blocks %d\n", ref_blocks);
 	*meta_add += ref_blocks;
 	*credits += ref_blocks;
 
@@ -2514,6 +2511,10 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
 	}
 
 out:
+
+	trace_ocfs2_calc_refcount_meta_credits(
+		(unsigned long long)start_cpos, clusters,
+		*meta_add, *credits);
 	brelse(ref_leaf_bh);
 	brelse(prev_bh);
 	return ret;
@@ -2578,8 +2579,7 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
 		goto out;
 	}
 
-	mlog(0, "reserve new metadata %d blocks, credits = %d\n",
-	     *ref_blocks, *credits);
+	trace_ocfs2_prepare_refcount_change_for_del(*ref_blocks, *credits);
 
 out:
 	brelse(ref_root_bh);
@@ -2886,8 +2886,7 @@ static int ocfs2_lock_refcount_allocators(struct super_block *sb,
 		goto out;
 	}
 
-	mlog(0, "reserve new metadata %d, clusters %u, credits = %d\n",
-	     meta_add, num_clusters, *credits);
+	trace_ocfs2_lock_refcount_allocators(meta_add, *credits);
 	ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(sb), meta_add,
 						meta_ac);
 	if (ret) {
@@ -2937,8 +2936,8 @@ static int ocfs2_duplicate_clusters_by_page(handle_t *handle,
 	loff_t offset, end, map_end;
 	struct address_space *mapping = context->inode->i_mapping;
 
-	mlog(0, "old_cluster %u, new %u, len %u at offset %u\n", old_cluster,
-	     new_cluster, new_len, cpos);
+	trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster,
+					       new_cluster, new_len);
 
 	readahead_pages =
 		(ocfs2_cow_contig_clusters(sb) <<
@@ -3031,8 +3030,8 @@ static int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
 	struct buffer_head *old_bh = NULL;
 	struct buffer_head *new_bh = NULL;
 
-	mlog(0, "old_cluster %u, new %u, len %u\n", old_cluster,
-	     new_cluster, new_len);
+	trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster,
+					       new_cluster, new_len);
 
 	for (i = 0; i < blocks; i++, old_block++, new_block++) {
 		new_bh = sb_getblk(osb->sb, new_block);
@@ -3085,8 +3084,8 @@ static int ocfs2_clear_ext_refcount(handle_t *handle,
 	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
 	u64 ino = ocfs2_metadata_cache_owner(et->et_ci);
 
-	mlog(0, "inode %llu cpos %u, len %u, p_cluster %u, ext_flags %u\n",
-	     (unsigned long long)ino, cpos, len, p_cluster, ext_flags);
+	trace_ocfs2_clear_ext_refcount((unsigned long long)ino,
+				       cpos, len, p_cluster, ext_flags);
 
 	memset(&replace_rec, 0, sizeof(replace_rec));
 	replace_rec.e_cpos = cpu_to_le32(cpos);
@@ -3141,8 +3140,8 @@ static int ocfs2_replace_clusters(handle_t *handle,
 	struct ocfs2_caching_info *ci = context->data_et.et_ci;
 	u64 ino = ocfs2_metadata_cache_owner(ci);
 
-	mlog(0, "inode %llu, cpos %u, old %u, new %u, len %u, ext_flags %u\n",
-	     (unsigned long long)ino, cpos, old, new, len, ext_flags);
+	trace_ocfs2_replace_clusters((unsigned long long)ino,
+				     cpos, old, new, len, ext_flags);
 
 	/*If the old clusters is unwritten, no need to duplicate. */
 	if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) {
@@ -3236,8 +3235,8 @@ static int ocfs2_make_clusters_writable(struct super_block *sb,
 	struct ocfs2_caching_info *ref_ci = &context->ref_tree->rf_ci;
 	struct ocfs2_refcount_rec rec;
 
-	mlog(0, "cpos %u, p_cluster %u, num_clusters %u, e_flags %u\n",
-	     cpos, p_cluster, num_clusters, e_flags);
+	trace_ocfs2_make_clusters_writable(cpos, p_cluster,
+					   num_clusters, e_flags);
 
 	ret = ocfs2_lock_refcount_allocators(sb, p_cluster, num_clusters,
 					     &context->data_et,
@@ -3475,9 +3474,9 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode,
 		goto out;
 	}
 
-	mlog(0, "CoW inode %lu, cpos %u, write_len %u, cow_start %u, "
-	     "cow_len %u\n", inode->i_ino,
-	     cpos, write_len, cow_start, cow_len);
+	trace_ocfs2_refcount_cow_hunk(OCFS2_I(inode)->ip_blkno,
+				      cpos, write_len, max_cpos,
+				      cow_start, cow_len);
 
 	BUG_ON(cow_len == 0);
 
@@ -3756,8 +3755,7 @@ int ocfs2_add_refcount_flag(struct inode *inode,
 		goto out;
 	}
 
-	mlog(0, "reserve new metadata %d, credits = %d\n",
-	     ref_blocks, credits);
+	trace_ocfs2_add_refcount_flag(ref_blocks, credits);
 
 	if (ref_blocks) {
 		ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(inode->i_sb),
-- 
cgit v1.2.3


From 9558156bcf8e4750d20034e941213273743ed86a Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Tue, 22 Feb 2011 21:33:59 +0800
Subject: ocfs2: Remove mlog(0) from fs/ocfs2/aops.c

Remove all the "mlog(0," in fs/ocfs2/aops.c.

Signed-off-by: Tao Ma <boyu.mt@taobao.com>
---
 fs/ocfs2/aops.c        |  55 +++++++++++-----------
 fs/ocfs2/ocfs2_trace.h | 124 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 153 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 9551518be435..0d44b7701844 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -29,7 +29,6 @@
 #include <linux/mpage.h>
 #include <linux/quotaops.h>
 
-#define MLOG_MASK_PREFIX ML_FILE_IO
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
@@ -45,6 +44,7 @@
 #include "super.h"
 #include "symlink.h"
 #include "refcounttree.h"
+#include "ocfs2_trace.h"
 
 #include "buffer_head_io.h"
 
@@ -59,8 +59,9 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	void *kaddr;
 
-	mlog(0, "(0x%p, %llu, 0x%p, %d)\n", inode,
-	     (unsigned long long)iblock, bh_result, create);
+	trace_ocfs2_symlink_get_block(
+			(unsigned long long)OCFS2_I(inode)->ip_blkno,
+			(unsigned long long)iblock, bh_result, create);
 
 	BUG_ON(ocfs2_inode_is_fast_symlink(inode));
 
@@ -135,8 +136,8 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock,
 	u64 p_blkno, count, past_eof;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
-	mlog(0, "(0x%p, %llu, 0x%p, %d)\n", inode,
-	     (unsigned long long)iblock, bh_result, create);
+	trace_ocfs2_get_block((unsigned long long)OCFS2_I(inode)->ip_blkno,
+			      (unsigned long long)iblock, bh_result, create);
 
 	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE)
 		mlog(ML_NOTICE, "get_block on system inode 0x%p (%lu)\n",
@@ -198,8 +199,9 @@ int ocfs2_get_block(struct inode *inode, sector_t iblock,
 	}
 
 	past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
-	mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
-	     (unsigned long long)past_eof);
+
+	trace_ocfs2_get_block_end((unsigned long long)OCFS2_I(inode)->ip_blkno,
+				  (unsigned long long)past_eof);
 	if (create && (iblock >= past_eof))
 		set_buffer_new(bh_result);
 
@@ -276,7 +278,8 @@ static int ocfs2_readpage(struct file *file, struct page *page)
 	loff_t start = (loff_t)page->index << PAGE_CACHE_SHIFT;
 	int ret, unlock = 1;
 
-	mlog(0, "(0x%p, %lu)\n", file, (page ? page->index : 0));
+	trace_ocfs2_readpage((unsigned long long)oi->ip_blkno,
+			     (page ? page->index : 0));
 
 	ret = ocfs2_inode_lock_with_page(inode, NULL, 0, page);
 	if (ret != 0) {
@@ -393,13 +396,11 @@ out_unlock:
  */
 static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
 {
-	int ret;
-
-	mlog(0, "(0x%p)\n", page);
-
-	ret = block_write_full_page(page, ocfs2_get_block, wbc);
+	trace_ocfs2_writepage(
+		(unsigned long long)OCFS2_I(page->mapping->host)->ip_blkno,
+		page->index);
 
-	return ret;
+	return block_write_full_page(page, ocfs2_get_block, wbc);
 }
 
 /* Taken from ext3. We don't necessarily need the full blown
@@ -445,7 +446,8 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
 	int err = 0;
 	struct inode *inode = mapping->host;
 
-	mlog(0, "(block = %llu)\n", (unsigned long long)block);
+	trace_ocfs2_bmap((unsigned long long)OCFS2_I(inode)->ip_blkno,
+			 (unsigned long long)block);
 
 	/* We don't need to lock journal system files, since they aren't
 	 * accessed concurrently from multiple nodes.
@@ -1521,9 +1523,9 @@ static int ocfs2_try_to_write_inline_data(struct address_space *mapping,
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	struct ocfs2_dinode *di = NULL;
 
-	mlog(0, "Inode %llu, write of %u bytes at off %llu. features: 0x%x\n",
-	     (unsigned long long)oi->ip_blkno, len, (unsigned long long)pos,
-	     oi->ip_dyn_features);
+	trace_ocfs2_try_to_write_inline_data((unsigned long long)oi->ip_blkno,
+					     len, (unsigned long long)pos,
+					     oi->ip_dyn_features);
 
 	/*
 	 * Handle inodes which already have inline data 1st.
@@ -1726,6 +1728,13 @@ try_again:
 
 	di = (struct ocfs2_dinode *)wc->w_di_bh->b_data;
 
+	trace_ocfs2_write_begin_nolock(
+			(unsigned long long)OCFS2_I(inode)->ip_blkno,
+			(long long)i_size_read(inode),
+			le32_to_cpu(di->i_clusters),
+			pos, len, flags, mmap_page,
+			clusters_to_alloc, extents_to_split);
+
 	/*
 	 * We set w_target_from, w_target_to here so that
 	 * ocfs2_write_end() knows which range in the target page to
@@ -1738,12 +1747,6 @@ try_again:
 		 * ocfs2_lock_allocators(). It greatly over-estimates
 		 * the work to be done.
 		 */
-		mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u,"
-		     " clusters_to_add = %u, extents_to_split = %u\n",
-		     (unsigned long long)OCFS2_I(inode)->ip_blkno,
-		     (long long)i_size_read(inode), le32_to_cpu(di->i_clusters),
-		     clusters_to_alloc, extents_to_split);
-
 		ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode),
 					      wc->w_di_bh);
 		ret = ocfs2_lock_allocators(inode, &et,
@@ -1925,8 +1928,8 @@ static void ocfs2_write_end_inline(struct inode *inode, loff_t pos,
 	memcpy(di->id2.i_data.id_data + pos, kaddr + pos, *copied);
 	kunmap_atomic(kaddr, KM_USER0);
 
-	mlog(0, "Data written to inode at offset %llu. "
-	     "id_count = %u, copied = %u, i_dyn_features = 0x%x\n",
+	trace_ocfs2_write_end_inline(
+	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
 	     (unsigned long long)pos, *copied,
 	     le16_to_cpu(di->id2.i_data.id_count),
 	     le16_to_cpu(di->i_dyn_features));
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index a8b57ca7bdb5..5958da57f8ff 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -968,6 +968,130 @@ TRACE_EVENT(ocfs2_refcount_cow_hunk,
 );
 
 /* End of trace events for fs/ocfs2/refcounttree.c. */
+
+/* Trace events for fs/ocfs2/aops.c. */
+
+DECLARE_EVENT_CLASS(ocfs2__get_block,
+	TP_PROTO(unsigned long long ino, unsigned long long iblock,
+		 void *bh_result, int create),
+	TP_ARGS(ino, iblock, bh_result, create),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ino)
+		__field(unsigned long long, iblock)
+		__field(void *, bh_result)
+		__field(int, create)
+	),
+	TP_fast_assign(
+		__entry->ino = ino;
+		__entry->iblock = iblock;
+		__entry->bh_result = bh_result;
+		__entry->create = create;
+	),
+	TP_printk("%llu %llu %p %d",
+		  __entry->ino, __entry->iblock,
+		  __entry->bh_result, __entry->create)
+);
+
+#define DEFINE_OCFS2_GET_BLOCK_EVENT(name)	\
+DEFINE_EVENT(ocfs2__get_block, name,	\
+	TP_PROTO(unsigned long long ino, unsigned long long iblock,	\
+		 void *bh_result, int create),	\
+	TP_ARGS(ino, iblock, bh_result, create))
+
+DEFINE_OCFS2_GET_BLOCK_EVENT(ocfs2_symlink_get_block);
+
+DEFINE_OCFS2_GET_BLOCK_EVENT(ocfs2_get_block);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_get_block_end);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_readpage);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_writepage);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_bmap);
+
+TRACE_EVENT(ocfs2_try_to_write_inline_data,
+	TP_PROTO(unsigned long long ino, unsigned int len,
+		 unsigned long long pos, unsigned int flags),
+	TP_ARGS(ino, len, pos, flags),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ino)
+		__field(unsigned int, len)
+		__field(unsigned long long, pos)
+		__field(unsigned int, flags)
+	),
+	TP_fast_assign(
+		__entry->ino = ino;
+		__entry->len = len;
+		__entry->pos = pos;
+		__entry->flags = flags;
+	),
+	TP_printk("%llu %u %llu 0x%x",
+		  __entry->ino, __entry->len, __entry->pos, __entry->flags)
+);
+
+TRACE_EVENT(ocfs2_write_begin_nolock,
+	TP_PROTO(unsigned long long ino,
+		 long long i_size, unsigned int i_clusters,
+		 unsigned long long pos, unsigned int len,
+		 unsigned int flags, void *page,
+		 unsigned int clusters, unsigned int extents_to_split),
+	TP_ARGS(ino, i_size, i_clusters, pos, len, flags,
+		page, clusters, extents_to_split),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ino)
+		__field(long long, i_size)
+		__field(unsigned int, i_clusters)
+		__field(unsigned long long, pos)
+		__field(unsigned int, len)
+		__field(unsigned int, flags)
+		__field(void *, page)
+		__field(unsigned int, clusters)
+		__field(unsigned int, extents_to_split)
+	),
+	TP_fast_assign(
+		__entry->ino = ino;
+		__entry->i_size = i_size;
+		__entry->i_clusters = i_clusters;
+		__entry->pos = pos;
+		__entry->len = len;
+		__entry->flags = flags;
+		__entry->page = page;
+		__entry->clusters = clusters;
+		__entry->extents_to_split = extents_to_split;
+	),
+	TP_printk("%llu %lld %u %llu %u %u %p %u %u",
+		  __entry->ino, __entry->i_size, __entry->i_clusters,
+		  __entry->pos, __entry->len,
+		  __entry->flags, __entry->page, __entry->clusters,
+		  __entry->extents_to_split)
+);
+
+TRACE_EVENT(ocfs2_write_end_inline,
+	TP_PROTO(unsigned long long ino,
+		 unsigned long long pos, unsigned int copied,
+		 unsigned int id_count, unsigned int features),
+	TP_ARGS(ino, pos, copied, id_count, features),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ino)
+		__field(unsigned long long, pos)
+		__field(unsigned int, copied)
+		__field(unsigned int, id_count)
+		__field(unsigned int, features)
+	),
+	TP_fast_assign(
+		__entry->ino = ino;
+		__entry->pos = pos;
+		__entry->copied = copied;
+		__entry->id_count = id_count;
+		__entry->features = features;
+	),
+	TP_printk("%llu %llu %u %u %u",
+		  __entry->ino, __entry->pos, __entry->copied,
+		  __entry->id_count, __entry->features)
+);
+
+/* End of trace events for fs/ocfs2/aops.c. */
 #endif /* _TRACE_OCFS2_H */
 
 /* This part must be outside protection */
-- 
cgit v1.2.3


From 614a9e849ca6ea24843795251cb30af525d5336b Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Tue, 22 Feb 2011 21:59:46 +0800
Subject: ocfs2: Remove FILE_IO from masklog.

Change all the "mlog(0," in fs/ocfs2/mmap.c to trace events.
And finally remove masklog FILE_IO.

Signed-off-by: Tao Ma <boyu.mt@taobao.com>
---
 fs/ocfs2/cluster/masklog.c |  1 -
 fs/ocfs2/cluster/masklog.h |  1 -
 fs/ocfs2/mmap.c            |  7 +++----
 fs/ocfs2/ocfs2_trace.h     | 25 +++++++++++++++++++++++++
 4 files changed, 28 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index a961cca12529..036e8b8814f7 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -94,7 +94,6 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
 	define_mask(AIO),
 	define_mask(JOURNAL),
 	define_mask(SUPER),
-	define_mask(FILE_IO),
 	define_mask(EXTENT_MAP),
 	define_mask(DLM_GLUE),
 	define_mask(BH_IO),
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 845ea2b02f0b..d07780ab9f8f 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -96,7 +96,6 @@
 #define ML_AIO		0x0000000000002000ULL /* ocfs2 aio read and write */
 #define ML_JOURNAL	0x0000000000004000ULL /* ocfs2 journalling functions */
 #define ML_SUPER	0x0000000000010000ULL /* ocfs2 mount / umount */
-#define ML_FILE_IO	0x0000000000020000ULL /* ocfs2 file I/O */
 #define ML_EXTENT_MAP	0x0000000000040000ULL /* ocfs2 extent map caching */
 #define ML_DLM_GLUE	0x0000000000080000ULL /* ocfs2 dlm glue layer */
 #define ML_BH_IO	0x0000000000100000ULL /* ocfs2 buffer I/O */
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 49730b51ae45..3e9393ca39eb 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -31,7 +31,6 @@
 #include <linux/signal.h>
 #include <linux/rbtree.h>
 
-#define MLOG_MASK_PREFIX ML_FILE_IO
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
@@ -42,6 +41,7 @@
 #include "inode.h"
 #include "mmap.h"
 #include "super.h"
+#include "ocfs2_trace.h"
 
 
 static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
@@ -49,13 +49,12 @@ static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
 	sigset_t oldset;
 	int ret;
 
-	mlog(0, "(area=%p, page offset=%lu)\n", area, vmf->pgoff);
-
 	ocfs2_block_signals(&oldset);
 	ret = filemap_fault(area, vmf);
 	ocfs2_unblock_signals(&oldset);
 
-	mlog(0, "%p\n", vmf->page);
+	trace_ocfs2_fault(OCFS2_I(area->vm_file->f_mapping->host)->ip_blkno,
+			  area, vmf->page, vmf->pgoff);
 	return ret;
 }
 
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 5958da57f8ff..77148e7a14f8 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -1092,6 +1092,31 @@ TRACE_EVENT(ocfs2_write_end_inline,
 );
 
 /* End of trace events for fs/ocfs2/aops.c. */
+
+/* Trace events for fs/ocfs2/mmap.c. */
+
+TRACE_EVENT(ocfs2_fault,
+	TP_PROTO(unsigned long long ino,
+		 void *area, void *page, unsigned long pgoff),
+	TP_ARGS(ino, area, page, pgoff),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ino)
+		__field(void *, area)
+		__field(void *, page)
+		__field(unsigned long, pgoff)
+	),
+	TP_fast_assign(
+		__entry->ino = ino;
+		__entry->area = area;
+		__entry->page = page;
+		__entry->pgoff = pgoff;
+	),
+	TP_printk("%llu %p %p %lu",
+		  __entry->ino, __entry->area, __entry->page, __entry->pgoff)
+);
+
+/* End of trace events for fs/ocfs2/mmap.c. */
+
 #endif /* _TRACE_OCFS2_H */
 
 /* This part must be outside protection */
-- 
cgit v1.2.3


From da561c9cfa3022f6134ef52686d52931d3d1df3d Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Mon, 21 Feb 2011 11:18:23 +0800
Subject: ocfs2: remove INODE from unused files.

As there are no such debug information in fs/ocfs2/ioctl.c,
fs/ocfs2/locks.c and fs/ocfs2/sysfile.c, ML_INODE are also
removed.

Signed-off-by: Tao Ma <boyu.mt@taobao.com>
---
 fs/ocfs2/acl.c   | 1 -
 fs/ocfs2/ioctl.c | 1 -
 fs/ocfs2/locks.c | 1 -
 3 files changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 704f6b1742f3..69786e3eb43c 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -24,7 +24,6 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 
-#define MLOG_MASK_PREFIX ML_INODE
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index b4f51e94c8f5..d9bfa904d006 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -9,7 +9,6 @@
 #include <linux/mount.h>
 #include <linux/compat.h>
 
-#define MLOG_MASK_PREFIX ML_INODE
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c
index b5cb3ede9408..e57c804069ea 100644
--- a/fs/ocfs2/locks.c
+++ b/fs/ocfs2/locks.c
@@ -26,7 +26,6 @@
 #include <linux/fs.h>
 #include <linux/fcntl.h>
 
-#define MLOG_MASK_PREFIX ML_INODE
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
-- 
cgit v1.2.3


From 468eedde23d6c9335935773f4f5764267d5a7763 Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Tue, 22 Feb 2011 22:14:41 +0800
Subject: ocfs2: Remove mlog(0) from fs/ocfs2/file.c

This is the 2nd step to remove the debug info of INODE.

Signed-off-by: Tao Ma <boyu.mt@taobao.com>
---
 fs/ocfs2/file.c        | 190 +++++++++++++++++++--------------------
 fs/ocfs2/ocfs2_trace.h | 237 +++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/sysfile.c     |   1 -
 3 files changed, 330 insertions(+), 98 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 1e1a93aa0526..41565ae52856 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -38,7 +38,6 @@
 #include <linux/quotaops.h>
 #include <linux/blkdev.h>
 
-#define MLOG_MASK_PREFIX ML_INODE
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
@@ -61,6 +60,7 @@
 #include "acl.h"
 #include "quota.h"
 #include "refcounttree.h"
+#include "ocfs2_trace.h"
 
 #include "buffer_head_io.h"
 
@@ -99,8 +99,10 @@ static int ocfs2_file_open(struct inode *inode, struct file *file)
 	int mode = file->f_flags;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 
-	mlog(0, "(0x%p, 0x%p, '%.*s')\n", inode, file,
-	     file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name);
+	trace_ocfs2_file_open(inode, file, file->f_path.dentry,
+			      (unsigned long long)OCFS2_I(inode)->ip_blkno,
+			      file->f_path.dentry->d_name.len,
+			      file->f_path.dentry->d_name.name, mode);
 
 	if (file->f_mode & FMODE_WRITE)
 		dquot_initialize(inode);
@@ -142,13 +144,15 @@ static int ocfs2_file_release(struct inode *inode, struct file *file)
 {
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 
-	mlog(0, "(0x%p, 0x%p, '%.*s')\n", inode, file,
-	     file->f_path.dentry->d_name.len,
-	     file->f_path.dentry->d_name.name);
-
 	spin_lock(&oi->ip_lock);
 	if (!--oi->ip_open_count)
 		oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT;
+
+	trace_ocfs2_file_release(inode, file, file->f_path.dentry,
+				 oi->ip_blkno,
+				 file->f_path.dentry->d_name.len,
+				 file->f_path.dentry->d_name.name,
+				 oi->ip_open_count);
 	spin_unlock(&oi->ip_lock);
 
 	ocfs2_free_file_private(inode, file);
@@ -174,9 +178,11 @@ static int ocfs2_sync_file(struct file *file, int datasync)
 	struct inode *inode = file->f_mapping->host;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
-	mlog(0, "(0x%p, %d, 0x%p, '%.*s')\n", file, datasync,
-	     file->f_path.dentry, file->f_path.dentry->d_name.len,
-	     file->f_path.dentry->d_name.name);
+	trace_ocfs2_sync_file(inode, file, file->f_path.dentry,
+			      OCFS2_I(inode)->ip_blkno,
+			      file->f_path.dentry->d_name.len,
+			      file->f_path.dentry->d_name.name,
+			      (unsigned long long)datasync);
 
 	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
 		/*
@@ -431,14 +437,14 @@ static int ocfs2_truncate_file(struct inode *inode,
 	struct ocfs2_dinode *fe = NULL;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
-	mlog(0, "(inode = %llu, new_i_size = %llu\n",
-	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
-	     (unsigned long long)new_i_size);
-
 	/* We trust di_bh because it comes from ocfs2_inode_lock(), which
 	 * already validated it */
 	fe = (struct ocfs2_dinode *) di_bh->b_data;
 
+	trace_ocfs2_truncate_file((unsigned long long)OCFS2_I(inode)->ip_blkno,
+				  (unsigned long long)le64_to_cpu(fe->i_size),
+				  (unsigned long long)new_i_size);
+
 	mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
 			"Inode %llu, inode i_size = %lld != di "
 			"i_size = %llu, i_flags = 0x%x\n",
@@ -448,19 +454,14 @@ static int ocfs2_truncate_file(struct inode *inode,
 			le32_to_cpu(fe->i_flags));
 
 	if (new_i_size > le64_to_cpu(fe->i_size)) {
-		mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n",
-		     (unsigned long long)le64_to_cpu(fe->i_size),
-		     (unsigned long long)new_i_size);
+		trace_ocfs2_truncate_file_error(
+			(unsigned long long)le64_to_cpu(fe->i_size),
+			(unsigned long long)new_i_size);
 		status = -EINVAL;
 		mlog_errno(status);
 		goto bail;
 	}
 
-	mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n",
-	     (unsigned long long)le64_to_cpu(fe->i_blkno),
-	     (unsigned long long)le64_to_cpu(fe->i_size),
-	     (unsigned long long)new_i_size);
-
 	/* lets handle the simple truncate cases before doing any more
 	 * cluster locking. */
 	if (new_i_size == le64_to_cpu(fe->i_size))
@@ -566,8 +567,6 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
 	struct ocfs2_extent_tree et;
 	int did_quota = 0;
 
-	mlog(0, "(clusters_to_add = %u)\n", clusters_to_add);
-
 	/*
 	 * This function only exists for file systems which don't
 	 * support holes.
@@ -584,11 +583,6 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
 restart_all:
 	BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
 
-	mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
-	     "clusters_to_add = %u\n",
-	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
-	     (long long)i_size_read(inode), le32_to_cpu(fe->i_clusters),
-	     clusters_to_add);
 	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), bh);
 	status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
 				       &data_ac, &meta_ac);
@@ -608,6 +602,12 @@ restart_all:
 	}
 
 restarted_transaction:
+	trace_ocfs2_extend_allocation(
+		(unsigned long long)OCFS2_I(inode)->ip_blkno,
+		(unsigned long long)i_size_read(inode),
+		le32_to_cpu(fe->i_clusters), clusters_to_add,
+		why, restart_func);
+
 	status = dquot_alloc_space_nodirty(inode,
 			ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
 	if (status)
@@ -654,13 +654,11 @@ restarted_transaction:
 
 	if (why != RESTART_NONE && clusters_to_add) {
 		if (why == RESTART_META) {
-			mlog(0, "restarting function.\n");
 			restart_func = 1;
 			status = 0;
 		} else {
 			BUG_ON(why != RESTART_TRANS);
 
-			mlog(0, "restarting transaction.\n");
 			/* TODO: This can be more intelligent. */
 			credits = ocfs2_calc_extend_credits(osb->sb,
 							    &fe->id2.i_list,
@@ -677,11 +675,11 @@ restarted_transaction:
 		}
 	}
 
-	mlog(0, "fe: i_clusters = %u, i_size=%llu\n",
+	trace_ocfs2_extend_allocation_end(OCFS2_I(inode)->ip_blkno,
 	     le32_to_cpu(fe->i_clusters),
-	     (unsigned long long)le64_to_cpu(fe->i_size));
-	mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
-	     OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode));
+	     (unsigned long long)le64_to_cpu(fe->i_size),
+	     OCFS2_I(inode)->ip_clusters,
+	     (unsigned long long)i_size_read(inode));
 
 leave:
 	if (status < 0 && did_quota)
@@ -772,10 +770,11 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
 	if (!zero_to)
 		zero_to = PAGE_CACHE_SIZE;
 
-	mlog(0,
-	     "abs_from = %llu, abs_to = %llu, index = %lu, zero_from = %u, zero_to = %u\n",
-	     (unsigned long long)abs_from, (unsigned long long)abs_to,
-	     index, zero_from, zero_to);
+	trace_ocfs2_write_zero_page(
+			(unsigned long long)OCFS2_I(inode)->ip_blkno,
+			(unsigned long long)abs_from,
+			(unsigned long long)abs_to,
+			index, zero_from, zero_to);
 
 	/* We know that zero_from is block aligned */
 	for (block_start = zero_from; block_start < zero_to;
@@ -915,9 +914,10 @@ static int ocfs2_zero_extend_range(struct inode *inode, u64 range_start,
 	u64 next_pos;
 	u64 zero_pos = range_start;
 
-	mlog(0, "range_start = %llu, range_end = %llu\n",
-	     (unsigned long long)range_start,
-	     (unsigned long long)range_end);
+	trace_ocfs2_zero_extend_range(
+			(unsigned long long)OCFS2_I(inode)->ip_blkno,
+			(unsigned long long)range_start,
+			(unsigned long long)range_end);
 	BUG_ON(range_start >= range_end);
 
 	while (zero_pos < range_end) {
@@ -949,9 +949,9 @@ int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
 	struct super_block *sb = inode->i_sb;
 
 	zero_start = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode));
-	mlog(0, "zero_start %llu for i_size %llu\n",
-	     (unsigned long long)zero_start,
-	     (unsigned long long)i_size_read(inode));
+	trace_ocfs2_zero_extend((unsigned long long)OCFS2_I(inode)->ip_blkno,
+				(unsigned long long)zero_start,
+				(unsigned long long)i_size_read(inode));
 	while (zero_start < zero_to_size) {
 		ret = ocfs2_zero_extend_get_range(inode, di_bh, zero_start,
 						  zero_to_size,
@@ -1100,30 +1100,20 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 	struct dquot *transfer_to[MAXQUOTAS] = { };
 	int qtype;
 
-	mlog(0, "(0x%p, '%.*s')\n", dentry,
-	     dentry->d_name.len, dentry->d_name.name);
+	trace_ocfs2_setattr(inode, dentry,
+			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
+			    dentry->d_name.len, dentry->d_name.name,
+			    attr->ia_valid, attr->ia_mode,
+			    attr->ia_uid, attr->ia_gid);
 
 	/* ensuring we don't even attempt to truncate a symlink */
 	if (S_ISLNK(inode->i_mode))
 		attr->ia_valid &= ~ATTR_SIZE;
 
-	if (attr->ia_valid & ATTR_MODE)
-		mlog(0, "mode change: %d\n", attr->ia_mode);
-	if (attr->ia_valid & ATTR_UID)
-		mlog(0, "uid change: %d\n", attr->ia_uid);
-	if (attr->ia_valid & ATTR_GID)
-		mlog(0, "gid change: %d\n", attr->ia_gid);
-	if (attr->ia_valid & ATTR_SIZE)
-		mlog(0, "size change...\n");
-	if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME))
-		mlog(0, "time change...\n");
-
 #define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \
 			   | ATTR_GID | ATTR_UID | ATTR_MODE)
-	if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) {
-		mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid);
+	if (!(attr->ia_valid & OCFS2_VALID_ATTRS))
 		return 0;
-	}
 
 	status = inode_change_ok(inode, attr);
 	if (status)
@@ -1318,8 +1308,9 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_dinode *di;
 
-	mlog(0, "(Inode %llu, mode 0%o)\n",
-	     (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_mode);
+	trace_ocfs2_write_remove_suid(
+			(unsigned long long)OCFS2_I(inode)->ip_blkno,
+			inode->i_mode);
 
 	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
 	if (IS_ERR(handle)) {
@@ -1525,8 +1516,9 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
 	 * partial clusters here. There's no need to worry about
 	 * physical allocation - the zeroing code knows to skip holes.
 	 */
-	mlog(0, "byte start: %llu, end: %llu\n",
-	     (unsigned long long)start, (unsigned long long)end);
+	trace_ocfs2_zero_partial_clusters(
+		(unsigned long long)OCFS2_I(inode)->ip_blkno,
+		(unsigned long long)start, (unsigned long long)end);
 
 	/*
 	 * If both edges are on a cluster boundary then there's no
@@ -1550,8 +1542,8 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
 	if (tmpend > end)
 		tmpend = end;
 
-	mlog(0, "1st range: start: %llu, tmpend: %llu\n",
-	     (unsigned long long)start, (unsigned long long)tmpend);
+	trace_ocfs2_zero_partial_clusters_range1((unsigned long long)start,
+						 (unsigned long long)tmpend);
 
 	ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend);
 	if (ret)
@@ -1565,8 +1557,8 @@ static int ocfs2_zero_partial_clusters(struct inode *inode,
 		 */
 		start = end & ~(osb->s_clustersize - 1);
 
-		mlog(0, "2nd range: start: %llu, end: %llu\n",
-		     (unsigned long long)start, (unsigned long long)end);
+		trace_ocfs2_zero_partial_clusters_range2(
+			(unsigned long long)start, (unsigned long long)end);
 
 		ret = ocfs2_zero_range_for_truncate(inode, handle, start, end);
 		if (ret)
@@ -1666,6 +1658,11 @@ static int ocfs2_remove_inode_range(struct inode *inode,
 	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
 	ocfs2_init_dealloc_ctxt(&dealloc);
 
+	trace_ocfs2_remove_inode_range(
+			(unsigned long long)OCFS2_I(inode)->ip_blkno,
+			(unsigned long long)byte_start,
+			(unsigned long long)byte_len);
+
 	if (byte_len == 0)
 		return 0;
 
@@ -1712,11 +1709,6 @@ static int ocfs2_remove_inode_range(struct inode *inode,
 	trunc_end = (byte_start + byte_len) >> osb->s_clustersize_bits;
 	cluster_in_el = trunc_end;
 
-	mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, cend: %u\n",
-	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
-	     (unsigned long long)byte_start,
-	     (unsigned long long)byte_len, trunc_start, trunc_end);
-
 	ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
 	if (ret) {
 		mlog_errno(ret);
@@ -2071,7 +2063,7 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
 	int ret = 0, meta_level = 0;
 	struct dentry *dentry = file->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
-	loff_t saved_pos, end;
+	loff_t saved_pos = 0, end;
 
 	/*
 	 * We start with a read level meta lock and only jump to an ex
@@ -2110,12 +2102,10 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
 
 		/* work on a copy of ppos until we're sure that we won't have
 		 * to recalculate it due to relocking. */
-		if (appending) {
+		if (appending)
 			saved_pos = i_size_read(inode);
-			mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos);
-		} else {
+		else
 			saved_pos = *ppos;
-		}
 
 		end = saved_pos + count;
 
@@ -2186,6 +2176,10 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
 		*ppos = saved_pos;
 
 out_unlock:
+	trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
+					    saved_pos, appending, count,
+					    direct_io, has_refcount);
+
 	if (meta_level >= 0)
 		ocfs2_inode_unlock(inode, meta_level);
 
@@ -2211,10 +2205,11 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
 	int full_coherency = !(osb->s_mount_opt &
 			       OCFS2_MOUNT_COHERENCY_BUFFERED);
 
-	mlog(0, "(0x%p, %u, '%.*s')\n", file,
-	     (unsigned int)nr_segs,
-	     file->f_path.dentry->d_name.len,
-	     file->f_path.dentry->d_name.name);
+	trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,
+		(unsigned long long)OCFS2_I(inode)->ip_blkno,
+		file->f_path.dentry->d_name.len,
+		file->f_path.dentry->d_name.name,
+		(unsigned int)nr_segs);
 
 	if (iocb->ki_left == 0)
 		return 0;
@@ -2415,10 +2410,11 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
 		.u.file = out,
 	};
 
-	mlog(0, "(0x%p, 0x%p, %u, '%.*s')\n", out, pipe,
-	     (unsigned int)len,
-	     out->f_path.dentry->d_name.len,
-	     out->f_path.dentry->d_name.name);
+
+	trace_ocfs2_file_splice_write(inode, out, out->f_path.dentry,
+			(unsigned long long)OCFS2_I(inode)->ip_blkno,
+			out->f_path.dentry->d_name.len,
+			out->f_path.dentry->d_name.name, len);
 
 	if (pipe->inode)
 		mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_PARENT);
@@ -2474,10 +2470,10 @@ static ssize_t ocfs2_file_splice_read(struct file *in,
 	int ret = 0, lock_level = 0;
 	struct inode *inode = in->f_path.dentry->d_inode;
 
-	mlog(0, "(0x%p, 0x%p, %u, '%.*s')\n", in, pipe,
-	     (unsigned int)len,
-	     in->f_path.dentry->d_name.len,
-	     in->f_path.dentry->d_name.name);
+	trace_ocfs2_file_splice_read(inode, in, in->f_path.dentry,
+			(unsigned long long)OCFS2_I(inode)->ip_blkno,
+			in->f_path.dentry->d_name.len,
+			in->f_path.dentry->d_name.name, len);
 
 	/*
 	 * See the comment in ocfs2_file_aio_read()
@@ -2504,10 +2500,11 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
 	struct file *filp = iocb->ki_filp;
 	struct inode *inode = filp->f_path.dentry->d_inode;
 
-	mlog(0, "(0x%p, %u, '%.*s')\n", filp,
-	     (unsigned int)nr_segs,
-	     filp->f_path.dentry->d_name.len,
-	     filp->f_path.dentry->d_name.name);
+	trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry,
+			(unsigned long long)OCFS2_I(inode)->ip_blkno,
+			filp->f_path.dentry->d_name.len,
+			filp->f_path.dentry->d_name.name, nr_segs);
+
 
 	if (!inode) {
 		ret = -EINVAL;
@@ -2553,8 +2550,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
 	ocfs2_inode_unlock(inode, lock_level);
 
 	ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos);
-	if (ret == -EINVAL)
-		mlog(0, "generic_file_aio_read returned -EINVAL\n");
+	trace_generic_file_aio_read_ret(ret);
 
 	/* buffered aio wouldn't have proper lock coverage today */
 	BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 77148e7a14f8..16013b5588c6 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -204,6 +204,30 @@ DEFINE_EVENT(ocfs2__uint_uint_uint, name,	\
 		 unsigned int value3),	\
 	TP_ARGS(value1, value2, value3))
 
+DECLARE_EVENT_CLASS(ocfs2__ull_ull_ull,
+	TP_PROTO(unsigned long long value1,
+		 unsigned long long value2, unsigned long long value3),
+	TP_ARGS(value1, value2, value3),
+	TP_STRUCT__entry(
+		__field(unsigned long long, value1)
+		__field(unsigned long long, value2)
+		__field(unsigned long long, value3)
+	),
+	TP_fast_assign(
+		__entry->value1 = value1;
+		__entry->value2 = value2;
+		__entry->value3 = value3;
+	),
+	TP_printk("%llu %llu %llu",
+		  __entry->value1, __entry->value2, __entry->value3)
+);
+
+#define DEFINE_OCFS2_ULL_ULL_ULL_EVENT(name)	\
+DEFINE_EVENT(ocfs2__ull_ull_ull, name,	\
+	TP_PROTO(unsigned long long value1, unsigned long long value2,	\
+		 unsigned long long value3),	\
+	TP_ARGS(value1, value2, value3))
+
 DECLARE_EVENT_CLASS(ocfs2__ull_int_int_int,
 	TP_PROTO(unsigned long long ull, int value1, int value2, int value3),
 	TP_ARGS(ull, value1, value2, value3),
@@ -1117,6 +1141,219 @@ TRACE_EVENT(ocfs2_fault,
 
 /* End of trace events for fs/ocfs2/mmap.c. */
 
+/* Trace events for fs/ocfs2/file.c. */
+
+DECLARE_EVENT_CLASS(ocfs2__file_ops,
+	TP_PROTO(void *inode, void *file, void *dentry,
+		 unsigned long long ino,
+		 unsigned int d_len, const unsigned char *d_name,
+		 unsigned long long para),
+	TP_ARGS(inode, file, dentry, ino, d_len, d_name, para),
+	TP_STRUCT__entry(
+		__field(void *, inode)
+		__field(void *, file)
+		__field(void *, dentry)
+		__field(unsigned long long, ino)
+		__field(unsigned int, d_len)
+		__string(d_name, d_name)
+		__field(unsigned long long, para)
+	),
+	TP_fast_assign(
+		__entry->inode = inode;
+		__entry->file = file;
+		__entry->dentry = dentry;
+		__entry->ino = ino;
+		__entry->d_len = d_len;
+		__assign_str(d_name, d_name);
+		__entry->para = para;
+	),
+	TP_printk("%p %p %p %llu %llu %.*s", __entry->inode, __entry->file,
+		  __entry->dentry, __entry->ino, __entry->para,
+		  __entry->d_len, __get_str(d_name))
+);
+
+#define DEFINE_OCFS2_FILE_OPS(name)				\
+DEFINE_EVENT(ocfs2__file_ops, name,				\
+TP_PROTO(void *inode, void *file, void *dentry,			\
+	 unsigned long long ino,				\
+	 unsigned int d_len, const unsigned char *d_name,	\
+	 unsigned long long mode),				\
+	TP_ARGS(inode, file, dentry, ino, d_len, d_name, mode))
+
+DEFINE_OCFS2_FILE_OPS(ocfs2_file_open);
+
+DEFINE_OCFS2_FILE_OPS(ocfs2_file_release);
+
+DEFINE_OCFS2_FILE_OPS(ocfs2_sync_file);
+
+DEFINE_OCFS2_FILE_OPS(ocfs2_file_aio_write);
+
+DEFINE_OCFS2_FILE_OPS(ocfs2_file_splice_write);
+
+DEFINE_OCFS2_FILE_OPS(ocfs2_file_splice_read);
+
+DEFINE_OCFS2_FILE_OPS(ocfs2_file_aio_read);
+
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_truncate_file);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_truncate_file_error);
+
+TRACE_EVENT(ocfs2_extend_allocation,
+	TP_PROTO(unsigned long long ip_blkno, unsigned long long size,
+		 unsigned int clusters, unsigned int clusters_to_add,
+		 int why, int restart_func),
+	TP_ARGS(ip_blkno, size, clusters, clusters_to_add, why, restart_func),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ip_blkno)
+		__field(unsigned long long, size)
+		__field(unsigned int, clusters)
+		__field(unsigned int, clusters_to_add)
+		__field(int, why)
+		__field(int, restart_func)
+	),
+	TP_fast_assign(
+		__entry->ip_blkno = ip_blkno;
+		__entry->size = size;
+		__entry->clusters = clusters;
+		__entry->clusters_to_add = clusters_to_add;
+		__entry->why = why;
+		__entry->restart_func = restart_func;
+	),
+	TP_printk("%llu %llu %u %u %d %d",
+		  __entry->ip_blkno, __entry->size, __entry->clusters,
+		  __entry->clusters_to_add, __entry->why, __entry->restart_func)
+);
+
+TRACE_EVENT(ocfs2_extend_allocation_end,
+	TP_PROTO(unsigned long long ino,
+		 unsigned int di_clusters, unsigned long long di_size,
+		 unsigned int ip_clusters, unsigned long long i_size),
+	TP_ARGS(ino, di_clusters, di_size, ip_clusters, i_size),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ino)
+		__field(unsigned int, di_clusters)
+		__field(unsigned long long, di_size)
+		__field(unsigned int, ip_clusters)
+		__field(unsigned long long, i_size)
+	),
+	TP_fast_assign(
+		__entry->ino = ino;
+		__entry->di_clusters = di_clusters;
+		__entry->di_size = di_size;
+		__entry->ip_clusters = ip_clusters;
+		__entry->i_size = i_size;
+	),
+	TP_printk("%llu %u %llu %u %llu", __entry->ino, __entry->di_clusters,
+		  __entry->di_size, __entry->ip_clusters, __entry->i_size)
+);
+
+TRACE_EVENT(ocfs2_write_zero_page,
+	TP_PROTO(unsigned long long ino,
+		 unsigned long long abs_from, unsigned long long abs_to,
+		 unsigned long index, unsigned int zero_from,
+		 unsigned int zero_to),
+	TP_ARGS(ino, abs_from, abs_to, index, zero_from, zero_to),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ino)
+		__field(unsigned long long, abs_from)
+		__field(unsigned long long, abs_to)
+		__field(unsigned long, index)
+		__field(unsigned int, zero_from)
+		__field(unsigned int, zero_to)
+	),
+	TP_fast_assign(
+		__entry->ino = ino;
+		__entry->abs_from = abs_from;
+		__entry->abs_to = abs_to;
+		__entry->index = index;
+		__entry->zero_from = zero_from;
+		__entry->zero_to = zero_to;
+	),
+	TP_printk("%llu %llu %llu %lu %u %u", __entry->ino,
+		  __entry->abs_from, __entry->abs_to,
+		  __entry->index, __entry->zero_from, __entry->zero_to)
+);
+
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_zero_extend_range);
+
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_zero_extend);
+
+TRACE_EVENT(ocfs2_setattr,
+	TP_PROTO(void *inode, void *dentry,
+		 unsigned long long ino,
+		 unsigned int d_len, const unsigned char *d_name,
+		 unsigned int ia_valid, unsigned int ia_mode,
+		 unsigned int ia_uid, unsigned int ia_gid),
+	TP_ARGS(inode, dentry, ino, d_len, d_name,
+		ia_valid, ia_mode, ia_uid, ia_gid),
+	TP_STRUCT__entry(
+		__field(void *, inode)
+		__field(void *, dentry)
+		__field(unsigned long long, ino)
+		__field(unsigned int, d_len)
+		__string(d_name, d_name)
+		__field(unsigned int, ia_valid)
+		__field(unsigned int, ia_mode)
+		__field(unsigned int, ia_uid)
+		__field(unsigned int, ia_gid)
+	),
+	TP_fast_assign(
+		__entry->inode = inode;
+		__entry->dentry = dentry;
+		__entry->ino = ino;
+		__entry->d_len = d_len;
+		__assign_str(d_name, d_name);
+		__entry->ia_valid = ia_valid;
+		__entry->ia_mode = ia_mode;
+		__entry->ia_uid = ia_uid;
+		__entry->ia_gid = ia_gid;
+	),
+	TP_printk("%p %p %llu %.*s %u %u %u %u", __entry->inode,
+		  __entry->dentry, __entry->ino, __entry->d_len,
+		  __get_str(d_name), __entry->ia_valid, __entry->ia_mode,
+		  __entry->ia_uid, __entry->ia_gid)
+);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_write_remove_suid);
+
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_zero_partial_clusters);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_zero_partial_clusters_range1);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_zero_partial_clusters_range2);
+
+DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_remove_inode_range);
+
+TRACE_EVENT(ocfs2_prepare_inode_for_write,
+	TP_PROTO(unsigned long long ino, unsigned long long saved_pos,
+		 int appending, unsigned long count,
+		 int *direct_io, int *has_refcount),
+	TP_ARGS(ino, saved_pos, appending, count, direct_io, has_refcount),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ino)
+		__field(unsigned long long, saved_pos)
+		__field(int, appending)
+		__field(unsigned long, count)
+		__field(int, direct_io)
+		__field(int, has_refcount)
+	),
+	TP_fast_assign(
+		__entry->ino = ino;
+		__entry->saved_pos = saved_pos;
+		__entry->appending = appending;
+		__entry->count = count;
+		__entry->direct_io = direct_io ? *direct_io : -1;
+		__entry->has_refcount = has_refcount ? *has_refcount : -1;
+	),
+	TP_printk("%llu %llu %d %lu %d %d", __entry->ino,
+		  __entry->saved_pos, __entry->appending, __entry->count,
+		  __entry->direct_io, __entry->has_refcount)
+);
+
+DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret);
+
+/* End of trace events for fs/ocfs2/file.c. */
+
 #endif /* _TRACE_OCFS2_H */
 
 /* This part must be outside protection */
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index 902efb23b6a6..3d635f4bbb20 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -27,7 +27,6 @@
 #include <linux/types.h>
 #include <linux/highmem.h>
 
-#define MLOG_MASK_PREFIX ML_INODE
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
-- 
cgit v1.2.3


From 6218b90e7669d83cb8ad744f3400205d260597f2 Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Mon, 21 Feb 2011 11:18:30 +0800
Subject: ocfs2: Little refactoring against ocfs2_iget.

ocfs2_iget is used to get/create inode. Only iget5_locked
will give us an inode = NULL. So move this check ahead of
ocfs2_read_locked_inode so that we don't need to check
inode before we read and unlock inode. This is also helpful
for trace event(see the next patch).

Signed-off-by: Tao Ma <boyu.mt@taobao.com>
---
 fs/ocfs2/inode.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index a96e56f12728..8ec0cb01cb7c 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -152,16 +152,16 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
 	/* inode was *not* in the inode cache. 2.6.x requires
 	 * us to do our own read_inode call and unlock it
 	 * afterwards. */
-	if (inode && inode->i_state & I_NEW) {
-		mlog(0, "Inode was not in inode cache, reading it.\n");
-		ocfs2_read_locked_inode(inode, &args);
-		unlock_new_inode(inode);
-	}
 	if (inode == NULL) {
 		inode = ERR_PTR(-ENOMEM);
 		mlog_errno(PTR_ERR(inode));
 		goto bail;
 	}
+	if (inode->i_state & I_NEW) {
+		mlog(0, "Inode was not in inode cache, reading it.\n");
+		ocfs2_read_locked_inode(inode, &args);
+		unlock_new_inode(inode);
+	}
 	if (is_bad_inode(inode)) {
 		iput(inode);
 		inode = ERR_PTR(-ESTALE);
-- 
cgit v1.2.3


From 64f3b2692724fdbe72697b77887685b5bd3f4a3c Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Tue, 22 Feb 2011 22:24:57 +0800
Subject: ocfs2: Remove masklog ML_INODE.

Remove mlog(0) from fs/ocfs2/inode.c and the masklog INODE.

Signed-off-by: Tao Ma <boyu.mt@taobao.com>
---
 fs/ocfs2/cluster/masklog.c |   1 -
 fs/ocfs2/cluster/masklog.h |   1 -
 fs/ocfs2/inode.c           | 107 +++++++++++++++--------------------
 fs/ocfs2/ocfs2_trace.h     | 136 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 182 insertions(+), 63 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 036e8b8814f7..e18fb801633e 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -99,7 +99,6 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
 	define_mask(BH_IO),
 	define_mask(UPTODATE),
 	define_mask(NAMEI),
-	define_mask(INODE),
 	define_mask(VOTE),
 	define_mask(DCACHE),
 	define_mask(CONN),
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index d07780ab9f8f..b76d07786107 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -101,7 +101,6 @@
 #define ML_BH_IO	0x0000000000100000ULL /* ocfs2 buffer I/O */
 #define ML_UPTODATE	0x0000000000200000ULL /* ocfs2 caching sequence #'s */
 #define ML_NAMEI	0x0000000000400000ULL /* ocfs2 directory / namespace */
-#define ML_INODE	0x0000000000800000ULL /* ocfs2 inode manipulation */
 #define ML_VOTE		0x0000000001000000ULL /* ocfs2 node messaging  */
 #define ML_DCACHE	0x0000000002000000ULL /* ocfs2 dcache operations */
 #define ML_CONN		0x0000000004000000ULL /* net connection management */
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 8ec0cb01cb7c..177d3a6c2a5f 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -31,7 +31,6 @@
 
 #include <asm/byteorder.h>
 
-#define MLOG_MASK_PREFIX ML_INODE
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
@@ -53,6 +52,7 @@
 #include "uptodate.h"
 #include "xattr.h"
 #include "refcounttree.h"
+#include "ocfs2_trace.h"
 
 #include "buffer_head_io.h"
 
@@ -131,7 +131,8 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
 	struct super_block *sb = osb->sb;
 	struct ocfs2_find_inode_args args;
 
-	mlog(0, "(blkno = %llu)\n", (unsigned long long)blkno);
+	trace_ocfs2_iget_begin((unsigned long long)blkno, flags,
+			       sysfile_type);
 
 	/* Ok. By now we've either got the offsets passed to us by the
 	 * caller, or we just pulled them off the bh. Lets do some
@@ -157,8 +158,8 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
 		mlog_errno(PTR_ERR(inode));
 		goto bail;
 	}
+	trace_ocfs2_iget5_locked(inode->i_state);
 	if (inode->i_state & I_NEW) {
-		mlog(0, "Inode was not in inode cache, reading it.\n");
 		ocfs2_read_locked_inode(inode, &args);
 		unlock_new_inode(inode);
 	}
@@ -170,9 +171,8 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, unsigned flags,
 
 bail:
 	if (!IS_ERR(inode)) {
-		mlog(0, "returning inode with number %llu\n",
-		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
-		mlog(0, "inode %p\n", inode);
+		trace_ocfs2_iget_end(inode, 
+			(unsigned long long)OCFS2_I(inode)->ip_blkno);
 	}
 
 	return inode;
@@ -192,12 +192,12 @@ static int ocfs2_find_actor(struct inode *inode, void *opaque)
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	int ret = 0;
 
-	mlog(0, "(0x%p, %lu, 0x%p)\n", inode, inode->i_ino, opaque);
-
 	args = opaque;
 
 	mlog_bug_on_msg(!inode, "No inode in find actor!\n");
 
+	trace_ocfs2_find_actor(inode, inode->i_ino, opaque, args->fi_blkno);
+
 	if (oi->ip_blkno != args->fi_blkno)
 		goto bail;
 
@@ -217,8 +217,6 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
 	static struct lock_class_key ocfs2_quota_ip_alloc_sem_key,
 				     ocfs2_file_ip_alloc_sem_key;
 
-	mlog(0, "inode = %p, opaque = %p\n", inode, opaque);
-
 	inode->i_ino = args->fi_ino;
 	OCFS2_I(inode)->ip_blkno = args->fi_blkno;
 	if (args->fi_sysfile_type != 0)
@@ -244,9 +242,6 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 	struct ocfs2_super *osb;
 	int use_plocks = 1;
 
-	mlog(0, "(0x%p, size:%llu)\n", inode,
-	     (unsigned long long)le64_to_cpu(fe->i_size));
-
 	sb = inode->i_sb;
 	osb = OCFS2_SB(sb);
 
@@ -298,20 +293,20 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 
 	inode->i_nlink = ocfs2_read_links_count(fe);
 
+	trace_ocfs2_populate_inode(OCFS2_I(inode)->ip_blkno,
+				   le32_to_cpu(fe->i_flags));
 	if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) {
 		OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE;
 		inode->i_flags |= S_NOQUOTA;
 	}
-
+  
 	if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) {
 		OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
-		mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino);
 	} else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) {
 		OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
 	} else if (fe->i_flags & cpu_to_le32(OCFS2_QUOTA_FL)) {
 		inode->i_flags |= S_NOQUOTA;
 	} else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) {
-		mlog(0, "superblock inode: i_ino=%lu\n", inode->i_ino);
 		/* we can't actually hit this as read_inode can't
 		 * handle superblocks today ;-) */
 		BUG();
@@ -391,8 +386,6 @@ static int ocfs2_read_locked_inode(struct inode *inode,
 	int status, can_lock;
 	u32 generation = 0;
 
-	mlog(0, "(0x%p, 0x%p)\n", inode, args);
-
 	status = -EINVAL;
 	if (inode == NULL || inode->i_sb == NULL) {
 		mlog(ML_ERROR, "bad inode\n");
@@ -440,6 +433,9 @@ static int ocfs2_read_locked_inode(struct inode *inode,
 		&& !(args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY)
 		&& !ocfs2_mount_local(osb);
 
+	trace_ocfs2_read_locked_inode(
+		(unsigned long long)OCFS2_I(inode)->ip_blkno, can_lock);
+
 	/*
 	 * To maintain backwards compatibility with older versions of
 	 * ocfs2-tools, we still store the generation value for system
@@ -689,8 +685,6 @@ static int ocfs2_check_orphan_recovery_state(struct ocfs2_super *osb,
 
 	spin_lock(&osb->osb_lock);
 	if (ocfs2_node_map_test_bit(osb, &osb->osb_recovering_orphan_dirs, slot)) {
-		mlog(0, "Recovery is happening on orphan dir %d, will skip "
-		     "this inode\n", slot);
 		ret = -EDEADLK;
 		goto out;
 	}
@@ -699,6 +693,7 @@ static int ocfs2_check_orphan_recovery_state(struct ocfs2_super *osb,
 	osb->osb_orphan_wipes[slot]++;
 out:
 	spin_unlock(&osb->osb_lock);
+	trace_ocfs2_check_orphan_recovery_state(slot, ret);
 	return ret;
 }
 
@@ -809,6 +804,10 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
+	trace_ocfs2_inode_is_valid_to_delete(current, osb->dc_task,
+					     (unsigned long long)oi->ip_blkno,
+					     oi->ip_flags);
+
 	/* We shouldn't be getting here for the root directory
 	 * inode.. */
 	if (inode == osb->root_inode) {
@@ -821,11 +820,8 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
 	 * have to skip deleting this guy. That's OK though because
 	 * the node who's doing the actual deleting should handle it
 	 * anyway. */
-	if (current == osb->dc_task) {
-		mlog(0, "Skipping delete of %lu because we're currently "
-		     "in downconvert\n", inode->i_ino);
+	if (current == osb->dc_task)
 		goto bail;
-	}
 
 	spin_lock(&oi->ip_lock);
 	/* OCFS2 *never* deletes system files. This should technically
@@ -840,11 +836,8 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
 	/* If we have allowd wipe of this inode for another node, it
 	 * will be marked here so we can safely skip it. Recovery will
 	 * cleanup any inodes we might inadvertantly skip here. */
-	if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE) {
-		mlog(0, "Skipping delete of %lu because another node "
-		     "has done this for us.\n", inode->i_ino);
+	if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE)
 		goto bail_unlock;
-	}
 
 	ret = 1;
 bail_unlock:
@@ -861,28 +854,27 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
 				  struct buffer_head *di_bh,
 				  int *wipe)
 {
-	int status = 0;
+	int status = 0, reason = 0;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	struct ocfs2_dinode *di;
 
 	*wipe = 0;
 
+	trace_ocfs2_query_inode_wipe_begin((unsigned long long)oi->ip_blkno,
+					   inode->i_nlink);
+
 	/* While we were waiting for the cluster lock in
 	 * ocfs2_delete_inode, another node might have asked to delete
 	 * the inode. Recheck our flags to catch this. */
 	if (!ocfs2_inode_is_valid_to_delete(inode)) {
-		mlog(0, "Skipping delete of %llu because flags changed\n",
-		     (unsigned long long)oi->ip_blkno);
+		reason = 1;
 		goto bail;
 	}
 
 	/* Now that we have an up to date inode, we can double check
 	 * the link count. */
-	if (inode->i_nlink) {
-		mlog(0, "Skipping delete of %llu because nlink = %u\n",
-		     (unsigned long long)oi->ip_blkno, inode->i_nlink);
+	if (inode->i_nlink)
 		goto bail;
-	}
 
 	/* Do some basic inode verification... */
 	di = (struct ocfs2_dinode *) di_bh->b_data;
@@ -897,9 +889,7 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
 		 * ORPHANED_FL not.
 		 */
 		if (di->i_dyn_features & cpu_to_le16(OCFS2_HAS_REFCOUNT_FL)) {
-			mlog(0, "Reflinked inode %llu is no longer orphaned.  "
-			     "it shouldn't be deleted\n",
-			     (unsigned long long)oi->ip_blkno);
+			reason = 2;
 			goto bail;
 		}
 
@@ -936,8 +926,7 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
 	status = ocfs2_try_open_lock(inode, 1);
 	if (status == -EAGAIN) {
 		status = 0;
-		mlog(0, "Skipping delete of %llu because it is in use on "
-		     "other nodes\n", (unsigned long long)oi->ip_blkno);
+		reason = 3;
 		goto bail;
 	}
 	if (status < 0) {
@@ -946,11 +935,10 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
 	}
 
 	*wipe = 1;
-	mlog(0, "Inode %llu is ok to wipe from orphan dir %u\n",
-	     (unsigned long long)oi->ip_blkno,
-	     le16_to_cpu(di->i_orphaned_slot));
+	trace_ocfs2_query_inode_wipe_succ(le16_to_cpu(di->i_orphaned_slot));
 
 bail:
+	trace_ocfs2_query_inode_wipe_end(status, reason);
 	return status;
 }
 
@@ -960,8 +948,8 @@ bail:
 static void ocfs2_cleanup_delete_inode(struct inode *inode,
 				       int sync_data)
 {
-	mlog(0, "Cleanup inode %llu, sync = %d\n",
-	     (unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data);
+	trace_ocfs2_cleanup_delete_inode(
+		(unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data);
 	if (sync_data)
 		write_inode_now(inode, 1);
 	truncate_inode_pages(&inode->i_data, 0);
@@ -973,15 +961,15 @@ static void ocfs2_delete_inode(struct inode *inode)
 	sigset_t oldset;
 	struct buffer_head *di_bh = NULL;
 
-	mlog(0, "(inode->i_ino = %lu)\n", inode->i_ino);
+	trace_ocfs2_delete_inode(inode->i_ino,
+				 (unsigned long long)OCFS2_I(inode)->ip_blkno,
+				 is_bad_inode(inode));
 
 	/* When we fail in read_inode() we mark inode as bad. The second test
 	 * catches the case when inode allocation fails before allocating
 	 * a block for inode. */
-	if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno) {
-		mlog(0, "Skipping delete of bad inode\n");
+	if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno)
 		goto bail;
-	}
 
 	dquot_initialize(inode);
 
@@ -1082,8 +1070,8 @@ static void ocfs2_clear_inode(struct inode *inode)
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 
 	end_writeback(inode);
-	mlog(0, "Clearing inode: %llu, nlink = %u\n",
-	     (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_nlink);
+	trace_ocfs2_clear_inode((unsigned long long)oi->ip_blkno,
+				inode->i_nlink);
 
 	mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
 			"Inode=%lu\n", inode->i_ino);
@@ -1193,8 +1181,8 @@ int ocfs2_drop_inode(struct inode *inode)
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	int res;
 
-	mlog(0, "Drop inode %llu, nlink = %u, ip_flags = 0x%x\n",
-	     (unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags);
+	trace_ocfs2_drop_inode((unsigned long long)oi->ip_blkno,
+				inode->i_nlink, oi->ip_flags);
 
 	if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)
 		res = 1;
@@ -1212,11 +1200,11 @@ int ocfs2_inode_revalidate(struct dentry *dentry)
 	struct inode *inode = dentry->d_inode;
 	int status = 0;
 
-	mlog(0, "(inode = 0x%p, ino = %llu)\n", inode,
-	     inode ? (unsigned long long)OCFS2_I(inode)->ip_blkno : 0ULL);
+	trace_ocfs2_inode_revalidate(inode,
+		inode ? (unsigned long long)OCFS2_I(inode)->ip_blkno : 0ULL,
+		inode ? (unsigned long long)OCFS2_I(inode)->ip_flags : 0);
 
 	if (!inode) {
-		mlog(0, "eep, no inode!\n");
 		status = -ENOENT;
 		goto bail;
 	}
@@ -1224,7 +1212,6 @@ int ocfs2_inode_revalidate(struct dentry *dentry)
 	spin_lock(&OCFS2_I(inode)->ip_lock);
 	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
 		spin_unlock(&OCFS2_I(inode)->ip_lock);
-		mlog(0, "inode deleted!\n");
 		status = -ENOENT;
 		goto bail;
 	}
@@ -1255,8 +1242,7 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
 	int status;
 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
 
-	mlog(0, "(inode %llu)\n",
-	     (unsigned long long)OCFS2_I(inode)->ip_blkno);
+	trace_ocfs2_mark_inode_dirty((unsigned long long)OCFS2_I(inode)->ip_blkno);
 
 	status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), bh,
 					 OCFS2_JOURNAL_ACCESS_WRITE);
@@ -1328,8 +1314,7 @@ int ocfs2_validate_inode_block(struct super_block *sb,
 	int rc;
 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
 
-	mlog(0, "Validating dinode %llu\n",
-	     (unsigned long long)bh->b_blocknr);
+	trace_ocfs2_validate_inode_block((unsigned long long)bh->b_blocknr);
 
 	BUG_ON(!buffer_uptodate(bh));
 
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 16013b5588c6..052057a01ea8 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -114,6 +114,25 @@ DEFINE_EVENT(ocfs2__ull_uint, name,	\
 	TP_PROTO(unsigned long long val1, unsigned int val2),	\
 	TP_ARGS(val1, val2))
 
+DECLARE_EVENT_CLASS(ocfs2__ull_int,
+	TP_PROTO(unsigned long long value1, int value2),
+	TP_ARGS(value1, value2),
+	TP_STRUCT__entry(
+		__field(unsigned long long, value1)
+		__field(int, value2)
+	),
+	TP_fast_assign(
+		__entry->value1	= value1;
+		__entry->value2	= value2;
+	),
+	TP_printk("%llu %d", __entry->value1, __entry->value2)
+);
+
+#define DEFINE_OCFS2_ULL_INT_EVENT(name)	\
+DEFINE_EVENT(ocfs2__ull_int, name,	\
+	TP_PROTO(unsigned long long val1, int val2),	\
+	TP_ARGS(val1, val2))
+
 DECLARE_EVENT_CLASS(ocfs2__ull_ull,
 	TP_PROTO(unsigned long long value1, unsigned long long value2),
 	TP_ARGS(value1, value2),
@@ -1354,6 +1373,123 @@ DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret);
 
 /* End of trace events for fs/ocfs2/file.c. */
 
+/* Trace events for fs/ocfs2/inode.c. */
+
+TRACE_EVENT(ocfs2_iget_begin,
+	TP_PROTO(unsigned long long ino, unsigned int flags, int sysfile_type),
+	TP_ARGS(ino, flags, sysfile_type),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ino)
+		__field(unsigned int, flags)
+		__field(int, sysfile_type)
+	),
+	TP_fast_assign(
+		__entry->ino = ino;
+		__entry->flags = flags;
+		__entry->sysfile_type = sysfile_type;
+	),
+	TP_printk("%llu %u %d", __entry->ino,
+		  __entry->flags, __entry->sysfile_type)
+);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_iget5_locked);
+
+TRACE_EVENT(ocfs2_iget_end,
+	TP_PROTO(void *inode, unsigned long long ino),
+	TP_ARGS(inode, ino),
+	TP_STRUCT__entry(
+		__field(void *, inode)
+		__field(unsigned long long, ino)
+	),
+	TP_fast_assign(
+		__entry->inode = inode;
+		__entry->ino = ino;
+	),
+	TP_printk("%p %llu", __entry->inode, __entry->ino)
+);
+
+TRACE_EVENT(ocfs2_find_actor,
+	TP_PROTO(void *inode, unsigned long long ino,
+		 void *args,  unsigned long long fi_blkno),
+	TP_ARGS(inode, ino, args, fi_blkno),
+	TP_STRUCT__entry(
+		__field(void *, inode)
+		__field(unsigned long long, ino)
+		__field(void *, args)
+		__field(unsigned long long, fi_blkno)
+	),
+	TP_fast_assign(
+		__entry->inode = inode;
+		__entry->ino = ino;
+		__entry->args = args;
+		__entry->fi_blkno = fi_blkno;
+	),
+	TP_printk("%p %llu %p %llu", __entry->inode, __entry->ino,
+		  __entry->args, __entry->fi_blkno)
+);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_populate_inode);
+
+DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_read_locked_inode);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_check_orphan_recovery_state);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_inode_block);
+
+TRACE_EVENT(ocfs2_inode_is_valid_to_delete,
+	TP_PROTO(void *task, void *dc_task, unsigned long long ino,
+		 unsigned int flags),
+	TP_ARGS(task, dc_task, ino, flags),
+	TP_STRUCT__entry(
+		__field(void *, task)
+		__field(void *, dc_task)
+		__field(unsigned long long, ino)
+		__field(unsigned int, flags)
+	),
+	TP_fast_assign(
+		__entry->task = task;
+		__entry->dc_task = dc_task;
+		__entry->ino = ino;
+		__entry->flags = flags;
+	),
+	TP_printk("%p %p %llu %u", __entry->task, __entry->dc_task,
+		  __entry->ino, __entry->flags)
+);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_query_inode_wipe_begin);
+
+DEFINE_OCFS2_UINT_EVENT(ocfs2_query_inode_wipe_succ);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_query_inode_wipe_end);
+
+DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_cleanup_delete_inode);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_delete_inode);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_clear_inode);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_drop_inode);
+
+TRACE_EVENT(ocfs2_inode_revalidate,
+	TP_PROTO(void *inode, unsigned long long ino,
+		 unsigned int flags),
+	TP_ARGS(inode, ino, flags),
+	TP_STRUCT__entry(
+		__field(void *, inode)
+		__field(unsigned long long, ino)
+		__field(unsigned int, flags)
+	),
+	TP_fast_assign(
+		__entry->inode = inode;
+		__entry->ino = ino;
+		__entry->flags = flags;
+	),
+	TP_printk("%p %llu %u", __entry->inode, __entry->ino, __entry->flags)
+);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_mark_inode_dirty);
+
+/* End of trace events for fs/ocfs2/inode.c. */
 #endif /* _TRACE_OCFS2_H */
 
 /* This part must be outside protection */
-- 
cgit v1.2.3


From a716357c47ea1cc4f535356cff2c39edd327136b Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Tue, 22 Feb 2011 22:27:33 +0800
Subject: ocfs2: Remove masklog ML_EXTENT_MAP.

Remove mlog(0) from fs/ocfs2/extent_map.c and the masklog EXTENT_MAP.

Signed-off-by: Tao Ma <boyu.mt@taobao.com>
---
 fs/ocfs2/cluster/masklog.c |  1 -
 fs/ocfs2/cluster/masklog.h |  1 -
 fs/ocfs2/extent_map.c      |  5 ++---
 fs/ocfs2/ocfs2_trace.h     | 29 +++++++++++++++++++++++++++++
 4 files changed, 31 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index e18fb801633e..c5ba4373d863 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -94,7 +94,6 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
 	define_mask(AIO),
 	define_mask(JOURNAL),
 	define_mask(SUPER),
-	define_mask(EXTENT_MAP),
 	define_mask(DLM_GLUE),
 	define_mask(BH_IO),
 	define_mask(UPTODATE),
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index b76d07786107..e4ca487078b3 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -96,7 +96,6 @@
 #define ML_AIO		0x0000000000002000ULL /* ocfs2 aio read and write */
 #define ML_JOURNAL	0x0000000000004000ULL /* ocfs2 journalling functions */
 #define ML_SUPER	0x0000000000010000ULL /* ocfs2 mount / umount */
-#define ML_EXTENT_MAP	0x0000000000040000ULL /* ocfs2 extent map caching */
 #define ML_DLM_GLUE	0x0000000000080000ULL /* ocfs2 dlm glue layer */
 #define ML_BH_IO	0x0000000000100000ULL /* ocfs2 buffer I/O */
 #define ML_UPTODATE	0x0000000000200000ULL /* ocfs2 caching sequence #'s */
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 7d74d176706a..23457b491e8c 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -28,7 +28,6 @@
 #include <linux/types.h>
 #include <linux/fiemap.h>
 
-#define MLOG_MASK_PREFIX ML_EXTENT_MAP
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
@@ -39,6 +38,7 @@
 #include "inode.h"
 #include "super.h"
 #include "symlink.h"
+#include "ocfs2_trace.h"
 
 #include "buffer_head_io.h"
 
@@ -841,8 +841,7 @@ int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
 	u64 p_block, p_count;
 	int i, count, done = 0;
 
-	mlog(0, "(inode = %p, v_block = %llu, nr = %d, bhs = %p, "
-	     "flags = %x, validate = %p)\n",
+	trace_ocfs2_read_virt_blocks(
 	     inode, (unsigned long long)v_block, nr, bhs, flags,
 	     validate);
 
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 052057a01ea8..a7938505227b 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -1490,6 +1490,35 @@ TRACE_EVENT(ocfs2_inode_revalidate,
 DEFINE_OCFS2_ULL_EVENT(ocfs2_mark_inode_dirty);
 
 /* End of trace events for fs/ocfs2/inode.c. */
+
+/* Trace events for fs/ocfs2/extent_map.c. */
+
+TRACE_EVENT(ocfs2_read_virt_blocks,
+	TP_PROTO(void *inode, unsigned long long vblock, int nr,
+		 void *bhs, unsigned int flags, void *validate),
+	TP_ARGS(inode, vblock, nr, bhs, flags, validate),
+	TP_STRUCT__entry(
+		__field(void *, inode)
+		__field(unsigned long long, vblock)
+		__field(int, nr)
+		__field(void *, bhs)
+		__field(unsigned int, flags)
+		__field(void *, validate)
+	),
+	TP_fast_assign(
+		__entry->inode = inode;
+		__entry->vblock = vblock;
+		__entry->nr = nr;
+		__entry->bhs = bhs;
+		__entry->flags = flags;
+		__entry->validate = validate;
+	),
+	TP_printk("%p %llu %d %p %x %p", __entry->inode, __entry->vblock,
+		  __entry->nr, __entry->bhs, __entry->flags, __entry->validate)
+);
+
+/* End of trace events for fs/ocfs2/extent_map.c. */
+
 #endif /* _TRACE_OCFS2_H */
 
 /* This part must be outside protection */
-- 
cgit v1.2.3


From a8731086eff053b430cddbf5783654dfd700ea06 Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Tue, 22 Feb 2011 22:29:08 +0800
Subject: ocfs2: Remove mlog(0) from fs/ocfs2/slot_map.c

This is the 1st step to remove the debug info of SUPER.

Signed-off-by: Tao Ma <boyu.mt@taobao.com>
---
 fs/ocfs2/ocfs2_trace.h | 11 +++++++++++
 fs/ocfs2/slot_map.c    | 13 +++++--------
 2 files changed, 16 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index a7938505227b..d70471f8ad04 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -1519,6 +1519,17 @@ TRACE_EVENT(ocfs2_read_virt_blocks,
 
 /* End of trace events for fs/ocfs2/extent_map.c. */
 
+/* Trace events for fs/ocfs2/slot_map.c. */
+
+DEFINE_OCFS2_UINT_EVENT(ocfs2_refresh_slot_info);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_map_slot_buffers);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_map_slot_buffers_block);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_find_slot);
+
+/* End of trace events for fs/ocfs2/slot_map.c. */
 #endif /* _TRACE_OCFS2_H */
 
 /* This part must be outside protection */
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 93b449ed82f9..26fc0014d509 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -27,7 +27,6 @@
 #include <linux/slab.h>
 #include <linux/highmem.h>
 
-#define MLOG_MASK_PREFIX ML_SUPER
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
@@ -39,6 +38,7 @@
 #include "slot_map.h"
 #include "super.h"
 #include "sysfile.h"
+#include "ocfs2_trace.h"
 
 #include "buffer_head_io.h"
 
@@ -142,8 +142,7 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
 	BUG_ON(si->si_blocks == 0);
 	BUG_ON(si->si_bh == NULL);
 
-	mlog(0, "Refreshing slot map, reading %u block(s)\n",
-	     si->si_blocks);
+	trace_ocfs2_refresh_slot_info(si->si_blocks);
 
 	/*
 	 * We pass -1 as blocknr because we expect all of si->si_bh to
@@ -381,8 +380,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
 	/* The size checks above should ensure this */
 	BUG_ON((osb->max_slots / si->si_slots_per_block) > blocks);
 
-	mlog(0, "Slot map needs %u buffers for %llu bytes\n",
-	     si->si_blocks, bytes);
+	trace_ocfs2_map_slot_buffers(bytes, si->si_blocks);
 
 	si->si_bh = kzalloc(sizeof(struct buffer_head *) * si->si_blocks,
 			    GFP_KERNEL);
@@ -400,8 +398,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
 			goto bail;
 		}
 
-		mlog(0, "Reading slot map block %u at %llu\n", i,
-		     (unsigned long long)blkno);
+		trace_ocfs2_map_slot_buffers_block((unsigned long long)blkno, i);
 
 		bh = NULL;  /* Acquire a fresh bh */
 		status = ocfs2_read_blocks(INODE_CACHE(si->si_inode), blkno,
@@ -503,7 +500,7 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
 	osb->slot_num = slot;
 	spin_unlock(&osb->osb_lock);
 
-	mlog(0, "taking node slot %d\n", osb->slot_num);
+	trace_ocfs2_find_slot(osb->slot_num);
 
 	status = ocfs2_update_disk_slot(osb, si, osb->slot_num);
 	if (status < 0)
-- 
cgit v1.2.3


From b5770f98b00e94dd573e3438427de09733fe2039 Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Wed, 23 Feb 2011 21:17:39 +0800
Subject: ocfs2: Remove mlog(0) from fs/ocfs2/heartbeat.c

This is the 2nd step to remove the debug info of SUPER.

Signed-off-by: Tao Ma <boyu.mt@taobao.com>
---
 fs/ocfs2/heartbeat.c   | 4 ++--
 fs/ocfs2/ocfs2_trace.h | 7 +++++++
 2 files changed, 9 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index 1aa863dd901f..d8208b20dc53 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -28,7 +28,6 @@
 #include <linux/types.h>
 #include <linux/highmem.h>
 
-#define MLOG_MASK_PREFIX ML_SUPER
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
@@ -37,6 +36,7 @@
 #include "heartbeat.h"
 #include "inode.h"
 #include "journal.h"
+#include "ocfs2_trace.h"
 
 #include "buffer_head_io.h"
 
@@ -66,7 +66,7 @@ void ocfs2_do_node_down(int node_num, void *data)
 
 	BUG_ON(osb->node_num == node_num);
 
-	mlog(0, "ocfs2: node down event for %d\n", node_num);
+	trace_ocfs2_do_node_down(node_num);
 
 	if (!osb->cconn) {
 		/*
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index d70471f8ad04..876b119d3656 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -1530,6 +1530,13 @@ DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_map_slot_buffers_block);
 DEFINE_OCFS2_INT_EVENT(ocfs2_find_slot);
 
 /* End of trace events for fs/ocfs2/slot_map.c. */
+
+/* Trace events for fs/ocfs2/heartbeat.c. */
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_do_node_down);
+
+/* End of trace events for fs/ocfs2/heartbeat.c. */
+
 #endif /* _TRACE_OCFS2_H */
 
 /* This part must be outside protection */
-- 
cgit v1.2.3


From 32a42d392bf9b8f90f41434ccb3605e958b16251 Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Wed, 23 Feb 2011 21:29:08 +0800
Subject: ocfs2: Remove masklog ML_SUPER.

Remove mlog(0) from fs/ocfs2/super.c and the masklog SUPER.

Signed-off-by: Tao Ma <boyu.mt@taobao.com>
---
 fs/ocfs2/cluster/masklog.c |   1 -
 fs/ocfs2/cluster/masklog.h |   1 -
 fs/ocfs2/ocfs2_trace.h     | 108 +++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/super.c           |  29 +++++-------
 4 files changed, 118 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index c5ba4373d863..ffc0de62a031 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -93,7 +93,6 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
 	define_mask(DLM_RECOVERY),
 	define_mask(AIO),
 	define_mask(JOURNAL),
-	define_mask(SUPER),
 	define_mask(DLM_GLUE),
 	define_mask(BH_IO),
 	define_mask(UPTODATE),
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index e4ca487078b3..98d4af95c8fe 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -95,7 +95,6 @@
 #define ML_DLM_RECOVERY	0x0000000000001000ULL /* dlm master functions */
 #define ML_AIO		0x0000000000002000ULL /* ocfs2 aio read and write */
 #define ML_JOURNAL	0x0000000000004000ULL /* ocfs2 journalling functions */
-#define ML_SUPER	0x0000000000010000ULL /* ocfs2 mount / umount */
 #define ML_DLM_GLUE	0x0000000000080000ULL /* ocfs2 dlm glue layer */
 #define ML_BH_IO	0x0000000000100000ULL /* ocfs2 buffer I/O */
 #define ML_UPTODATE	0x0000000000200000ULL /* ocfs2 caching sequence #'s */
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 876b119d3656..75900635109b 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -57,6 +57,23 @@ DEFINE_EVENT(ocfs2__ull, name,	\
 	TP_PROTO(unsigned long long num),	\
 	TP_ARGS(num))
 
+DECLARE_EVENT_CLASS(ocfs2__pointer,
+	TP_PROTO(void *pointer),
+	TP_ARGS(pointer),
+	TP_STRUCT__entry(
+		__field(void *, pointer)
+	),
+	TP_fast_assign(
+		__entry->pointer = pointer;
+	),
+	TP_printk("%p", __entry->pointer)
+);
+
+#define DEFINE_OCFS2_POINTER_EVENT(name)	\
+DEFINE_EVENT(ocfs2__pointer, name,	\
+	TP_PROTO(void *pointer),	\
+	TP_ARGS(pointer))
+
 DECLARE_EVENT_CLASS(ocfs2__int_int,
 	TP_PROTO(int value1, int value2),
 	TP_ARGS(value1, value2),
@@ -1537,6 +1554,97 @@ DEFINE_OCFS2_INT_EVENT(ocfs2_do_node_down);
 
 /* End of trace events for fs/ocfs2/heartbeat.c. */
 
+/* Trace events for fs/ocfs2/super.c. */
+
+TRACE_EVENT(ocfs2_remount,
+	TP_PROTO(unsigned long s_flags, unsigned long osb_flags, int flags),
+	TP_ARGS(s_flags, osb_flags, flags),
+	TP_STRUCT__entry(
+		__field(unsigned long, s_flags)
+		__field(unsigned long, osb_flags)
+		__field(int, flags)
+	),
+	TP_fast_assign(
+		__entry->s_flags = s_flags;
+		__entry->osb_flags = osb_flags;
+		__entry->flags = flags;
+	),
+	TP_printk("%lu %lu %d", __entry->s_flags,
+		  __entry->osb_flags, __entry->flags)
+);
+
+TRACE_EVENT(ocfs2_fill_super,
+	TP_PROTO(void *sb, void *data, int silent),
+	TP_ARGS(sb, data, silent),
+	TP_STRUCT__entry(
+		__field(void *, sb)
+		__field(void *, data)
+		__field(int, silent)
+	),
+	TP_fast_assign(
+		__entry->sb = sb;
+		__entry->data = data;
+		__entry->silent = silent;
+	),
+	TP_printk("%p %p %d", __entry->sb,
+		  __entry->data, __entry->silent)
+);
+
+TRACE_EVENT(ocfs2_parse_options,
+	TP_PROTO(int is_remount, char *options),
+	TP_ARGS(is_remount, options),
+	TP_STRUCT__entry(
+		__field(int, is_remount)
+		__string(options, options)
+	),
+	TP_fast_assign(
+		__entry->is_remount = is_remount;
+		__assign_str(options, options);
+	),
+	TP_printk("%d %s", __entry->is_remount, __get_str(options))
+);
+
+DEFINE_OCFS2_POINTER_EVENT(ocfs2_put_super);
+
+TRACE_EVENT(ocfs2_statfs,
+	TP_PROTO(void *sb, void *buf),
+	TP_ARGS(sb, buf),
+	TP_STRUCT__entry(
+		__field(void *, sb)
+		__field(void *, buf)
+	),
+	TP_fast_assign(
+		__entry->sb = sb;
+		__entry->buf = buf;
+	),
+	TP_printk("%p %p", __entry->sb, __entry->buf)
+);
+
+DEFINE_OCFS2_POINTER_EVENT(ocfs2_dismount_volume);
+
+TRACE_EVENT(ocfs2_initialize_super,
+	TP_PROTO(char *label, char *uuid_str, unsigned long long root_dir,
+		 unsigned long long system_dir, int cluster_bits),
+	TP_ARGS(label, uuid_str, root_dir, system_dir, cluster_bits),
+	TP_STRUCT__entry(
+		__string(label, label)
+		__string(uuid_str, uuid_str)
+		__field(unsigned long long, root_dir)
+		__field(unsigned long long, system_dir)
+		__field(int, cluster_bits)
+	),
+	TP_fast_assign(
+		__assign_str(label, label);
+		__assign_str(uuid_str, uuid_str);
+		__entry->root_dir = root_dir;
+		__entry->system_dir = system_dir;
+		__entry->cluster_bits = cluster_bits;
+	),
+	TP_printk("%s %s %llu %llu %d", __get_str(label), __get_str(uuid_str),
+		  __entry->root_dir, __entry->system_dir, __entry->cluster_bits)
+);
+
+/* End of trace events for fs/ocfs2/super.c. */
 #endif /* _TRACE_OCFS2_H */
 
 /* This part must be outside protection */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index f7e73a029844..0e4083987d2b 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -45,7 +45,6 @@
 #define CREATE_TRACE_POINTS
 #include "ocfs2_trace.h"
 
-#define MLOG_MASK_PREFIX ML_SUPER
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
@@ -680,12 +679,9 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
 		}
 
 		if (*flags & MS_RDONLY) {
-			mlog(0, "Going to ro mode.\n");
 			sb->s_flags |= MS_RDONLY;
 			osb->osb_flags |= OCFS2_OSB_SOFT_RO;
 		} else {
-			mlog(0, "Making ro filesystem writeable.\n");
-
 			if (osb->osb_flags & OCFS2_OSB_ERROR_FS) {
 				mlog(ML_ERROR, "Cannot remount RDWR "
 				     "filesystem due to previous errors.\n");
@@ -703,6 +699,7 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
 			sb->s_flags &= ~MS_RDONLY;
 			osb->osb_flags &= ~OCFS2_OSB_SOFT_RO;
 		}
+		trace_ocfs2_remount(sb->s_flags, osb->osb_flags, *flags);
 unlock_osb:
 		spin_unlock(&osb->osb_lock);
 		/* Enable quota accounting after remounting RW */
@@ -1028,7 +1025,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
 	char nodestr[8];
 	struct ocfs2_blockcheck_stats stats;
 
-	mlog(0, "%p, %p, %i", sb, data, silent);
+	trace_ocfs2_fill_super(sb, data, silent);
 
 	if (!ocfs2_parse_options(sb, data, &parsed_options, 0)) {
 		status = -EINVAL;
@@ -1315,8 +1312,7 @@ static int ocfs2_parse_options(struct super_block *sb,
 	char *p;
 	u32 tmp;
 
-	mlog(0, "remount: %d, options: \"%s\"\n", is_remount,
-	     options ? options : "(none)");
+	trace_ocfs2_parse_options(is_remount, options ? options : "(none)");
 
 	mopt->commit_interval = 0;
 	mopt->mount_opt = OCFS2_MOUNT_NOINTR;
@@ -1692,7 +1688,7 @@ static void __exit ocfs2_exit(void)
 
 static void ocfs2_put_super(struct super_block *sb)
 {
-	mlog(0, "(0x%p)\n", sb);
+	trace_ocfs2_put_super(sb);
 
 	ocfs2_sync_blockdev(sb);
 	ocfs2_dismount_volume(sb, 0);
@@ -1707,7 +1703,7 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
 	struct buffer_head *bh = NULL;
 	struct inode *inode = NULL;
 
-	mlog(0, "(%p, %p)\n", dentry->d_sb, buf);
+	trace_ocfs2_statfs(dentry->d_sb, buf);
 
 	osb = OCFS2_SB(dentry->d_sb);
 
@@ -1928,7 +1924,7 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
 	struct ocfs2_super *osb = NULL;
 	char nodestr[8];
 
-	mlog(0, "(0x%p)\n", sb);
+	trace_ocfs2_dismount_volume(sb);
 
 	BUG_ON(!sb);
 	osb = OCFS2_SB(sb);
@@ -2143,7 +2139,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
 		status = -EINVAL;
 		goto bail;
 	}
-	mlog(0, "max_slots for this device: %u\n", osb->max_slots);
 
 	ocfs2_orphan_scan_init(osb);
 
@@ -2282,7 +2277,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	osb->s_clustersize_bits =
 		le32_to_cpu(di->id2.i_super.s_clustersize_bits);
 	osb->s_clustersize = 1 << osb->s_clustersize_bits;
-	mlog(0, "clusterbits=%d\n", osb->s_clustersize_bits);
 
 	if (osb->s_clustersize < OCFS2_MIN_CLUSTERSIZE ||
 	    osb->s_clustersize > OCFS2_MAX_CLUSTERSIZE) {
@@ -2321,11 +2315,10 @@ static int ocfs2_initialize_super(struct super_block *sb,
 		le64_to_cpu(di->id2.i_super.s_first_cluster_group);
 	osb->fs_generation = le32_to_cpu(di->i_fs_generation);
 	osb->uuid_hash = le32_to_cpu(di->id2.i_super.s_uuid_hash);
-	mlog(0, "vol_label: %s\n", osb->vol_label);
-	mlog(0, "uuid: %s\n", osb->uuid_str);
-	mlog(0, "root_blkno=%llu, system_dir_blkno=%llu\n",
-	     (unsigned long long)osb->root_blkno,
-	     (unsigned long long)osb->system_dir_blkno);
+	trace_ocfs2_initialize_super(osb->vol_label, osb->uuid_str,
+				     (unsigned long long)osb->root_blkno,
+				     (unsigned long long)osb->system_dir_blkno,
+				     osb->s_clustersize_bits);
 
 	osb->osb_dlm_debug = ocfs2_new_dlm_debug();
 	if (!osb->osb_dlm_debug) {
@@ -2500,8 +2493,6 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
 		 * ourselves as mounted. */
 	}
 
-	mlog(0, "Journal loaded.\n");
-
 	status = ocfs2_load_local_alloc(osb);
 	if (status < 0) {
 		mlog_errno(status);
-- 
cgit v1.2.3


From 402b418311f0d20a71451770d764a2e37b08dbcf Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Wed, 23 Feb 2011 22:01:17 +0800
Subject: ocfs2: Remove masklog ML_XATTR.

Remove mlog(0) from fs/ocfs2/xattr.c and the masklog ML_XATTR.

Signed-off-by: Tao Ma <boyu.mt@taobao.com>
---
 fs/ocfs2/cluster/masklog.c |   1 -
 fs/ocfs2/cluster/masklog.h |   1 -
 fs/ocfs2/ocfs2_trace.h     | 164 +++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/xattr.c           | 153 +++++++++++++++++++++---------------------
 4 files changed, 239 insertions(+), 80 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index ffc0de62a031..ef140ab7dcf8 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -102,7 +102,6 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
 	define_mask(CONN),
 	define_mask(QUORUM),
 	define_mask(EXPORT),
-	define_mask(XATTR),
 	define_mask(QUOTA),
 	define_mask(BASTS),
 	define_mask(RESERVATIONS),
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 98d4af95c8fe..f574b167e8dc 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -104,7 +104,6 @@
 #define ML_CONN		0x0000000004000000ULL /* net connection management */
 #define ML_QUORUM	0x0000000008000000ULL /* net connection quorum */
 #define ML_EXPORT	0x0000000010000000ULL /* ocfs2 export operations */
-#define ML_XATTR	0x0000000020000000ULL /* ocfs2 extended attributes */
 #define ML_QUOTA	0x0000000040000000ULL /* ocfs2 quota operations */
 #define ML_BASTS	0x0000000100000000ULL /* dlmglue asts and basts */
 #define ML_RESERVATIONS	0x0000000200000000ULL /* ocfs2 alloc reservations */
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 75900635109b..0fc2840d6fb1 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -74,6 +74,23 @@ DEFINE_EVENT(ocfs2__pointer, name,	\
 	TP_PROTO(void *pointer),	\
 	TP_ARGS(pointer))
 
+DECLARE_EVENT_CLASS(ocfs2__string,
+	TP_PROTO(const char *name),
+	TP_ARGS(name),
+	TP_STRUCT__entry(
+		__string(name,name)
+	),
+	TP_fast_assign(
+		__assign_str(name, name);
+	),
+	TP_printk("%s", __get_str(name))
+);
+
+#define DEFINE_OCFS2_STRING_EVENT(name)	\
+DEFINE_EVENT(ocfs2__string, name,	\
+	TP_PROTO(const char *name),	\
+	TP_ARGS(name))
+
 DECLARE_EVENT_CLASS(ocfs2__int_int,
 	TP_PROTO(int value1, int value2),
 	TP_ARGS(value1, value2),
@@ -317,6 +334,33 @@ DEFINE_EVENT(ocfs2__ull_uint_uint_uint, name,	\
 		 unsigned int value2, unsigned int value3),	\
 	TP_ARGS(ull, value1, value2, value3))
 
+DECLARE_EVENT_CLASS(ocfs2__ull_ull_uint_uint,
+	TP_PROTO(unsigned long long value1, unsigned long long value2,
+		 unsigned int value3, unsigned int value4),
+	TP_ARGS(value1, value2, value3, value4),
+	TP_STRUCT__entry(
+		__field(unsigned long long, value1)
+		__field(unsigned long long, value2)
+		__field(unsigned int, value3)
+		__field(unsigned int, value4)
+	),
+	TP_fast_assign(
+		__entry->value1 = value1;
+		__entry->value2 = value2;
+		__entry->value3 = value3;
+		__entry->value4 = value4;
+	),
+	TP_printk("%llu %llu %u %u",
+		  __entry->value1, __entry->value2,
+		  __entry->value3, __entry->value4)
+);
+
+#define DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(name)	\
+DEFINE_EVENT(ocfs2__ull_ull_uint_uint, name,	\
+	TP_PROTO(unsigned long long ull, unsigned long long ull1,	\
+		 unsigned int value2, unsigned int value3),	\
+	TP_ARGS(ull, ull1, value2, value3))
+
 /* Trace events for fs/ocfs2/alloc.c. */
 DECLARE_EVENT_CLASS(ocfs2__btree_ops,
 	TP_PROTO(unsigned long long owner,\
@@ -1645,6 +1689,126 @@ TRACE_EVENT(ocfs2_initialize_super,
 );
 
 /* End of trace events for fs/ocfs2/super.c. */
+
+/* Trace events for fs/ocfs2/xattr.c. */
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_xattr_block);
+
+DEFINE_OCFS2_UINT_EVENT(ocfs2_xattr_extend_allocation);
+
+TRACE_EVENT(ocfs2_init_xattr_set_ctxt,
+	TP_PROTO(const char *name, int meta, int clusters, int credits),
+	TP_ARGS(name, meta, clusters, credits),
+	TP_STRUCT__entry(
+		__string(name, name)
+		__field(int, meta)
+		__field(int, clusters)
+		__field(int, credits)
+	),
+	TP_fast_assign(
+		__assign_str(name, name);
+		__entry->meta = meta;
+		__entry->clusters = clusters;
+		__entry->credits = credits;
+	),
+	TP_printk("%s %d %d %d", __get_str(name), __entry->meta,
+		  __entry->clusters, __entry->credits)
+);
+
+DECLARE_EVENT_CLASS(ocfs2__xattr_find,
+	TP_PROTO(unsigned long long ino, const char *name, int name_index,
+		 unsigned int hash, unsigned long long location,
+		 int xe_index),
+	TP_ARGS(ino, name, name_index, hash, location, xe_index),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ino)
+		__string(name, name)
+		__field(int, name_index)
+		__field(unsigned int, hash)
+		__field(unsigned long long, location)
+		__field(int, xe_index)
+	),
+	TP_fast_assign(
+		__entry->ino = ino;
+		__assign_str(name, name);
+		__entry->name_index = name_index;
+		__entry->hash = hash;
+		__entry->location = location;
+		__entry->xe_index = xe_index;
+	),
+	TP_printk("%llu %s %d %u %llu %d", __entry->ino, __get_str(name),
+		  __entry->name_index, __entry->hash, __entry->location,
+		  __entry->xe_index)
+);
+
+#define DEFINE_OCFS2_XATTR_FIND_EVENT(name)					\
+DEFINE_EVENT(ocfs2__xattr_find, name,					\
+TP_PROTO(unsigned long long ino, const char *name, int name_index,	\
+	 unsigned int hash, unsigned long long bucket,			\
+	 int xe_index),							\
+	TP_ARGS(ino, name, name_index, hash, bucket, xe_index))
+
+DEFINE_OCFS2_XATTR_FIND_EVENT(ocfs2_xattr_bucket_find);
+
+DEFINE_OCFS2_XATTR_FIND_EVENT(ocfs2_xattr_index_block_find);
+
+DEFINE_OCFS2_XATTR_FIND_EVENT(ocfs2_xattr_index_block_find_rec);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_iterate_xattr_buckets);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_iterate_xattr_bucket);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_cp_xattr_block_to_bucket_begin);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_cp_xattr_block_to_bucket_end);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_xattr_create_index_block_begin);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_xattr_create_index_block);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_defrag_xattr_bucket);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_mv_xattr_bucket_cross_cluster);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_divide_xattr_bucket_begin);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_divide_xattr_bucket_move);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_cp_xattr_bucket);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_mv_xattr_buckets);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_adjust_xattr_cross_cluster);
+
+DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_add_new_xattr_cluster_begin);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_add_new_xattr_cluster);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_add_new_xattr_cluster_insert);
+
+DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_extend_xattr_bucket);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_add_new_xattr_bucket);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_xattr_bucket_value_truncate);
+
+DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_rm_xattr_cluster);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_reflink_xattr_header);
+
+DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_create_empty_xattr_block);
+
+DEFINE_OCFS2_STRING_EVENT(ocfs2_xattr_set_entry_bucket);
+
+DEFINE_OCFS2_STRING_EVENT(ocfs2_xattr_set_entry_index_block);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_xattr_bucket_value_refcount);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_reflink_xattr_buckets);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_reflink_xattr_rec);
+
+/* End of trace events for fs/ocfs2/xattr.c. */
 #endif /* _TRACE_OCFS2_H */
 
 /* This part must be outside protection */
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index d57c7942e4de..52e3dadd3642 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -37,7 +37,6 @@
 #include <linux/string.h>
 #include <linux/security.h>
 
-#define MLOG_MASK_PREFIX ML_XATTR
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
@@ -57,6 +56,7 @@
 #include "xattr.h"
 #include "refcounttree.h"
 #include "acl.h"
+#include "ocfs2_trace.h"
 
 struct ocfs2_xattr_def_value_root {
 	struct ocfs2_xattr_value_root	xv;
@@ -474,8 +474,7 @@ static int ocfs2_validate_xattr_block(struct super_block *sb,
 	struct ocfs2_xattr_block *xb =
 		(struct ocfs2_xattr_block *)bh->b_data;
 
-	mlog(0, "Validating xattr block %llu\n",
-	     (unsigned long long)bh->b_blocknr);
+	trace_ocfs2_validate_xattr_block((unsigned long long)bh->b_blocknr);
 
 	BUG_ON(!buffer_uptodate(bh));
 
@@ -715,11 +714,11 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
 	u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters);
 	struct ocfs2_extent_tree et;
 
-	mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add);
-
 	ocfs2_init_xattr_value_extent_tree(&et, INODE_CACHE(inode), vb);
 
 	while (clusters_to_add) {
+		trace_ocfs2_xattr_extend_allocation(clusters_to_add);
+
 		status = vb->vb_access(handle, INODE_CACHE(inode), vb->vb_bh,
 				       OCFS2_JOURNAL_ACCESS_WRITE);
 		if (status < 0) {
@@ -754,8 +753,6 @@ static int ocfs2_xattr_extend_allocation(struct inode *inode,
 			 */
 			BUG_ON(why == RESTART_META);
 
-			mlog(0, "restarting xattr value extension for %u"
-			     " clusters,.\n", clusters_to_add);
 			credits = ocfs2_calc_extend_credits(inode->i_sb,
 							    &vb->vb_xv->xr_list,
 							    clusters_to_add);
@@ -3246,8 +3243,8 @@ static int ocfs2_init_xattr_set_ctxt(struct inode *inode,
 	}
 
 	meta_add += extra_meta;
-	mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d, "
-	     "credits = %d\n", xi->xi_name, meta_add, clusters_add, *credits);
+	trace_ocfs2_init_xattr_set_ctxt(xi->xi_name, meta_add,
+					clusters_add, *credits);
 
 	if (meta_add) {
 		ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add,
@@ -3887,8 +3884,10 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
 
 	if (found) {
 		xs->here = &xs->header->xh_entries[index];
-		mlog(0, "find xattr %s in bucket %llu, entry = %u\n", name,
-		     (unsigned long long)bucket_blkno(xs->bucket), index);
+		trace_ocfs2_xattr_bucket_find(OCFS2_I(inode)->ip_blkno,
+			name, name_index, name_hash,
+			(unsigned long long)bucket_blkno(xs->bucket),
+			index);
 	} else
 		ret = -ENODATA;
 
@@ -3915,8 +3914,10 @@ static int ocfs2_xattr_index_block_find(struct inode *inode,
 	if (le16_to_cpu(el->l_next_free_rec) == 0)
 		return -ENODATA;
 
-	mlog(0, "find xattr %s, hash = %u, index = %d in xattr tree\n",
-	     name, name_hash, name_index);
+	trace_ocfs2_xattr_index_block_find(OCFS2_I(inode)->ip_blkno,
+					name, name_index, name_hash,
+					(unsigned long long)root_bh->b_blocknr,
+					-1);
 
 	ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &first_hash,
 				  &num_clusters, el);
@@ -3927,9 +3928,10 @@ static int ocfs2_xattr_index_block_find(struct inode *inode,
 
 	BUG_ON(p_blkno == 0 || num_clusters == 0 || first_hash > name_hash);
 
-	mlog(0, "find xattr extent rec %u clusters from %llu, the first hash "
-	     "in the rec is %u\n", num_clusters, (unsigned long long)p_blkno,
-	     first_hash);
+	trace_ocfs2_xattr_index_block_find_rec(OCFS2_I(inode)->ip_blkno,
+					name, name_index, first_hash,
+					(unsigned long long)p_blkno,
+					num_clusters);
 
 	ret = ocfs2_xattr_bucket_find(inode, name_index, name, name_hash,
 				      p_blkno, first_hash, num_clusters, xs);
@@ -3955,8 +3957,9 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode,
 		return -ENOMEM;
 	}
 
-	mlog(0, "iterating xattr buckets in %u clusters starting from %llu\n",
-	     clusters, (unsigned long long)blkno);
+	trace_ocfs2_iterate_xattr_buckets(
+		(unsigned long long)OCFS2_I(inode)->ip_blkno,
+		(unsigned long long)blkno, clusters);
 
 	for (i = 0; i < num_buckets; i++, blkno += bucket->bu_blocks) {
 		ret = ocfs2_read_xattr_bucket(bucket, blkno);
@@ -3972,8 +3975,7 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode,
 		if (i == 0)
 			num_buckets = le16_to_cpu(bucket_xh(bucket)->xh_num_buckets);
 
-		mlog(0, "iterating xattr bucket %llu, first hash %u\n",
-		     (unsigned long long)blkno,
+		trace_ocfs2_iterate_xattr_bucket((unsigned long long)blkno,
 		     le32_to_cpu(bucket_xh(bucket)->xh_entries[0].xe_name_hash));
 		if (func) {
 			ret = func(inode, bucket, para);
@@ -4173,9 +4175,9 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
 	char *src = xb_bh->b_data;
 	char *target = bucket_block(bucket, blks - 1);
 
-	mlog(0, "cp xattr from block %llu to bucket %llu\n",
-	     (unsigned long long)xb_bh->b_blocknr,
-	     (unsigned long long)bucket_blkno(bucket));
+	trace_ocfs2_cp_xattr_block_to_bucket_begin(
+				(unsigned long long)xb_bh->b_blocknr,
+				(unsigned long long)bucket_blkno(bucket));
 
 	for (i = 0; i < blks; i++)
 		memset(bucket_block(bucket, i), 0, blocksize);
@@ -4211,8 +4213,7 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
 	for (i = 0; i < count; i++)
 		le16_add_cpu(&xh->xh_entries[i].xe_name_offset, off_change);
 
-	mlog(0, "copy entry: start = %u, size = %u, offset_change = %u\n",
-	     offset, size, off_change);
+	trace_ocfs2_cp_xattr_block_to_bucket_end(offset, size, off_change);
 
 	sort(target + offset, count, sizeof(struct ocfs2_xattr_entry),
 	     cmp_xe, swap_xe);
@@ -4261,8 +4262,8 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
 	struct ocfs2_xattr_tree_root *xr;
 	u16 xb_flags = le16_to_cpu(xb->xb_flags);
 
-	mlog(0, "create xattr index block for %llu\n",
-	     (unsigned long long)xb_bh->b_blocknr);
+	trace_ocfs2_xattr_create_index_block_begin(
+				(unsigned long long)xb_bh->b_blocknr);
 
 	BUG_ON(xb_flags & OCFS2_XATTR_INDEXED);
 	BUG_ON(!xs->bucket);
@@ -4295,8 +4296,7 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
 	 */
 	blkno = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
 
-	mlog(0, "allocate 1 cluster from %llu to xattr block\n",
-	     (unsigned long long)blkno);
+	trace_ocfs2_xattr_create_index_block((unsigned long long)blkno);
 
 	ret = ocfs2_init_xattr_bucket(xs->bucket, blkno);
 	if (ret) {
@@ -4400,8 +4400,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
 	entries = (char *)xh->xh_entries;
 	xh_free_start = le16_to_cpu(xh->xh_free_start);
 
-	mlog(0, "adjust xattr bucket in %llu, count = %u, "
-	     "xh_free_start = %u, xh_name_value_len = %u.\n",
+	trace_ocfs2_defrag_xattr_bucket(
 	     (unsigned long long)blkno, le16_to_cpu(xh->xh_count),
 	     xh_free_start, le16_to_cpu(xh->xh_name_value_len));
 
@@ -4503,8 +4502,9 @@ static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode,
 	BUG_ON(le16_to_cpu(bucket_xh(first)->xh_num_buckets) < num_buckets);
 	BUG_ON(OCFS2_XATTR_BUCKET_SIZE == OCFS2_SB(sb)->s_clustersize);
 
-	mlog(0, "move half of xattrs in cluster %llu to %llu\n",
-	     (unsigned long long)last_cluster_blkno, (unsigned long long)new_blkno);
+	trace_ocfs2_mv_xattr_bucket_cross_cluster(
+				(unsigned long long)last_cluster_blkno,
+				(unsigned long long)new_blkno);
 
 	ret = ocfs2_mv_xattr_buckets(inode, handle, bucket_blkno(first),
 				     last_cluster_blkno, new_blkno,
@@ -4614,8 +4614,8 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
 	struct ocfs2_xattr_entry *xe;
 	int blocksize = inode->i_sb->s_blocksize;
 
-	mlog(0, "move some of xattrs from bucket %llu to %llu\n",
-	     (unsigned long long)blk, (unsigned long long)new_blk);
+	trace_ocfs2_divide_xattr_bucket_begin((unsigned long long)blk,
+					      (unsigned long long)new_blk);
 
 	s_bucket = ocfs2_xattr_bucket_new(inode);
 	t_bucket = ocfs2_xattr_bucket_new(inode);
@@ -4714,9 +4714,9 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
 	 */
 	xe = &xh->xh_entries[start];
 	len = sizeof(struct ocfs2_xattr_entry) * (count - start);
-	mlog(0, "mv xattr entry len %d from %d to %d\n", len,
-	     (int)((char *)xe - (char *)xh),
-	     (int)((char *)xh->xh_entries - (char *)xh));
+	trace_ocfs2_divide_xattr_bucket_move(len,
+			(int)((char *)xe - (char *)xh),
+			(int)((char *)xh->xh_entries - (char *)xh));
 	memmove((char *)xh->xh_entries, (char *)xe, len);
 	xe = &xh->xh_entries[count - start];
 	len = sizeof(struct ocfs2_xattr_entry) * start;
@@ -4788,9 +4788,9 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
 
 	BUG_ON(s_blkno == t_blkno);
 
-	mlog(0, "cp bucket %llu to %llu, target is %d\n",
-	     (unsigned long long)s_blkno, (unsigned long long)t_blkno,
-	     t_is_new);
+	trace_ocfs2_cp_xattr_bucket((unsigned long long)s_blkno,
+				    (unsigned long long)t_blkno,
+				    t_is_new);
 
 	s_bucket = ocfs2_xattr_bucket_new(inode);
 	t_bucket = ocfs2_xattr_bucket_new(inode);
@@ -4862,8 +4862,8 @@ static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
 	int num_buckets = ocfs2_xattr_buckets_per_cluster(osb);
 	struct ocfs2_xattr_bucket *old_first, *new_first;
 
-	mlog(0, "mv xattrs from cluster %llu to %llu\n",
-	     (unsigned long long)last_blk, (unsigned long long)to_blk);
+	trace_ocfs2_mv_xattr_buckets((unsigned long long)last_blk,
+				     (unsigned long long)to_blk);
 
 	BUG_ON(start_bucket >= num_buckets);
 	if (start_bucket) {
@@ -5013,9 +5013,9 @@ static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
 {
 	int ret;
 
-	mlog(0, "adjust xattrs from cluster %llu len %u to %llu\n",
-	     (unsigned long long)bucket_blkno(first), prev_clusters,
-	     (unsigned long long)new_blk);
+	trace_ocfs2_adjust_xattr_cross_cluster(
+			(unsigned long long)bucket_blkno(first),
+			(unsigned long long)new_blk, prev_clusters);
 
 	if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1) {
 		ret = ocfs2_mv_xattr_bucket_cross_cluster(inode,
@@ -5088,10 +5088,10 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_extent_tree et;
 
-	mlog(0, "Add new xattr cluster for %llu, previous xattr hash = %u, "
-	     "previous xattr blkno = %llu\n",
-	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
-	     prev_cpos, (unsigned long long)bucket_blkno(first));
+	trace_ocfs2_add_new_xattr_cluster_begin(
+		(unsigned long long)OCFS2_I(inode)->ip_blkno,
+		(unsigned long long)bucket_blkno(first),
+		prev_cpos, prev_clusters);
 
 	ocfs2_init_xattr_tree_extent_tree(&et, INODE_CACHE(inode), root_bh);
 
@@ -5113,8 +5113,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
 	BUG_ON(num_bits > clusters_to_add);
 
 	block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
-	mlog(0, "Allocating %u clusters at block %u for xattr in inode %llu\n",
-	     num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
+	trace_ocfs2_add_new_xattr_cluster((unsigned long long)block, num_bits);
 
 	if (bucket_blkno(first) + (prev_clusters * bpc) == block &&
 	    (prev_clusters + num_bits) << osb->s_clustersize_bits <=
@@ -5130,8 +5129,6 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
 		 */
 		v_start = prev_cpos + prev_clusters;
 		*num_clusters = prev_clusters + num_bits;
-		mlog(0, "Add contiguous %u clusters to previous extent rec.\n",
-		     num_bits);
 	} else {
 		ret = ocfs2_adjust_xattr_cross_cluster(inode,
 						       handle,
@@ -5147,8 +5144,8 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
 		}
 	}
 
-	mlog(0, "Insert %u clusters at block %llu for xattr at %u\n",
-	     num_bits, (unsigned long long)block, v_start);
+	trace_ocfs2_add_new_xattr_cluster_insert((unsigned long long)block,
+						 v_start, num_bits);
 	ret = ocfs2_insert_extent(handle, &et, v_start, block,
 				  num_bits, 0, ctxt->meta_ac);
 	if (ret < 0) {
@@ -5183,9 +5180,9 @@ static int ocfs2_extend_xattr_bucket(struct inode *inode,
 	u64 end_blk;
 	u16 new_bucket = le16_to_cpu(bucket_xh(first)->xh_num_buckets);
 
-	mlog(0, "extend xattr bucket in %llu, xattr extend rec starting "
-	     "from %llu, len = %u\n", (unsigned long long)target_blk,
-	     (unsigned long long)bucket_blkno(first), num_clusters);
+	trace_ocfs2_extend_xattr_bucket((unsigned long long)target_blk,
+					(unsigned long long)bucket_blkno(first),
+					num_clusters, new_bucket);
 
 	/* The extent must have room for an additional bucket */
 	BUG_ON(new_bucket >=
@@ -5265,8 +5262,8 @@ static int ocfs2_add_new_xattr_bucket(struct inode *inode,
 	/* The bucket at the front of the extent */
 	struct ocfs2_xattr_bucket *first;
 
-	mlog(0, "Add new xattr bucket starting from %llu\n",
-	     (unsigned long long)bucket_blkno(target));
+	trace_ocfs2_add_new_xattr_bucket(
+				(unsigned long long)bucket_blkno(target));
 
 	/* The first bucket of the original extent */
 	first = ocfs2_xattr_bucket_new(inode);
@@ -5382,8 +5379,8 @@ static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
 	 * modified something.  We have to assume they did, and dirty
 	 * the whole bucket.  This leaves us in a consistent state.
 	 */
-	mlog(0, "truncate %u in xattr bucket %llu to %d bytes.\n",
-	     xe_off, (unsigned long long)bucket_blkno(bucket), len);
+	trace_ocfs2_xattr_bucket_value_truncate(
+			(unsigned long long)bucket_blkno(bucket), xe_off, len);
 	ret = ocfs2_xattr_value_truncate(inode, &vb, len, ctxt);
 	if (ret) {
 		mlog_errno(ret);
@@ -5433,8 +5430,9 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
 
 	ocfs2_init_dealloc_ctxt(&dealloc);
 
-	mlog(0, "rm xattr extent rec at %u len = %u, start from %llu\n",
-	     cpos, len, (unsigned long long)blkno);
+	trace_ocfs2_rm_xattr_cluster(
+			(unsigned long long)OCFS2_I(inode)->ip_blkno,
+			(unsigned long long)blkno, cpos, len);
 
 	ocfs2_remove_xattr_clusters_from_cache(INODE_CACHE(inode), blkno,
 					       len);
@@ -5538,7 +5536,7 @@ static int ocfs2_xattr_set_entry_bucket(struct inode *inode,
 	int ret;
 	struct ocfs2_xa_loc loc;
 
-	mlog(0, "Set xattr %s in xattr bucket\n", xi->xi_name);
+	trace_ocfs2_xattr_set_entry_bucket(xi->xi_name);
 
 	ocfs2_init_xattr_bucket_xa_loc(&loc, xs->bucket,
 				       xs->not_found ? NULL : xs->here);
@@ -5580,7 +5578,7 @@ static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
 {
 	int ret;
 
-	mlog(0, "Set xattr %s in xattr index block\n", xi->xi_name);
+	trace_ocfs2_xattr_set_entry_index_block(xi->xi_name);
 
 	ret = ocfs2_xattr_set_entry_bucket(inode, xi, xs, ctxt);
 	if (!ret)
@@ -6039,9 +6037,9 @@ static int ocfs2_xattr_bucket_value_refcount(struct inode *inode,
 	if (ocfs2_meta_ecc(OCFS2_SB(inode->i_sb)))
 		p = &refcount;
 
-	mlog(0, "refcount bucket %llu, count = %u\n",
-	     (unsigned long long)bucket_blkno(bucket),
-	     le16_to_cpu(xh->xh_count));
+	trace_ocfs2_xattr_bucket_value_refcount(
+				(unsigned long long)bucket_blkno(bucket),
+				le16_to_cpu(xh->xh_count));
 	for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
 		xe = &xh->xh_entries[i];
 
@@ -6337,8 +6335,8 @@ static int ocfs2_reflink_xattr_header(handle_t *handle,
 	u32 clusters, cpos, p_cluster, num_clusters;
 	unsigned int ext_flags = 0;
 
-	mlog(0, "reflink xattr in container %llu, count = %u\n",
-	     (unsigned long long)old_bh->b_blocknr, le16_to_cpu(xh->xh_count));
+	trace_ocfs2_reflink_xattr_header((unsigned long long)old_bh->b_blocknr,
+					 le16_to_cpu(xh->xh_count));
 
 	last = &new_xh->xh_entries[le16_to_cpu(new_xh->xh_count)];
 	for (i = 0, j = 0; i < le16_to_cpu(xh->xh_count); i++, j++) {
@@ -6538,8 +6536,8 @@ static int ocfs2_create_empty_xattr_block(struct inode *inode,
 		goto out;
 	}
 
-	mlog(0, "create new xattr block for inode %llu, index = %d\n",
-	     (unsigned long long)fe_bh->b_blocknr, indexed);
+	trace_ocfs2_create_empty_xattr_block(
+				(unsigned long long)fe_bh->b_blocknr, indexed);
 	ret = ocfs2_create_xattr_block(inode, fe_bh, &ctxt, indexed,
 				       ret_bh);
 	if (ret)
@@ -6950,8 +6948,8 @@ static int ocfs2_reflink_xattr_buckets(handle_t *handle,
 		if (ret)
 			mlog_errno(ret);
 
-		mlog(0, "insert new xattr extent rec start %llu len %u to %u\n",
-		     (unsigned long long)new_blkno, num_clusters, reflink_cpos);
+		trace_ocfs2_reflink_xattr_buckets((unsigned long long)new_blkno,
+						  num_clusters, reflink_cpos);
 
 		len -= num_clusters;
 		blkno += ocfs2_clusters_to_blocks(inode->i_sb, num_clusters);
@@ -6980,8 +6978,7 @@ static int ocfs2_reflink_xattr_rec(struct inode *inode,
 	struct ocfs2_alloc_context *data_ac = NULL;
 	struct ocfs2_extent_tree et;
 
-	mlog(0, "reflink xattr buckets %llu len %u\n",
-	     (unsigned long long)blkno, len);
+	trace_ocfs2_reflink_xattr_rec((unsigned long long)blkno, len);
 
 	ocfs2_init_xattr_tree_extent_tree(&et,
 					  INODE_CACHE(args->reflink->new_inode),
-- 
cgit v1.2.3


From 2661836e0ca07f57505563ec31aeee20b1758987 Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Wed, 23 Feb 2011 22:10:56 +0800
Subject: ocfs2: Remove masklog ML_RESERVATIONS.

Remove mlog(0) from fs/ocfs2/reservations.c and the masklog RESERVATIONS.

Signed-off-by: Tao Ma <boyu.mt@taobao.com>
---
 fs/ocfs2/cluster/masklog.c |   1 -
 fs/ocfs2/cluster/masklog.h |   1 -
 fs/ocfs2/ocfs2_trace.h     | 116 +++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/reservations.c    |  57 ++++++++++------------
 4 files changed, 142 insertions(+), 33 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index ef140ab7dcf8..39f2a46810d1 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -104,7 +104,6 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
 	define_mask(EXPORT),
 	define_mask(QUOTA),
 	define_mask(BASTS),
-	define_mask(RESERVATIONS),
 	define_mask(CLUSTER),
 	define_mask(ERROR),
 	define_mask(NOTICE),
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index f574b167e8dc..c36b4d8d5aa9 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -106,7 +106,6 @@
 #define ML_EXPORT	0x0000000010000000ULL /* ocfs2 export operations */
 #define ML_QUOTA	0x0000000040000000ULL /* ocfs2 quota operations */
 #define ML_BASTS	0x0000000100000000ULL /* dlmglue asts and basts */
-#define ML_RESERVATIONS	0x0000000200000000ULL /* ocfs2 alloc reservations */
 #define ML_CLUSTER	0x0000000400000000ULL /* cluster stack */
 
 /* bits that are infrequently given and frequently matched in the high word */
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 0fc2840d6fb1..1c040fcbd6e7 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -1809,6 +1809,122 @@ DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_reflink_xattr_buckets);
 DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_reflink_xattr_rec);
 
 /* End of trace events for fs/ocfs2/xattr.c. */
+
+/* Trace events for fs/ocfs2/reservations.c. */
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_resv_insert);
+
+DEFINE_OCFS2_ULL_UINT_UINT_UINT_EVENT(ocfs2_resmap_find_free_bits_begin);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_resmap_find_free_bits_end);
+
+TRACE_EVENT(ocfs2_resv_find_window_begin,
+	TP_PROTO(unsigned int r_start, unsigned int r_end, unsigned int goal,
+		 unsigned int wanted, int empty_root),
+	TP_ARGS(r_start, r_end, goal, wanted, empty_root),
+	TP_STRUCT__entry(
+		__field(unsigned int, r_start)
+		__field(unsigned int, r_end)
+		__field(unsigned int, goal)
+		__field(unsigned int, wanted)
+		__field(int, empty_root)
+	),
+	TP_fast_assign(
+		__entry->r_start = r_start;
+		__entry->r_end = r_end;
+		__entry->goal = goal;
+		__entry->wanted = wanted;
+		__entry->empty_root = empty_root;
+	),
+	TP_printk("%u %u %u %u %d", __entry->r_start, __entry->r_end,
+		  __entry->goal, __entry->wanted, __entry->empty_root)
+);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_resv_find_window_prev);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_resv_find_window_next);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_cannibalize_resv_begin);
+
+TRACE_EVENT(ocfs2_cannibalize_resv_end,
+	TP_PROTO(unsigned int start, unsigned int end, unsigned int len,
+		 unsigned int last_start, unsigned int last_len),
+	TP_ARGS(start, end, len, last_start, last_len),
+	TP_STRUCT__entry(
+		__field(unsigned int, start)
+		__field(unsigned int, end)
+		__field(unsigned int, len)
+		__field(unsigned int, last_start)
+		__field(unsigned int, last_len)
+	),
+	TP_fast_assign(
+		__entry->start = start;
+		__entry->end = end;
+		__entry->len = len;
+		__entry->last_start = last_start;
+		__entry->last_len = last_len;
+	),
+	TP_printk("%u %u %u %u %u", __entry->start, __entry->end,
+		  __entry->len, __entry->last_start, __entry->last_len)
+);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_resmap_resv_bits);
+
+TRACE_EVENT(ocfs2_resmap_claimed_bits_begin,
+	TP_PROTO(unsigned int cstart, unsigned int cend, unsigned int clen,
+		 unsigned int r_start, unsigned int r_end, unsigned int r_len,
+		 unsigned int last_start, unsigned int last_len),
+	TP_ARGS(cstart, cend, clen, r_start, r_end,
+		r_len, last_start, last_len),
+	TP_STRUCT__entry(
+		__field(unsigned int, cstart)
+		__field(unsigned int, cend)
+		__field(unsigned int, clen)
+		__field(unsigned int, r_start)
+		__field(unsigned int, r_end)
+		__field(unsigned int, r_len)
+		__field(unsigned int, last_start)
+		__field(unsigned int, last_len)
+	),
+	TP_fast_assign(
+		__entry->cstart = cstart;
+		__entry->cend = cend;
+		__entry->clen = clen;
+		__entry->r_start = r_start;
+		__entry->r_end = r_end;
+		__entry->r_len = r_len;
+		__entry->last_start = last_start;
+		__entry->last_len = last_len;
+	),
+	TP_printk("%u %u %u %u %u %u %u %u",
+		  __entry->cstart, __entry->cend, __entry->clen,
+		  __entry->r_start, __entry->r_end, __entry->r_len,
+		  __entry->last_start, __entry->last_len)
+);
+
+TRACE_EVENT(ocfs2_resmap_claimed_bits_end,
+	TP_PROTO(unsigned int start, unsigned int end, unsigned int len,
+		 unsigned int last_start, unsigned int last_len),
+	TP_ARGS(start, end, len, last_start, last_len),
+	TP_STRUCT__entry(
+		__field(unsigned int, start)
+		__field(unsigned int, end)
+		__field(unsigned int, len)
+		__field(unsigned int, last_start)
+		__field(unsigned int, last_len)
+	),
+	TP_fast_assign(
+		__entry->start = start;
+		__entry->end = end;
+		__entry->len = len;
+		__entry->last_start = last_start;
+		__entry->last_len = last_len;
+	),
+	TP_printk("%u %u %u %u %u", __entry->start, __entry->end,
+		  __entry->len, __entry->last_start, __entry->last_len)
+);
+
+/* End of trace events for fs/ocfs2/reservations.c. */
 #endif /* _TRACE_OCFS2_H */
 
 /* This part must be outside protection */
diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c
index 3e78db361bc7..41ffd36c689c 100644
--- a/fs/ocfs2/reservations.c
+++ b/fs/ocfs2/reservations.c
@@ -30,10 +30,10 @@
 #include <linux/bitops.h>
 #include <linux/list.h>
 
-#define MLOG_MASK_PREFIX ML_RESERVATIONS
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
+#include "ocfs2_trace.h"
 
 #ifdef CONFIG_OCFS2_DEBUG_FS
 #define OCFS2_CHECK_RESERVATIONS
@@ -321,8 +321,7 @@ static void ocfs2_resv_insert(struct ocfs2_reservation_map *resmap,
 
 	assert_spin_locked(&resv_lock);
 
-	mlog(0, "Insert reservation start: %u len: %u\n", new->r_start,
-	     new->r_len);
+	trace_ocfs2_resv_insert(new->r_start, new->r_len);
 
 	while (*p) {
 		parent = *p;
@@ -423,8 +422,8 @@ static int ocfs2_resmap_find_free_bits(struct ocfs2_reservation_map *resmap,
 	unsigned int best_start, best_len = 0;
 	int offset, start, found;
 
-	mlog(0, "Find %u bits within range (%u, len %u) resmap len: %u\n",
-	     wanted, search_start, search_len, resmap->m_bitmap_len);
+	trace_ocfs2_resmap_find_free_bits_begin(search_start, search_len,
+						wanted, resmap->m_bitmap_len);
 
 	found = best_start = best_len = 0;
 
@@ -463,7 +462,7 @@ static int ocfs2_resmap_find_free_bits(struct ocfs2_reservation_map *resmap,
 	*rlen = best_len;
 	*rstart = best_start;
 
-	mlog(0, "Found start: %u len: %u\n", best_start, best_len);
+	trace_ocfs2_resmap_find_free_bits_end(best_start, best_len);
 
 	return *rlen;
 }
@@ -487,9 +486,8 @@ static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
 	 * - our window should be last in all reservations
 	 * - need to make sure we don't go past end of bitmap
 	 */
-
-	mlog(0, "resv start: %u resv end: %u goal: %u wanted: %u\n",
-	     resv->r_start, ocfs2_resv_end(resv), goal, wanted);
+	trace_ocfs2_resv_find_window_begin(resv->r_start, ocfs2_resv_end(resv),
+					   goal, wanted, RB_EMPTY_ROOT(root));
 
 	assert_spin_locked(&resv_lock);
 
@@ -498,9 +496,6 @@ static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
 		 * Easiest case - empty tree. We can just take
 		 * whatever window of free bits we want.
 		 */
-
-		mlog(0, "Empty root\n");
-
 		clen = ocfs2_resmap_find_free_bits(resmap, wanted, goal,
 						   resmap->m_bitmap_len - goal,
 						   &cstart, &clen);
@@ -524,8 +519,6 @@ static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
 	prev_resv = ocfs2_find_resv_lhs(resmap, goal);
 
 	if (prev_resv == NULL) {
-		mlog(0, "Goal on LHS of leftmost window\n");
-
 		/*
 		 * A NULL here means that the search code couldn't
 		 * find a window that starts before goal.
@@ -570,13 +563,15 @@ static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
 		next_resv = NULL;
 	}
 
+	trace_ocfs2_resv_find_window_prev(prev_resv->r_start,
+					  ocfs2_resv_end(prev_resv));
+
 	prev = &prev_resv->r_node;
 
 	/* Now we do a linear search for a window, starting at 'prev_rsv' */
 	while (1) {
 		next = rb_next(prev);
 		if (next) {
-			mlog(0, "One more resv found in linear search\n");
 			next_resv = rb_entry(next,
 					     struct ocfs2_alloc_reservation,
 					     r_node);
@@ -585,7 +580,6 @@ static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
 			gap_end = next_resv->r_start - 1;
 			gap_len = gap_end - gap_start + 1;
 		} else {
-			mlog(0, "No next node\n");
 			/*
 			 * We're at the rightmost edge of the
 			 * tree. See if a reservation between this
@@ -596,6 +590,8 @@ static void __ocfs2_resv_find_window(struct ocfs2_reservation_map *resmap,
 			gap_end = resmap->m_bitmap_len - 1;
 		}
 
+		trace_ocfs2_resv_find_window_next(next ? next_resv->r_start: -1,
+					next ? ocfs2_resv_end(next_resv) : -1);
 		/*
 		 * No need to check this gap if we have already found
 		 * a larger region of free bits.
@@ -654,8 +650,9 @@ static void ocfs2_cannibalize_resv(struct ocfs2_reservation_map *resmap,
 	lru_resv = list_first_entry(&resmap->m_lru,
 				    struct ocfs2_alloc_reservation, r_lru);
 
-	mlog(0, "lru resv: start: %u len: %u end: %u\n", lru_resv->r_start,
-	     lru_resv->r_len, ocfs2_resv_end(lru_resv));
+	trace_ocfs2_cannibalize_resv_begin(lru_resv->r_start,
+					   lru_resv->r_len,
+					   ocfs2_resv_end(lru_resv));
 
 	/*
 	 * Cannibalize (some or all) of the target reservation and
@@ -684,10 +681,9 @@ static void ocfs2_cannibalize_resv(struct ocfs2_reservation_map *resmap,
 		resv->r_len = shrink;
 	}
 
-	mlog(0, "Reservation now looks like: r_start: %u r_end: %u "
-	     "r_len: %u r_last_start: %u r_last_len: %u\n",
-	     resv->r_start, ocfs2_resv_end(resv), resv->r_len,
-	     resv->r_last_start, resv->r_last_len);
+	trace_ocfs2_cannibalize_resv_end(resv->r_start, ocfs2_resv_end(resv),
+					 resv->r_len, resv->r_last_start,
+					 resv->r_last_len);
 
 	ocfs2_resv_insert(resmap, resv);
 }
@@ -748,7 +744,6 @@ int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap,
 		if ((resv->r_flags & OCFS2_RESV_FLAG_TMP) || wanted < *clen)
 			wanted = *clen;
 
-		mlog(0, "empty reservation, find new window\n");
 		/*
 		 * Try to get a window here. If it works, we must fall
 		 * through and test the bitmap . This avoids some
@@ -757,6 +752,7 @@ int ocfs2_resmap_resv_bits(struct ocfs2_reservation_map *resmap,
 		 * that inode.
 		 */
 		ocfs2_resv_find_window(resmap, resv, wanted);
+		trace_ocfs2_resmap_resv_bits(resv->r_start, resv->r_len);
 	}
 
 	BUG_ON(ocfs2_resv_empty(resv));
@@ -813,10 +809,10 @@ void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap,
 
 	spin_lock(&resv_lock);
 
-	mlog(0, "claim bits: cstart: %u cend: %u clen: %u r_start: %u "
-	     "r_end: %u r_len: %u, r_last_start: %u r_last_len: %u\n",
-	     cstart, cend, clen, resv->r_start, ocfs2_resv_end(resv),
-	     resv->r_len, resv->r_last_start, resv->r_last_len);
+	trace_ocfs2_resmap_claimed_bits_begin(cstart, cend, clen, resv->r_start,
+					      ocfs2_resv_end(resv), resv->r_len,
+					      resv->r_last_start,
+					      resv->r_last_len);
 
 	BUG_ON(cstart < resv->r_start);
 	BUG_ON(cstart > ocfs2_resv_end(resv));
@@ -833,10 +829,9 @@ void ocfs2_resmap_claimed_bits(struct ocfs2_reservation_map *resmap,
 	if (!ocfs2_resv_empty(resv))
 		ocfs2_resv_mark_lru(resmap, resv);
 
-	mlog(0, "Reservation now looks like: r_start: %u r_end: %u "
-	     "r_len: %u r_last_start: %u r_last_len: %u\n",
-	     resv->r_start, ocfs2_resv_end(resv), resv->r_len,
-	     resv->r_last_start, resv->r_last_len);
+	trace_ocfs2_resmap_claimed_bits_end(resv->r_start, ocfs2_resv_end(resv),
+					    resv->r_len, resv->r_last_start,
+					    resv->r_last_len);
 
 	ocfs2_check_resmap(resmap);
 
-- 
cgit v1.2.3


From 38877a43795d3a834fad827f3d6532ff649e5c45 Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Wed, 23 Feb 2011 22:12:48 +0800
Subject: ocfs2: Remove mlog(0) from quota_local.c.

Remove mlog(0) from fs/ocfs2/quota_local.c.

Signed-off-by: Tao Ma <boyu.mt@taobao.com>
---
 fs/ocfs2/ocfs2_trace.h | 10 ++++++++++
 fs/ocfs2/quota_local.c | 13 +++++++------
 2 files changed, 17 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 1c040fcbd6e7..dbd1812f5161 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -1925,6 +1925,16 @@ TRACE_EVENT(ocfs2_resmap_claimed_bits_end,
 );
 
 /* End of trace events for fs/ocfs2/reservations.c. */
+
+/* Trace events for fs/ocfs2/quota_local.c. */
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_recover_local_quota_file);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_finish_quota_recovery);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(olq_set_dquot);
+
+/* End of trace events for fs/ocfs2/quota_local.c. */
 #endif /* _TRACE_OCFS2_H */
 
 /* This part must be outside protection */
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 88ec4516637b..dc8007fc9247 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -8,7 +8,6 @@
 #include <linux/quotaops.h>
 #include <linux/module.h>
 
-#define MLOG_MASK_PREFIX ML_QUOTA
 #include <cluster/masklog.h>
 
 #include "ocfs2_fs.h"
@@ -23,6 +22,7 @@
 #include "quota.h"
 #include "uptodate.h"
 #include "super.h"
+#include "ocfs2_trace.h"
 
 /* Number of local quota structures per block */
 static inline unsigned int ol_quota_entries_per_block(struct super_block *sb)
@@ -475,7 +475,7 @@ static int ocfs2_recover_local_quota_file(struct inode *lqinode,
 	struct ocfs2_recovery_chunk *rchunk, *next;
 	qsize_t spacechange, inodechange;
 
-	mlog(0, "ino=%lu type=%u", (unsigned long)lqinode->i_ino, type);
+	trace_ocfs2_recover_local_quota_file((unsigned long)lqinode->i_ino, type);
 
 	list_for_each_entry_safe(rchunk, next, &(rec->r_list[type]), rc_list) {
 		chunk = rchunk->rc_chunk;
@@ -601,7 +601,7 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
 	for (type = 0; type < MAXQUOTAS; type++) {
 		if (list_empty(&(rec->r_list[type])))
 			continue;
-		mlog(0, "Recovering quota in slot %d\n", slot_num);
+		trace_ocfs2_finish_quota_recovery(slot_num);
 		lqinode = ocfs2_get_system_file_inode(osb, ino[type], slot_num);
 		if (!lqinode) {
 			status = -ENOENT;
@@ -883,9 +883,10 @@ static void olq_set_dquot(struct buffer_head *bh, void *private)
 	dqblk->dqb_inodemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curinodes -
 					  od->dq_originodes);
 	spin_unlock(&dq_data_lock);
-	mlog(0, "Writing local dquot %u space %lld inodes %lld\n",
-	     od->dq_dquot.dq_id, (long long)le64_to_cpu(dqblk->dqb_spacemod),
-	     (long long)le64_to_cpu(dqblk->dqb_inodemod));
+	trace_olq_set_dquot(
+		(unsigned long long)le64_to_cpu(dqblk->dqb_spacemod),
+		(unsigned long long)le64_to_cpu(dqblk->dqb_inodemod),
+		od->dq_dquot.dq_id);
 }
 
 /* Write dquot to local quota file */
-- 
cgit v1.2.3


From 1db986a839d8acf8ad809c876f1154f2e672d3af Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Wed, 23 Feb 2011 22:19:12 +0800
Subject: ocfs2: Remove masklog ML_QUOTA.

Remove mlog(0) from fs/ocfs2/quota_global.c and
the masklog QUOTA.

Signed-off-by: Tao Ma <boyu.mt@taobao.com>
---
 fs/ocfs2/cluster/masklog.c |  1 -
 fs/ocfs2/cluster/masklog.h |  1 -
 fs/ocfs2/ocfs2_trace.h     | 78 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/ocfs2/quota_global.c    | 24 +++++++-------
 4 files changed, 90 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 39f2a46810d1..90ed4c0ce180 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -102,7 +102,6 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
 	define_mask(CONN),
 	define_mask(QUORUM),
 	define_mask(EXPORT),
-	define_mask(QUOTA),
 	define_mask(BASTS),
 	define_mask(CLUSTER),
 	define_mask(ERROR),
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index c36b4d8d5aa9..0e65bae257d6 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -104,7 +104,6 @@
 #define ML_CONN		0x0000000004000000ULL /* net connection management */
 #define ML_QUORUM	0x0000000008000000ULL /* net connection quorum */
 #define ML_EXPORT	0x0000000010000000ULL /* ocfs2 export operations */
-#define ML_QUOTA	0x0000000040000000ULL /* ocfs2 quota operations */
 #define ML_BASTS	0x0000000100000000ULL /* dlmglue asts and basts */
 #define ML_CLUSTER	0x0000000400000000ULL /* cluster stack */
 
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index dbd1812f5161..0f7165914401 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -110,6 +110,25 @@ DEFINE_EVENT(ocfs2__int_int, name,	\
 	TP_PROTO(int val1, int val2),	\
 	TP_ARGS(val1, val2))
 
+DECLARE_EVENT_CLASS(ocfs2__uint_int,
+	TP_PROTO(unsigned int value1, int value2),
+	TP_ARGS(value1, value2),
+	TP_STRUCT__entry(
+		__field(unsigned int, value1)
+		__field(int, value2)
+	),
+	TP_fast_assign(
+		__entry->value1	= value1;
+		__entry->value2	= value2;
+	),
+	TP_printk("%u %d", __entry->value1, __entry->value2)
+);
+
+#define DEFINE_OCFS2_UINT_INT_EVENT(name)	\
+DEFINE_EVENT(ocfs2__uint_int, name,	\
+	TP_PROTO(unsigned int val1, int val2),	\
+	TP_ARGS(val1, val2))
+
 DECLARE_EVENT_CLASS(ocfs2__uint_uint,
 	TP_PROTO(unsigned int value1, unsigned int value2),
 	TP_ARGS(value1, value2),
@@ -1935,6 +1954,65 @@ DEFINE_OCFS2_INT_EVENT(ocfs2_finish_quota_recovery);
 DEFINE_OCFS2_ULL_ULL_UINT_EVENT(olq_set_dquot);
 
 /* End of trace events for fs/ocfs2/quota_local.c. */
+
+/* Trace events for fs/ocfs2/quota_global.c. */
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_quota_block);
+
+TRACE_EVENT(ocfs2_sync_dquot,
+	TP_PROTO(unsigned int dq_id, long long dqb_curspace,
+		 long long spacechange, long long curinodes,
+		 long long inodechange),
+	TP_ARGS(dq_id, dqb_curspace, spacechange, curinodes, inodechange),
+	TP_STRUCT__entry(
+		__field(unsigned int, dq_id)
+		__field(long long, dqb_curspace)
+		__field(long long, spacechange)
+		__field(long long, curinodes)
+		__field(long long, inodechange)
+	),
+	TP_fast_assign(
+		__entry->dq_id = dq_id;
+		__entry->dqb_curspace = dqb_curspace;
+		__entry->spacechange = spacechange;
+		__entry->curinodes = curinodes;
+		__entry->inodechange = inodechange;
+	),
+	TP_printk("%u %lld %lld %lld %lld", __entry->dq_id,
+		  __entry->dqb_curspace, __entry->spacechange,
+		  __entry->curinodes, __entry->inodechange)
+);
+
+TRACE_EVENT(ocfs2_sync_dquot_helper,
+	TP_PROTO(unsigned int dq_id, unsigned int dq_type, unsigned long type,
+		 const char *s_id),
+	TP_ARGS(dq_id, dq_type, type, s_id),
+
+	TP_STRUCT__entry(
+		__field(unsigned int, dq_id)
+		__field(unsigned int, dq_type)
+		__field(unsigned long, type)
+		__string(s_id, s_id)
+	),
+	TP_fast_assign(
+		__entry->dq_id = dq_id;
+		__entry->dq_type = dq_type;
+		__entry->type = type;
+		__assign_str(s_id, s_id);
+	),
+	TP_printk("%u %u %lu %s", __entry->dq_id, __entry->dq_type,
+		  __entry->type, __get_str(s_id))
+);
+
+DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_write_dquot);
+
+DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_release_dquot);
+
+DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_acquire_dquot);
+
+DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_mark_dquot_dirty);
+
+/* End of trace events for fs/ocfs2/quota_global.c. */
 #endif /* _TRACE_OCFS2_H */
 
 /* This part must be outside protection */
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 10a49e8a5f03..64346f836c09 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -11,7 +11,6 @@
 #include <linux/writeback.h>
 #include <linux/workqueue.h>
 
-#define MLOG_MASK_PREFIX ML_QUOTA
 #include <cluster/masklog.h>
 
 #include "ocfs2_fs.h"
@@ -27,6 +26,7 @@
 #include "super.h"
 #include "buffer_head_io.h"
 #include "quota.h"
+#include "ocfs2_trace.h"
 
 /*
  * Locking of quotas with OCFS2 is rather complex. Here are rules that
@@ -132,8 +132,7 @@ int ocfs2_validate_quota_block(struct super_block *sb, struct buffer_head *bh)
 	struct ocfs2_disk_dqtrailer *dqt =
 		ocfs2_block_dqtrailer(sb->s_blocksize, bh->b_data);
 
-	mlog(0, "Validating quota block %llu\n",
-	     (unsigned long long)bh->b_blocknr);
+	trace_ocfs2_validate_quota_block((unsigned long long)bh->b_blocknr);
 
 	BUG_ON(!buffer_uptodate(bh));
 
@@ -509,9 +508,10 @@ int __ocfs2_sync_dquot(struct dquot *dquot, int freeing)
 	olditime = dquot->dq_dqb.dqb_itime;
 	oldbtime = dquot->dq_dqb.dqb_btime;
 	ocfs2_global_disk2memdqb(dquot, &dqblk);
-	mlog(0, "Syncing global dquot %u space %lld+%lld, inodes %lld+%lld\n",
-	     dquot->dq_id, dquot->dq_dqb.dqb_curspace, (long long)spacechange,
-	     dquot->dq_dqb.dqb_curinodes, (long long)inodechange);
+	trace_ocfs2_sync_dquot(dquot->dq_id, dquot->dq_dqb.dqb_curspace,
+			       (long long)spacechange,
+			       dquot->dq_dqb.dqb_curinodes,
+			       (long long)inodechange);
 	if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags))
 		dquot->dq_dqb.dqb_curspace += spacechange;
 	if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags))
@@ -595,8 +595,8 @@ static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type)
 	struct ocfs2_super *osb = OCFS2_SB(sb);
 	int status = 0;
 
-	mlog(0, "id=%u qtype=%u type=%lu device=%s\n", dquot->dq_id,
-		   dquot->dq_type, type, sb->s_id);
+	trace_ocfs2_sync_dquot_helper(dquot->dq_id, dquot->dq_type,
+				      type, sb->s_id);
 	if (type != dquot->dq_type)
 		goto out;
 	status = ocfs2_lock_global_qf(oinfo, 1);
@@ -647,7 +647,7 @@ static int ocfs2_write_dquot(struct dquot *dquot)
 	struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
 	int status = 0;
 
-	mlog(0, "id=%u, type=%d", dquot->dq_id, dquot->dq_type);
+	trace_ocfs2_write_dquot(dquot->dq_id, dquot->dq_type);
 
 	handle = ocfs2_start_trans(osb, OCFS2_QWRITE_CREDITS);
 	if (IS_ERR(handle)) {
@@ -685,7 +685,7 @@ static int ocfs2_release_dquot(struct dquot *dquot)
 	struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
 	int status = 0;
 
-	mlog(0, "id=%u, type=%d", dquot->dq_id, dquot->dq_type);
+	trace_ocfs2_release_dquot(dquot->dq_id, dquot->dq_type);
 
 	mutex_lock(&dquot->dq_lock);
 	/* Check whether we are not racing with some other dqget() */
@@ -743,7 +743,7 @@ static int ocfs2_acquire_dquot(struct dquot *dquot)
 	int need_alloc = ocfs2_global_qinit_alloc(sb, type);
 	handle_t *handle;
 
-	mlog(0, "id=%u, type=%d", dquot->dq_id, type);
+	trace_ocfs2_acquire_dquot(dquot->dq_id, type);
 	mutex_lock(&dquot->dq_lock);
 	/*
 	 * We need an exclusive lock, because we're going to update use count
@@ -830,7 +830,7 @@ static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
 	handle_t *handle;
 	struct ocfs2_super *osb = OCFS2_SB(sb);
 
-	mlog(0, "id=%u, type=%d", dquot->dq_id, type);
+	trace_ocfs2_mark_dquot_dirty(dquot->dq_id, type);
 
 	/* In case user set some limits, sync dquot immediately to global
 	 * quota file so that information propagates quicker */
-- 
cgit v1.2.3


From f32037d9bda37ca9a7a179cc1546033d374a8541 Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Mon, 21 Feb 2011 11:19:02 +0800
Subject: ocfs2: remove NAMEI from symlink.c

No mlog(0,...) in symlink.c, so just remove NAMEI.

Signed-off-by: Tao Ma <boyu.mt@taobao.com>
---
 fs/ocfs2/symlink.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index 5485b19ec8f2..5d22872e2bb3 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -40,7 +40,6 @@
 #include <linux/pagemap.h>
 #include <linux/namei.h>
 
-#define MLOG_MASK_PREFIX ML_NAMEI
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
-- 
cgit v1.2.3


From f1088d471f8d846e817eb4d673212d18d9820c61 Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Wed, 23 Feb 2011 22:30:23 +0800
Subject: ocfs2: Remove mlog(0) from fs/ocfs2/dir.c

This is the 2nd step to remove the debug info of NAMEI.

Signed-off-by: Tao Ma <boyu.mt@taobao.com>
---
 fs/ocfs2/dir.c         |  83 ++++++++++++++++-------------------
 fs/ocfs2/ocfs2_trace.h | 116 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 153 insertions(+), 46 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 2fc6c3673bea..0740752ca31e 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -43,7 +43,6 @@
 #include <linux/quotaops.h>
 #include <linux/sort.h>
 
-#define MLOG_MASK_PREFIX ML_NAMEI
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
@@ -61,6 +60,7 @@
 #include "super.h"
 #include "sysfile.h"
 #include "uptodate.h"
+#include "ocfs2_trace.h"
 
 #include "buffer_head_io.h"
 
@@ -400,7 +400,7 @@ static int inline ocfs2_search_dirblock(struct buffer_head *bh,
 	}
 
 bail:
-	mlog(0, "ret = %d\n", ret);
+	trace_ocfs2_search_dirblock(ret);
 	return ret;
 }
 
@@ -445,8 +445,7 @@ static int ocfs2_validate_dir_block(struct super_block *sb,
 	 * We don't validate dirents here, that's handled
 	 * in-place when the code walks them.
 	 */
-	mlog(0, "Validating dirblock %llu\n",
-	     (unsigned long long)bh->b_blocknr);
+	trace_ocfs2_validate_dir_block((unsigned long long)bh->b_blocknr);
 
 	BUG_ON(!buffer_uptodate(bh));
 
@@ -784,7 +783,7 @@ cleanup_and_exit:
 	for (; ra_ptr < ra_max; ra_ptr++)
 		brelse(bh_use[ra_ptr]);
 
-	mlog(0, "ret = %p\n", ret);
+	trace_ocfs2_find_entry_el(ret);
 	return ret;
 }
 
@@ -946,11 +945,9 @@ static int ocfs2_dx_dir_search(const char *name, int namelen,
 		goto out;
 	}
 
-	mlog(0, "Dir %llu: name: \"%.*s\", lookup of hash: %u.0x%x "
-	     "returns: %llu\n",
-	     (unsigned long long)OCFS2_I(dir)->ip_blkno,
-	     namelen, name, hinfo->major_hash, hinfo->minor_hash,
-	     (unsigned long long)phys);
+	trace_ocfs2_dx_dir_search((unsigned long long)OCFS2_I(dir)->ip_blkno,
+				  namelen, name, hinfo->major_hash,
+				  hinfo->minor_hash, (unsigned long long)phys);
 
 	ret = ocfs2_read_dx_leaf(dir, phys, &dx_leaf_bh);
 	if (ret) {
@@ -960,9 +957,9 @@ static int ocfs2_dx_dir_search(const char *name, int namelen,
 
 	dx_leaf = (struct ocfs2_dx_leaf *) dx_leaf_bh->b_data;
 
-	mlog(0, "leaf info: num_used: %d, count: %d\n",
-	     le16_to_cpu(dx_leaf->dl_list.de_num_used),
-	     le16_to_cpu(dx_leaf->dl_list.de_count));
+	trace_ocfs2_dx_dir_search_leaf_info(
+			le16_to_cpu(dx_leaf->dl_list.de_num_used),
+			le16_to_cpu(dx_leaf->dl_list.de_count));
 
 	entry_list = &dx_leaf->dl_list;
 
@@ -1162,8 +1159,6 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
 	int i, status = -ENOENT;
 	ocfs2_journal_access_func access = ocfs2_journal_access_db;
 
-	mlog(0, "(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh);
-
 	if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
 		access = ocfs2_journal_access_di;
 
@@ -1343,8 +1338,8 @@ static int ocfs2_delete_entry_dx(handle_t *handle, struct inode *dir,
 		}
 	}
 
-	mlog(0, "Dir %llu: delete entry at index: %d\n",
-	     (unsigned long long)OCFS2_I(dir)->ip_blkno, index);
+	trace_ocfs2_delete_entry_dx((unsigned long long)OCFS2_I(dir)->ip_blkno,
+				    index);
 
 	ret = __ocfs2_delete_entry(handle, dir, lookup->dl_entry,
 				   leaf_bh, leaf_bh->b_data, leaf_bh->b_size);
@@ -2022,8 +2017,7 @@ int ocfs2_readdir(struct file * filp, void * dirent, filldir_t filldir)
 	struct inode *inode = filp->f_path.dentry->d_inode;
 	int lock_level = 0;
 
-	mlog(0, "dirino=%llu\n",
-	     (unsigned long long)OCFS2_I(inode)->ip_blkno);
+	trace_ocfs2_readdir((unsigned long long)OCFS2_I(inode)->ip_blkno);
 
 	error = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level);
 	if (lock_level && error >= 0) {
@@ -2064,8 +2058,8 @@ int ocfs2_find_files_on_disk(const char *name,
 {
 	int status = -ENOENT;
 
-	mlog(0, "name=%.*s, blkno=%p, inode=%llu\n", namelen, name, blkno,
-	     (unsigned long long)OCFS2_I(inode)->ip_blkno);
+	trace_ocfs2_find_files_on_disk(namelen, name, blkno,
+				(unsigned long long)OCFS2_I(inode)->ip_blkno);
 
 	status = ocfs2_find_entry(name, namelen, inode, lookup);
 	if (status)
@@ -2109,8 +2103,8 @@ int ocfs2_check_dir_for_entry(struct inode *dir,
 	int ret;
 	struct ocfs2_dir_lookup_result lookup = { NULL, };
 
-	mlog(0, "dir %llu, name '%.*s'\n",
-	     (unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name);
+	trace_ocfs2_check_dir_for_entry(
+		(unsigned long long)OCFS2_I(dir)->ip_blkno, namelen, name);
 
 	ret = -EEXIST;
 	if (ocfs2_find_entry(name, namelen, dir, &lookup) == 0)
@@ -2402,9 +2396,9 @@ static int ocfs2_dx_dir_attach_index(struct ocfs2_super *osb,
 		goto out;
 	}
 
-	mlog(0, "Dir %llu, attach new index block: %llu\n",
-	     (unsigned long long)OCFS2_I(dir)->ip_blkno,
-	     (unsigned long long)dr_blkno);
+	trace_ocfs2_dx_dir_attach_index(
+				(unsigned long long)OCFS2_I(dir)->ip_blkno,
+				(unsigned long long)dr_blkno);
 
 	dx_root_bh = sb_getblk(osb->sb, dr_blkno);
 	if (dx_root_bh == NULL) {
@@ -2504,11 +2498,10 @@ static int ocfs2_dx_dir_format_cluster(struct ocfs2_super *osb,
 		dx_leaf->dl_list.de_count =
 			cpu_to_le16(ocfs2_dx_entries_per_leaf(osb->sb));
 
-		mlog(0,
-		     "Dir %llu, format dx_leaf: %llu, entry count: %u\n",
-		     (unsigned long long)OCFS2_I(dir)->ip_blkno,
-		     (unsigned long long)bh->b_blocknr,
-		     le16_to_cpu(dx_leaf->dl_list.de_count));
+		trace_ocfs2_dx_dir_format_cluster(
+				(unsigned long long)OCFS2_I(dir)->ip_blkno,
+				(unsigned long long)bh->b_blocknr,
+				le16_to_cpu(dx_leaf->dl_list.de_count));
 
 		ocfs2_journal_dirty(handle, bh);
 	}
@@ -2752,12 +2745,11 @@ static void ocfs2_dx_dir_index_root_block(struct inode *dir,
 
 		ocfs2_dx_dir_name_hash(dir, de->name, de->name_len, &hinfo);
 
-		mlog(0,
-		     "dir: %llu, major: 0x%x minor: 0x%x, index: %u, name: %.*s\n",
-		     (unsigned long long)dir->i_ino, hinfo.major_hash,
-		     hinfo.minor_hash,
-		     le16_to_cpu(dx_root->dr_entries.de_num_used),
-		     de->name_len, de->name);
+		trace_ocfs2_dx_dir_index_root_block(
+				(unsigned long long)dir->i_ino,
+				hinfo.major_hash, hinfo.minor_hash,
+				de->name_len, de->name,
+				le16_to_cpu(dx_root->dr_entries.de_num_used));
 
 		ocfs2_dx_entry_list_insert(&dx_root->dr_entries, &hinfo,
 					   dirent_blk);
@@ -3310,8 +3302,8 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 	down_write(&OCFS2_I(dir)->ip_alloc_sem);
 	drop_alloc_sem = 1;
 	dir_i_size = i_size_read(dir);
-	mlog(0, "extending dir %llu (i_size = %lld)\n",
-	     (unsigned long long)OCFS2_I(dir)->ip_blkno, dir_i_size);
+	trace_ocfs2_extend_dir((unsigned long long)OCFS2_I(dir)->ip_blkno,
+			       dir_i_size);
 
 	/* dir->i_size is always block aligned. */
 	spin_lock(&OCFS2_I(dir)->ip_lock);
@@ -3805,9 +3797,9 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
 	struct ocfs2_dx_root_block *dx_root;
 	struct ocfs2_dx_leaf *tmp_dx_leaf = NULL;
 
-	mlog(0, "DX Dir: %llu, rebalance leaf leaf_blkno: %llu insert: %u\n",
-	     (unsigned long long)OCFS2_I(dir)->ip_blkno,
-	     (unsigned long long)leaf_blkno, insert_hash);
+	trace_ocfs2_dx_dir_rebalance((unsigned long long)OCFS2_I(dir)->ip_blkno,
+				     (unsigned long long)leaf_blkno,
+				     insert_hash);
 
 	ocfs2_init_dx_root_extent_tree(&et, INODE_CACHE(dir), dx_root_bh);
 
@@ -3887,8 +3879,7 @@ static int ocfs2_dx_dir_rebalance(struct ocfs2_super *osb, struct inode *dir,
 		goto  out_commit;
 	}
 
-	mlog(0, "Split leaf (%u) at %u, insert major hash is %u\n",
-	     leaf_cpos, split_hash, insert_hash);
+	trace_ocfs2_dx_dir_rebalance_split(leaf_cpos, split_hash, insert_hash);
 
 	/*
 	 * We have to carefully order operations here. There are items
@@ -4345,8 +4336,8 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
 	unsigned int blocks_wanted = 1;
 	struct buffer_head *bh = NULL;
 
-	mlog(0, "getting ready to insert namelen %d into dir %llu\n",
-	     namelen, (unsigned long long)OCFS2_I(dir)->ip_blkno);
+	trace_ocfs2_prepare_dir_for_insert(
+		(unsigned long long)OCFS2_I(dir)->ip_blkno, namelen);
 
 	if (!namelen) {
 		ret = -EINVAL;
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 0f7165914401..25e057e9d14d 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -2013,6 +2013,122 @@ DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_acquire_dquot);
 DEFINE_OCFS2_UINT_INT_EVENT(ocfs2_mark_dquot_dirty);
 
 /* End of trace events for fs/ocfs2/quota_global.c. */
+
+/* Trace events for fs/ocfs2/dir.c. */
+DEFINE_OCFS2_INT_EVENT(ocfs2_search_dirblock);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_validate_dir_block);
+
+DEFINE_OCFS2_POINTER_EVENT(ocfs2_find_entry_el);
+
+TRACE_EVENT(ocfs2_dx_dir_search,
+	TP_PROTO(unsigned long long ino, int namelen, const char *name,
+		 unsigned int major_hash, unsigned int minor_hash,
+		 unsigned long long blkno),
+	TP_ARGS(ino, namelen, name, major_hash, minor_hash, blkno),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ino)
+		__field(int, namelen)
+		__string(name, name)
+		__field(unsigned int, major_hash)
+		__field(unsigned int,minor_hash)
+		__field(unsigned long long, blkno)
+	),
+	TP_fast_assign(
+		__entry->ino = ino;
+		__entry->namelen = namelen;
+		__assign_str(name, name);
+		__entry->major_hash = major_hash;
+		__entry->minor_hash = minor_hash;
+		__entry->blkno = blkno;
+	),
+	TP_printk("%llu %.*s %u %u %llu", __entry->ino,
+		   __entry->namelen, __get_str(name),
+		  __entry->major_hash, __entry->minor_hash, __entry->blkno)
+);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_dx_dir_search_leaf_info);
+
+DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_delete_entry_dx);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_readdir);
+
+TRACE_EVENT(ocfs2_find_files_on_disk,
+	TP_PROTO(int namelen, const char *name, void *blkno,
+		 unsigned long long dir),
+	TP_ARGS(namelen, name, blkno, dir),
+	TP_STRUCT__entry(
+		__field(int, namelen)
+		__string(name, name)
+		__field(void *, blkno)
+		__field(unsigned long long, dir)
+	),
+	TP_fast_assign(
+		__entry->namelen = namelen;
+		__assign_str(name, name);
+		__entry->blkno = blkno;
+		__entry->dir = dir;
+	),
+	TP_printk("%.*s %p %llu", __entry->namelen, __get_str(name),
+		  __entry->blkno, __entry->dir)
+);
+
+TRACE_EVENT(ocfs2_check_dir_for_entry,
+	TP_PROTO(unsigned long long dir, int namelen, const char *name),
+	TP_ARGS(dir, namelen, name),
+	TP_STRUCT__entry(
+		__field(unsigned long long, dir)
+		__field(int, namelen)
+		__string(name, name)
+	),
+	TP_fast_assign(
+		__entry->dir = dir;
+		__entry->namelen = namelen;
+		__assign_str(name, name);
+	),
+	TP_printk("%llu %.*s", __entry->dir,
+		  __entry->namelen, __get_str(name))
+);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_dx_dir_attach_index);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_dx_dir_format_cluster);
+
+TRACE_EVENT(ocfs2_dx_dir_index_root_block,
+	TP_PROTO(unsigned long long dir,
+		 unsigned int major_hash, unsigned int minor_hash,
+		 int namelen, const char *name, unsigned int num_used),
+	TP_ARGS(dir, major_hash, minor_hash, namelen, name, num_used),
+	TP_STRUCT__entry(
+		__field(unsigned long long, dir)
+		__field(unsigned int, major_hash)
+		__field(unsigned int, minor_hash)
+		__field(int, namelen)
+		__string(name, name)
+		__field(unsigned int, num_used)
+	),
+	TP_fast_assign(
+		__entry->dir = dir;
+		__entry->major_hash = major_hash;
+		__entry->minor_hash = minor_hash;
+		__entry->namelen = namelen;
+		__assign_str(name, name);
+		__entry->num_used = num_used;
+	),
+	TP_printk("%llu %x %x %.*s %u", __entry->dir,
+		  __entry->major_hash, __entry->minor_hash,
+		   __entry->namelen, __get_str(name), __entry->num_used)
+);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_extend_dir);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_dx_dir_rebalance);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_dx_dir_rebalance_split);
+
+DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_prepare_dir_for_insert);
+
+/* End of trace events for fs/ocfs2/dir.c. */
 #endif /* _TRACE_OCFS2_H */
 
 /* This part must be outside protection */
-- 
cgit v1.2.3


From 8990e44a2bceaa68018fb13000e3a4497d78a53c Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Wed, 23 Feb 2011 22:45:26 +0800
Subject: ocfs2: Remove masklog ML_NAMEI.

Remove mlog(0) from fs/ocfs2/namei.c and the masklog NAMEI finally.

Signed-off-by: Tao Ma <boyu.mt@taobao.com>
---
 fs/ocfs2/cluster/masklog.c |   1 -
 fs/ocfs2/cluster/masklog.h |   1 -
 fs/ocfs2/namei.c           | 126 +++++++++++-------------
 fs/ocfs2/ocfs2_trace.h     | 235 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 292 insertions(+), 71 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 90ed4c0ce180..0668eb6e05d6 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -96,7 +96,6 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
 	define_mask(DLM_GLUE),
 	define_mask(BH_IO),
 	define_mask(UPTODATE),
-	define_mask(NAMEI),
 	define_mask(VOTE),
 	define_mask(DCACHE),
 	define_mask(CONN),
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 0e65bae257d6..a815b55d87a7 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -98,7 +98,6 @@
 #define ML_DLM_GLUE	0x0000000000080000ULL /* ocfs2 dlm glue layer */
 #define ML_BH_IO	0x0000000000100000ULL /* ocfs2 buffer I/O */
 #define ML_UPTODATE	0x0000000000200000ULL /* ocfs2 caching sequence #'s */
-#define ML_NAMEI	0x0000000000400000ULL /* ocfs2 directory / namespace */
 #define ML_VOTE		0x0000000001000000ULL /* ocfs2 node messaging  */
 #define ML_DCACHE	0x0000000002000000ULL /* ocfs2 dcache operations */
 #define ML_CONN		0x0000000004000000ULL /* net connection management */
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 8a13226eb1a2..b6fa9eb2d298 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -42,7 +42,6 @@
 #include <linux/highmem.h>
 #include <linux/quotaops.h>
 
-#define MLOG_MASK_PREFIX ML_NAMEI
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
@@ -63,6 +62,7 @@
 #include "uptodate.h"
 #include "xattr.h"
 #include "acl.h"
+#include "ocfs2_trace.h"
 
 #include "buffer_head_io.h"
 
@@ -106,17 +106,15 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
 	struct dentry *ret;
 	struct ocfs2_inode_info *oi;
 
-	mlog(0, "(0x%p, 0x%p, '%.*s')\n", dir, dentry,
-	     dentry->d_name.len, dentry->d_name.name);
+	trace_ocfs2_lookup(dir, dentry, dentry->d_name.len,
+			   dentry->d_name.name,
+			   (unsigned long long)OCFS2_I(dir)->ip_blkno, 0);
 
 	if (dentry->d_name.len > OCFS2_MAX_FILENAME_LEN) {
 		ret = ERR_PTR(-ENAMETOOLONG);
 		goto bail;
 	}
 
-	mlog(0, "find name %.*s in directory %llu\n", dentry->d_name.len,
-	     dentry->d_name.name, (unsigned long long)OCFS2_I(dir)->ip_blkno);
-
 	status = ocfs2_inode_lock_nested(dir, NULL, 0, OI_LS_PARENT);
 	if (status < 0) {
 		if (status != -ENOENT)
@@ -182,7 +180,7 @@ bail_unlock:
 
 bail:
 
-	mlog(0, "%p\n", ret);
+	trace_ocfs2_lookup_ret(ret);
 
 	return ret;
 }
@@ -235,9 +233,9 @@ static int ocfs2_mknod(struct inode *dir,
 	sigset_t oldset;
 	int did_block_signals = 0;
 
-	mlog(0, "(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
-	     (unsigned long)dev, dentry->d_name.len,
-	     dentry->d_name.name);
+	trace_ocfs2_mknod(dir, dentry, dentry->d_name.len, dentry->d_name.name,
+			  (unsigned long long)OCFS2_I(dir)->ip_blkno,
+			  (unsigned long)dev, mode);
 
 	dquot_initialize(dir);
 
@@ -354,10 +352,6 @@ static int ocfs2_mknod(struct inode *dir,
 		goto leave;
 	did_quota_inode = 1;
 
-	mlog(0, "(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
-	     inode->i_mode, (unsigned long)dev, dentry->d_name.len,
-	     dentry->d_name.name);
-
 	/* do the real work now. */
 	status = ocfs2_mknod_locked(osb, dir, inode, dev,
 				    &new_fe_bh, parent_fe_bh, handle,
@@ -436,9 +430,6 @@ leave:
 	if (did_block_signals)
 		ocfs2_unblock_signals(&oldset);
 
-	if (status == -ENOSPC)
-		mlog(0, "Disk is full\n");
-
 	brelse(new_fe_bh);
 	brelse(parent_fe_bh);
 	kfree(si.name);
@@ -617,8 +608,8 @@ static int ocfs2_mkdir(struct inode *dir,
 {
 	int ret;
 
-	mlog(0, "(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode,
-	     dentry->d_name.len, dentry->d_name.name);
+	trace_ocfs2_mkdir(dir, dentry, dentry->d_name.len, dentry->d_name.name,
+			  OCFS2_I(dir)->ip_blkno, mode);
 	ret = ocfs2_mknod(dir, dentry, mode | S_IFDIR, 0);
 	if (ret)
 		mlog_errno(ret);
@@ -633,8 +624,8 @@ static int ocfs2_create(struct inode *dir,
 {
 	int ret;
 
-	mlog(0, "(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry, mode,
-	     dentry->d_name.len, dentry->d_name.name);
+	trace_ocfs2_create(dir, dentry, dentry->d_name.len, dentry->d_name.name,
+			   (unsigned long long)OCFS2_I(dir)->ip_blkno, mode);
 	ret = ocfs2_mknod(dir, dentry, mode | S_IFREG, 0);
 	if (ret)
 		mlog_errno(ret);
@@ -656,9 +647,9 @@ static int ocfs2_link(struct dentry *old_dentry,
 	struct ocfs2_dir_lookup_result lookup = { NULL, };
 	sigset_t oldset;
 
-	mlog(0, "(inode=%lu, old='%.*s' new='%.*s')\n", inode->i_ino,
-	     old_dentry->d_name.len, old_dentry->d_name.name,
-	     dentry->d_name.len, dentry->d_name.name);
+	trace_ocfs2_link((unsigned long long)OCFS2_I(inode)->ip_blkno,
+			 old_dentry->d_name.len, old_dentry->d_name.name,
+			 dentry->d_name.len, dentry->d_name.name);
 
 	if (S_ISDIR(inode->i_mode))
 		return -EPERM;
@@ -814,19 +805,17 @@ static int ocfs2_unlink(struct inode *dir,
 	struct ocfs2_dir_lookup_result lookup = { NULL, };
 	struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
 
-	mlog(0, "(0x%p, 0x%p, '%.*s')\n", dir, dentry,
-	     dentry->d_name.len, dentry->d_name.name);
+	trace_ocfs2_unlink(dir, dentry, dentry->d_name.len,
+			   dentry->d_name.name,
+			   (unsigned long long)OCFS2_I(dir)->ip_blkno,
+			   (unsigned long long)OCFS2_I(inode)->ip_blkno);
 
 	dquot_initialize(dir);
 
 	BUG_ON(dentry->d_parent->d_inode != dir);
 
-	mlog(0, "ino = %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno);
-
-	if (inode == osb->root_inode) {
-		mlog(0, "Cannot delete the root directory\n");
+	if (inode == osb->root_inode)
 		return -EPERM;
-	}
 
 	status = ocfs2_inode_lock_nested(dir, &parent_node_bh, 1,
 					 OI_LS_PARENT);
@@ -848,9 +837,10 @@ static int ocfs2_unlink(struct inode *dir,
 	if (OCFS2_I(inode)->ip_blkno != blkno) {
 		status = -ENOENT;
 
-		mlog(0, "ip_blkno %llu != dirent blkno %llu ip_flags = %x\n",
-		     (unsigned long long)OCFS2_I(inode)->ip_blkno,
-		     (unsigned long long)blkno, OCFS2_I(inode)->ip_flags);
+		trace_ocfs2_unlink_noent(
+				(unsigned long long)OCFS2_I(inode)->ip_blkno,
+				(unsigned long long)blkno,
+				OCFS2_I(inode)->ip_flags);
 		goto leave;
 	}
 
@@ -981,9 +971,8 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
 	struct buffer_head **tmpbh;
 	struct inode *tmpinode;
 
-	mlog(0, "(inode1 = %llu, inode2 = %llu)\n",
-	     (unsigned long long)oi1->ip_blkno,
-	     (unsigned long long)oi2->ip_blkno);
+	trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno,
+				(unsigned long long)oi2->ip_blkno);
 
 	if (*bh1)
 		*bh1 = NULL;
@@ -994,7 +983,6 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
 	if (oi1->ip_blkno != oi2->ip_blkno) {
 		if (oi1->ip_blkno < oi2->ip_blkno) {
 			/* switch id1 and id2 around */
-			mlog(0, "switching them around...\n");
 			tmpbh = bh2;
 			bh2 = bh1;
 			bh1 = tmpbh;
@@ -1030,6 +1018,10 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
 			mlog_errno(status);
 	}
 
+	trace_ocfs2_double_lock_end(
+			(unsigned long long)OCFS2_I(inode1)->ip_blkno,
+			(unsigned long long)OCFS2_I(inode2)->ip_blkno);
+
 bail:
 	if (status)
 		mlog_errno(status);
@@ -1074,10 +1066,9 @@ static int ocfs2_rename(struct inode *old_dir,
 	/* At some point it might be nice to break this function up a
 	 * bit. */
 
-	mlog(0, "(0x%p, 0x%p, 0x%p, 0x%p, from='%.*s' to='%.*s')\n",
-	     old_dir, old_dentry, new_dir, new_dentry,
-	     old_dentry->d_name.len, old_dentry->d_name.name,
-	     new_dentry->d_name.len, new_dentry->d_name.name);
+	trace_ocfs2_rename(old_dir, old_dentry, new_dir, new_dentry,
+			   old_dentry->d_name.len, old_dentry->d_name.name,
+			   new_dentry->d_name.len, new_dentry->d_name.name);
 
 	dquot_initialize(old_dir);
 	dquot_initialize(new_dir);
@@ -1234,16 +1225,15 @@ static int ocfs2_rename(struct inode *old_dir,
 		if (!new_inode) {
 			status = -EACCES;
 
-			mlog(0, "We found an inode for name %.*s but VFS "
-			     "didn't give us one.\n", new_dentry->d_name.len,
-			     new_dentry->d_name.name);
+			trace_ocfs2_rename_target_exists(new_dentry->d_name.len,
+						new_dentry->d_name.name);
 			goto bail;
 		}
 
 		if (OCFS2_I(new_inode)->ip_blkno != newfe_blkno) {
 			status = -EACCES;
 
-			mlog(0, "Inode %llu and dir %llu disagree. flags = %x\n",
+			trace_ocfs2_rename_disagree(
 			     (unsigned long long)OCFS2_I(new_inode)->ip_blkno,
 			     (unsigned long long)newfe_blkno,
 			     OCFS2_I(new_inode)->ip_flags);
@@ -1266,8 +1256,7 @@ static int ocfs2_rename(struct inode *old_dir,
 
 		newfe = (struct ocfs2_dinode *) newfe_bh->b_data;
 
-		mlog(0, "aha rename over existing... new_blkno=%llu "
-		     "newfebh=%p bhblocknr=%llu\n",
+		trace_ocfs2_rename_over_existing(
 		     (unsigned long long)newfe_blkno, newfe_bh, newfe_bh ?
 		     (unsigned long long)newfe_bh->b_blocknr : 0ULL);
 
@@ -1509,9 +1498,8 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
 	 * write i_size + 1 bytes. */
 	blocks = (bytes_left + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
 
-	mlog(0, "i_blocks = %llu, i_size = %llu, blocks = %d\n",
-	     (unsigned long long)inode->i_blocks,
-	     i_size_read(inode), blocks);
+	trace_ocfs2_create_symlink_data((unsigned long long)inode->i_blocks,
+					i_size_read(inode), blocks);
 
 	/* Sanity check -- make sure we're going to fit. */
 	if (bytes_left >
@@ -1619,8 +1607,8 @@ static int ocfs2_symlink(struct inode *dir,
 	sigset_t oldset;
 	int did_block_signals = 0;
 
-	mlog(0, "(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
-	     dentry, symname, dentry->d_name.len, dentry->d_name.name);
+	trace_ocfs2_symlink_begin(dir, dentry, symname,
+				  dentry->d_name.len, dentry->d_name.name);
 
 	dquot_initialize(dir);
 
@@ -1722,9 +1710,10 @@ static int ocfs2_symlink(struct inode *dir,
 		goto bail;
 	did_quota_inode = 1;
 
-	mlog(0, "(0x%p, 0x%p, %d, '%.*s')\n", dir, dentry,
-	     inode->i_mode, dentry->d_name.len,
-	     dentry->d_name.name);
+	trace_ocfs2_symlink_create(dir, dentry, dentry->d_name.len,
+				   dentry->d_name.name,
+				   (unsigned long long)OCFS2_I(dir)->ip_blkno,
+				   inode->i_mode);
 
 	status = ocfs2_mknod_locked(osb, dir, inode,
 				    0, &new_fe_bh, parent_fe_bh, handle,
@@ -1870,8 +1859,7 @@ static int ocfs2_blkno_stringify(u64 blkno, char *name)
 		goto bail;
 	}
 
-	mlog(0, "built filename '%s' for orphan dir (len=%d)\n", name,
-	     namelen);
+	trace_ocfs2_blkno_stringify(blkno, name, namelen);
 
 	status = 0;
 bail:
@@ -2007,7 +1995,8 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
 	struct ocfs2_dinode *orphan_fe;
 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
 
-	mlog(0, "(inode->i_ino = %lu)\n", inode->i_ino);
+	trace_ocfs2_orphan_add_begin(
+				(unsigned long long)OCFS2_I(inode)->ip_blkno);
 
 	status = ocfs2_read_inode_block(orphan_dir_inode, &orphan_dir_bh);
 	if (status < 0) {
@@ -2066,8 +2055,8 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
 
 	ocfs2_journal_dirty(handle, fe_bh);
 
-	mlog(0, "Inode %llu orphaned in slot %d\n",
-	     (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num);
+	trace_ocfs2_orphan_add_end((unsigned long long)OCFS2_I(inode)->ip_blkno,
+				   osb->slot_num);
 
 leave:
 	brelse(orphan_dir_bh);
@@ -2095,9 +2084,9 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
 		goto leave;
 	}
 
-	mlog(0, "removing '%s' from orphan dir %llu (namelen=%d)\n",
-	     name, (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno,
-	     OCFS2_ORPHAN_NAMELEN);
+	trace_ocfs2_orphan_del(
+	     (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno,
+	     name, OCFS2_ORPHAN_NAMELEN);
 
 	/* find it's spot in the orphan directory */
 	status = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, orphan_dir_inode,
@@ -2331,9 +2320,6 @@ leave:
 		iput(orphan_dir);
 	}
 
-	if (status == -ENOSPC)
-		mlog(0, "Disk is full\n");
-
 	if ((status < 0) && inode) {
 		clear_nlink(inode);
 		iput(inode);
@@ -2368,8 +2354,10 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
 	struct buffer_head *di_bh = NULL;
 	struct ocfs2_dir_lookup_result lookup = { NULL, };
 
-	mlog(0, "(0x%p, 0x%p, %.*s')\n", dir, dentry,
-	     dentry->d_name.len, dentry->d_name.name);
+	trace_ocfs2_mv_orphaned_inode_to_new(dir, dentry,
+				dentry->d_name.len, dentry->d_name.name,
+				(unsigned long long)OCFS2_I(dir)->ip_blkno,
+				(unsigned long long)OCFS2_I(inode)->ip_blkno);
 
 	status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
 	if (status < 0) {
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 25e057e9d14d..a98e52ae06ee 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -2129,6 +2129,241 @@ DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_dx_dir_rebalance_split);
 DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_prepare_dir_for_insert);
 
 /* End of trace events for fs/ocfs2/dir.c. */
+
+/* Trace events for fs/ocfs2/namei.c. */
+
+DECLARE_EVENT_CLASS(ocfs2__dentry_ops,
+	TP_PROTO(void *dir, void *dentry, int name_len, const char *name,
+		 unsigned long long dir_blkno, unsigned long long extra),
+	TP_ARGS(dir, dentry, name_len, name, dir_blkno, extra),
+	TP_STRUCT__entry(
+		__field(void *, dir)
+		__field(void *, dentry)
+		__field(int, name_len)
+		__string(name, name)
+		__field(unsigned long long, dir_blkno)
+		__field(unsigned long long, extra)
+	),
+	TP_fast_assign(
+		__entry->dir = dir;
+		__entry->dentry = dentry;
+		__entry->name_len = name_len;
+		__assign_str(name, name);
+		__entry->dir_blkno = dir_blkno;
+		__entry->extra = extra;
+	),
+	TP_printk("%p %p %.*s %llu %llu", __entry->dir, __entry->dentry,
+		  __entry->name_len, __get_str(name),
+		  __entry->dir_blkno, __entry->extra)
+);
+
+#define DEFINE_OCFS2_DENTRY_OPS(name)					\
+DEFINE_EVENT(ocfs2__dentry_ops, name,					\
+TP_PROTO(void *dir, void *dentry, int name_len, const char *name,	\
+	 unsigned long long dir_blkno, unsigned long long extra),	\
+	TP_ARGS(dir, dentry, name_len, name, dir_blkno, extra))
+
+DEFINE_OCFS2_DENTRY_OPS(ocfs2_lookup);
+
+DEFINE_OCFS2_DENTRY_OPS(ocfs2_mkdir);
+
+DEFINE_OCFS2_DENTRY_OPS(ocfs2_create);
+
+DEFINE_OCFS2_DENTRY_OPS(ocfs2_unlink);
+
+DEFINE_OCFS2_DENTRY_OPS(ocfs2_symlink_create);
+
+DEFINE_OCFS2_DENTRY_OPS(ocfs2_mv_orphaned_inode_to_new);
+
+DEFINE_OCFS2_POINTER_EVENT(ocfs2_lookup_ret);
+
+TRACE_EVENT(ocfs2_mknod,
+	TP_PROTO(void *dir, void *dentry, int name_len, const char *name,
+		 unsigned long long dir_blkno, unsigned long dev, int mode),
+	TP_ARGS(dir, dentry, name_len, name, dir_blkno, dev, mode),
+	TP_STRUCT__entry(
+		__field(void *, dir)
+		__field(void *, dentry)
+		__field(int, name_len)
+		__string(name, name)
+		__field(unsigned long long, dir_blkno)
+		__field(unsigned long, dev)
+		__field(int, mode)
+	),
+	TP_fast_assign(
+		__entry->dir = dir;
+		__entry->dentry = dentry;
+		__entry->name_len = name_len;
+		__assign_str(name, name);
+		__entry->dir_blkno = dir_blkno;
+		__entry->dev = dev;
+		__entry->mode = mode;
+	),
+	TP_printk("%p %p %.*s %llu %lu %d", __entry->dir, __entry->dentry,
+		  __entry->name_len, __get_str(name),
+		  __entry->dir_blkno, __entry->dev, __entry->mode)
+);
+
+TRACE_EVENT(ocfs2_link,
+	TP_PROTO(unsigned long long ino, int old_len, const char *old_name,
+		 int name_len, const char *name),
+	TP_ARGS(ino, old_len, old_name, name_len, name),
+	TP_STRUCT__entry(
+		__field(unsigned long long, ino)
+		__field(int, old_len)
+		__string(old_name, old_name)
+		__field(int, name_len)
+		__string(name, name)
+	),
+	TP_fast_assign(
+		__entry->ino = ino;
+		__entry->old_len = old_len;
+		__assign_str(old_name, old_name);
+		__entry->name_len = name_len;
+		__assign_str(name, name);
+	),
+	TP_printk("%llu %.*s %.*s", __entry->ino,
+		  __entry->old_len, __get_str(old_name),
+		  __entry->name_len, __get_str(name))
+);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_unlink_noent);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_double_lock);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_double_lock_end);
+
+TRACE_EVENT(ocfs2_rename,
+	TP_PROTO(void *old_dir, void *old_dentry,
+		 void *new_dir, void *new_dentry,
+		 int old_len, const char *old_name,
+		 int new_len, const char *new_name),
+	TP_ARGS(old_dir, old_dentry, new_dir, new_dentry,
+		old_len, old_name, new_len, new_name),
+	TP_STRUCT__entry(
+		__field(void *, old_dir)
+		__field(void *, old_dentry)
+		__field(void *, new_dir)
+		__field(void *, new_dentry)
+		__field(int, old_len)
+		__string(old_name, old_name)
+		__field(int, new_len)
+		__string(new_name, new_name)
+	),
+	TP_fast_assign(
+		__entry->old_dir = old_dir;
+		__entry->old_dentry = old_dentry;
+		__entry->new_dir = new_dir;
+		__entry->new_dentry = new_dentry;
+		__entry->old_len = old_len;
+		__assign_str(old_name, old_name);
+		__entry->new_len = new_len;
+		__assign_str(new_name, new_name);
+	),
+	TP_printk("%p %p %p %p %.*s %.*s",
+		  __entry->old_dir, __entry->old_dentry,
+		  __entry->new_dir, __entry->new_dentry,
+		  __entry->old_len, __get_str(old_name),
+		  __entry->new_len, __get_str(new_name))
+);
+
+TRACE_EVENT(ocfs2_rename_target_exists,
+	TP_PROTO(int new_len, const char *new_name),
+	TP_ARGS(new_len, new_name),
+	TP_STRUCT__entry(
+		__field(int, new_len)
+		__string(new_name, new_name)
+	),
+	TP_fast_assign(
+		__entry->new_len = new_len;
+		__assign_str(new_name, new_name);
+	),
+	TP_printk("%.*s", __entry->new_len, __get_str(new_name))
+);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_rename_disagree);
+
+TRACE_EVENT(ocfs2_rename_over_existing,
+	TP_PROTO(unsigned long long new_blkno, void *new_bh,
+		 unsigned long long newdi_blkno),
+	TP_ARGS(new_blkno, new_bh, newdi_blkno),
+	TP_STRUCT__entry(
+		__field(unsigned long long, new_blkno)
+		__field(void *, new_bh)
+		__field(unsigned long long, newdi_blkno)
+	),
+	TP_fast_assign(
+		__entry->new_blkno = new_blkno;
+		__entry->new_bh = new_bh;
+		__entry->newdi_blkno = newdi_blkno;
+	),
+	TP_printk("%llu %p %llu", __entry->new_blkno, __entry->new_bh,
+		  __entry->newdi_blkno)
+);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_create_symlink_data);
+
+TRACE_EVENT(ocfs2_symlink_begin,
+	TP_PROTO(void *dir, void *dentry, const char *symname,
+		 int len, const char *name),
+	TP_ARGS(dir, dentry, symname, len, name),
+	TP_STRUCT__entry(
+		__field(void *, dir)
+		__field(void *, dentry)
+		__field(const char *, symname)
+		__field(int, len)
+		__string(name, name)
+	),
+	TP_fast_assign(
+		__entry->dir = dir;
+		__entry->dentry = dentry;
+		__entry->symname = symname;
+		__entry->len = len;
+		__assign_str(name, name);
+	),
+	TP_printk("%p %p %s %.*s", __entry->dir, __entry->dentry,
+		  __entry->symname, __entry->len, __get_str(name))
+);
+
+TRACE_EVENT(ocfs2_blkno_stringify,
+	TP_PROTO(unsigned long long blkno, const char *name, int namelen),
+	TP_ARGS(blkno, name, namelen),
+	TP_STRUCT__entry(
+		__field(unsigned long long, blkno)
+		__string(name, name)
+		__field(int, namelen)
+	),
+	TP_fast_assign(
+		__entry->blkno = blkno;
+		__assign_str(name, name);
+		__entry->namelen = namelen;
+	),
+	TP_printk("%llu %s %d", __entry->blkno, __get_str(name),
+		  __entry->namelen)
+);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_orphan_add_begin);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_orphan_add_end);
+
+TRACE_EVENT(ocfs2_orphan_del,
+	TP_PROTO(unsigned long long dir, const char *name, int namelen),
+	TP_ARGS(dir, name, namelen),
+	TP_STRUCT__entry(
+		__field(unsigned long long, dir)
+		__string(name, name)
+		__field(int, namelen)
+	),
+	TP_fast_assign(
+		__entry->dir = dir;
+		__assign_str(name, name);
+		__entry->namelen = namelen;
+	),
+	TP_printk("%llu %s %d", __entry->dir, __get_str(name),
+		  __entry->namelen)
+);
+
+/* End of trace events for fs/ocfs2/namei.c. */
 #endif /* _TRACE_OCFS2_H */
 
 /* This part must be outside protection */
-- 
cgit v1.2.3


From a5b8443b34d57c6c6e3f8180d9f88890f3625a4b Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Wed, 23 Feb 2011 22:51:49 +0800
Subject: ocfs2: Remove masklog ML_DCACHE.

Remove mlog(0) from fs/ocfs2/dcache.c and the masklog DCACHE.

Signed-off-by: Tao Ma <boyu.mt@taobao.com>
---
 fs/ocfs2/cluster/masklog.c |  1 -
 fs/ocfs2/cluster/masklog.h |  1 -
 fs/ocfs2/dcache.c          | 44 ++++++++++-----------
 fs/ocfs2/ocfs2_trace.h     | 99 ++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 121 insertions(+), 24 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 0668eb6e05d6..91df3e2571a0 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -97,7 +97,6 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
 	define_mask(BH_IO),
 	define_mask(UPTODATE),
 	define_mask(VOTE),
-	define_mask(DCACHE),
 	define_mask(CONN),
 	define_mask(QUORUM),
 	define_mask(EXPORT),
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index a815b55d87a7..c64b0aa59e9d 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -99,7 +99,6 @@
 #define ML_BH_IO	0x0000000000100000ULL /* ocfs2 buffer I/O */
 #define ML_UPTODATE	0x0000000000200000ULL /* ocfs2 caching sequence #'s */
 #define ML_VOTE		0x0000000001000000ULL /* ocfs2 node messaging  */
-#define ML_DCACHE	0x0000000002000000ULL /* ocfs2 dcache operations */
 #define ML_CONN		0x0000000004000000ULL /* net connection management */
 #define ML_QUORUM	0x0000000008000000ULL /* net connection quorum */
 #define ML_EXPORT	0x0000000010000000ULL /* ocfs2 export operations */
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index c26551c25e8e..e5ba34818332 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -28,7 +28,6 @@
 #include <linux/slab.h>
 #include <linux/namei.h>
 
-#define MLOG_MASK_PREFIX ML_DCACHE
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
@@ -39,6 +38,7 @@
 #include "file.h"
 #include "inode.h"
 #include "super.h"
+#include "ocfs2_trace.h"
 
 void ocfs2_dentry_attach_gen(struct dentry *dentry)
 {
@@ -62,8 +62,8 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
 	inode = dentry->d_inode;
 	osb = OCFS2_SB(dentry->d_sb);
 
-	mlog(0, "(0x%p, '%.*s')\n", dentry,
-	     dentry->d_name.len, dentry->d_name.name);
+	trace_ocfs2_dentry_revalidate(dentry, dentry->d_name.len,
+				      dentry->d_name.name);
 
 	/* For a negative dentry -
 	 * check the generation number of the parent and compare with the
@@ -73,9 +73,10 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
 		unsigned long gen = (unsigned long) dentry->d_fsdata;
 		unsigned long pgen =
 			OCFS2_I(dentry->d_parent->d_inode)->ip_dir_lock_gen;
-		mlog(0, "negative dentry: %.*s parent gen: %lu "
-			"dentry gen: %lu\n",
-			dentry->d_name.len, dentry->d_name.name, pgen, gen);
+
+		trace_ocfs2_dentry_revalidate_negative(dentry->d_name.len,
+						       dentry->d_name.name,
+						       pgen, gen);
 		if (gen != pgen)
 			goto bail;
 		goto valid;
@@ -90,8 +91,8 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
 	/* did we or someone else delete this inode? */
 	if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) {
 		spin_unlock(&OCFS2_I(inode)->ip_lock);
-		mlog(0, "inode (%llu) deleted, returning false\n",
-		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
+		trace_ocfs2_dentry_revalidate_delete(
+				(unsigned long long)OCFS2_I(inode)->ip_blkno);
 		goto bail;
 	}
 	spin_unlock(&OCFS2_I(inode)->ip_lock);
@@ -101,10 +102,9 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
 	 * inode nlink hits zero, it never goes back.
 	 */
 	if (inode->i_nlink == 0) {
-		mlog(0, "Inode %llu orphaned, returning false "
-		     "dir = %d\n",
-		     (unsigned long long)OCFS2_I(inode)->ip_blkno,
-		     S_ISDIR(inode->i_mode));
+		trace_ocfs2_dentry_revalidate_orphaned(
+			(unsigned long long)OCFS2_I(inode)->ip_blkno,
+			S_ISDIR(inode->i_mode));
 		goto bail;
 	}
 
@@ -113,9 +113,8 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry,
 	 * redo it.
 	 */
 	if (!dentry->d_fsdata) {
-		mlog(0, "Inode %llu doesn't have dentry lock, "
-		     "returning false\n",
-		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
+		trace_ocfs2_dentry_revalidate_nofsdata(
+				(unsigned long long)OCFS2_I(inode)->ip_blkno);
 		goto bail;
 	}
 
@@ -123,7 +122,7 @@ valid:
 	ret = 1;
 
 bail:
-	mlog(0, "ret = %d\n", ret);
+	trace_ocfs2_dentry_revalidate_ret(ret);
 	return ret;
 }
 
@@ -180,8 +179,8 @@ struct dentry *ocfs2_find_local_alias(struct inode *inode,
 
 		spin_lock(&dentry->d_lock);
 		if (ocfs2_match_dentry(dentry, parent_blkno, skip_unhashed)) {
-			mlog(0, "dentry found: %.*s\n",
-			     dentry->d_name.len, dentry->d_name.name);
+			trace_ocfs2_find_local_alias(dentry->d_name.len,
+						     dentry->d_name.name);
 
 			dget_dlock(dentry);
 			spin_unlock(&dentry->d_lock);
@@ -239,9 +238,8 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
 	struct dentry *alias;
 	struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
 
-	mlog(0, "Attach \"%.*s\", parent %llu, fsdata: %p\n",
-	     dentry->d_name.len, dentry->d_name.name,
-	     (unsigned long long)parent_blkno, dl);
+	trace_ocfs2_dentry_attach_lock(dentry->d_name.len, dentry->d_name.name,
+				       (unsigned long long)parent_blkno, dl);
 
 	/*
 	 * Negative dentry. We ignore these for now.
@@ -291,7 +289,9 @@ int ocfs2_dentry_attach_lock(struct dentry *dentry,
 				(unsigned long long)parent_blkno,
 				(unsigned long long)dl->dl_parent_blkno);
 
-		mlog(0, "Found: %s\n", dl->dl_lockres.l_name);
+		trace_ocfs2_dentry_attach_lock_found(dl->dl_lockres.l_name,
+				(unsigned long long)parent_blkno,
+				(unsigned long long)OCFS2_I(inode)->ip_blkno);
 
 		goto out_attach;
 	}
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index a98e52ae06ee..6ef3a928023b 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -2364,6 +2364,105 @@ TRACE_EVENT(ocfs2_orphan_del,
 );
 
 /* End of trace events for fs/ocfs2/namei.c. */
+
+/* Trace events for fs/ocfs2/dcache.c. */
+
+TRACE_EVENT(ocfs2_dentry_revalidate,
+	TP_PROTO(void *dentry, int len, const char *name),
+	TP_ARGS(dentry, len, name),
+	TP_STRUCT__entry(
+		__field(void *, dentry)
+		__field(int, len)
+		__string(name, name)
+	),
+	TP_fast_assign(
+		__entry->dentry = dentry;
+		__entry->len = len;
+		__assign_str(name, name);
+	),
+	TP_printk("%p %.*s", __entry->dentry, __entry->len, __get_str(name))
+);
+
+TRACE_EVENT(ocfs2_dentry_revalidate_negative,
+	TP_PROTO(int len, const char *name, unsigned long pgen,
+		 unsigned long gen),
+	TP_ARGS(len, name, pgen, gen),
+	TP_STRUCT__entry(
+		__field(int, len)
+		__string(name, name)
+		__field(unsigned long, pgen)
+		__field(unsigned long, gen)
+	),
+	TP_fast_assign(
+		__entry->len = len;
+		__assign_str(name, name);
+		__entry->pgen = pgen;
+		__entry->gen = gen;
+	),
+	TP_printk("%.*s %lu %lu", __entry->len, __get_str(name),
+		  __entry->pgen, __entry->gen)
+);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_dentry_revalidate_delete);
+
+DEFINE_OCFS2_ULL_INT_EVENT(ocfs2_dentry_revalidate_orphaned);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_dentry_revalidate_nofsdata);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_dentry_revalidate_ret);
+
+TRACE_EVENT(ocfs2_find_local_alias,
+	TP_PROTO(int len, const char *name),
+	TP_ARGS(len, name),
+	TP_STRUCT__entry(
+		__field(int, len)
+		__string(name, name)
+	),
+	TP_fast_assign(
+		__entry->len = len;
+		__assign_str(name, name);
+	),
+	TP_printk("%.*s", __entry->len, __get_str(name))
+);
+
+TRACE_EVENT(ocfs2_dentry_attach_lock,
+	TP_PROTO(int len, const char *name,
+		 unsigned long long parent, void *fsdata),
+	TP_ARGS(len, name, parent, fsdata),
+	TP_STRUCT__entry(
+		__field(int, len)
+		__string(name, name)
+		__field(unsigned long long, parent)
+		__field(void *, fsdata)
+	),
+	TP_fast_assign(
+		__entry->len = len;
+		__assign_str(name, name);
+		__entry->parent = parent;
+		__entry->fsdata = fsdata;
+	),
+	TP_printk("%.*s %llu %p", __entry->len, __get_str(name),
+		  __entry->parent, __entry->fsdata)
+);
+
+TRACE_EVENT(ocfs2_dentry_attach_lock_found,
+	TP_PROTO(const char *name, unsigned long long parent,
+		 unsigned long long ino),
+	TP_ARGS(name, parent, ino),
+	TP_STRUCT__entry(
+		__string(name, name)
+		__field(unsigned long long, parent)
+		__field(unsigned long long, ino)
+	),
+	TP_fast_assign(
+		__assign_str(name, name);
+		__entry->parent = parent;
+		__entry->ino = ino;
+	),
+	TP_printk("%s %llu %llu", __get_str(name), __entry->parent, __entry->ino)
+);
+
+/* End of trace events for fs/ocfs2/dcache.c. */
 #endif /* _TRACE_OCFS2_H */
 
 /* This part must be outside protection */
-- 
cgit v1.2.3


From 781f200cb7a657339c8760984870c252701aa485 Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Thu, 24 Feb 2011 13:50:19 +0800
Subject: ocfs2: Remove masklog ML_EXPORT.

Remove mlog(0) from fs/ocfs2/export.c and the masklog EXPORT.

Signed-off-by: Tao Ma <boyu.mt@taobao.com>
---
 fs/ocfs2/cluster/masklog.c |  1 -
 fs/ocfs2/cluster/masklog.h |  1 -
 fs/ocfs2/export.c          | 47 +++++++++++---------------
 fs/ocfs2/ocfs2_trace.h     | 82 +++++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 101 insertions(+), 30 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 91df3e2571a0..491923b1e0a6 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -99,7 +99,6 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
 	define_mask(VOTE),
 	define_mask(CONN),
 	define_mask(QUORUM),
-	define_mask(EXPORT),
 	define_mask(BASTS),
 	define_mask(CLUSTER),
 	define_mask(ERROR),
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index c64b0aa59e9d..c5bde37cd8b2 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -101,7 +101,6 @@
 #define ML_VOTE		0x0000000001000000ULL /* ocfs2 node messaging  */
 #define ML_CONN		0x0000000004000000ULL /* net connection management */
 #define ML_QUORUM	0x0000000008000000ULL /* net connection quorum */
-#define ML_EXPORT	0x0000000010000000ULL /* ocfs2 export operations */
 #define ML_BASTS	0x0000000100000000ULL /* dlmglue asts and basts */
 #define ML_CLUSTER	0x0000000400000000ULL /* cluster stack */
 
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index 96d7ed7f8305..745db42528d5 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -26,7 +26,6 @@
 #include <linux/fs.h>
 #include <linux/types.h>
 
-#define MLOG_MASK_PREFIX ML_EXPORT
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
@@ -40,6 +39,7 @@
 
 #include "buffer_head_io.h"
 #include "suballoc.h"
+#include "ocfs2_trace.h"
 
 struct ocfs2_inode_handle
 {
@@ -56,10 +56,9 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
 	int status, set;
 	struct dentry *result;
 
-	mlog(0, "(0x%p, 0x%p)\n", sb, handle);
+	trace_ocfs2_get_dentry_begin(sb, handle, (unsigned long long)blkno);
 
 	if (blkno == 0) {
-		mlog(0, "nfs wants inode with blkno: 0\n");
 		result = ERR_PTR(-ESTALE);
 		goto bail;
 	}
@@ -83,6 +82,7 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
 	}
 
 	status = ocfs2_test_inode_bit(osb, blkno, &set);
+	trace_ocfs2_get_dentry_test_bit(status, set);
 	if (status < 0) {
 		if (status == -EINVAL) {
 			/*
@@ -90,18 +90,14 @@ static struct dentry *ocfs2_get_dentry(struct super_block *sb,
 			 * as an inode, we return -ESTALE to be
 			 * nice
 			 */
-			mlog(0, "test inode bit failed %d\n", status);
 			status = -ESTALE;
-		} else {
+		} else
 			mlog(ML_ERROR, "test inode bit failed %d\n", status);
-		}
 		goto unlock_nfs_sync;
 	}
 
 	/* If the inode allocator bit is clear, this inode must be stale */
 	if (!set) {
-		mlog(0, "inode %llu suballoc bit is clear\n",
-		     (unsigned long long)blkno);
 		status = -ESTALE;
 		goto unlock_nfs_sync;
 	}
@@ -114,8 +110,8 @@ unlock_nfs_sync:
 check_err:
 	if (status < 0) {
 		if (status == -ESTALE) {
-			mlog(0, "stale inode ino: %llu generation: %u\n",
-			     (unsigned long long)blkno, handle->ih_generation);
+			trace_ocfs2_get_dentry_stale((unsigned long long)blkno,
+						     handle->ih_generation);
 		}
 		result = ERR_PTR(status);
 		goto bail;
@@ -130,8 +126,9 @@ check_err:
 check_gen:
 	if (handle->ih_generation != inode->i_generation) {
 		iput(inode);
-		mlog(0, "stale inode ino: %llu generation: %u\n",
-		     (unsigned long long)blkno, handle->ih_generation);
+		trace_ocfs2_get_dentry_generation((unsigned long long)blkno,
+						  handle->ih_generation,
+						  inode->i_generation);
 		result = ERR_PTR(-ESTALE);
 		goto bail;
 	}
@@ -141,7 +138,7 @@ check_gen:
 		mlog_errno(PTR_ERR(result));
 
 bail:
-	mlog(0, "result = %p\n", result);
+	trace_ocfs2_get_dentry_end(result);
 	return result;
 }
 
@@ -152,11 +149,8 @@ static struct dentry *ocfs2_get_parent(struct dentry *child)
 	struct dentry *parent;
 	struct inode *dir = child->d_inode;
 
-	mlog(0, "(0x%p, '%.*s')\n", child,
-	     child->d_name.len, child->d_name.name);
-
-	mlog(0, "find parent of directory %llu\n",
-	     (unsigned long long)OCFS2_I(dir)->ip_blkno);
+	trace_ocfs2_get_parent(child, child->d_name.len, child->d_name.name,
+			       (unsigned long long)OCFS2_I(dir)->ip_blkno);
 
 	status = ocfs2_inode_lock(dir, NULL, 0);
 	if (status < 0) {
@@ -178,7 +172,7 @@ bail_unlock:
 	ocfs2_inode_unlock(dir, 0);
 
 bail:
-	mlog(0, "parent %p\n", parent);
+	trace_ocfs2_get_parent_end(parent);
 
 	return parent;
 }
@@ -193,9 +187,9 @@ static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len,
 	u32 generation;
 	__le32 *fh = (__force __le32 *) fh_in;
 
-	mlog(0, "(0x%p, '%.*s', 0x%p, %d, %d)\n", dentry,
-	     dentry->d_name.len, dentry->d_name.name,
-	     fh, len, connectable);
+	trace_ocfs2_encode_fh_begin(dentry, dentry->d_name.len,
+				    dentry->d_name.name,
+				    fh, len, connectable);
 
 	if (connectable && (len < 6)) {
 		*max_len = 6;
@@ -210,8 +204,7 @@ static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len,
 	blkno = OCFS2_I(inode)->ip_blkno;
 	generation = inode->i_generation;
 
-	mlog(0, "Encoding fh: blkno: %llu, generation: %u\n",
-	     (unsigned long long)blkno, generation);
+	trace_ocfs2_encode_fh_self((unsigned long long)blkno, generation);
 
 	len = 3;
 	fh[0] = cpu_to_le32((u32)(blkno >> 32));
@@ -236,14 +229,14 @@ static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len,
 		len = 6;
 		type = 2;
 
-		mlog(0, "Encoding parent: blkno: %llu, generation: %u\n",
-		     (unsigned long long)blkno, generation);
+		trace_ocfs2_encode_fh_parent((unsigned long long)blkno,
+					     generation);
 	}
 
 	*max_len = len;
 
 bail:
-	mlog(0, "type = %d\n", type);
+	trace_ocfs2_encode_fh_type(type);
 	return type;
 }
 
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 6ef3a928023b..48839054d5af 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -2461,8 +2461,88 @@ TRACE_EVENT(ocfs2_dentry_attach_lock_found,
 	),
 	TP_printk("%s %llu %llu", __get_str(name), __entry->parent, __entry->ino)
 );
-
 /* End of trace events for fs/ocfs2/dcache.c. */
+
+/* Trace events for fs/ocfs2/export.c. */
+
+TRACE_EVENT(ocfs2_get_dentry_begin,
+	TP_PROTO(void *sb, void *handle, unsigned long long blkno),
+	TP_ARGS(sb, handle, blkno),
+	TP_STRUCT__entry(
+		__field(void *, sb)
+		__field(void *, handle)
+		__field(unsigned long long, blkno)
+	),
+	TP_fast_assign(
+		__entry->sb = sb;
+		__entry->handle = handle;
+		__entry->blkno = blkno;
+	),
+	TP_printk("%p %p %llu", __entry->sb, __entry->handle, __entry->blkno)
+);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_get_dentry_test_bit);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_get_dentry_stale);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_get_dentry_generation);
+
+DEFINE_OCFS2_POINTER_EVENT(ocfs2_get_dentry_end);
+
+TRACE_EVENT(ocfs2_get_parent,
+	TP_PROTO(void *child, int len, const char *name,
+		 unsigned long long ino),
+	TP_ARGS(child, len, name, ino),
+	TP_STRUCT__entry(
+		__field(void *,	child)
+		__field(int, len)
+		__string(name, name)
+		__field(unsigned long long, ino)
+	),
+	TP_fast_assign(
+		__entry->child = child;
+		__entry->len = len;
+		__assign_str(name, name);
+		__entry->ino = ino;
+	),
+	TP_printk("%p %.*s %llu", __entry->child, __entry->len,
+		  __get_str(name), __entry->ino)
+);
+
+DEFINE_OCFS2_POINTER_EVENT(ocfs2_get_parent_end);
+
+TRACE_EVENT(ocfs2_encode_fh_begin,
+	TP_PROTO(void *dentry, int name_len, const char *name,
+		 void *fh, int len, int connectable),
+	TP_ARGS(dentry, name_len, name, fh, len, connectable),
+	TP_STRUCT__entry(
+		__field(void *, dentry)
+		__field(int, name_len)
+		__string(name, name)
+		__field(void *, fh)
+		__field(int, len)
+		__field(int, connectable)
+	),
+	TP_fast_assign(
+		__entry->dentry = dentry;
+		__entry->name_len = name_len;
+		__assign_str(name, name);
+		__entry->fh = fh;
+		__entry->len = len;
+		__entry->connectable = connectable;
+	),
+	TP_printk("%p %.*s %p %d %d", __entry->dentry, __entry->name_len,
+		  __get_str(name), __entry->fh, __entry->len,
+		  __entry->connectable)
+);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_encode_fh_self);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_encode_fh_parent);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_encode_fh_type);
+
+/* End of trace events for fs/ocfs2/export.c. */
 #endif /* _TRACE_OCFS2_H */
 
 /* This part must be outside protection */
-- 
cgit v1.2.3


From b41079504c786eb226e7e1e3dc5cc1f37fbd1c2b Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Thu, 24 Feb 2011 14:15:35 +0800
Subject: ocfs2: Remove masklog ML_JOURNAL.

Remove mlog(0) from fs/ocfs2/journal.c and the masklog JOURNAL.

Signed-off-by: Tao Ma <boyu.mt@taobao.com>
---
 fs/ocfs2/cluster/masklog.c |   1 -
 fs/ocfs2/cluster/masklog.h |   1 -
 fs/ocfs2/journal.c         | 130 +++++++++++++++++++--------------------------
 fs/ocfs2/ocfs2_trace.h     |  99 ++++++++++++++++++++++++++++++++++
 4 files changed, 153 insertions(+), 78 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index 491923b1e0a6..b64fb67c2999 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -92,7 +92,6 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
 	define_mask(DLM_MASTER),
 	define_mask(DLM_RECOVERY),
 	define_mask(AIO),
-	define_mask(JOURNAL),
 	define_mask(DLM_GLUE),
 	define_mask(BH_IO),
 	define_mask(UPTODATE),
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index c5bde37cd8b2..1340ef0ae3e1 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -94,7 +94,6 @@
 #define ML_DLM_MASTER	0x0000000000000800ULL /* dlm master functions */
 #define ML_DLM_RECOVERY	0x0000000000001000ULL /* dlm master functions */
 #define ML_AIO		0x0000000000002000ULL /* ocfs2 aio read and write */
-#define ML_JOURNAL	0x0000000000004000ULL /* ocfs2 journalling functions */
 #define ML_DLM_GLUE	0x0000000000080000ULL /* ocfs2 dlm glue layer */
 #define ML_BH_IO	0x0000000000100000ULL /* ocfs2 buffer I/O */
 #define ML_UPTODATE	0x0000000000200000ULL /* ocfs2 caching sequence #'s */
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 956509167bf5..dcc2d9327150 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -31,7 +31,6 @@
 #include <linux/time.h>
 #include <linux/random.h>
 
-#define MLOG_MASK_PREFIX ML_JOURNAL
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
@@ -52,6 +51,7 @@
 #include "quota.h"
 
 #include "buffer_head_io.h"
+#include "ocfs2_trace.h"
 
 DEFINE_SPINLOCK(trans_inc_lock);
 
@@ -308,9 +308,10 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
 	/* Flush all pending commits and checkpoint the journal. */
 	down_write(&journal->j_trans_barrier);
 
-	if (atomic_read(&journal->j_num_trans) == 0) {
+	flushed = atomic_read(&journal->j_num_trans);
+	trace_ocfs2_commit_cache_begin(flushed);
+	if (flushed == 0) {
 		up_write(&journal->j_trans_barrier);
-		mlog(0, "No transactions for me to flush!\n");
 		goto finally;
 	}
 
@@ -329,8 +330,7 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
 	atomic_set(&journal->j_num_trans, 0);
 	up_write(&journal->j_trans_barrier);
 
-	mlog(0, "commit_thread: flushed transaction %lu (%u handles)\n",
-	     journal->j_trans_id, flushed);
+	trace_ocfs2_commit_cache_end(journal->j_trans_id, flushed);
 
 	ocfs2_wake_downconvert_thread(osb);
 	wake_up(&journal->j_checkpointed);
@@ -423,7 +423,7 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
 
 	old_nblocks = handle->h_buffer_credits;
 
-	mlog(0, "Trying to extend transaction by %d blocks\n", nblocks);
+	trace_ocfs2_extend_trans(old_nblocks, nblocks);
 
 #ifdef CONFIG_OCFS2_DEBUG_FS
 	status = 1;
@@ -436,9 +436,7 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks)
 #endif
 
 	if (status > 0) {
-		mlog(0,
-		     "jbd2_journal_extend failed, trying "
-		     "jbd2_journal_restart\n");
+		trace_ocfs2_extend_trans_restart(old_nblocks + nblocks);
 		status = jbd2_journal_restart(handle,
 					      old_nblocks + nblocks);
 		if (status < 0) {
@@ -616,12 +614,9 @@ static int __ocfs2_journal_access(handle_t *handle,
 	BUG_ON(!handle);
 	BUG_ON(!bh);
 
-	mlog(0, "bh->b_blocknr=%llu, type=%d (\"%s\"), bh->b_size = %zu\n",
-	     (unsigned long long)bh->b_blocknr, type,
-	     (type == OCFS2_JOURNAL_ACCESS_CREATE) ?
-	     "OCFS2_JOURNAL_ACCESS_CREATE" :
-	     "OCFS2_JOURNAL_ACCESS_WRITE",
-	     bh->b_size);
+	trace_ocfs2_journal_access(
+		(unsigned long long)ocfs2_metadata_cache_owner(ci),
+		(unsigned long long)bh->b_blocknr, type, bh->b_size);
 
 	/* we can safely remove this assertion after testing. */
 	if (!buffer_uptodate(bh)) {
@@ -730,8 +725,7 @@ void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh)
 {
 	int status;
 
-	mlog(0, "(bh->b_blocknr=%llu)\n",
-	     (unsigned long long)bh->b_blocknr);
+	trace_ocfs2_journal_dirty((unsigned long long)bh->b_blocknr);
 
 	status = jbd2_journal_dirty_metadata(handle, bh);
 	BUG_ON(status);
@@ -809,10 +803,9 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
 		goto done;
 	}
 
-	mlog(0, "inode->i_size = %lld\n", inode->i_size);
-	mlog(0, "inode->i_blocks = %llu\n",
-			(unsigned long long)inode->i_blocks);
-	mlog(0, "inode->ip_clusters = %u\n", OCFS2_I(inode)->ip_clusters);
+	trace_ocfs2_journal_init(inode->i_size,
+				 (unsigned long long)inode->i_blocks,
+				 OCFS2_I(inode)->ip_clusters);
 
 	/* call the kernels journal init function now */
 	j_journal = jbd2_journal_init_inode(inode);
@@ -822,8 +815,7 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
 		goto done;
 	}
 
-	mlog(0, "Returned from jbd2_journal_init_inode\n");
-	mlog(0, "j_journal->j_maxlen = %u\n", j_journal->j_maxlen);
+	trace_ocfs2_journal_init_maxlen(j_journal->j_maxlen);
 
 	*dirty = (le32_to_cpu(di->id1.journal1.ij_flags) &
 		  OCFS2_JOURNAL_DIRTY_FL);
@@ -922,10 +914,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
 		BUG();
 
 	num_running_trans = atomic_read(&(osb->journal->j_num_trans));
-	if (num_running_trans > 0)
-		mlog(0, "Shutting down journal: must wait on %d "
-		     "running transactions!\n",
-		     num_running_trans);
+	trace_ocfs2_journal_shutdown(num_running_trans);
 
 	/* Do a commit_cache here. It will flush our journal, *and*
 	 * release any locks that are still held.
@@ -938,7 +927,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
 	 * completely destroy the journal. */
 	if (osb->commit_task) {
 		/* Wait for the commit thread */
-		mlog(0, "Waiting for ocfs2commit to exit....\n");
+		trace_ocfs2_journal_shutdown_wait(osb->commit_task);
 		kthread_stop(osb->commit_task);
 		osb->commit_task = NULL;
 	}
@@ -1158,7 +1147,7 @@ struct ocfs2_la_recovery_item {
  */
 void ocfs2_complete_recovery(struct work_struct *work)
 {
-	int ret;
+	int ret = 0;
 	struct ocfs2_journal *journal =
 		container_of(work, struct ocfs2_journal, j_recovery_work);
 	struct ocfs2_super *osb = journal->j_osb;
@@ -1167,7 +1156,8 @@ void ocfs2_complete_recovery(struct work_struct *work)
 	struct ocfs2_quota_recovery *qrec;
 	LIST_HEAD(tmp_la_list);
 
-	mlog(0, "completing recovery from keventd\n");
+	trace_ocfs2_complete_recovery(
+		(unsigned long long)OCFS2_I(journal->j_inode)->ip_blkno);
 
 	spin_lock(&journal->j_lock);
 	list_splice_init(&journal->j_la_cleanups, &tmp_la_list);
@@ -1176,15 +1166,18 @@ void ocfs2_complete_recovery(struct work_struct *work)
 	list_for_each_entry_safe(item, n, &tmp_la_list, lri_list) {
 		list_del_init(&item->lri_list);
 
-		mlog(0, "Complete recovery for slot %d\n", item->lri_slot);
-
 		ocfs2_wait_on_quotas(osb);
 
 		la_dinode = item->lri_la_dinode;
-		if (la_dinode) {
-			mlog(0, "Clean up local alloc %llu\n",
-			     (unsigned long long)le64_to_cpu(la_dinode->i_blkno));
+		tl_dinode = item->lri_tl_dinode;
+		qrec = item->lri_qrec;
 
+		trace_ocfs2_complete_recovery_slot(item->lri_slot,
+			la_dinode ? le64_to_cpu(la_dinode->i_blkno) : 0,
+			tl_dinode ? le64_to_cpu(tl_dinode->i_blkno) : 0,
+			qrec);
+
+		if (la_dinode) {
 			ret = ocfs2_complete_local_alloc_recovery(osb,
 								  la_dinode);
 			if (ret < 0)
@@ -1193,11 +1186,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
 			kfree(la_dinode);
 		}
 
-		tl_dinode = item->lri_tl_dinode;
 		if (tl_dinode) {
-			mlog(0, "Clean up truncate log %llu\n",
-			     (unsigned long long)le64_to_cpu(tl_dinode->i_blkno));
-
 			ret = ocfs2_complete_truncate_log_recovery(osb,
 								   tl_dinode);
 			if (ret < 0)
@@ -1210,9 +1199,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
 		if (ret < 0)
 			mlog_errno(ret);
 
-		qrec = item->lri_qrec;
 		if (qrec) {
-			mlog(0, "Recovering quota files");
 			ret = ocfs2_finish_quota_recovery(osb, qrec,
 							  item->lri_slot);
 			if (ret < 0)
@@ -1223,7 +1210,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
 		kfree(item);
 	}
 
-	mlog(0, "Recovery completion\n");
+	trace_ocfs2_complete_recovery_end(ret);
 }
 
 /* NOTE: This function always eats your references to la_dinode and
@@ -1340,15 +1327,12 @@ restart:
 		 * clear it until ocfs2_recover_node() has succeeded. */
 		node_num = rm->rm_entries[0];
 		spin_unlock(&osb->osb_lock);
-		mlog(0, "checking node %d\n", node_num);
 		slot_num = ocfs2_node_num_to_slot(osb, node_num);
+		trace_ocfs2_recovery_thread_node(node_num, slot_num);
 		if (slot_num == -ENOENT) {
 			status = 0;
-			mlog(0, "no slot for this node, so no recovery"
-			     "required.\n");
 			goto skip_recovery;
 		}
-		mlog(0, "node %d was using slot %d\n", node_num, slot_num);
 
 		/* It is a bit subtle with quota recovery. We cannot do it
 		 * immediately because we have to obtain cluster locks from
@@ -1375,7 +1359,7 @@ skip_recovery:
 		spin_lock(&osb->osb_lock);
 	}
 	spin_unlock(&osb->osb_lock);
-	mlog(0, "All nodes recovered\n");
+	trace_ocfs2_recovery_thread_end(status);
 
 	/* Refresh all journal recovery generations from disk */
 	status = ocfs2_check_journals_nolocks(osb);
@@ -1428,19 +1412,15 @@ bail:
 
 void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
 {
-	mlog(0, "(node_num=%d, osb->node_num = %d)\n",
-	     node_num, osb->node_num);
-
 	mutex_lock(&osb->recovery_lock);
-	if (osb->disable_recovery)
-		goto out;
 
-	/* People waiting on recovery will wait on
-	 * the recovery map to empty. */
-	if (ocfs2_recovery_map_set(osb, node_num))
-		mlog(0, "node %d already in recovery map.\n", node_num);
+	trace_ocfs2_recovery_thread(node_num, osb->node_num,
+		osb->disable_recovery, osb->recovery_thread_task,
+		osb->disable_recovery ?
+		-1 : ocfs2_recovery_map_set(osb, node_num));
 
-	mlog(0, "starting recovery thread...\n");
+	if (osb->disable_recovery)
+		goto out;
 
 	if (osb->recovery_thread_task)
 		goto out;
@@ -1528,7 +1508,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
 	 * If not, it needs recovery.
 	 */
 	if (osb->slot_recovery_generations[slot_num] != slot_reco_gen) {
-		mlog(0, "Slot %u already recovered (old/new=%u/%u)\n", slot_num,
+		trace_ocfs2_replay_journal_recovered(slot_num,
 		     osb->slot_recovery_generations[slot_num], slot_reco_gen);
 		osb->slot_recovery_generations[slot_num] = slot_reco_gen;
 		status = -EBUSY;
@@ -1539,7 +1519,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
 
 	status = ocfs2_inode_lock_full(inode, &bh, 1, OCFS2_META_LOCK_RECOVERY);
 	if (status < 0) {
-		mlog(0, "status returned from ocfs2_inode_lock=%d\n", status);
+		trace_ocfs2_replay_journal_lock_err(status);
 		if (status != -ERESTARTSYS)
 			mlog(ML_ERROR, "Could not lock journal!\n");
 		goto done;
@@ -1552,7 +1532,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
 	slot_reco_gen = ocfs2_get_recovery_generation(fe);
 
 	if (!(flags & OCFS2_JOURNAL_DIRTY_FL)) {
-		mlog(0, "No recovery required for node %d\n", node_num);
+		trace_ocfs2_replay_journal_skip(node_num);
 		/* Refresh recovery generation for the slot */
 		osb->slot_recovery_generations[slot_num] = slot_reco_gen;
 		goto done;
@@ -1573,7 +1553,6 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
 		goto done;
 	}
 
-	mlog(0, "calling journal_init_inode\n");
 	journal = jbd2_journal_init_inode(inode);
 	if (journal == NULL) {
 		mlog(ML_ERROR, "Linux journal layer error\n");
@@ -1593,7 +1572,6 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
 	ocfs2_clear_journal_error(osb->sb, journal, slot_num);
 
 	/* wipe the journal */
-	mlog(0, "flushing the journal.\n");
 	jbd2_journal_lock_updates(journal);
 	status = jbd2_journal_flush(journal);
 	jbd2_journal_unlock_updates(journal);
@@ -1652,8 +1630,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
 	struct ocfs2_dinode *la_copy = NULL;
 	struct ocfs2_dinode *tl_copy = NULL;
 
-	mlog(0, "(node_num=%d, slot_num=%d, osb->node_num = %d)\n",
-	     node_num, slot_num, osb->node_num);
+	trace_ocfs2_recover_node(node_num, slot_num, osb->node_num);
 
 	/* Should not ever be called to recover ourselves -- in that
 	 * case we should've called ocfs2_journal_load instead. */
@@ -1662,9 +1639,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
 	status = ocfs2_replay_journal(osb, node_num, slot_num);
 	if (status < 0) {
 		if (status == -EBUSY) {
-			mlog(0, "Skipping recovery for slot %u (node %u) "
-			     "as another node has recovered it\n", slot_num,
-			     node_num);
+			trace_ocfs2_recover_node_skip(slot_num, node_num);
 			status = 0;
 			goto done;
 		}
@@ -1771,8 +1746,8 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
 		spin_lock(&osb->osb_lock);
 		osb->slot_recovery_generations[i] = gen;
 
-		mlog(0, "Slot %u recovery generation is %u\n", i,
-		     osb->slot_recovery_generations[i]);
+		trace_ocfs2_mark_dead_nodes(i,
+					    osb->slot_recovery_generations[i]);
 
 		if (i == osb->slot_num) {
 			spin_unlock(&osb->osb_lock);
@@ -1846,11 +1821,12 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
 
 	os = &osb->osb_orphan_scan;
 
-	mlog(0, "Begin orphan scan\n");
-
 	if (atomic_read(&os->os_state) == ORPHAN_SCAN_INACTIVE)
 		goto out;
 
+	trace_ocfs2_queue_orphan_scan_begin(os->os_count, os->os_seqno,
+					    atomic_read(&os->os_state));
+
 	status = ocfs2_orphan_scan_lock(osb, &seqno);
 	if (status < 0) {
 		if (status != -EAGAIN)
@@ -1880,7 +1856,8 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
 unlock:
 	ocfs2_orphan_scan_unlock(osb, seqno);
 out:
-	mlog(0, "Orphan scan completed\n");
+	trace_ocfs2_queue_orphan_scan_end(os->os_count, os->os_seqno,
+					  atomic_read(&os->os_state));
 	return;
 }
 
@@ -1964,8 +1941,7 @@ static int ocfs2_orphan_filldir(void *priv, const char *name, int name_len,
 	if (IS_ERR(iter))
 		return 0;
 
-	mlog(0, "queue orphan %llu\n",
-	     (unsigned long long)OCFS2_I(iter)->ip_blkno);
+	trace_ocfs2_orphan_filldir((unsigned long long)OCFS2_I(iter)->ip_blkno);
 	/* No locking is required for the next_orphan queue as there
 	 * is only ever a single process doing orphan recovery. */
 	OCFS2_I(iter)->ip_next_orphan = p->head;
@@ -2081,7 +2057,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
 	struct inode *iter;
 	struct ocfs2_inode_info *oi;
 
-	mlog(0, "Recover inodes from orphan dir in slot %d\n", slot);
+	trace_ocfs2_recover_orphans(slot);
 
 	ocfs2_mark_recovering_orphan_dir(osb, slot);
 	ret = ocfs2_queue_orphans(osb, slot, &inode);
@@ -2094,7 +2070,8 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
 
 	while (inode) {
 		oi = OCFS2_I(inode);
-		mlog(0, "iput orphan %llu\n", (unsigned long long)oi->ip_blkno);
+		trace_ocfs2_recover_orphans_iput(
+					(unsigned long long)oi->ip_blkno);
 
 		iter = oi->ip_next_orphan;
 
@@ -2132,6 +2109,7 @@ static int __ocfs2_wait_on_mount(struct ocfs2_super *osb, int quota)
 	 * MOUNTED flag, but this is set right before
 	 * dismount_volume() so we can trust it. */
 	if (atomic_read(&osb->vol_state) == VOLUME_DISABLED) {
+		trace_ocfs2_wait_on_mount(VOLUME_DISABLED);
 		mlog(0, "mount error, exiting!\n");
 		return -EBUSY;
 	}
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 48839054d5af..9d1ca49aa98b 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -2543,6 +2543,105 @@ DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_encode_fh_parent);
 DEFINE_OCFS2_INT_EVENT(ocfs2_encode_fh_type);
 
 /* End of trace events for fs/ocfs2/export.c. */
+
+/* Trace events for fs/ocfs2/journal.c. */
+
+DEFINE_OCFS2_UINT_EVENT(ocfs2_commit_cache_begin);
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_commit_cache_end);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_extend_trans);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_extend_trans_restart);
+
+DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_journal_access);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_journal_dirty);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_journal_init);
+
+DEFINE_OCFS2_UINT_EVENT(ocfs2_journal_init_maxlen);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_journal_shutdown);
+
+DEFINE_OCFS2_POINTER_EVENT(ocfs2_journal_shutdown_wait);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_complete_recovery);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_complete_recovery_end);
+
+TRACE_EVENT(ocfs2_complete_recovery_slot,
+	TP_PROTO(int slot, unsigned long long la_ino,
+		 unsigned long long tl_ino, void *qrec),
+	TP_ARGS(slot, la_ino, tl_ino, qrec),
+	TP_STRUCT__entry(
+		__field(int, slot)
+		__field(unsigned long long, la_ino)
+		__field(unsigned long long, tl_ino)
+		__field(void *, qrec)
+	),
+	TP_fast_assign(
+		__entry->slot = slot;
+		__entry->la_ino = la_ino;
+		__entry->tl_ino = tl_ino;
+		__entry->qrec = qrec;
+	),
+	TP_printk("%d %llu %llu %p", __entry->slot, __entry->la_ino,
+		  __entry->tl_ino, __entry->qrec)
+);
+
+DEFINE_OCFS2_INT_INT_EVENT(ocfs2_recovery_thread_node);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_recovery_thread_end);
+
+TRACE_EVENT(ocfs2_recovery_thread,
+	TP_PROTO(int node_num, int osb_node_num, int disable,
+		 void *recovery_thread, int map_set),
+	TP_ARGS(node_num, osb_node_num, disable, recovery_thread, map_set),
+	TP_STRUCT__entry(
+		__field(int, node_num)
+		__field(int, osb_node_num)
+		__field(int,disable)
+		__field(void *, recovery_thread)
+		__field(int,map_set)
+	),
+	TP_fast_assign(
+		__entry->node_num = node_num;
+		__entry->osb_node_num = osb_node_num;
+		__entry->disable = disable;
+		__entry->recovery_thread = recovery_thread;
+		__entry->map_set = map_set;
+	),
+	TP_printk("%d %d %d %p %d", __entry->node_num,
+		   __entry->osb_node_num, __entry->disable,
+		   __entry->recovery_thread, __entry->map_set)
+);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_replay_journal_recovered);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_replay_journal_lock_err);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_replay_journal_skip);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_recover_node);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_recover_node_skip);
+
+DEFINE_OCFS2_UINT_UINT_EVENT(ocfs2_mark_dead_nodes);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_queue_orphan_scan_begin);
+
+DEFINE_OCFS2_UINT_UINT_UINT_EVENT(ocfs2_queue_orphan_scan_end);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_orphan_filldir);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_recover_orphans);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_recover_orphans_iput);
+
+DEFINE_OCFS2_INT_EVENT(ocfs2_wait_on_mount);
+
+/* End of trace events for fs/ocfs2/journal.c. */
 #endif /* _TRACE_OCFS2_H */
 
 /* This part must be outside protection */
-- 
cgit v1.2.3


From 15057e981abb3436c16835af1001447056a809b5 Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Thu, 24 Feb 2011 16:09:38 +0800
Subject: ocfs2: Remove masklog ML_BH_IO.

Remove mlog(0,...) and mlog(ML_BH_IO,...) from
fs/ocfs2/buffer_head_io.c.
The masklog BH_IO is removed finally.

Signed-off-by: Tao Ma <boyu.mt@taobao.com>
---
 fs/ocfs2/buffer_head_io.c  | 37 +++++++++++++------------------------
 fs/ocfs2/cluster/masklog.c |  1 -
 fs/ocfs2/cluster/masklog.h |  1 -
 fs/ocfs2/ocfs2_trace.h     | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 59 insertions(+), 26 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index c23846dbfc8f..d1cf9abcd827 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -35,8 +35,8 @@
 #include "inode.h"
 #include "journal.h"
 #include "uptodate.h"
-
 #include "buffer_head_io.h"
+#include "ocfs2_trace.h"
 
 /*
  * Bits on bh->b_state used by ocfs2.
@@ -55,8 +55,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
 {
 	int ret = 0;
 
-	mlog(0, "(bh->b_blocknr = %llu, ci=%p)\n",
-	     (unsigned long long)bh->b_blocknr, ci);
+	trace_ocfs2_write_block((unsigned long long)bh->b_blocknr, ci);
 
 	BUG_ON(bh->b_blocknr < OCFS2_SUPER_BLOCK_BLKNO);
 	BUG_ON(buffer_jbd(bh));
@@ -107,10 +106,10 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
 	unsigned int i;
 	struct buffer_head *bh;
 
-	if (!nr) {
-		mlog(ML_BH_IO, "No buffers will be read!\n");
+	trace_ocfs2_read_blocks_sync((unsigned long long)block, nr);
+
+	if (!nr)
 		goto bail;
-	}
 
 	for (i = 0 ; i < nr ; i++) {
 		if (bhs[i] == NULL) {
@@ -124,10 +123,8 @@ int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
 		bh = bhs[i];
 
 		if (buffer_jbd(bh)) {
-			mlog(ML_BH_IO,
-			     "trying to sync read a jbd "
-			     "managed bh (blocknr = %llu), skipping\n",
-			     (unsigned long long)bh->b_blocknr);
+			trace_ocfs2_read_blocks_sync_jbd(
+					(unsigned long long)bh->b_blocknr);
 			continue;
 		}
 
@@ -187,8 +184,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
 	struct buffer_head *bh;
 	struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
 
-	mlog(0, "(ci=%p, block=(%llu), nr=(%d), flags=%d)\n",
-	     ci, (unsigned long long)block, nr, flags);
+	trace_ocfs2_read_blocks_begin(ci, (unsigned long long)block, nr, flags);
 
 	BUG_ON(!ci);
 	BUG_ON((flags & OCFS2_BH_READAHEAD) &&
@@ -208,7 +204,6 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
 	}
 
 	if (nr == 0) {
-		mlog(ML_BH_IO, "No buffers will be read!\n");
 		status = 0;
 		goto bail;
 	}
@@ -261,11 +256,10 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
 			ignore_cache = 1;
 		}
 
+		trace_ocfs2_read_blocks_bh((unsigned long long)bh->b_blocknr,
+			ignore_cache, buffer_jbd(bh), buffer_dirty(bh));
+
 		if (buffer_jbd(bh)) {
-			if (ignore_cache)
-				mlog(ML_BH_IO, "trying to sync read a jbd "
-					       "managed bh (blocknr = %llu)\n",
-				     (unsigned long long)bh->b_blocknr);
 			continue;
 		}
 
@@ -273,9 +267,6 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
 			if (buffer_dirty(bh)) {
 				/* This should probably be a BUG, or
 				 * at least return an error. */
-				mlog(ML_BH_IO, "asking me to sync read a dirty "
-					       "buffer! (blocknr = %llu)\n",
-				     (unsigned long long)bh->b_blocknr);
 				continue;
 			}
 
@@ -368,10 +359,8 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
 	}
 	ocfs2_metadata_cache_io_unlock(ci);
 
-	mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n",
-	     (unsigned long long)block, nr,
-	     ((flags & OCFS2_BH_IGNORE_CACHE) || ignore_cache) ? "no" : "yes",
-	     flags);
+	trace_ocfs2_read_blocks_end((unsigned long long)block, nr,
+				    flags, ignore_cache);
 
 bail:
 
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index b64fb67c2999..d845ccef2408 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -93,7 +93,6 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
 	define_mask(DLM_RECOVERY),
 	define_mask(AIO),
 	define_mask(DLM_GLUE),
-	define_mask(BH_IO),
 	define_mask(UPTODATE),
 	define_mask(VOTE),
 	define_mask(CONN),
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 1340ef0ae3e1..ad267f614528 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -95,7 +95,6 @@
 #define ML_DLM_RECOVERY	0x0000000000001000ULL /* dlm master functions */
 #define ML_AIO		0x0000000000002000ULL /* ocfs2 aio read and write */
 #define ML_DLM_GLUE	0x0000000000080000ULL /* ocfs2 dlm glue layer */
-#define ML_BH_IO	0x0000000000100000ULL /* ocfs2 buffer I/O */
 #define ML_UPTODATE	0x0000000000200000ULL /* ocfs2 caching sequence #'s */
 #define ML_VOTE		0x0000000001000000ULL /* ocfs2 node messaging  */
 #define ML_CONN		0x0000000004000000ULL /* net connection management */
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 9d1ca49aa98b..90dc79a22a59 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -2642,6 +2642,52 @@ DEFINE_OCFS2_ULL_EVENT(ocfs2_recover_orphans_iput);
 DEFINE_OCFS2_INT_EVENT(ocfs2_wait_on_mount);
 
 /* End of trace events for fs/ocfs2/journal.c. */
+
+/* Trace events for fs/ocfs2/buffer_head_io.c. */
+
+DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_read_blocks_sync);
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_read_blocks_sync_jbd);
+
+DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_read_blocks_bh);
+
+DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_read_blocks_end);
+
+TRACE_EVENT(ocfs2_write_block,
+	TP_PROTO(unsigned long long block, void *ci),
+	TP_ARGS(block, ci),
+	TP_STRUCT__entry(
+		__field(unsigned long long, block)
+		__field(void *, ci)
+	),
+	TP_fast_assign(
+		__entry->block = block;
+		__entry->ci = ci;
+	),
+	TP_printk("%llu %p", __entry->block, __entry->ci)
+);
+
+TRACE_EVENT(ocfs2_read_blocks_begin,
+	TP_PROTO(void *ci, unsigned long long block,
+		 unsigned int nr, int flags),
+	TP_ARGS(ci, block, nr, flags),
+	TP_STRUCT__entry(
+		__field(void *, ci)
+		__field(unsigned long long, block)
+		__field(unsigned int, nr)
+		__field(int, flags)
+	),
+	TP_fast_assign(
+		__entry->ci = ci;
+		__entry->block = block;
+		__entry->nr = nr;
+		__entry->flags = flags;
+	),
+	TP_printk("%p %llu %u %d", __entry->ci, __entry->block,
+		  __entry->nr, __entry->flags)
+);
+
+/* End of trace events for fs/ocfs2/buffer_head_io.c. */
 #endif /* _TRACE_OCFS2_H */
 
 /* This part must be outside protection */
-- 
cgit v1.2.3


From d701485a6246486fda0ee58457b82a81562a7bc4 Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Thu, 24 Feb 2011 16:22:20 +0800
Subject: ocfs2: Remove masklog ML_UPTODATE.

Remove mlog(0,...) and mlog(ML_UPTODATE,...) from
fs/ocfs2/uptodate.c and fs/ocfs2/buffer_head_io.c.

The masklog UPTODATE is removed finally.

Signed-off-by: Tao Ma <boyu.mt@taobao.com>
---
 fs/ocfs2/buffer_head_io.c  |  3 +-
 fs/ocfs2/cluster/masklog.c |  1 -
 fs/ocfs2/cluster/masklog.h |  1 -
 fs/ocfs2/ocfs2_trace.h     | 42 ++++++++++++++++++++++++++
 fs/ocfs2/uptodate.c        | 73 +++++++++++++++++++++++-----------------------
 5 files changed, 79 insertions(+), 41 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index d1cf9abcd827..5d18ad10c27f 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -247,8 +247,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
 		 */
 
 		if (!ignore_cache && !ocfs2_buffer_uptodate(ci, bh)) {
-			mlog(ML_UPTODATE,
-			     "bh (%llu), owner %llu not uptodate\n",
+			trace_ocfs2_read_blocks_from_disk(
 			     (unsigned long long)bh->b_blocknr,
 			     (unsigned long long)ocfs2_metadata_cache_owner(ci));
 			/* We're using ignore_cache here to say
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index d845ccef2408..bbea7b6cb4d3 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -93,7 +93,6 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
 	define_mask(DLM_RECOVERY),
 	define_mask(AIO),
 	define_mask(DLM_GLUE),
-	define_mask(UPTODATE),
 	define_mask(VOTE),
 	define_mask(CONN),
 	define_mask(QUORUM),
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index ad267f614528..7a86add27f69 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -95,7 +95,6 @@
 #define ML_DLM_RECOVERY	0x0000000000001000ULL /* dlm master functions */
 #define ML_AIO		0x0000000000002000ULL /* ocfs2 aio read and write */
 #define ML_DLM_GLUE	0x0000000000080000ULL /* ocfs2 dlm glue layer */
-#define ML_UPTODATE	0x0000000000200000ULL /* ocfs2 caching sequence #'s */
 #define ML_VOTE		0x0000000001000000ULL /* ocfs2 node messaging  */
 #define ML_CONN		0x0000000004000000ULL /* net connection management */
 #define ML_QUORUM	0x0000000008000000ULL /* net connection quorum */
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 90dc79a22a59..a1dae5bb54ac 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -2649,6 +2649,8 @@ DEFINE_OCFS2_ULL_UINT_EVENT(ocfs2_read_blocks_sync);
 
 DEFINE_OCFS2_ULL_EVENT(ocfs2_read_blocks_sync_jbd);
 
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_read_blocks_from_disk);
+
 DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_read_blocks_bh);
 
 DEFINE_OCFS2_ULL_INT_INT_INT_EVENT(ocfs2_read_blocks_end);
@@ -2688,6 +2690,46 @@ TRACE_EVENT(ocfs2_read_blocks_begin,
 );
 
 /* End of trace events for fs/ocfs2/buffer_head_io.c. */
+
+/* Trace events for fs/ocfs2/uptodate.c. */
+
+DEFINE_OCFS2_ULL_EVENT(ocfs2_purge_copied_metadata_tree);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_metadata_cache_purge);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_buffer_cached_begin);
+
+TRACE_EVENT(ocfs2_buffer_cached_end,
+	TP_PROTO(int index, void *item),
+	TP_ARGS(index, item),
+	TP_STRUCT__entry(
+		__field(int, index)
+		__field(void *, item)
+	),
+	TP_fast_assign(
+		__entry->index = index;
+		__entry->item = item;
+	),
+	TP_printk("%d %p", __entry->index, __entry->item)
+);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_append_cache_array);
+
+DEFINE_OCFS2_ULL_ULL_UINT_EVENT(ocfs2_insert_cache_tree);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_expand_cache);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_set_buffer_uptodate);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_set_buffer_uptodate_begin);
+
+DEFINE_OCFS2_ULL_UINT_UINT_EVENT(ocfs2_remove_metadata_array);
+
+DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_remove_metadata_tree);
+
+DEFINE_OCFS2_ULL_ULL_UINT_UINT_EVENT(ocfs2_remove_block_from_cache);
+
+/* End of trace events for fs/ocfs2/uptodate.c. */
 #endif /* _TRACE_OCFS2_H */
 
 /* This part must be outside protection */
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index a0a120e82b97..52eaf33d346f 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -54,14 +54,13 @@
 #include <linux/buffer_head.h>
 #include <linux/rbtree.h>
 
-#define MLOG_MASK_PREFIX ML_UPTODATE
-
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
 
 #include "inode.h"
 #include "uptodate.h"
+#include "ocfs2_trace.h"
 
 struct ocfs2_meta_cache_item {
 	struct rb_node	c_node;
@@ -152,8 +151,8 @@ static unsigned int ocfs2_purge_copied_metadata_tree(struct rb_root *root)
 	while ((node = rb_last(root)) != NULL) {
 		item = rb_entry(node, struct ocfs2_meta_cache_item, c_node);
 
-		mlog(0, "Purge item %llu\n",
-		     (unsigned long long) item->c_block);
+		trace_ocfs2_purge_copied_metadata_tree(
+					(unsigned long long) item->c_block);
 
 		rb_erase(&item->c_node, root);
 		kmem_cache_free(ocfs2_uptodate_cachep, item);
@@ -180,9 +179,9 @@ void ocfs2_metadata_cache_purge(struct ocfs2_caching_info *ci)
 	tree = !(ci->ci_flags & OCFS2_CACHE_FL_INLINE);
 	to_purge = ci->ci_num_cached;
 
-	mlog(0, "Purge %u %s items from Owner %llu\n", to_purge,
-	     tree ? "array" : "tree",
-	     (unsigned long long)ocfs2_metadata_cache_owner(ci));
+	trace_ocfs2_metadata_cache_purge(
+		(unsigned long long)ocfs2_metadata_cache_owner(ci),
+		to_purge, tree);
 
 	/* If we're a tree, save off the root so that we can safely
 	 * initialize the cache. We do the work to free tree members
@@ -249,10 +248,10 @@ static int ocfs2_buffer_cached(struct ocfs2_caching_info *ci,
 
 	ocfs2_metadata_cache_lock(ci);
 
-	mlog(0, "Owner %llu, query block %llu (inline = %u)\n",
-	     (unsigned long long)ocfs2_metadata_cache_owner(ci),
-	     (unsigned long long) bh->b_blocknr,
-	     !!(ci->ci_flags & OCFS2_CACHE_FL_INLINE));
+	trace_ocfs2_buffer_cached_begin(
+		(unsigned long long)ocfs2_metadata_cache_owner(ci),
+		(unsigned long long) bh->b_blocknr,
+		!!(ci->ci_flags & OCFS2_CACHE_FL_INLINE));
 
 	if (ci->ci_flags & OCFS2_CACHE_FL_INLINE)
 		index = ocfs2_search_cache_array(ci, bh->b_blocknr);
@@ -261,7 +260,7 @@ static int ocfs2_buffer_cached(struct ocfs2_caching_info *ci,
 
 	ocfs2_metadata_cache_unlock(ci);
 
-	mlog(0, "index = %d, item = %p\n", index, item);
+	trace_ocfs2_buffer_cached_end(index, item);
 
 	return (index != -1) || (item != NULL);
 }
@@ -306,8 +305,9 @@ static void ocfs2_append_cache_array(struct ocfs2_caching_info *ci,
 {
 	BUG_ON(ci->ci_num_cached >= OCFS2_CACHE_INFO_MAX_ARRAY);
 
-	mlog(0, "block %llu takes position %u\n", (unsigned long long) block,
-	     ci->ci_num_cached);
+	trace_ocfs2_append_cache_array(
+		(unsigned long long)ocfs2_metadata_cache_owner(ci),
+		(unsigned long long)block, ci->ci_num_cached);
 
 	ci->ci_cache.ci_array[ci->ci_num_cached] = block;
 	ci->ci_num_cached++;
@@ -324,8 +324,9 @@ static void __ocfs2_insert_cache_tree(struct ocfs2_caching_info *ci,
 	struct rb_node **p = &ci->ci_cache.ci_tree.rb_node;
 	struct ocfs2_meta_cache_item *tmp;
 
-	mlog(0, "Insert block %llu num = %u\n", (unsigned long long) block,
-	     ci->ci_num_cached);
+	trace_ocfs2_insert_cache_tree(
+		(unsigned long long)ocfs2_metadata_cache_owner(ci),
+		(unsigned long long)block, ci->ci_num_cached);
 
 	while(*p) {
 		parent = *p;
@@ -389,9 +390,9 @@ static void ocfs2_expand_cache(struct ocfs2_caching_info *ci,
 		tree[i] = NULL;
 	}
 
-	mlog(0, "Expanded %llu to a tree cache: flags 0x%x, num = %u\n",
-	     (unsigned long long)ocfs2_metadata_cache_owner(ci),
-	     ci->ci_flags, ci->ci_num_cached);
+	trace_ocfs2_expand_cache(
+		(unsigned long long)ocfs2_metadata_cache_owner(ci),
+		ci->ci_flags, ci->ci_num_cached);
 }
 
 /* Slow path function - memory allocation is necessary. See the
@@ -405,9 +406,9 @@ static void __ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci,
 	struct ocfs2_meta_cache_item *tree[OCFS2_CACHE_INFO_MAX_ARRAY] =
 		{ NULL, };
 
-	mlog(0, "Owner %llu, block %llu, expand = %d\n",
-	     (unsigned long long)ocfs2_metadata_cache_owner(ci),
-	     (unsigned long long)block, expand_tree);
+	trace_ocfs2_set_buffer_uptodate(
+		(unsigned long long)ocfs2_metadata_cache_owner(ci),
+		(unsigned long long)block, expand_tree);
 
 	new = kmem_cache_alloc(ocfs2_uptodate_cachep, GFP_NOFS);
 	if (!new) {
@@ -433,7 +434,6 @@ static void __ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci,
 
 	ocfs2_metadata_cache_lock(ci);
 	if (ocfs2_insert_can_use_array(ci)) {
-		mlog(0, "Someone cleared the tree underneath us\n");
 		/* Ok, items were removed from the cache in between
 		 * locks. Detect this and revert back to the fast path */
 		ocfs2_append_cache_array(ci, block);
@@ -490,9 +490,9 @@ void ocfs2_set_buffer_uptodate(struct ocfs2_caching_info *ci,
 	if (ocfs2_buffer_cached(ci, bh))
 		return;
 
-	mlog(0, "Owner %llu, inserting block %llu\n",
-	     (unsigned long long)ocfs2_metadata_cache_owner(ci),
-	     (unsigned long long)bh->b_blocknr);
+	trace_ocfs2_set_buffer_uptodate_begin(
+		(unsigned long long)ocfs2_metadata_cache_owner(ci),
+		(unsigned long long)bh->b_blocknr);
 
 	/* No need to recheck under spinlock - insertion is guarded by
 	 * co_io_lock() */
@@ -542,8 +542,9 @@ static void ocfs2_remove_metadata_array(struct ocfs2_caching_info *ci,
 	BUG_ON(index >= ci->ci_num_cached);
 	BUG_ON(!ci->ci_num_cached);
 
-	mlog(0, "remove index %d (num_cached = %u\n", index,
-	     ci->ci_num_cached);
+	trace_ocfs2_remove_metadata_array(
+		(unsigned long long)ocfs2_metadata_cache_owner(ci),
+		index, ci->ci_num_cached);
 
 	ci->ci_num_cached--;
 
@@ -559,8 +560,9 @@ static void ocfs2_remove_metadata_array(struct ocfs2_caching_info *ci,
 static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci,
 				       struct ocfs2_meta_cache_item *item)
 {
-	mlog(0, "remove block %llu from tree\n",
-	     (unsigned long long) item->c_block);
+	trace_ocfs2_remove_metadata_tree(
+		(unsigned long long)ocfs2_metadata_cache_owner(ci),
+		(unsigned long long)item->c_block);
 
 	rb_erase(&item->c_node, &ci->ci_cache.ci_tree);
 	ci->ci_num_cached--;
@@ -573,10 +575,10 @@ static void ocfs2_remove_block_from_cache(struct ocfs2_caching_info *ci,
 	struct ocfs2_meta_cache_item *item = NULL;
 
 	ocfs2_metadata_cache_lock(ci);
-	mlog(0, "Owner %llu, remove %llu, items = %u, array = %u\n",
-	     (unsigned long long)ocfs2_metadata_cache_owner(ci),
-	     (unsigned long long) block, ci->ci_num_cached,
-	     ci->ci_flags & OCFS2_CACHE_FL_INLINE);
+	trace_ocfs2_remove_block_from_cache(
+		(unsigned long long)ocfs2_metadata_cache_owner(ci),
+		(unsigned long long) block, ci->ci_num_cached,
+		ci->ci_flags);
 
 	if (ci->ci_flags & OCFS2_CACHE_FL_INLINE) {
 		index = ocfs2_search_cache_array(ci, block);
@@ -626,9 +628,6 @@ int __init init_ocfs2_uptodate_cache(void)
 	if (!ocfs2_uptodate_cachep)
 		return -ENOMEM;
 
-	mlog(0, "%u inlined cache items per inode.\n",
-	     OCFS2_CACHE_INFO_MAX_ARRAY);
-
 	return 0;
 }
 
-- 
cgit v1.2.3


From 75ba49ae0637c15880fe8f1c8a89333afa052bd8 Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Mon, 21 Feb 2011 11:19:27 +0800
Subject: ocfs2: Remove masklog ML_AIO.

There is no user for masklog AIO, so remove it.

Signed-off-by: Tao Ma <boyu.mt@taobao.com>
---
 fs/ocfs2/cluster/masklog.c | 1 -
 fs/ocfs2/cluster/masklog.h | 1 -
 2 files changed, 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index bbea7b6cb4d3..07ac24fd9252 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -91,7 +91,6 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
 	define_mask(DLM_THREAD),
 	define_mask(DLM_MASTER),
 	define_mask(DLM_RECOVERY),
-	define_mask(AIO),
 	define_mask(DLM_GLUE),
 	define_mask(VOTE),
 	define_mask(CONN),
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 7a86add27f69..0fe96c2830c2 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -93,7 +93,6 @@
 #define ML_DLM_THREAD	0x0000000000000400ULL /* dlm domain thread */
 #define ML_DLM_MASTER	0x0000000000000800ULL /* dlm master functions */
 #define ML_DLM_RECOVERY	0x0000000000001000ULL /* dlm master functions */
-#define ML_AIO		0x0000000000002000ULL /* ocfs2 aio read and write */
 #define ML_DLM_GLUE	0x0000000000080000ULL /* ocfs2 dlm glue layer */
 #define ML_VOTE		0x0000000001000000ULL /* ocfs2 node messaging  */
 #define ML_CONN		0x0000000004000000ULL /* net connection management */
-- 
cgit v1.2.3


From b4e1b7e88b2c87c358c2a88bec0c76d25accc604 Mon Sep 17 00:00:00 2001
From: Tao Ma <boyu.mt@taobao.com>
Date: Mon, 21 Feb 2011 11:19:43 +0800
Subject: ocfs2: Make the left masklogs compat.

Since we have removed almost all of the masklogs in fs/ocfs2/*,
make the left masklogs compat.

Signed-off-by: Tao Ma <boyu.mt@taobao.com>
---
 fs/ocfs2/cluster/masklog.h | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 0fe96c2830c2..baa2b9ef7eef 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -82,23 +82,23 @@
 
 /* bits that are frequently given and infrequently matched in the low word */
 /* NOTE: If you add a flag, you need to also update masklog.c! */
-#define ML_TCP		0x0000000000000004ULL /* net cluster/tcp.c */
-#define ML_MSG		0x0000000000000008ULL /* net network messages */
-#define ML_SOCKET	0x0000000000000010ULL /* net socket lifetime */
-#define ML_HEARTBEAT	0x0000000000000020ULL /* hb all heartbeat tracking */
-#define ML_HB_BIO	0x0000000000000040ULL /* hb io tracing */
-#define ML_DLMFS	0x0000000000000080ULL /* dlm user dlmfs */
-#define ML_DLM		0x0000000000000100ULL /* dlm general debugging */
-#define ML_DLM_DOMAIN	0x0000000000000200ULL /* dlm domain debugging */
-#define ML_DLM_THREAD	0x0000000000000400ULL /* dlm domain thread */
-#define ML_DLM_MASTER	0x0000000000000800ULL /* dlm master functions */
-#define ML_DLM_RECOVERY	0x0000000000001000ULL /* dlm master functions */
-#define ML_DLM_GLUE	0x0000000000080000ULL /* ocfs2 dlm glue layer */
-#define ML_VOTE		0x0000000001000000ULL /* ocfs2 node messaging  */
-#define ML_CONN		0x0000000004000000ULL /* net connection management */
-#define ML_QUORUM	0x0000000008000000ULL /* net connection quorum */
-#define ML_BASTS	0x0000000100000000ULL /* dlmglue asts and basts */
-#define ML_CLUSTER	0x0000000400000000ULL /* cluster stack */
+#define ML_TCP		0x0000000000000001ULL /* net cluster/tcp.c */
+#define ML_MSG		0x0000000000000002ULL /* net network messages */
+#define ML_SOCKET	0x0000000000000004ULL /* net socket lifetime */
+#define ML_HEARTBEAT	0x0000000000000008ULL /* hb all heartbeat tracking */
+#define ML_HB_BIO	0x0000000000000010ULL /* hb io tracing */
+#define ML_DLMFS	0x0000000000000020ULL /* dlm user dlmfs */
+#define ML_DLM		0x0000000000000040ULL /* dlm general debugging */
+#define ML_DLM_DOMAIN	0x0000000000000080ULL /* dlm domain debugging */
+#define ML_DLM_THREAD	0x0000000000000100ULL /* dlm domain thread */
+#define ML_DLM_MASTER	0x0000000000000200ULL /* dlm master functions */
+#define ML_DLM_RECOVERY	0x0000000000000400ULL /* dlm master functions */
+#define ML_DLM_GLUE	0x0000000000000800ULL /* ocfs2 dlm glue layer */
+#define ML_VOTE		0x0000000000001000ULL /* ocfs2 node messaging  */
+#define ML_CONN		0x0000000000002000ULL /* net connection management */
+#define ML_QUORUM	0x0000000000004000ULL /* net connection quorum */
+#define ML_BASTS	0x0000000000008000ULL /* dlmglue asts and basts */
+#define ML_CLUSTER	0x0000000000010000ULL /* cluster stack */
 
 /* bits that are infrequently given and frequently matched in the high word */
 #define ML_ERROR	0x1000000000000000ULL /* sent to KERN_ERR */
-- 
cgit v1.2.3