From f5a73672d1811f2fb1dcb62ca90ceb12b2050ae7 Mon Sep 17 00:00:00 2001
From: Neil Brown <neilb@suse.de>
Date: Tue, 10 Aug 2010 10:20:05 -0400
Subject: NFS: allow close-to-open cache semantics to apply to root of NFS
 filesystem

To obey NFS cache semantics, the client must verify the cached
attributes when a file is opened.  In most cases this is done by a call to
d_validate as one of the last steps in path_walk.

However for the root of a filesystem, d_validate is only ever called
on the mounted-on filesystem (except when the path ends '.' or '..').
So NFS has no chance to validate the attributes.

So, in nfs_opendir, we revalidate the attributes if the opened
directory is the mountpoint.  This may cause double-validation for "."
and ".." lookups, but that is better than missing regular /path/name
lookups completely.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 29539ceeb745..bd91b2778315 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -140,6 +140,13 @@ nfs_opendir(struct inode *inode, struct file *filp)
 
 	/* Call generic open code in order to cache credentials */
 	res = nfs_open(inode, filp);
+	if (filp->f_path.dentry == filp->f_path.mnt->mnt_root) {
+		/* This is a mountpoint, so d_revalidate will never
+		 * have been called, so we need to refresh the
+		 * inode (for close-open consistency) ourselves.
+		 */
+		__nfs_revalidate_inode(NFS_SERVER(inode), inode);
+	}
 	return res;
 }
 
-- 
cgit v1.2.3


From 9b00c64318cc337846a7a08a5678f5f19aeff188 Mon Sep 17 00:00:00 2001
From: "Patrick J. LoPresti" <lopresti@gmail.com>
Date: Tue, 10 Aug 2010 17:28:01 -0400
Subject: nfs: Add "lookupcache" to displayed mount options

Running "cat /proc/mounts" fails to display the "lookupcache" option.
This oversight cost me a bunch of wasted time recently.

The following simple patch fixes it.

CC: stable <stable@kernel.org>
Signed-off-by: Patrick LoPresti <lopresti@gmail.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/super.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index f1ae39f6cb02..3d0d63c00304 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -655,6 +655,13 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
 
 	if (nfss->options & NFS_OPTION_FSCACHE)
 		seq_printf(m, ",fsc");
+
+	if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) {
+		if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONE)
+			seq_printf(m, ",lookupcache=none");
+		else
+			seq_printf(m, ",lookupcache=pos");
+	}
 }
 
 /*
-- 
cgit v1.2.3


From 5d7ca35a182a626f8ed5596023ad42eb219a332e Mon Sep 17 00:00:00 2001
From: Davidlohr Bueso <dave@gnu.org>
Date: Wed, 11 Aug 2010 12:42:15 -0400
Subject: nfs: Remove redundant NULL check upon kfree()

Signed-off-by: Davidlohr Bueso <dave@gnu.org>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/nfs4proc.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 7ffbb98ddec3..6b44bbfb7d8a 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2273,8 +2273,7 @@ static int nfs4_get_referral(struct inode *dir, const struct qstr *name, struct
 out:
 	if (page)
 		__free_page(page);
-	if (locations)
-		kfree(locations);
+	kfree(locations);
 	return status;
 }
 
-- 
cgit v1.2.3


From 0702099bd86c33c2dcdbd3963433a61f3f503901 Mon Sep 17 00:00:00 2001
From: "J. R. Okajima" <hooanon05@yahoo.co.jp>
Date: Wed, 11 Aug 2010 13:10:16 -0400
Subject: NFS: fix the return value of nfs_file_fsync()

By the commit af7fa16 2010-08-03 NFS: Fix up the fsync code
close(2) became returning the non-zero value even if it went well.
nfs_file_fsync() should return 0 when "status" is positive.

Signed-off-by: J. R. Okajima <hooanon05@yahoo.co.jp>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 2d141a74ae82..eb51bd6201da 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -323,7 +323,7 @@ nfs_file_fsync(struct file *file, int datasync)
 	have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags);
 	if (have_error)
 		ret = xchg(&ctx->error, 0);
-	if (!ret)
+	if (!ret && status < 0)
 		ret = status;
 	return ret;
 }
-- 
cgit v1.2.3


From af4e36318edb848fcc0a8d5f75000ca00cdc7595 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Fri, 13 Aug 2010 12:42:24 +0900
Subject: nilfs2: fix list corruption after ifile creation failure

If nilfs_attach_checkpoint() gets a memory allocation failure during
creation of ifile, it will return without removing nilfs_sb_info
struct from ns_supers list.  When a concurrently mounted snapshot is
unmounted or another new snapshot is mounted after that, this causes
kernel oops as below:

> BUG: unable to handle kernel NULL pointer dereference at (null)
> IP: [<f83662ff>] nilfs_find_sbinfo+0x74/0xa4 [nilfs2]
> *pde = 00000000
> Oops: 0000 [#1] SMP
<snip>
> Call Trace:
>  [<f835dc29>] ? nilfs_get_sb+0x165/0x532 [nilfs2]
>  [<c1173c87>] ? ida_get_new_above+0x16d/0x187
>  [<c109a7f8>] ? alloc_vfsmnt+0x7e/0x10a
>  [<c1070790>] ? kstrdup+0x2c/0x40
>  [<c1089041>] ? vfs_kern_mount+0x96/0x14e
>  [<c108913d>] ? do_kern_mount+0x32/0xbd
>  [<c109b331>] ? do_mount+0x642/0x6a1
>  [<c101a415>] ? do_page_fault+0x0/0x2d1
>  [<c1099c00>] ? copy_mount_options+0x80/0xe2
>  [<c10705d8>] ? strndup_user+0x48/0x67
>  [<c109b3f1>] ? sys_mount+0x61/0x90
>  [<c10027cc>] ? sysenter_do_call+0x12/0x22

This fixes the problem.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Tested-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Cc: stable@kernel.org
---
 fs/nilfs2/super.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 1fa86b9df73b..bee60c04109a 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -400,9 +400,10 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
 	list_add(&sbi->s_list, &nilfs->ns_supers);
 	up_write(&nilfs->ns_super_sem);
 
+	err = -ENOMEM;
 	sbi->s_ifile = nilfs_ifile_new(sbi, nilfs->ns_inode_size);
 	if (!sbi->s_ifile)
-		return -ENOMEM;
+		goto delist;
 
 	down_read(&nilfs->ns_segctor_sem);
 	err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp,
@@ -433,6 +434,7 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
 	nilfs_mdt_destroy(sbi->s_ifile);
 	sbi->s_ifile = NULL;
 
+ delist:
 	down_write(&nilfs->ns_super_sem);
 	list_del_init(&sbi->s_list);
 	up_write(&nilfs->ns_super_sem);
-- 
cgit v1.2.3


From ea1a16f7168ac19d974ac51b47593b92280e7992 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sun, 15 Aug 2010 20:16:11 +0900
Subject: nilfs2: fix false warning saying one of two super blocks is broken

After applying commit b2ac86e1, the following message got appeared
after unclean shutdown:

> NILFS warning: broken superblock. using spare superblock.

This turns out to be a false message due to the change which updates
two super blocks alternately.  The secondary super block now can be
selected if it's newer than the primary one.

This kills the false warning by suppressing it if another super block
is not actually broken.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/the_nilfs.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 37de1f062d81..6af1c0073e9e 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -608,11 +608,11 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
 		return -EINVAL;
 	}
 
-	if (swp) {
+	if (!valid[!swp])
 		printk(KERN_WARNING "NILFS warning: broken superblock. "
 		       "using spare superblock.\n");
+	if (swp)
 		nilfs_swap_super_block(nilfs);
-	}
 
 	nilfs->ns_sbwcount = 0;
 	nilfs->ns_sbwtime = le64_to_cpu(sbp[0]->s_wtime);
-- 
cgit v1.2.3


From 5d9ac7fd32f600f9451ea58abdb07f7ed42e921d Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Thu, 5 Aug 2010 13:58:22 -0400
Subject: cifs: clean up error handling in cifs_mknod

Get rid of some nesting and add a label we can goto on error.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/dir.c | 149 +++++++++++++++++++++++++++++-----------------------------
 1 file changed, 74 insertions(+), 75 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 578d88c5b46e..f17d50047f07 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -496,6 +496,11 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
 	struct cifsTconInfo *pTcon;
 	char *full_path = NULL;
 	struct inode *newinode = NULL;
+	int oplock = 0;
+	u16 fileHandle;
+	FILE_ALL_INFO *buf = NULL;
+	unsigned int bytes_written;
+	struct win_dev *pdev;
 
 	if (!old_valid_dev(device_number))
 		return -EINVAL;
@@ -506,9 +511,12 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
 	pTcon = cifs_sb->tcon;
 
 	full_path = build_path_from_dentry(direntry);
-	if (full_path == NULL)
+	if (full_path == NULL) {
 		rc = -ENOMEM;
-	else if (pTcon->unix_ext) {
+		goto mknod_out;
+	}
+
+	if (pTcon->unix_ext) {
 		struct cifs_unix_set_info_args args = {
 			.mode	= mode & ~current_umask(),
 			.ctime	= NO_CHANGE_64,
@@ -527,87 +535,78 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode,
 					    cifs_sb->local_nls,
 					    cifs_sb->mnt_cifs_flags &
 						CIFS_MOUNT_MAP_SPECIAL_CHR);
+		if (rc)
+			goto mknod_out;
 
-		if (!rc) {
-			rc = cifs_get_inode_info_unix(&newinode, full_path,
+		rc = cifs_get_inode_info_unix(&newinode, full_path,
 						inode->i_sb, xid);
-			if (pTcon->nocase)
-				direntry->d_op = &cifs_ci_dentry_ops;
-			else
-				direntry->d_op = &cifs_dentry_ops;
-			if (rc == 0)
-				d_instantiate(direntry, newinode);
-		}
-	} else {
-		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
-			int oplock = 0;
-			u16 fileHandle;
-			FILE_ALL_INFO *buf;
+		if (pTcon->nocase)
+			direntry->d_op = &cifs_ci_dentry_ops;
+		else
+			direntry->d_op = &cifs_dentry_ops;
 
-			cFYI(1, "sfu compat create special file");
+		if (rc == 0)
+			d_instantiate(direntry, newinode);
+		goto mknod_out;
+	}
 
-			buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
-			if (buf == NULL) {
-				kfree(full_path);
-				rc = -ENOMEM;
-				FreeXid(xid);
-				return rc;
-			}
+	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL))
+		goto mknod_out;
 
-			rc = CIFSSMBOpen(xid, pTcon, full_path,
-					 FILE_CREATE, /* fail if exists */
-					 GENERIC_WRITE /* BB would
-					  WRITE_OWNER | WRITE_DAC be better? */,
-					 /* Create a file and set the
-					    file attribute to SYSTEM */
-					 CREATE_NOT_DIR | CREATE_OPTION_SPECIAL,
-					 &fileHandle, &oplock, buf,
-					 cifs_sb->local_nls,
-					 cifs_sb->mnt_cifs_flags &
-					    CIFS_MOUNT_MAP_SPECIAL_CHR);
-
-			/* BB FIXME - add handling for backlevel servers
-			   which need legacy open and check for all
-			   calls to SMBOpen for fallback to SMBLeagcyOpen */
-			if (!rc) {
-				/* BB Do not bother to decode buf since no
-				   local inode yet to put timestamps in,
-				   but we can reuse it safely */
-				unsigned int bytes_written;
-				struct win_dev *pdev;
-				pdev = (struct win_dev *)buf;
-				if (S_ISCHR(mode)) {
-					memcpy(pdev->type, "IntxCHR", 8);
-					pdev->major =
-					      cpu_to_le64(MAJOR(device_number));
-					pdev->minor =
-					      cpu_to_le64(MINOR(device_number));
-					rc = CIFSSMBWrite(xid, pTcon,
-						fileHandle,
-						sizeof(struct win_dev),
-						0, &bytes_written, (char *)pdev,
-						NULL, 0);
-				} else if (S_ISBLK(mode)) {
-					memcpy(pdev->type, "IntxBLK", 8);
-					pdev->major =
-					      cpu_to_le64(MAJOR(device_number));
-					pdev->minor =
-					      cpu_to_le64(MINOR(device_number));
-					rc = CIFSSMBWrite(xid, pTcon,
-						fileHandle,
-						sizeof(struct win_dev),
-						0, &bytes_written, (char *)pdev,
-						NULL, 0);
-				} /* else if(S_ISFIFO */
-				CIFSSMBClose(xid, pTcon, fileHandle);
-				d_drop(direntry);
-			}
-			kfree(buf);
-			/* add code here to set EAs */
-		}
+
+	cFYI(1, "sfu compat create special file");
+
+	buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
+	if (buf == NULL) {
+		kfree(full_path);
+		rc = -ENOMEM;
+		FreeXid(xid);
+		return rc;
 	}
 
+	/* FIXME: would WRITE_OWNER | WRITE_DAC be better? */
+	rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_CREATE,
+			 GENERIC_WRITE, CREATE_NOT_DIR | CREATE_OPTION_SPECIAL,
+			 &fileHandle, &oplock, buf, cifs_sb->local_nls,
+			 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+	if (rc)
+		goto mknod_out;
+
+	/* BB Do not bother to decode buf since no local inode yet to put
+	 * timestamps in, but we can reuse it safely */
+
+	pdev = (struct win_dev *)buf;
+	if (S_ISCHR(mode)) {
+		memcpy(pdev->type, "IntxCHR", 8);
+		pdev->major =
+		      cpu_to_le64(MAJOR(device_number));
+		pdev->minor =
+		      cpu_to_le64(MINOR(device_number));
+		rc = CIFSSMBWrite(xid, pTcon,
+			fileHandle,
+			sizeof(struct win_dev),
+			0, &bytes_written, (char *)pdev,
+			NULL, 0);
+	} else if (S_ISBLK(mode)) {
+		memcpy(pdev->type, "IntxBLK", 8);
+		pdev->major =
+		      cpu_to_le64(MAJOR(device_number));
+		pdev->minor =
+		      cpu_to_le64(MINOR(device_number));
+		rc = CIFSSMBWrite(xid, pTcon,
+			fileHandle,
+			sizeof(struct win_dev),
+			0, &bytes_written, (char *)pdev,
+			NULL, 0);
+	} /* else if (S_ISFIFO) */
+	CIFSSMBClose(xid, pTcon, fileHandle);
+	d_drop(direntry);
+
+	/* FIXME: add code here to set EAs */
+
+mknod_out:
 	kfree(full_path);
+	kfree(buf);
 	FreeXid(xid);
 	return rc;
 }
-- 
cgit v1.2.3


From 232341ba7fa15115d40f6aa0f8dd14e96e3ad375 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Thu, 5 Aug 2010 13:58:38 -0400
Subject: cifs: consolidate error handling in several functions

cifs has a lot of complicated functions that have to clean up things on
error, but some of them don't have all of the cleanup code
well-consolidated. Clean up and consolidate error handling in several
functions.

This is in preparation of later patches that will need to put references
to the tcon link container.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/dir.c  | 8 +++-----
 fs/cifs/file.c | 3 +--
 2 files changed, 4 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index f17d50047f07..f9ed0751cc12 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -305,8 +305,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 	full_path = build_path_from_dentry(direntry);
 	if (full_path == NULL) {
 		rc = -ENOMEM;
-		FreeXid(xid);
-		return rc;
+		goto cifs_create_out;
 	}
 
 	if (oplockEnabled)
@@ -365,9 +364,8 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode,
 
 	buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL);
 	if (buf == NULL) {
-		kfree(full_path);
-		FreeXid(xid);
-		return -ENOMEM;
+		rc = -ENOMEM;
+		goto cifs_create_out;
 	}
 
 	/*
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index db11fdef0e92..de748c652d11 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -242,8 +242,7 @@ int cifs_open(struct inode *inode, struct file *file)
 	full_path = build_path_from_dentry(file->f_path.dentry);
 	if (full_path == NULL) {
 		rc = -ENOMEM;
-		FreeXid(xid);
-		return rc;
+		goto out;
 	}
 
 	cFYI(1, "inode = 0x%p file flags are 0x%x for %s",
-- 
cgit v1.2.3


From df486a25900f4dba9cdc3886c4ac871951c6aef3 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Tue, 17 Aug 2010 17:42:45 -0400
Subject: NFS: Fix the selection of security flavours in Kconfig

Randy Dunlap reports:

ERROR: "svc_gss_principal" [fs/nfs/nfs.ko] undefined!


because in fs/nfs/Kconfig, NFS_V4 selects RPCSEC_GSS_KRB5
and/or in fs/nfsd/Kconfig, NFSD_V4 selects RPCSEC_GSS_KRB5.

RPCSEC_GSS_KRB5 does 5 selects, but none of these is enforced/followed
by the fs/nfs[d]/Kconfig configs:

	select SUNRPC_GSS
	select CRYPTO
	select CRYPTO_MD5
	select CRYPTO_DES
	select CRYPTO_CBC

Reported-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: J. Bruce Fields <bfields@fieldses.org>
Acked-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/Kconfig     | 1 -
 fs/nfsd/Kconfig    | 1 -
 net/sunrpc/Kconfig | 9 +++++----
 3 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index cc1bb33b59b8..2ddc384ec042 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -63,7 +63,6 @@ config NFS_V3_ACL
 config NFS_V4
 	bool "NFS client support for NFS version 4"
 	depends on NFS_FS
-	select RPCSEC_GSS_KRB5
 	help
 	  This option enables support for version 4 of the NFS protocol
 	  (RFC 3530) in the kernel's NFS client.
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 503b9da159a3..95932f523aef 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -69,7 +69,6 @@ config NFSD_V4
 	depends on NFSD && PROC_FS && EXPERIMENTAL
 	select NFSD_V3
 	select FS_POSIX_ACL
-	select RPCSEC_GSS_KRB5
 	help
 	  This option enables support in your system's NFS server for
 	  version 4 of the NFS protocol (RFC 3530).
diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig
index 443c161eb8bd..3376d7657185 100644
--- a/net/sunrpc/Kconfig
+++ b/net/sunrpc/Kconfig
@@ -18,10 +18,11 @@ config SUNRPC_XPRT_RDMA
 	  If unsure, say N.
 
 config RPCSEC_GSS_KRB5
-	tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)"
-	depends on SUNRPC && EXPERIMENTAL
+	tristate
+	depends on SUNRPC && CRYPTO
+	prompt "Secure RPC: Kerberos V mechanism" if !(NFS_V4 || NFSD_V4)
+	default y
 	select SUNRPC_GSS
-	select CRYPTO
 	select CRYPTO_MD5
 	select CRYPTO_DES
 	select CRYPTO_CBC
@@ -34,7 +35,7 @@ config RPCSEC_GSS_KRB5
 	  available from http://linux-nfs.org/.  In addition, user-space
 	  Kerberos support should be installed.
 
-	  If unsure, say N.
+	  If unsure, say Y.
 
 config RPCSEC_GSS_SPKM3
 	tristate "Secure RPC: SPKM3 mechanism (EXPERIMENTAL)"
-- 
cgit v1.2.3


From d7627467b7a8dd6944885290a03a07ceb28c10eb Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 17 Aug 2010 23:52:56 +0100
Subject: Make do_execve() take a const filename pointer

Make do_execve() take a const filename pointer so that kernel_execve() compiles
correctly on ARM:

arch/arm/kernel/sys_arm.c:88: warning: passing argument 1 of 'do_execve' discards qualifiers from pointer target type

This also requires the argv and envp arguments to be consted twice, once for
the pointer array and once for the strings the array points to.  This is
because do_execve() passes a pointer to the filename (now const) to
copy_strings_kernel().  A simpler alternative would be to cast the filename
pointer in do_execve() when it's passed to copy_strings_kernel().

do_execve() may not change any of the strings it is passed as part of the argv
or envp lists as they are some of them in .rodata, so marking these strings as
const should be fine.

Further kernel_execve() and sys_execve() need to be changed to match.

This has been test built on x86_64, frv, arm and mips.

Signed-off-by: David Howells <dhowells@redhat.com>
Tested-by: Ralf Baechle <ralf@linux-mips.org>
Acked-by: Russell King <rmk+kernel@arm.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/kernel/process.c             |  5 +++--
 arch/arm/kernel/sys_arm.c               | 14 +++++++++-----
 arch/avr32/kernel/process.c             |  5 +++--
 arch/avr32/kernel/sys_avr32.c           |  4 +++-
 arch/blackfin/kernel/process.c          |  4 +++-
 arch/cris/arch-v10/kernel/process.c     |  4 +++-
 arch/cris/arch-v32/kernel/process.c     |  6 ++++--
 arch/frv/kernel/process.c               |  5 +++--
 arch/h8300/kernel/process.c             |  5 ++++-
 arch/h8300/kernel/sys_h8300.c           |  4 +++-
 arch/ia64/kernel/process.c              |  4 +++-
 arch/m32r/kernel/process.c              |  4 ++--
 arch/m32r/kernel/sys_m32r.c             |  4 +++-
 arch/m68k/kernel/process.c              |  4 +++-
 arch/m68k/kernel/sys_m68k.c             |  4 +++-
 arch/m68knommu/kernel/process.c         |  4 +++-
 arch/m68knommu/kernel/sys_m68k.c        |  4 +++-
 arch/microblaze/kernel/sys_microblaze.c | 10 +++++++---
 arch/mips/kernel/syscall.c              | 10 +++++++---
 arch/mn10300/kernel/process.c           |  4 ++--
 arch/parisc/hpux/fs.c                   |  6 ++++--
 arch/parisc/kernel/process.c            | 15 ++++++++++-----
 arch/powerpc/kernel/process.c           |  5 +++--
 arch/s390/kernel/process.c              |  5 +++--
 arch/score/kernel/sys_score.c           | 10 +++++++---
 arch/sh/kernel/process_32.c             |  7 ++++---
 arch/sh/kernel/process_64.c             |  4 ++--
 arch/sh/kernel/sys_sh32.c               |  4 +++-
 arch/sh/kernel/sys_sh64.c               |  4 +++-
 arch/sparc/kernel/process_32.c          |  6 ++++--
 arch/sparc/kernel/process_64.c          |  4 ++--
 arch/sparc/kernel/sys_sparc_32.c        |  4 +++-
 arch/sparc/kernel/sys_sparc_64.c        |  4 +++-
 arch/tile/kernel/process.c              |  5 +++--
 arch/um/kernel/exec.c                   |  5 +++--
 arch/um/kernel/syscall.c                |  4 +++-
 arch/x86/include/asm/syscalls.h         |  5 +++--
 arch/x86/kernel/process.c               |  5 +++--
 arch/x86/kernel/sys_i386_32.c           |  4 +++-
 arch/xtensa/kernel/process.c            |  5 +++--
 fs/binfmt_misc.c                        |  2 +-
 fs/binfmt_script.c                      |  3 ++-
 fs/exec.c                               | 21 +++++++++++----------
 include/linux/binfmts.h                 |  7 ++++---
 include/linux/sched.h                   |  4 +++-
 include/linux/syscalls.h                |  2 +-
 init/do_mounts_initrd.c                 |  7 ++++---
 init/main.c                             |  6 +++---
 kernel/kmod.c                           |  4 +++-
 security/commoncap.c                    |  2 +-
 50 files changed, 179 insertions(+), 98 deletions(-)

(limited to 'fs')

diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c
index 88e608aebc8c..842dba308eab 100644
--- a/arch/alpha/kernel/process.c
+++ b/arch/alpha/kernel/process.c
@@ -387,8 +387,9 @@ EXPORT_SYMBOL(dump_elf_task_fp);
  * sys_execve() executes a new program.
  */
 asmlinkage int
-do_sys_execve(const char __user *ufilename, char __user * __user *argv,
-	      char __user * __user *envp, struct pt_regs *regs)
+do_sys_execve(const char __user *ufilename,
+	      const char __user *const __user *argv,
+	      const char __user *const __user *envp, struct pt_regs *regs)
 {
 	int error;
 	char *filename;
diff --git a/arch/arm/kernel/sys_arm.c b/arch/arm/kernel/sys_arm.c
index 5b7c541a4c63..62e7c61d0342 100644
--- a/arch/arm/kernel/sys_arm.c
+++ b/arch/arm/kernel/sys_arm.c
@@ -62,8 +62,9 @@ asmlinkage int sys_vfork(struct pt_regs *regs)
 /* sys_execve() executes a new program.
  * This is called indirectly via a small wrapper
  */
-asmlinkage int sys_execve(const char __user *filenamei, char __user * __user *argv,
-			  char __user * __user *envp, struct pt_regs *regs)
+asmlinkage int sys_execve(const char __user *filenamei,
+			  const char __user *const __user *argv,
+			  const char __user *const __user *envp, struct pt_regs *regs)
 {
 	int error;
 	char * filename;
@@ -78,14 +79,17 @@ out:
 	return error;
 }
 
-int kernel_execve(const char *filename, char *const argv[], char *const envp[])
+int kernel_execve(const char *filename,
+		  const char *const argv[],
+		  const char *const envp[])
 {
 	struct pt_regs regs;
 	int ret;
 
 	memset(&regs, 0, sizeof(struct pt_regs));
-	ret = do_execve(filename, (char __user * __user *)argv,
-			(char __user * __user *)envp, &regs);
+	ret = do_execve(filename,
+			(const char __user *const __user *)argv,
+			(const char __user *const __user *)envp, &regs);
 	if (ret < 0)
 		goto out;
 
diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c
index e5daddff397d..9c46aaad11ce 100644
--- a/arch/avr32/kernel/process.c
+++ b/arch/avr32/kernel/process.c
@@ -384,8 +384,9 @@ asmlinkage int sys_vfork(struct pt_regs *regs)
 }
 
 asmlinkage int sys_execve(const char __user *ufilename,
-			  char __user *__user *uargv,
-			  char __user *__user *uenvp, struct pt_regs *regs)
+			  const char __user *const __user *uargv,
+			  const char __user *const __user *uenvp,
+			  struct pt_regs *regs)
 {
 	int error;
 	char *filename;
diff --git a/arch/avr32/kernel/sys_avr32.c b/arch/avr32/kernel/sys_avr32.c
index 459349b5ed5a..62635a09ae3e 100644
--- a/arch/avr32/kernel/sys_avr32.c
+++ b/arch/avr32/kernel/sys_avr32.c
@@ -7,7 +7,9 @@
  */
 #include <linux/unistd.h>
 
-int kernel_execve(const char *file, char **argv, char **envp)
+int kernel_execve(const char *file,
+		  const char *const *argv,
+		  const char *const *envp)
 {
 	register long scno asm("r8") = __NR_execve;
 	register long sc1 asm("r12") = (long)file;
diff --git a/arch/blackfin/kernel/process.c b/arch/blackfin/kernel/process.c
index a566f61c002a..01f98cb964d2 100644
--- a/arch/blackfin/kernel/process.c
+++ b/arch/blackfin/kernel/process.c
@@ -209,7 +209,9 @@ copy_thread(unsigned long clone_flags,
 /*
  * sys_execve() executes a new program.
  */
-asmlinkage int sys_execve(const char __user *name, char __user * __user *argv, char __user * __user *envp)
+asmlinkage int sys_execve(const char __user *name,
+			  const char __user *const __user *argv,
+			  const char __user *const __user *envp)
 {
 	int error;
 	char *filename;
diff --git a/arch/cris/arch-v10/kernel/process.c b/arch/cris/arch-v10/kernel/process.c
index 93f0f64b1326..9a57db6907f5 100644
--- a/arch/cris/arch-v10/kernel/process.c
+++ b/arch/cris/arch-v10/kernel/process.c
@@ -204,7 +204,9 @@ asmlinkage int sys_vfork(long r10, long r11, long r12, long r13, long mof, long
 /*
  * sys_execve() executes a new program.
  */
-asmlinkage int sys_execve(const char *fname, char **argv, char **envp,
+asmlinkage int sys_execve(const char *fname,
+			  const char *const *argv,
+			  const char *const *envp,
 			  long r13, long mof, long srp, 
 			  struct pt_regs *regs)
 {
diff --git a/arch/cris/arch-v32/kernel/process.c b/arch/cris/arch-v32/kernel/process.c
index 2661a9529d70..562f84718906 100644
--- a/arch/cris/arch-v32/kernel/process.c
+++ b/arch/cris/arch-v32/kernel/process.c
@@ -218,8 +218,10 @@ sys_vfork(long r10, long r11, long r12, long r13, long mof, long srp,
 
 /* sys_execve() executes a new program. */
 asmlinkage int
-sys_execve(const char *fname, char **argv, char **envp, long r13, long mof, long srp,
-	struct pt_regs *regs)
+sys_execve(const char *fname,
+	   const char *const *argv,
+	   const char *const *envp, long r13, long mof, long srp,
+	   struct pt_regs *regs)
 {
 	int error;
 	char *filename;
diff --git a/arch/frv/kernel/process.c b/arch/frv/kernel/process.c
index 428931cf2f0c..2b63b0191f52 100644
--- a/arch/frv/kernel/process.c
+++ b/arch/frv/kernel/process.c
@@ -250,8 +250,9 @@ int copy_thread(unsigned long clone_flags,
 /*
  * sys_execve() executes a new program.
  */
-asmlinkage int sys_execve(const char __user *name, char __user * __user *argv,
-			  char __user * __user *envp)
+asmlinkage int sys_execve(const char __user *name,
+			  const char __user *const __user *argv,
+			  const char __user *const __user *envp)
 {
 	int error;
 	char * filename;
diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c
index 8b7b78d77d5c..97478138e361 100644
--- a/arch/h8300/kernel/process.c
+++ b/arch/h8300/kernel/process.c
@@ -212,7 +212,10 @@ int copy_thread(unsigned long clone_flags,
 /*
  * sys_execve() executes a new program.
  */
-asmlinkage int sys_execve(const char *name, char **argv, char **envp,int dummy,...)
+asmlinkage int sys_execve(const char *name,
+			  const char *const *argv,
+			  const char *const *envp,
+			  int dummy, ...)
 {
 	int error;
 	char * filename;
diff --git a/arch/h8300/kernel/sys_h8300.c b/arch/h8300/kernel/sys_h8300.c
index f9b3f44da69f..dc1ac0243b78 100644
--- a/arch/h8300/kernel/sys_h8300.c
+++ b/arch/h8300/kernel/sys_h8300.c
@@ -51,7 +51,9 @@ asmlinkage void syscall_print(void *dummy,...)
  * Do a system call from kernel instead of calling sys_execve so we
  * end up with proper pt_regs.
  */
-int kernel_execve(const char *filename, char *const argv[], char *const envp[])
+int kernel_execve(const char *filename,
+		  const char *const argv[],
+		  const char *const envp[])
 {
 	register long res __asm__("er0");
 	register char *const *_c __asm__("er3") = envp;
diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c
index a879c03b7f1c..16f1c7b04c69 100644
--- a/arch/ia64/kernel/process.c
+++ b/arch/ia64/kernel/process.c
@@ -633,7 +633,9 @@ dump_fpu (struct pt_regs *pt, elf_fpregset_t dst)
 }
 
 long
-sys_execve (const char __user *filename, char __user * __user *argv, char __user * __user *envp,
+sys_execve (const char __user *filename,
+	    const char __user *const __user *argv,
+	    const char __user *const __user *envp,
 	    struct pt_regs *regs)
 {
 	char *fname;
diff --git a/arch/m32r/kernel/process.c b/arch/m32r/kernel/process.c
index 8665a4d868ec..422bea9f1dbc 100644
--- a/arch/m32r/kernel/process.c
+++ b/arch/m32r/kernel/process.c
@@ -289,8 +289,8 @@ asmlinkage int sys_vfork(unsigned long r0, unsigned long r1, unsigned long r2,
  * sys_execve() executes a new program.
  */
 asmlinkage int sys_execve(const char __user *ufilename,
-			  char __user * __user *uargv,
-			  char __user * __user *uenvp,
+			  const char __user *const __user *uargv,
+			  const char __user *const __user *uenvp,
 			  unsigned long r3, unsigned long r4, unsigned long r5,
 			  unsigned long r6, struct pt_regs regs)
 {
diff --git a/arch/m32r/kernel/sys_m32r.c b/arch/m32r/kernel/sys_m32r.c
index 0a00f467edfa..d841fb6cc703 100644
--- a/arch/m32r/kernel/sys_m32r.c
+++ b/arch/m32r/kernel/sys_m32r.c
@@ -93,7 +93,9 @@ asmlinkage int sys_cachectl(char *addr, int nbytes, int op)
  * Do a system call from kernel instead of calling sys_execve so we
  * end up with proper pt_regs.
  */
-int kernel_execve(const char *filename, char *const argv[], char *const envp[])
+int kernel_execve(const char *filename,
+		  const char *const argv[],
+		  const char *const envp[])
 {
 	register long __scno __asm__ ("r7") = __NR_execve;
 	register long __arg3 __asm__ ("r2") = (long)(envp);
diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c
index 221d0b71ce39..18732ab23292 100644
--- a/arch/m68k/kernel/process.c
+++ b/arch/m68k/kernel/process.c
@@ -315,7 +315,9 @@ EXPORT_SYMBOL(dump_fpu);
 /*
  * sys_execve() executes a new program.
  */
-asmlinkage int sys_execve(const char __user *name, char __user * __user *argv, char __user * __user *envp)
+asmlinkage int sys_execve(const char __user *name,
+			  const char __user *const __user *argv,
+			  const char __user *const __user *envp)
 {
 	int error;
 	char * filename;
diff --git a/arch/m68k/kernel/sys_m68k.c b/arch/m68k/kernel/sys_m68k.c
index 77896692eb0a..2f431ece7b5f 100644
--- a/arch/m68k/kernel/sys_m68k.c
+++ b/arch/m68k/kernel/sys_m68k.c
@@ -459,7 +459,9 @@ asmlinkage int sys_getpagesize(void)
  * Do a system call from kernel instead of calling sys_execve so we
  * end up with proper pt_regs.
  */
-int kernel_execve(const char *filename, char *const argv[], char *const envp[])
+int kernel_execve(const char *filename,
+		  const char *const argv[],
+		  const char *const envp[])
 {
 	register long __res asm ("%d0") = __NR_execve;
 	register long __a asm ("%d1") = (long)(filename);
diff --git a/arch/m68knommu/kernel/process.c b/arch/m68knommu/kernel/process.c
index 6350f68cd026..4d090d3c0897 100644
--- a/arch/m68knommu/kernel/process.c
+++ b/arch/m68knommu/kernel/process.c
@@ -350,7 +350,9 @@ void dump(struct pt_regs *fp)
 /*
  * sys_execve() executes a new program.
  */
-asmlinkage int sys_execve(const char *name, char **argv, char **envp)
+asmlinkage int sys_execve(const char *name,
+			  const char *const *argv,
+			  const char *const *envp)
 {
 	int error;
 	char * filename;
diff --git a/arch/m68knommu/kernel/sys_m68k.c b/arch/m68knommu/kernel/sys_m68k.c
index d65e9c4c930c..68488ae47f0a 100644
--- a/arch/m68knommu/kernel/sys_m68k.c
+++ b/arch/m68knommu/kernel/sys_m68k.c
@@ -44,7 +44,9 @@ asmlinkage int sys_getpagesize(void)
  * Do a system call from kernel instead of calling sys_execve so we
  * end up with proper pt_regs.
  */
-int kernel_execve(const char *filename, char *const argv[], char *const envp[])
+int kernel_execve(const char *filename,
+		  const char *const argv[],
+		  const char *const envp[])
 {
 	register long __res asm ("%d0") = __NR_execve;
 	register long __a asm ("%d1") = (long)(filename);
diff --git a/arch/microblaze/kernel/sys_microblaze.c b/arch/microblaze/kernel/sys_microblaze.c
index 6abab6ebedbe..2250fe9d269b 100644
--- a/arch/microblaze/kernel/sys_microblaze.c
+++ b/arch/microblaze/kernel/sys_microblaze.c
@@ -47,8 +47,10 @@ asmlinkage long microblaze_clone(int flags, unsigned long stack, struct pt_regs
 	return do_fork(flags, stack, regs, 0, NULL, NULL);
 }
 
-asmlinkage long microblaze_execve(const char __user *filenamei, char __user *__user *argv,
-			char __user *__user *envp, struct pt_regs *regs)
+asmlinkage long microblaze_execve(const char __user *filenamei,
+				  const char __user *const __user *argv,
+				  const char __user *const __user *envp,
+				  struct pt_regs *regs)
 {
 	int error;
 	char *filename;
@@ -77,7 +79,9 @@ asmlinkage long sys_mmap(unsigned long addr, unsigned long len,
  * Do a system call from kernel instead of calling sys_execve so we
  * end up with proper pt_regs.
  */
-int kernel_execve(const char *filename, char *const argv[], char *const envp[])
+int kernel_execve(const char *filename,
+		  const char *const argv[],
+		  const char *const envp[])
 {
 	register const char *__a __asm__("r5") = filename;
 	register const void *__b __asm__("r6") = argv;
diff --git a/arch/mips/kernel/syscall.c b/arch/mips/kernel/syscall.c
index bddce0bca195..1dc6edff45e0 100644
--- a/arch/mips/kernel/syscall.c
+++ b/arch/mips/kernel/syscall.c
@@ -258,8 +258,10 @@ asmlinkage int sys_execve(nabi_no_regargs struct pt_regs regs)
 	error = PTR_ERR(filename);
 	if (IS_ERR(filename))
 		goto out;
-	error = do_execve(filename, (char __user *__user *) (long)regs.regs[5],
-	                  (char __user *__user *) (long)regs.regs[6], &regs);
+	error = do_execve(filename,
+			  (const char __user *const __user *) (long)regs.regs[5],
+	                  (const char __user *const __user *) (long)regs.regs[6],
+			  &regs);
 	putname(filename);
 
 out:
@@ -436,7 +438,9 @@ asmlinkage void bad_stack(void)
  * Do a system call from kernel instead of calling sys_execve so we
  * end up with proper pt_regs.
  */
-int kernel_execve(const char *filename, char *const argv[], char *const envp[])
+int kernel_execve(const char *filename,
+		  const char *const argv[],
+		  const char *const envp[])
 {
 	register unsigned long __a0 asm("$4") = (unsigned long) filename;
 	register unsigned long __a1 asm("$5") = (unsigned long) argv;
diff --git a/arch/mn10300/kernel/process.c b/arch/mn10300/kernel/process.c
index 762eb325b949..f48373e2bc1c 100644
--- a/arch/mn10300/kernel/process.c
+++ b/arch/mn10300/kernel/process.c
@@ -269,8 +269,8 @@ asmlinkage long sys_vfork(void)
 }
 
 asmlinkage long sys_execve(const char __user *name,
-			   char __user * __user *argv,
-			   char __user * __user *envp)
+			   const char __user *const __user *argv,
+			   const char __user *const __user *envp)
 {
 	char *filename;
 	int error;
diff --git a/arch/parisc/hpux/fs.c b/arch/parisc/hpux/fs.c
index 1444875a7611..0dc8543acb4f 100644
--- a/arch/parisc/hpux/fs.c
+++ b/arch/parisc/hpux/fs.c
@@ -41,8 +41,10 @@ int hpux_execve(struct pt_regs *regs)
 	if (IS_ERR(filename))
 		goto out;
 
-	error = do_execve(filename, (char __user * __user *) regs->gr[25],
-		(char __user * __user *) regs->gr[24], regs);
+	error = do_execve(filename,
+			  (const char __user *const __user *) regs->gr[25],
+			  (const char __user *const __user *) regs->gr[24],
+			  regs);
 
 	putname(filename);
 
diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c
index 76332dadc6e9..4b4b9181a1a0 100644
--- a/arch/parisc/kernel/process.c
+++ b/arch/parisc/kernel/process.c
@@ -348,17 +348,22 @@ asmlinkage int sys_execve(struct pt_regs *regs)
 	error = PTR_ERR(filename);
 	if (IS_ERR(filename))
 		goto out;
-	error = do_execve(filename, (char __user * __user *) regs->gr[25],
-		(char __user * __user *) regs->gr[24], regs);
+	error = do_execve(filename,
+			  (const char __user *const __user *) regs->gr[25],
+			  (const char __user *const __user *) regs->gr[24],
+			  regs);
 	putname(filename);
 out:
 
 	return error;
 }
 
-extern int __execve(const char *filename, char *const argv[],
-		char *const envp[], struct task_struct *task);
-int kernel_execve(const char *filename, char *const argv[], char *const envp[])
+extern int __execve(const char *filename,
+		    const char *const argv[],
+		    const char *const envp[], struct task_struct *task);
+int kernel_execve(const char *filename,
+		  const char *const argv[],
+		  const char *const envp[])
 {
 	return __execve(filename, argv, envp, current);
 }
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index feacfb789686..91356ffda2ca 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1034,8 +1034,9 @@ int sys_execve(unsigned long a0, unsigned long a1, unsigned long a2,
 	flush_fp_to_thread(current);
 	flush_altivec_to_thread(current);
 	flush_spe_to_thread(current);
-	error = do_execve(filename, (char __user * __user *) a1,
-			  (char __user * __user *) a2, regs);
+	error = do_execve(filename,
+			  (const char __user *const __user *) a1,
+			  (const char __user *const __user *) a2, regs);
 	putname(filename);
 out:
 	return error;
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index 7eafaf2662b9..d3a2d1c6438e 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -267,8 +267,9 @@ asmlinkage void execve_tail(void)
 /*
  * sys_execve() executes a new program.
  */
-SYSCALL_DEFINE3(execve, const char __user *, name, char __user * __user *, argv,
-		char __user * __user *, envp)
+SYSCALL_DEFINE3(execve, const char __user *, name,
+		const char __user *const __user *, argv,
+		const char __user *const __user *, envp)
 {
 	struct pt_regs *regs = task_pt_regs(current);
 	char *filename;
diff --git a/arch/score/kernel/sys_score.c b/arch/score/kernel/sys_score.c
index 651096ff8db4..e478bf9a7e91 100644
--- a/arch/score/kernel/sys_score.c
+++ b/arch/score/kernel/sys_score.c
@@ -99,8 +99,10 @@ score_execve(struct pt_regs *regs)
 	if (IS_ERR(filename))
 		return error;
 
-	error = do_execve(filename, (char __user *__user*)regs->regs[5],
-			  (char __user *__user *) regs->regs[6], regs);
+	error = do_execve(filename,
+			  (const char __user *const __user *)regs->regs[5],
+			  (const char __user *const __user *)regs->regs[6],
+			  regs);
 
 	putname(filename);
 	return error;
@@ -110,7 +112,9 @@ score_execve(struct pt_regs *regs)
  * Do a system call from kernel instead of calling sys_execve so we
  * end up with proper pt_regs.
  */
-int kernel_execve(const char *filename, char *const argv[], char *const envp[])
+int kernel_execve(const char *filename,
+		  const char *const argv[],
+		  const char *const envp[])
 {
 	register unsigned long __r4 asm("r4") = (unsigned long) filename;
 	register unsigned long __r5 asm("r5") = (unsigned long) argv;
diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c
index 052981972ae6..762a13984bbd 100644
--- a/arch/sh/kernel/process_32.c
+++ b/arch/sh/kernel/process_32.c
@@ -296,9 +296,10 @@ asmlinkage int sys_vfork(unsigned long r4, unsigned long r5,
 /*
  * sys_execve() executes a new program.
  */
-asmlinkage int sys_execve(char __user *ufilename, char __user * __user *uargv,
-			  char __user * __user *uenvp, unsigned long r7,
-			  struct pt_regs __regs)
+asmlinkage int sys_execve(const char __user *ufilename,
+			  const char __user *const __user *uargv,
+			  const char __user *const __user *uenvp,
+			  unsigned long r7, struct pt_regs __regs)
 {
 	struct pt_regs *regs = RELOC_HIDE(&__regs, 0);
 	int error;
diff --git a/arch/sh/kernel/process_64.c b/arch/sh/kernel/process_64.c
index 68d128d651b3..210c1cabcb7f 100644
--- a/arch/sh/kernel/process_64.c
+++ b/arch/sh/kernel/process_64.c
@@ -497,8 +497,8 @@ asmlinkage int sys_execve(const char *ufilename, char **uargv,
 		goto out;
 
 	error = do_execve(filename,
-			  (char __user * __user *)uargv,
-			  (char __user * __user *)uenvp,
+			  (const char __user *const __user *)uargv,
+			  (const char __user *const __user *)uenvp,
 			  pregs);
 	putname(filename);
 out:
diff --git a/arch/sh/kernel/sys_sh32.c b/arch/sh/kernel/sys_sh32.c
index eb68bfdd86e6..f56b6fe5c5d0 100644
--- a/arch/sh/kernel/sys_sh32.c
+++ b/arch/sh/kernel/sys_sh32.c
@@ -71,7 +71,9 @@ asmlinkage int sys_fadvise64_64_wrapper(int fd, u32 offset0, u32 offset1,
  * Do a system call from kernel instead of calling sys_execve so we
  * end up with proper pt_regs.
  */
-int kernel_execve(const char *filename, char *const argv[], char *const envp[])
+int kernel_execve(const char *filename,
+		  const char *const argv[],
+		  const char *const envp[])
 {
 	register long __sc0 __asm__ ("r3") = __NR_execve;
 	register long __sc4 __asm__ ("r4") = (long) filename;
diff --git a/arch/sh/kernel/sys_sh64.c b/arch/sh/kernel/sys_sh64.c
index 287235768bc5..c5a38c4bf410 100644
--- a/arch/sh/kernel/sys_sh64.c
+++ b/arch/sh/kernel/sys_sh64.c
@@ -33,7 +33,9 @@
  * Do a system call from kernel instead of calling sys_execve so we
  * end up with proper pt_regs.
  */
-int kernel_execve(const char *filename, char *const argv[], char *const envp[])
+int kernel_execve(const char *filename,
+		  const char *const argv[],
+		  const char *const envp[])
 {
 	register unsigned long __sc0 __asm__ ("r9") = ((0x13 << 16) | __NR_execve);
 	register unsigned long __sc2 __asm__ ("r2") = (unsigned long) filename;
diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c
index 40e29fc8a4d6..17529298c50a 100644
--- a/arch/sparc/kernel/process_32.c
+++ b/arch/sparc/kernel/process_32.c
@@ -633,8 +633,10 @@ asmlinkage int sparc_execve(struct pt_regs *regs)
 	if(IS_ERR(filename))
 		goto out;
 	error = do_execve(filename,
-			  (char __user * __user *)regs->u_regs[base + UREG_I1],
-			  (char __user * __user *)regs->u_regs[base + UREG_I2],
+			  (const char __user *const  __user *)
+			  regs->u_regs[base + UREG_I1],
+			  (const char __user *const  __user *)
+			  regs->u_regs[base + UREG_I2],
 			  regs);
 	putname(filename);
 out:
diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c
index dbe81a368b45..485f54748384 100644
--- a/arch/sparc/kernel/process_64.c
+++ b/arch/sparc/kernel/process_64.c
@@ -739,9 +739,9 @@ asmlinkage int sparc_execve(struct pt_regs *regs)
 	if (IS_ERR(filename))
 		goto out;
 	error = do_execve(filename,
-			  (char __user * __user *)
+			  (const char __user *const __user *)
 			  regs->u_regs[base + UREG_I1],
-			  (char __user * __user *)
+			  (const char __user *const __user *)
 			  regs->u_regs[base + UREG_I2], regs);
 	putname(filename);
 	if (!error) {
diff --git a/arch/sparc/kernel/sys_sparc_32.c b/arch/sparc/kernel/sys_sparc_32.c
index ee995b7dae7e..50794137d710 100644
--- a/arch/sparc/kernel/sys_sparc_32.c
+++ b/arch/sparc/kernel/sys_sparc_32.c
@@ -282,7 +282,9 @@ out:
  * Do a system call from kernel instead of calling sys_execve so we
  * end up with proper pt_regs.
  */
-int kernel_execve(const char *filename, char *const argv[], char *const envp[])
+int kernel_execve(const char *filename,
+		  const char *const argv[],
+		  const char *const envp[])
 {
 	long __res;
 	register long __g1 __asm__ ("g1") = __NR_execve;
diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c
index 3d435c42e6db..f836f4e93afe 100644
--- a/arch/sparc/kernel/sys_sparc_64.c
+++ b/arch/sparc/kernel/sys_sparc_64.c
@@ -758,7 +758,9 @@ SYSCALL_DEFINE5(rt_sigaction, int, sig, const struct sigaction __user *, act,
  * Do a system call from kernel instead of calling sys_execve so we
  * end up with proper pt_regs.
  */
-int kernel_execve(const char *filename, char *const argv[], char *const envp[])
+int kernel_execve(const char *filename,
+		  const char *const argv[],
+		  const char *const envp[])
 {
 	long __res;
 	register long __g1 __asm__ ("g1") = __NR_execve;
diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c
index ed590ad0acdc..985cc28c74c5 100644
--- a/arch/tile/kernel/process.c
+++ b/arch/tile/kernel/process.c
@@ -543,8 +543,9 @@ long _sys_vfork(struct pt_regs *regs)
 /*
  * sys_execve() executes a new program.
  */
-long _sys_execve(char __user *path, char __user *__user *argv,
-		 char __user *__user *envp, struct pt_regs *regs)
+long _sys_execve(const char __user *path,
+		 const char __user *const __user *argv,
+		 const char __user *const __user *envp, struct pt_regs *regs)
 {
 	long error;
 	char *filename;
diff --git a/arch/um/kernel/exec.c b/arch/um/kernel/exec.c
index 59b20d93b6d4..cd145eda3579 100644
--- a/arch/um/kernel/exec.c
+++ b/arch/um/kernel/exec.c
@@ -44,8 +44,9 @@ void start_thread(struct pt_regs *regs, unsigned long eip, unsigned long esp)
 	PT_REGS_SP(regs) = esp;
 }
 
-static long execve1(const char *file, char __user * __user *argv,
-		    char __user *__user *env)
+static long execve1(const char *file,
+		    const char __user *const __user *argv,
+		    const char __user *const __user *env)
 {
 	long error;
 
diff --git a/arch/um/kernel/syscall.c b/arch/um/kernel/syscall.c
index 7427c0b1930c..5ddb246626db 100644
--- a/arch/um/kernel/syscall.c
+++ b/arch/um/kernel/syscall.c
@@ -51,7 +51,9 @@ long old_mmap(unsigned long addr, unsigned long len,
 	return err;
 }
 
-int kernel_execve(const char *filename, char *const argv[], char *const envp[])
+int kernel_execve(const char *filename,
+		  const char *const argv[],
+		  const char *const envp[])
 {
 	mm_segment_t fs;
 	int ret;
diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
index feb2ff9bfc2d..f1d8b441fc77 100644
--- a/arch/x86/include/asm/syscalls.h
+++ b/arch/x86/include/asm/syscalls.h
@@ -23,8 +23,9 @@ long sys_iopl(unsigned int, struct pt_regs *);
 /* kernel/process.c */
 int sys_fork(struct pt_regs *);
 int sys_vfork(struct pt_regs *);
-long sys_execve(const char __user *, char __user * __user *,
-		char __user * __user *, struct pt_regs *);
+long sys_execve(const char __user *,
+		const char __user *const __user *,
+		const char __user *const __user *, struct pt_regs *);
 long sys_clone(unsigned long, unsigned long, void __user *,
 	       void __user *, struct pt_regs *);
 
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 64ecaf0af9af..57d1868a86aa 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -301,8 +301,9 @@ EXPORT_SYMBOL(kernel_thread);
 /*
  * sys_execve() executes a new program.
  */
-long sys_execve(const char __user *name, char __user * __user *argv,
-		char __user * __user *envp, struct pt_regs *regs)
+long sys_execve(const char __user *name,
+		const char __user *const __user *argv,
+		const char __user *const __user *envp, struct pt_regs *regs)
 {
 	long error;
 	char *filename;
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
index 196552bb412c..d5e06624e34a 100644
--- a/arch/x86/kernel/sys_i386_32.c
+++ b/arch/x86/kernel/sys_i386_32.c
@@ -28,7 +28,9 @@
  * Do a system call from kernel instead of calling sys_execve so we
  * end up with proper pt_regs.
  */
-int kernel_execve(const char *filename, char *const argv[], char *const envp[])
+int kernel_execve(const char *filename,
+		  const char *const argv[],
+		  const char *const envp[])
 {
 	long __res;
 	asm volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx"
diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c
index 7c2f38f68ebb..e3558b9a58ba 100644
--- a/arch/xtensa/kernel/process.c
+++ b/arch/xtensa/kernel/process.c
@@ -318,8 +318,9 @@ long xtensa_clone(unsigned long clone_flags, unsigned long newsp,
  */
 
 asmlinkage
-long xtensa_execve(const char __user *name, char __user * __user *argv,
-                   char __user * __user *envp,
+long xtensa_execve(const char __user *name,
+		   const char __user *const __user *argv,
+                   const char __user *const __user *envp,
                    long a3, long a4, long a5,
                    struct pt_regs *regs)
 {
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 9e60fd201716..a7528b913936 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -108,7 +108,7 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 	Node *fmt;
 	struct file * interp_file = NULL;
 	char iname[BINPRM_BUF_SIZE];
-	char *iname_addr = iname;
+	const char *iname_addr = iname;
 	int retval;
 	int fd_binary = -1;
 
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index aca9d55afb22..396a9884591f 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -16,7 +16,8 @@
 
 static int load_script(struct linux_binprm *bprm,struct pt_regs *regs)
 {
-	char *cp, *i_name, *i_arg;
+	const char *i_arg, *i_name;
+	char *cp;
 	struct file *file;
 	char interp[BINPRM_BUF_SIZE];
 	int retval;
diff --git a/fs/exec.c b/fs/exec.c
index 7761837e4500..05c7d6b84df7 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -361,13 +361,13 @@ err:
 /*
  * count() counts the number of strings in array ARGV.
  */
-static int count(char __user * __user * argv, int max)
+static int count(const char __user * const __user * argv, int max)
 {
 	int i = 0;
 
 	if (argv != NULL) {
 		for (;;) {
-			char __user * p;
+			const char __user * p;
 
 			if (get_user(p, argv))
 				return -EFAULT;
@@ -387,7 +387,7 @@ static int count(char __user * __user * argv, int max)
  * processes's memory to the new process's stack.  The call to get_user_pages()
  * ensures the destination page is created and not swapped out.
  */
-static int copy_strings(int argc, char __user * __user * argv,
+static int copy_strings(int argc, const char __user *const __user *argv,
 			struct linux_binprm *bprm)
 {
 	struct page *kmapped_page = NULL;
@@ -396,7 +396,7 @@ static int copy_strings(int argc, char __user * __user * argv,
 	int ret;
 
 	while (argc-- > 0) {
-		char __user *str;
+		const char __user *str;
 		int len;
 		unsigned long pos;
 
@@ -470,12 +470,13 @@ out:
 /*
  * Like copy_strings, but get argv and its values from kernel memory.
  */
-int copy_strings_kernel(int argc,char ** argv, struct linux_binprm *bprm)
+int copy_strings_kernel(int argc, const char *const *argv,
+			struct linux_binprm *bprm)
 {
 	int r;
 	mm_segment_t oldfs = get_fs();
 	set_fs(KERNEL_DS);
-	r = copy_strings(argc, (char __user * __user *)argv, bprm);
+	r = copy_strings(argc, (const char __user *const  __user *)argv, bprm);
 	set_fs(oldfs);
 	return r;
 }
@@ -997,7 +998,7 @@ EXPORT_SYMBOL(flush_old_exec);
 void setup_new_exec(struct linux_binprm * bprm)
 {
 	int i, ch;
-	char * name;
+	const char *name;
 	char tcomm[sizeof(current->comm)];
 
 	arch_pick_mmap_layout(current->mm);
@@ -1316,9 +1317,9 @@ EXPORT_SYMBOL(search_binary_handler);
 /*
  * sys_execve() executes a new program.
  */
-int do_execve(char * filename,
-	char __user *__user *argv,
-	char __user *__user *envp,
+int do_execve(const char * filename,
+	const char __user *const __user *argv,
+	const char __user *const __user *envp,
 	struct pt_regs * regs)
 {
 	struct linux_binprm *bprm;
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index c809e286d213..a065612fc928 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -50,8 +50,8 @@ struct linux_binprm{
 	int unsafe;		/* how unsafe this exec is (mask of LSM_UNSAFE_*) */
 	unsigned int per_clear;	/* bits to clear in current->personality */
 	int argc, envc;
-	char * filename;	/* Name of binary as seen by procps */
-	char * interp;		/* Name of the binary really executed. Most
+	const char * filename;	/* Name of binary as seen by procps */
+	const char * interp;	/* Name of the binary really executed. Most
 				   of the time same as filename, but could be
 				   different for binfmt_{misc,script} */
 	unsigned interp_flags;
@@ -126,7 +126,8 @@ extern int setup_arg_pages(struct linux_binprm * bprm,
 			   unsigned long stack_top,
 			   int executable_stack);
 extern int bprm_mm_init(struct linux_binprm *bprm);
-extern int copy_strings_kernel(int argc,char ** argv,struct linux_binprm *bprm);
+extern int copy_strings_kernel(int argc, const char *const *argv,
+			       struct linux_binprm *bprm);
 extern int prepare_bprm_creds(struct linux_binprm *bprm);
 extern void install_exec_creds(struct linux_binprm *bprm);
 extern void do_coredump(long signr, int exit_code, struct pt_regs *regs);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ce160d68f5e7..1e2a6db2d7dd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2109,7 +2109,9 @@ extern void daemonize(const char *, ...);
 extern int allow_signal(int);
 extern int disallow_signal(int);
 
-extern int do_execve(char *, char __user * __user *, char __user * __user *, struct pt_regs *);
+extern int do_execve(const char *,
+		     const char __user * const __user *,
+		     const char __user * const __user *, struct pt_regs *);
 extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *);
 struct task_struct *fork_idle(int);
 
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 6e5d19788634..e6319d18a55d 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -820,7 +820,7 @@ asmlinkage long sys_fanotify_mark(int fanotify_fd, unsigned int flags,
 				  u64 mask, int fd,
 				  const char  __user *pathname);
 
-int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
+int kernel_execve(const char *filename, const char *const argv[], const char *const envp[]);
 
 
 asmlinkage long sys_perf_event_open(
diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c
index 2b108538d0d9..3098a38f3ae1 100644
--- a/init/do_mounts_initrd.c
+++ b/init/do_mounts_initrd.c
@@ -24,10 +24,11 @@ static int __init no_initrd(char *str)
 
 __setup("noinitrd", no_initrd);
 
-static int __init do_linuxrc(void * shell)
+static int __init do_linuxrc(void *_shell)
 {
-	static char *argv[] = { "linuxrc", NULL, };
-	extern char * envp_init[];
+	static const char *argv[] = { "linuxrc", NULL, };
+	extern const char *envp_init[];
+	const char *shell = _shell;
 
 	sys_close(old_fd);sys_close(root_fd);
 	sys_setsid();
diff --git a/init/main.c b/init/main.c
index 22d61cb06f98..94ab488039aa 100644
--- a/init/main.c
+++ b/init/main.c
@@ -197,8 +197,8 @@ static int __init set_reset_devices(char *str)
 
 __setup("reset_devices", set_reset_devices);
 
-static char * argv_init[MAX_INIT_ARGS+2] = { "init", NULL, };
-char * envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, };
+static const char * argv_init[MAX_INIT_ARGS+2] = { "init", NULL, };
+const char * envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, };
 static const char *panic_later, *panic_param;
 
 extern const struct obs_kernel_param __setup_start[], __setup_end[];
@@ -809,7 +809,7 @@ static void __init do_pre_smp_initcalls(void)
 		do_one_initcall(*fn);
 }
 
-static void run_init_process(char *init_filename)
+static void run_init_process(const char *init_filename)
 {
 	argv_init[0] = init_filename;
 	kernel_execve(init_filename, argv_init, envp_init);
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 6e9b19667a8d..9cd0591c96a2 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -153,7 +153,9 @@ static int ____call_usermodehelper(void *data)
 			goto fail;
 	}
 
-	retval = kernel_execve(sub_info->path, sub_info->argv, sub_info->envp);
+	retval = kernel_execve(sub_info->path,
+			       (const char *const *)sub_info->argv,
+			       (const char *const *)sub_info->envp);
 
 	/* Exec failed? */
 fail:
diff --git a/security/commoncap.c b/security/commoncap.c
index 4e015996dd4d..9d172e6e330c 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -40,7 +40,7 @@
  *
  * Warn if that happens, once per boot.
  */
-static void warn_setuid_and_fcaps_mixed(char *fname)
+static void warn_setuid_and_fcaps_mixed(const char *fname)
 {
 	static int warned;
 	if (!warned) {
-- 
cgit v1.2.3


From f4ae2faa40199b97b12f508234640bc565d166f8 Mon Sep 17 00:00:00 2001
From: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Date: Wed, 11 Aug 2010 14:07:01 +0300
Subject: fix reiserfs_evict_inode end_writeback second call

reiserfs_evict_inode calls end_writeback two times hitting
kernel BUG at fs/inode.c:298 becase inode->i_state is I_CLEAR already.

Signed-off-by: Sergey Senozhatsky <sergey.senozhatsky@gmail.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/reiserfs/inode.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index ae35413dcbe1..caa758377d66 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -83,6 +83,7 @@ void reiserfs_evict_inode(struct inode *inode)
 	dquot_drop(inode);
 	inode->i_blocks = 0;
 	reiserfs_write_unlock_once(inode->i_sb, depth);
+	return;
 
 no_delete:
 	end_writeback(inode);
-- 
cgit v1.2.3


From b845ff8f3ea2988ad5041315e2d35298e85cbc2f Mon Sep 17 00:00:00 2001
From: Alexander Shishkin <virtuoso@slind.org>
Date: Tue, 17 Aug 2010 17:08:35 +0300
Subject: cramfs: only unlock new inodes

Commit 77b8a75f5bb introduced a warning at fs/inode.c:692 unlock_new_inode(),
caused by unlock_new_inode() being called on existing inodes as well.

This patch changes setup_inode() to only call unlock_new_inode() for I_NEW
inodes.

Signed-off-by: Alexander Shishkin <virtuoso@slind.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/cramfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index a53b130b366c..1e7a33028d33 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -80,7 +80,7 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
 		}
 	} else {
 		inode = iget_locked(sb, CRAMINO(cramfs_inode));
-		if (inode) {
+		if (inode && (inode->i_state & I_NEW)) {
 			setup_inode(inode, cramfs_inode);
 			unlock_new_inode(inode);
 		}
-- 
cgit v1.2.3


From dad5eb6daa7eeb63d4fc9d982892c59faa07e797 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 17 Aug 2010 12:42:13 +0200
Subject: vfs: update ctime when changing the file's permission by setfacl

generic_acl_set didn't update the ctime of the file when its permission was
changed.

Steps to reproduce:
 # touch aaa
 # stat -c %Z aaa
 1275289822
 # setfacl -m  'u::x,g::x,o::x' aaa
 # stat -c %Z aaa
 1275289822                         <- unchanged

But, according to the spec of the ctime, vfs must update it.

Port of ext3 patch by Miao Xie <miaox@cn.fujitsu.com>.

CC: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/generic_acl.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/generic_acl.c b/fs/generic_acl.c
index 99800e564157..6bc9e3a5a693 100644
--- a/fs/generic_acl.c
+++ b/fs/generic_acl.c
@@ -94,6 +94,7 @@ generic_acl_set(struct dentry *dentry, const char *name, const void *value,
 			if (error < 0)
 				goto failed;
 			inode->i_mode = mode;
+			inode->i_ctime = CURRENT_TIME;
 			if (error == 0) {
 				posix_acl_release(acl);
 				acl = NULL;
-- 
cgit v1.2.3


From 87e99511ea54510ffb60b98001d108794d5037f8 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 11 Aug 2010 17:05:45 +0200
Subject: kill BH_Ordered flag

Instead of abusing a buffer_head flag just add a variant of
sync_dirty_buffer which allows passing the exact type of write
flag required.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/buffer.c                 | 17 ++++++++--------
 fs/jbd/commit.c             | 49 +++++++++++++++++++++++----------------------
 fs/jbd2/commit.c            | 39 ++++++++++++++----------------------
 fs/nilfs2/super.c           | 28 +++++++++++++-------------
 include/linux/buffer_head.h |  3 +--
 5 files changed, 63 insertions(+), 73 deletions(-)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index 50efa339e051..6c8ad977f3d4 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2911,13 +2911,6 @@ int submit_bh(int rw, struct buffer_head * bh)
 	BUG_ON(buffer_delay(bh));
 	BUG_ON(buffer_unwritten(bh));
 
-	/*
-	 * Mask in barrier bit for a write (could be either a WRITE or a
-	 * WRITE_SYNC
-	 */
-	if (buffer_ordered(bh) && (rw & WRITE))
-		rw |= WRITE_BARRIER;
-
 	/*
 	 * Only clear out a write error when rewriting
 	 */
@@ -3021,7 +3014,7 @@ EXPORT_SYMBOL(ll_rw_block);
  * and then start new I/O and then wait upon it.  The caller must have a ref on
  * the buffer_head.
  */
-int sync_dirty_buffer(struct buffer_head *bh)
+int __sync_dirty_buffer(struct buffer_head *bh, int rw)
 {
 	int ret = 0;
 
@@ -3030,7 +3023,7 @@ int sync_dirty_buffer(struct buffer_head *bh)
 	if (test_clear_buffer_dirty(bh)) {
 		get_bh(bh);
 		bh->b_end_io = end_buffer_write_sync;
-		ret = submit_bh(WRITE_SYNC, bh);
+		ret = submit_bh(rw, bh);
 		wait_on_buffer(bh);
 		if (buffer_eopnotsupp(bh)) {
 			clear_buffer_eopnotsupp(bh);
@@ -3043,6 +3036,12 @@ int sync_dirty_buffer(struct buffer_head *bh)
 	}
 	return ret;
 }
+EXPORT_SYMBOL(__sync_dirty_buffer);
+
+int sync_dirty_buffer(struct buffer_head *bh)
+{
+	return __sync_dirty_buffer(bh, WRITE_SYNC);
+}
 EXPORT_SYMBOL(sync_dirty_buffer);
 
 /*
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index 28a9ddaa0c49..95d8c11c929e 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -119,7 +119,6 @@ static int journal_write_commit_record(journal_t *journal,
 	struct buffer_head *bh;
 	journal_header_t *header;
 	int ret;
-	int barrier_done = 0;
 
 	if (is_journal_aborted(journal))
 		return 0;
@@ -137,34 +136,36 @@ static int journal_write_commit_record(journal_t *journal,
 
 	JBUFFER_TRACE(descriptor, "write commit block");
 	set_buffer_dirty(bh);
+
 	if (journal->j_flags & JFS_BARRIER) {
-		set_buffer_ordered(bh);
-		barrier_done = 1;
-	}
-	ret = sync_dirty_buffer(bh);
-	if (barrier_done)
-		clear_buffer_ordered(bh);
-	/* is it possible for another commit to fail at roughly
-	 * the same time as this one?  If so, we don't want to
-	 * trust the barrier flag in the super, but instead want
-	 * to remember if we sent a barrier request
-	 */
-	if (ret == -EOPNOTSUPP && barrier_done) {
-		char b[BDEVNAME_SIZE];
+		ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_BARRIER);
 
-		printk(KERN_WARNING
-			"JBD: barrier-based sync failed on %s - "
-			"disabling barriers\n",
-			bdevname(journal->j_dev, b));
-		spin_lock(&journal->j_state_lock);
-		journal->j_flags &= ~JFS_BARRIER;
-		spin_unlock(&journal->j_state_lock);
+		/*
+		 * Is it possible for another commit to fail at roughly
+		 * the same time as this one?  If so, we don't want to
+		 * trust the barrier flag in the super, but instead want
+		 * to remember if we sent a barrier request
+		 */
+		if (ret == -EOPNOTSUPP) {
+			char b[BDEVNAME_SIZE];
 
-		/* And try again, without the barrier */
-		set_buffer_uptodate(bh);
-		set_buffer_dirty(bh);
+			printk(KERN_WARNING
+				"JBD: barrier-based sync failed on %s - "
+				"disabling barriers\n",
+				bdevname(journal->j_dev, b));
+			spin_lock(&journal->j_state_lock);
+			journal->j_flags &= ~JFS_BARRIER;
+			spin_unlock(&journal->j_state_lock);
+
+			/* And try again, without the barrier */
+			set_buffer_uptodate(bh);
+			set_buffer_dirty(bh);
+			ret = sync_dirty_buffer(bh);
+		}
+	} else {
 		ret = sync_dirty_buffer(bh);
 	}
+
 	put_bh(bh);		/* One for getblk() */
 	journal_put_journal_head(descriptor);
 
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index f52e5e8049f1..7c068c189d80 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -101,7 +101,6 @@ static int journal_submit_commit_record(journal_t *journal,
 	struct commit_header *tmp;
 	struct buffer_head *bh;
 	int ret;
-	int barrier_done = 0;
 	struct timespec now = current_kernel_time();
 
 	if (is_journal_aborted(journal))
@@ -136,30 +135,22 @@ static int journal_submit_commit_record(journal_t *journal,
 	if (journal->j_flags & JBD2_BARRIER &&
 	    !JBD2_HAS_INCOMPAT_FEATURE(journal,
 				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
-		set_buffer_ordered(bh);
-		barrier_done = 1;
-	}
-	ret = submit_bh(WRITE_SYNC_PLUG, bh);
-	if (barrier_done)
-		clear_buffer_ordered(bh);
-
-	/* is it possible for another commit to fail at roughly
-	 * the same time as this one?  If so, we don't want to
-	 * trust the barrier flag in the super, but instead want
-	 * to remember if we sent a barrier request
-	 */
-	if (ret == -EOPNOTSUPP && barrier_done) {
-		printk(KERN_WARNING
-		       "JBD2: Disabling barriers on %s, "
-		       "not supported by device\n", journal->j_devname);
-		write_lock(&journal->j_state_lock);
-		journal->j_flags &= ~JBD2_BARRIER;
-		write_unlock(&journal->j_state_lock);
+		ret = submit_bh(WRITE_SYNC_PLUG | WRITE_BARRIER, bh);
+		if (ret == -EOPNOTSUPP) {
+			printk(KERN_WARNING
+			       "JBD2: Disabling barriers on %s, "
+			       "not supported by device\n", journal->j_devname);
+			write_lock(&journal->j_state_lock);
+			journal->j_flags &= ~JBD2_BARRIER;
+			write_unlock(&journal->j_state_lock);
 
-		/* And try again, without the barrier */
-		lock_buffer(bh);
-		set_buffer_uptodate(bh);
-		clear_buffer_dirty(bh);
+			/* And try again, without the barrier */
+			lock_buffer(bh);
+			set_buffer_uptodate(bh);
+			clear_buffer_dirty(bh);
+			ret = submit_bh(WRITE_SYNC_PLUG, bh);
+		}
+	} else {
 		ret = submit_bh(WRITE_SYNC_PLUG, bh);
 	}
 	*cbh = bh;
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 1fa86b9df73b..68345430fb48 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -175,24 +175,24 @@ static int nilfs_sync_super(struct nilfs_sb_info *sbi, int flag)
 {
 	struct the_nilfs *nilfs = sbi->s_nilfs;
 	int err;
-	int barrier_done = 0;
 
-	if (nilfs_test_opt(sbi, BARRIER)) {
-		set_buffer_ordered(nilfs->ns_sbh[0]);
-		barrier_done = 1;
-	}
  retry:
 	set_buffer_dirty(nilfs->ns_sbh[0]);
-	err = sync_dirty_buffer(nilfs->ns_sbh[0]);
-	if (err == -EOPNOTSUPP && barrier_done) {
-		nilfs_warning(sbi->s_super, __func__,
-			      "barrier-based sync failed. "
-			      "disabling barriers\n");
-		nilfs_clear_opt(sbi, BARRIER);
-		barrier_done = 0;
-		clear_buffer_ordered(nilfs->ns_sbh[0]);
-		goto retry;
+
+	if (nilfs_test_opt(sbi, BARRIER)) {
+		err = __sync_dirty_buffer(nilfs->ns_sbh[0],
+					  WRITE_SYNC | WRITE_BARRIER);
+		if (err == -EOPNOTSUPP) {
+			nilfs_warning(sbi->s_super, __func__,
+				      "barrier-based sync failed. "
+				      "disabling barriers\n");
+			nilfs_clear_opt(sbi, BARRIER);
+			goto retry;
+		}
+	} else {
+		err = sync_dirty_buffer(nilfs->ns_sbh[0]);
 	}
+
 	if (unlikely(err)) {
 		printk(KERN_ERR
 		       "NILFS: unable to write superblock (err=%d)\n", err);
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 43e649a72529..72c1cf83eb85 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -32,7 +32,6 @@ enum bh_state_bits {
 	BH_Delay,	/* Buffer is not yet allocated on disk */
 	BH_Boundary,	/* Block is followed by a discontiguity */
 	BH_Write_EIO,	/* I/O error on write */
-	BH_Ordered,	/* ordered write */
 	BH_Eopnotsupp,	/* operation not supported (barrier) */
 	BH_Unwritten,	/* Buffer is allocated on disk but not written */
 	BH_Quiet,	/* Buffer Error Prinks to be quiet */
@@ -125,7 +124,6 @@ BUFFER_FNS(Async_Write, async_write)
 BUFFER_FNS(Delay, delay)
 BUFFER_FNS(Boundary, boundary)
 BUFFER_FNS(Write_EIO, write_io_error)
-BUFFER_FNS(Ordered, ordered)
 BUFFER_FNS(Eopnotsupp, eopnotsupp)
 BUFFER_FNS(Unwritten, unwritten)
 
@@ -183,6 +181,7 @@ void unlock_buffer(struct buffer_head *bh);
 void __lock_buffer(struct buffer_head *bh);
 void ll_rw_block(int, int, struct buffer_head * bh[]);
 int sync_dirty_buffer(struct buffer_head *bh);
+int __sync_dirty_buffer(struct buffer_head *bh, int rw);
 int submit_bh(int, struct buffer_head *);
 void write_boundary_block(struct block_device *bdev,
 			sector_t bblock, unsigned blocksize);
-- 
cgit v1.2.3


From 9cb569d601e0b93e01c20a22872270ec663b75f6 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 11 Aug 2010 17:06:24 +0200
Subject: remove SWRITE* I/O types

These flags aren't real I/O types, but tell ll_rw_block to always
lock the buffer instead of giving up on a failed trylock.

Instead add a new write_dirty_buffer helper that implements this semantic
and use it from the existing SWRITE* callers.  Note that the ll_rw_block
code had a bug where it didn't promote WRITE_SYNC_PLUG properly, which
this patch fixes.

In the ufs code clean up the helper that used to call ll_rw_block
to mirror sync_dirty_buffer, which is the function it implements for
compound buffers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/buffer.c                 | 52 +++++++++++++++++++++++++--------------------
 fs/fat/misc.c               |  4 +++-
 fs/jbd/checkpoint.c         |  4 +++-
 fs/jbd/journal.c            |  2 +-
 fs/jbd/revoke.c             |  2 +-
 fs/jbd2/checkpoint.c        |  4 +++-
 fs/jbd2/journal.c           |  2 +-
 fs/jbd2/revoke.c            |  2 +-
 fs/reiserfs/journal.c       |  2 +-
 fs/ufs/balloc.c             | 24 +++++++--------------
 fs/ufs/ialloc.c             | 18 ++++++----------
 fs/ufs/truncate.c           | 18 ++++++----------
 fs/ufs/util.c               | 20 +++++++----------
 fs/ufs/util.h               |  3 +--
 include/linux/buffer_head.h |  1 +
 include/linux/fs.h          |  9 --------
 16 files changed, 73 insertions(+), 94 deletions(-)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index 6c8ad977f3d4..3e7dca279d1c 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -770,11 +770,12 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 				spin_unlock(lock);
 				/*
 				 * Ensure any pending I/O completes so that
-				 * ll_rw_block() actually writes the current
-				 * contents - it is a noop if I/O is still in
-				 * flight on potentially older contents.
+				 * write_dirty_buffer() actually writes the
+				 * current contents - it is a noop if I/O is
+				 * still in flight on potentially older
+				 * contents.
 				 */
-				ll_rw_block(SWRITE_SYNC_PLUG, 1, &bh);
+				write_dirty_buffer(bh, WRITE_SYNC_PLUG);
 
 				/*
 				 * Kick off IO for the previous mapping. Note
@@ -2949,22 +2950,21 @@ EXPORT_SYMBOL(submit_bh);
 
 /**
  * ll_rw_block: low-level access to block devices (DEPRECATED)
- * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
+ * @rw: whether to %READ or %WRITE or maybe %READA (readahead)
  * @nr: number of &struct buffer_heads in the array
  * @bhs: array of pointers to &struct buffer_head
  *
  * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
  * requests an I/O operation on them, either a %READ or a %WRITE.  The third
- * %SWRITE is like %WRITE only we make sure that the *current* data in buffers
- * are sent to disk. The fourth %READA option is described in the documentation
- * for generic_make_request() which ll_rw_block() calls.
+ * %READA option is described in the documentation for generic_make_request()
+ * which ll_rw_block() calls.
  *
  * This function drops any buffer that it cannot get a lock on (with the
- * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
- * clean when doing a write request, and any buffer that appears to be
- * up-to-date when doing read request.  Further it marks as clean buffers that
- * are processed for writing (the buffer cache won't assume that they are
- * actually clean until the buffer gets unlocked).
+ * BH_Lock state bit), any buffer that appears to be clean when doing a write
+ * request, and any buffer that appears to be up-to-date when doing read
+ * request.  Further it marks as clean buffers that are processed for
+ * writing (the buffer cache won't assume that they are actually clean
+ * until the buffer gets unlocked).
  *
  * ll_rw_block sets b_end_io to simple completion handler that marks
  * the buffer up-to-date (if approriate), unlocks the buffer and wakes
@@ -2980,20 +2980,13 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
 	for (i = 0; i < nr; i++) {
 		struct buffer_head *bh = bhs[i];
 
-		if (rw == SWRITE || rw == SWRITE_SYNC || rw == SWRITE_SYNC_PLUG)
-			lock_buffer(bh);
-		else if (!trylock_buffer(bh))
+		if (!trylock_buffer(bh))
 			continue;
-
-		if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC ||
-		    rw == SWRITE_SYNC_PLUG) {
+		if (rw == WRITE) {
 			if (test_clear_buffer_dirty(bh)) {
 				bh->b_end_io = end_buffer_write_sync;
 				get_bh(bh);
-				if (rw == SWRITE_SYNC)
-					submit_bh(WRITE_SYNC, bh);
-				else
-					submit_bh(WRITE, bh);
+				submit_bh(WRITE, bh);
 				continue;
 			}
 		} else {
@@ -3009,6 +3002,19 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
 }
 EXPORT_SYMBOL(ll_rw_block);
 
+void write_dirty_buffer(struct buffer_head *bh, int rw)
+{
+	lock_buffer(bh);
+	if (!test_clear_buffer_dirty(bh)) {
+		unlock_buffer(bh);
+		return;
+	}
+	bh->b_end_io = end_buffer_write_sync;
+	get_bh(bh);
+	submit_bh(rw, bh);
+}
+EXPORT_SYMBOL(write_dirty_buffer);
+
 /*
  * For a data-integrity writeout, we need to wait upon any in-progress I/O
  * and then start new I/O and then wait upon it.  The caller must have a ref on
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index 1fa23f6ffba5..1736f2356388 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -250,7 +250,9 @@ int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs)
 {
 	int i, err = 0;
 
-	ll_rw_block(SWRITE, nr_bhs, bhs);
+	for (i = 0; i < nr_bhs; i++)
+		write_dirty_buffer(bhs[i], WRITE);
+
 	for (i = 0; i < nr_bhs; i++) {
 		wait_on_buffer(bhs[i]);
 		if (buffer_eopnotsupp(bhs[i])) {
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c
index b0435dd0654d..05a38b9c4c0e 100644
--- a/fs/jbd/checkpoint.c
+++ b/fs/jbd/checkpoint.c
@@ -254,7 +254,9 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
 {
 	int i;
 
-	ll_rw_block(SWRITE, *batch_count, bhs);
+	for (i = 0; i < *batch_count; i++)
+		write_dirty_buffer(bhs[i], WRITE);
+
 	for (i = 0; i < *batch_count; i++) {
 		struct buffer_head *bh = bhs[i];
 		clear_buffer_jwrite(bh);
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index f19ce94693d8..2c4b1f109da9 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -1024,7 +1024,7 @@ void journal_update_superblock(journal_t *journal, int wait)
 	if (wait)
 		sync_dirty_buffer(bh);
 	else
-		ll_rw_block(SWRITE, 1, &bh);
+		write_dirty_buffer(bh, WRITE);
 
 out:
 	/* If we have just flushed the log (by marking s_start==0), then
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index ad717328343a..d29018307e2e 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -617,7 +617,7 @@ static void flush_descriptor(journal_t *journal,
 	set_buffer_jwrite(bh);
 	BUFFER_TRACE(bh, "write");
 	set_buffer_dirty(bh);
-	ll_rw_block((write_op == WRITE) ? SWRITE : SWRITE_SYNC_PLUG, 1, &bh);
+	write_dirty_buffer(bh, write_op);
 }
 #endif
 
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 1c23a0f4e8a3..5247e7ffdcb4 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -255,7 +255,9 @@ __flush_batch(journal_t *journal, int *batch_count)
 {
 	int i;
 
-	ll_rw_block(SWRITE, *batch_count, journal->j_chkpt_bhs);
+	for (i = 0; i < *batch_count; i++)
+		write_dirty_buffer(journal->j_chkpt_bhs[i], WRITE);
+
 	for (i = 0; i < *batch_count; i++) {
 		struct buffer_head *bh = journal->j_chkpt_bhs[i];
 		clear_buffer_jwrite(bh);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index ad5866aaf0f9..0e8014ea6b94 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1124,7 +1124,7 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait)
 			set_buffer_uptodate(bh);
 		}
 	} else
-		ll_rw_block(SWRITE, 1, &bh);
+		write_dirty_buffer(bh, WRITE);
 
 out:
 	/* If we have just flushed the log (by marking s_start==0), then
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index a360b06af2e3..9ad321fd63fd 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -625,7 +625,7 @@ static void flush_descriptor(journal_t *journal,
 	set_buffer_jwrite(bh);
 	BUFFER_TRACE(bh, "write");
 	set_buffer_dirty(bh);
-	ll_rw_block((write_op == WRITE) ? SWRITE : SWRITE_SYNC_PLUG, 1, &bh);
+	write_dirty_buffer(bh, write_op);
 }
 #endif
 
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 1ec952b1f036..812e2c05aa29 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2311,7 +2311,7 @@ static int journal_read_transaction(struct super_block *sb,
 	/* flush out the real blocks */
 	for (i = 0; i < get_desc_trans_len(desc); i++) {
 		set_buffer_dirty(real_blocks[i]);
-		ll_rw_block(SWRITE, 1, real_blocks + i);
+		write_dirty_buffer(real_blocks[i], WRITE);
 	}
 	for (i = 0; i < get_desc_trans_len(desc); i++) {
 		wait_on_buffer(real_blocks[i]);
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 048484fb10d2..46f7a807bbc1 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -114,10 +114,8 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
 	
 	ubh_mark_buffer_dirty (USPI_UBH(uspi));
 	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
-	if (sb->s_flags & MS_SYNCHRONOUS) {
-		ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
-		ubh_wait_on_buffer (UCPI_UBH(ucpi));
-	}
+	if (sb->s_flags & MS_SYNCHRONOUS)
+		ubh_sync_block(UCPI_UBH(ucpi));
 	sb->s_dirt = 1;
 	
 	unlock_super (sb);
@@ -207,10 +205,8 @@ do_more:
 
 	ubh_mark_buffer_dirty (USPI_UBH(uspi));
 	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
-	if (sb->s_flags & MS_SYNCHRONOUS) {
-		ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
-		ubh_wait_on_buffer (UCPI_UBH(ucpi));
-	}
+	if (sb->s_flags & MS_SYNCHRONOUS)
+		ubh_sync_block(UCPI_UBH(ucpi));
 
 	if (overflow) {
 		fragment += count;
@@ -558,10 +554,8 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
 	
 	ubh_mark_buffer_dirty (USPI_UBH(uspi));
 	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
-	if (sb->s_flags & MS_SYNCHRONOUS) {
-		ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
-		ubh_wait_on_buffer (UCPI_UBH(ucpi));
-	}
+	if (sb->s_flags & MS_SYNCHRONOUS)
+		ubh_sync_block(UCPI_UBH(ucpi));
 	sb->s_dirt = 1;
 
 	UFSD("EXIT, fragment %llu\n", (unsigned long long)fragment);
@@ -680,10 +674,8 @@ cg_found:
 succed:
 	ubh_mark_buffer_dirty (USPI_UBH(uspi));
 	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
-	if (sb->s_flags & MS_SYNCHRONOUS) {
-		ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
-		ubh_wait_on_buffer (UCPI_UBH(ucpi));
-	}
+	if (sb->s_flags & MS_SYNCHRONOUS)
+		ubh_sync_block(UCPI_UBH(ucpi));
 	sb->s_dirt = 1;
 
 	result += cgno * uspi->s_fpg;
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 428017e018fe..2eabf04af3de 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -113,10 +113,8 @@ void ufs_free_inode (struct inode * inode)
 
 	ubh_mark_buffer_dirty (USPI_UBH(uspi));
 	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
-	if (sb->s_flags & MS_SYNCHRONOUS) {
-		ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
-		ubh_wait_on_buffer (UCPI_UBH(ucpi));
-	}
+	if (sb->s_flags & MS_SYNCHRONOUS)
+		ubh_sync_block(UCPI_UBH(ucpi));
 	
 	sb->s_dirt = 1;
 	unlock_super (sb);
@@ -156,10 +154,8 @@ static void ufs2_init_inodes_chunk(struct super_block *sb,
 
 	fs32_add(sb, &ucg->cg_u.cg_u2.cg_initediblk, uspi->s_inopb);
 	ubh_mark_buffer_dirty(UCPI_UBH(ucpi));
-	if (sb->s_flags & MS_SYNCHRONOUS) {
-		ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
-		ubh_wait_on_buffer(UCPI_UBH(ucpi));
-	}
+	if (sb->s_flags & MS_SYNCHRONOUS)
+		ubh_sync_block(UCPI_UBH(ucpi));
 
 	UFSD("EXIT\n");
 }
@@ -290,10 +286,8 @@ cg_found:
 	}
 	ubh_mark_buffer_dirty (USPI_UBH(uspi));
 	ubh_mark_buffer_dirty (UCPI_UBH(ucpi));
-	if (sb->s_flags & MS_SYNCHRONOUS) {
-		ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi));
-		ubh_wait_on_buffer (UCPI_UBH(ucpi));
-	}
+	if (sb->s_flags & MS_SYNCHRONOUS)
+		ubh_sync_block(UCPI_UBH(ucpi));
 	sb->s_dirt = 1;
 
 	inode->i_ino = cg * uspi->s_ipg + bit;
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index 34d5cb135320..a58f9155fc9a 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -243,10 +243,8 @@ static int ufs_trunc_indirect(struct inode *inode, u64 offset, void *p)
 		ubh_bforget(ind_ubh);
 		ind_ubh = NULL;
 	}
-	if (IS_SYNC(inode) && ind_ubh && ubh_buffer_dirty(ind_ubh)) {
-		ubh_ll_rw_block(SWRITE, ind_ubh);
-		ubh_wait_on_buffer (ind_ubh);
-	}
+	if (IS_SYNC(inode) && ind_ubh && ubh_buffer_dirty(ind_ubh))
+		ubh_sync_block(ind_ubh);
 	ubh_brelse (ind_ubh);
 	
 	UFSD("EXIT: ino %lu\n", inode->i_ino);
@@ -307,10 +305,8 @@ static int ufs_trunc_dindirect(struct inode *inode, u64 offset, void *p)
 		ubh_bforget(dind_bh);
 		dind_bh = NULL;
 	}
-	if (IS_SYNC(inode) && dind_bh && ubh_buffer_dirty(dind_bh)) {
-		ubh_ll_rw_block(SWRITE, dind_bh);
-		ubh_wait_on_buffer (dind_bh);
-	}
+	if (IS_SYNC(inode) && dind_bh && ubh_buffer_dirty(dind_bh))
+		ubh_sync_block(dind_bh);
 	ubh_brelse (dind_bh);
 	
 	UFSD("EXIT: ino %lu\n", inode->i_ino);
@@ -367,10 +363,8 @@ static int ufs_trunc_tindirect(struct inode *inode)
 		ubh_bforget(tind_bh);
 		tind_bh = NULL;
 	}
-	if (IS_SYNC(inode) && tind_bh && ubh_buffer_dirty(tind_bh)) {
-		ubh_ll_rw_block(SWRITE, tind_bh);
-		ubh_wait_on_buffer (tind_bh);
-	}
+	if (IS_SYNC(inode) && tind_bh && ubh_buffer_dirty(tind_bh))
+		ubh_sync_block(tind_bh);
 	ubh_brelse (tind_bh);
 	
 	UFSD("EXIT: ino %lu\n", inode->i_ino);
diff --git a/fs/ufs/util.c b/fs/ufs/util.c
index 85a7fc9e4a4e..d2c36d53fe66 100644
--- a/fs/ufs/util.c
+++ b/fs/ufs/util.c
@@ -113,21 +113,17 @@ void ubh_mark_buffer_uptodate (struct ufs_buffer_head * ubh, int flag)
 	}
 }
 
-void ubh_ll_rw_block(int rw, struct ufs_buffer_head *ubh)
+void ubh_sync_block(struct ufs_buffer_head *ubh)
 {
-	if (!ubh)
-		return;
+	if (ubh) {
+		unsigned i;
 
-	ll_rw_block(rw, ubh->count, ubh->bh);
-}
+		for (i = 0; i < ubh->count; i++)
+			write_dirty_buffer(ubh->bh[i], WRITE);
 
-void ubh_wait_on_buffer (struct ufs_buffer_head * ubh)
-{
-	unsigned i;
-	if (!ubh)
-		return;
-	for ( i = 0; i < ubh->count; i++ )
-		wait_on_buffer (ubh->bh[i]);
+		for (i = 0; i < ubh->count; i++)
+			wait_on_buffer(ubh->bh[i]);
+	}
 }
 
 void ubh_bforget (struct ufs_buffer_head * ubh)
diff --git a/fs/ufs/util.h b/fs/ufs/util.h
index 0466036912f1..9f8775ce381c 100644
--- a/fs/ufs/util.h
+++ b/fs/ufs/util.h
@@ -269,8 +269,7 @@ extern void ubh_brelse (struct ufs_buffer_head *);
 extern void ubh_brelse_uspi (struct ufs_sb_private_info *);
 extern void ubh_mark_buffer_dirty (struct ufs_buffer_head *);
 extern void ubh_mark_buffer_uptodate (struct ufs_buffer_head *, int);
-extern void ubh_ll_rw_block(int, struct ufs_buffer_head *);
-extern void ubh_wait_on_buffer (struct ufs_buffer_head *);
+extern void ubh_sync_block(struct ufs_buffer_head *);
 extern void ubh_bforget (struct ufs_buffer_head *);
 extern int  ubh_buffer_dirty (struct ufs_buffer_head *);
 #define ubh_ubhcpymem(mem,ubh,size) _ubh_ubhcpymem_(uspi,mem,ubh,size)
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 72c1cf83eb85..ec94c12f21da 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -182,6 +182,7 @@ void __lock_buffer(struct buffer_head *bh);
 void ll_rw_block(int, int, struct buffer_head * bh[]);
 int sync_dirty_buffer(struct buffer_head *bh);
 int __sync_dirty_buffer(struct buffer_head *bh, int rw);
+void write_dirty_buffer(struct buffer_head *bh, int rw);
 int submit_bh(int, struct buffer_head *);
 void write_boundary_block(struct block_device *bdev,
 			sector_t bblock, unsigned blocksize);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9a96b4d83fc1..29f7c975304c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -125,9 +125,6 @@ struct inodes_stat_t {
  *			block layer could (in theory) choose to ignore this
  *			request if it runs into resource problems.
  * WRITE		A normal async write. Device will be plugged.
- * SWRITE		Like WRITE, but a special case for ll_rw_block() that
- *			tells it to lock the buffer first. Normally a buffer
- *			must be locked before doing IO.
  * WRITE_SYNC_PLUG	Synchronous write. Identical to WRITE, but passes down
  *			the hint that someone will be waiting on this IO
  *			shortly. The device must still be unplugged explicitly,
@@ -138,9 +135,6 @@ struct inodes_stat_t {
  *			immediately after submission. The write equivalent
  *			of READ_SYNC.
  * WRITE_ODIRECT_PLUG	Special case write for O_DIRECT only.
- * SWRITE_SYNC
- * SWRITE_SYNC_PLUG	Like WRITE_SYNC/WRITE_SYNC_PLUG, but locks the buffer.
- *			See SWRITE.
  * WRITE_BARRIER	Like WRITE_SYNC, but tells the block layer that all
  *			previously submitted writes must be safely on storage
  *			before this one is started. Also guarantees that when
@@ -155,7 +149,6 @@ struct inodes_stat_t {
 #define READ			0
 #define WRITE			RW_MASK
 #define READA			RWA_MASK
-#define SWRITE			(WRITE | READA)
 
 #define READ_SYNC		(READ | REQ_SYNC | REQ_UNPLUG)
 #define READ_META		(READ | REQ_META)
@@ -165,8 +158,6 @@ struct inodes_stat_t {
 #define WRITE_META		(WRITE | REQ_META)
 #define WRITE_BARRIER		(WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \
 				 REQ_HARDBARRIER)
-#define SWRITE_SYNC_PLUG	(SWRITE | REQ_SYNC | REQ_NOIDLE)
-#define SWRITE_SYNC		(SWRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG)
 
 /*
  * These aren't really reads or writes, they pass down information about
-- 
cgit v1.2.3


From 850a496f969719b494cc972ab1d0e088737358d7 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 18 Aug 2010 06:18:57 -0400
Subject: hostfs: dumb (and usually harmless) tpyo - strncpy instead of strlcpy

... not harmless in this case - we have a string in the end of buffer
already.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/hostfs/hostfs_kern.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index dd1e55535a4e..77c4f6ee6c40 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -104,7 +104,7 @@ static char *__dentry_name(struct dentry *dentry, char *name)
 		__putname(name);
 		return NULL;
 	}
-	strncpy(name, root, PATH_MAX);
+	strlcpy(name, root, PATH_MAX);
 	if (len > p - name) {
 		__putname(name);
 		return NULL;
-- 
cgit v1.2.3


From 3b6036d148bad5bb7928b021a49bb9e395361084 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 18 Aug 2010 06:21:10 -0400
Subject: hostfs ->follow_link() braino

we want the assignment to err done inside the if () to be
visible after it, so (re)declaring err inside if () body
is wrong.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/hostfs/hostfs_kern.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 77c4f6ee6c40..f7dc9b5f9ef8 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -876,7 +876,7 @@ static void *hostfs_follow_link(struct dentry *dentry, struct nameidata *nd)
 		char *path = dentry_name(dentry);
 		int err = -ENOMEM;
 		if (path) {
-			int err = hostfs_do_readlink(path, link, PATH_MAX);
+			err = hostfs_do_readlink(path, link, PATH_MAX);
 			if (err == PATH_MAX)
 				err = -E2BIG;
 			__putname(path);
-- 
cgit v1.2.3


From 3a48ee8a4ad26c3a538b6fc11a86a8f80c3dce18 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruen@suse.de>
Date: Mon, 16 Aug 2010 19:05:23 +0200
Subject: mbcache: Limit the maximum number of cache entries

Limit the maximum number of mb_cache entries depending on the number of
hash buckets: if the only limit to the number of cache entries is the
available memory the hash chains can grow very long, taking a long time
to search.

At least partially solves https://bugzilla.lustre.org/show_bug.cgi?id=22771.

Signed-off-by: Andreas Gruenbacher <agruen@suse.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/mbcache.c | 30 ++++++++++++++++++++++++------
 1 file changed, 24 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/mbcache.c b/fs/mbcache.c
index cf4e6cdfd15b..93444747237b 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -80,6 +80,7 @@ struct mb_cache {
 	struct list_head		c_cache_list;
 	const char			*c_name;
 	atomic_t			c_entry_count;
+	int				c_max_entries;
 	int				c_bucket_bits;
 	struct kmem_cache		*c_entry_cache;
 	struct list_head		*c_block_hash;
@@ -243,6 +244,12 @@ mb_cache_create(const char *name, int bucket_bits)
 	if (!cache->c_entry_cache)
 		goto fail2;
 
+	/*
+	 * Set an upper limit on the number of cache entries so that the hash
+	 * chains won't grow too long.
+	 */
+	cache->c_max_entries = bucket_count << 4;
+
 	spin_lock(&mb_cache_spinlock);
 	list_add(&cache->c_cache_list, &mb_cache_list);
 	spin_unlock(&mb_cache_spinlock);
@@ -333,7 +340,6 @@ mb_cache_destroy(struct mb_cache *cache)
 	kfree(cache);
 }
 
-
 /*
  * mb_cache_entry_alloc()
  *
@@ -345,17 +351,29 @@ mb_cache_destroy(struct mb_cache *cache)
 struct mb_cache_entry *
 mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags)
 {
-	struct mb_cache_entry *ce;
-
-	ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags);
-	if (ce) {
+	struct mb_cache_entry *ce = NULL;
+
+	if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) {
+		spin_lock(&mb_cache_spinlock);
+		if (!list_empty(&mb_cache_lru_list)) {
+			ce = list_entry(mb_cache_lru_list.next,
+					struct mb_cache_entry, e_lru_list);
+			list_del_init(&ce->e_lru_list);
+			__mb_cache_entry_unhash(ce);
+		}
+		spin_unlock(&mb_cache_spinlock);
+	}
+	if (!ce) {
+		ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags);
+		if (!ce)
+			return NULL;
 		atomic_inc(&cache->c_entry_count);
 		INIT_LIST_HEAD(&ce->e_lru_list);
 		INIT_LIST_HEAD(&ce->e_block_list);
 		ce->e_cache = cache;
-		ce->e_used = 1 + MB_CACHE_WRITER;
 		ce->e_queued = 0;
 	}
+	ce->e_used = 1 + MB_CACHE_WRITER;
 	return ce;
 }
 
-- 
cgit v1.2.3


From 2e2e88ea8c3bd9e1bd6e42faf047a4ac3fbb3b2f Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Wed, 18 Aug 2010 04:37:30 +1000
Subject: fs: fix do_lookup false negative

fs: fix do_lookup false negative

In do_lookup, if we initially find no dentry, we take the directory i_mutex and
re-check the lookup. If we find a dentry there, then we revalidate it if
needed. However if that revalidate asks for the dentry to be invalidated, we
return -ENOENT from do_lookup. What should happen instead is an attempt to
allocate and lookup a new dentry.

This is probably not noticed because it is rare. It is only reached if a
concurrent create races in first (in which case, the dentry probably won't be
invalidated anyway), or if the racy __d_lookup has failed due to a
false-negative (which is very rare).

Fix this by removing code and have it use the normal reval path.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 17ea76bf2fbe..c2742b7dec59 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -709,6 +709,7 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
 	dentry = __d_lookup(nd->path.dentry, name);
 	if (!dentry)
 		goto need_lookup;
+found:
 	if (dentry->d_op && dentry->d_op->d_revalidate)
 		goto need_revalidate;
 done:
@@ -766,14 +767,7 @@ out_unlock:
 	 * we waited on the semaphore. Need to revalidate.
 	 */
 	mutex_unlock(&dir->i_mutex);
-	if (dentry->d_op && dentry->d_op->d_revalidate) {
-		dentry = do_revalidate(dentry, nd);
-		if (!dentry)
-			dentry = ERR_PTR(-ENOENT);
-	}
-	if (IS_ERR(dentry))
-		goto fail;
-	goto done;
+	goto found;
 
 need_revalidate:
 	dentry = do_revalidate(dentry, nd);
-- 
cgit v1.2.3


From baa0389073eb7beb9d36f6d13df97e16c1bfa626 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Wed, 18 Aug 2010 04:37:31 +1000
Subject: fs: dentry allocation consolidation

fs: dentry allocation consolidation

There are 2 duplicate copies of code in dentry allocation in path lookup.
Consolidate them into a single function.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 70 +++++++++++++++++++++++++++++---------------------------------
 1 file changed, 33 insertions(+), 37 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index c2742b7dec59..b815a4d2e1d6 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -685,6 +685,35 @@ static __always_inline void follow_dotdot(struct nameidata *nd)
 	follow_mount(&nd->path);
 }
 
+/*
+ * Allocate a dentry with name and parent, and perform a parent
+ * directory ->lookup on it. Returns the new dentry, or ERR_PTR
+ * on error. parent->d_inode->i_mutex must be held. d_lookup must
+ * have verified that no child exists while under i_mutex.
+ */
+static struct dentry *d_alloc_and_lookup(struct dentry *parent,
+				struct qstr *name, struct nameidata *nd)
+{
+	struct inode *inode = parent->d_inode;
+	struct dentry *dentry;
+	struct dentry *old;
+
+	/* Don't create child dentry for a dead directory. */
+	if (unlikely(IS_DEADDIR(inode)))
+		return ERR_PTR(-ENOENT);
+
+	dentry = d_alloc(parent, name);
+	if (unlikely(!dentry))
+		return ERR_PTR(-ENOMEM);
+
+	old = inode->i_op->lookup(inode, dentry, nd);
+	if (unlikely(old)) {
+		dput(dentry);
+		dentry = old;
+	}
+	return dentry;
+}
+
 /*
  *  It's more convoluted than I'd like it to be, but... it's still fairly
  *  small and for now I'd prefer to have fast path as straight as possible.
@@ -738,30 +767,13 @@ need_lookup:
 	 * so doing d_lookup() (with seqlock), instead of lockfree __d_lookup
 	 */
 	dentry = d_lookup(parent, name);
-	if (!dentry) {
-		struct dentry *new;
-
-		/* Don't create child dentry for a dead directory. */
-		dentry = ERR_PTR(-ENOENT);
-		if (IS_DEADDIR(dir))
-			goto out_unlock;
-
-		new = d_alloc(parent, name);
-		dentry = ERR_PTR(-ENOMEM);
-		if (new) {
-			dentry = dir->i_op->lookup(dir, new, nd);
-			if (dentry)
-				dput(new);
-			else
-				dentry = new;
-		}
-out_unlock:
+	if (likely(!dentry)) {
+		dentry = d_alloc_and_lookup(parent, name, nd);
 		mutex_unlock(&dir->i_mutex);
 		if (IS_ERR(dentry))
 			goto fail;
 		goto done;
 	}
-
 	/*
 	 * Uhhuh! Nasty case: the cache was re-populated while
 	 * we waited on the semaphore. Need to revalidate.
@@ -1135,24 +1147,8 @@ static struct dentry *__lookup_hash(struct qstr *name,
 	if (dentry && dentry->d_op && dentry->d_op->d_revalidate)
 		dentry = do_revalidate(dentry, nd);
 
-	if (!dentry) {
-		struct dentry *new;
-
-		/* Don't create child dentry for a dead directory. */
-		dentry = ERR_PTR(-ENOENT);
-		if (IS_DEADDIR(inode))
-			goto out;
-
-		new = d_alloc(base, name);
-		dentry = ERR_PTR(-ENOMEM);
-		if (!new)
-			goto out;
-		dentry = inode->i_op->lookup(inode, new, nd);
-		if (!dentry)
-			dentry = new;
-		else
-			dput(new);
-	}
+	if (!dentry)
+		dentry = d_alloc_and_lookup(base, name, nd);
 out:
 	return dentry;
 }
-- 
cgit v1.2.3


From 2a4419b5b2a77f3f4537c14f7ad7df95770655dd Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Wed, 18 Aug 2010 04:37:33 +1000
Subject: fs: fs_struct rwlock to spinlock

fs: fs_struct rwlock to spinlock

struct fs_struct.lock is an rwlock with the read-side used to protect root and
pwd members while taking references to them. Taking a reference to a path
typically requires just 2 atomic ops, so the critical section is very small.
Parallel read-side operations would have cacheline contention on the lock, the
dentry, and the vfsmount cachelines, so the rwlock is unlikely to ever give a
real parallelism increase.

Replace it with a spinlock to avoid one or two atomic operations in typical
path lookup fastpath.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/staging/pohmelfs/path_entry.c |  8 ++++----
 fs/exec.c                             |  4 ++--
 fs/fs_struct.c                        | 32 ++++++++++++++++----------------
 include/linux/fs_struct.h             | 14 +++++++-------
 kernel/fork.c                         | 10 +++++-----
 5 files changed, 34 insertions(+), 34 deletions(-)

(limited to 'fs')

diff --git a/drivers/staging/pohmelfs/path_entry.c b/drivers/staging/pohmelfs/path_entry.c
index cdc4dd50d638..8ec83d2dffb7 100644
--- a/drivers/staging/pohmelfs/path_entry.c
+++ b/drivers/staging/pohmelfs/path_entry.c
@@ -44,9 +44,9 @@ int pohmelfs_construct_path_string(struct pohmelfs_inode *pi, void *data, int le
 		return -ENOENT;
 	}
 
-	read_lock(&current->fs->lock);
+	spin_lock(&current->fs->lock);
 	path.mnt = mntget(current->fs->root.mnt);
-	read_unlock(&current->fs->lock);
+	spin_unlock(&current->fs->lock);
 
 	path.dentry = d;
 
@@ -91,9 +91,9 @@ int pohmelfs_path_length(struct pohmelfs_inode *pi)
 		return -ENOENT;
 	}
 
-	read_lock(&current->fs->lock);
+	spin_lock(&current->fs->lock);
 	root = dget(current->fs->root.dentry);
-	read_unlock(&current->fs->lock);
+	spin_unlock(&current->fs->lock);
 
 	spin_lock(&dcache_lock);
 
diff --git a/fs/exec.c b/fs/exec.c
index 7761837e4500..5adab2c93eca 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1117,7 +1117,7 @@ int check_unsafe_exec(struct linux_binprm *bprm)
 	bprm->unsafe = tracehook_unsafe_exec(p);
 
 	n_fs = 1;
-	write_lock(&p->fs->lock);
+	spin_lock(&p->fs->lock);
 	rcu_read_lock();
 	for (t = next_thread(p); t != p; t = next_thread(t)) {
 		if (t->fs == p->fs)
@@ -1134,7 +1134,7 @@ int check_unsafe_exec(struct linux_binprm *bprm)
 			res = 1;
 		}
 	}
-	write_unlock(&p->fs->lock);
+	spin_unlock(&p->fs->lock);
 
 	return res;
 }
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index 1ee40eb9a2c0..ed45a9cf5f3d 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -13,11 +13,11 @@ void set_fs_root(struct fs_struct *fs, struct path *path)
 {
 	struct path old_root;
 
-	write_lock(&fs->lock);
+	spin_lock(&fs->lock);
 	old_root = fs->root;
 	fs->root = *path;
 	path_get(path);
-	write_unlock(&fs->lock);
+	spin_unlock(&fs->lock);
 	if (old_root.dentry)
 		path_put(&old_root);
 }
@@ -30,11 +30,11 @@ void set_fs_pwd(struct fs_struct *fs, struct path *path)
 {
 	struct path old_pwd;
 
-	write_lock(&fs->lock);
+	spin_lock(&fs->lock);
 	old_pwd = fs->pwd;
 	fs->pwd = *path;
 	path_get(path);
-	write_unlock(&fs->lock);
+	spin_unlock(&fs->lock);
 
 	if (old_pwd.dentry)
 		path_put(&old_pwd);
@@ -51,7 +51,7 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root)
 		task_lock(p);
 		fs = p->fs;
 		if (fs) {
-			write_lock(&fs->lock);
+			spin_lock(&fs->lock);
 			if (fs->root.dentry == old_root->dentry
 			    && fs->root.mnt == old_root->mnt) {
 				path_get(new_root);
@@ -64,7 +64,7 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root)
 				fs->pwd = *new_root;
 				count++;
 			}
-			write_unlock(&fs->lock);
+			spin_unlock(&fs->lock);
 		}
 		task_unlock(p);
 	} while_each_thread(g, p);
@@ -87,10 +87,10 @@ void exit_fs(struct task_struct *tsk)
 	if (fs) {
 		int kill;
 		task_lock(tsk);
-		write_lock(&fs->lock);
+		spin_lock(&fs->lock);
 		tsk->fs = NULL;
 		kill = !--fs->users;
-		write_unlock(&fs->lock);
+		spin_unlock(&fs->lock);
 		task_unlock(tsk);
 		if (kill)
 			free_fs_struct(fs);
@@ -104,7 +104,7 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
 	if (fs) {
 		fs->users = 1;
 		fs->in_exec = 0;
-		rwlock_init(&fs->lock);
+		spin_lock_init(&fs->lock);
 		fs->umask = old->umask;
 		get_fs_root_and_pwd(old, &fs->root, &fs->pwd);
 	}
@@ -121,10 +121,10 @@ int unshare_fs_struct(void)
 		return -ENOMEM;
 
 	task_lock(current);
-	write_lock(&fs->lock);
+	spin_lock(&fs->lock);
 	kill = !--fs->users;
 	current->fs = new_fs;
-	write_unlock(&fs->lock);
+	spin_unlock(&fs->lock);
 	task_unlock(current);
 
 	if (kill)
@@ -143,7 +143,7 @@ EXPORT_SYMBOL(current_umask);
 /* to be mentioned only in INIT_TASK */
 struct fs_struct init_fs = {
 	.users		= 1,
-	.lock		= __RW_LOCK_UNLOCKED(init_fs.lock),
+	.lock		= __SPIN_LOCK_UNLOCKED(init_fs.lock),
 	.umask		= 0022,
 };
 
@@ -156,14 +156,14 @@ void daemonize_fs_struct(void)
 
 		task_lock(current);
 
-		write_lock(&init_fs.lock);
+		spin_lock(&init_fs.lock);
 		init_fs.users++;
-		write_unlock(&init_fs.lock);
+		spin_unlock(&init_fs.lock);
 
-		write_lock(&fs->lock);
+		spin_lock(&fs->lock);
 		current->fs = &init_fs;
 		kill = !--fs->users;
-		write_unlock(&fs->lock);
+		spin_unlock(&fs->lock);
 
 		task_unlock(current);
 		if (kill)
diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h
index eca3d5202138..a42b5bf02f8b 100644
--- a/include/linux/fs_struct.h
+++ b/include/linux/fs_struct.h
@@ -5,7 +5,7 @@
 
 struct fs_struct {
 	int users;
-	rwlock_t lock;
+	spinlock_t lock;
 	int umask;
 	int in_exec;
 	struct path root, pwd;
@@ -23,29 +23,29 @@ extern int unshare_fs_struct(void);
 
 static inline void get_fs_root(struct fs_struct *fs, struct path *root)
 {
-	read_lock(&fs->lock);
+	spin_lock(&fs->lock);
 	*root = fs->root;
 	path_get(root);
-	read_unlock(&fs->lock);
+	spin_unlock(&fs->lock);
 }
 
 static inline void get_fs_pwd(struct fs_struct *fs, struct path *pwd)
 {
-	read_lock(&fs->lock);
+	spin_lock(&fs->lock);
 	*pwd = fs->pwd;
 	path_get(pwd);
-	read_unlock(&fs->lock);
+	spin_unlock(&fs->lock);
 }
 
 static inline void get_fs_root_and_pwd(struct fs_struct *fs, struct path *root,
 				       struct path *pwd)
 {
-	read_lock(&fs->lock);
+	spin_lock(&fs->lock);
 	*root = fs->root;
 	path_get(root);
 	*pwd = fs->pwd;
 	path_get(pwd);
-	read_unlock(&fs->lock);
+	spin_unlock(&fs->lock);
 }
 
 #endif /* _LINUX_FS_STRUCT_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index 98b450876f93..856eac3ec52e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -752,13 +752,13 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
 	struct fs_struct *fs = current->fs;
 	if (clone_flags & CLONE_FS) {
 		/* tsk->fs is already what we want */
-		write_lock(&fs->lock);
+		spin_lock(&fs->lock);
 		if (fs->in_exec) {
-			write_unlock(&fs->lock);
+			spin_unlock(&fs->lock);
 			return -EAGAIN;
 		}
 		fs->users++;
-		write_unlock(&fs->lock);
+		spin_unlock(&fs->lock);
 		return 0;
 	}
 	tsk->fs = copy_fs_struct(fs);
@@ -1676,13 +1676,13 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
 
 		if (new_fs) {
 			fs = current->fs;
-			write_lock(&fs->lock);
+			spin_lock(&fs->lock);
 			current->fs = new_fs;
 			if (--fs->users)
 				new_fs = NULL;
 			else
 				new_fs = fs;
-			write_unlock(&fs->lock);
+			spin_unlock(&fs->lock);
 		}
 
 		if (new_mm) {
-- 
cgit v1.2.3


From b04f784e5d19ed58892833dae845738972cea260 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Wed, 18 Aug 2010 04:37:34 +1000
Subject: fs: remove extra lookup in __lookup_hash

fs: remove extra lookup in __lookup_hash

Optimize lookup for create operations, where no dentry should often be
common-case. In cases where it is not, such as unlink, the added overhead
is much smaller than the removed.

Also, move comments about __d_lookup racyness to the __d_lookup call site.
d_lookup is intuitive; __d_lookup is what needs commenting. So in that same
vein, add kerneldoc comments to __d_lookup and clean up some of the comments:

- We are interested in how the RCU lookup works here, particularly with
  renames. Make that explicit, and point to the document where it is explained
  in more detail.
- RCU is pretty standard now, and macros make implementations pretty mindless.
  If we want to know about RCU barrier details, we look in RCU code.
- Delete some boring legacy comments because we don't care much about how the
  code used to work, more about the interesting parts of how it works now. So
  comments about lazy LRU may be interesting, but would better be done in the
  LRU or refcount management code.

Signed-off-by: Nick Piggin <npiggin@kernel.dk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 60 +++++++++++++++++++++++++++++++++++-------------------------
 fs/namei.c  | 32 ++++++++++++++++----------------
 2 files changed, 51 insertions(+), 41 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index 4d13bf50b7b1..d56a40b5a577 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1332,31 +1332,13 @@ EXPORT_SYMBOL(d_add_ci);
  * d_lookup - search for a dentry
  * @parent: parent dentry
  * @name: qstr of name we wish to find
+ * Returns: dentry, or NULL
  *
- * Searches the children of the parent dentry for the name in question. If
- * the dentry is found its reference count is incremented and the dentry
- * is returned. The caller must use dput to free the entry when it has
- * finished using it. %NULL is returned on failure.
- *
- * __d_lookup is dcache_lock free. The hash list is protected using RCU.
- * Memory barriers are used while updating and doing lockless traversal. 
- * To avoid races with d_move while rename is happening, d_lock is used.
- *
- * Overflows in memcmp(), while d_move, are avoided by keeping the length
- * and name pointer in one structure pointed by d_qstr.
- *
- * rcu_read_lock() and rcu_read_unlock() are used to disable preemption while
- * lookup is going on.
- *
- * The dentry unused LRU is not updated even if lookup finds the required dentry
- * in there. It is updated in places such as prune_dcache, shrink_dcache_sb,
- * select_parent and __dget_locked. This laziness saves lookup from dcache_lock
- * acquisition.
- *
- * d_lookup() is protected against the concurrent renames in some unrelated
- * directory using the seqlockt_t rename_lock.
+ * d_lookup searches the children of the parent dentry for the name in
+ * question. If the dentry is found its reference count is incremented and the
+ * dentry is returned. The caller must use dput to free the entry when it has
+ * finished using it. %NULL is returned if the dentry does not exist.
  */
-
 struct dentry * d_lookup(struct dentry * parent, struct qstr * name)
 {
 	struct dentry * dentry = NULL;
@@ -1372,6 +1354,21 @@ struct dentry * d_lookup(struct dentry * parent, struct qstr * name)
 }
 EXPORT_SYMBOL(d_lookup);
 
+/*
+ * __d_lookup - search for a dentry (racy)
+ * @parent: parent dentry
+ * @name: qstr of name we wish to find
+ * Returns: dentry, or NULL
+ *
+ * __d_lookup is like d_lookup, however it may (rarely) return a
+ * false-negative result due to unrelated rename activity.
+ *
+ * __d_lookup is slightly faster by avoiding rename_lock read seqlock,
+ * however it must be used carefully, eg. with a following d_lookup in
+ * the case of failure.
+ *
+ * __d_lookup callers must be commented.
+ */
 struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
 {
 	unsigned int len = name->len;
@@ -1382,6 +1379,19 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
 	struct hlist_node *node;
 	struct dentry *dentry;
 
+	/*
+	 * The hash list is protected using RCU.
+	 *
+	 * Take d_lock when comparing a candidate dentry, to avoid races
+	 * with d_move().
+	 *
+	 * It is possible that concurrent renames can mess up our list
+	 * walk here and result in missing our dentry, resulting in the
+	 * false-negative result. d_lookup() protects against concurrent
+	 * renames using rename_lock seqlock.
+	 *
+	 * See Documentation/vfs/dcache-locking.txt for more details.
+	 */
 	rcu_read_lock();
 	
 	hlist_for_each_entry_rcu(dentry, node, head, d_hash) {
@@ -1396,8 +1406,8 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name)
 
 		/*
 		 * Recheck the dentry after taking the lock - d_move may have
-		 * changed things.  Don't bother checking the hash because we're
-		 * about to compare the whole name anyway.
+		 * changed things. Don't bother checking the hash because
+		 * we're about to compare the whole name anyway.
 		 */
 		if (dentry->d_parent != parent)
 			goto next;
diff --git a/fs/namei.c b/fs/namei.c
index b815a4d2e1d6..11de7c39ff76 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -735,6 +735,11 @@ static int do_lookup(struct nameidata *nd, struct qstr *name,
 			return err;
 	}
 
+	/*
+	 * Rename seqlock is not required here because in the off chance
+	 * of a false negative due to a concurrent rename, we're going to
+	 * do the non-racy lookup, below.
+	 */
 	dentry = __d_lookup(nd->path.dentry, name);
 	if (!dentry)
 		goto need_lookup;
@@ -754,17 +759,13 @@ need_lookup:
 	mutex_lock(&dir->i_mutex);
 	/*
 	 * First re-do the cached lookup just in case it was created
-	 * while we waited for the directory semaphore..
-	 *
-	 * FIXME! This could use version numbering or similar to
-	 * avoid unnecessary cache lookups.
-	 *
-	 * The "dcache_lock" is purely to protect the RCU list walker
-	 * from concurrent renames at this point (we mustn't get false
-	 * negatives from the RCU list walk here, unlike the optimistic
-	 * fast walk).
+	 * while we waited for the directory semaphore, or the first
+	 * lookup failed due to an unrelated rename.
 	 *
-	 * so doing d_lookup() (with seqlock), instead of lockfree __d_lookup
+	 * This could use version numbering or similar to avoid unnecessary
+	 * cache lookups, but then we'd have to do the first lookup in the
+	 * non-racy way. However in the common case here, everything should
+	 * be hot in cache, so would it be a big win?
 	 */
 	dentry = d_lookup(parent, name);
 	if (likely(!dentry)) {
@@ -1136,13 +1137,12 @@ static struct dentry *__lookup_hash(struct qstr *name,
 			goto out;
 	}
 
-	dentry = __d_lookup(base, name);
-
-	/* lockess __d_lookup may fail due to concurrent d_move()
-	 * in some unrelated directory, so try with d_lookup
+	/*
+	 * Don't bother with __d_lookup: callers are for creat as
+	 * well as unlink, so a lot of the time it would cost
+	 * a double lookup.
 	 */
-	if (!dentry)
-		dentry = d_lookup(base, name);
+	dentry = d_lookup(base, name);
 
 	if (dentry && dentry->d_op && dentry->d_op->d_revalidate)
 		dentry = do_revalidate(dentry, nd);
-- 
cgit v1.2.3


From ee2ffa0dfdd2db19705f2ba1c6a4c0bfe8122dd8 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Wed, 18 Aug 2010 04:37:35 +1000
Subject: fs: cleanup files_lock locking

fs: cleanup files_lock locking

Lock tty_files with a new spinlock, tty_files_lock; provide helpers to
manipulate the per-sb files list; unexport the files_lock spinlock.

Cc: linux-kernel@vger.kernel.org
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Acked-by: Andi Kleen <ak@linux.intel.com>
Acked-by: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/char/pty.c       |  6 +++++-
 drivers/char/tty_io.c    | 26 ++++++++++++++++++--------
 fs/file_table.c          | 42 ++++++++++++++++++------------------------
 fs/open.c                |  4 ++--
 include/linux/fs.h       |  7 ++-----
 include/linux/tty.h      |  1 +
 security/selinux/hooks.c |  4 ++--
 7 files changed, 48 insertions(+), 42 deletions(-)

(limited to 'fs')

diff --git a/drivers/char/pty.c b/drivers/char/pty.c
index ad46eae1f9bb..2c64faa8efa4 100644
--- a/drivers/char/pty.c
+++ b/drivers/char/pty.c
@@ -676,7 +676,11 @@ static int ptmx_open(struct inode *inode, struct file *filp)
 
 	set_bit(TTY_PTY_LOCK, &tty->flags); /* LOCK THE SLAVE */
 	filp->private_data = tty;
-	file_move(filp, &tty->tty_files);
+
+	file_sb_list_del(filp); /* __dentry_open has put it on the sb list */
+	spin_lock(&tty_files_lock);
+	list_add(&filp->f_u.fu_list, &tty->tty_files);
+	spin_unlock(&tty_files_lock);
 
 	retval = devpts_pty_new(inode, tty->link);
 	if (retval)
diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index 0350c42375a2..cd5b829634ea 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -136,6 +136,9 @@ LIST_HEAD(tty_drivers);			/* linked list of tty drivers */
 DEFINE_MUTEX(tty_mutex);
 EXPORT_SYMBOL(tty_mutex);
 
+/* Spinlock to protect the tty->tty_files list */
+DEFINE_SPINLOCK(tty_files_lock);
+
 static ssize_t tty_read(struct file *, char __user *, size_t, loff_t *);
 static ssize_t tty_write(struct file *, const char __user *, size_t, loff_t *);
 ssize_t redirected_tty_write(struct file *, const char __user *,
@@ -235,11 +238,11 @@ static int check_tty_count(struct tty_struct *tty, const char *routine)
 	struct list_head *p;
 	int count = 0;
 
-	file_list_lock();
+	spin_lock(&tty_files_lock);
 	list_for_each(p, &tty->tty_files) {
 		count++;
 	}
-	file_list_unlock();
+	spin_unlock(&tty_files_lock);
 	if (tty->driver->type == TTY_DRIVER_TYPE_PTY &&
 	    tty->driver->subtype == PTY_TYPE_SLAVE &&
 	    tty->link && tty->link->count)
@@ -519,7 +522,7 @@ void __tty_hangup(struct tty_struct *tty)
 	   workqueue with the lock held */
 	check_tty_count(tty, "tty_hangup");
 
-	file_list_lock();
+	spin_lock(&tty_files_lock);
 	/* This breaks for file handles being sent over AF_UNIX sockets ? */
 	list_for_each_entry(filp, &tty->tty_files, f_u.fu_list) {
 		if (filp->f_op->write == redirected_tty_write)
@@ -530,7 +533,7 @@ void __tty_hangup(struct tty_struct *tty)
 		__tty_fasync(-1, filp, 0);	/* can't block */
 		filp->f_op = &hung_up_tty_fops;
 	}
-	file_list_unlock();
+	spin_unlock(&tty_files_lock);
 
 	tty_ldisc_hangup(tty);
 
@@ -1424,9 +1427,9 @@ static void release_one_tty(struct work_struct *work)
 	tty_driver_kref_put(driver);
 	module_put(driver->owner);
 
-	file_list_lock();
+	spin_lock(&tty_files_lock);
 	list_del_init(&tty->tty_files);
-	file_list_unlock();
+	spin_unlock(&tty_files_lock);
 
 	put_pid(tty->pgrp);
 	put_pid(tty->session);
@@ -1671,7 +1674,10 @@ int tty_release(struct inode *inode, struct file *filp)
 	 *  - do_tty_hangup no longer sees this file descriptor as
 	 *    something that needs to be handled for hangups.
 	 */
-	file_kill(filp);
+	spin_lock(&tty_files_lock);
+	BUG_ON(list_empty(&filp->f_u.fu_list));
+	list_del_init(&filp->f_u.fu_list);
+	spin_unlock(&tty_files_lock);
 	filp->private_data = NULL;
 
 	/*
@@ -1840,7 +1846,11 @@ got_driver:
 	}
 
 	filp->private_data = tty;
-	file_move(filp, &tty->tty_files);
+	BUG_ON(list_empty(&filp->f_u.fu_list));
+	file_sb_list_del(filp); /* __dentry_open has put it on the sb list */
+	spin_lock(&tty_files_lock);
+	list_add(&filp->f_u.fu_list, &tty->tty_files);
+	spin_unlock(&tty_files_lock);
 	check_tty_count(tty, "tty_open");
 	if (tty->driver->type == TTY_DRIVER_TYPE_PTY &&
 	    tty->driver->subtype == PTY_TYPE_MASTER)
diff --git a/fs/file_table.c b/fs/file_table.c
index edecd36fed9b..6f0e62ecfddd 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -32,8 +32,7 @@ struct files_stat_struct files_stat = {
 	.max_files = NR_FILE
 };
 
-/* public. Not pretty! */
-__cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock);
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock);
 
 /* SLAB cache for file structures */
 static struct kmem_cache *filp_cachep __read_mostly;
@@ -249,7 +248,7 @@ static void __fput(struct file *file)
 		cdev_put(inode->i_cdev);
 	fops_put(file->f_op);
 	put_pid(file->f_owner.pid);
-	file_kill(file);
+	file_sb_list_del(file);
 	if (file->f_mode & FMODE_WRITE)
 		drop_file_write_access(file);
 	file->f_path.dentry = NULL;
@@ -328,31 +327,29 @@ struct file *fget_light(unsigned int fd, int *fput_needed)
 	return file;
 }
 
-
 void put_filp(struct file *file)
 {
 	if (atomic_long_dec_and_test(&file->f_count)) {
 		security_file_free(file);
-		file_kill(file);
+		file_sb_list_del(file);
 		file_free(file);
 	}
 }
 
-void file_move(struct file *file, struct list_head *list)
+void file_sb_list_add(struct file *file, struct super_block *sb)
 {
-	if (!list)
-		return;
-	file_list_lock();
-	list_move(&file->f_u.fu_list, list);
-	file_list_unlock();
+	spin_lock(&files_lock);
+	BUG_ON(!list_empty(&file->f_u.fu_list));
+	list_add(&file->f_u.fu_list, &sb->s_files);
+	spin_unlock(&files_lock);
 }
 
-void file_kill(struct file *file)
+void file_sb_list_del(struct file *file)
 {
 	if (!list_empty(&file->f_u.fu_list)) {
-		file_list_lock();
+		spin_lock(&files_lock);
 		list_del_init(&file->f_u.fu_list);
-		file_list_unlock();
+		spin_unlock(&files_lock);
 	}
 }
 
@@ -361,7 +358,7 @@ int fs_may_remount_ro(struct super_block *sb)
 	struct file *file;
 
 	/* Check that no files are currently opened for writing. */
-	file_list_lock();
+	spin_lock(&files_lock);
 	list_for_each_entry(file, &sb->s_files, f_u.fu_list) {
 		struct inode *inode = file->f_path.dentry->d_inode;
 
@@ -373,10 +370,10 @@ int fs_may_remount_ro(struct super_block *sb)
 		if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE))
 			goto too_bad;
 	}
-	file_list_unlock();
+	spin_unlock(&files_lock);
 	return 1; /* Tis' cool bro. */
 too_bad:
-	file_list_unlock();
+	spin_unlock(&files_lock);
 	return 0;
 }
 
@@ -392,7 +389,7 @@ void mark_files_ro(struct super_block *sb)
 	struct file *f;
 
 retry:
-	file_list_lock();
+	spin_lock(&files_lock);
 	list_for_each_entry(f, &sb->s_files, f_u.fu_list) {
 		struct vfsmount *mnt;
 		if (!S_ISREG(f->f_path.dentry->d_inode->i_mode))
@@ -408,16 +405,13 @@ retry:
 			continue;
 		file_release_write(f);
 		mnt = mntget(f->f_path.mnt);
-		file_list_unlock();
-		/*
-		 * This can sleep, so we can't hold
-		 * the file_list_lock() spinlock.
-		 */
+		/* This can sleep, so we can't hold the spinlock. */
+		spin_unlock(&files_lock);
 		mnt_drop_write(mnt);
 		mntput(mnt);
 		goto retry;
 	}
-	file_list_unlock();
+	spin_unlock(&files_lock);
 }
 
 void __init files_init(unsigned long mempages)
diff --git a/fs/open.c b/fs/open.c
index 630715f9f73d..d74e1983e8dc 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -675,7 +675,7 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
 	f->f_path.mnt = mnt;
 	f->f_pos = 0;
 	f->f_op = fops_get(inode->i_fop);
-	file_move(f, &inode->i_sb->s_files);
+	file_sb_list_add(f, inode->i_sb);
 
 	error = security_dentry_open(f, cred);
 	if (error)
@@ -721,7 +721,7 @@ cleanup_all:
 			mnt_drop_write(mnt);
 		}
 	}
-	file_kill(f);
+	file_sb_list_del(f);
 	f->f_path.dentry = NULL;
 	f->f_path.mnt = NULL;
 cleanup_file:
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 29f7c975304c..5a9a9e5a3705 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -944,9 +944,6 @@ struct file {
 	unsigned long f_mnt_write_state;
 #endif
 };
-extern spinlock_t files_lock;
-#define file_list_lock() spin_lock(&files_lock);
-#define file_list_unlock() spin_unlock(&files_lock);
 
 #define get_file(x)	atomic_long_inc(&(x)->f_count)
 #define fput_atomic(x)	atomic_long_add_unless(&(x)->f_count, -1, 1)
@@ -2188,8 +2185,8 @@ static inline void insert_inode_hash(struct inode *inode) {
 	__insert_inode_hash(inode, inode->i_ino);
 }
 
-extern void file_move(struct file *f, struct list_head *list);
-extern void file_kill(struct file *f);
+extern void file_sb_list_add(struct file *f, struct super_block *sb);
+extern void file_sb_list_del(struct file *f);
 #ifdef CONFIG_BLOCK
 extern void submit_bio(int, struct bio *);
 extern int bdev_read_only(struct block_device *);
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 1437da3ddc62..f6b371a2514e 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -470,6 +470,7 @@ extern struct tty_struct *tty_pair_get_tty(struct tty_struct *tty);
 extern struct tty_struct *tty_pair_get_pty(struct tty_struct *tty);
 
 extern struct mutex tty_mutex;
+extern spinlock_t tty_files_lock;
 
 extern void tty_write_unlock(struct tty_struct *tty);
 extern int tty_write_lock(struct tty_struct *tty, int ndelay);
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 42043f96e54f..bd7da0f0ccf3 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2170,7 +2170,7 @@ static inline void flush_unauthorized_files(const struct cred *cred,
 
 	tty = get_current_tty();
 	if (tty) {
-		file_list_lock();
+		spin_lock(&tty_files_lock);
 		if (!list_empty(&tty->tty_files)) {
 			struct inode *inode;
 
@@ -2186,7 +2186,7 @@ static inline void flush_unauthorized_files(const struct cred *cred,
 				drop_tty = 1;
 			}
 		}
-		file_list_unlock();
+		spin_unlock(&tty_files_lock);
 		tty_kref_put(tty);
 	}
 	/* Reset controlling tty. */
-- 
cgit v1.2.3


From d996b62a8df1d935b01319bf8defb95b5709f7b8 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Wed, 18 Aug 2010 04:37:36 +1000
Subject: tty: fix fu_list abuse

tty: fix fu_list abuse

tty code abuses fu_list, which causes a bug in remount,ro handling.

If a tty device node is opened on a filesystem, then the last link to the inode
removed, the filesystem will be allowed to be remounted readonly. This is
because fs_may_remount_ro does not find the 0 link tty inode on the file sb
list (because the tty code incorrectly removed it to use for its own purpose).
This can result in a filesystem with errors after it is marked "clean".

Taking idea from Christoph's initial patch, allocate a tty private struct
at file->private_data and put our required list fields in there, linking
file and tty. This makes tty nodes behave the same way as other device nodes
and avoid meddling with the vfs, and avoids this bug.

The error handling is not trivial in the tty code, so for this bugfix, I take
the simple approach of using __GFP_NOFAIL and don't worry about memory errors.
This is not a problem because our allocator doesn't fail small allocs as a rule
anyway. So proper error handling is left as an exercise for tty hackers.

[ Arguably filesystem's device inode would ideally be divorced from the
driver's pseudo inode when it is opened, but in practice it's not clear whether
that will ever be worth implementing. ]

Cc: linux-kernel@vger.kernel.org
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Greg Kroah-Hartman <gregkh@suse.de>
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/char/pty.c       |  6 +---
 drivers/char/tty_io.c    | 84 +++++++++++++++++++++++++++++++-----------------
 fs/internal.h            |  2 ++
 include/linux/fs.h       |  2 --
 include/linux/tty.h      |  8 +++++
 security/selinux/hooks.c |  5 ++-
 6 files changed, 69 insertions(+), 38 deletions(-)

(limited to 'fs')

diff --git a/drivers/char/pty.c b/drivers/char/pty.c
index 2c64faa8efa4..c350d01716bd 100644
--- a/drivers/char/pty.c
+++ b/drivers/char/pty.c
@@ -675,12 +675,8 @@ static int ptmx_open(struct inode *inode, struct file *filp)
 	}
 
 	set_bit(TTY_PTY_LOCK, &tty->flags); /* LOCK THE SLAVE */
-	filp->private_data = tty;
 
-	file_sb_list_del(filp); /* __dentry_open has put it on the sb list */
-	spin_lock(&tty_files_lock);
-	list_add(&filp->f_u.fu_list, &tty->tty_files);
-	spin_unlock(&tty_files_lock);
+	tty_add_file(tty, filp);
 
 	retval = devpts_pty_new(inode, tty->link);
 	if (retval)
diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index cd5b829634ea..949067a0bd47 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -188,6 +188,41 @@ void free_tty_struct(struct tty_struct *tty)
 	kfree(tty);
 }
 
+static inline struct tty_struct *file_tty(struct file *file)
+{
+	return ((struct tty_file_private *)file->private_data)->tty;
+}
+
+/* Associate a new file with the tty structure */
+void tty_add_file(struct tty_struct *tty, struct file *file)
+{
+	struct tty_file_private *priv;
+
+	/* XXX: must implement proper error handling in callers */
+	priv = kmalloc(sizeof(*priv), GFP_KERNEL|__GFP_NOFAIL);
+
+	priv->tty = tty;
+	priv->file = file;
+	file->private_data = priv;
+
+	spin_lock(&tty_files_lock);
+	list_add(&priv->list, &tty->tty_files);
+	spin_unlock(&tty_files_lock);
+}
+
+/* Delete file from its tty */
+void tty_del_file(struct file *file)
+{
+	struct tty_file_private *priv = file->private_data;
+
+	spin_lock(&tty_files_lock);
+	list_del(&priv->list);
+	spin_unlock(&tty_files_lock);
+	file->private_data = NULL;
+	kfree(priv);
+}
+
+
 #define TTY_NUMBER(tty) ((tty)->index + (tty)->driver->name_base)
 
 /**
@@ -500,6 +535,7 @@ void __tty_hangup(struct tty_struct *tty)
 	struct file *cons_filp = NULL;
 	struct file *filp, *f = NULL;
 	struct task_struct *p;
+	struct tty_file_private *priv;
 	int    closecount = 0, n;
 	unsigned long flags;
 	int refs = 0;
@@ -509,7 +545,7 @@ void __tty_hangup(struct tty_struct *tty)
 
 
 	spin_lock(&redirect_lock);
-	if (redirect && redirect->private_data == tty) {
+	if (redirect && file_tty(redirect) == tty) {
 		f = redirect;
 		redirect = NULL;
 	}
@@ -524,7 +560,8 @@ void __tty_hangup(struct tty_struct *tty)
 
 	spin_lock(&tty_files_lock);
 	/* This breaks for file handles being sent over AF_UNIX sockets ? */
-	list_for_each_entry(filp, &tty->tty_files, f_u.fu_list) {
+	list_for_each_entry(priv, &tty->tty_files, list) {
+		filp = priv->file;
 		if (filp->f_op->write == redirected_tty_write)
 			cons_filp = filp;
 		if (filp->f_op->write != tty_write)
@@ -892,12 +929,10 @@ static ssize_t tty_read(struct file *file, char __user *buf, size_t count,
 			loff_t *ppos)
 {
 	int i;
-	struct tty_struct *tty;
-	struct inode *inode;
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct tty_struct *tty = file_tty(file);
 	struct tty_ldisc *ld;
 
-	tty = file->private_data;
-	inode = file->f_path.dentry->d_inode;
 	if (tty_paranoia_check(tty, inode, "tty_read"))
 		return -EIO;
 	if (!tty || (test_bit(TTY_IO_ERROR, &tty->flags)))
@@ -1068,12 +1103,11 @@ void tty_write_message(struct tty_struct *tty, char *msg)
 static ssize_t tty_write(struct file *file, const char __user *buf,
 						size_t count, loff_t *ppos)
 {
-	struct tty_struct *tty;
 	struct inode *inode = file->f_path.dentry->d_inode;
+	struct tty_struct *tty = file_tty(file);
+ 	struct tty_ldisc *ld;
 	ssize_t ret;
-	struct tty_ldisc *ld;
 
-	tty = file->private_data;
 	if (tty_paranoia_check(tty, inode, "tty_write"))
 		return -EIO;
 	if (!tty || !tty->ops->write ||
@@ -1510,13 +1544,13 @@ static void release_tty(struct tty_struct *tty, int idx)
 
 int tty_release(struct inode *inode, struct file *filp)
 {
-	struct tty_struct *tty, *o_tty;
+	struct tty_struct *tty = file_tty(filp);
+	struct tty_struct *o_tty;
 	int	pty_master, tty_closing, o_tty_closing, do_sleep;
 	int	devpts;
 	int	idx;
 	char	buf[64];
 
-	tty = filp->private_data;
 	if (tty_paranoia_check(tty, inode, "tty_release_dev"))
 		return 0;
 
@@ -1674,11 +1708,7 @@ int tty_release(struct inode *inode, struct file *filp)
 	 *  - do_tty_hangup no longer sees this file descriptor as
 	 *    something that needs to be handled for hangups.
 	 */
-	spin_lock(&tty_files_lock);
-	BUG_ON(list_empty(&filp->f_u.fu_list));
-	list_del_init(&filp->f_u.fu_list);
-	spin_unlock(&tty_files_lock);
-	filp->private_data = NULL;
+	tty_del_file(filp);
 
 	/*
 	 * Perform some housekeeping before deciding whether to return.
@@ -1845,12 +1875,8 @@ got_driver:
 		return PTR_ERR(tty);
 	}
 
-	filp->private_data = tty;
-	BUG_ON(list_empty(&filp->f_u.fu_list));
-	file_sb_list_del(filp); /* __dentry_open has put it on the sb list */
-	spin_lock(&tty_files_lock);
-	list_add(&filp->f_u.fu_list, &tty->tty_files);
-	spin_unlock(&tty_files_lock);
+	tty_add_file(tty, filp);
+
 	check_tty_count(tty, "tty_open");
 	if (tty->driver->type == TTY_DRIVER_TYPE_PTY &&
 	    tty->driver->subtype == PTY_TYPE_MASTER)
@@ -1926,11 +1952,10 @@ got_driver:
 
 static unsigned int tty_poll(struct file *filp, poll_table *wait)
 {
-	struct tty_struct *tty;
+	struct tty_struct *tty = file_tty(filp);
 	struct tty_ldisc *ld;
 	int ret = 0;
 
-	tty = filp->private_data;
 	if (tty_paranoia_check(tty, filp->f_path.dentry->d_inode, "tty_poll"))
 		return 0;
 
@@ -1943,11 +1968,10 @@ static unsigned int tty_poll(struct file *filp, poll_table *wait)
 
 static int __tty_fasync(int fd, struct file *filp, int on)
 {
-	struct tty_struct *tty;
+	struct tty_struct *tty = file_tty(filp);
 	unsigned long flags;
 	int retval = 0;
 
-	tty = filp->private_data;
 	if (tty_paranoia_check(tty, filp->f_path.dentry->d_inode, "tty_fasync"))
 		goto out;
 
@@ -2501,13 +2525,13 @@ EXPORT_SYMBOL(tty_pair_get_pty);
  */
 long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
-	struct tty_struct *tty, *real_tty;
+	struct tty_struct *tty = file_tty(file);
+	struct tty_struct *real_tty;
 	void __user *p = (void __user *)arg;
 	int retval;
 	struct tty_ldisc *ld;
 	struct inode *inode = file->f_dentry->d_inode;
 
-	tty = file->private_data;
 	if (tty_paranoia_check(tty, inode, "tty_ioctl"))
 		return -EINVAL;
 
@@ -2629,7 +2653,7 @@ static long tty_compat_ioctl(struct file *file, unsigned int cmd,
 				unsigned long arg)
 {
 	struct inode *inode = file->f_dentry->d_inode;
-	struct tty_struct *tty = file->private_data;
+	struct tty_struct *tty = file_tty(file);
 	struct tty_ldisc *ld;
 	int retval = -ENOIOCTLCMD;
 
@@ -2721,7 +2745,7 @@ void __do_SAK(struct tty_struct *tty)
 				if (!filp)
 					continue;
 				if (filp->f_op->read == tty_read &&
-				    filp->private_data == tty) {
+				    file_tty(filp) == tty) {
 					printk(KERN_NOTICE "SAK: killed process %d"
 					    " (%s): fd#%d opened to the tty\n",
 					    task_pid_nr(p), p->comm, i);
diff --git a/fs/internal.h b/fs/internal.h
index 6b706bc60a66..6a5c13a80660 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -80,6 +80,8 @@ extern void chroot_fs_refs(struct path *, struct path *);
 /*
  * file_table.c
  */
+extern void file_sb_list_add(struct file *f, struct super_block *sb);
+extern void file_sb_list_del(struct file *f);
 extern void mark_files_ro(struct super_block *);
 extern struct file *get_empty_filp(void);
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 5a9a9e5a3705..5e65add0f163 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2185,8 +2185,6 @@ static inline void insert_inode_hash(struct inode *inode) {
 	__insert_inode_hash(inode, inode->i_ino);
 }
 
-extern void file_sb_list_add(struct file *f, struct super_block *sb);
-extern void file_sb_list_del(struct file *f);
 #ifdef CONFIG_BLOCK
 extern void submit_bio(int, struct bio *);
 extern int bdev_read_only(struct block_device *);
diff --git a/include/linux/tty.h b/include/linux/tty.h
index f6b371a2514e..67d64e6efe7a 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -329,6 +329,13 @@ struct tty_struct {
 	struct tty_port *port;
 };
 
+/* Each of a tty's open files has private_data pointing to tty_file_private */
+struct tty_file_private {
+	struct tty_struct *tty;
+	struct file *file;
+	struct list_head list;
+};
+
 /* tty magic number */
 #define TTY_MAGIC		0x5401
 
@@ -458,6 +465,7 @@ extern void proc_clear_tty(struct task_struct *p);
 extern struct tty_struct *get_current_tty(void);
 extern void tty_default_fops(struct file_operations *fops);
 extern struct tty_struct *alloc_tty_struct(void);
+extern void tty_add_file(struct tty_struct *tty, struct file *file);
 extern void free_tty_struct(struct tty_struct *tty);
 extern void initialize_tty_struct(struct tty_struct *tty,
 		struct tty_driver *driver, int idx);
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index bd7da0f0ccf3..4796ddd4e721 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2172,6 +2172,7 @@ static inline void flush_unauthorized_files(const struct cred *cred,
 	if (tty) {
 		spin_lock(&tty_files_lock);
 		if (!list_empty(&tty->tty_files)) {
+			struct tty_file_private *file_priv;
 			struct inode *inode;
 
 			/* Revalidate access to controlling tty.
@@ -2179,7 +2180,9 @@ static inline void flush_unauthorized_files(const struct cred *cred,
 			   than using file_has_perm, as this particular open
 			   file may belong to another process and we are only
 			   interested in the inode-based check here. */
-			file = list_first_entry(&tty->tty_files, struct file, f_u.fu_list);
+			file_priv = list_first_entry(&tty->tty_files,
+						struct tty_file_private, list);
+			file = file_priv->file;
 			inode = file->f_path.dentry->d_inode;
 			if (inode_has_perm(cred, inode,
 					   FILE__READ | FILE__WRITE, NULL)) {
-- 
cgit v1.2.3


From 6416ccb7899960868f5016751fb81bf25213d24f Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Wed, 18 Aug 2010 04:37:38 +1000
Subject: fs: scale files_lock

fs: scale files_lock

Improve scalability of files_lock by adding per-cpu, per-sb files lists,
protected with an lglock. The lglock provides fast access to the per-cpu lists
to add and remove files. It also provides a snapshot of all the per-cpu lists
(although this is very slow).

One difficulty with this approach is that a file can be removed from the list
by another CPU. We must track which per-cpu list the file is on with a new
variale in the file struct (packed into a hole on 64-bit archs). Scalability
could suffer if files are frequently removed from different cpu's list.

However loads with frequent removal of files imply short interval between
adding and removing the files, and the scheduler attempts to avoid moving
processes too far away. Also, even in the case of cross-CPU removal, the
hardware has much more opportunity to parallelise cacheline transfers with N
cachelines than with 1.

A worst-case test of 1 CPU allocating files subsequently being freed by N CPUs
degenerates to contending on a single lock, which is no worse than before. When
more than one CPU are allocating files, even if they are always freed by
different CPUs, there will be more parallelism than the single-lock case.

Testing results:

On a 2 socket, 8 core opteron, I measure the number of times the lock is taken
to remove the file, the number of times it is removed by the same CPU that
added it, and the number of times it is removed by the same node that added it.

Booting:    locks=  25049 cpu-hits=  23174 (92.5%) node-hits=  23945 (95.6%)
kbuild -j16 locks=2281913 cpu-hits=2208126 (96.8%) node-hits=2252674 (98.7%)
dbench 64   locks=4306582 cpu-hits=4287247 (99.6%) node-hits=4299527 (99.8%)

So a file is removed from the same CPU it was added by over 90% of the time.
It remains within the same node 95% of the time.

Tim Chen ran some numbers for a 64 thread Nehalem system performing a compile.

                throughput
2.6.34-rc2      24.5
+patch          24.9

                us      sys     idle    IO wait (in %)
2.6.34-rc2      51.25   28.25   17.25   3.25
+patch          53.75   18.5    19      8.75

So significantly less CPU time spent in kernel code, higher idle time and
slightly higher throughput.

Single threaded performance difference was within the noise of microbenchmarks.
That is not to say penalty does not exist, the code is larger and more memory
accesses required so it will be slightly slower.

Cc: linux-kernel@vger.kernel.org
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/file_table.c    | 108 ++++++++++++++++++++++++++++++++++++++++++++---------
 fs/super.c         |  18 +++++++++
 include/linux/fs.h |   7 ++++
 3 files changed, 115 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/file_table.c b/fs/file_table.c
index 6f0e62ecfddd..a04bdd81c11c 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -20,7 +20,9 @@
 #include <linux/cdev.h>
 #include <linux/fsnotify.h>
 #include <linux/sysctl.h>
+#include <linux/lglock.h>
 #include <linux/percpu_counter.h>
+#include <linux/percpu.h>
 #include <linux/ima.h>
 
 #include <asm/atomic.h>
@@ -32,7 +34,8 @@ struct files_stat_struct files_stat = {
 	.max_files = NR_FILE
 };
 
-static __cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock);
+DECLARE_LGLOCK(files_lglock);
+DEFINE_LGLOCK(files_lglock);
 
 /* SLAB cache for file structures */
 static struct kmem_cache *filp_cachep __read_mostly;
@@ -336,30 +339,98 @@ void put_filp(struct file *file)
 	}
 }
 
+static inline int file_list_cpu(struct file *file)
+{
+#ifdef CONFIG_SMP
+	return file->f_sb_list_cpu;
+#else
+	return smp_processor_id();
+#endif
+}
+
+/* helper for file_sb_list_add to reduce ifdefs */
+static inline void __file_sb_list_add(struct file *file, struct super_block *sb)
+{
+	struct list_head *list;
+#ifdef CONFIG_SMP
+	int cpu;
+	cpu = smp_processor_id();
+	file->f_sb_list_cpu = cpu;
+	list = per_cpu_ptr(sb->s_files, cpu);
+#else
+	list = &sb->s_files;
+#endif
+	list_add(&file->f_u.fu_list, list);
+}
+
+/**
+ * file_sb_list_add - add a file to the sb's file list
+ * @file: file to add
+ * @sb: sb to add it to
+ *
+ * Use this function to associate a file with the superblock of the inode it
+ * refers to.
+ */
 void file_sb_list_add(struct file *file, struct super_block *sb)
 {
-	spin_lock(&files_lock);
-	BUG_ON(!list_empty(&file->f_u.fu_list));
-	list_add(&file->f_u.fu_list, &sb->s_files);
-	spin_unlock(&files_lock);
+	lg_local_lock(files_lglock);
+	__file_sb_list_add(file, sb);
+	lg_local_unlock(files_lglock);
 }
 
+/**
+ * file_sb_list_del - remove a file from the sb's file list
+ * @file: file to remove
+ * @sb: sb to remove it from
+ *
+ * Use this function to remove a file from its superblock.
+ */
 void file_sb_list_del(struct file *file)
 {
 	if (!list_empty(&file->f_u.fu_list)) {
-		spin_lock(&files_lock);
+		lg_local_lock_cpu(files_lglock, file_list_cpu(file));
 		list_del_init(&file->f_u.fu_list);
-		spin_unlock(&files_lock);
+		lg_local_unlock_cpu(files_lglock, file_list_cpu(file));
 	}
 }
 
+#ifdef CONFIG_SMP
+
+/*
+ * These macros iterate all files on all CPUs for a given superblock.
+ * files_lglock must be held globally.
+ */
+#define do_file_list_for_each_entry(__sb, __file)		\
+{								\
+	int i;							\
+	for_each_possible_cpu(i) {				\
+		struct list_head *list;				\
+		list = per_cpu_ptr((__sb)->s_files, i);		\
+		list_for_each_entry((__file), list, f_u.fu_list)
+
+#define while_file_list_for_each_entry				\
+	}							\
+}
+
+#else
+
+#define do_file_list_for_each_entry(__sb, __file)		\
+{								\
+	struct list_head *list;					\
+	list = &(sb)->s_files;					\
+	list_for_each_entry((__file), list, f_u.fu_list)
+
+#define while_file_list_for_each_entry				\
+}
+
+#endif
+
 int fs_may_remount_ro(struct super_block *sb)
 {
 	struct file *file;
-
 	/* Check that no files are currently opened for writing. */
-	spin_lock(&files_lock);
-	list_for_each_entry(file, &sb->s_files, f_u.fu_list) {
+	lg_global_lock(files_lglock);
+	do_file_list_for_each_entry(sb, file) {
 		struct inode *inode = file->f_path.dentry->d_inode;
 
 		/* File with pending delete? */
@@ -369,11 +440,11 @@ int fs_may_remount_ro(struct super_block *sb)
 		/* Writeable file? */
 		if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE))
 			goto too_bad;
-	}
-	spin_unlock(&files_lock);
+	} while_file_list_for_each_entry;
+	lg_global_unlock(files_lglock);
 	return 1; /* Tis' cool bro. */
 too_bad:
-	spin_unlock(&files_lock);
+	lg_global_unlock(files_lglock);
 	return 0;
 }
 
@@ -389,8 +460,8 @@ void mark_files_ro(struct super_block *sb)
 	struct file *f;
 
 retry:
-	spin_lock(&files_lock);
-	list_for_each_entry(f, &sb->s_files, f_u.fu_list) {
+	lg_global_lock(files_lglock);
+	do_file_list_for_each_entry(sb, f) {
 		struct vfsmount *mnt;
 		if (!S_ISREG(f->f_path.dentry->d_inode->i_mode))
 		       continue;
@@ -406,12 +477,12 @@ retry:
 		file_release_write(f);
 		mnt = mntget(f->f_path.mnt);
 		/* This can sleep, so we can't hold the spinlock. */
-		spin_unlock(&files_lock);
+		lg_global_unlock(files_lglock);
 		mnt_drop_write(mnt);
 		mntput(mnt);
 		goto retry;
-	}
-	spin_unlock(&files_lock);
+	} while_file_list_for_each_entry;
+	lg_global_unlock(files_lglock);
 }
 
 void __init files_init(unsigned long mempages)
@@ -431,5 +502,6 @@ void __init files_init(unsigned long mempages)
 	if (files_stat.max_files < NR_FILE)
 		files_stat.max_files = NR_FILE;
 	files_defer_init();
+	lg_lock_init(files_lglock);
 	percpu_counter_init(&nr_files, 0);
 } 
diff --git a/fs/super.c b/fs/super.c
index 9674ab2c8718..8819e3a7ff20 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -54,7 +54,22 @@ static struct super_block *alloc_super(struct file_system_type *type)
 			s = NULL;
 			goto out;
 		}
+#ifdef CONFIG_SMP
+		s->s_files = alloc_percpu(struct list_head);
+		if (!s->s_files) {
+			security_sb_free(s);
+			kfree(s);
+			s = NULL;
+			goto out;
+		} else {
+			int i;
+
+			for_each_possible_cpu(i)
+				INIT_LIST_HEAD(per_cpu_ptr(s->s_files, i));
+		}
+#else
 		INIT_LIST_HEAD(&s->s_files);
+#endif
 		INIT_LIST_HEAD(&s->s_instances);
 		INIT_HLIST_HEAD(&s->s_anon);
 		INIT_LIST_HEAD(&s->s_inodes);
@@ -108,6 +123,9 @@ out:
  */
 static inline void destroy_super(struct super_block *s)
 {
+#ifdef CONFIG_SMP
+	free_percpu(s->s_files);
+#endif
 	security_sb_free(s);
 	kfree(s->s_subtype);
 	kfree(s->s_options);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 5e65add0f163..76041b614758 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -920,6 +920,9 @@ struct file {
 #define f_vfsmnt	f_path.mnt
 	const struct file_operations	*f_op;
 	spinlock_t		f_lock;  /* f_ep_links, f_flags, no IRQ */
+#ifdef CONFIG_SMP
+	int			f_sb_list_cpu;
+#endif
 	atomic_long_t		f_count;
 	unsigned int 		f_flags;
 	fmode_t			f_mode;
@@ -1334,7 +1337,11 @@ struct super_block {
 
 	struct list_head	s_inodes;	/* all inodes */
 	struct hlist_head	s_anon;		/* anonymous dentries for (nfs) exporting */
+#ifdef CONFIG_SMP
+	struct list_head __percpu *s_files;
+#else
 	struct list_head	s_files;
+#endif
 	/* s_dentry_lru and s_nr_dentry_unused are protected by dcache_lock */
 	struct list_head	s_dentry_lru;	/* unused dentry lru */
 	int			s_nr_dentry_unused;	/* # of dentry on lru */
-- 
cgit v1.2.3


From 99b7db7b8ffd6bb755eb0a175596421a0b581cb2 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@kernel.dk>
Date: Wed, 18 Aug 2010 04:37:39 +1000
Subject: fs: brlock vfsmount_lock

fs: brlock vfsmount_lock

Use a brlock for the vfsmount lock. It must be taken for write whenever
modifying the mount hash or associated fields, and may be taken for read when
performing mount hash lookups.

A new lock is added for the mnt-id allocator, so it doesn't need to take
the heavy vfsmount write-lock.

The number of atomics should remain the same for fastpath rlock cases, though
code would be slightly slower due to per-cpu access. Scalability is not not be
much improved in common cases yet, due to other locks (ie. dcache_lock) getting
in the way. However path lookups crossing mountpoints should be one case where
scalability is improved (currently requiring the global lock).

The slowpath is slower due to use of brlock. On a 64 core, 64 socket, 32 node
Altix system (high latency to remote nodes), a simple umount microbenchmark
(mount --bind mnt mnt2 ; umount mnt2 loop 1000 times), before this patch it
took 6.8s, afterwards took 7.1s, about 5% slower.

Cc: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c    |  11 ++--
 fs/internal.h  |   5 +-
 fs/namei.c     |   7 ++-
 fs/namespace.c | 177 ++++++++++++++++++++++++++++++++++++---------------------
 fs/pnode.c     |  11 +++-
 5 files changed, 134 insertions(+), 77 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index d56a40b5a577..83293be48149 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1935,7 +1935,7 @@ static int prepend_path(const struct path *path, struct path *root,
 	bool slash = false;
 	int error = 0;
 
-	spin_lock(&vfsmount_lock);
+	br_read_lock(vfsmount_lock);
 	while (dentry != root->dentry || vfsmnt != root->mnt) {
 		struct dentry * parent;
 
@@ -1964,7 +1964,7 @@ out:
 	if (!error && !slash)
 		error = prepend(buffer, buflen, "/", 1);
 
-	spin_unlock(&vfsmount_lock);
+	br_read_unlock(vfsmount_lock);
 	return error;
 
 global_root:
@@ -2302,11 +2302,12 @@ int path_is_under(struct path *path1, struct path *path2)
 	struct vfsmount *mnt = path1->mnt;
 	struct dentry *dentry = path1->dentry;
 	int res;
-	spin_lock(&vfsmount_lock);
+
+	br_read_lock(vfsmount_lock);
 	if (mnt != path2->mnt) {
 		for (;;) {
 			if (mnt->mnt_parent == mnt) {
-				spin_unlock(&vfsmount_lock);
+				br_read_unlock(vfsmount_lock);
 				return 0;
 			}
 			if (mnt->mnt_parent == path2->mnt)
@@ -2316,7 +2317,7 @@ int path_is_under(struct path *path1, struct path *path2)
 		dentry = mnt->mnt_mountpoint;
 	}
 	res = is_subdir(dentry, path2->dentry);
-	spin_unlock(&vfsmount_lock);
+	br_read_unlock(vfsmount_lock);
 	return res;
 }
 EXPORT_SYMBOL(path_is_under);
diff --git a/fs/internal.h b/fs/internal.h
index 6a5c13a80660..a6910e91cee8 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -9,6 +9,8 @@
  * 2 of the License, or (at your option) any later version.
  */
 
+#include <linux/lglock.h>
+
 struct super_block;
 struct linux_binprm;
 struct path;
@@ -70,7 +72,8 @@ extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
 
 extern void __init mnt_init(void);
 
-extern spinlock_t vfsmount_lock;
+DECLARE_BRLOCK(vfsmount_lock);
+
 
 /*
  * fs_struct.c
diff --git a/fs/namei.c b/fs/namei.c
index 11de7c39ff76..24896e833565 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -595,15 +595,16 @@ int follow_up(struct path *path)
 {
 	struct vfsmount *parent;
 	struct dentry *mountpoint;
-	spin_lock(&vfsmount_lock);
+
+	br_read_lock(vfsmount_lock);
 	parent = path->mnt->mnt_parent;
 	if (parent == path->mnt) {
-		spin_unlock(&vfsmount_lock);
+		br_read_unlock(vfsmount_lock);
 		return 0;
 	}
 	mntget(parent);
 	mountpoint = dget(path->mnt->mnt_mountpoint);
-	spin_unlock(&vfsmount_lock);
+	br_read_unlock(vfsmount_lock);
 	dput(path->dentry);
 	path->dentry = mountpoint;
 	mntput(path->mnt);
diff --git a/fs/namespace.c b/fs/namespace.c
index 2e10cb19c5b0..de402eb6eafb 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -11,6 +11,8 @@
 #include <linux/syscalls.h>
 #include <linux/slab.h>
 #include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
 #include <linux/smp_lock.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
@@ -38,12 +40,10 @@
 #define HASH_SHIFT ilog2(PAGE_SIZE / sizeof(struct list_head))
 #define HASH_SIZE (1UL << HASH_SHIFT)
 
-/* spinlock for vfsmount related operations, inplace of dcache_lock */
-__cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock);
-
 static int event;
 static DEFINE_IDA(mnt_id_ida);
 static DEFINE_IDA(mnt_group_ida);
+static DEFINE_SPINLOCK(mnt_id_lock);
 static int mnt_id_start = 0;
 static int mnt_group_start = 1;
 
@@ -55,6 +55,16 @@ static struct rw_semaphore namespace_sem;
 struct kobject *fs_kobj;
 EXPORT_SYMBOL_GPL(fs_kobj);
 
+/*
+ * vfsmount lock may be taken for read to prevent changes to the
+ * vfsmount hash, ie. during mountpoint lookups or walking back
+ * up the tree.
+ *
+ * It should be taken for write in all cases where the vfsmount
+ * tree or hash is modified or when a vfsmount structure is modified.
+ */
+DEFINE_BRLOCK(vfsmount_lock);
+
 static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
 {
 	unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES);
@@ -65,18 +75,21 @@ static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
 
 #define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16)
 
-/* allocation is serialized by namespace_sem */
+/*
+ * allocation is serialized by namespace_sem, but we need the spinlock to
+ * serialize with freeing.
+ */
 static int mnt_alloc_id(struct vfsmount *mnt)
 {
 	int res;
 
 retry:
 	ida_pre_get(&mnt_id_ida, GFP_KERNEL);
-	spin_lock(&vfsmount_lock);
+	spin_lock(&mnt_id_lock);
 	res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id);
 	if (!res)
 		mnt_id_start = mnt->mnt_id + 1;
-	spin_unlock(&vfsmount_lock);
+	spin_unlock(&mnt_id_lock);
 	if (res == -EAGAIN)
 		goto retry;
 
@@ -86,11 +99,11 @@ retry:
 static void mnt_free_id(struct vfsmount *mnt)
 {
 	int id = mnt->mnt_id;
-	spin_lock(&vfsmount_lock);
+	spin_lock(&mnt_id_lock);
 	ida_remove(&mnt_id_ida, id);
 	if (mnt_id_start > id)
 		mnt_id_start = id;
-	spin_unlock(&vfsmount_lock);
+	spin_unlock(&mnt_id_lock);
 }
 
 /*
@@ -348,7 +361,7 @@ static int mnt_make_readonly(struct vfsmount *mnt)
 {
 	int ret = 0;
 
-	spin_lock(&vfsmount_lock);
+	br_write_lock(vfsmount_lock);
 	mnt->mnt_flags |= MNT_WRITE_HOLD;
 	/*
 	 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
@@ -382,15 +395,15 @@ static int mnt_make_readonly(struct vfsmount *mnt)
 	 */
 	smp_wmb();
 	mnt->mnt_flags &= ~MNT_WRITE_HOLD;
-	spin_unlock(&vfsmount_lock);
+	br_write_unlock(vfsmount_lock);
 	return ret;
 }
 
 static void __mnt_unmake_readonly(struct vfsmount *mnt)
 {
-	spin_lock(&vfsmount_lock);
+	br_write_lock(vfsmount_lock);
 	mnt->mnt_flags &= ~MNT_READONLY;
-	spin_unlock(&vfsmount_lock);
+	br_write_unlock(vfsmount_lock);
 }
 
 void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
@@ -414,6 +427,7 @@ void free_vfsmnt(struct vfsmount *mnt)
 /*
  * find the first or last mount at @dentry on vfsmount @mnt depending on
  * @dir. If @dir is set return the first mount else return the last mount.
+ * vfsmount_lock must be held for read or write.
  */
 struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
 			      int dir)
@@ -443,10 +457,11 @@ struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
 struct vfsmount *lookup_mnt(struct path *path)
 {
 	struct vfsmount *child_mnt;
-	spin_lock(&vfsmount_lock);
+
+	br_read_lock(vfsmount_lock);
 	if ((child_mnt = __lookup_mnt(path->mnt, path->dentry, 1)))
 		mntget(child_mnt);
-	spin_unlock(&vfsmount_lock);
+	br_read_unlock(vfsmount_lock);
 	return child_mnt;
 }
 
@@ -455,6 +470,9 @@ static inline int check_mnt(struct vfsmount *mnt)
 	return mnt->mnt_ns == current->nsproxy->mnt_ns;
 }
 
+/*
+ * vfsmount lock must be held for write
+ */
 static void touch_mnt_namespace(struct mnt_namespace *ns)
 {
 	if (ns) {
@@ -463,6 +481,9 @@ static void touch_mnt_namespace(struct mnt_namespace *ns)
 	}
 }
 
+/*
+ * vfsmount lock must be held for write
+ */
 static void __touch_mnt_namespace(struct mnt_namespace *ns)
 {
 	if (ns && ns->event != event) {
@@ -471,6 +492,9 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns)
 	}
 }
 
+/*
+ * vfsmount lock must be held for write
+ */
 static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
 {
 	old_path->dentry = mnt->mnt_mountpoint;
@@ -482,6 +506,9 @@ static void detach_mnt(struct vfsmount *mnt, struct path *old_path)
 	old_path->dentry->d_mounted--;
 }
 
+/*
+ * vfsmount lock must be held for write
+ */
 void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
 			struct vfsmount *child_mnt)
 {
@@ -490,6 +517,9 @@ void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry,
 	dentry->d_mounted++;
 }
 
+/*
+ * vfsmount lock must be held for write
+ */
 static void attach_mnt(struct vfsmount *mnt, struct path *path)
 {
 	mnt_set_mountpoint(path->mnt, path->dentry, mnt);
@@ -499,7 +529,7 @@ static void attach_mnt(struct vfsmount *mnt, struct path *path)
 }
 
 /*
- * the caller must hold vfsmount_lock
+ * vfsmount lock must be held for write
  */
 static void commit_tree(struct vfsmount *mnt)
 {
@@ -623,39 +653,43 @@ static inline void __mntput(struct vfsmount *mnt)
 void mntput_no_expire(struct vfsmount *mnt)
 {
 repeat:
-	if (atomic_dec_and_lock(&mnt->mnt_count, &vfsmount_lock)) {
-		if (likely(!mnt->mnt_pinned)) {
-			spin_unlock(&vfsmount_lock);
-			__mntput(mnt);
-			return;
-		}
-		atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count);
-		mnt->mnt_pinned = 0;
-		spin_unlock(&vfsmount_lock);
-		acct_auto_close_mnt(mnt);
-		goto repeat;
+	if (atomic_add_unless(&mnt->mnt_count, -1, 1))
+		return;
+	br_write_lock(vfsmount_lock);
+	if (!atomic_dec_and_test(&mnt->mnt_count)) {
+		br_write_unlock(vfsmount_lock);
+		return;
+	}
+	if (likely(!mnt->mnt_pinned)) {
+		br_write_unlock(vfsmount_lock);
+		__mntput(mnt);
+		return;
 	}
+	atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count);
+	mnt->mnt_pinned = 0;
+	br_write_unlock(vfsmount_lock);
+	acct_auto_close_mnt(mnt);
+	goto repeat;
 }
-
 EXPORT_SYMBOL(mntput_no_expire);
 
 void mnt_pin(struct vfsmount *mnt)
 {
-	spin_lock(&vfsmount_lock);
+	br_write_lock(vfsmount_lock);
 	mnt->mnt_pinned++;
-	spin_unlock(&vfsmount_lock);
+	br_write_unlock(vfsmount_lock);
 }
 
 EXPORT_SYMBOL(mnt_pin);
 
 void mnt_unpin(struct vfsmount *mnt)
 {
-	spin_lock(&vfsmount_lock);
+	br_write_lock(vfsmount_lock);
 	if (mnt->mnt_pinned) {
 		atomic_inc(&mnt->mnt_count);
 		mnt->mnt_pinned--;
 	}
-	spin_unlock(&vfsmount_lock);
+	br_write_unlock(vfsmount_lock);
 }
 
 EXPORT_SYMBOL(mnt_unpin);
@@ -746,12 +780,12 @@ int mnt_had_events(struct proc_mounts *p)
 	struct mnt_namespace *ns = p->ns;
 	int res = 0;
 
-	spin_lock(&vfsmount_lock);
+	br_read_lock(vfsmount_lock);
 	if (p->event != ns->event) {
 		p->event = ns->event;
 		res = 1;
 	}
-	spin_unlock(&vfsmount_lock);
+	br_read_unlock(vfsmount_lock);
 
 	return res;
 }
@@ -952,12 +986,12 @@ int may_umount_tree(struct vfsmount *mnt)
 	int minimum_refs = 0;
 	struct vfsmount *p;
 
-	spin_lock(&vfsmount_lock);
+	br_read_lock(vfsmount_lock);
 	for (p = mnt; p; p = next_mnt(p, mnt)) {
 		actual_refs += atomic_read(&p->mnt_count);
 		minimum_refs += 2;
 	}
-	spin_unlock(&vfsmount_lock);
+	br_read_unlock(vfsmount_lock);
 
 	if (actual_refs > minimum_refs)
 		return 0;
@@ -984,10 +1018,10 @@ int may_umount(struct vfsmount *mnt)
 {
 	int ret = 1;
 	down_read(&namespace_sem);
-	spin_lock(&vfsmount_lock);
+	br_read_lock(vfsmount_lock);
 	if (propagate_mount_busy(mnt, 2))
 		ret = 0;
-	spin_unlock(&vfsmount_lock);
+	br_read_unlock(vfsmount_lock);
 	up_read(&namespace_sem);
 	return ret;
 }
@@ -1003,13 +1037,14 @@ void release_mounts(struct list_head *head)
 		if (mnt->mnt_parent != mnt) {
 			struct dentry *dentry;
 			struct vfsmount *m;
-			spin_lock(&vfsmount_lock);
+
+			br_write_lock(vfsmount_lock);
 			dentry = mnt->mnt_mountpoint;
 			m = mnt->mnt_parent;
 			mnt->mnt_mountpoint = mnt->mnt_root;
 			mnt->mnt_parent = mnt;
 			m->mnt_ghosts--;
-			spin_unlock(&vfsmount_lock);
+			br_write_unlock(vfsmount_lock);
 			dput(dentry);
 			mntput(m);
 		}
@@ -1017,6 +1052,10 @@ void release_mounts(struct list_head *head)
 	}
 }
 
+/*
+ * vfsmount lock must be held for write
+ * namespace_sem must be held for write
+ */
 void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
 {
 	struct vfsmount *p;
@@ -1107,7 +1146,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
 	}
 
 	down_write(&namespace_sem);
-	spin_lock(&vfsmount_lock);
+	br_write_lock(vfsmount_lock);
 	event++;
 
 	if (!(flags & MNT_DETACH))
@@ -1119,7 +1158,7 @@ static int do_umount(struct vfsmount *mnt, int flags)
 			umount_tree(mnt, 1, &umount_list);
 		retval = 0;
 	}
-	spin_unlock(&vfsmount_lock);
+	br_write_unlock(vfsmount_lock);
 	up_write(&namespace_sem);
 	release_mounts(&umount_list);
 	return retval;
@@ -1231,19 +1270,19 @@ struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry,
 			q = clone_mnt(p, p->mnt_root, flag);
 			if (!q)
 				goto Enomem;
-			spin_lock(&vfsmount_lock);
+			br_write_lock(vfsmount_lock);
 			list_add_tail(&q->mnt_list, &res->mnt_list);
 			attach_mnt(q, &path);
-			spin_unlock(&vfsmount_lock);
+			br_write_unlock(vfsmount_lock);
 		}
 	}
 	return res;
 Enomem:
 	if (res) {
 		LIST_HEAD(umount_list);
-		spin_lock(&vfsmount_lock);
+		br_write_lock(vfsmount_lock);
 		umount_tree(res, 0, &umount_list);
-		spin_unlock(&vfsmount_lock);
+		br_write_unlock(vfsmount_lock);
 		release_mounts(&umount_list);
 	}
 	return NULL;
@@ -1262,9 +1301,9 @@ void drop_collected_mounts(struct vfsmount *mnt)
 {
 	LIST_HEAD(umount_list);
 	down_write(&namespace_sem);
-	spin_lock(&vfsmount_lock);
+	br_write_lock(vfsmount_lock);
 	umount_tree(mnt, 0, &umount_list);
-	spin_unlock(&vfsmount_lock);
+	br_write_unlock(vfsmount_lock);
 	up_write(&namespace_sem);
 	release_mounts(&umount_list);
 }
@@ -1392,7 +1431,7 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
 	if (err)
 		goto out_cleanup_ids;
 
-	spin_lock(&vfsmount_lock);
+	br_write_lock(vfsmount_lock);
 
 	if (IS_MNT_SHARED(dest_mnt)) {
 		for (p = source_mnt; p; p = next_mnt(p, source_mnt))
@@ -1411,7 +1450,8 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
 		list_del_init(&child->mnt_hash);
 		commit_tree(child);
 	}
-	spin_unlock(&vfsmount_lock);
+	br_write_unlock(vfsmount_lock);
+
 	return 0;
 
  out_cleanup_ids:
@@ -1466,10 +1506,10 @@ static int do_change_type(struct path *path, int flag)
 			goto out_unlock;
 	}
 
-	spin_lock(&vfsmount_lock);
+	br_write_lock(vfsmount_lock);
 	for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
 		change_mnt_propagation(m, type);
-	spin_unlock(&vfsmount_lock);
+	br_write_unlock(vfsmount_lock);
 
  out_unlock:
 	up_write(&namespace_sem);
@@ -1513,9 +1553,10 @@ static int do_loopback(struct path *path, char *old_name,
 	err = graft_tree(mnt, path);
 	if (err) {
 		LIST_HEAD(umount_list);
-		spin_lock(&vfsmount_lock);
+
+		br_write_lock(vfsmount_lock);
 		umount_tree(mnt, 0, &umount_list);
-		spin_unlock(&vfsmount_lock);
+		br_write_unlock(vfsmount_lock);
 		release_mounts(&umount_list);
 	}
 
@@ -1568,16 +1609,16 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
 	else
 		err = do_remount_sb(sb, flags, data, 0);
 	if (!err) {
-		spin_lock(&vfsmount_lock);
+		br_write_lock(vfsmount_lock);
 		mnt_flags |= path->mnt->mnt_flags & MNT_PROPAGATION_MASK;
 		path->mnt->mnt_flags = mnt_flags;
-		spin_unlock(&vfsmount_lock);
+		br_write_unlock(vfsmount_lock);
 	}
 	up_write(&sb->s_umount);
 	if (!err) {
-		spin_lock(&vfsmount_lock);
+		br_write_lock(vfsmount_lock);
 		touch_mnt_namespace(path->mnt->mnt_ns);
-		spin_unlock(&vfsmount_lock);
+		br_write_unlock(vfsmount_lock);
 	}
 	return err;
 }
@@ -1754,7 +1795,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
 		return;
 
 	down_write(&namespace_sem);
-	spin_lock(&vfsmount_lock);
+	br_write_lock(vfsmount_lock);
 
 	/* extract from the expiration list every vfsmount that matches the
 	 * following criteria:
@@ -1773,7 +1814,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
 		touch_mnt_namespace(mnt->mnt_ns);
 		umount_tree(mnt, 1, &umounts);
 	}
-	spin_unlock(&vfsmount_lock);
+	br_write_unlock(vfsmount_lock);
 	up_write(&namespace_sem);
 
 	release_mounts(&umounts);
@@ -1830,6 +1871,8 @@ resume:
 /*
  * process a list of expirable mountpoints with the intent of discarding any
  * submounts of a specific parent mountpoint
+ *
+ * vfsmount_lock must be held for write
  */
 static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts)
 {
@@ -2048,9 +2091,9 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
 		kfree(new_ns);
 		return ERR_PTR(-ENOMEM);
 	}
-	spin_lock(&vfsmount_lock);
+	br_write_lock(vfsmount_lock);
 	list_add_tail(&new_ns->list, &new_ns->root->mnt_list);
-	spin_unlock(&vfsmount_lock);
+	br_write_unlock(vfsmount_lock);
 
 	/*
 	 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
@@ -2244,7 +2287,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 		goto out2; /* not attached */
 	/* make sure we can reach put_old from new_root */
 	tmp = old.mnt;
-	spin_lock(&vfsmount_lock);
+	br_write_lock(vfsmount_lock);
 	if (tmp != new.mnt) {
 		for (;;) {
 			if (tmp->mnt_parent == tmp)
@@ -2264,7 +2307,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 	/* mount new_root on / */
 	attach_mnt(new.mnt, &root_parent);
 	touch_mnt_namespace(current->nsproxy->mnt_ns);
-	spin_unlock(&vfsmount_lock);
+	br_write_unlock(vfsmount_lock);
 	chroot_fs_refs(&root, &new);
 	error = 0;
 	path_put(&root_parent);
@@ -2279,7 +2322,7 @@ out1:
 out0:
 	return error;
 out3:
-	spin_unlock(&vfsmount_lock);
+	br_write_unlock(vfsmount_lock);
 	goto out2;
 }
 
@@ -2326,6 +2369,8 @@ void __init mnt_init(void)
 	for (u = 0; u < HASH_SIZE; u++)
 		INIT_LIST_HEAD(&mount_hashtable[u]);
 
+	br_lock_init(vfsmount_lock);
+
 	err = sysfs_init();
 	if (err)
 		printk(KERN_WARNING "%s: sysfs_init error: %d\n",
@@ -2344,9 +2389,9 @@ void put_mnt_ns(struct mnt_namespace *ns)
 	if (!atomic_dec_and_test(&ns->count))
 		return;
 	down_write(&namespace_sem);
-	spin_lock(&vfsmount_lock);
+	br_write_lock(vfsmount_lock);
 	umount_tree(ns->root, 0, &umount_list);
-	spin_unlock(&vfsmount_lock);
+	br_write_unlock(vfsmount_lock);
 	up_write(&namespace_sem);
 	release_mounts(&umount_list);
 	kfree(ns);
diff --git a/fs/pnode.c b/fs/pnode.c
index 5cc564a83149..8066b8dd748f 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -126,6 +126,9 @@ static int do_make_slave(struct vfsmount *mnt)
 	return 0;
 }
 
+/*
+ * vfsmount lock must be held for write
+ */
 void change_mnt_propagation(struct vfsmount *mnt, int type)
 {
 	if (type == MS_SHARED) {
@@ -270,12 +273,12 @@ int propagate_mnt(struct vfsmount *dest_mnt, struct dentry *dest_dentry,
 		prev_src_mnt  = child;
 	}
 out:
-	spin_lock(&vfsmount_lock);
+	br_write_lock(vfsmount_lock);
 	while (!list_empty(&tmp_list)) {
 		child = list_first_entry(&tmp_list, struct vfsmount, mnt_hash);
 		umount_tree(child, 0, &umount_list);
 	}
-	spin_unlock(&vfsmount_lock);
+	br_write_unlock(vfsmount_lock);
 	release_mounts(&umount_list);
 	return ret;
 }
@@ -296,6 +299,8 @@ static inline int do_refcount_check(struct vfsmount *mnt, int count)
  * other mounts its parent propagates to.
  * Check if any of these mounts that **do not have submounts**
  * have more references than 'refcnt'. If so return busy.
+ *
+ * vfsmount lock must be held for read or write
  */
 int propagate_mount_busy(struct vfsmount *mnt, int refcnt)
 {
@@ -353,6 +358,8 @@ static void __propagate_umount(struct vfsmount *mnt)
  * collect all mounts that receive propagation from the mount in @list,
  * and return these additional mounts in the same list.
  * @list: the list of mounts to be unmounted.
+ *
+ * vfsmount lock must be held for write
  */
 int propagate_umount(struct list_head *list)
 {
-- 
cgit v1.2.3


From 0a377cff9428af2da2b293d11e07bc4dbf064ee5 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <Trond.Myklebust@netapp.com>
Date: Wed, 18 Aug 2010 09:25:42 -0400
Subject: NFS: Fix an Oops in the NFSv4 atomic open code

Adam Lackorzynski reports:

with 2.6.35.2 I'm getting this reproducible Oops:

[  110.825396] BUG: unable to handle kernel NULL pointer dereference at
(null)
[  110.828638] IP: [<ffffffff811247b7>] encode_attrs+0x1a/0x2a4
[  110.828638] PGD be89f067 PUD bf18f067 PMD 0
[  110.828638] Oops: 0000 [#1] SMP
[  110.828638] last sysfs file: /sys/class/net/lo/operstate
[  110.828638] CPU 2
[  110.828638] Modules linked in: rtc_cmos rtc_core rtc_lib amd64_edac_mod
i2c_amd756 edac_core i2c_core dm_mirror dm_region_hash dm_log dm_snapshot
sg sr_mod usb_storage ohci_hcd mptspi tg3 mptscsih mptbase usbcore nls_base
[last unloaded: scsi_wait_scan]
[  110.828638]
[  110.828638] Pid: 11264, comm: setchecksum Not tainted 2.6.35.2 #1
[  110.828638] RIP: 0010:[<ffffffff811247b7>]  [<ffffffff811247b7>]
encode_attrs+0x1a/0x2a4
[  110.828638] RSP: 0000:ffff88003bf5b878  EFLAGS: 00010296
[  110.828638] RAX: ffff8800bddb48a8 RBX: ffff88003bf5bb18 RCX:
0000000000000000
[  110.828638] RDX: ffff8800be258800 RSI: 0000000000000000 RDI:
ffff88003bf5b9f8
[  110.828638] RBP: 0000000000000000 R08: ffff8800bddb48a8 R09:
0000000000000004
[  110.828638] R10: 0000000000000003 R11: ffff8800be779000 R12:
ffff8800be258800
[  110.828638] R13: ffff88003bf5b9f8 R14: ffff88003bf5bb20 R15:
ffff8800be258800
[  110.828638] FS:  0000000000000000(0000) GS:ffff880041e00000(0063)
knlGS:00000000556bd6b0
[  110.828638] CS:  0010 DS: 002b ES: 002b CR0: 000000008005003b
[  110.828638] CR2: 0000000000000000 CR3: 00000000be8ef000 CR4:
00000000000006e0
[  110.828638] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
0000000000000000
[  110.828638] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7:
0000000000000400
[  110.828638] Process setchecksum (pid: 11264, threadinfo
ffff88003bf5a000, task ffff88003f232210)
[  110.828638] Stack:
[  110.828638]  0000000000000000 ffff8800bfbcf920 0000000000000000
0000000000000ffe
[  110.828638] <0> 0000000000000000 0000000000000000 0000000000000000
0000000000000000
[  110.828638] <0> 0000000000000000 0000000000000000 0000000000000000
0000000000000000
[  110.828638] Call Trace:
[  110.828638]  [<ffffffff81124c1f>] ? nfs4_xdr_enc_setattr+0x90/0xb4
[  110.828638]  [<ffffffff81371161>] ? call_transmit+0x1c3/0x24a
[  110.828638]  [<ffffffff813774d9>] ? __rpc_execute+0x78/0x22a
[  110.828638]  [<ffffffff81371a91>] ? rpc_run_task+0x21/0x2b
[  110.828638]  [<ffffffff81371b7e>] ? rpc_call_sync+0x3d/0x5d
[  110.828638]  [<ffffffff8111e284>] ? _nfs4_do_setattr+0x11b/0x147
[  110.828638]  [<ffffffff81109466>] ? nfs_init_locked+0x0/0x32
[  110.828638]  [<ffffffff810ac521>] ? ifind+0x4e/0x90
[  110.828638]  [<ffffffff8111e2fb>] ? nfs4_do_setattr+0x4b/0x6e
[  110.828638]  [<ffffffff8111e634>] ? nfs4_do_open+0x291/0x3a6
[  110.828638]  [<ffffffff8111ed81>] ? nfs4_open_revalidate+0x63/0x14a
[  110.828638]  [<ffffffff811056c4>] ? nfs_open_revalidate+0xd7/0x161
[  110.828638]  [<ffffffff810a2de4>] ? do_lookup+0x1a4/0x201
[  110.828638]  [<ffffffff810a4733>] ? link_path_walk+0x6a/0x9d5
[  110.828638]  [<ffffffff810a42b6>] ? do_last+0x17b/0x58e
[  110.828638]  [<ffffffff810a5fbe>] ? do_filp_open+0x1bd/0x56e
[  110.828638]  [<ffffffff811cd5e0>] ? _atomic_dec_and_lock+0x30/0x48
[  110.828638]  [<ffffffff810a9b1b>] ? dput+0x37/0x152
[  110.828638]  [<ffffffff810ae063>] ? alloc_fd+0x69/0x10a
[  110.828638]  [<ffffffff81099f39>] ? do_sys_open+0x56/0x100
[  110.828638]  [<ffffffff81027a22>] ? ia32_sysret+0x0/0x5
[  110.828638] Code: 83 f1 01 e8 f5 ca ff ff 48 83 c4 50 5b 5d 41 5c c3 41
57 41 56 41 55 49 89 fd 41 54 49 89 d4 55 48 89 f5 53 48 81 ec 18 01 00 00
<8b> 06 89 c2 83 e2 08 83 fa 01 19 db 83 e3 f8 83 c3 18 a8 01 8d
[  110.828638] RIP  [<ffffffff811247b7>] encode_attrs+0x1a/0x2a4
[  110.828638]  RSP <ffff88003bf5b878>
[  110.828638] CR2: 0000000000000000
[  112.840396] ---[ end trace 95282e83fd77358f ]---

We need to ensure that the O_EXCL flag is turned off if the user doesn't
set O_CREAT.

Cc: stable@kernel.org
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
---
 fs/nfs/dir.c      | 2 +-
 fs/nfs/nfs4proc.c | 8 +++++---
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index bd91b2778315..e257172d438c 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1110,7 +1110,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
 	if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL))
 		goto no_open_dput;
 	/* We can't create new files, or truncate existing ones here */
-	openflags &= ~(O_CREAT|O_TRUNC);
+	openflags &= ~(O_CREAT|O_EXCL|O_TRUNC);
 
 	/*
 	 * Note: we're not holding inode->i_mutex and so may be racing with
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 6b44bbfb7d8a..089da5b5d20a 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2036,7 +2036,8 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 	struct rpc_cred *cred;
 	struct nfs4_state *state;
 	struct dentry *res;
-	fmode_t fmode = nd->intent.open.flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
+	int open_flags = nd->intent.open.flags;
+	fmode_t fmode = open_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
 
 	if (nd->flags & LOOKUP_CREATE) {
 		attr.ia_mode = nd->intent.open.create_mode;
@@ -2044,8 +2045,9 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 		if (!IS_POSIXACL(dir))
 			attr.ia_mode &= ~current_umask();
 	} else {
+		open_flags &= ~O_EXCL;
 		attr.ia_valid = 0;
-		BUG_ON(nd->intent.open.flags & O_CREAT);
+		BUG_ON(open_flags & O_CREAT);
 	}
 
 	cred = rpc_lookup_cred();
@@ -2054,7 +2056,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 	parent = dentry->d_parent;
 	/* Protect against concurrent sillydeletes */
 	nfs_block_sillyrename(parent);
-	state = nfs4_do_open(dir, &path, fmode, nd->intent.open.flags, &attr, cred);
+	state = nfs4_do_open(dir, &path, fmode, open_flags, &attr, cred);
 	put_rpccred(cred);
 	if (IS_ERR(state)) {
 		if (PTR_ERR(state) == -ENOENT) {
-- 
cgit v1.2.3


From 1cb0c924fa2d616e5e3b5bc62d97191aac9ff442 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Wed, 18 Aug 2010 21:11:11 +0900
Subject: nilfs2: wait for discard to finish

nilfs_discard_segment() doesn't wait for completion of discard
requests.  This specifies BLKDEV_IFL_WAIT flag when calling
blkdev_issue_discard() in order to fix the sync failure.

Reported-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Cc: Christoph Hellwig <hch@lst.de>
---
 fs/nilfs2/the_nilfs.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 6af1c0073e9e..4317f177ea7c 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -775,6 +775,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
 						   start * sects_per_block,
 						   nblocks * sects_per_block,
 						   GFP_NOFS,
+						   BLKDEV_IFL_WAIT |
 						   BLKDEV_IFL_BARRIER);
 			if (ret < 0)
 				return ret;
@@ -785,7 +786,8 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump,
 		ret = blkdev_issue_discard(nilfs->ns_bdev,
 					   start * sects_per_block,
 					   nblocks * sects_per_block,
-					   GFP_NOFS, BLKDEV_IFL_BARRIER);
+					   GFP_NOFS,
+					  BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
 	return ret;
 }
 
-- 
cgit v1.2.3


From fc87a40677bbe0937e2ff0642c7e83c9a4813f3d Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 18 Aug 2010 13:13:39 -0400
Subject: cifs: fix NULL pointer dereference in cifs_find_smb_ses

cifs_find_smb_ses assumes that the vol->password field is a valid
pointer, but that's only the case if a password was passed in via
the options string. It's possible that one won't be if there is
no mount helper on the box.

Reported-by: diabel <gacek-2004@wp.pl>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 95c2ea67edfb..446e2486d5f0 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1673,7 +1673,8 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol)
 				    MAX_USERNAME_SIZE))
 				continue;
 			if (strlen(vol->username) != 0 &&
-			    strncmp(ses->password, vol->password,
+			    strncmp(ses->password,
+				    vol->password ? vol->password : "",
 				    MAX_PASSWORD_SIZE))
 				continue;
 		}
-- 
cgit v1.2.3


From bf4f12113812ac5be76c5590c6f50c8346f784a4 Mon Sep 17 00:00:00 2001
From: Igor Druzhinin <jaxbrigs@gmail.com>
Date: Fri, 20 Aug 2010 00:27:12 +0400
Subject: cifs: correction of unicode header files

This patch corrects a problem of compilation errors at removal of
UNIUPR_NOLOWER definition and adds include guards to cifs_unicode.h.

Signed-off-by: Igor Druzhinin <jaxbrigs@gmail.com>
Acked-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_unicode.h | 18 +++++++++++-------
 fs/cifs/cifs_uniupr.h  | 16 ++++++++--------
 2 files changed, 19 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
index 650638275a6f..7fe6b52df507 100644
--- a/fs/cifs/cifs_unicode.h
+++ b/fs/cifs/cifs_unicode.h
@@ -30,6 +30,8 @@
  *     This is a compressed table of upper and lower case conversion.
  *
  */
+#ifndef _CIFS_UNICODE_H
+#define _CIFS_UNICODE_H
 
 #include <asm/byteorder.h>
 #include <linux/types.h>
@@ -67,8 +69,8 @@ extern const struct UniCaseRange CifsUniUpperRange[];
 #endif				/* UNIUPR_NOUPPER */
 
 #ifndef UNIUPR_NOLOWER
-extern signed char UniLowerTable[512];
-extern struct UniCaseRange UniLowerRange[];
+extern signed char CifsUniLowerTable[512];
+extern const struct UniCaseRange CifsUniLowerRange[];
 #endif				/* UNIUPR_NOLOWER */
 
 #ifdef __KERNEL__
@@ -337,15 +339,15 @@ UniStrupr(register wchar_t *upin)
  * UniTolower:  Convert a unicode character to lower case
  */
 static inline wchar_t
-UniTolower(wchar_t uc)
+UniTolower(register wchar_t uc)
 {
-	register struct UniCaseRange *rp;
+	register const struct UniCaseRange *rp;
 
-	if (uc < sizeof(UniLowerTable)) {
+	if (uc < sizeof(CifsUniLowerTable)) {
 		/* Latin characters */
-		return uc + UniLowerTable[uc];	/* Use base tables */
+		return uc + CifsUniLowerTable[uc];	/* Use base tables */
 	} else {
-		rp = UniLowerRange;	/* Use range tables */
+		rp = CifsUniLowerRange;	/* Use range tables */
 		while (rp->start) {
 			if (uc < rp->start)	/* Before start of range */
 				return uc;	/* Uppercase = input */
@@ -374,3 +376,5 @@ UniStrlwr(register wchar_t *upin)
 }
 
 #endif
+
+#endif /* _CIFS_UNICODE_H */
diff --git a/fs/cifs/cifs_uniupr.h b/fs/cifs/cifs_uniupr.h
index 18a9d978e519..0ac7c5a8633a 100644
--- a/fs/cifs/cifs_uniupr.h
+++ b/fs/cifs/cifs_uniupr.h
@@ -140,7 +140,7 @@ const struct UniCaseRange CifsUniUpperRange[] = {
 /*
  * Latin lower case
  */
-static signed char CifsUniLowerTable[512] = {
+signed char CifsUniLowerTable[512] = {
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 000-00f */
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 010-01f */
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 020-02f */
@@ -242,12 +242,12 @@ static signed char UniCaseRangeLff20[27] = {
 /*
  * Lower Case Range
  */
-static const struct UniCaseRange CifsUniLowerRange[] = {
-	0x0380, 0x03ab, UniCaseRangeL0380,
-	0x0400, 0x042f, UniCaseRangeL0400,
-	0x0490, 0x04cb, UniCaseRangeL0490,
-	0x1e00, 0x1ff7, UniCaseRangeL1e00,
-	0xff20, 0xff3a, UniCaseRangeLff20,
-	0, 0, 0
+const struct UniCaseRange CifsUniLowerRange[] = {
+	{0x0380, 0x03ab, UniCaseRangeL0380},
+	{0x0400, 0x042f, UniCaseRangeL0400},
+	{0x0490, 0x04cb, UniCaseRangeL0490},
+	{0x1e00, 0x1ff7, UniCaseRangeL1e00},
+	{0xff20, 0xff3a, UniCaseRangeLff20},
+	{0}
 };
 #endif
-- 
cgit v1.2.3


From 9fbc590860e75785bdaf8b83e48fabfe4d4f7d58 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Fri, 20 Aug 2010 20:42:26 +0000
Subject: [CIFS] Fix ntlmv2 auth with ntlmssp

Make ntlmv2 as an authentication mechanism within ntlmssp
instead of ntlmv1.
Parse type 2 response in ntlmssp negotiation to pluck
AV pairs and use them to calculate ntlmv2 response token.
Also, assign domain name from the sever response in type 2
packet of ntlmssp and use that (netbios) domain name in
calculation of response.

Enable cifs/smb signing using rc4 and md5.

Changed name of the structure mac_key to session_key to reflect
the type of key it holds.

Use kernel crypto_shash_* APIs instead of the equivalent cifs functions.

Signed-off-by: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/Kconfig       |   2 +
 fs/cifs/asn1.c        |   6 +-
 fs/cifs/cifsencrypt.c | 416 +++++++++++++++++++++++++++++++++++---------------
 fs/cifs/cifsglob.h    |  18 ++-
 fs/cifs/cifspdu.h     |   7 +-
 fs/cifs/cifsproto.h   |  12 +-
 fs/cifs/cifssmb.c     |  13 +-
 fs/cifs/connect.c     |  13 +-
 fs/cifs/ntlmssp.h     |  13 ++
 fs/cifs/sess.c        | 118 ++++++++++----
 fs/cifs/transport.c   |   6 +-
 11 files changed, 452 insertions(+), 172 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 917b7d449bb2..0da1debd499d 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -2,6 +2,8 @@ config CIFS
 	tristate "CIFS support (advanced network filesystem, SMBFS successor)"
 	depends on INET
 	select NLS
+	select CRYPTO_MD5
+	select CRYPTO_ARC4
 	help
 	  This is the client VFS module for the Common Internet File System
 	  (CIFS) protocol which is the successor to the Server Message Block
diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c
index cfd1ce34e0bc..21f0fbd86989 100644
--- a/fs/cifs/asn1.c
+++ b/fs/cifs/asn1.c
@@ -597,13 +597,13 @@ decode_negTokenInit(unsigned char *security_blob, int length,
 				if (compare_oid(oid, oidlen, MSKRB5_OID,
 						MSKRB5_OID_LEN))
 					server->sec_mskerberos = true;
-				else if (compare_oid(oid, oidlen, KRB5U2U_OID,
+				if (compare_oid(oid, oidlen, KRB5U2U_OID,
 						     KRB5U2U_OID_LEN))
 					server->sec_kerberosu2u = true;
-				else if (compare_oid(oid, oidlen, KRB5_OID,
+				if (compare_oid(oid, oidlen, KRB5_OID,
 						     KRB5_OID_LEN))
 					server->sec_kerberos = true;
-				else if (compare_oid(oid, oidlen, NTLMSSP_OID,
+				if (compare_oid(oid, oidlen, NTLMSSP_OID,
 						     NTLMSSP_OID_LEN))
 					server->sec_ntlmssp = true;
 
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 847628dfdc44..051d00011ca3 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -27,6 +27,7 @@
 #include "md5.h"
 #include "cifs_unicode.h"
 #include "cifsproto.h"
+#include "ntlmssp.h"
 #include <linux/ctype.h>
 #include <linux/random.h>
 
@@ -42,21 +43,44 @@ extern void SMBencrypt(unsigned char *passwd, const unsigned char *c8,
 		       unsigned char *p24);
 
 static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu,
-				    const struct mac_key *key, char *signature)
+			struct TCP_Server_Info *server, char *signature)
 {
-	struct	MD5Context context;
+	int rc = 0;
+	struct {
+		struct shash_desc shash;
+		char ctx[crypto_shash_descsize(server->ntlmssp.md5)];
+	} sdesc;
 
-	if ((cifs_pdu == NULL) || (signature == NULL) || (key == NULL))
+	if (cifs_pdu == NULL || server == NULL || signature == NULL)
 		return -EINVAL;
 
-	cifs_MD5_init(&context);
-	cifs_MD5_update(&context, (char *)&key->data, key->len);
-	cifs_MD5_update(&context, cifs_pdu->Protocol, cifs_pdu->smb_buf_length);
+	sdesc.shash.tfm = server->ntlmssp.md5;
+	sdesc.shash.flags = 0x0;
+
+	rc = crypto_shash_init(&sdesc.shash);
+	if (rc) {
+		cERROR(1, "could not initialize master crypto API hmacmd5\n");
+		return rc;
+	}
+
+	if (server->secType == RawNTLMSSP)
+		crypto_shash_update(&sdesc.shash,
+			server->session_key.data.ntlmv2.key,
+			CIFS_NTLMV2_SESSKEY_SIZE);
+	else
+		crypto_shash_update(&sdesc.shash,
+			(char *)&server->session_key.data,
+			server->session_key.len);
+
+	crypto_shash_update(&sdesc.shash,
+			cifs_pdu->Protocol, cifs_pdu->smb_buf_length);
+
+	rc = crypto_shash_final(&sdesc.shash, signature);
 
-	cifs_MD5_final(signature, &context);
 	return 0;
 }
 
+
 int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
 		  __u32 *pexpected_response_sequence_number)
 {
@@ -78,8 +102,7 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
 	server->sequence_number++;
 	spin_unlock(&GlobalMid_Lock);
 
-	rc = cifs_calculate_signature(cifs_pdu, &server->mac_signing_key,
-				      smb_signature);
+	rc = cifs_calculate_signature(cifs_pdu, server, smb_signature);
 	if (rc)
 		memset(cifs_pdu->Signature.SecuritySignature, 0, 8);
 	else
@@ -89,16 +112,36 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server,
 }
 
 static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
-				const struct mac_key *key, char *signature)
+			struct TCP_Server_Info *server, char *signature)
 {
-	struct  MD5Context context;
 	int i;
+	int rc = 0;
+	struct {
+		struct shash_desc shash;
+		char ctx[crypto_shash_descsize(server->ntlmssp.md5)];
+	} sdesc;
 
-	if ((iov == NULL) || (signature == NULL) || (key == NULL))
+	if (iov == NULL || server == NULL || signature == NULL)
 		return -EINVAL;
 
-	cifs_MD5_init(&context);
-	cifs_MD5_update(&context, (char *)&key->data, key->len);
+	sdesc.shash.tfm = server->ntlmssp.md5;
+	sdesc.shash.flags = 0x0;
+
+	rc = crypto_shash_init(&sdesc.shash);
+	if (rc) {
+		cERROR(1, "could not initialize master crypto API hmacmd5\n");
+		return rc;
+	}
+
+	if (server->secType == RawNTLMSSP)
+		crypto_shash_update(&sdesc.shash,
+			server->session_key.data.ntlmv2.key,
+			CIFS_NTLMV2_SESSKEY_SIZE);
+	else
+		crypto_shash_update(&sdesc.shash,
+			(char *)&server->session_key.data,
+			server->session_key.len);
+
 	for (i = 0; i < n_vec; i++) {
 		if (iov[i].iov_len == 0)
 			continue;
@@ -111,18 +154,18 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
 		if (i == 0) {
 			if (iov[0].iov_len <= 8) /* cmd field at offset 9 */
 				break; /* nothing to sign or corrupt header */
-			cifs_MD5_update(&context, iov[0].iov_base+4,
-				  iov[0].iov_len-4);
+			crypto_shash_update(&sdesc.shash,
+				iov[i].iov_base + 4, iov[i].iov_len - 4);
 		} else
-			cifs_MD5_update(&context, iov[i].iov_base, iov[i].iov_len);
+			crypto_shash_update(&sdesc.shash,
+				iov[i].iov_base, iov[i].iov_len);
 	}
 
-	cifs_MD5_final(signature, &context);
+	rc = crypto_shash_final(&sdesc.shash, signature);
 
 	return 0;
 }
 
-
 int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
 		   __u32 *pexpected_response_sequence_number)
 {
@@ -145,8 +188,7 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
 	server->sequence_number++;
 	spin_unlock(&GlobalMid_Lock);
 
-	rc = cifs_calc_signature2(iov, n_vec, &server->mac_signing_key,
-				      smb_signature);
+	rc = cifs_calc_signature2(iov, n_vec, server, smb_signature);
 	if (rc)
 		memset(cifs_pdu->Signature.SecuritySignature, 0, 8);
 	else
@@ -156,14 +198,14 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
 }
 
 int cifs_verify_signature(struct smb_hdr *cifs_pdu,
-			  const struct mac_key *mac_key,
+			  struct TCP_Server_Info *server,
 			  __u32 expected_sequence_number)
 {
-	unsigned int rc;
+	int rc;
 	char server_response_sig[8];
 	char what_we_think_sig_should_be[20];
 
-	if ((cifs_pdu == NULL) || (mac_key == NULL))
+	if (cifs_pdu == NULL || server == NULL)
 		return -EINVAL;
 
 	if (cifs_pdu->Command == SMB_COM_NEGOTIATE)
@@ -192,7 +234,7 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
 					cpu_to_le32(expected_sequence_number);
 	cifs_pdu->Signature.Sequence.Reserved = 0;
 
-	rc = cifs_calculate_signature(cifs_pdu, mac_key,
+	rc = cifs_calculate_signature(cifs_pdu, server,
 		what_we_think_sig_should_be);
 
 	if (rc)
@@ -209,7 +251,7 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu,
 }
 
 /* We fill in key by putting in 40 byte array which was allocated by caller */
-int cifs_calculate_mac_key(struct mac_key *key, const char *rn,
+int cifs_calculate_session_key(struct session_key *key, const char *rn,
 			   const char *password)
 {
 	char temp_key[16];
@@ -223,63 +265,6 @@ int cifs_calculate_mac_key(struct mac_key *key, const char *rn,
 	return 0;
 }
 
-int CalcNTLMv2_partial_mac_key(struct cifsSesInfo *ses,
-			       const struct nls_table *nls_info)
-{
-	char temp_hash[16];
-	struct HMACMD5Context ctx;
-	char *ucase_buf;
-	__le16 *unicode_buf;
-	unsigned int i, user_name_len, dom_name_len;
-
-	if (ses == NULL)
-		return -EINVAL;
-
-	E_md4hash(ses->password, temp_hash);
-
-	hmac_md5_init_limK_to_64(temp_hash, 16, &ctx);
-	user_name_len = strlen(ses->userName);
-	if (user_name_len > MAX_USERNAME_SIZE)
-		return -EINVAL;
-	if (ses->domainName == NULL)
-		return -EINVAL; /* BB should we use CIFS_LINUX_DOM */
-	dom_name_len = strlen(ses->domainName);
-	if (dom_name_len > MAX_USERNAME_SIZE)
-		return -EINVAL;
-
-	ucase_buf = kmalloc((MAX_USERNAME_SIZE+1), GFP_KERNEL);
-	if (ucase_buf == NULL)
-		return -ENOMEM;
-	unicode_buf = kmalloc((MAX_USERNAME_SIZE+1)*4, GFP_KERNEL);
-	if (unicode_buf == NULL) {
-		kfree(ucase_buf);
-		return -ENOMEM;
-	}
-
-	for (i = 0; i < user_name_len; i++)
-		ucase_buf[i] = nls_info->charset2upper[(int)ses->userName[i]];
-	ucase_buf[i] = 0;
-	user_name_len = cifs_strtoUCS(unicode_buf, ucase_buf,
-				      MAX_USERNAME_SIZE*2, nls_info);
-	unicode_buf[user_name_len] = 0;
-	user_name_len++;
-
-	for (i = 0; i < dom_name_len; i++)
-		ucase_buf[i] = nls_info->charset2upper[(int)ses->domainName[i]];
-	ucase_buf[i] = 0;
-	dom_name_len = cifs_strtoUCS(unicode_buf+user_name_len, ucase_buf,
-				     MAX_USERNAME_SIZE*2, nls_info);
-
-	unicode_buf[user_name_len + dom_name_len] = 0;
-	hmac_md5_update((const unsigned char *) unicode_buf,
-		(user_name_len+dom_name_len)*2, &ctx);
-
-	hmac_md5_final(ses->server->ntlmv2_hash, &ctx);
-	kfree(ucase_buf);
-	kfree(unicode_buf);
-	return 0;
-}
-
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt,
 			char *lnm_session_key)
@@ -324,21 +309,29 @@ static int calc_ntlmv2_hash(struct cifsSesInfo *ses,
 {
 	int rc = 0;
 	int len;
-	char nt_hash[16];
-	struct HMACMD5Context *pctxt;
+	char nt_hash[CIFS_NTHASH_SIZE];
 	wchar_t *user;
 	wchar_t *domain;
-
-	pctxt = kmalloc(sizeof(struct HMACMD5Context), GFP_KERNEL);
-
-	if (pctxt == NULL)
-		return -ENOMEM;
+	wchar_t *server;
+	struct {
+		struct shash_desc shash;
+		char ctx[crypto_shash_descsize(ses->server->ntlmssp.hmacmd5)];
+	} sdesc;
 
 	/* calculate md4 hash of password */
 	E_md4hash(ses->password, nt_hash);
 
-	/* convert Domainname to unicode and uppercase */
-	hmac_md5_init_limK_to_64(nt_hash, 16, pctxt);
+	sdesc.shash.tfm = ses->server->ntlmssp.hmacmd5;
+	sdesc.shash.flags = 0x0;
+
+	crypto_shash_setkey(ses->server->ntlmssp.hmacmd5, nt_hash,
+				CIFS_NTHASH_SIZE);
+
+	rc = crypto_shash_init(&sdesc.shash);
+	if (rc) {
+		cERROR(1, "could not initialize master crypto API hmacmd5\n");
+		return rc;
+	}
 
 	/* convert ses->userName to unicode and uppercase */
 	len = strlen(ses->userName);
@@ -347,7 +340,8 @@ static int calc_ntlmv2_hash(struct cifsSesInfo *ses,
 		goto calc_exit_2;
 	len = cifs_strtoUCS((__le16 *)user, ses->userName, len, nls_cp);
 	UniStrupr(user);
-	hmac_md5_update((char *)user, 2*len, pctxt);
+
+	crypto_shash_update(&sdesc.shash, (char *)user, 2 * len);
 
 	/* convert ses->domainName to unicode and uppercase */
 	if (ses->domainName) {
@@ -363,65 +357,243 @@ static int calc_ntlmv2_hash(struct cifsSesInfo *ses,
 		   Maybe converting the domain name earlier makes sense */
 		/* UniStrupr(domain); */
 
-		hmac_md5_update((char *)domain, 2*len, pctxt);
+		crypto_shash_update(&sdesc.shash, (char *)domain, 2 * len);
 
 		kfree(domain);
+	} else if (ses->serverName) {
+		len = strlen(ses->serverName);
+
+		server = kmalloc(2 + (len * 2), GFP_KERNEL);
+		if (server == NULL)
+			goto calc_exit_1;
+		len = cifs_strtoUCS((__le16 *)server, ses->serverName, len,
+					nls_cp);
+		/* the following line was removed since it didn't work well
+		   with lower cased domain name that passed as an option.
+		   Maybe converting the domain name earlier makes sense */
+		/* UniStrupr(domain); */
+
+		crypto_shash_update(&sdesc.shash, (char *)server, 2 * len);
+
+		kfree(server);
 	}
 calc_exit_1:
 	kfree(user);
 calc_exit_2:
 	/* BB FIXME what about bytes 24 through 40 of the signing key?
 	   compare with the NTLM example */
-	hmac_md5_final(ses->server->ntlmv2_hash, pctxt);
+	rc = crypto_shash_final(&sdesc.shash, ses->server->ntlmv2_hash);
 
-	kfree(pctxt);
 	return rc;
 }
 
-void setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf,
-		      const struct nls_table *nls_cp)
+static int
+find_domain_name(struct cifsSesInfo *ses)
+{
+	int rc = 0;
+	unsigned int attrsize;
+	unsigned int type;
+	unsigned char *blobptr;
+	struct ntlmssp2_name *attrptr;
+
+	if (ses->server->tiblob) {
+		blobptr = ses->server->tiblob;
+		attrptr = (struct ntlmssp2_name *) blobptr;
+
+		while ((type = attrptr->type) != 0) {
+			blobptr += 2; /* advance attr type */
+			attrsize = attrptr->length;
+			blobptr += 2; /* advance attr size */
+			if (type == NTLMSSP_AV_NB_DOMAIN_NAME) {
+				if (!ses->domainName) {
+					ses->domainName =
+						kmalloc(attrptr->length + 1,
+								GFP_KERNEL);
+					if (!ses->domainName)
+							return -ENOMEM;
+					cifs_from_ucs2(ses->domainName,
+						(__le16 *)blobptr,
+						attrptr->length,
+						attrptr->length,
+						load_nls_default(), false);
+				}
+			}
+			blobptr += attrsize; /* advance attr  value */
+			attrptr = (struct ntlmssp2_name *) blobptr;
+		}
+	} else {
+		ses->server->tilen = 2 * sizeof(struct ntlmssp2_name);
+		ses->server->tiblob = kmalloc(ses->server->tilen, GFP_KERNEL);
+		if (!ses->server->tiblob) {
+			ses->server->tilen = 0;
+			cERROR(1, "Challenge target info allocation failure");
+			return -ENOMEM;
+		}
+		memset(ses->server->tiblob, 0x0, ses->server->tilen);
+		attrptr = (struct ntlmssp2_name *) ses->server->tiblob;
+		attrptr->type = cpu_to_le16(NTLMSSP_DOMAIN_TYPE);
+	}
+
+	return rc;
+}
+
+static int
+CalcNTLMv2_response(const struct TCP_Server_Info *server,
+			 char *v2_session_response)
 {
 	int rc;
+	struct {
+		struct shash_desc shash;
+		char ctx[crypto_shash_descsize(server->ntlmssp.hmacmd5)];
+	} sdesc;
+
+	sdesc.shash.tfm = server->ntlmssp.hmacmd5;
+	sdesc.shash.flags = 0x0;
+
+	crypto_shash_setkey(server->ntlmssp.hmacmd5, server->ntlmv2_hash,
+		CIFS_HMAC_MD5_HASH_SIZE);
+
+	rc = crypto_shash_init(&sdesc.shash);
+	if (rc) {
+		cERROR(1, "could not initialize master crypto API hmacmd5\n");
+		return rc;
+	}
+
+	memcpy(v2_session_response + CIFS_SERVER_CHALLENGE_SIZE,
+		server->cryptKey, CIFS_SERVER_CHALLENGE_SIZE);
+	crypto_shash_update(&sdesc.shash,
+		v2_session_response + CIFS_SERVER_CHALLENGE_SIZE,
+		sizeof(struct ntlmv2_resp) - CIFS_SERVER_CHALLENGE_SIZE);
+
+	if (server->tilen)
+		crypto_shash_update(&sdesc.shash,
+					server->tiblob, server->tilen);
+
+	rc = crypto_shash_final(&sdesc.shash, v2_session_response);
+
+	return rc;
+}
+
+int
+setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf,
+		      const struct nls_table *nls_cp)
+{
+	int rc = 0;
 	struct ntlmv2_resp *buf = (struct ntlmv2_resp *)resp_buf;
-	struct HMACMD5Context context;
+	struct {
+		struct shash_desc shash;
+		char ctx[crypto_shash_descsize(ses->server->ntlmssp.hmacmd5)];
+	} sdesc;
 
 	buf->blob_signature = cpu_to_le32(0x00000101);
 	buf->reserved = 0;
 	buf->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
 	get_random_bytes(&buf->client_chal, sizeof(buf->client_chal));
 	buf->reserved2 = 0;
-	buf->names[0].type = cpu_to_le16(NTLMSSP_DOMAIN_TYPE);
-	buf->names[0].length = 0;
-	buf->names[1].type = 0;
-	buf->names[1].length = 0;
+
+	if (!ses->domainName) {
+		rc = find_domain_name(ses);
+		if (rc) {
+			cERROR(1, "could not get domain/server name rc %d", rc);
+			return rc;
+		}
+	}
 
 	/* calculate buf->ntlmv2_hash */
 	rc = calc_ntlmv2_hash(ses, nls_cp);
-	if (rc)
+	if (rc) {
+		cERROR(1, "could not get v2 hash rc %d", rc);
+		return rc;
+	}
+	rc = CalcNTLMv2_response(ses->server, resp_buf);
+	if (rc) {
 		cERROR(1, "could not get v2 hash rc %d", rc);
-	CalcNTLMv2_response(ses, resp_buf);
+		return rc;
+	}
+
+	crypto_shash_setkey(ses->server->ntlmssp.hmacmd5,
+			ses->server->ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
+
+	sdesc.shash.tfm = ses->server->ntlmssp.hmacmd5;
+	sdesc.shash.flags = 0x0;
+
+	rc = crypto_shash_init(&sdesc.shash);
+	if (rc) {
+		cERROR(1, "could not initialize master crypto API hmacmd5\n");
+		return rc;
+	}
+
+	crypto_shash_update(&sdesc.shash, resp_buf, CIFS_HMAC_MD5_HASH_SIZE);
 
-	/* now calculate the MAC key for NTLMv2 */
-	hmac_md5_init_limK_to_64(ses->server->ntlmv2_hash, 16, &context);
-	hmac_md5_update(resp_buf, 16, &context);
-	hmac_md5_final(ses->server->mac_signing_key.data.ntlmv2.key, &context);
+	rc = crypto_shash_final(&sdesc.shash,
+		ses->server->session_key.data.ntlmv2.key);
 
-	memcpy(&ses->server->mac_signing_key.data.ntlmv2.resp, resp_buf,
-	       sizeof(struct ntlmv2_resp));
-	ses->server->mac_signing_key.len = 16 + sizeof(struct ntlmv2_resp);
+	memcpy(&ses->server->session_key.data.ntlmv2.resp, resp_buf,
+			sizeof(struct ntlmv2_resp));
+	ses->server->session_key.len = 16 + sizeof(struct ntlmv2_resp);
+
+	return rc;
 }
 
-void CalcNTLMv2_response(const struct cifsSesInfo *ses,
-			 char *v2_session_response)
+int
+calc_seckey(struct TCP_Server_Info *server)
 {
-	struct HMACMD5Context context;
-	/* rest of v2 struct already generated */
-	memcpy(v2_session_response + 8, ses->server->cryptKey, 8);
-	hmac_md5_init_limK_to_64(ses->server->ntlmv2_hash, 16, &context);
+	int rc;
+	unsigned char sec_key[CIFS_NTLMV2_SESSKEY_SIZE];
+	struct crypto_blkcipher *tfm_arc4;
+	struct scatterlist sgin, sgout;
+	struct blkcipher_desc desc;
+
+	get_random_bytes(sec_key, CIFS_NTLMV2_SESSKEY_SIZE);
+
+	tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)",
+						0, CRYPTO_ALG_ASYNC);
+	if (!tfm_arc4 || IS_ERR(tfm_arc4)) {
+		cERROR(1, "could not allocate " "master crypto API arc4\n");
+		return 1;
+	}
+
+	crypto_blkcipher_setkey(tfm_arc4,
+		server->session_key.data.ntlmv2.key, CIFS_CPHTXT_SIZE);
+	sg_init_one(&sgin, sec_key, CIFS_CPHTXT_SIZE);
+	sg_init_one(&sgout, server->ntlmssp.ciphertext, CIFS_CPHTXT_SIZE);
+	rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, CIFS_CPHTXT_SIZE);
+
+	if (!rc)
+		memcpy(server->session_key.data.ntlmv2.key,
+				sec_key, CIFS_NTLMV2_SESSKEY_SIZE);
 
-	hmac_md5_update(v2_session_response+8,
-			sizeof(struct ntlmv2_resp) - 8, &context);
+	crypto_free_blkcipher(tfm_arc4);
 
-	hmac_md5_final(v2_session_response, &context);
-/*	cifs_dump_mem("v2_sess_rsp: ", v2_session_response, 32); */
+	return 0;
+}
+
+void
+cifs_crypto_shash_release(struct TCP_Server_Info *server)
+{
+	if (server->ntlmssp.md5)
+		crypto_free_shash(server->ntlmssp.md5);
+
+	if (server->ntlmssp.hmacmd5)
+		crypto_free_shash(server->ntlmssp.hmacmd5);
+}
+
+int
+cifs_crypto_shash_allocate(struct TCP_Server_Info *server)
+{
+	server->ntlmssp.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0);
+	if (!server->ntlmssp.hmacmd5 ||
+			IS_ERR(server->ntlmssp.hmacmd5)) {
+		cERROR(1, "could not allocate master crypto API hmacmd5\n");
+		return 1;
+	}
+
+	server->ntlmssp.md5 = crypto_alloc_shash("md5", 0, 0);
+	if (!server->ntlmssp.md5 || IS_ERR(server->ntlmssp.md5)) {
+		crypto_free_shash(server->ntlmssp.hmacmd5);
+		cERROR(1, "could not allocate master crypto API md5\n");
+		return 1;
+	}
+
+	return 0;
 }
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 0cdfb8c32ac6..49563e0c1725 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -25,6 +25,9 @@
 #include <linux/workqueue.h>
 #include "cifs_fs_sb.h"
 #include "cifsacl.h"
+#include <crypto/internal/hash.h>
+#include <linux/scatterlist.h>
+
 /*
  * The sizes of various internal tables and strings
  */
@@ -97,7 +100,7 @@ enum protocolEnum {
 	/* Netbios frames protocol not supported at this time */
 };
 
-struct mac_key {
+struct session_key {
 	unsigned int len;
 	union {
 		char ntlm[CIFS_SESS_KEY_SIZE + 16];
@@ -120,6 +123,14 @@ struct cifs_cred {
 	struct cifs_ace *aces;
 };
 
+struct ntlmssp_auth {
+	__u32 client_flags;
+	__u32 server_flags;
+	unsigned char ciphertext[CIFS_CPHTXT_SIZE];
+	struct crypto_shash *hmacmd5;
+	struct crypto_shash *md5;
+};
+
 /*
  *****************************************************************
  * Except the CIFS PDUs themselves all the
@@ -182,11 +193,14 @@ struct TCP_Server_Info {
 	/* 16th byte of RFC1001 workstation name is always null */
 	char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
 	__u32 sequence_number; /* needed for CIFS PDU signature */
-	struct mac_key mac_signing_key;
+	struct session_key session_key;
 	char ntlmv2_hash[16];
 	unsigned long lstrp; /* when we got last response from this server */
 	u16 dialect; /* dialect index that server chose */
 	/* extended security flavors that server supports */
+	unsigned int tilen; /* length of the target info blob */
+	unsigned char *tiblob; /* target info blob in challenge response */
+	struct ntlmssp_auth ntlmssp; /* various keys, ciphers, flags */
 	bool	sec_kerberos;		/* supports plain Kerberos */
 	bool	sec_mskerberos;		/* supports legacy MS Kerberos */
 	bool	sec_kerberosu2u;	/* supports U2U Kerberos */
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 14d036d8db11..320e0fd0ba7b 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -134,6 +134,12 @@
  * Size of the session key (crypto key encrypted with the password
  */
 #define CIFS_SESS_KEY_SIZE (24)
+#define CIFS_CLIENT_CHALLENGE_SIZE (8)
+#define CIFS_SERVER_CHALLENGE_SIZE (8)
+#define CIFS_HMAC_MD5_HASH_SIZE (16)
+#define CIFS_CPHTXT_SIZE (16)
+#define CIFS_NTLMV2_SESSKEY_SIZE (16)
+#define CIFS_NTHASH_SIZE (16)
 
 /*
  * Maximum user name length
@@ -663,7 +669,6 @@ struct ntlmv2_resp {
 	__le64  time;
 	__u64  client_chal; /* random */
 	__u32  reserved2;
-	struct ntlmssp2_name names[2];
 	/* array of name entries could follow ending in minimum 4 byte struct */
 } __attribute__((packed));
 
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 1f5450814087..1378d9133844 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -361,15 +361,15 @@ extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *, __u32 *);
 extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *,
 			  __u32 *);
 extern int cifs_verify_signature(struct smb_hdr *,
-				 const struct mac_key *mac_key,
+				 struct TCP_Server_Info *server,
 				__u32 expected_sequence_number);
-extern int cifs_calculate_mac_key(struct mac_key *key, const char *rn,
+extern int cifs_calculate_session_key(struct session_key *key, const char *rn,
 				 const char *pass);
-extern int CalcNTLMv2_partial_mac_key(struct cifsSesInfo *,
-			const struct nls_table *);
-extern void CalcNTLMv2_response(const struct cifsSesInfo *, char *);
-extern void setup_ntlmv2_rsp(struct cifsSesInfo *, char *,
+extern int setup_ntlmv2_rsp(struct cifsSesInfo *, char *,
 			     const struct nls_table *);
+extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *);
+extern void cifs_crypto_shash_release(struct TCP_Server_Info *);
+extern int calc_seckey(struct TCP_Server_Info *);
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 extern void calc_lanman_hash(const char *password, const char *cryptkey,
 				bool encrypt, char *lnm_session_key);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index c65c3419dd37..4bda920d1f75 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -604,11 +604,14 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 			else
 				rc = -EINVAL;
 
-			if (server->sec_kerberos || server->sec_mskerberos)
-				server->secType = Kerberos;
-			else if (server->sec_ntlmssp)
-				server->secType = RawNTLMSSP;
-			else
+			if (server->secType == Kerberos) {
+				if (!server->sec_kerberos &&
+						!server->sec_mskerberos)
+					rc = -EOPNOTSUPP;
+			} else if (server->secType == RawNTLMSSP) {
+				if (!server->sec_ntlmssp)
+					rc = -EOPNOTSUPP;
+			} else
 				rc = -EOPNOTSUPP;
 		}
 	} else
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 446e2486d5f0..18af707f00f1 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1707,6 +1707,7 @@ cifs_put_smb_ses(struct cifsSesInfo *ses)
 		CIFSSMBLogoff(xid, ses);
 		_FreeXid(xid);
 	}
+	cifs_crypto_shash_release(server);
 	sesInfoFree(ses);
 	cifs_put_tcp_session(server);
 }
@@ -1786,13 +1787,23 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
 	ses->linux_uid = volume_info->linux_uid;
 	ses->overrideSecFlg = volume_info->secFlg;
 
+	rc = cifs_crypto_shash_allocate(server);
+	if (rc) {
+		cERROR(1, "could not setup hash structures rc %d", rc);
+		goto get_ses_fail;
+	}
+	server->tilen = 0;
+	server->tiblob = NULL;
+
 	mutex_lock(&ses->session_mutex);
 	rc = cifs_negotiate_protocol(xid, ses);
 	if (!rc)
 		rc = cifs_setup_session(xid, ses, volume_info->local_nls);
 	mutex_unlock(&ses->session_mutex);
-	if (rc)
+	if (rc) {
+		cifs_crypto_shash_release(ses->server);
 		goto get_ses_fail;
+	}
 
 	/* success, put it on the list */
 	write_lock(&cifs_tcp_ses_lock);
diff --git a/fs/cifs/ntlmssp.h b/fs/cifs/ntlmssp.h
index 49c9a4e75319..1db0f0746a5b 100644
--- a/fs/cifs/ntlmssp.h
+++ b/fs/cifs/ntlmssp.h
@@ -61,6 +61,19 @@
 #define NTLMSSP_NEGOTIATE_KEY_XCH   0x40000000
 #define NTLMSSP_NEGOTIATE_56        0x80000000
 
+/* Define AV Pair Field IDs */
+#define NTLMSSP_AV_EOL			0
+#define NTLMSSP_AV_NB_COMPUTER_NAME	1
+#define NTLMSSP_AV_NB_DOMAIN_NAME	2
+#define NTLMSSP_AV_DNS_COMPUTER_NAME	3
+#define NTLMSSP_AV_DNS_DOMAIN_NAME	4
+#define NTLMSSP_AV_DNS_TREE_NAME	5
+#define NTLMSSP_AV_FLAGS		6
+#define NTLMSSP_AV_TIMESTAMP		7
+#define NTLMSSP_AV_RESTRICTION		8
+#define NTLMSSP_AV_TARGET_NAME		9
+#define NTLMSSP_AV_CHANNEL_BINDINGS	10
+
 /* Although typedefs are not commonly used for structure definitions */
 /* in the Linux kernel, in this particular case they are useful      */
 /* to more closely match the standards document for NTLMSSP from     */
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 0a57cb7db5dd..41fc5328120d 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -383,6 +383,9 @@ static int decode_ascii_ssetup(char **pbcc_area, int bleft,
 static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
 				    struct cifsSesInfo *ses)
 {
+	unsigned int tioffset; /* challeng message target info area */
+	unsigned int tilen; /* challeng message target info area length  */
+
 	CHALLENGE_MESSAGE *pblob = (CHALLENGE_MESSAGE *)bcc_ptr;
 
 	if (blob_len < sizeof(CHALLENGE_MESSAGE)) {
@@ -405,6 +408,18 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
 	/* BB spec says that if AvId field of MsvAvTimestamp is populated then
 		we must set the MIC field of the AUTHENTICATE_MESSAGE */
 
+	tioffset = cpu_to_le16(pblob->TargetInfoArray.BufferOffset);
+	tilen = cpu_to_le16(pblob->TargetInfoArray.Length);
+	ses->server->tilen = tilen;
+	if (tilen) {
+		ses->server->tiblob = kmalloc(tilen, GFP_KERNEL);
+		if (!ses->server->tiblob) {
+			cERROR(1, "Challenge target info allocation failure");
+			return -ENOMEM;
+		}
+		memcpy(ses->server->tiblob,  bcc_ptr + tioffset, tilen);
+	}
+
 	return 0;
 }
 
@@ -451,10 +466,12 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
 				   struct cifsSesInfo *ses,
 				   const struct nls_table *nls_cp, bool first)
 {
+	int rc;
+	unsigned int size;
 	AUTHENTICATE_MESSAGE *sec_blob = (AUTHENTICATE_MESSAGE *)pbuffer;
 	__u32 flags;
 	unsigned char *tmp;
-	char ntlm_session_key[CIFS_SESS_KEY_SIZE];
+	struct ntlmv2_resp ntlmv2_response = {};
 
 	memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8);
 	sec_blob->MessageType = NtLmAuthenticate;
@@ -477,19 +494,25 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
 	sec_blob->LmChallengeResponse.Length = 0;
 	sec_blob->LmChallengeResponse.MaximumLength = 0;
 
-	/* calculate session key,  BB what about adding similar ntlmv2 path? */
-	SMBNTencrypt(ses->password, ses->server->cryptKey, ntlm_session_key);
-	if (first)
-		cifs_calculate_mac_key(&ses->server->mac_signing_key,
-				       ntlm_session_key, ses->password);
-
-	memcpy(tmp, ntlm_session_key, CIFS_SESS_KEY_SIZE);
 	sec_blob->NtChallengeResponse.BufferOffset = cpu_to_le32(tmp - pbuffer);
-	sec_blob->NtChallengeResponse.Length = cpu_to_le16(CIFS_SESS_KEY_SIZE);
-	sec_blob->NtChallengeResponse.MaximumLength =
-				cpu_to_le16(CIFS_SESS_KEY_SIZE);
+	rc = setup_ntlmv2_rsp(ses, (char *)&ntlmv2_response, nls_cp);
+	if (rc) {
+		cERROR(1, "error rc: %d during ntlmssp ntlmv2 setup", rc);
+		goto setup_ntlmv2_ret;
+	}
+	size =  sizeof(struct ntlmv2_resp);
+	memcpy(tmp, (char *)&ntlmv2_response, size);
+	tmp += size;
+	if (ses->server->tilen > 0) {
+		memcpy(tmp, ses->server->tiblob, ses->server->tilen);
+		tmp += ses->server->tilen;
+	} else
+		ses->server->tilen = 0;
 
-	tmp += CIFS_SESS_KEY_SIZE;
+	sec_blob->NtChallengeResponse.Length = cpu_to_le16(size +
+				ses->server->tilen);
+	sec_blob->NtChallengeResponse.MaximumLength =
+		cpu_to_le16(size + ses->server->tilen);
 
 	if (ses->domainName == NULL) {
 		sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer);
@@ -501,7 +524,6 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
 		len = cifs_strtoUCS((__le16 *)tmp, ses->domainName,
 				    MAX_USERNAME_SIZE, nls_cp);
 		len *= 2; /* unicode is 2 bytes each */
-		len += 2; /* trailing null */
 		sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer);
 		sec_blob->DomainName.Length = cpu_to_le16(len);
 		sec_blob->DomainName.MaximumLength = cpu_to_le16(len);
@@ -518,7 +540,6 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
 		len = cifs_strtoUCS((__le16 *)tmp, ses->userName,
 				    MAX_USERNAME_SIZE, nls_cp);
 		len *= 2; /* unicode is 2 bytes each */
-		len += 2; /* trailing null */
 		sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer);
 		sec_blob->UserName.Length = cpu_to_le16(len);
 		sec_blob->UserName.MaximumLength = cpu_to_le16(len);
@@ -530,9 +551,26 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer,
 	sec_blob->WorkstationName.MaximumLength = 0;
 	tmp += 2;
 
-	sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer);
-	sec_blob->SessionKey.Length = 0;
-	sec_blob->SessionKey.MaximumLength = 0;
+	if ((ses->server->ntlmssp.server_flags & NTLMSSP_NEGOTIATE_KEY_XCH) &&
+			!calc_seckey(ses->server)) {
+		memcpy(tmp, ses->server->ntlmssp.ciphertext, CIFS_CPHTXT_SIZE);
+		sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer);
+		sec_blob->SessionKey.Length = cpu_to_le16(CIFS_CPHTXT_SIZE);
+		sec_blob->SessionKey.MaximumLength =
+			cpu_to_le16(CIFS_CPHTXT_SIZE);
+		tmp += CIFS_CPHTXT_SIZE;
+	} else {
+		sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer);
+		sec_blob->SessionKey.Length = 0;
+		sec_blob->SessionKey.MaximumLength = 0;
+	}
+
+	ses->server->sequence_number = 0;
+
+setup_ntlmv2_ret:
+	if (ses->server->tilen > 0)
+		kfree(ses->server->tiblob);
+
 	return tmp - pbuffer;
 }
 
@@ -546,15 +584,14 @@ static void setup_ntlmssp_neg_req(SESSION_SETUP_ANDX *pSMB,
 	return;
 }
 
-static int setup_ntlmssp_auth_req(SESSION_SETUP_ANDX *pSMB,
+static int setup_ntlmssp_auth_req(char *ntlmsspblob,
 				  struct cifsSesInfo *ses,
 				  const struct nls_table *nls, bool first_time)
 {
 	int bloblen;
 
-	bloblen = build_ntlmssp_auth_blob(&pSMB->req.SecurityBlob[0], ses, nls,
+	bloblen = build_ntlmssp_auth_blob(ntlmsspblob, ses, nls,
 					  first_time);
-	pSMB->req.SecurityBlobLength = cpu_to_le16(bloblen);
 
 	return bloblen;
 }
@@ -580,6 +617,7 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
 	struct key *spnego_key = NULL;
 	__le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
 	bool first_time;
+	char *ntlmsspblob;
 
 	if (ses == NULL)
 		return -EINVAL;
@@ -690,7 +728,7 @@ ssetup_ntlmssp_authenticate:
 
 		if (first_time) /* should this be moved into common code
 				  with similar ntlmv2 path? */
-			cifs_calculate_mac_key(&ses->server->mac_signing_key,
+			cifs_calculate_session_key(&ses->server->session_key,
 				ntlm_session_key, ses->password);
 		/* copy session key */
 
@@ -729,12 +767,21 @@ ssetup_ntlmssp_authenticate:
 			cpu_to_le16(sizeof(struct ntlmv2_resp));
 
 		/* calculate session key */
-		setup_ntlmv2_rsp(ses, v2_sess_key, nls_cp);
+		rc = setup_ntlmv2_rsp(ses, v2_sess_key, nls_cp);
+		if (rc) {
+			kfree(v2_sess_key);
+			goto ssetup_exit;
+		}
 		/* FIXME: calculate MAC key */
 		memcpy(bcc_ptr, (char *)v2_sess_key,
 		       sizeof(struct ntlmv2_resp));
 		bcc_ptr += sizeof(struct ntlmv2_resp);
 		kfree(v2_sess_key);
+		if (ses->server->tilen > 0) {
+			memcpy(bcc_ptr, ses->server->tiblob,
+				ses->server->tilen);
+			bcc_ptr += ses->server->tilen;
+		}
 		if (ses->capabilities & CAP_UNICODE) {
 			if (iov[0].iov_len % 2) {
 				*bcc_ptr = 0;
@@ -765,15 +812,15 @@ ssetup_ntlmssp_authenticate:
 		}
 		/* bail out if key is too long */
 		if (msg->sesskey_len >
-		    sizeof(ses->server->mac_signing_key.data.krb5)) {
+		    sizeof(ses->server->session_key.data.krb5)) {
 			cERROR(1, "Kerberos signing key too long (%u bytes)",
 				msg->sesskey_len);
 			rc = -EOVERFLOW;
 			goto ssetup_exit;
 		}
 		if (first_time) {
-			ses->server->mac_signing_key.len = msg->sesskey_len;
-			memcpy(ses->server->mac_signing_key.data.krb5,
+			ses->server->session_key.len = msg->sesskey_len;
+			memcpy(ses->server->session_key.data.krb5,
 				msg->data, msg->sesskey_len);
 		}
 		pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC;
@@ -815,12 +862,26 @@ ssetup_ntlmssp_authenticate:
 			if (phase == NtLmNegotiate) {
 				setup_ntlmssp_neg_req(pSMB, ses);
 				iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE);
+				iov[1].iov_base = &pSMB->req.SecurityBlob[0];
 			} else if (phase == NtLmAuthenticate) {
 				int blob_len;
-				blob_len = setup_ntlmssp_auth_req(pSMB, ses,
-								  nls_cp,
-								  first_time);
+				ntlmsspblob = kmalloc(5 *
+					sizeof(struct _AUTHENTICATE_MESSAGE),
+					GFP_KERNEL);
+				if (!ntlmsspblob) {
+					cERROR(1, "Can't allocate NTLMSSP");
+					rc = -ENOMEM;
+					goto ssetup_exit;
+				}
+
+				blob_len = setup_ntlmssp_auth_req(ntlmsspblob,
+								ses,
+								nls_cp,
+								first_time);
 				iov[1].iov_len = blob_len;
+				iov[1].iov_base = ntlmsspblob;
+				pSMB->req.SecurityBlobLength =
+					cpu_to_le16(blob_len);
 				/* Make sure that we tell the server that we
 				   are using the uid that it just gave us back
 				   on the response (challenge) */
@@ -830,7 +891,6 @@ ssetup_ntlmssp_authenticate:
 				rc = -ENOSYS;
 				goto ssetup_exit;
 			}
-			iov[1].iov_base = &pSMB->req.SecurityBlob[0];
 			/* unicode strings must be word aligned */
 			if ((iov[0].iov_len + iov[1].iov_len) % 2) {
 				*bcc_ptr = 0;
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 82f78c4d6978..e0588cdf4cc5 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -543,7 +543,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses,
 		    (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
 					     SECMODE_SIGN_ENABLED))) {
 			rc = cifs_verify_signature(midQ->resp_buf,
-						&ses->server->mac_signing_key,
+						ses->server,
 						midQ->sequence_number+1);
 			if (rc) {
 				cERROR(1, "Unexpected SMB signature");
@@ -731,7 +731,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses,
 		    (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
 					     SECMODE_SIGN_ENABLED))) {
 			rc = cifs_verify_signature(out_buf,
-						&ses->server->mac_signing_key,
+						ses->server,
 						midQ->sequence_number+1);
 			if (rc) {
 				cERROR(1, "Unexpected SMB signature");
@@ -981,7 +981,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon,
 	    (ses->server->secMode & (SECMODE_SIGN_REQUIRED |
 				     SECMODE_SIGN_ENABLED))) {
 		rc = cifs_verify_signature(out_buf,
-					   &ses->server->mac_signing_key,
+					   ses->server,
 					   midQ->sequence_number+1);
 		if (rc) {
 			cERROR(1, "Unexpected SMB signature");
-- 
cgit v1.2.3


From f3c60c5918f26ea16761ddc8b12d8401a3db626b Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Wed, 11 Aug 2010 14:51:23 -0700
Subject: ceph: fix multiple mds session shutdown

The use of a completion when waiting for session shutdown during umount is
inappropriate, given the complexity of the condition.  For multiple MDS's,
this resulted in the umount thread spinning, often preventing the session
close message from being processed in some cases.

Switch to a waitqueue and defined a condition helper.  This cleans things
up nicely.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mds_client.c | 68 +++++++++++++++++++++++++++-------------------------
 fs/ceph/mds_client.h |  3 ++-
 2 files changed, 37 insertions(+), 34 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index a75ddbf9fe37..397a47b696ce 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2208,7 +2208,7 @@ static void handle_session(struct ceph_mds_session *session,
 			pr_info("mds%d reconnect denied\n", session->s_mds);
 		remove_session_caps(session);
 		wake = 1; /* for good measure */
-		complete_all(&mdsc->session_close_waiters);
+		wake_up_all(&mdsc->session_close_wq);
 		kick_requests(mdsc, mds);
 		break;
 
@@ -2876,7 +2876,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
 		return -ENOMEM;
 
 	init_completion(&mdsc->safe_umount_waiters);
-	init_completion(&mdsc->session_close_waiters);
+	init_waitqueue_head(&mdsc->session_close_wq);
 	INIT_LIST_HEAD(&mdsc->waiting_for_map);
 	mdsc->sessions = NULL;
 	mdsc->max_sessions = 0;
@@ -3021,6 +3021,23 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
 	wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush));
 }
 
+/*
+ * true if all sessions are closed, or we force unmount
+ */
+bool done_closing_sessions(struct ceph_mds_client *mdsc)
+{
+	int i, n = 0;
+
+	if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN)
+		return true;
+
+	mutex_lock(&mdsc->mutex);
+	for (i = 0; i < mdsc->max_sessions; i++)
+		if (mdsc->sessions[i])
+			n++;
+	mutex_unlock(&mdsc->mutex);
+	return n == 0;
+}
 
 /*
  * called after sb is ro.
@@ -3029,45 +3046,32 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
 {
 	struct ceph_mds_session *session;
 	int i;
-	int n;
 	struct ceph_client *client = mdsc->client;
-	unsigned long started, timeout = client->mount_args->mount_timeout * HZ;
+	unsigned long timeout = client->mount_args->mount_timeout * HZ;
 
 	dout("close_sessions\n");
 
-	mutex_lock(&mdsc->mutex);
-
 	/* close sessions */
-	started = jiffies;
-	while (time_before(jiffies, started + timeout)) {
-		dout("closing sessions\n");
-		n = 0;
-		for (i = 0; i < mdsc->max_sessions; i++) {
-			session = __ceph_lookup_mds_session(mdsc, i);
-			if (!session)
-				continue;
-			mutex_unlock(&mdsc->mutex);
-			mutex_lock(&session->s_mutex);
-			__close_session(mdsc, session);
-			mutex_unlock(&session->s_mutex);
-			ceph_put_mds_session(session);
-			mutex_lock(&mdsc->mutex);
-			n++;
-		}
-		if (n == 0)
-			break;
-
-		if (client->mount_state == CEPH_MOUNT_SHUTDOWN)
-			break;
-
-		dout("waiting for sessions to close\n");
+	mutex_lock(&mdsc->mutex);
+	for (i = 0; i < mdsc->max_sessions; i++) {
+		session = __ceph_lookup_mds_session(mdsc, i);
+		if (!session)
+			continue;
 		mutex_unlock(&mdsc->mutex);
-		wait_for_completion_timeout(&mdsc->session_close_waiters,
-					    timeout);
+		mutex_lock(&session->s_mutex);
+		__close_session(mdsc, session);
+		mutex_unlock(&session->s_mutex);
+		ceph_put_mds_session(session);
 		mutex_lock(&mdsc->mutex);
 	}
+	mutex_unlock(&mdsc->mutex);
+
+	dout("waiting for sessions to close\n");
+	wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc),
+			   timeout);
 
 	/* tear down remaining sessions */
+	mutex_lock(&mdsc->mutex);
 	for (i = 0; i < mdsc->max_sessions; i++) {
 		if (mdsc->sessions[i]) {
 			session = get_session(mdsc->sessions[i]);
@@ -3080,9 +3084,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
 			mutex_lock(&mdsc->mutex);
 		}
 	}
-
 	WARN_ON(!list_empty(&mdsc->cap_delay_list));
-
 	mutex_unlock(&mdsc->mutex);
 
 	ceph_cleanup_empty_realms(mdsc);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index ab7e89f5e344..c98267ce6d2a 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -234,7 +234,8 @@ struct ceph_mds_client {
 	struct mutex            mutex;         /* all nested structures */
 
 	struct ceph_mdsmap      *mdsmap;
-	struct completion       safe_umount_waiters, session_close_waiters;
+	struct completion       safe_umount_waiters;
+	wait_queue_head_t       session_close_wq;
 	struct list_head        waiting_for_map;
 
 	struct ceph_mds_session **sessions;    /* NULL for mds if no session */
-- 
cgit v1.2.3


From 082afec92d1052305af1195f591602f4d0f44277 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Sun, 22 Aug 2010 15:16:41 -0700
Subject: ceph: fix xattr cap writeback

We should include the xattr metadata blob in the cap update message any
time we are flushing dirty state, NOT just when we are also dropping the
cap.  This fixes async xattr writeback.

Also, clean up the code slightly to avoid duplicating the bit test.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 7bf182b03973..0ac2703f3bdf 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1082,6 +1082,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
 	gid_t gid;
 	struct ceph_mds_session *session;
 	u64 xattr_version = 0;
+	struct ceph_buffer *xattr_blob = NULL;
 	int delayed = 0;
 	u64 flush_tid = 0;
 	int i;
@@ -1160,9 +1161,10 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
 	gid = inode->i_gid;
 	mode = inode->i_mode;
 
-	if (dropping & CEPH_CAP_XATTR_EXCL) {
+	if (flushing & CEPH_CAP_XATTR_EXCL) {
 		__ceph_build_xattrs_blob(ci);
-		xattr_version = ci->i_xattrs.version + 1;
+		xattr_blob = ci->i_xattrs.blob;
+		xattr_version = ci->i_xattrs.version;
 	}
 
 	spin_unlock(&inode->i_lock);
@@ -1170,9 +1172,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
 	ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
 		op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
 		size, max_size, &mtime, &atime, time_warp_seq,
-		uid, gid, mode,
-		xattr_version,
-		(flushing & CEPH_CAP_XATTR_EXCL) ? ci->i_xattrs.blob : NULL,
+		uid, gid, mode, xattr_version, xattr_blob,
 		follows);
 	if (ret < 0) {
 		dout("error sending cap msg, must requeue %p\n", inode);
-- 
cgit v1.2.3


From 4a625be47243e0e07dedd0a1a6b94c66c2ab93ba Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Sun, 22 Aug 2010 15:03:56 -0700
Subject: ceph: include dirty xattrs state in snapped caps

When we snapshot dirty metadata that needs to be written back to the MDS,
include dirty xattr metadata.  Make the capsnap reference the encoded
xattr blob so that it will be written back in the FLUSHSNAP op.

Also fix the capsnap creation guard to include dirty auth or file bits,
not just tests specific to dirty file data or file writes in progress
(this fixes auth metadata writeback).

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/caps.c  |  2 +-
 fs/ceph/snap.c  | 23 ++++++++++++++++-------
 fs/ceph/super.h |  8 +++++---
 fs/ceph/xattr.c |  1 +
 4 files changed, 23 insertions(+), 11 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 0ac2703f3bdf..ba5bbf318fe1 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1282,7 +1282,7 @@ retry:
 			     &capsnap->mtime, &capsnap->atime,
 			     capsnap->time_warp_seq,
 			     capsnap->uid, capsnap->gid, capsnap->mode,
-			     0, NULL,
+			     capsnap->xattr_version, capsnap->xattr_blob,
 			     capsnap->follows);
 
 		next_follows = capsnap->follows + 1;
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index c0b26b6badba..2cb190c2bd99 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -435,7 +435,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
 {
 	struct inode *inode = &ci->vfs_inode;
 	struct ceph_cap_snap *capsnap;
-	int used;
+	int used, dirty;
 
 	capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
 	if (!capsnap) {
@@ -445,6 +445,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
 
 	spin_lock(&inode->i_lock);
 	used = __ceph_caps_used(ci);
+	dirty = __ceph_caps_dirty(ci);
 	if (__ceph_have_pending_cap_snap(ci)) {
 		/* there is no point in queuing multiple "pending" cap_snaps,
 		   as no new writes are allowed to start when pending, so any
@@ -452,11 +453,13 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
 		   cap_snap.  lucky us. */
 		dout("queue_cap_snap %p already pending\n", inode);
 		kfree(capsnap);
-	} else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR)) {
+	} else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR) ||
+		   (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL|
+			     CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR))) {
 		struct ceph_snap_context *snapc = ci->i_head_snapc;
 
 		igrab(inode);
-
+		
 		atomic_set(&capsnap->nref, 1);
 		capsnap->ci = ci;
 		INIT_LIST_HEAD(&capsnap->ci_item);
@@ -464,15 +467,21 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
 
 		capsnap->follows = snapc->seq - 1;
 		capsnap->issued = __ceph_caps_issued(ci, NULL);
-		capsnap->dirty = __ceph_caps_dirty(ci);
+		capsnap->dirty = dirty;
 
 		capsnap->mode = inode->i_mode;
 		capsnap->uid = inode->i_uid;
 		capsnap->gid = inode->i_gid;
 
-		/* fixme? */
-		capsnap->xattr_blob = NULL;
-		capsnap->xattr_len = 0;
+		if (dirty & CEPH_CAP_XATTR_EXCL) {
+			__ceph_build_xattrs_blob(ci);
+			capsnap->xattr_blob =
+				ceph_buffer_get(ci->i_xattrs.blob);
+			capsnap->xattr_version = ci->i_xattrs.version;
+		} else {
+			capsnap->xattr_blob = NULL;
+			capsnap->xattr_version = 0;
+		}
 
 		/* dirty page count moved from _head to this cap_snap;
 		   all subsequent writes page dirties occur _after_ this
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 2482d696f0de..b33929d8f287 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -216,8 +216,7 @@ struct ceph_cap_snap {
 	uid_t uid;
 	gid_t gid;
 
-	void *xattr_blob;
-	int xattr_len;
+	struct ceph_buffer *xattr_blob;
 	u64 xattr_version;
 
 	u64 size;
@@ -229,8 +228,11 @@ struct ceph_cap_snap {
 
 static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
 {
-	if (atomic_dec_and_test(&capsnap->nref))
+	if (atomic_dec_and_test(&capsnap->nref)) {
+		if (capsnap->xattr_blob)
+			ceph_buffer_put(capsnap->xattr_blob);
 		kfree(capsnap);
+	}
 }
 
 /*
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 097a2654c00f..9578af610b73 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -485,6 +485,7 @@ void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
 		ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob;
 		ci->i_xattrs.prealloc_blob = NULL;
 		ci->i_xattrs.dirty = false;
+		ci->i_xattrs.version++;
 	}
 }
 
-- 
cgit v1.2.3


From ed326044489ed89c740c50a3df5dffc9c3b20b96 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 16 Aug 2010 13:37:31 -0700
Subject: ceph: queue cap snap writeback for realm children on snap update

When a realm is updated, we need to queue writeback on inodes in that
realm _and_ its children.  Otherwise, if the inode gets cowed on the
server, we can get a hang later due to out-of-sync cap/snap state.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/snap.c | 60 ++++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 37 insertions(+), 23 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 2cb190c2bd99..6bdbf3ae7082 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -548,6 +548,41 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
 	return 1;  /* caller may want to ceph_flush_snaps */
 }
 
+/*
+ * Queue cap_snaps for snap writeback for this realm and its children.
+ * Called under snap_rwsem, so realm topology won't change.
+ */
+static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
+{
+	struct ceph_inode_info *ci;
+	struct inode *lastinode = NULL;
+	struct ceph_snap_realm *child;
+
+	dout("queue_realm_cap_snaps %p %llx inodes\n", realm, realm->ino);
+
+	spin_lock(&realm->inodes_with_caps_lock);
+	list_for_each_entry(ci, &realm->inodes_with_caps,
+			    i_snap_realm_item) {
+		struct inode *inode = igrab(&ci->vfs_inode);
+		if (!inode)
+			continue;
+		spin_unlock(&realm->inodes_with_caps_lock);
+		if (lastinode)
+			iput(lastinode);
+		lastinode = inode;
+		ceph_queue_cap_snap(ci);
+		spin_lock(&realm->inodes_with_caps_lock);
+	}
+	spin_unlock(&realm->inodes_with_caps_lock);
+	if (lastinode)
+		iput(lastinode);
+
+	dout("queue_realm_cap_snaps %p %llx children\n", realm, realm->ino);
+	list_for_each_entry(child, &realm->children, child_item)
+		queue_realm_cap_snaps(child);
+
+	dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino);
+}
 
 /*
  * Parse and apply a snapblob "snap trace" from the MDS.  This specifies
@@ -598,29 +633,8 @@ more:
 		 *
 		 * ...unless it's a snap deletion!
 		 */
-		if (!deletion) {
-			struct ceph_inode_info *ci;
-			struct inode *lastinode = NULL;
-
-			spin_lock(&realm->inodes_with_caps_lock);
-			list_for_each_entry(ci, &realm->inodes_with_caps,
-					    i_snap_realm_item) {
-				struct inode *inode = igrab(&ci->vfs_inode);
-				if (!inode)
-					continue;
-				spin_unlock(&realm->inodes_with_caps_lock);
-				if (lastinode)
-					iput(lastinode);
-				lastinode = inode;
-				ceph_queue_cap_snap(ci);
-				spin_lock(&realm->inodes_with_caps_lock);
-			}
-			spin_unlock(&realm->inodes_with_caps_lock);
-			if (lastinode)
-				iput(lastinode);
-			dout("update_snap_trace cap_snaps queued\n");
-		}
-
+		if (!deletion)
+			queue_realm_cap_snaps(realm);
 	} else {
 		dout("update_snap_trace %llx %p seq %lld unchanged\n",
 		     realm->ino, realm, realm->seq);
-- 
cgit v1.2.3


From eb6bb1c5bdc6e455a9d16cb845cc65afc9b0a617 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Mon, 16 Aug 2010 09:21:27 -0700
Subject: ceph: direct requests in snapped namespace based on nonsnap parent

When making a request in the virtual snapdir or a snapped portion of the
namespace, we should choose the MDS based on the first nonsnap parent (and
its caps).  If that is not the best place, we will get forward hints to
find the right MDS in the cluster.  This fixes ESTALE errors when using
the .snap directory and namespace with multiple MDSs.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mds_client.c | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 397a47b696ce..8d1f11c7a5a2 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -560,6 +560,13 @@ static void __unregister_request(struct ceph_mds_client *mdsc,
  *
  * Called under mdsc->mutex.
  */
+struct dentry *get_nonsnap_parent(struct dentry *dentry)
+{
+	while (!IS_ROOT(dentry) && ceph_snap(dentry->d_inode) != CEPH_NOSNAP)
+		dentry = dentry->d_parent;
+	return dentry;
+}
+
 static int __choose_mds(struct ceph_mds_client *mdsc,
 			struct ceph_mds_request *req)
 {
@@ -590,14 +597,29 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
 	if (req->r_inode) {
 		inode = req->r_inode;
 	} else if (req->r_dentry) {
-		if (req->r_dentry->d_inode) {
+		struct inode *dir = req->r_dentry->d_parent->d_inode;
+
+		if (dir->i_sb != mdsc->client->sb) {
+			/* not this fs! */
+			inode = req->r_dentry->d_inode;
+		} else if (ceph_snap(dir) != CEPH_NOSNAP) {
+			/* direct snapped/virtual snapdir requests
+			 * based on parent dir inode */
+			struct dentry *dn =
+				get_nonsnap_parent(req->r_dentry->d_parent);
+			inode = dn->d_inode;
+			dout("__choose_mds using nonsnap parent %p\n", inode);
+		} else if (req->r_dentry->d_inode) {
+			/* dentry target */
 			inode = req->r_dentry->d_inode;
 		} else {
-			inode = req->r_dentry->d_parent->d_inode;
+			/* dir + name */
+			inode = dir;
 			hash = req->r_dentry->d_name.hash;
 			is_hash = true;
 		}
 	}
+
 	dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
 	     (int)hash, mode);
 	if (!inode)
-- 
cgit v1.2.3


From 679ceace848e9fd570678396ffe1ef034e00e82d Mon Sep 17 00:00:00 2001
From: Michael Rubin <mrubin@google.com>
Date: Fri, 20 Aug 2010 02:31:26 -0700
Subject: mm: exporting account_page_dirty

This allows code outside of the mm core to safely manipulate page state
and not worry about the other accounting. Not using these routines means
that some code will lose track of the accounting and we get bugs. This
has happened once already.

Signed-off-by: Michael Rubin <mrubin@google.com>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/addr.c      | 8 +-------
 mm/page-writeback.c | 1 +
 2 files changed, 2 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 5598a0d02295..420d46974ec8 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -105,13 +105,7 @@ static int ceph_set_page_dirty(struct page *page)
 	spin_lock_irq(&mapping->tree_lock);
 	if (page->mapping) {	/* Race with truncate? */
 		WARN_ON_ONCE(!PageUptodate(page));
-
-		if (mapping_cap_account_dirty(mapping)) {
-			__inc_zone_page_state(page, NR_FILE_DIRTY);
-			__inc_bdi_stat(mapping->backing_dev_info,
-					BDI_RECLAIMABLE);
-			task_io_account_write(PAGE_CACHE_SIZE);
-		}
+		account_page_dirtied(page, page->mapping);
 		radix_tree_tag_set(&mapping->page_tree,
 				page_index(page), PAGECACHE_TAG_DIRTY);
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 37498ef61548..849d0ccbe914 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1096,6 +1096,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
 		task_io_account_write(PAGE_CACHE_SIZE);
 	}
 }
+EXPORT_SYMBOL(account_page_dirtied);
 
 /*
  * For address_spaces which do not use buffers.  Just tag the page as dirty in
-- 
cgit v1.2.3


From faa9560ae76ef50a3cbfb1a6afc0343fd8172374 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Wed, 18 Aug 2010 12:25:49 -0400
Subject: fanotify: do not dereference inode_mark when it is unset

The fanotify code is supposed to get the group from the mark.  It accidentally
only used the inode_mark.  If the vfsmount_mark was set but not the inode_mark
it would deref the NULL inode_mark.  Get the group from the correct place.

Reported-by: Tvrtko Ursulin <tvrtko.ursulin@sophos.com>
Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fsnotify.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 3970392b2722..f3e3b355ba7f 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -148,13 +148,14 @@ static int send_to_group(struct inode *to_tell, struct vfsmount *mnt,
 			 const unsigned char *file_name,
 			 struct fsnotify_event **event)
 {
-	struct fsnotify_group *group = inode_mark->group;
+	struct fsnotify_group *group = NULL;
 	__u32 inode_test_mask = (mask & ~FS_EVENT_ON_CHILD);
 	__u32 vfsmount_test_mask = (mask & ~FS_EVENT_ON_CHILD);
 
-	pr_debug("%s: group=%p to_tell=%p mnt=%p mark=%p mask=%x data=%p"
-		 " data_is=%d cookie=%d event=%p\n", __func__, group, to_tell,
-		 mnt, inode_mark, mask, data, data_is, cookie, *event);
+	if (unlikely(!inode_mark && !vfsmount_mark)) {
+		BUG();
+		return 0;
+	}
 
 	/* clear ignored on inode modification */
 	if (mask & FS_MODIFY) {
@@ -168,18 +169,24 @@ static int send_to_group(struct inode *to_tell, struct vfsmount *mnt,
 
 	/* does the inode mark tell us to do something? */
 	if (inode_mark) {
+		group = inode_mark->group;
 		inode_test_mask &= inode_mark->mask;
 		inode_test_mask &= ~inode_mark->ignored_mask;
 	}
 
 	/* does the vfsmount_mark tell us to do something? */
 	if (vfsmount_mark) {
+		group = vfsmount_mark->group;
 		vfsmount_test_mask &= vfsmount_mark->mask;
 		vfsmount_test_mask &= ~vfsmount_mark->ignored_mask;
 		if (inode_mark)
 			vfsmount_test_mask &= ~inode_mark->ignored_mask;
 	}
 
+	pr_debug("%s: group=%p to_tell=%p mnt=%p mark=%p mask=%x data=%p"
+		 " data_is=%d cookie=%d event=%p\n", __func__, group, to_tell,
+		 mnt, inode_mark, mask, data, data_is, cookie, *event);
+
 	if (!inode_test_mask && !vfsmount_test_mask)
 		return 0;
 
-- 
cgit v1.2.3


From 5f3f259fa8f1d7969360acfad5307d03c2f53d63 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Wed, 18 Aug 2010 12:25:49 -0400
Subject: fsnotify: reset used_inode and used_vfsmount on each pass

The fsnotify main loop has 2 booleans which tell if a particular mark was
sent to the listeners or if it should be processed in the next pass.  The
problem is that the booleans were not reset on each traversal of the loop.
So marks could get skipped even when they were not sent to the notifiers.

Reported-by: Tvrtko Ursulin <tvrtko.ursulin@sophos.com>
Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fsnotify.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index f3e3b355ba7f..59dc7a02bd0c 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -220,7 +220,7 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
 	struct fsnotify_event *event = NULL;
 	struct vfsmount *mnt;
 	int idx, ret = 0;
-	bool used_inode = false, used_vfsmount = false;
+	bool used_inode, used_vfsmount;
 	/* global tests shouldn't care about events on child only the specific event */
 	__u32 test_mask = (mask & ~FS_EVENT_ON_CHILD);
 
@@ -261,6 +261,8 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
 	}
 
 	while (inode_node || vfsmount_node) {
+		used_inode = used_vfsmount = false;
+
 		if (inode_node) {
 			inode_mark = hlist_entry(srcu_dereference(inode_node, &fsnotify_mark_srcu),
 						 struct fsnotify_mark, i.i_list);
-- 
cgit v1.2.3


From 84e1ab4d875922c034db7f4f814ac445a20a14bd Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Wed, 18 Aug 2010 12:25:50 -0400
Subject: fsnotify: fix ignored mask handling between inode and vfsmount marks

The interesting 2 list lockstep walking didn't quite work out if the inode
marks only had ignores and the vfsmount list requested events.  The code to
shortcut list traversal would not run the inode list since it didn't have real
event requests.  This code forces inode list traversal when a vfsmount mark
matches the event type.  Maybe we could add an i_fsnotify_ignored_mask field
to struct inode to get the shortcut back, but it doesn't seem worth it to grow
struct inode again.

I bet with the recent changes to lock the way we do now it would actually not
be a major perf hit to just drop i_fsnotify_mark_mask altogether.  But that is
for another day.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fsnotify.c | 35 +++++++++++++++++------------------
 1 file changed, 17 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 59dc7a02bd0c..6f2777ce87a1 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -149,8 +149,8 @@ static int send_to_group(struct inode *to_tell, struct vfsmount *mnt,
 			 struct fsnotify_event **event)
 {
 	struct fsnotify_group *group = NULL;
-	__u32 inode_test_mask = (mask & ~FS_EVENT_ON_CHILD);
-	__u32 vfsmount_test_mask = (mask & ~FS_EVENT_ON_CHILD);
+	__u32 inode_test_mask = 0;
+	__u32 vfsmount_test_mask = 0;
 
 	if (unlikely(!inode_mark && !vfsmount_mark)) {
 		BUG();
@@ -170,12 +170,14 @@ static int send_to_group(struct inode *to_tell, struct vfsmount *mnt,
 	/* does the inode mark tell us to do something? */
 	if (inode_mark) {
 		group = inode_mark->group;
+		inode_test_mask = (mask & ~FS_EVENT_ON_CHILD);
 		inode_test_mask &= inode_mark->mask;
 		inode_test_mask &= ~inode_mark->ignored_mask;
 	}
 
 	/* does the vfsmount_mark tell us to do something? */
 	if (vfsmount_mark) {
+		vfsmount_test_mask = (mask & ~FS_EVENT_ON_CHILD);
 		group = vfsmount_mark->group;
 		vfsmount_test_mask &= vfsmount_mark->mask;
 		vfsmount_test_mask &= ~vfsmount_mark->ignored_mask;
@@ -183,9 +185,12 @@ static int send_to_group(struct inode *to_tell, struct vfsmount *mnt,
 			vfsmount_test_mask &= ~inode_mark->ignored_mask;
 	}
 
-	pr_debug("%s: group=%p to_tell=%p mnt=%p mark=%p mask=%x data=%p"
-		 " data_is=%d cookie=%d event=%p\n", __func__, group, to_tell,
-		 mnt, inode_mark, mask, data, data_is, cookie, *event);
+	pr_debug("%s: group=%p to_tell=%p mnt=%p mask=%x inode_mark=%p"
+		 " inode_test_mask=%x vfsmount_mark=%p vfsmount_test_mask=%x"
+		 " data=%p data_is=%d cookie=%d event=%p\n",
+		 __func__, group, to_tell, mnt, mask, inode_mark,
+		 inode_test_mask, vfsmount_mark, vfsmount_test_mask, data,
+		 data_is, cookie, *event);
 
 	if (!inode_test_mask && !vfsmount_test_mask)
 		return 0;
@@ -214,7 +219,7 @@ static int send_to_group(struct inode *to_tell, struct vfsmount *mnt,
 int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
 	     const unsigned char *file_name, u32 cookie)
 {
-	struct hlist_node *inode_node, *vfsmount_node;
+	struct hlist_node *inode_node = NULL, *vfsmount_node = NULL;
 	struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL;
 	struct fsnotify_group *inode_group, *vfsmount_group;
 	struct fsnotify_event *event = NULL;
@@ -245,19 +250,13 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
 	    (test_mask & to_tell->i_fsnotify_mask))
 		inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first,
 					      &fsnotify_mark_srcu);
-	else
-		inode_node = NULL;
 
-	if (mnt) {
-		if ((mask & FS_MODIFY) ||
-		    (test_mask & mnt->mnt_fsnotify_mask))
-			vfsmount_node = srcu_dereference(mnt->mnt_fsnotify_marks.first,
-							 &fsnotify_mark_srcu);
-		else
-			vfsmount_node = NULL;
-	} else {
-		mnt = NULL;
-		vfsmount_node = NULL;
+	if (mnt && ((mask & FS_MODIFY) ||
+		    (test_mask & mnt->mnt_fsnotify_mask))) {
+		vfsmount_node = srcu_dereference(mnt->mnt_fsnotify_marks.first,
+						 &fsnotify_mark_srcu);
+		inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first,
+					      &fsnotify_mark_srcu);
 	}
 
 	while (inode_node || vfsmount_node) {
-- 
cgit v1.2.3


From 2eebf582c9b3106abb9c33f4fc0a347fb9391037 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Wed, 18 Aug 2010 12:25:50 -0400
Subject: fanotify: flush outstanding perm requests on group destroy

When an fanotify listener is closing it may cause a deadlock between the
listener and the original task doing an fs operation.  If the original task
is waiting for a permissions response it will be holding the srcu lock.  The
listener cannot clean up and exit until after that srcu lock is syncronized.
Thus deadlock.  The fix introduced here is to stop accepting new permissions
events when a listener is shutting down and to grant permission for all
outstanding events.  Thus the original task will eventually release the srcu
lock and the listener can complete shutdown.

Reported-by: Andreas Gruenbacher <agruen@suse.de>
Cc: Andreas Gruenbacher <agruen@suse.de>
Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fanotify/fanotify_user.c | 27 +++++++++++++++++++++++++++
 include/linux/fanotify.h           |  7 -------
 include/linux/fsnotify_backend.h   |  1 +
 3 files changed, 28 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 032b837fcd11..b966b7230f47 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -195,6 +195,14 @@ static int prepare_for_access_response(struct fsnotify_group *group,
 	re->fd = fd;
 
 	mutex_lock(&group->fanotify_data.access_mutex);
+
+	if (group->fanotify_data.bypass_perm) {
+		mutex_unlock(&group->fanotify_data.access_mutex);
+		kmem_cache_free(fanotify_response_event_cache, re);
+		event->response = FAN_ALLOW;
+		return 0;
+	}
+		
 	list_add_tail(&re->list, &group->fanotify_data.access_list);
 	mutex_unlock(&group->fanotify_data.access_mutex);
 
@@ -364,9 +372,28 @@ static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t
 static int fanotify_release(struct inode *ignored, struct file *file)
 {
 	struct fsnotify_group *group = file->private_data;
+	struct fanotify_response_event *re, *lre;
 
 	pr_debug("%s: file=%p group=%p\n", __func__, file, group);
 
+#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
+	mutex_lock(&group->fanotify_data.access_mutex);
+
+	group->fanotify_data.bypass_perm = true;
+
+	list_for_each_entry_safe(re, lre, &group->fanotify_data.access_list, list) {
+		pr_debug("%s: found group=%p re=%p event=%p\n", __func__, group,
+			 re, re->event);
+
+		list_del_init(&re->list);
+		re->event->response = FAN_ALLOW;
+
+		kmem_cache_free(fanotify_response_event_cache, re);
+	}
+	mutex_unlock(&group->fanotify_data.access_mutex);
+
+	wake_up(&group->fanotify_data.access_waitq);
+#endif
 	/* matches the fanotify_init->fsnotify_alloc_group */
 	fsnotify_put_group(group);
 
diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h
index f0949a57ca9d..985435622ecd 100644
--- a/include/linux/fanotify.h
+++ b/include/linux/fanotify.h
@@ -95,11 +95,4 @@ struct fanotify_response {
 				(long)(meta)->event_len >= (long)FAN_EVENT_METADATA_LEN && \
 				(long)(meta)->event_len <= (long)(len))
 
-#ifdef __KERNEL__
-
-struct fanotify_wait {
-	struct fsnotify_event *event;
-	__s32 fd;
-};
-#endif /* __KERNEL__ */
 #endif /* _LINUX_FANOTIFY_H */
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index ed36fb57c426..e40190d16878 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -156,6 +156,7 @@ struct fsnotify_group {
 			struct mutex access_mutex;
 			struct list_head access_list;
 			wait_queue_head_t access_waitq;
+			bool bypass_perm; /* protected by access_mutex */
 #endif /* CONFIG_FANOTIFY_ACCESS_PERMISSIONS */
 			int f_flags;
 		} fanotify_data;
-- 
cgit v1.2.3


From ff8d6e983185ce19fa92bb836eb52b589957be65 Mon Sep 17 00:00:00 2001
From: Tvrtko Ursulin <tvrtko.ursulin@sophos.com>
Date: Fri, 20 Aug 2010 10:24:18 +0100
Subject: fanotify: drop duplicate pr_debug statement

This reminded me... you have two pr_debugs in fanotify_should_send_event
which output redundant information. Maybe you intended it like that so
it is selectable how much log spam you want, or if not you may want to
apply this patch.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@sophos.com>
Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fanotify/fanotify.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 756566fe8449..85366c78cc37 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -165,9 +165,6 @@ static bool fanotify_should_send_event(struct fsnotify_group *group,
 		 "mask=%x data=%p data_type=%d\n", __func__, group, to_tell,
 		 inode_mark, vfsmnt_mark, event_mask, data, data_type);
 
-	pr_debug("%s: group=%p vfsmount_mark=%p inode_mark=%p mask=%x\n",
-		 __func__, group, vfsmnt_mark, inode_mark, event_mask);
-
 	/* sorry, fanotify only gives a damn about files and dirs */
 	if (!S_ISREG(to_tell->i_mode) &&
 	    !S_ISDIR(to_tell->i_mode))
-- 
cgit v1.2.3


From 124514918b030d74f1f3e15483b7bf3b85268082 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Sun, 22 Aug 2010 21:33:32 -0700
Subject: ceph: don't improperly set dir complete when holding EXCL cap

If we hold the EXCL cap, we cannot trust the dir stats from the MDS (num
files, subdirs) and must not incorrectly conclude that the directory is
empty.  If we do, we get can bad results from lookup (bad ENOENT) and
bad readdir results.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/inode.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 5d893d31e399..3e6b52cb5ee8 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -677,6 +677,7 @@ static int fill_inode(struct inode *inode,
 		if (ci->i_files == 0 && ci->i_subdirs == 0 &&
 		    ceph_snap(inode) == CEPH_NOSNAP &&
 		    (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
+		    (issued & CEPH_CAP_FILE_EXCL) == 0 &&
 		    (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
 			dout(" marking %p complete (empty)\n", inode);
 			ci->i_ceph_flags |= CEPH_I_COMPLETE;
-- 
cgit v1.2.3


From 07a27e226d1ed210d2d4218bd0642b40f5405c6a Mon Sep 17 00:00:00 2001
From: Henry C Chang <henry_c_chang@tcloudcomputing.com>
Date: Sun, 22 Aug 2010 21:34:27 -0700
Subject: ceph: fix osd request lru adjustment when sending request

Fix argument order.  We want to move the item to the end of the list, not
change the position of the head.

Signed-off-by: Henry C Chang <henry_c_chang@tcloudcomputing.com>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/osd_client.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index bed6391e52c7..dfced1dacbcd 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -661,7 +661,7 @@ static int __send_request(struct ceph_osd_client *osdc,
 	reqhead->reassert_version = req->r_reassert_version;
 
 	req->r_stamp = jiffies;
-	list_move_tail(&osdc->req_lru, &req->r_req_lru_item);
+	list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
 
 	ceph_msg_get(req->r_request); /* send consumes a ref */
 	ceph_con_send(&req->r_osd->o_con, req->r_request);
-- 
cgit v1.2.3


From 3ec6bbcdb4e85403f2c5958876ca9492afdf4031 Mon Sep 17 00:00:00 2001
From: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Date: Mon, 23 Aug 2010 11:04:07 -0500
Subject: missing changes during ntlmv2/ntlmssp auth and sign

Signed-off-by: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsencrypt.c |  2 ++
 fs/cifs/sess.c        | 13 ++++++++-----
 2 files changed, 10 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index 051d00011ca3..eef78c24e0cc 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -553,6 +553,8 @@ calc_seckey(struct TCP_Server_Info *server)
 		return 1;
 	}
 
+	desc.tfm = tfm_arc4;
+
 	crypto_blkcipher_setkey(tfm_arc4,
 		server->session_key.data.ntlmv2.key, CIFS_CPHTXT_SIZE);
 	sg_init_one(&sgin, sec_key, CIFS_CPHTXT_SIZE);
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 41fc5328120d..4788e16a02cc 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -408,6 +408,8 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len,
 	/* BB spec says that if AvId field of MsvAvTimestamp is populated then
 		we must set the MIC field of the AUTHENTICATE_MESSAGE */
 
+	ses->server->ntlmssp.server_flags = le32_to_cpu(pblob->NegotiateFlags);
+
 	tioffset = cpu_to_le16(pblob->TargetInfoArray.BufferOffset);
 	tilen = cpu_to_le16(pblob->TargetInfoArray.Length);
 	ses->server->tilen = tilen;
@@ -440,12 +442,13 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
 	/* BB is NTLMV2 session security format easier to use here? */
 	flags = NTLMSSP_NEGOTIATE_56 |	NTLMSSP_REQUEST_TARGET |
 		NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE |
-		NTLMSSP_NEGOTIATE_NT_ONLY | NTLMSSP_NEGOTIATE_NTLM;
+		NTLMSSP_NEGOTIATE_NTLM;
 	if (ses->server->secMode &
-	   (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))
-		flags |= NTLMSSP_NEGOTIATE_SIGN;
-	if (ses->server->secMode & SECMODE_SIGN_REQUIRED)
-		flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN;
+	   (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) {
+		flags |= NTLMSSP_NEGOTIATE_SIGN |
+			NTLMSSP_NEGOTIATE_KEY_XCH |
+			NTLMSSP_NEGOTIATE_EXTENDED_SEC;
+	}
 
 	sec_blob->NegotiateFlags |= cpu_to_le32(flags);
 
-- 
cgit v1.2.3


From 24e6cf92fde1f140d8eb0bf7cd24c2c78149b6b2 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Mon, 23 Aug 2010 11:38:04 -0400
Subject: cifs: check for NULL session password

It's possible for a cifsSesInfo struct to have a NULL password, so we
need to check for that prior to running strncmp on it.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 18af707f00f1..ec0ea4a43bdb 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1673,6 +1673,7 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol)
 				    MAX_USERNAME_SIZE))
 				continue;
 			if (strlen(vol->username) != 0 &&
+			    ses->password != NULL &&
 			    strncmp(ses->password,
 				    vol->password ? vol->password : "",
 				    MAX_PASSWORD_SIZE))
-- 
cgit v1.2.3


From d17c701ce6a548a92f7f8a3cec20299465f36ee3 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 24 Aug 2010 11:42:52 +1000
Subject: xfs: unlock items before allowing the CIL to commit

When we commit a transaction using delayed logging, we need to
unlock the items in the transaciton before we unlock the CIL context
and allow it to be checkpointed. If we unlock them after we release
the CIl context lock, the CIL can checkpoint and complete before
we free the log items. This breaks stale buffer item unlock and
unpin processing as there is an implicit assumption that the unlock
will occur before the unpin.

Also, some log items need to store the LSN of the transaction commit
in the item (inodes and EFIs) and so can race with other transaction
completions if we don't prevent the CIL from checkpointing before
the unlock occurs.

Cc: <stable@kernel.org>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_log_cil.c    | 14 ++++++++++++++
 fs/xfs/xfs_trans.c      |  5 +----
 fs/xfs/xfs_trans_priv.h |  3 ++-
 3 files changed, 17 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 31e4ea2d19ac..ef8e7d9f445d 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -377,9 +377,23 @@ xfs_log_commit_cil(
 	xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
 	xfs_trans_unreserve_and_mod_sb(tp);
 
+	/*
+	 * Once all the items of the transaction have been copied to the CIL,
+	 * the items can be unlocked and freed.
+	 *
+	 * This needs to be done before we drop the CIL context lock because we
+	 * have to update state in the log items and unlock them before they go
+	 * to disk. If we don't, then the CIL checkpoint can race with us and
+	 * we can run checkpoint completion before we've updated and unlocked
+	 * the log items. This affects (at least) processing of stale buffers,
+	 * inodes and EFIs.
+	 */
+	xfs_trans_free_items(tp, *commit_lsn, 0);
+
 	/* check for background commit before unlock */
 	if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log))
 		push = 1;
+
 	up_read(&log->l_cilp->xc_ctx_lock);
 
 	/*
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index fdca7416c754..1c47edaea0d2 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1167,7 +1167,7 @@ xfs_trans_del_item(
  * Unlock all of the items of a transaction and free all the descriptors
  * of that transaction.
  */
-STATIC void
+void
 xfs_trans_free_items(
 	struct xfs_trans	*tp,
 	xfs_lsn_t		commit_lsn,
@@ -1653,9 +1653,6 @@ xfs_trans_commit_cil(
 		return error;
 
 	current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
-
-	/* xfs_trans_free_items() unlocks them first */
-	xfs_trans_free_items(tp, *commit_lsn, 0);
 	xfs_trans_free(tp);
 	return 0;
 }
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index e2d93d8ead7b..62da86c90de5 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -25,7 +25,8 @@ struct xfs_trans;
 
 void	xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *);
 void	xfs_trans_del_item(struct xfs_log_item *);
-
+void	xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
+				int flags);
 void	xfs_trans_item_committed(struct xfs_log_item *lip,
 				xfs_lsn_t commit_lsn, int aborted);
 void	xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
-- 
cgit v1.2.3


From 5b3eed756cd37255cad1181bd86bfd0977e97953 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 24 Aug 2010 11:42:41 +1000
Subject: xfs: ensure we mark all inodes in a freed cluster XFS_ISTALE

Under heavy load parallel metadata loads (e.g. dbench), we can fail
to mark all the inodes in a cluster being freed as XFS_ISTALE as we
skip inodes we cannot get the XFS_ILOCK_EXCL or the flush lock on.
When this happens and the inode cluster buffer has already been
marked stale and freed, inode reclaim can try to write the inode out
as it is dirty and not marked stale. This can result in writing th
metadata to an freed extent, or in the case it has already
been overwritten trigger a magic number check failure and return an
EUCLEAN error such as:

Filesystem "ram0": inode 0x442ba1 background reclaim flush failed with 117

Fix this by ensuring that we hoover up all in memory inodes in the
cluster and mark them XFS_ISTALE when freeing the cluster.

Cc: <stable@kernel.org>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_inode.c | 49 ++++++++++++++++++++++++++-----------------------
 1 file changed, 26 insertions(+), 23 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 68415cb4f23c..34798f391c49 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1914,6 +1914,11 @@ xfs_iunlink_remove(
 	return 0;
 }
 
+/*
+ * A big issue when freeing the inode cluster is is that we _cannot_ skip any
+ * inodes that are in memory - they all must be marked stale and attached to
+ * the cluster buffer.
+ */
 STATIC void
 xfs_ifree_cluster(
 	xfs_inode_t	*free_ip,
@@ -1945,8 +1950,6 @@ xfs_ifree_cluster(
 	}
 
 	for (j = 0; j < nbufs; j++, inum += ninodes) {
-		int	found = 0;
-
 		blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
 					 XFS_INO_TO_AGBNO(mp, inum));
 
@@ -1965,7 +1968,9 @@ xfs_ifree_cluster(
 		/*
 		 * Walk the inodes already attached to the buffer and mark them
 		 * stale. These will all have the flush locks held, so an
-		 * in-memory inode walk can't lock them.
+		 * in-memory inode walk can't lock them. By marking them all
+		 * stale first, we will not attempt to lock them in the loop
+		 * below as the XFS_ISTALE flag will be set.
 		 */
 		lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
 		while (lip) {
@@ -1977,11 +1982,11 @@ xfs_ifree_cluster(
 							&iip->ili_flush_lsn,
 							&iip->ili_item.li_lsn);
 				xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
-				found++;
 			}
 			lip = lip->li_bio_list;
 		}
 
+
 		/*
 		 * For each inode in memory attempt to add it to the inode
 		 * buffer and set it up for being staled on buffer IO
@@ -1993,6 +1998,7 @@ xfs_ifree_cluster(
 		 * even trying to lock them.
 		 */
 		for (i = 0; i < ninodes; i++) {
+retry:
 			read_lock(&pag->pag_ici_lock);
 			ip = radix_tree_lookup(&pag->pag_ici_root,
 					XFS_INO_TO_AGINO(mp, (inum + i)));
@@ -2003,38 +2009,36 @@ xfs_ifree_cluster(
 				continue;
 			}
 
-			/* don't try to lock/unlock the current inode */
+			/*
+			 * Don't try to lock/unlock the current inode, but we
+			 * _cannot_ skip the other inodes that we did not find
+			 * in the list attached to the buffer and are not
+			 * already marked stale. If we can't lock it, back off
+			 * and retry.
+			 */
 			if (ip != free_ip &&
 			    !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
 				read_unlock(&pag->pag_ici_lock);
-				continue;
+				delay(1);
+				goto retry;
 			}
 			read_unlock(&pag->pag_ici_lock);
 
-			if (!xfs_iflock_nowait(ip)) {
-				if (ip != free_ip)
-					xfs_iunlock(ip, XFS_ILOCK_EXCL);
-				continue;
-			}
-
+			xfs_iflock(ip);
 			xfs_iflags_set(ip, XFS_ISTALE);
-			if (xfs_inode_clean(ip)) {
-				ASSERT(ip != free_ip);
-				xfs_ifunlock(ip);
-				xfs_iunlock(ip, XFS_ILOCK_EXCL);
-				continue;
-			}
 
+			/*
+			 * we don't need to attach clean inodes or those only
+			 * with unlogged changes (which we throw away, anyway).
+			 */
 			iip = ip->i_itemp;
-			if (!iip) {
-				/* inode with unlogged changes only */
+			if (!iip || xfs_inode_clean(ip)) {
 				ASSERT(ip != free_ip);
 				ip->i_update_core = 0;
 				xfs_ifunlock(ip);
 				xfs_iunlock(ip, XFS_ILOCK_EXCL);
 				continue;
 			}
-			found++;
 
 			iip->ili_last_fields = iip->ili_format.ilf_fields;
 			iip->ili_format.ilf_fields = 0;
@@ -2049,8 +2053,7 @@ xfs_ifree_cluster(
 				xfs_iunlock(ip, XFS_ILOCK_EXCL);
 		}
 
-		if (found)
-			xfs_trans_stale_inode_buf(tp, bp);
+		xfs_trans_stale_inode_buf(tp, bp);
 		xfs_trans_binval(tp, bp);
 	}
 
-- 
cgit v1.2.3


From 4536f2ad8b330453d7ebec0746c4374eadd649b1 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 24 Aug 2010 11:42:30 +1000
Subject: xfs: fix untrusted inode number lookup

Commit 7124fe0a5b619d65b739477b3b55a20bf805b06d ("xfs: validate untrusted inode
numbers during lookup") changes the inode lookup code to do btree lookups for
untrusted inode numbers. This change made an invalid assumption about the
alignment of inodes and hence incorrectly calculated the first inode in the
cluster. As a result, some inode numbers were being incorrectly considered
invalid when they were actually valid.

The issue was not picked up by the xfstests suite because it always runs fsr
and dump (the two utilities that utilise the bulkstat interface) on cache hot
inodes and hence the lookup code in the cold cache path was not sufficiently
exercised to uncover this intermittent problem.

Fix the issue by relaxing the btree lookup criteria and then checking if the
record returned contains the inode number we are lookup for. If it we get an
incorrect record, then the inode number is invalid.

Cc: <stable@kernel.org>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_ialloc.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index abf80ae1e95b..5371d2dc360e 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -1213,7 +1213,6 @@ xfs_imap_lookup(
 	struct xfs_inobt_rec_incore rec;
 	struct xfs_btree_cur	*cur;
 	struct xfs_buf		*agbp;
-	xfs_agino_t		startino;
 	int			error;
 	int			i;
 
@@ -1227,13 +1226,13 @@ xfs_imap_lookup(
 	}
 
 	/*
-	 * derive and lookup the exact inode record for the given agino. If the
-	 * record cannot be found, then it's an invalid inode number and we
-	 * should abort.
+	 * Lookup the inode record for the given agino. If the record cannot be
+	 * found, then it's an invalid inode number and we should abort. Once
+	 * we have a record, we need to ensure it contains the inode number
+	 * we are looking up.
 	 */
 	cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
-	startino = agino & ~(XFS_IALLOC_INODES(mp) - 1);
-	error = xfs_inobt_lookup(cur, startino, XFS_LOOKUP_EQ, &i);
+	error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i);
 	if (!error) {
 		if (i)
 			error = xfs_inobt_get_rec(cur, &rec, &i);
@@ -1246,6 +1245,11 @@ xfs_imap_lookup(
 	if (error)
 		return error;
 
+	/* check that the returned record contains the required inode */
+	if (rec.ir_startino > agino ||
+	    rec.ir_startino + XFS_IALLOC_INODES(mp) <= agino)
+		return EINVAL;
+
 	/* for untrusted inodes check it is allocated first */
 	if ((flags & XFS_IGET_UNTRUSTED) &&
 	    (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino)))
-- 
cgit v1.2.3


From efceab1d563153a2b1a6e7d35376241a48126989 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 24 Aug 2010 11:44:56 +1000
Subject: xfs: handle negative wbc->nr_to_write during sync writeback

During data integrity (WB_SYNC_ALL) writeback, wbc->nr_to_write will
go negative on inodes with more than 1024 dirty pages due to
implementation details of write_cache_pages(). Currently XFS will
abort page clustering in writeback once nr_to_write drops below
zero, and so for data integrity writeback we will do very
inefficient page at a time allocation and IO submission for inodes
with large numbers of dirty pages.

Fix this by only aborting the page clustering code when
wbc->nr_to_write is negative and the sync mode is WB_SYNC_NONE.

Cc: <stable@kernel.org>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/linux-2.6/xfs_aops.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 15412fe15c3a..528be1ba1402 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -852,8 +852,8 @@ xfs_convert_page(
 		SetPageUptodate(page);
 
 	if (count) {
-		wbc->nr_to_write--;
-		if (wbc->nr_to_write <= 0)
+		if (--wbc->nr_to_write <= 0 &&
+		    wbc->sync_mode == WB_SYNC_NONE)
 			done = 1;
 	}
 	xfs_start_page_writeback(page, !page_dirty, count);
-- 
cgit v1.2.3


From 2fe33661fcd79d4c53022509f7223d526b5fa233 Mon Sep 17 00:00:00 2001
From: Stuart Brodsky <sbrodsky@sgi.com>
Date: Tue, 24 Aug 2010 11:46:05 +1000
Subject: xfs: ensure f_ffree returned by statfs() is non-negative

Because of delayed updates to sb_icount field in the super block, it
is possible to allocate over maxicount number of inodes.  This
causes the arithmetic to calculate a negative number of free inodes
in user commands like df or stat -f.

Since maxicount is a somewhat arbitrary number, a slight over
allocation is not critical but user commands should be displayed as
0 or greater and never go negative.  To do this the value in the
stats buffer f_ffree is capped to never go negative.

[ Modified to use max_t as per Christoph's comment. ]

Signed-off-by: Stu Brodsky <sbrodsky@sgi.com>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/linux-2.6/xfs_super.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 15c35b62ff14..c6b24e7c308a 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1226,6 +1226,7 @@ xfs_fs_statfs(
 	struct xfs_inode	*ip = XFS_I(dentry->d_inode);
 	__uint64_t		fakeinos, id;
 	xfs_extlen_t		lsize;
+	__int64_t		ffree;
 
 	statp->f_type = XFS_SB_MAGIC;
 	statp->f_namelen = MAXNAMELEN - 1;
@@ -1249,7 +1250,11 @@ xfs_fs_statfs(
 		statp->f_files = min_t(typeof(statp->f_files),
 					statp->f_files,
 					mp->m_maxicount);
-	statp->f_ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree);
+
+	/* make sure statp->f_ffree does not underflow */
+	ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree);
+	statp->f_ffree = max_t(__int64_t, ffree, 0);
+
 	spin_unlock(&mp->m_sb_lock);
 
 	if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) ||
-- 
cgit v1.2.3


From 1a387d3be2b30c90f20d49a3497a8fc0693a9d18 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 24 Aug 2010 11:46:31 +1000
Subject: xfs: dummy transactions should not dirty VFS state

When we  need to cover the log, we issue dummy transactions to ensure
the current log tail is on disk. Unfortunately we currently use the
root inode in the dummy transaction, and the act of committing the
transaction dirties the inode at the VFS level.

As a result, the VFS writeback of the dirty inode will prevent the
filesystem from idling long enough for the log covering state
machine to complete. The state machine gets stuck in a loop issuing
new dummy transactions to cover the log and never makes progress.

To avoid this problem, the dummy transactions should not cause
externally visible state changes. To ensure this occurs, make sure
that dummy transactions log an unchanging field in the superblock as
it's state is never propagated outside the filesystem. This allows
the log covering state machine to complete successfully and the
filesystem now correctly enters a fully idle state about 90s after
the last modification was made.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/linux-2.6/xfs_super.c |  2 +-
 fs/xfs/linux-2.6/xfs_sync.c  | 42 ++++++------------------------------------
 fs/xfs/xfs_fsops.c           | 31 ++++++++++++++++++-------------
 fs/xfs/xfs_fsops.h           |  2 +-
 4 files changed, 26 insertions(+), 51 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index c6b24e7c308a..a4e07974955b 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1407,7 +1407,7 @@ xfs_fs_freeze(
 
 	xfs_save_resvblks(mp);
 	xfs_quiesce_attr(mp);
-	return -xfs_fs_log_dummy(mp);
+	return -xfs_fs_log_dummy(mp, SYNC_WAIT);
 }
 
 STATIC int
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index dfcbd98d1599..d59c4a65d492 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -34,6 +34,7 @@
 #include "xfs_inode_item.h"
 #include "xfs_quota.h"
 #include "xfs_trace.h"
+#include "xfs_fsops.h"
 
 #include <linux/kthread.h>
 #include <linux/freezer.h>
@@ -340,38 +341,6 @@ xfs_sync_attr(
 				     XFS_ICI_NO_TAG, 0, NULL);
 }
 
-STATIC int
-xfs_commit_dummy_trans(
-	struct xfs_mount	*mp,
-	uint			flags)
-{
-	struct xfs_inode	*ip = mp->m_rootip;
-	struct xfs_trans	*tp;
-	int			error;
-
-	/*
-	 * Put a dummy transaction in the log to tell recovery
-	 * that all others are OK.
-	 */
-	tp = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
-	error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
-	if (error) {
-		xfs_trans_cancel(tp, 0);
-		return error;
-	}
-
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
-
-	xfs_trans_ijoin(tp, ip);
-	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-	error = xfs_trans_commit(tp, 0);
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
-	/* the log force ensures this transaction is pushed to disk */
-	xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
-	return error;
-}
-
 STATIC int
 xfs_sync_fsdata(
 	struct xfs_mount	*mp)
@@ -432,7 +401,7 @@ xfs_quiesce_data(
 
 	/* mark the log as covered if needed */
 	if (xfs_log_need_covered(mp))
-		error2 = xfs_commit_dummy_trans(mp, SYNC_WAIT);
+		error2 = xfs_fs_log_dummy(mp, SYNC_WAIT);
 
 	/* flush data-only devices */
 	if (mp->m_rtdev_targp)
@@ -563,7 +532,7 @@ xfs_flush_inodes(
 /*
  * Every sync period we need to unpin all items, reclaim inodes and sync
  * disk quotas.  We might need to cover the log to indicate that the
- * filesystem is idle.
+ * filesystem is idle and not frozen.
  */
 STATIC void
 xfs_sync_worker(
@@ -577,8 +546,9 @@ xfs_sync_worker(
 		xfs_reclaim_inodes(mp, 0);
 		/* dgc: errors ignored here */
 		error = xfs_qm_sync(mp, SYNC_TRYLOCK);
-		if (xfs_log_need_covered(mp))
-			error = xfs_commit_dummy_trans(mp, 0);
+		if (mp->m_super->s_frozen == SB_UNFROZEN &&
+		    xfs_log_need_covered(mp))
+			error = xfs_fs_log_dummy(mp, 0);
 	}
 	mp->m_sync_seq++;
 	wake_up(&mp->m_wait_single_sync_task);
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index dbca5f5c37ba..43b1d5699335 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -604,31 +604,36 @@ out:
 	return 0;
 }
 
+/*
+ * Dump a transaction into the log that contains no real change. This is needed
+ * to be able to make the log dirty or stamp the current tail LSN into the log
+ * during the covering operation.
+ *
+ * We cannot use an inode here for this - that will push dirty state back up
+ * into the VFS and then periodic inode flushing will prevent log covering from
+ * making progress. Hence we log a field in the superblock instead.
+ */
 int
 xfs_fs_log_dummy(
-	xfs_mount_t	*mp)
+	xfs_mount_t	*mp,
+	int		flags)
 {
 	xfs_trans_t	*tp;
-	xfs_inode_t	*ip;
 	int		error;
 
 	tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP);
-	error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
+	error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
+					XFS_DEFAULT_LOG_COUNT);
 	if (error) {
 		xfs_trans_cancel(tp, 0);
 		return error;
 	}
 
-	ip = mp->m_rootip;
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
-
-	xfs_trans_ijoin(tp, ip);
-	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-	xfs_trans_set_sync(tp);
-	error = xfs_trans_commit(tp, 0);
-
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-	return error;
+	/* log the UUID because it is an unchanging field */
+	xfs_mod_sb(tp, XFS_SB_UUID);
+	if (flags & SYNC_WAIT)
+		xfs_trans_set_sync(tp);
+	return xfs_trans_commit(tp, 0);
 }
 
 int
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index 88435e0a77c9..a786c5212c1e 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
 extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval,
 				xfs_fsop_resblks_t *outval);
 extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags);
-extern int xfs_fs_log_dummy(xfs_mount_t *mp);
+extern int xfs_fs_log_dummy(xfs_mount_t *mp, int flags);
 
 #endif	/* __XFS_FSOPS_H__ */
-- 
cgit v1.2.3


From a44f13edf0ebb4e41942d0f16ca80489dcf6659d Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 24 Aug 2010 11:40:03 +1000
Subject: xfs: Reduce log force overhead for delayed logging

Delayed logging adds some serialisation to the log force process to
ensure that it does not deference a bad commit context structure
when determining if a CIL push is necessary or not. It does this by
grabing the CIL context lock exclusively, then dropping it before
pushing the CIL if necessary. This causes serialisation of all log
forces and pushes regardless of whether a force is necessary or not.
As a result fsync heavy workloads (like dbench) can be significantly
slower with delayed logging than without.

To avoid this penalty, copy the current sequence from the context to
the CIL structure when they are swapped. This allows us to do
unlocked checks on the current sequence without having to worry
about dereferencing context structures that may have already been
freed. Hence we can remove the CIL context locking in the forcing
code and only call into the push code if the current context matches
the sequence we need to force.

By passing the sequence into the push code, we can check the
sequence again once we have the CIL lock held exclusive and abort if
the sequence has already been pushed. This avoids a lock round-trip
and unnecessary CIL pushes when we have racing push calls.

The result is that the regression in dbench performance goes away -
this change improves dbench performance on a ramdisk from ~2100MB/s
to ~2500MB/s. This compares favourably to not using delayed logging
which retuns ~2500MB/s for the same workload.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_log.c      |   7 +-
 fs/xfs/xfs_log_cil.c  | 245 +++++++++++++++++++++++++++-----------------------
 fs/xfs/xfs_log_priv.h |  13 ++-
 3 files changed, 147 insertions(+), 118 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 925d572bf0f4..33f718f92a48 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -3015,7 +3015,8 @@ _xfs_log_force(
 
 	XFS_STATS_INC(xs_log_force);
 
-	xlog_cil_push(log, 1);
+	if (log->l_cilp)
+		xlog_cil_force(log);
 
 	spin_lock(&log->l_icloglock);
 
@@ -3167,7 +3168,7 @@ _xfs_log_force_lsn(
 	XFS_STATS_INC(xs_log_force);
 
 	if (log->l_cilp) {
-		lsn = xlog_cil_push_lsn(log, lsn);
+		lsn = xlog_cil_force_lsn(log, lsn);
 		if (lsn == NULLCOMMITLSN)
 			return 0;
 	}
@@ -3724,7 +3725,7 @@ xfs_log_force_umount(
 	 * call below.
 	 */
 	if (!logerror && (mp->m_flags & XFS_MOUNT_DELAYLOG))
-		xlog_cil_push(log, 1);
+		xlog_cil_force(log);
 
 	/*
 	 * We must hold both the GRANT lock and the LOG lock,
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index ef8e7d9f445d..9768f2437bb3 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -68,6 +68,7 @@ xlog_cil_init(
 	ctx->sequence = 1;
 	ctx->cil = cil;
 	cil->xc_ctx = ctx;
+	cil->xc_current_sequence = ctx->sequence;
 
 	cil->xc_log = log;
 	log->l_cilp = cil;
@@ -320,94 +321,6 @@ xlog_cil_free_logvec(
 	}
 }
 
-/*
- * Commit a transaction with the given vector to the Committed Item List.
- *
- * To do this, we need to format the item, pin it in memory if required and
- * account for the space used by the transaction. Once we have done that we
- * need to release the unused reservation for the transaction, attach the
- * transaction to the checkpoint context so we carry the busy extents through
- * to checkpoint completion, and then unlock all the items in the transaction.
- *
- * For more specific information about the order of operations in
- * xfs_log_commit_cil() please refer to the comments in
- * xfs_trans_commit_iclog().
- *
- * Called with the context lock already held in read mode to lock out
- * background commit, returns without it held once background commits are
- * allowed again.
- */
-int
-xfs_log_commit_cil(
-	struct xfs_mount	*mp,
-	struct xfs_trans	*tp,
-	struct xfs_log_vec	*log_vector,
-	xfs_lsn_t		*commit_lsn,
-	int			flags)
-{
-	struct log		*log = mp->m_log;
-	int			log_flags = 0;
-	int			push = 0;
-
-	if (flags & XFS_TRANS_RELEASE_LOG_RES)
-		log_flags = XFS_LOG_REL_PERM_RESERV;
-
-	if (XLOG_FORCED_SHUTDOWN(log)) {
-		xlog_cil_free_logvec(log_vector);
-		return XFS_ERROR(EIO);
-	}
-
-	/* lock out background commit */
-	down_read(&log->l_cilp->xc_ctx_lock);
-	xlog_cil_format_items(log, log_vector, tp->t_ticket, commit_lsn);
-
-	/* check we didn't blow the reservation */
-	if (tp->t_ticket->t_curr_res < 0)
-		xlog_print_tic_res(log->l_mp, tp->t_ticket);
-
-	/* attach the transaction to the CIL if it has any busy extents */
-	if (!list_empty(&tp->t_busy)) {
-		spin_lock(&log->l_cilp->xc_cil_lock);
-		list_splice_init(&tp->t_busy,
-					&log->l_cilp->xc_ctx->busy_extents);
-		spin_unlock(&log->l_cilp->xc_cil_lock);
-	}
-
-	tp->t_commit_lsn = *commit_lsn;
-	xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
-	xfs_trans_unreserve_and_mod_sb(tp);
-
-	/*
-	 * Once all the items of the transaction have been copied to the CIL,
-	 * the items can be unlocked and freed.
-	 *
-	 * This needs to be done before we drop the CIL context lock because we
-	 * have to update state in the log items and unlock them before they go
-	 * to disk. If we don't, then the CIL checkpoint can race with us and
-	 * we can run checkpoint completion before we've updated and unlocked
-	 * the log items. This affects (at least) processing of stale buffers,
-	 * inodes and EFIs.
-	 */
-	xfs_trans_free_items(tp, *commit_lsn, 0);
-
-	/* check for background commit before unlock */
-	if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log))
-		push = 1;
-
-	up_read(&log->l_cilp->xc_ctx_lock);
-
-	/*
-	 * We need to push CIL every so often so we don't cache more than we
-	 * can fit in the log. The limit really is that a checkpoint can't be
-	 * more than half the log (the current checkpoint is not allowed to
-	 * overwrite the previous checkpoint), but commit latency and memory
-	 * usage limit this to a smaller size in most cases.
-	 */
-	if (push)
-		xlog_cil_push(log, 0);
-	return 0;
-}
-
 /*
  * Mark all items committed and clear busy extents. We free the log vector
  * chains in a separate pass so that we unpin the log items as quickly as
@@ -441,13 +354,23 @@ xlog_cil_committed(
 }
 
 /*
- * Push the Committed Item List to the log. If the push_now flag is not set,
- * then it is a background flush and so we can chose to ignore it.
+ * Push the Committed Item List to the log. If @push_seq flag is zero, then it
+ * is a background flush and so we can chose to ignore it. Otherwise, if the
+ * current sequence is the same as @push_seq we need to do a flush. If
+ * @push_seq is less than the current sequence, then it has already been
+ * flushed and we don't need to do anything - the caller will wait for it to
+ * complete if necessary.
+ *
+ * @push_seq is a value rather than a flag because that allows us to do an
+ * unlocked check of the sequence number for a match. Hence we can allows log
+ * forces to run racily and not issue pushes for the same sequence twice. If we
+ * get a race between multiple pushes for the same sequence they will block on
+ * the first one and then abort, hence avoiding needless pushes.
  */
-int
+STATIC int
 xlog_cil_push(
 	struct log		*log,
-	int			push_now)
+	xfs_lsn_t		push_seq)
 {
 	struct xfs_cil		*cil = log->l_cilp;
 	struct xfs_log_vec	*lv;
@@ -467,12 +390,14 @@ xlog_cil_push(
 	if (!cil)
 		return 0;
 
+	ASSERT(!push_seq || push_seq <= cil->xc_ctx->sequence);
+
 	new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);
 	new_ctx->ticket = xlog_cil_ticket_alloc(log);
 
 	/* lock out transaction commit, but don't block on background push */
 	if (!down_write_trylock(&cil->xc_ctx_lock)) {
-		if (!push_now)
+		if (!push_seq)
 			goto out_free_ticket;
 		down_write(&cil->xc_ctx_lock);
 	}
@@ -483,7 +408,11 @@ xlog_cil_push(
 		goto out_skip;
 
 	/* check for spurious background flush */
-	if (!push_now && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
+	if (!push_seq && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
+		goto out_skip;
+
+	/* check for a previously pushed seqeunce */
+	if (push_seq < cil->xc_ctx->sequence)
 		goto out_skip;
 
 	/*
@@ -528,6 +457,13 @@ xlog_cil_push(
 	new_ctx->cil = cil;
 	cil->xc_ctx = new_ctx;
 
+	/*
+	 * mirror the new sequence into the cil structure so that we can do
+	 * unlocked checks against the current sequence in log forces without
+	 * risking deferencing a freed context pointer.
+	 */
+	cil->xc_current_sequence = new_ctx->sequence;
+
 	/*
 	 * The switch is now done, so we can drop the context lock and move out
 	 * of a shared context. We can't just go straight to the commit record,
@@ -639,6 +575,94 @@ out_abort:
 	return XFS_ERROR(EIO);
 }
 
+/*
+ * Commit a transaction with the given vector to the Committed Item List.
+ *
+ * To do this, we need to format the item, pin it in memory if required and
+ * account for the space used by the transaction. Once we have done that we
+ * need to release the unused reservation for the transaction, attach the
+ * transaction to the checkpoint context so we carry the busy extents through
+ * to checkpoint completion, and then unlock all the items in the transaction.
+ *
+ * For more specific information about the order of operations in
+ * xfs_log_commit_cil() please refer to the comments in
+ * xfs_trans_commit_iclog().
+ *
+ * Called with the context lock already held in read mode to lock out
+ * background commit, returns without it held once background commits are
+ * allowed again.
+ */
+int
+xfs_log_commit_cil(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xfs_log_vec	*log_vector,
+	xfs_lsn_t		*commit_lsn,
+	int			flags)
+{
+	struct log		*log = mp->m_log;
+	int			log_flags = 0;
+	int			push = 0;
+
+	if (flags & XFS_TRANS_RELEASE_LOG_RES)
+		log_flags = XFS_LOG_REL_PERM_RESERV;
+
+	if (XLOG_FORCED_SHUTDOWN(log)) {
+		xlog_cil_free_logvec(log_vector);
+		return XFS_ERROR(EIO);
+	}
+
+	/* lock out background commit */
+	down_read(&log->l_cilp->xc_ctx_lock);
+	xlog_cil_format_items(log, log_vector, tp->t_ticket, commit_lsn);
+
+	/* check we didn't blow the reservation */
+	if (tp->t_ticket->t_curr_res < 0)
+		xlog_print_tic_res(log->l_mp, tp->t_ticket);
+
+	/* attach the transaction to the CIL if it has any busy extents */
+	if (!list_empty(&tp->t_busy)) {
+		spin_lock(&log->l_cilp->xc_cil_lock);
+		list_splice_init(&tp->t_busy,
+					&log->l_cilp->xc_ctx->busy_extents);
+		spin_unlock(&log->l_cilp->xc_cil_lock);
+	}
+
+	tp->t_commit_lsn = *commit_lsn;
+	xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
+	xfs_trans_unreserve_and_mod_sb(tp);
+
+	/*
+	 * Once all the items of the transaction have been copied to the CIL,
+	 * the items can be unlocked and freed.
+	 *
+	 * This needs to be done before we drop the CIL context lock because we
+	 * have to update state in the log items and unlock them before they go
+	 * to disk. If we don't, then the CIL checkpoint can race with us and
+	 * we can run checkpoint completion before we've updated and unlocked
+	 * the log items. This affects (at least) processing of stale buffers,
+	 * inodes and EFIs.
+	 */
+	xfs_trans_free_items(tp, *commit_lsn, 0);
+
+	/* check for background commit before unlock */
+	if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log))
+		push = 1;
+
+	up_read(&log->l_cilp->xc_ctx_lock);
+
+	/*
+	 * We need to push CIL every so often so we don't cache more than we
+	 * can fit in the log. The limit really is that a checkpoint can't be
+	 * more than half the log (the current checkpoint is not allowed to
+	 * overwrite the previous checkpoint), but commit latency and memory
+	 * usage limit this to a smaller size in most cases.
+	 */
+	if (push)
+		xlog_cil_push(log, 0);
+	return 0;
+}
+
 /*
  * Conditionally push the CIL based on the sequence passed in.
  *
@@ -653,39 +677,34 @@ out_abort:
  * commit lsn is there. It'll be empty, so this is broken for now.
  */
 xfs_lsn_t
-xlog_cil_push_lsn(
+xlog_cil_force_lsn(
 	struct log	*log,
-	xfs_lsn_t	push_seq)
+	xfs_lsn_t	sequence)
 {
 	struct xfs_cil		*cil = log->l_cilp;
 	struct xfs_cil_ctx	*ctx;
 	xfs_lsn_t		commit_lsn = NULLCOMMITLSN;
 
-restart:
-	down_write(&cil->xc_ctx_lock);
-	ASSERT(push_seq <= cil->xc_ctx->sequence);
-
-	/* check to see if we need to force out the current context */
-	if (push_seq == cil->xc_ctx->sequence) {
-		up_write(&cil->xc_ctx_lock);
-		xlog_cil_push(log, 1);
-		goto restart;
-	}
+	ASSERT(sequence <= cil->xc_current_sequence);
+
+	/*
+	 * check to see if we need to force out the current context.
+	 * xlog_cil_push() handles racing pushes for the same sequence,
+	 * so no need to deal with it here.
+	 */
+	if (sequence == cil->xc_current_sequence)
+		xlog_cil_push(log, sequence);
 
 	/*
 	 * See if we can find a previous sequence still committing.
-	 * We can drop the flush lock as soon as we have the cil lock
-	 * because we are now only comparing contexts protected by
-	 * the cil lock.
-	 *
 	 * We need to wait for all previous sequence commits to complete
 	 * before allowing the force of push_seq to go ahead. Hence block
 	 * on commits for those as well.
 	 */
+restart:
 	spin_lock(&cil->xc_cil_lock);
-	up_write(&cil->xc_ctx_lock);
 	list_for_each_entry(ctx, &cil->xc_committing, committing) {
-		if (ctx->sequence > push_seq)
+		if (ctx->sequence > sequence)
 			continue;
 		if (!ctx->commit_lsn) {
 			/*
@@ -695,7 +714,7 @@ restart:
 			sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
 			goto restart;
 		}
-		if (ctx->sequence != push_seq)
+		if (ctx->sequence != sequence)
 			continue;
 		/* found it! */
 		commit_lsn = ctx->commit_lsn;
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 8c072618965c..ced52b98b322 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -422,6 +422,7 @@ struct xfs_cil {
 	struct rw_semaphore	xc_ctx_lock;
 	struct list_head	xc_committing;
 	sv_t			xc_commit_wait;
+	xfs_lsn_t		xc_current_sequence;
 };
 
 /*
@@ -562,8 +563,16 @@ int	xlog_cil_init(struct log *log);
 void	xlog_cil_init_post_recovery(struct log *log);
 void	xlog_cil_destroy(struct log *log);
 
-int	xlog_cil_push(struct log *log, int push_now);
-xfs_lsn_t xlog_cil_push_lsn(struct log *log, xfs_lsn_t push_sequence);
+/*
+ * CIL force routines
+ */
+xfs_lsn_t xlog_cil_force_lsn(struct log *log, xfs_lsn_t sequence);
+
+static inline void
+xlog_cil_force(struct log *log)
+{
+	xlog_cil_force_lsn(log, log->l_cilp->xc_current_sequence);
+}
 
 /*
  * Unmount record type is used as a pseudo transaction type for the ticket.
-- 
cgit v1.2.3


From 3b93c7aaefc05ee2a75e2726929b01a321402984 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Tue, 24 Aug 2010 11:45:53 +1000
Subject: xfs: don't do memory allocation under the CIL context lock

Formatting items requires memory allocation when using delayed
logging. Currently that memory allocation is done while holding the
CIL context lock in read mode. This means that if memory allocation
takes some time (e.g. enters reclaim), we cannot push on the CIL
until the allocation(s) required by formatting complete. This can
stall CIL pushes for some time, and once a push is stalled so are
all new transaction commits.

Fix this splitting the item formatting into two steps. The first
step which does the allocation and memcpy() into the allocated
buffer is now done outside the CIL context lock, and only the CIL
insert is done inside the CIL context lock. This avoids the stall
issue.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_log_cil.c | 34 ++++++++++++++++++++++++++--------
 1 file changed, 26 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 9768f2437bb3..ed575fb4b495 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -270,15 +270,10 @@ xlog_cil_insert(
 static void
 xlog_cil_format_items(
 	struct log		*log,
-	struct xfs_log_vec	*log_vector,
-	struct xlog_ticket	*ticket,
-	xfs_lsn_t		*start_lsn)
+	struct xfs_log_vec	*log_vector)
 {
 	struct xfs_log_vec *lv;
 
-	if (start_lsn)
-		*start_lsn = log->l_cilp->xc_ctx->sequence;
-
 	ASSERT(log_vector);
 	for (lv = log_vector; lv; lv = lv->lv_next) {
 		void	*ptr;
@@ -302,9 +297,24 @@ xlog_cil_format_items(
 			ptr += vec->i_len;
 		}
 		ASSERT(ptr == lv->lv_buf + lv->lv_buf_len);
+	}
+}
 
+static void
+xlog_cil_insert_items(
+	struct log		*log,
+	struct xfs_log_vec	*log_vector,
+	struct xlog_ticket	*ticket,
+	xfs_lsn_t		*start_lsn)
+{
+	struct xfs_log_vec *lv;
+
+	if (start_lsn)
+		*start_lsn = log->l_cilp->xc_ctx->sequence;
+
+	ASSERT(log_vector);
+	for (lv = log_vector; lv; lv = lv->lv_next)
 		xlog_cil_insert(log, ticket, lv->lv_item, lv);
-	}
 }
 
 static void
@@ -612,9 +622,17 @@ xfs_log_commit_cil(
 		return XFS_ERROR(EIO);
 	}
 
+	/*
+	 * do all the hard work of formatting items (including memory
+	 * allocation) outside the CIL context lock. This prevents stalling CIL
+	 * pushes when we are low on memory and a transaction commit spends a
+	 * lot of time in memory reclaim.
+	 */
+	xlog_cil_format_items(log, log_vector);
+
 	/* lock out background commit */
 	down_read(&log->l_cilp->xc_ctx_lock);
-	xlog_cil_format_items(log, log_vector, tp->t_ticket, commit_lsn);
+	xlog_cil_insert_items(log, log_vector, tp->t_ticket, commit_lsn);
 
 	/* check we didn't blow the reservation */
 	if (tp->t_ticket->t_curr_res < 0)
-- 
cgit v1.2.3


From b5420f235953448eeae615b3361584dc5e414f34 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Tue, 24 Aug 2010 11:47:51 +1000
Subject: xfs: do not discard page cache data on EAGAIN

If xfs_map_blocks returns EAGAIN because of lock contention we must redirty the
page and not disard the pagecache content and return an error from writepage.
We used to do this correctly, but the logic got lost during the recent
reshuffle of the writepage code.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reported-by: Mike Gao <ygao.linux@gmail.com>
Tested-by: Mike Gao <ygao.linux@gmail.com>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/linux-2.6/xfs_aops.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 528be1ba1402..b552f816de15 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1068,7 +1068,7 @@ xfs_vm_writepage(
 	 * by themselves.
 	 */
 	if ((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == PF_MEMALLOC)
-		goto out_fail;
+		goto redirty;
 
 	/*
 	 * We need a transaction if there are delalloc or unwritten buffers
@@ -1080,7 +1080,7 @@ xfs_vm_writepage(
 	 */
 	xfs_count_page_state(page, &delalloc, &unwritten);
 	if ((current->flags & PF_FSTRANS) && (delalloc || unwritten))
-		goto out_fail;
+		goto redirty;
 
 	/* Is this page beyond the end of the file? */
 	offset = i_size_read(inode);
@@ -1245,12 +1245,15 @@ error:
 	if (iohead)
 		xfs_cancel_ioend(iohead);
 
+	if (err == -EAGAIN)
+		goto redirty;
+
 	xfs_aops_discard_page(page);
 	ClearPageUptodate(page);
 	unlock_page(page);
 	return err;
 
-out_fail:
+redirty:
 	redirty_page_for_writepage(wbc, page);
 	unlock_page(page);
 	return 0;
-- 
cgit v1.2.3


From 2d20ca835867d93ead6ce61780d883a4b128106d Mon Sep 17 00:00:00 2001
From: "shirishpargaonkar@gmail.com" <shirishpargaonkar@gmail.com>
Date: Tue, 24 Aug 2010 11:53:48 -0500
Subject: Eliminate sparse warning - bad constant expression

Eliminiate sparse warning during usage of crypto_shash_* APIs
       error: bad constant expression

Allocate memory for shash descriptors once, so that we do not kmalloc/kfree it
for every signature generation (shash descriptor for md5 hash).

From ed7538619817777decc44b5660b52268077b74f3 Mon Sep 17 00:00:00 2001
From: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Date: Tue, 24 Aug 2010 11:47:43 -0500
Subject: [PATCH] eliminate sparse warnings during crypto_shash_* APis usage

Signed-off-by: Shirish Pargaonkar <shirishpargaonkar@gmail.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsencrypt.c | 193 +++++++++++++++++++++++++++++++-------------------
 fs/cifs/cifsglob.h    |   7 ++
 2 files changed, 128 insertions(+), 72 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index eef78c24e0cc..709f2296bdb4 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -45,39 +45,38 @@ extern void SMBencrypt(unsigned char *passwd, const unsigned char *c8,
 static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu,
 			struct TCP_Server_Info *server, char *signature)
 {
-	int rc = 0;
-	struct {
-		struct shash_desc shash;
-		char ctx[crypto_shash_descsize(server->ntlmssp.md5)];
-	} sdesc;
+	int rc;
 
 	if (cifs_pdu == NULL || server == NULL || signature == NULL)
 		return -EINVAL;
 
-	sdesc.shash.tfm = server->ntlmssp.md5;
-	sdesc.shash.flags = 0x0;
+	if (!server->ntlmssp.sdescmd5) {
+		cERROR(1,
+			"cifs_calculate_signature: can't generate signature\n");
+		return -1;
+	}
 
-	rc = crypto_shash_init(&sdesc.shash);
+	rc = crypto_shash_init(&server->ntlmssp.sdescmd5->shash);
 	if (rc) {
-		cERROR(1, "could not initialize master crypto API hmacmd5\n");
+		cERROR(1, "cifs_calculate_signature: oould not init md5\n");
 		return rc;
 	}
 
 	if (server->secType == RawNTLMSSP)
-		crypto_shash_update(&sdesc.shash,
+		crypto_shash_update(&server->ntlmssp.sdescmd5->shash,
 			server->session_key.data.ntlmv2.key,
 			CIFS_NTLMV2_SESSKEY_SIZE);
 	else
-		crypto_shash_update(&sdesc.shash,
+		crypto_shash_update(&server->ntlmssp.sdescmd5->shash,
 			(char *)&server->session_key.data,
 			server->session_key.len);
 
-	crypto_shash_update(&sdesc.shash,
+	crypto_shash_update(&server->ntlmssp.sdescmd5->shash,
 			cifs_pdu->Protocol, cifs_pdu->smb_buf_length);
 
-	rc = crypto_shash_final(&sdesc.shash, signature);
+	rc = crypto_shash_final(&server->ntlmssp.sdescmd5->shash, signature);
 
-	return 0;
+	return rc;
 }
 
 
@@ -115,30 +114,28 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
 			struct TCP_Server_Info *server, char *signature)
 {
 	int i;
-	int rc = 0;
-	struct {
-		struct shash_desc shash;
-		char ctx[crypto_shash_descsize(server->ntlmssp.md5)];
-	} sdesc;
+	int rc;
 
 	if (iov == NULL || server == NULL || signature == NULL)
 		return -EINVAL;
 
-	sdesc.shash.tfm = server->ntlmssp.md5;
-	sdesc.shash.flags = 0x0;
+	if (!server->ntlmssp.sdescmd5) {
+		cERROR(1, "cifs_calc_signature2: can't generate signature\n");
+		return -1;
+	}
 
-	rc = crypto_shash_init(&sdesc.shash);
+	rc = crypto_shash_init(&server->ntlmssp.sdescmd5->shash);
 	if (rc) {
-		cERROR(1, "could not initialize master crypto API hmacmd5\n");
+		cERROR(1, "cifs_calc_signature2: oould not init md5\n");
 		return rc;
 	}
 
 	if (server->secType == RawNTLMSSP)
-		crypto_shash_update(&sdesc.shash,
+		crypto_shash_update(&server->ntlmssp.sdescmd5->shash,
 			server->session_key.data.ntlmv2.key,
 			CIFS_NTLMV2_SESSKEY_SIZE);
 	else
-		crypto_shash_update(&sdesc.shash,
+		crypto_shash_update(&server->ntlmssp.sdescmd5->shash,
 			(char *)&server->session_key.data,
 			server->session_key.len);
 
@@ -146,7 +143,7 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
 		if (iov[i].iov_len == 0)
 			continue;
 		if (iov[i].iov_base == NULL) {
-			cERROR(1, "null iovec entry");
+			cERROR(1, "cifs_calc_signature2: null iovec entry");
 			return -EIO;
 		}
 		/* The first entry includes a length field (which does not get
@@ -154,16 +151,16 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec,
 		if (i == 0) {
 			if (iov[0].iov_len <= 8) /* cmd field at offset 9 */
 				break; /* nothing to sign or corrupt header */
-			crypto_shash_update(&sdesc.shash,
+			crypto_shash_update(&server->ntlmssp.sdescmd5->shash,
 				iov[i].iov_base + 4, iov[i].iov_len - 4);
 		} else
-			crypto_shash_update(&sdesc.shash,
+			crypto_shash_update(&server->ntlmssp.sdescmd5->shash,
 				iov[i].iov_base, iov[i].iov_len);
 	}
 
-	rc = crypto_shash_final(&sdesc.shash, signature);
+	rc = crypto_shash_final(&server->ntlmssp.sdescmd5->shash, signature);
 
-	return 0;
+	return rc;
 }
 
 int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server,
@@ -313,43 +310,48 @@ static int calc_ntlmv2_hash(struct cifsSesInfo *ses,
 	wchar_t *user;
 	wchar_t *domain;
 	wchar_t *server;
-	struct {
-		struct shash_desc shash;
-		char ctx[crypto_shash_descsize(ses->server->ntlmssp.hmacmd5)];
-	} sdesc;
+
+	if (!ses->server->ntlmssp.sdeschmacmd5) {
+		cERROR(1, "calc_ntlmv2_hash: can't generate ntlmv2 hash\n");
+		return -1;
+	}
 
 	/* calculate md4 hash of password */
 	E_md4hash(ses->password, nt_hash);
 
-	sdesc.shash.tfm = ses->server->ntlmssp.hmacmd5;
-	sdesc.shash.flags = 0x0;
-
 	crypto_shash_setkey(ses->server->ntlmssp.hmacmd5, nt_hash,
 				CIFS_NTHASH_SIZE);
 
-	rc = crypto_shash_init(&sdesc.shash);
+	rc = crypto_shash_init(&ses->server->ntlmssp.sdeschmacmd5->shash);
 	if (rc) {
-		cERROR(1, "could not initialize master crypto API hmacmd5\n");
+		cERROR(1, "calc_ntlmv2_hash: could not init hmacmd5\n");
 		return rc;
 	}
 
 	/* convert ses->userName to unicode and uppercase */
 	len = strlen(ses->userName);
 	user = kmalloc(2 + (len * 2), GFP_KERNEL);
-	if (user == NULL)
+	if (user == NULL) {
+		cERROR(1, "calc_ntlmv2_hash: user mem alloc failure\n");
+		rc = -ENOMEM;
 		goto calc_exit_2;
+	}
 	len = cifs_strtoUCS((__le16 *)user, ses->userName, len, nls_cp);
 	UniStrupr(user);
 
-	crypto_shash_update(&sdesc.shash, (char *)user, 2 * len);
+	crypto_shash_update(&ses->server->ntlmssp.sdeschmacmd5->shash,
+				(char *)user, 2 * len);
 
 	/* convert ses->domainName to unicode and uppercase */
 	if (ses->domainName) {
 		len = strlen(ses->domainName);
 
 		domain = kmalloc(2 + (len * 2), GFP_KERNEL);
-		if (domain == NULL)
+		if (domain == NULL) {
+			cERROR(1, "calc_ntlmv2_hash: domain mem alloc failure");
+			rc = -ENOMEM;
 			goto calc_exit_1;
+		}
 		len = cifs_strtoUCS((__le16 *)domain, ses->domainName, len,
 					nls_cp);
 		/* the following line was removed since it didn't work well
@@ -357,15 +359,19 @@ static int calc_ntlmv2_hash(struct cifsSesInfo *ses,
 		   Maybe converting the domain name earlier makes sense */
 		/* UniStrupr(domain); */
 
-		crypto_shash_update(&sdesc.shash, (char *)domain, 2 * len);
+		crypto_shash_update(&ses->server->ntlmssp.sdeschmacmd5->shash,
+					(char *)domain, 2 * len);
 
 		kfree(domain);
 	} else if (ses->serverName) {
 		len = strlen(ses->serverName);
 
 		server = kmalloc(2 + (len * 2), GFP_KERNEL);
-		if (server == NULL)
+		if (server == NULL) {
+			cERROR(1, "calc_ntlmv2_hash: server mem alloc failure");
+			rc = -ENOMEM;
 			goto calc_exit_1;
+		}
 		len = cifs_strtoUCS((__le16 *)server, ses->serverName, len,
 					nls_cp);
 		/* the following line was removed since it didn't work well
@@ -373,16 +379,20 @@ static int calc_ntlmv2_hash(struct cifsSesInfo *ses,
 		   Maybe converting the domain name earlier makes sense */
 		/* UniStrupr(domain); */
 
-		crypto_shash_update(&sdesc.shash, (char *)server, 2 * len);
+		crypto_shash_update(&ses->server->ntlmssp.sdeschmacmd5->shash,
+					(char *)server, 2 * len);
 
 		kfree(server);
 	}
+
+	rc = crypto_shash_final(&ses->server->ntlmssp.sdeschmacmd5->shash,
+					ses->server->ntlmv2_hash);
+
 calc_exit_1:
 	kfree(user);
 calc_exit_2:
 	/* BB FIXME what about bytes 24 through 40 of the signing key?
 	   compare with the NTLM example */
-	rc = crypto_shash_final(&sdesc.shash, ses->server->ntlmv2_hash);
 
 	return rc;
 }
@@ -442,34 +452,33 @@ CalcNTLMv2_response(const struct TCP_Server_Info *server,
 			 char *v2_session_response)
 {
 	int rc;
-	struct {
-		struct shash_desc shash;
-		char ctx[crypto_shash_descsize(server->ntlmssp.hmacmd5)];
-	} sdesc;
 
-	sdesc.shash.tfm = server->ntlmssp.hmacmd5;
-	sdesc.shash.flags = 0x0;
+	if (!server->ntlmssp.sdeschmacmd5) {
+		cERROR(1, "calc_ntlmv2_hash: can't generate ntlmv2 hash\n");
+		return -1;
+	}
 
 	crypto_shash_setkey(server->ntlmssp.hmacmd5, server->ntlmv2_hash,
 		CIFS_HMAC_MD5_HASH_SIZE);
 
-	rc = crypto_shash_init(&sdesc.shash);
+	rc = crypto_shash_init(&server->ntlmssp.sdeschmacmd5->shash);
 	if (rc) {
-		cERROR(1, "could not initialize master crypto API hmacmd5\n");
+		cERROR(1, "CalcNTLMv2_response: could not init hmacmd5");
 		return rc;
 	}
 
 	memcpy(v2_session_response + CIFS_SERVER_CHALLENGE_SIZE,
 		server->cryptKey, CIFS_SERVER_CHALLENGE_SIZE);
-	crypto_shash_update(&sdesc.shash,
+	crypto_shash_update(&server->ntlmssp.sdeschmacmd5->shash,
 		v2_session_response + CIFS_SERVER_CHALLENGE_SIZE,
 		sizeof(struct ntlmv2_resp) - CIFS_SERVER_CHALLENGE_SIZE);
 
 	if (server->tilen)
-		crypto_shash_update(&sdesc.shash,
+		crypto_shash_update(&server->ntlmssp.sdeschmacmd5->shash,
 					server->tiblob, server->tilen);
 
-	rc = crypto_shash_final(&sdesc.shash, v2_session_response);
+	rc = crypto_shash_final(&server->ntlmssp.sdeschmacmd5->shash,
+					v2_session_response);
 
 	return rc;
 }
@@ -480,10 +489,6 @@ setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf,
 {
 	int rc = 0;
 	struct ntlmv2_resp *buf = (struct ntlmv2_resp *)resp_buf;
-	struct {
-		struct shash_desc shash;
-		char ctx[crypto_shash_descsize(ses->server->ntlmssp.hmacmd5)];
-	} sdesc;
 
 	buf->blob_signature = cpu_to_le32(0x00000101);
 	buf->reserved = 0;
@@ -511,21 +516,24 @@ setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf,
 		return rc;
 	}
 
+	if (!ses->server->ntlmssp.sdeschmacmd5) {
+		cERROR(1, "calc_ntlmv2_hash: can't generate ntlmv2 hash\n");
+		return -1;
+	}
+
 	crypto_shash_setkey(ses->server->ntlmssp.hmacmd5,
 			ses->server->ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE);
 
-	sdesc.shash.tfm = ses->server->ntlmssp.hmacmd5;
-	sdesc.shash.flags = 0x0;
-
-	rc = crypto_shash_init(&sdesc.shash);
+	rc = crypto_shash_init(&ses->server->ntlmssp.sdeschmacmd5->shash);
 	if (rc) {
-		cERROR(1, "could not initialize master crypto API hmacmd5\n");
+		cERROR(1, "setup_ntlmv2_rsp: could not init hmacmd5\n");
 		return rc;
 	}
 
-	crypto_shash_update(&sdesc.shash, resp_buf, CIFS_HMAC_MD5_HASH_SIZE);
+	crypto_shash_update(&ses->server->ntlmssp.sdeschmacmd5->shash,
+				resp_buf, CIFS_HMAC_MD5_HASH_SIZE);
 
-	rc = crypto_shash_final(&sdesc.shash,
+	rc = crypto_shash_final(&ses->server->ntlmssp.sdeschmacmd5->shash,
 		ses->server->session_key.data.ntlmv2.key);
 
 	memcpy(&ses->server->session_key.data.ntlmv2.resp, resp_buf,
@@ -578,24 +586,65 @@ cifs_crypto_shash_release(struct TCP_Server_Info *server)
 
 	if (server->ntlmssp.hmacmd5)
 		crypto_free_shash(server->ntlmssp.hmacmd5);
+
+	kfree(server->ntlmssp.sdeschmacmd5);
+
+	kfree(server->ntlmssp.sdescmd5);
 }
 
 int
 cifs_crypto_shash_allocate(struct TCP_Server_Info *server)
 {
+	int rc;
+	unsigned int size;
+
 	server->ntlmssp.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0);
 	if (!server->ntlmssp.hmacmd5 ||
 			IS_ERR(server->ntlmssp.hmacmd5)) {
-		cERROR(1, "could not allocate master crypto API hmacmd5\n");
+		cERROR(1, "could not allocate crypto hmacmd5\n");
 		return 1;
 	}
 
 	server->ntlmssp.md5 = crypto_alloc_shash("md5", 0, 0);
 	if (!server->ntlmssp.md5 || IS_ERR(server->ntlmssp.md5)) {
-		crypto_free_shash(server->ntlmssp.hmacmd5);
-		cERROR(1, "could not allocate master crypto API md5\n");
-		return 1;
+		cERROR(1, "could not allocate crypto md5\n");
+		rc = 1;
+		goto cifs_crypto_shash_allocate_ret1;
 	}
 
+	size = sizeof(struct shash_desc) +
+			crypto_shash_descsize(server->ntlmssp.hmacmd5);
+	server->ntlmssp.sdeschmacmd5 = kmalloc(size, GFP_KERNEL);
+	if (!server->ntlmssp.sdeschmacmd5) {
+		cERROR(1, "cifs_crypto_shash_allocate: can't alloc hmacmd5\n");
+		rc = -ENOMEM;
+		goto cifs_crypto_shash_allocate_ret2;
+	}
+	server->ntlmssp.sdeschmacmd5->shash.tfm = server->ntlmssp.hmacmd5;
+	server->ntlmssp.sdeschmacmd5->shash.flags = 0x0;
+
+
+	size = sizeof(struct shash_desc) +
+			crypto_shash_descsize(server->ntlmssp.md5);
+	server->ntlmssp.sdescmd5 = kmalloc(size, GFP_KERNEL);
+	if (!server->ntlmssp.sdescmd5) {
+		cERROR(1, "cifs_crypto_shash_allocate: can't alloc md5\n");
+		rc = -ENOMEM;
+		goto cifs_crypto_shash_allocate_ret3;
+	}
+	server->ntlmssp.sdescmd5->shash.tfm = server->ntlmssp.md5;
+	server->ntlmssp.sdescmd5->shash.flags = 0x0;
+
 	return 0;
+
+cifs_crypto_shash_allocate_ret3:
+	kfree(server->ntlmssp.sdeschmacmd5);
+
+cifs_crypto_shash_allocate_ret2:
+	crypto_free_shash(server->ntlmssp.md5);
+
+cifs_crypto_shash_allocate_ret1:
+	crypto_free_shash(server->ntlmssp.hmacmd5);
+
+	return rc;
 }
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 49563e0c1725..c9d0cfc086eb 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -123,12 +123,19 @@ struct cifs_cred {
 	struct cifs_ace *aces;
 };
 
+struct sdesc {
+	struct shash_desc shash;
+	char ctx[];
+};
+
 struct ntlmssp_auth {
 	__u32 client_flags;
 	__u32 server_flags;
 	unsigned char ciphertext[CIFS_CPHTXT_SIZE];
 	struct crypto_shash *hmacmd5;
 	struct crypto_shash *md5;
+	struct sdesc *sdeschmacmd5;
+	struct sdesc *sdescmd5;
 };
 
 /*
-- 
cgit v1.2.3


From 7d8cb26d7dcb911f110b7762bd5941e8f009d6c3 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 24 Aug 2010 08:44:16 -0700
Subject: ceph: maintain i_head_snapc when any caps are dirty, not just for
 data

We used to use i_head_snapc to keep track of which snapc the current epoch
of dirty data was dirtied under.  It is used by queue_cap_snap to set up
the cap_snap.  However, since we queue cap snaps for any dirty caps, not
just for dirty file data, we need to keep a valid i_head_snapc anytime
we have dirty|flushing caps.  This fixes a NULL pointer deref in
queue_cap_snap when writing back dirty caps without data (e.g.,
snaptest-authwb.sh).

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/addr.c  |  4 ++--
 fs/ceph/caps.c  | 20 +++++++++++++++++---
 fs/ceph/snap.c  |  6 +++++-
 fs/ceph/super.h |  3 ++-
 4 files changed, 26 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 420d46974ec8..4cfce1ee31fa 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -87,7 +87,7 @@ static int ceph_set_page_dirty(struct page *page)
 
 	/* dirty the head */
 	spin_lock(&inode->i_lock);
-	if (ci->i_wrbuffer_ref_head == 0)
+	if (ci->i_head_snapc == NULL)
 		ci->i_head_snapc = ceph_get_snap_context(snapc);
 	++ci->i_wrbuffer_ref_head;
 	if (ci->i_wrbuffer_ref == 0)
@@ -346,7 +346,7 @@ static struct ceph_snap_context *get_oldest_context(struct inode *inode,
 			break;
 		}
 	}
-	if (!snapc && ci->i_head_snapc) {
+	if (!snapc && ci->i_wrbuffer_ref_head) {
 		snapc = ceph_get_snap_context(ci->i_head_snapc);
 		dout(" head snapc %p has %d dirty pages\n",
 		     snapc, ci->i_wrbuffer_ref_head);
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index ba5bbf318fe1..a2069b6680ae 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1143,6 +1143,10 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
 		for (i = 0; i < CEPH_CAP_BITS; i++)
 			if (flushing & (1 << i))
 				ci->i_cap_flush_tid[i] = flush_tid;
+
+		follows = ci->i_head_snapc->seq;
+	} else {
+		follows = 0;
 	}
 
 	keep = cap->implemented;
@@ -1156,7 +1160,6 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
 	mtime = inode->i_mtime;
 	atime = inode->i_atime;
 	time_warp_seq = ci->i_time_warp_seq;
-	follows = ci->i_snap_realm->cached_context->seq;
 	uid = inode->i_uid;
 	gid = inode->i_gid;
 	mode = inode->i_mode;
@@ -1332,7 +1335,11 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
 	     ceph_cap_string(was | mask));
 	ci->i_dirty_caps |= mask;
 	if (was == 0) {
-		dout(" inode %p now dirty\n", &ci->vfs_inode);
+		if (!ci->i_head_snapc)
+			ci->i_head_snapc = ceph_get_snap_context(
+				ci->i_snap_realm->cached_context);
+		dout(" inode %p now dirty snapc %p\n", &ci->vfs_inode,
+			ci->i_head_snapc);
 		BUG_ON(!list_empty(&ci->i_dirty_item));
 		spin_lock(&mdsc->cap_dirty_lock);
 		list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
@@ -2190,7 +2197,9 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
 
 	if (ci->i_head_snapc == snapc) {
 		ci->i_wrbuffer_ref_head -= nr;
-		if (!ci->i_wrbuffer_ref_head) {
+		if (ci->i_wrbuffer_ref_head == 0 &&
+		    ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) {
+			BUG_ON(!ci->i_head_snapc);
 			ceph_put_snap_context(ci->i_head_snapc);
 			ci->i_head_snapc = NULL;
 		}
@@ -2483,6 +2492,11 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
 			dout(" inode %p now clean\n", inode);
 			BUG_ON(!list_empty(&ci->i_dirty_item));
 			drop = 1;
+			if (ci->i_wrbuffer_ref_head == 0) {
+				BUG_ON(!ci->i_head_snapc);
+				ceph_put_snap_context(ci->i_head_snapc);
+				ci->i_head_snapc = NULL;
+			}
 		} else {
 			BUG_ON(list_empty(&ci->i_dirty_item));
 		}
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 6bdbf3ae7082..4868b9dcac5a 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -458,6 +458,8 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
 			     CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR))) {
 		struct ceph_snap_context *snapc = ci->i_head_snapc;
 
+		dout("queue_cap_snap %p cap_snap %p queuing under %p\n", inode,
+		     capsnap, snapc);
 		igrab(inode);
 		
 		atomic_set(&capsnap->nref, 1);
@@ -489,7 +491,9 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
 		capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
 		ci->i_wrbuffer_ref_head = 0;
 		capsnap->context = snapc;
-		ci->i_head_snapc = NULL;
+		ci->i_head_snapc =
+			ceph_get_snap_context(ci->i_snap_realm->cached_context);
+		dout(" new snapc is %p\n", ci->i_head_snapc);
 		list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
 
 		if (used & CEPH_CAP_FILE_WR) {
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index b33929d8f287..c33897ae5725 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -344,7 +344,8 @@ struct ceph_inode_info {
 	unsigned i_cap_exporting_issued;
 	struct ceph_cap_reservation i_cap_migration_resv;
 	struct list_head i_cap_snaps;   /* snapped state pending flush to mds */
-	struct ceph_snap_context *i_head_snapc;  /* set if wr_buffer_head > 0 */
+	struct ceph_snap_context *i_head_snapc;  /* set if wr_buffer_head > 0 or
+						    dirty|flushing caps */
 	unsigned i_snap_caps;           /* cap bits for snapped files */
 
 	int i_nr_by_mode[CEPH_FILE_MODE_NUM];  /* open file counts */
-- 
cgit v1.2.3


From 36e21687e6e51c4225c42e6291938363f7bbfa7c Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Tue, 24 Aug 2010 16:23:48 -0700
Subject: ceph: initialize fields on new dentry_infos

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/dir.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 67bbb41d5526..6e4f43ff23ec 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -46,7 +46,7 @@ int ceph_init_dentry(struct dentry *dentry)
 	else
 		dentry->d_op = &ceph_snap_dentry_ops;
 
-	di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS);
+	di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO);
 	if (!di)
 		return -ENOMEM;          /* oh well */
 
-- 
cgit v1.2.3


From ac1f12ef569d49b013c3db86e11be7e15d66b1c3 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Wed, 25 Aug 2010 09:11:35 +0200
Subject: ceph: ceph_get_inode() returns an ERR_PTR

ceph_get_inode() returns an ERR_PTR and it doesn't return a NULL.

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/inode.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 3e6b52cb5ee8..e7cca414da03 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1230,11 +1230,11 @@ retry_lookup:
 			in = dn->d_inode;
 		} else {
 			in = ceph_get_inode(parent->d_sb, vino);
-			if (in == NULL) {
+			if (IS_ERR(in)) {
 				dout("new_inode badness\n");
 				d_delete(dn);
 				dput(dn);
-				err = -ENOMEM;
+				err = PTR_ERR(in);
 				goto out;
 			}
 			dn = splice_dentry(dn, in, NULL);
-- 
cgit v1.2.3


From ad8453ab0a5b98884074302ba3cc37664791e261 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Wed, 25 Aug 2010 13:26:32 +0100
Subject: ceph: Fix warnings

Just scrubbing some warnings so I can see real problem ones in the build
noise. For 32bit we need to coax gcc politely into believing we really
honestly intend to the casts. Using (u64)(unsigned long) means we cast from
a pointer to a type of the right size and then extend it. This stops the
warning spew.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/locks.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index ae85af06454f..ff4e753aae92 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -82,7 +82,8 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
 		length = fl->fl_end - fl->fl_start + 1;
 
 	err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
-				(u64)fl->fl_pid, (u64)fl->fl_nspid,
+				(u64)fl->fl_pid,
+				(u64)(unsigned long)fl->fl_nspid,
 				lock_cmd, fl->fl_start,
 				length, wait);
 	if (!err) {
@@ -92,7 +93,8 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl)
 			/* undo! This should only happen if the kernel detects
 			 * local deadlock. */
 			ceph_lock_message(CEPH_LOCK_FCNTL, op, file,
-					  (u64)fl->fl_pid, (u64)fl->fl_nspid,
+					  (u64)fl->fl_pid,
+					  (u64)(unsigned long)fl->fl_nspid,
 					  CEPH_LOCK_UNLOCK, fl->fl_start,
 					  length, 0);
 			dout("got %d on posix_lock_file, undid lock", err);
@@ -132,7 +134,8 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
 		length = fl->fl_end - fl->fl_start + 1;
 
 	err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK,
-				file, (u64)fl->fl_pid, (u64)fl->fl_nspid,
+				file, (u64)fl->fl_pid,
+				(u64)(unsigned long)fl->fl_nspid,
 				lock_cmd, fl->fl_start,
 				length, wait);
 	if (!err) {
@@ -141,7 +144,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
 			ceph_lock_message(CEPH_LOCK_FLOCK,
 					  CEPH_MDS_OP_SETFILELOCK,
 					  file, (u64)fl->fl_pid,
-					  (u64)fl->fl_nspid,
+					  (u64)(unsigned long)fl->fl_nspid,
 					  CEPH_LOCK_UNLOCK, fl->fl_start,
 					  length, 0);
 			dout("got %d on flock_lock_file_wait, undid lock", err);
@@ -235,7 +238,8 @@ int lock_to_ceph_filelock(struct file_lock *lock,
 	cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
 	cephlock->client = cpu_to_le64(0);
 	cephlock->pid = cpu_to_le64(lock->fl_pid);
-	cephlock->pid_namespace = cpu_to_le64((u64)lock->fl_nspid);
+	cephlock->pid_namespace =
+	        cpu_to_le64((u64)(unsigned long)lock->fl_nspid);
 
 	switch (lock->fl_type) {
 	case F_RDLCK:
-- 
cgit v1.2.3


From c89e5198b26a869ce2842bad8519264f3394dee9 Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Thu, 26 Aug 2010 02:11:54 +0000
Subject: [CIFS] Eliminate unused variable warning

CC: Shirish Pargaonkar <shirishp@us.ibm.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/sess.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 4788e16a02cc..795095f4eac6 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -620,7 +620,6 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses,
 	struct key *spnego_key = NULL;
 	__le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
 	bool first_time;
-	char *ntlmsspblob;
 
 	if (ses == NULL)
 		return -EINVAL;
@@ -868,6 +867,8 @@ ssetup_ntlmssp_authenticate:
 				iov[1].iov_base = &pSMB->req.SecurityBlob[0];
 			} else if (phase == NtLmAuthenticate) {
 				int blob_len;
+				char *ntlmsspblob;
+
 				ntlmsspblob = kmalloc(5 *
 					sizeof(struct _AUTHENTICATE_MESSAGE),
 					GFP_KERNEL);
-- 
cgit v1.2.3


From f44c3890d9fd6e4284518ff3bb16879fee194a3a Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Thu, 26 Aug 2010 11:07:24 +0200
Subject: ceph: ceph_mdsc_build_path() returns an ERR_PTR

ceph_mdsc_build_path() returns an ERR_PTR but this code is set up to
handle NULL returns.

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/debugfs.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 360c4f22718d..6fd8b20a8611 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -171,6 +171,8 @@ static int mdsc_show(struct seq_file *s, void *p)
 		} else if (req->r_dentry) {
 			path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
 						    &pathbase, 0);
+			if (IS_ERR(path))
+				path = NULL;
 			spin_lock(&req->r_dentry->d_lock);
 			seq_printf(s, " #%llx/%.*s (%s)",
 				   ceph_ino(req->r_dentry->d_parent->d_inode),
@@ -187,6 +189,8 @@ static int mdsc_show(struct seq_file *s, void *p)
 		if (req->r_old_dentry) {
 			path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen,
 						    &pathbase, 0);
+			if (IS_ERR(path))
+				path = NULL;
 			spin_lock(&req->r_old_dentry->d_lock);
 			seq_printf(s, " #%llx/%.*s (%s)",
 			   ceph_ino(req->r_old_dentry->d_parent->d_inode),
-- 
cgit v1.2.3


From e072f8aa3587710cd35cce0f6b6efd7b4276c327 Mon Sep 17 00:00:00 2001
From: Sage Weil <sage@newdream.net>
Date: Thu, 26 Aug 2010 09:26:37 -0700
Subject: ceph: don't BUG on ENOMEM during mds reconnect

We are in a position to return an error; do that instead.

Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/mds_client.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 8d1f11c7a5a2..f091b1351786 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2324,7 +2324,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
 		path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0);
 		if (IS_ERR(path)) {
 			err = PTR_ERR(path);
-			BUG_ON(err);
+			goto out_dput;
 		}
 	} else {
 		path = NULL;
@@ -2332,7 +2332,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
 	}
 	err = ceph_pagelist_encode_string(pagelist, path, pathlen);
 	if (err)
-		goto out;
+		goto out_free;
 
 	spin_lock(&inode->i_lock);
 	cap->seq = 0;        /* reset cap seq */
@@ -2376,8 +2376,9 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
 		unlock_kernel();
 	}
 
-out:
+out_free:
 	kfree(path);
+out_dput:
 	dput(dentry);
 	return err;
 }
-- 
cgit v1.2.3


From b545787dbb00a041c541a4759d938ddb0108295a Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Thu, 26 Aug 2010 11:12:38 +0200
Subject: ceph: fix get_ticket_handler() error handling

get_ticket_handler() returns a valid pointer or it returns
ERR_PTR(-ENOMEM) if kzalloc() fails.

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Sage Weil <sage@newdream.net>
---
 fs/ceph/auth_x.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
index 582e0b2caf8a..a2d002cbdec2 100644
--- a/fs/ceph/auth_x.c
+++ b/fs/ceph/auth_x.c
@@ -376,7 +376,7 @@ static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
 
 		th = get_ticket_handler(ac, service);
 
-		if (!th) {
+		if (IS_ERR(th)) {
 			*pneed |= service;
 			continue;
 		}
@@ -399,6 +399,9 @@ static int ceph_x_build_request(struct ceph_auth_client *ac,
 	struct ceph_x_ticket_handler *th =
 		get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
 
+	if (IS_ERR(th))
+		return PTR_ERR(th);
+
 	ceph_x_validate_tickets(ac, &need);
 
 	dout("build_request want %x have %x need %x\n",
@@ -450,7 +453,6 @@ static int ceph_x_build_request(struct ceph_auth_client *ac,
 			return -ERANGE;
 		head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
 
-		BUG_ON(!th);
 		ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
 		if (ret)
 			return ret;
@@ -505,7 +507,8 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
 
 	case CEPHX_GET_PRINCIPAL_SESSION_KEY:
 		th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
-		BUG_ON(!th);
+		if (IS_ERR(th))
+			return PTR_ERR(th);
 		ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
 					       buf + sizeof(*head), end);
 		break;
@@ -563,8 +566,8 @@ static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
 	void *end = p + sizeof(au->reply_buf);
 
 	th = get_ticket_handler(ac, au->service);
-	if (!th)
-		return -EIO;  /* hrm! */
+	if (IS_ERR(th))
+		return PTR_ERR(th);
 	ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply));
 	if (ret < 0)
 		return ret;
@@ -626,7 +629,7 @@ static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
 	struct ceph_x_ticket_handler *th;
 
 	th = get_ticket_handler(ac, peer_type);
-	if (th && !IS_ERR(th))
+	if (!IS_ERR(th))
 		remove_ticket_handler(ac, th);
 }
 
-- 
cgit v1.2.3


From f0138a79d74e1e942970ea163be268cd2e4bbcfc Mon Sep 17 00:00:00 2001
From: Suresh Jayaraman <sjayaraman@suse.de>
Date: Thu, 26 Aug 2010 14:46:09 +0530
Subject: Cannot allocate memory error on mount
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On 08/26/2010 01:56 AM, joe hefner wrote:
> On a recent Fedora (13), I am seeing a mount failure message that I can not explain. I have a Windows Server 2003ýa with a share set up for access only for a specific username (say userfoo). If I try to mount it from Linux,ýusing userfoo and the correct password all is well. If I try with a bad password or with some other username (userbar), it fails with "Permission denied" as expected. If I try to mount as username = administrator, and give the correct administrator password, I would also expect "Permission denied", but I see "Cannot allocate memory" instead.

> ýfs/cifs/netmisc.c: Mapping smb error code 5 to POSIX err -13
> ýfs/cifs/cifssmb.c: Send error in QPathInfo = -13
> ýCIFS VFS: cifs_read_super: get root inode failed

Looks like the commit 0b8f18e3 assumed that cifs_get_inode_info() and
friends fail only due to memory allocation error when the inode is NULL
which is not the case if CIFSSMBQPathInfo() fails and returns an error.
Fix this by propagating the actual error code back.

Acked-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Suresh Jayaraman <sjayaraman@suse.de>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 4bc47e5b5f29..86a164f08a74 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -834,7 +834,7 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
 						xid, NULL);
 
 	if (!inode)
-		return ERR_PTR(-ENOMEM);
+		return ERR_PTR(rc);
 
 #ifdef CONFIG_CIFS_FSCACHE
 	/* populate tcon->resource_id */
-- 
cgit v1.2.3


From 30c0e1ef0a8a6cab4e0f9357698c81a2f7f73cc5 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Tue, 17 Aug 2010 18:46:33 -0400
Subject: nfsd4: bad BUG() in preprocess_stateid_op

It's OK for this function to return without setting filp--we do it in
the special-stateid case.

And there's a legitimate case where we can hit this, since we do permit
reads on write-only stateid's.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 0a024917f052..b990eadb799c 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2983,7 +2983,6 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
 				*filpp = find_readable_file(stp->st_file);
 			else
 				*filpp = find_writeable_file(stp->st_file);
-			BUG_ON(!*filpp); /* assured by check_openmode */
 		}
 	}
 	status = nfs_ok;
-- 
cgit v1.2.3


From 18608ad49cffa430cfd0b4e027dedfe3114f916e Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Fri, 20 Aug 2010 18:06:26 -0400
Subject: nfsd4: typo fix in find_any_file

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/state.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 7731a75971dd..84579c86b13d 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -379,7 +379,7 @@ static inline struct file *find_any_file(struct nfs4_file *f)
 {
 	if (f->fi_fds[O_RDWR])
 		return f->fi_fds[O_RDWR];
-	else if (f->fi_fds[O_RDWR])
+	else if (f->fi_fds[O_WRONLY])
 		return f->fi_fds[O_WRONLY];
 	else
 		return f->fi_fds[O_RDONLY];
-- 
cgit v1.2.3


From 7d94784293096c0a46897acdb83be5abd9278ece Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Fri, 20 Aug 2010 18:09:31 -0400
Subject: nfsd4: fix downgrade/lock logic

If we already had a RW open for a file, and get a readonly open, we were
piggybacking on the existing RW open.  That's inconsistent with the
downgrade logic which blows away the RW open assuming you'll still have
a readonly open.

Also, make sure there is a readonly or writeonly open available for
locking, again to prevent bad behavior in downgrade cases when any RW
open may be lost.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4state.c | 25 +++++++++++++++----------
 fs/nfsd/state.h     | 12 ++++++------
 2 files changed, 21 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index b990eadb799c..69b266db7f5c 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2450,14 +2450,13 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh,
 static __be32
 nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_stateid *stp, struct nfsd4_open *open)
 {
-	u32 op_share_access, new_access;
+	u32 op_share_access = open->op_share_access & ~NFS4_SHARE_WANT_MASK;
+	bool new_access;
 	__be32 status;
 
-	set_access(&new_access, stp->st_access_bmap);
-	new_access = (~new_access) & open->op_share_access & ~NFS4_SHARE_WANT_MASK;
-
+	new_access = !test_bit(op_share_access, &stp->st_access_bmap);
 	if (new_access) {
-		status = nfs4_get_vfs_file(rqstp, fp, cur_fh, new_access);
+		status = nfs4_get_vfs_file(rqstp, fp, cur_fh, op_share_access);
 		if (status)
 			return status;
 	}
@@ -2470,7 +2469,6 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *c
 		return status;
 	}
 	/* remember the open */
-	op_share_access = open->op_share_access & ~NFS4_SHARE_WANT_MASK;
 	__set_bit(op_share_access, &stp->st_access_bmap);
 	__set_bit(open->op_share_deny, &stp->st_deny_bmap);
 
@@ -3560,7 +3558,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	struct nfs4_stateowner *open_sop = NULL;
 	struct nfs4_stateowner *lock_sop = NULL;
 	struct nfs4_stateid *lock_stp;
-	struct file *filp;
+	struct nfs4_file *fp;
+	struct file *filp = NULL;
 	struct file_lock file_lock;
 	struct file_lock conflock;
 	__be32 status = 0;
@@ -3590,7 +3589,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		 * lock stateid.
 		 */
 		struct nfs4_stateid *open_stp = NULL;
-		struct nfs4_file *fp;
 		
 		status = nfserr_stale_clientid;
 		if (!nfsd4_has_session(cstate) &&
@@ -3633,6 +3631,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		if (status)
 			goto out;
 		lock_sop = lock->lk_replay_owner;
+		fp = lock_stp->st_file;
 	}
 	/* lock->lk_replay_owner and lock_stp have been created or found */
 
@@ -3647,13 +3646,19 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	switch (lock->lk_type) {
 		case NFS4_READ_LT:
 		case NFS4_READW_LT:
-			filp = find_readable_file(lock_stp->st_file);
+			if (find_readable_file(lock_stp->st_file)) {
+				nfs4_get_vfs_file(rqstp, fp, &cstate->current_fh, NFS4_SHARE_ACCESS_READ);
+				filp = find_readable_file(lock_stp->st_file);
+			}
 			file_lock.fl_type = F_RDLCK;
 			cmd = F_SETLK;
 		break;
 		case NFS4_WRITE_LT:
 		case NFS4_WRITEW_LT:
-			filp = find_writeable_file(lock_stp->st_file);
+			if (find_writeable_file(lock_stp->st_file)) {
+				nfs4_get_vfs_file(rqstp, fp, &cstate->current_fh, NFS4_SHARE_ACCESS_WRITE);
+				filp = find_writeable_file(lock_stp->st_file);
+			}
 			file_lock.fl_type = F_WRLCK;
 			cmd = F_SETLK;
 		break;
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 84579c86b13d..322518c88e4b 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -363,16 +363,16 @@ struct nfs4_file {
  * at all? */
 static inline struct file *find_writeable_file(struct nfs4_file *f)
 {
-	if (f->fi_fds[O_RDWR])
-		return f->fi_fds[O_RDWR];
-	return f->fi_fds[O_WRONLY];
+	if (f->fi_fds[O_WRONLY])
+		return f->fi_fds[O_WRONLY];
+	return f->fi_fds[O_RDWR];
 }
 
 static inline struct file *find_readable_file(struct nfs4_file *f)
 {
-	if (f->fi_fds[O_RDWR])
-		return f->fi_fds[O_RDWR];
-	return f->fi_fds[O_RDONLY];
+	if (f->fi_fds[O_RDONLY])
+		return f->fi_fds[O_RDONLY];
+	return f->fi_fds[O_RDWR];
 }
 
 static inline struct file *find_any_file(struct nfs4_file *f)
-- 
cgit v1.2.3


From f6360efb83cd6dd1476cd758834c8277508c1f15 Mon Sep 17 00:00:00 2001
From: Takashi Iwai <tiwai@suse.de>
Date: Fri, 13 Aug 2010 15:53:49 +0200
Subject: nfsd: fix NULL dereference in nfsd_statfs()

The commit ebabe9a9001af0af56c0c2780ca1576246e7a74b
    pass a struct path to vfs_statfs
introduced the struct path initialization, and this seems to trigger
an Oops on my machine.

fh_dentry field may be NULL and set later in fh_verify(), thus the
initialization of path must be after fh_verify().

Signed-off-by: Takashi Iwai <tiwai@suse.de>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/vfs.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 96360a83cb91..661a6cf8e826 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -2033,15 +2033,17 @@ out:
 __be32
 nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat, int access)
 {
-	struct path path = {
-		.mnt	= fhp->fh_export->ex_path.mnt,
-		.dentry	= fhp->fh_dentry,
-	};
 	__be32 err;
 
 	err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access);
-	if (!err && vfs_statfs(&path, stat))
-		err = nfserr_io;
+	if (!err) {
+		struct path path = {
+			.mnt	= fhp->fh_export->ex_path.mnt,
+			.dentry	= fhp->fh_dentry,
+		};
+		if (vfs_statfs(&path, stat))
+			err = nfserr_io;
+	}
 	return err;
 }
 
-- 
cgit v1.2.3


From f137f15072411618e37b338aa13e5ae43583bcf2 Mon Sep 17 00:00:00 2001
From: Julia Lawall <julia@diku.dk>
Date: Wed, 11 Aug 2010 12:11:41 +0200
Subject: fs/ecryptfs: Return -ENOMEM on memory allocation failure

In this code, 0 is returned on memory allocation failure, even though other
failures return -ENOMEM or other similar values.

A simplified version of the semantic match that finds this problem is as
follows: (http://coccinelle.lip6.fr/)

// <smpl>
@@
expression ret;
expression x,e1,e2,e3;
@@

ret = 0
... when != ret = e1
*x = \(kmalloc\|kcalloc\|kzalloc\)(...)
... when != ret = e2
if (x == NULL) { ... when != ret = e3
  return ret;
}
// </smpl>

Signed-off-by: Julia Lawall <julia@diku.dk>
Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
---
 fs/ecryptfs/keystore.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 89c5476506ef..73811cfa2ea4 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -515,6 +515,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
 	if (!s) {
 		printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
 		       "[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
+		rc = -ENOMEM;
 		goto out;
 	}
 	s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
@@ -806,6 +807,7 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size,
 	if (!s) {
 		printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc "
 		       "[%zd] bytes of kernel memory\n", __func__, sizeof(*s));
+		rc = -ENOMEM;
 		goto out;
 	}
 	s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
-- 
cgit v1.2.3


From 7371a38201d04124a9ff2cf05059731d7c1e35a5 Mon Sep 17 00:00:00 2001
From: Jerome Marchand <jmarchan@redhat.com>
Date: Tue, 17 Aug 2010 17:24:05 +0200
Subject: ecryptfs: properly mark init functions

Some ecryptfs init functions are not prefixed by __init and thus not
freed after initialization. This patch saved about 1kB in ecryptfs
module.

Signed-off-by: Jerome Marchand <jmarchan@redhat.com>
Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
---
 fs/ecryptfs/crypto.c    | 2 +-
 fs/ecryptfs/kthread.c   | 2 +-
 fs/ecryptfs/messaging.c | 2 +-
 fs/ecryptfs/miscdev.c   | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index a2e3b562e65d..13ff48b3eacb 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -1793,7 +1793,7 @@ struct kmem_cache *ecryptfs_key_tfm_cache;
 static struct list_head key_tfm_list;
 struct mutex key_tfm_list_mutex;
 
-int ecryptfs_init_crypto(void)
+int __init ecryptfs_init_crypto(void)
 {
 	mutex_init(&key_tfm_list_mutex);
 	INIT_LIST_HEAD(&key_tfm_list);
diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c
index d8c3a373aafa..0851ab6980f5 100644
--- a/fs/ecryptfs/kthread.c
+++ b/fs/ecryptfs/kthread.c
@@ -86,7 +86,7 @@ out:
 	return 0;
 }
 
-int ecryptfs_init_kthread(void)
+int __init ecryptfs_init_kthread(void)
 {
 	int rc = 0;
 
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index bcb68c0cb1f0..ab2248090515 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -473,7 +473,7 @@ sleep:
 	return rc;
 }
 
-int ecryptfs_init_messaging(void)
+int __init ecryptfs_init_messaging(void)
 {
 	int i;
 	int rc = 0;
diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c
index 3745f612bcd4..00208c3d7e92 100644
--- a/fs/ecryptfs/miscdev.c
+++ b/fs/ecryptfs/miscdev.c
@@ -500,7 +500,7 @@ static struct miscdevice ecryptfs_miscdev = {
  *
  * Returns zero on success; non-zero otherwise
  */
-int ecryptfs_init_ecryptfs_miscdev(void)
+int __init ecryptfs_init_ecryptfs_miscdev(void)
 {
 	int rc;
 
-- 
cgit v1.2.3


From 93c3fe40c279f002906ad14584c30671097d4394 Mon Sep 17 00:00:00 2001
From: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
Date: Wed, 25 Aug 2010 10:26:37 -0500
Subject: eCryptfs: Fix encrypted file name lookup regression

Fixes a regression caused by 21edad32205e97dc7ccb81a85234c77e760364c8

When file name encryption was enabled, ecryptfs_lookup() failed to use
the encrypted and encoded version of the upper, plaintext, file name
when performing a lookup in the lower file system. This made it
impossible to lookup existing encrypted file names and any newly created
files would have plaintext file names in the lower file system.

https://bugs.launchpad.net/ecryptfs/+bug/623087

Signed-off-by: Tyler Hicks <tyhicks@linux.vnet.ibm.com>
---
 fs/ecryptfs/crypto.c |  1 -
 fs/ecryptfs/inode.c  | 31 ++++++++++++++++++++++++-------
 2 files changed, 24 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 13ff48b3eacb..cbadc1bee6e7 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -2169,7 +2169,6 @@ int ecryptfs_encrypt_and_encode_filename(
 				(ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE
 				 + encoded_name_no_prefix_size);
 			(*encoded_name)[(*encoded_name_size)] = '\0';
-			(*encoded_name_size)++;
 		} else {
 			rc = -EOPNOTSUPP;
 		}
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 6c55113e7222..3fbc94203380 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -349,7 +349,7 @@ out:
 
 /**
  * ecryptfs_new_lower_dentry
- * @ename: The name of the new dentry.
+ * @name: The name of the new dentry.
  * @lower_dir_dentry: Parent directory of the new dentry.
  * @nd: nameidata from last lookup.
  *
@@ -386,20 +386,19 @@ ecryptfs_new_lower_dentry(struct qstr *name, struct dentry *lower_dir_dentry,
  * ecryptfs_lookup_one_lower
  * @ecryptfs_dentry: The eCryptfs dentry that we are looking up
  * @lower_dir_dentry: lower parent directory
+ * @name: lower file name
  *
  * Get the lower dentry from vfs. If lower dentry does not exist yet,
  * create it.
  */
 static struct dentry *
 ecryptfs_lookup_one_lower(struct dentry *ecryptfs_dentry,
-			  struct dentry *lower_dir_dentry)
+			  struct dentry *lower_dir_dentry, struct qstr *name)
 {
 	struct nameidata nd;
 	struct vfsmount *lower_mnt;
-	struct qstr *name;
 	int err;
 
-	name = &ecryptfs_dentry->d_name;
 	lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(
 				    ecryptfs_dentry->d_parent));
 	err = vfs_path_lookup(lower_dir_dentry, lower_mnt, name->name , 0, &nd);
@@ -434,6 +433,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
 	size_t encrypted_and_encoded_name_size;
 	struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL;
 	struct dentry *lower_dir_dentry, *lower_dentry;
+	struct qstr lower_name;
 	int rc = 0;
 
 	ecryptfs_dentry->d_op = &ecryptfs_dops;
@@ -444,9 +444,17 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
 		goto out_d_drop;
 	}
 	lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent);
-
+	lower_name.name = ecryptfs_dentry->d_name.name;
+	lower_name.len = ecryptfs_dentry->d_name.len;
+	lower_name.hash = ecryptfs_dentry->d_name.hash;
+	if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) {
+		rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry,
+						    &lower_name);
+		if (rc < 0)
+			goto out_d_drop;
+	}
 	lower_dentry = ecryptfs_lookup_one_lower(ecryptfs_dentry,
-						 lower_dir_dentry);
+						 lower_dir_dentry, &lower_name);
 	if (IS_ERR(lower_dentry)) {
 		rc = PTR_ERR(lower_dentry);
 		ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_lower() returned "
@@ -471,8 +479,17 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
 		       "filename; rc = [%d]\n", __func__, rc);
 		goto out_d_drop;
 	}
+	lower_name.name = encrypted_and_encoded_name;
+	lower_name.len = encrypted_and_encoded_name_size;
+	lower_name.hash = full_name_hash(lower_name.name, lower_name.len);
+	if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) {
+		rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry,
+						    &lower_name);
+		if (rc < 0)
+			goto out_d_drop;
+	}
 	lower_dentry = ecryptfs_lookup_one_lower(ecryptfs_dentry,
-						 lower_dir_dentry);
+						 lower_dir_dentry, &lower_name);
 	if (IS_ERR(lower_dentry)) {
 		rc = PTR_ERR(lower_dentry);
 		ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_lower() returned "
-- 
cgit v1.2.3


From a2f13ad0ba5d94b9768c28469b45ca1e81a2b895 Mon Sep 17 00:00:00 2001
From: Andreas Gruenbacher <agruen@suse.de>
Date: Tue, 24 Aug 2010 12:58:54 +0200
Subject: fanotify: Return EPERM when a process is not privileged

The appropriate error code when privileged operations are denied is
EPERM, not EACCES.

Signed-off-by: Andreas Gruenbacher <agruen@suse.de>
Signed-off-by: Eric Paris <paris@paris.rdu.redhat.com>
---
 fs/notify/fanotify/fanotify_user.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index b966b7230f47..5ed8e58d7bfc 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -641,7 +641,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 		__func__, flags, event_f_flags);
 
 	if (!capable(CAP_SYS_ADMIN))
-		return -EACCES;
+		return -EPERM;
 
 	if (flags & ~FAN_ALL_INIT_FLAGS)
 		return -EINVAL;
-- 
cgit v1.2.3


From f72adfd540bacc4f6ff57a7d708b1a6c8906bdb4 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Fri, 27 Aug 2010 21:24:24 -0400
Subject: fsnotify: fix list walk order

Marks were stored on the inode and vfsmonut mark list in order from
highest memory address to lowest memory address.  The code to walk those
lists thought they were in order from lowest to highest with
unpredictable results when trying to match up marks from each.  It was
possible that extra events would be sent to userspace when inode
marks ignoring events wouldn't get matched with the vfsmount marks.

This problem only affected fanotify when using both vfsmount and inode
marks simultaneously.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fsnotify.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 6f2777ce87a1..2169aa593d5f 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -261,27 +261,26 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
 
 	while (inode_node || vfsmount_node) {
 		used_inode = used_vfsmount = false;
+		inode_group = vfsmount_group = NULL;
 
 		if (inode_node) {
 			inode_mark = hlist_entry(srcu_dereference(inode_node, &fsnotify_mark_srcu),
 						 struct fsnotify_mark, i.i_list);
 			inode_group = inode_mark->group;
-		} else
-			inode_group = (void *)-1;
+		}
 
 		if (vfsmount_node) {
 			vfsmount_mark = hlist_entry(srcu_dereference(vfsmount_node, &fsnotify_mark_srcu),
 							struct fsnotify_mark, m.m_list);
 			vfsmount_group = vfsmount_mark->group;
-		} else
-			vfsmount_group = (void *)-1;
+		}
 
-		if (inode_group < vfsmount_group) {
+		if (inode_group > vfsmount_group) {
 			/* handle inode */
 			send_to_group(to_tell, NULL, inode_mark, NULL, mask, data,
 				      data_is, cookie, file_name, &event);
 			used_inode = true;
-		} else if (vfsmount_group < inode_group) {
+		} else if (vfsmount_group > inode_group) {
 			send_to_group(to_tell, mnt, NULL, vfsmount_mark, mask, data,
 				      data_is, cookie, file_name, &event);
 			used_vfsmount = true;
-- 
cgit v1.2.3


From 92b4678efa8ce0de9b1e01a74e3d13c4002a4136 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Fri, 27 Aug 2010 21:42:11 -0400
Subject: fsnotify: drop two useless bools in the fnsotify main loop

The fsnotify main loop has 2 bools which indicated if we processed the
inode or vfsmount mark in that particular pass through the loop.  These
bool can we replaced with the inode_group and vfsmount_group variables
and actually make the code a little easier to understand.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fsnotify.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 2169aa593d5f..36802420d69a 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -225,7 +225,6 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
 	struct fsnotify_event *event = NULL;
 	struct vfsmount *mnt;
 	int idx, ret = 0;
-	bool used_inode, used_vfsmount;
 	/* global tests shouldn't care about events on child only the specific event */
 	__u32 test_mask = (mask & ~FS_EVENT_ON_CHILD);
 
@@ -260,7 +259,6 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
 	}
 
 	while (inode_node || vfsmount_node) {
-		used_inode = used_vfsmount = false;
 		inode_group = vfsmount_group = NULL;
 
 		if (inode_node) {
@@ -279,23 +277,22 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
 			/* handle inode */
 			send_to_group(to_tell, NULL, inode_mark, NULL, mask, data,
 				      data_is, cookie, file_name, &event);
-			used_inode = true;
+			/* we didn't use the vfsmount_mark */
+			vfsmount_group = NULL;
 		} else if (vfsmount_group > inode_group) {
 			send_to_group(to_tell, mnt, NULL, vfsmount_mark, mask, data,
 				      data_is, cookie, file_name, &event);
-			used_vfsmount = true;
+			inode_group = NULL;
 		} else {
 			send_to_group(to_tell, mnt, inode_mark, vfsmount_mark,
 				      mask, data, data_is, cookie, file_name,
 				      &event);
-			used_vfsmount = true;
-			used_inode = true;
 		}
 
-		if (used_inode)
+		if (inode_group)
 			inode_node = srcu_dereference(inode_node->next,
 						      &fsnotify_mark_srcu);
-		if (used_vfsmount)
+		if (vfsmount_group)
 			vfsmount_node = srcu_dereference(vfsmount_node->next,
 							 &fsnotify_mark_srcu);
 	}
-- 
cgit v1.2.3


From 4afc31345e5f543e5d89a47aeadaaad1d91a5bc8 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Sun, 29 Aug 2010 01:55:38 +0900
Subject: nilfs2: fix leak of shadow dat inode in error path of load_nilfs

If load_nilfs() gets an error while doing recovery, it will fail to
free the shadow inode of dat (nilfs->ns_gc_dat).

This fixes the leak issue.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
---
 fs/nilfs2/the_nilfs.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 4317f177ea7c..ba7c10c917fc 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -446,6 +446,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
 	nilfs_mdt_destroy(nilfs->ns_cpfile);
 	nilfs_mdt_destroy(nilfs->ns_sufile);
 	nilfs_mdt_destroy(nilfs->ns_dat);
+	nilfs_mdt_destroy(nilfs->ns_gc_dat);
 
  failed:
 	nilfs_clear_recovery_info(&ri);
-- 
cgit v1.2.3


From 8f587df479c3cea14ba1a9b9d58f34fd2fd6d58b Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Wed, 4 Aug 2010 16:27:45 +0000
Subject: 9p: potential ERR_PTR() dereference

p9_client_walk() can return error values if we run out of space or there
is a problem with the network.

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Eric Van Hensbergen <ericvh@gmail.com>
---
 fs/9p/fid.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index 358563689064..6406f896bf95 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -242,7 +242,8 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry)
 	}
 	kfree(wnames);
 fid_out:
-	v9fs_fid_add(dentry, fid);
+	if (!IS_ERR(fid))
+		v9fs_fid_add(dentry, fid);
 err_out:
 	up_read(&v9ses->rename_sem);
 	return fid;
-- 
cgit v1.2.3


From 9bc08a45fb117c696e4940cfa1208cb1cc7a2f25 Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Thu, 2 Sep 2010 15:14:38 +1000
Subject: xfs: improve buffer cache hash scalability

When doing large parallel file creates on a 16p machines, large amounts of
time is being spent in _xfs_buf_find(). A system wide profile with perf top
shows this:

          1134740.00 19.3% _xfs_buf_find
           733142.00 12.5% __ticket_spin_lock

The problem is that the hash contains 45,000 buffers, and the hash table width
is only 256 buffers. That means we've got around 200 buffers per chain, and
searching it is quite expensive. The hash table size needs to increase.

Secondly, every time we do a lookup, we promote the buffer we find to the head
of the hash chain. This is causing cachelines to be dirtied and causes
invalidation of cachelines across all CPUs that may have walked the hash chain
recently. hence every walk of the hash chain is effectively a cold cache walk.
Remove the promotion to avoid this invalidation.

The results are:

          1045043.00 21.2% __ticket_spin_lock
           326184.00  6.6% _xfs_buf_find

A 70% drop in the CPU usage when looking up buffers. Unfortunately that does
not result in an increase in performance underthis workload as contention on
the inode_lock soaks up most of the reduction in CPU usage.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/linux-2.6/xfs_buf.c | 8 +-------
 fs/xfs/linux-2.6/xfs_buf.h | 1 -
 2 files changed, 1 insertion(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index ea79072f5210..d72cf2bb054a 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -440,12 +440,7 @@ _xfs_buf_find(
 		ASSERT(btp == bp->b_target);
 		if (bp->b_file_offset == range_base &&
 		    bp->b_buffer_length == range_length) {
-			/*
-			 * If we look at something, bring it to the
-			 * front of the list for next time.
-			 */
 			atomic_inc(&bp->b_hold);
-			list_move(&bp->b_hash_list, &hash->bh_list);
 			goto found;
 		}
 	}
@@ -1443,8 +1438,7 @@ xfs_alloc_bufhash(
 {
 	unsigned int		i;
 
-	btp->bt_hashshift = external ? 3 : 8;	/* 8 or 256 buckets */
-	btp->bt_hashmask = (1 << btp->bt_hashshift) - 1;
+	btp->bt_hashshift = external ? 3 : 12;	/* 8 or 4096 buckets */
 	btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) *
 					 sizeof(xfs_bufhash_t));
 	for (i = 0; i < (1 << btp->bt_hashshift); i++) {
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index d072e5ff923b..2a05614f0b92 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -137,7 +137,6 @@ typedef struct xfs_buftarg {
 	size_t			bt_smask;
 
 	/* per device buffer hash table */
-	uint			bt_hashmask;
 	uint			bt_hashshift;
 	xfs_bufhash_t		*bt_hash;
 
-- 
cgit v1.2.3


From 23963e54ce187ca6e907c83176c15508b0f6e60d Mon Sep 17 00:00:00 2001
From: Arkadiusz Mi?kiewicz <arekm@maven.pl>
Date: Thu, 26 Aug 2010 10:19:43 +0000
Subject: xfs: Disallow 32bit project quota id

Currently on-disk structure is able to keep only 16bit project quota
id, so disallow 32bit ones. This fixes a problem where parts of
kernel structures holding project quota id are 32bit while parts
(on-disk) are 16bit variables which causes project quota member
files to be inaccessible for some operations (like mv/rm).

Signed-off-by: Arkadiusz Mi?kiewicz <arekm@maven.pl>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_ioctl.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 237f5ffb2ee8..4fec427b83ef 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -906,6 +906,13 @@ xfs_ioctl_setattr(
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return XFS_ERROR(EIO);
 
+	/*
+	 * Disallow 32bit project ids because on-disk structure
+	 * is 16bit only.
+	 */
+	if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1))
+		return XFS_ERROR(EINVAL);
+
 	/*
 	 * If disk quotas is on, we make sure that the dquots do exist on disk,
 	 * before we start any other transactions. Trying to do this later
-- 
cgit v1.2.3


From 72656c46f50b8dfe50e15793692982e636e3df20 Mon Sep 17 00:00:00 2001
From: Dave Chinner <dchinner@redhat.com>
Date: Fri, 3 Sep 2010 12:19:33 +1000
Subject: xfs: prevent 32bit overflow in space reservation

If we attempt to preallocate more than 2^32 blocks of space in a
single syscall, the transaction block reservation will overflow
leading to a hangs in the superblock block accounting code. This
is trivially reproduced with xfs_io. Fix the problem by capping the
allocation reservation to the maximum number of blocks a single
xfs_bmapi() call can allocate (2^21 blocks).

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_vnodeops.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 66d585c6917c..4c7c7bfb2b2f 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -2299,15 +2299,22 @@ xfs_alloc_file_space(
 			e = allocatesize_fsb;
 		}
 
+		/*
+		 * The transaction reservation is limited to a 32-bit block
+		 * count, hence we need to limit the number of blocks we are
+		 * trying to reserve to avoid an overflow. We can't allocate
+		 * more than @nimaps extents, and an extent is limited on disk
+		 * to MAXEXTLEN (21 bits), so use that to enforce the limit.
+		 */
+		resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps));
 		if (unlikely(rt)) {
-			resrtextents = qblocks = (uint)(e - s);
+			resrtextents = qblocks = resblks;
 			resrtextents /= mp->m_sb.sb_rextsize;
 			resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
 			quota_flag = XFS_QMOPT_RES_RTBLKS;
 		} else {
 			resrtextents = 0;
-			resblks = qblocks = \
-				XFS_DIOSTRAT_SPACE_RES(mp, (uint)(e - s));
+			resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
 			quota_flag = XFS_QMOPT_RES_REGBLKS;
 		}
 
-- 
cgit v1.2.3


From 9af25465081480a75824fd7a16a37a5cfebeede9 Mon Sep 17 00:00:00 2001
From: Tao Ma <tao.ma@oracle.com>
Date: Mon, 30 Aug 2010 02:44:03 +0000
Subject: xfs: Make fiemap work with sparse files

In xfs_vn_fiemap, we set bvm_count to fi_extent_max + 1 and want
to return fi_extent_max extents, but actually it won't work for
a sparse file. The reason is that in xfs_getbmap we will
calculate holes and set it in 'out', while out is malloced by
bmv_count(fi_extent_max+1) which didn't consider holes. So in the
worst case, if 'out' vector looks like
[hole, extent, hole, extent, hole, ... hole, extent, hole],
we will only return half of fi_extent_max extents.

This patch add a new parameter BMV_IF_NO_HOLES for bvm_iflags.
So with this flags, we don't use our 'out' in xfs_getbmap for
a hole. The solution is a bit ugly by just don't increasing
index of 'out' vector. I felt that it is not easy to skip it
at the very beginning since we have the complicated check and
some function like xfs_getbmapx_fix_eof_hole to adjust 'out'.

Cc: Dave Chinner <david@fromorbit.com>
Signed-off-by: Tao Ma <tao.ma@oracle.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/linux-2.6/xfs_iops.c |  2 +-
 fs/xfs/xfs_bmap.c           | 14 +++++++++++++-
 fs/xfs/xfs_fs.h             |  4 +++-
 3 files changed, 17 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 68be25dcd301..b1fc2a6bfe83 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -664,7 +664,7 @@ xfs_vn_fiemap(
 					fieinfo->fi_extents_max + 1;
 	bm.bmv_count = min_t(__s32, bm.bmv_count,
 			     (PAGE_SIZE * 16 / sizeof(struct getbmapx)));
-	bm.bmv_iflags = BMV_IF_PREALLOC;
+	bm.bmv_iflags = BMV_IF_PREALLOC | BMV_IF_NO_HOLES;
 	if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR)
 		bm.bmv_iflags |= BMV_IF_ATTRFORK;
 	if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC))
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 23f14e595c18..f90dadd5a968 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -5533,12 +5533,24 @@ xfs_getbmap(
 					map[i].br_startblock))
 				goto out_free_map;
 
-			nexleft--;
 			bmv->bmv_offset =
 				out[cur_ext].bmv_offset +
 				out[cur_ext].bmv_length;
 			bmv->bmv_length =
 				max_t(__int64_t, 0, bmvend - bmv->bmv_offset);
+
+			/*
+			 * In case we don't want to return the hole,
+			 * don't increase cur_ext so that we can reuse
+			 * it in the next loop.
+			 */
+			if ((iflags & BMV_IF_NO_HOLES) &&
+			    map[i].br_startblock == HOLESTARTBLOCK) {
+				memset(&out[cur_ext], 0, sizeof(out[cur_ext]));
+				continue;
+			}
+
+			nexleft--;
 			bmv->bmv_entries++;
 			cur_ext++;
 		}
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 7cf7220e7d5f..87c2e9d02288 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -114,8 +114,10 @@ struct getbmapx {
 #define BMV_IF_NO_DMAPI_READ	0x2	/* Do not generate DMAPI read event  */
 #define BMV_IF_PREALLOC		0x4	/* rtn status BMV_OF_PREALLOC if req */
 #define BMV_IF_DELALLOC		0x8	/* rtn status BMV_OF_DELALLOC if req */
+#define BMV_IF_NO_HOLES		0x10	/* Do not return holes */
 #define BMV_IF_VALID	\
-	(BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC|BMV_IF_DELALLOC)
+	(BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC|	\
+	 BMV_IF_DELALLOC|BMV_IF_NO_HOLES)
 
 /*	bmv_oflags values - returned for each non-header segment */
 #define BMV_OF_PREALLOC		0x1	/* segment = unwritten pre-allocation */
-- 
cgit v1.2.3


From 57f9bdac2510cd7fda58e4a111d250861eb1ebeb Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Wed, 25 Aug 2010 09:12:29 +0200
Subject: sysfs: checking for NULL instead of ERR_PTR

d_path() returns an ERR_PTR and it doesn't return NULL.

Signed-off-by: Dan Carpenter <error27@gmail.com>
Cc: stable <stable@kernel.org>
Reviewed-by: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 fs/sysfs/file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 1b27b5688f62..da3fefe91a8f 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -340,7 +340,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file)
 	char *p;
 
 	p = d_path(&file->f_path, last_sysfs_file, sizeof(last_sysfs_file));
-	if (p)
+	if (!IS_ERR(p))
 		memmove(last_sysfs_file, p, strlen(p) + 1);
 
 	/* need attr_sd for attr and ops, its parent for kobj */
-- 
cgit v1.2.3


From 7a2e8a8faab76386d8eaae9ded739ee5615be174 Mon Sep 17 00:00:00 2001
From: Valerie Aurora <vaurora@redhat.com>
Date: Thu, 26 Aug 2010 11:07:22 -0700
Subject: VFS: Sanity check mount flags passed to change_mnt_propagation()

Sanity check the flags passed to change_mnt_propagation().  Exactly
one flag should be set.  Return EINVAL otherwise.

Userspace can pass in arbitrary combinations of MS_* flags to mount().
do_change_type() is called if any of MS_SHARED, MS_PRIVATE, MS_SLAVE,
or MS_UNBINDABLE is set.  do_change_type() clears MS_REC and then
calls change_mnt_propagation() with the rest of the user-supplied
flags.  change_mnt_propagation() clearly assumes only one flag is set
but do_change_type() does not check that this is true.  For example,
mount() with flags MS_SHARED | MS_RDONLY does not actually make the
mount shared or read-only but does clear MNT_UNBINDABLE.

Signed-off-by: Valerie Aurora <vaurora@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/namespace.c | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/namespace.c b/fs/namespace.c
index de402eb6eafb..a72eaabfe8f2 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1483,6 +1483,23 @@ out_unlock:
 	return err;
 }
 
+/*
+ * Sanity check the flags to change_mnt_propagation.
+ */
+
+static int flags_to_propagation_type(int flags)
+{
+	int type = flags & ~MS_REC;
+
+	/* Fail if any non-propagation flags are set */
+	if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
+		return 0;
+	/* Only one propagation flag should be set */
+	if (!is_power_of_2(type))
+		return 0;
+	return type;
+}
+
 /*
  * recursively change the type of the mountpoint.
  */
@@ -1490,7 +1507,7 @@ static int do_change_type(struct path *path, int flag)
 {
 	struct vfsmount *m, *mnt = path->mnt;
 	int recurse = flag & MS_REC;
-	int type = flag & ~MS_REC;
+	int type;
 	int err = 0;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -1499,6 +1516,10 @@ static int do_change_type(struct path *path, int flag)
 	if (path->dentry != path->mnt->mnt_root)
 		return -EINVAL;
 
+	type = flags_to_propagation_type(flag);
+	if (!type)
+		return -EINVAL;
+
 	down_write(&namespace_sem);
 	if (type == MS_SHARED) {
 		err = invent_group_ids(mnt, recurse);
-- 
cgit v1.2.3