From f5a73672d1811f2fb1dcb62ca90ceb12b2050ae7 Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Tue, 10 Aug 2010 10:20:05 -0400 Subject: NFS: allow close-to-open cache semantics to apply to root of NFS filesystem To obey NFS cache semantics, the client must verify the cached attributes when a file is opened. In most cases this is done by a call to d_validate as one of the last steps in path_walk. However for the root of a filesystem, d_validate is only ever called on the mounted-on filesystem (except when the path ends '.' or '..'). So NFS has no chance to validate the attributes. So, in nfs_opendir, we revalidate the attributes if the opened directory is the mountpoint. This may cause double-validation for "." and ".." lookups, but that is better than missing regular /path/name lookups completely. Signed-off-by: NeilBrown Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 29539ceeb745..bd91b2778315 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -140,6 +140,13 @@ nfs_opendir(struct inode *inode, struct file *filp) /* Call generic open code in order to cache credentials */ res = nfs_open(inode, filp); + if (filp->f_path.dentry == filp->f_path.mnt->mnt_root) { + /* This is a mountpoint, so d_revalidate will never + * have been called, so we need to refresh the + * inode (for close-open consistency) ourselves. + */ + __nfs_revalidate_inode(NFS_SERVER(inode), inode); + } return res; } -- cgit v1.2.3 From 9b00c64318cc337846a7a08a5678f5f19aeff188 Mon Sep 17 00:00:00 2001 From: "Patrick J. LoPresti" Date: Tue, 10 Aug 2010 17:28:01 -0400 Subject: nfs: Add "lookupcache" to displayed mount options Running "cat /proc/mounts" fails to display the "lookupcache" option. This oversight cost me a bunch of wasted time recently. The following simple patch fixes it. CC: stable Signed-off-by: Patrick LoPresti Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index f1ae39f6cb02..3d0d63c00304 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -655,6 +655,13 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, if (nfss->options & NFS_OPTION_FSCACHE) seq_printf(m, ",fsc"); + + if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) { + if (nfss->flags & NFS_MOUNT_LOOKUP_CACHE_NONE) + seq_printf(m, ",lookupcache=none"); + else + seq_printf(m, ",lookupcache=pos"); + } } /* -- cgit v1.2.3 From 5d7ca35a182a626f8ed5596023ad42eb219a332e Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 11 Aug 2010 12:42:15 -0400 Subject: nfs: Remove redundant NULL check upon kfree() Signed-off-by: Davidlohr Bueso Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 7ffbb98ddec3..6b44bbfb7d8a 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2273,8 +2273,7 @@ static int nfs4_get_referral(struct inode *dir, const struct qstr *name, struct out: if (page) __free_page(page); - if (locations) - kfree(locations); + kfree(locations); return status; } -- cgit v1.2.3 From 0702099bd86c33c2dcdbd3963433a61f3f503901 Mon Sep 17 00:00:00 2001 From: "J. R. Okajima" Date: Wed, 11 Aug 2010 13:10:16 -0400 Subject: NFS: fix the return value of nfs_file_fsync() By the commit af7fa16 2010-08-03 NFS: Fix up the fsync code close(2) became returning the non-zero value even if it went well. nfs_file_fsync() should return 0 when "status" is positive. Signed-off-by: J. R. Okajima Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 2d141a74ae82..eb51bd6201da 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -323,7 +323,7 @@ nfs_file_fsync(struct file *file, int datasync) have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); if (have_error) ret = xchg(&ctx->error, 0); - if (!ret) + if (!ret && status < 0) ret = status; return ret; } -- cgit v1.2.3 From af4e36318edb848fcc0a8d5f75000ca00cdc7595 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Fri, 13 Aug 2010 12:42:24 +0900 Subject: nilfs2: fix list corruption after ifile creation failure If nilfs_attach_checkpoint() gets a memory allocation failure during creation of ifile, it will return without removing nilfs_sb_info struct from ns_supers list. When a concurrently mounted snapshot is unmounted or another new snapshot is mounted after that, this causes kernel oops as below: > BUG: unable to handle kernel NULL pointer dereference at (null) > IP: [] nilfs_find_sbinfo+0x74/0xa4 [nilfs2] > *pde = 00000000 > Oops: 0000 [#1] SMP > Call Trace: > [] ? nilfs_get_sb+0x165/0x532 [nilfs2] > [] ? ida_get_new_above+0x16d/0x187 > [] ? alloc_vfsmnt+0x7e/0x10a > [] ? kstrdup+0x2c/0x40 > [] ? vfs_kern_mount+0x96/0x14e > [] ? do_kern_mount+0x32/0xbd > [] ? do_mount+0x642/0x6a1 > [] ? do_page_fault+0x0/0x2d1 > [] ? copy_mount_options+0x80/0xe2 > [] ? strndup_user+0x48/0x67 > [] ? sys_mount+0x61/0x90 > [] ? sysenter_do_call+0x12/0x22 This fixes the problem. Signed-off-by: Ryusuke Konishi Tested-by: Ryusuke Konishi Cc: stable@kernel.org --- fs/nilfs2/super.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 1fa86b9df73b..bee60c04109a 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -400,9 +400,10 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno) list_add(&sbi->s_list, &nilfs->ns_supers); up_write(&nilfs->ns_super_sem); + err = -ENOMEM; sbi->s_ifile = nilfs_ifile_new(sbi, nilfs->ns_inode_size); if (!sbi->s_ifile) - return -ENOMEM; + goto delist; down_read(&nilfs->ns_segctor_sem); err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp, @@ -433,6 +434,7 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno) nilfs_mdt_destroy(sbi->s_ifile); sbi->s_ifile = NULL; + delist: down_write(&nilfs->ns_super_sem); list_del_init(&sbi->s_list); up_write(&nilfs->ns_super_sem); -- cgit v1.2.3 From ea1a16f7168ac19d974ac51b47593b92280e7992 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Sun, 15 Aug 2010 20:16:11 +0900 Subject: nilfs2: fix false warning saying one of two super blocks is broken After applying commit b2ac86e1, the following message got appeared after unclean shutdown: > NILFS warning: broken superblock. using spare superblock. This turns out to be a false message due to the change which updates two super blocks alternately. The secondary super block now can be selected if it's newer than the primary one. This kills the false warning by suppressing it if another super block is not actually broken. Signed-off-by: Ryusuke Konishi --- fs/nilfs2/the_nilfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 37de1f062d81..6af1c0073e9e 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -608,11 +608,11 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs, return -EINVAL; } - if (swp) { + if (!valid[!swp]) printk(KERN_WARNING "NILFS warning: broken superblock. " "using spare superblock.\n"); + if (swp) nilfs_swap_super_block(nilfs); - } nilfs->ns_sbwcount = 0; nilfs->ns_sbwtime = le64_to_cpu(sbp[0]->s_wtime); -- cgit v1.2.3 From 5d9ac7fd32f600f9451ea58abdb07f7ed42e921d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 5 Aug 2010 13:58:22 -0400 Subject: cifs: clean up error handling in cifs_mknod Get rid of some nesting and add a label we can goto on error. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/dir.c | 149 +++++++++++++++++++++++++++++----------------------------- 1 file changed, 74 insertions(+), 75 deletions(-) (limited to 'fs') diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 578d88c5b46e..f17d50047f07 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -496,6 +496,11 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode, struct cifsTconInfo *pTcon; char *full_path = NULL; struct inode *newinode = NULL; + int oplock = 0; + u16 fileHandle; + FILE_ALL_INFO *buf = NULL; + unsigned int bytes_written; + struct win_dev *pdev; if (!old_valid_dev(device_number)) return -EINVAL; @@ -506,9 +511,12 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode, pTcon = cifs_sb->tcon; full_path = build_path_from_dentry(direntry); - if (full_path == NULL) + if (full_path == NULL) { rc = -ENOMEM; - else if (pTcon->unix_ext) { + goto mknod_out; + } + + if (pTcon->unix_ext) { struct cifs_unix_set_info_args args = { .mode = mode & ~current_umask(), .ctime = NO_CHANGE_64, @@ -527,87 +535,78 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc) + goto mknod_out; - if (!rc) { - rc = cifs_get_inode_info_unix(&newinode, full_path, + rc = cifs_get_inode_info_unix(&newinode, full_path, inode->i_sb, xid); - if (pTcon->nocase) - direntry->d_op = &cifs_ci_dentry_ops; - else - direntry->d_op = &cifs_dentry_ops; - if (rc == 0) - d_instantiate(direntry, newinode); - } - } else { - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) { - int oplock = 0; - u16 fileHandle; - FILE_ALL_INFO *buf; + if (pTcon->nocase) + direntry->d_op = &cifs_ci_dentry_ops; + else + direntry->d_op = &cifs_dentry_ops; - cFYI(1, "sfu compat create special file"); + if (rc == 0) + d_instantiate(direntry, newinode); + goto mknod_out; + } - buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); - if (buf == NULL) { - kfree(full_path); - rc = -ENOMEM; - FreeXid(xid); - return rc; - } + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL)) + goto mknod_out; - rc = CIFSSMBOpen(xid, pTcon, full_path, - FILE_CREATE, /* fail if exists */ - GENERIC_WRITE /* BB would - WRITE_OWNER | WRITE_DAC be better? */, - /* Create a file and set the - file attribute to SYSTEM */ - CREATE_NOT_DIR | CREATE_OPTION_SPECIAL, - &fileHandle, &oplock, buf, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - - /* BB FIXME - add handling for backlevel servers - which need legacy open and check for all - calls to SMBOpen for fallback to SMBLeagcyOpen */ - if (!rc) { - /* BB Do not bother to decode buf since no - local inode yet to put timestamps in, - but we can reuse it safely */ - unsigned int bytes_written; - struct win_dev *pdev; - pdev = (struct win_dev *)buf; - if (S_ISCHR(mode)) { - memcpy(pdev->type, "IntxCHR", 8); - pdev->major = - cpu_to_le64(MAJOR(device_number)); - pdev->minor = - cpu_to_le64(MINOR(device_number)); - rc = CIFSSMBWrite(xid, pTcon, - fileHandle, - sizeof(struct win_dev), - 0, &bytes_written, (char *)pdev, - NULL, 0); - } else if (S_ISBLK(mode)) { - memcpy(pdev->type, "IntxBLK", 8); - pdev->major = - cpu_to_le64(MAJOR(device_number)); - pdev->minor = - cpu_to_le64(MINOR(device_number)); - rc = CIFSSMBWrite(xid, pTcon, - fileHandle, - sizeof(struct win_dev), - 0, &bytes_written, (char *)pdev, - NULL, 0); - } /* else if(S_ISFIFO */ - CIFSSMBClose(xid, pTcon, fileHandle); - d_drop(direntry); - } - kfree(buf); - /* add code here to set EAs */ - } + + cFYI(1, "sfu compat create special file"); + + buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); + if (buf == NULL) { + kfree(full_path); + rc = -ENOMEM; + FreeXid(xid); + return rc; } + /* FIXME: would WRITE_OWNER | WRITE_DAC be better? */ + rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_CREATE, + GENERIC_WRITE, CREATE_NOT_DIR | CREATE_OPTION_SPECIAL, + &fileHandle, &oplock, buf, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc) + goto mknod_out; + + /* BB Do not bother to decode buf since no local inode yet to put + * timestamps in, but we can reuse it safely */ + + pdev = (struct win_dev *)buf; + if (S_ISCHR(mode)) { + memcpy(pdev->type, "IntxCHR", 8); + pdev->major = + cpu_to_le64(MAJOR(device_number)); + pdev->minor = + cpu_to_le64(MINOR(device_number)); + rc = CIFSSMBWrite(xid, pTcon, + fileHandle, + sizeof(struct win_dev), + 0, &bytes_written, (char *)pdev, + NULL, 0); + } else if (S_ISBLK(mode)) { + memcpy(pdev->type, "IntxBLK", 8); + pdev->major = + cpu_to_le64(MAJOR(device_number)); + pdev->minor = + cpu_to_le64(MINOR(device_number)); + rc = CIFSSMBWrite(xid, pTcon, + fileHandle, + sizeof(struct win_dev), + 0, &bytes_written, (char *)pdev, + NULL, 0); + } /* else if (S_ISFIFO) */ + CIFSSMBClose(xid, pTcon, fileHandle); + d_drop(direntry); + + /* FIXME: add code here to set EAs */ + +mknod_out: kfree(full_path); + kfree(buf); FreeXid(xid); return rc; } -- cgit v1.2.3 From 232341ba7fa15115d40f6aa0f8dd14e96e3ad375 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 5 Aug 2010 13:58:38 -0400 Subject: cifs: consolidate error handling in several functions cifs has a lot of complicated functions that have to clean up things on error, but some of them don't have all of the cleanup code well-consolidated. Clean up and consolidate error handling in several functions. This is in preparation of later patches that will need to put references to the tcon link container. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/dir.c | 8 +++----- fs/cifs/file.c | 3 +-- 2 files changed, 4 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index f17d50047f07..f9ed0751cc12 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -305,8 +305,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, full_path = build_path_from_dentry(direntry); if (full_path == NULL) { rc = -ENOMEM; - FreeXid(xid); - return rc; + goto cifs_create_out; } if (oplockEnabled) @@ -365,9 +364,8 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); if (buf == NULL) { - kfree(full_path); - FreeXid(xid); - return -ENOMEM; + rc = -ENOMEM; + goto cifs_create_out; } /* diff --git a/fs/cifs/file.c b/fs/cifs/file.c index db11fdef0e92..de748c652d11 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -242,8 +242,7 @@ int cifs_open(struct inode *inode, struct file *file) full_path = build_path_from_dentry(file->f_path.dentry); if (full_path == NULL) { rc = -ENOMEM; - FreeXid(xid); - return rc; + goto out; } cFYI(1, "inode = 0x%p file flags are 0x%x for %s", -- cgit v1.2.3 From df486a25900f4dba9cdc3886c4ac871951c6aef3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 17 Aug 2010 17:42:45 -0400 Subject: NFS: Fix the selection of security flavours in Kconfig Randy Dunlap reports: ERROR: "svc_gss_principal" [fs/nfs/nfs.ko] undefined! because in fs/nfs/Kconfig, NFS_V4 selects RPCSEC_GSS_KRB5 and/or in fs/nfsd/Kconfig, NFSD_V4 selects RPCSEC_GSS_KRB5. RPCSEC_GSS_KRB5 does 5 selects, but none of these is enforced/followed by the fs/nfs[d]/Kconfig configs: select SUNRPC_GSS select CRYPTO select CRYPTO_MD5 select CRYPTO_DES select CRYPTO_CBC Reported-by: Randy Dunlap Cc: J. Bruce Fields Acked-by: Randy Dunlap Signed-off-by: Trond Myklebust --- fs/nfs/Kconfig | 1 - fs/nfsd/Kconfig | 1 - net/sunrpc/Kconfig | 9 +++++---- 3 files changed, 5 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index cc1bb33b59b8..2ddc384ec042 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -63,7 +63,6 @@ config NFS_V3_ACL config NFS_V4 bool "NFS client support for NFS version 4" depends on NFS_FS - select RPCSEC_GSS_KRB5 help This option enables support for version 4 of the NFS protocol (RFC 3530) in the kernel's NFS client. diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 503b9da159a3..95932f523aef 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -69,7 +69,6 @@ config NFSD_V4 depends on NFSD && PROC_FS && EXPERIMENTAL select NFSD_V3 select FS_POSIX_ACL - select RPCSEC_GSS_KRB5 help This option enables support in your system's NFS server for version 4 of the NFS protocol (RFC 3530). diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig index 443c161eb8bd..3376d7657185 100644 --- a/net/sunrpc/Kconfig +++ b/net/sunrpc/Kconfig @@ -18,10 +18,11 @@ config SUNRPC_XPRT_RDMA If unsure, say N. config RPCSEC_GSS_KRB5 - tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)" - depends on SUNRPC && EXPERIMENTAL + tristate + depends on SUNRPC && CRYPTO + prompt "Secure RPC: Kerberos V mechanism" if !(NFS_V4 || NFSD_V4) + default y select SUNRPC_GSS - select CRYPTO select CRYPTO_MD5 select CRYPTO_DES select CRYPTO_CBC @@ -34,7 +35,7 @@ config RPCSEC_GSS_KRB5 available from http://linux-nfs.org/. In addition, user-space Kerberos support should be installed. - If unsure, say N. + If unsure, say Y. config RPCSEC_GSS_SPKM3 tristate "Secure RPC: SPKM3 mechanism (EXPERIMENTAL)" -- cgit v1.2.3 From d7627467b7a8dd6944885290a03a07ceb28c10eb Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 17 Aug 2010 23:52:56 +0100 Subject: Make do_execve() take a const filename pointer Make do_execve() take a const filename pointer so that kernel_execve() compiles correctly on ARM: arch/arm/kernel/sys_arm.c:88: warning: passing argument 1 of 'do_execve' discards qualifiers from pointer target type This also requires the argv and envp arguments to be consted twice, once for the pointer array and once for the strings the array points to. This is because do_execve() passes a pointer to the filename (now const) to copy_strings_kernel(). A simpler alternative would be to cast the filename pointer in do_execve() when it's passed to copy_strings_kernel(). do_execve() may not change any of the strings it is passed as part of the argv or envp lists as they are some of them in .rodata, so marking these strings as const should be fine. Further kernel_execve() and sys_execve() need to be changed to match. This has been test built on x86_64, frv, arm and mips. Signed-off-by: David Howells Tested-by: Ralf Baechle Acked-by: Russell King Signed-off-by: Linus Torvalds --- arch/alpha/kernel/process.c | 5 +++-- arch/arm/kernel/sys_arm.c | 14 +++++++++----- arch/avr32/kernel/process.c | 5 +++-- arch/avr32/kernel/sys_avr32.c | 4 +++- arch/blackfin/kernel/process.c | 4 +++- arch/cris/arch-v10/kernel/process.c | 4 +++- arch/cris/arch-v32/kernel/process.c | 6 ++++-- arch/frv/kernel/process.c | 5 +++-- arch/h8300/kernel/process.c | 5 ++++- arch/h8300/kernel/sys_h8300.c | 4 +++- arch/ia64/kernel/process.c | 4 +++- arch/m32r/kernel/process.c | 4 ++-- arch/m32r/kernel/sys_m32r.c | 4 +++- arch/m68k/kernel/process.c | 4 +++- arch/m68k/kernel/sys_m68k.c | 4 +++- arch/m68knommu/kernel/process.c | 4 +++- arch/m68knommu/kernel/sys_m68k.c | 4 +++- arch/microblaze/kernel/sys_microblaze.c | 10 +++++++--- arch/mips/kernel/syscall.c | 10 +++++++--- arch/mn10300/kernel/process.c | 4 ++-- arch/parisc/hpux/fs.c | 6 ++++-- arch/parisc/kernel/process.c | 15 ++++++++++----- arch/powerpc/kernel/process.c | 5 +++-- arch/s390/kernel/process.c | 5 +++-- arch/score/kernel/sys_score.c | 10 +++++++--- arch/sh/kernel/process_32.c | 7 ++++--- arch/sh/kernel/process_64.c | 4 ++-- arch/sh/kernel/sys_sh32.c | 4 +++- arch/sh/kernel/sys_sh64.c | 4 +++- arch/sparc/kernel/process_32.c | 6 ++++-- arch/sparc/kernel/process_64.c | 4 ++-- arch/sparc/kernel/sys_sparc_32.c | 4 +++- arch/sparc/kernel/sys_sparc_64.c | 4 +++- arch/tile/kernel/process.c | 5 +++-- arch/um/kernel/exec.c | 5 +++-- arch/um/kernel/syscall.c | 4 +++- arch/x86/include/asm/syscalls.h | 5 +++-- arch/x86/kernel/process.c | 5 +++-- arch/x86/kernel/sys_i386_32.c | 4 +++- arch/xtensa/kernel/process.c | 5 +++-- fs/binfmt_misc.c | 2 +- fs/binfmt_script.c | 3 ++- fs/exec.c | 21 +++++++++++---------- include/linux/binfmts.h | 7 ++++--- include/linux/sched.h | 4 +++- include/linux/syscalls.h | 2 +- init/do_mounts_initrd.c | 7 ++++--- init/main.c | 6 +++--- kernel/kmod.c | 4 +++- security/commoncap.c | 2 +- 50 files changed, 179 insertions(+), 98 deletions(-) (limited to 'fs') diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index 88e608aebc8c..842dba308eab 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -387,8 +387,9 @@ EXPORT_SYMBOL(dump_elf_task_fp); * sys_execve() executes a new program. */ asmlinkage int -do_sys_execve(const char __user *ufilename, char __user * __user *argv, - char __user * __user *envp, struct pt_regs *regs) +do_sys_execve(const char __user *ufilename, + const char __user *const __user *argv, + const char __user *const __user *envp, struct pt_regs *regs) { int error; char *filename; diff --git a/arch/arm/kernel/sys_arm.c b/arch/arm/kernel/sys_arm.c index 5b7c541a4c63..62e7c61d0342 100644 --- a/arch/arm/kernel/sys_arm.c +++ b/arch/arm/kernel/sys_arm.c @@ -62,8 +62,9 @@ asmlinkage int sys_vfork(struct pt_regs *regs) /* sys_execve() executes a new program. * This is called indirectly via a small wrapper */ -asmlinkage int sys_execve(const char __user *filenamei, char __user * __user *argv, - char __user * __user *envp, struct pt_regs *regs) +asmlinkage int sys_execve(const char __user *filenamei, + const char __user *const __user *argv, + const char __user *const __user *envp, struct pt_regs *regs) { int error; char * filename; @@ -78,14 +79,17 @@ out: return error; } -int kernel_execve(const char *filename, char *const argv[], char *const envp[]) +int kernel_execve(const char *filename, + const char *const argv[], + const char *const envp[]) { struct pt_regs regs; int ret; memset(®s, 0, sizeof(struct pt_regs)); - ret = do_execve(filename, (char __user * __user *)argv, - (char __user * __user *)envp, ®s); + ret = do_execve(filename, + (const char __user *const __user *)argv, + (const char __user *const __user *)envp, ®s); if (ret < 0) goto out; diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c index e5daddff397d..9c46aaad11ce 100644 --- a/arch/avr32/kernel/process.c +++ b/arch/avr32/kernel/process.c @@ -384,8 +384,9 @@ asmlinkage int sys_vfork(struct pt_regs *regs) } asmlinkage int sys_execve(const char __user *ufilename, - char __user *__user *uargv, - char __user *__user *uenvp, struct pt_regs *regs) + const char __user *const __user *uargv, + const char __user *const __user *uenvp, + struct pt_regs *regs) { int error; char *filename; diff --git a/arch/avr32/kernel/sys_avr32.c b/arch/avr32/kernel/sys_avr32.c index 459349b5ed5a..62635a09ae3e 100644 --- a/arch/avr32/kernel/sys_avr32.c +++ b/arch/avr32/kernel/sys_avr32.c @@ -7,7 +7,9 @@ */ #include -int kernel_execve(const char *file, char **argv, char **envp) +int kernel_execve(const char *file, + const char *const *argv, + const char *const *envp) { register long scno asm("r8") = __NR_execve; register long sc1 asm("r12") = (long)file; diff --git a/arch/blackfin/kernel/process.c b/arch/blackfin/kernel/process.c index a566f61c002a..01f98cb964d2 100644 --- a/arch/blackfin/kernel/process.c +++ b/arch/blackfin/kernel/process.c @@ -209,7 +209,9 @@ copy_thread(unsigned long clone_flags, /* * sys_execve() executes a new program. */ -asmlinkage int sys_execve(const char __user *name, char __user * __user *argv, char __user * __user *envp) +asmlinkage int sys_execve(const char __user *name, + const char __user *const __user *argv, + const char __user *const __user *envp) { int error; char *filename; diff --git a/arch/cris/arch-v10/kernel/process.c b/arch/cris/arch-v10/kernel/process.c index 93f0f64b1326..9a57db6907f5 100644 --- a/arch/cris/arch-v10/kernel/process.c +++ b/arch/cris/arch-v10/kernel/process.c @@ -204,7 +204,9 @@ asmlinkage int sys_vfork(long r10, long r11, long r12, long r13, long mof, long /* * sys_execve() executes a new program. */ -asmlinkage int sys_execve(const char *fname, char **argv, char **envp, +asmlinkage int sys_execve(const char *fname, + const char *const *argv, + const char *const *envp, long r13, long mof, long srp, struct pt_regs *regs) { diff --git a/arch/cris/arch-v32/kernel/process.c b/arch/cris/arch-v32/kernel/process.c index 2661a9529d70..562f84718906 100644 --- a/arch/cris/arch-v32/kernel/process.c +++ b/arch/cris/arch-v32/kernel/process.c @@ -218,8 +218,10 @@ sys_vfork(long r10, long r11, long r12, long r13, long mof, long srp, /* sys_execve() executes a new program. */ asmlinkage int -sys_execve(const char *fname, char **argv, char **envp, long r13, long mof, long srp, - struct pt_regs *regs) +sys_execve(const char *fname, + const char *const *argv, + const char *const *envp, long r13, long mof, long srp, + struct pt_regs *regs) { int error; char *filename; diff --git a/arch/frv/kernel/process.c b/arch/frv/kernel/process.c index 428931cf2f0c..2b63b0191f52 100644 --- a/arch/frv/kernel/process.c +++ b/arch/frv/kernel/process.c @@ -250,8 +250,9 @@ int copy_thread(unsigned long clone_flags, /* * sys_execve() executes a new program. */ -asmlinkage int sys_execve(const char __user *name, char __user * __user *argv, - char __user * __user *envp) +asmlinkage int sys_execve(const char __user *name, + const char __user *const __user *argv, + const char __user *const __user *envp) { int error; char * filename; diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c index 8b7b78d77d5c..97478138e361 100644 --- a/arch/h8300/kernel/process.c +++ b/arch/h8300/kernel/process.c @@ -212,7 +212,10 @@ int copy_thread(unsigned long clone_flags, /* * sys_execve() executes a new program. */ -asmlinkage int sys_execve(const char *name, char **argv, char **envp,int dummy,...) +asmlinkage int sys_execve(const char *name, + const char *const *argv, + const char *const *envp, + int dummy, ...) { int error; char * filename; diff --git a/arch/h8300/kernel/sys_h8300.c b/arch/h8300/kernel/sys_h8300.c index f9b3f44da69f..dc1ac0243b78 100644 --- a/arch/h8300/kernel/sys_h8300.c +++ b/arch/h8300/kernel/sys_h8300.c @@ -51,7 +51,9 @@ asmlinkage void syscall_print(void *dummy,...) * Do a system call from kernel instead of calling sys_execve so we * end up with proper pt_regs. */ -int kernel_execve(const char *filename, char *const argv[], char *const envp[]) +int kernel_execve(const char *filename, + const char *const argv[], + const char *const envp[]) { register long res __asm__("er0"); register char *const *_c __asm__("er3") = envp; diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index a879c03b7f1c..16f1c7b04c69 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -633,7 +633,9 @@ dump_fpu (struct pt_regs *pt, elf_fpregset_t dst) } long -sys_execve (const char __user *filename, char __user * __user *argv, char __user * __user *envp, +sys_execve (const char __user *filename, + const char __user *const __user *argv, + const char __user *const __user *envp, struct pt_regs *regs) { char *fname; diff --git a/arch/m32r/kernel/process.c b/arch/m32r/kernel/process.c index 8665a4d868ec..422bea9f1dbc 100644 --- a/arch/m32r/kernel/process.c +++ b/arch/m32r/kernel/process.c @@ -289,8 +289,8 @@ asmlinkage int sys_vfork(unsigned long r0, unsigned long r1, unsigned long r2, * sys_execve() executes a new program. */ asmlinkage int sys_execve(const char __user *ufilename, - char __user * __user *uargv, - char __user * __user *uenvp, + const char __user *const __user *uargv, + const char __user *const __user *uenvp, unsigned long r3, unsigned long r4, unsigned long r5, unsigned long r6, struct pt_regs regs) { diff --git a/arch/m32r/kernel/sys_m32r.c b/arch/m32r/kernel/sys_m32r.c index 0a00f467edfa..d841fb6cc703 100644 --- a/arch/m32r/kernel/sys_m32r.c +++ b/arch/m32r/kernel/sys_m32r.c @@ -93,7 +93,9 @@ asmlinkage int sys_cachectl(char *addr, int nbytes, int op) * Do a system call from kernel instead of calling sys_execve so we * end up with proper pt_regs. */ -int kernel_execve(const char *filename, char *const argv[], char *const envp[]) +int kernel_execve(const char *filename, + const char *const argv[], + const char *const envp[]) { register long __scno __asm__ ("r7") = __NR_execve; register long __arg3 __asm__ ("r2") = (long)(envp); diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c index 221d0b71ce39..18732ab23292 100644 --- a/arch/m68k/kernel/process.c +++ b/arch/m68k/kernel/process.c @@ -315,7 +315,9 @@ EXPORT_SYMBOL(dump_fpu); /* * sys_execve() executes a new program. */ -asmlinkage int sys_execve(const char __user *name, char __user * __user *argv, char __user * __user *envp) +asmlinkage int sys_execve(const char __user *name, + const char __user *const __user *argv, + const char __user *const __user *envp) { int error; char * filename; diff --git a/arch/m68k/kernel/sys_m68k.c b/arch/m68k/kernel/sys_m68k.c index 77896692eb0a..2f431ece7b5f 100644 --- a/arch/m68k/kernel/sys_m68k.c +++ b/arch/m68k/kernel/sys_m68k.c @@ -459,7 +459,9 @@ asmlinkage int sys_getpagesize(void) * Do a system call from kernel instead of calling sys_execve so we * end up with proper pt_regs. */ -int kernel_execve(const char *filename, char *const argv[], char *const envp[]) +int kernel_execve(const char *filename, + const char *const argv[], + const char *const envp[]) { register long __res asm ("%d0") = __NR_execve; register long __a asm ("%d1") = (long)(filename); diff --git a/arch/m68knommu/kernel/process.c b/arch/m68knommu/kernel/process.c index 6350f68cd026..4d090d3c0897 100644 --- a/arch/m68knommu/kernel/process.c +++ b/arch/m68knommu/kernel/process.c @@ -350,7 +350,9 @@ void dump(struct pt_regs *fp) /* * sys_execve() executes a new program. */ -asmlinkage int sys_execve(const char *name, char **argv, char **envp) +asmlinkage int sys_execve(const char *name, + const char *const *argv, + const char *const *envp) { int error; char * filename; diff --git a/arch/m68knommu/kernel/sys_m68k.c b/arch/m68knommu/kernel/sys_m68k.c index d65e9c4c930c..68488ae47f0a 100644 --- a/arch/m68knommu/kernel/sys_m68k.c +++ b/arch/m68knommu/kernel/sys_m68k.c @@ -44,7 +44,9 @@ asmlinkage int sys_getpagesize(void) * Do a system call from kernel instead of calling sys_execve so we * end up with proper pt_regs. */ -int kernel_execve(const char *filename, char *const argv[], char *const envp[]) +int kernel_execve(const char *filename, + const char *const argv[], + const char *const envp[]) { register long __res asm ("%d0") = __NR_execve; register long __a asm ("%d1") = (long)(filename); diff --git a/arch/microblaze/kernel/sys_microblaze.c b/arch/microblaze/kernel/sys_microblaze.c index 6abab6ebedbe..2250fe9d269b 100644 --- a/arch/microblaze/kernel/sys_microblaze.c +++ b/arch/microblaze/kernel/sys_microblaze.c @@ -47,8 +47,10 @@ asmlinkage long microblaze_clone(int flags, unsigned long stack, struct pt_regs return do_fork(flags, stack, regs, 0, NULL, NULL); } -asmlinkage long microblaze_execve(const char __user *filenamei, char __user *__user *argv, - char __user *__user *envp, struct pt_regs *regs) +asmlinkage long microblaze_execve(const char __user *filenamei, + const char __user *const __user *argv, + const char __user *const __user *envp, + struct pt_regs *regs) { int error; char *filename; @@ -77,7 +79,9 @@ asmlinkage long sys_mmap(unsigned long addr, unsigned long len, * Do a system call from kernel instead of calling sys_execve so we * end up with proper pt_regs. */ -int kernel_execve(const char *filename, char *const argv[], char *const envp[]) +int kernel_execve(const char *filename, + const char *const argv[], + const char *const envp[]) { register const char *__a __asm__("r5") = filename; register const void *__b __asm__("r6") = argv; diff --git a/arch/mips/kernel/syscall.c b/arch/mips/kernel/syscall.c index bddce0bca195..1dc6edff45e0 100644 --- a/arch/mips/kernel/syscall.c +++ b/arch/mips/kernel/syscall.c @@ -258,8 +258,10 @@ asmlinkage int sys_execve(nabi_no_regargs struct pt_regs regs) error = PTR_ERR(filename); if (IS_ERR(filename)) goto out; - error = do_execve(filename, (char __user *__user *) (long)regs.regs[5], - (char __user *__user *) (long)regs.regs[6], ®s); + error = do_execve(filename, + (const char __user *const __user *) (long)regs.regs[5], + (const char __user *const __user *) (long)regs.regs[6], + ®s); putname(filename); out: @@ -436,7 +438,9 @@ asmlinkage void bad_stack(void) * Do a system call from kernel instead of calling sys_execve so we * end up with proper pt_regs. */ -int kernel_execve(const char *filename, char *const argv[], char *const envp[]) +int kernel_execve(const char *filename, + const char *const argv[], + const char *const envp[]) { register unsigned long __a0 asm("$4") = (unsigned long) filename; register unsigned long __a1 asm("$5") = (unsigned long) argv; diff --git a/arch/mn10300/kernel/process.c b/arch/mn10300/kernel/process.c index 762eb325b949..f48373e2bc1c 100644 --- a/arch/mn10300/kernel/process.c +++ b/arch/mn10300/kernel/process.c @@ -269,8 +269,8 @@ asmlinkage long sys_vfork(void) } asmlinkage long sys_execve(const char __user *name, - char __user * __user *argv, - char __user * __user *envp) + const char __user *const __user *argv, + const char __user *const __user *envp) { char *filename; int error; diff --git a/arch/parisc/hpux/fs.c b/arch/parisc/hpux/fs.c index 1444875a7611..0dc8543acb4f 100644 --- a/arch/parisc/hpux/fs.c +++ b/arch/parisc/hpux/fs.c @@ -41,8 +41,10 @@ int hpux_execve(struct pt_regs *regs) if (IS_ERR(filename)) goto out; - error = do_execve(filename, (char __user * __user *) regs->gr[25], - (char __user * __user *) regs->gr[24], regs); + error = do_execve(filename, + (const char __user *const __user *) regs->gr[25], + (const char __user *const __user *) regs->gr[24], + regs); putname(filename); diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index 76332dadc6e9..4b4b9181a1a0 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -348,17 +348,22 @@ asmlinkage int sys_execve(struct pt_regs *regs) error = PTR_ERR(filename); if (IS_ERR(filename)) goto out; - error = do_execve(filename, (char __user * __user *) regs->gr[25], - (char __user * __user *) regs->gr[24], regs); + error = do_execve(filename, + (const char __user *const __user *) regs->gr[25], + (const char __user *const __user *) regs->gr[24], + regs); putname(filename); out: return error; } -extern int __execve(const char *filename, char *const argv[], - char *const envp[], struct task_struct *task); -int kernel_execve(const char *filename, char *const argv[], char *const envp[]) +extern int __execve(const char *filename, + const char *const argv[], + const char *const envp[], struct task_struct *task); +int kernel_execve(const char *filename, + const char *const argv[], + const char *const envp[]) { return __execve(filename, argv, envp, current); } diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index feacfb789686..91356ffda2ca 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1034,8 +1034,9 @@ int sys_execve(unsigned long a0, unsigned long a1, unsigned long a2, flush_fp_to_thread(current); flush_altivec_to_thread(current); flush_spe_to_thread(current); - error = do_execve(filename, (char __user * __user *) a1, - (char __user * __user *) a2, regs); + error = do_execve(filename, + (const char __user *const __user *) a1, + (const char __user *const __user *) a2, regs); putname(filename); out: return error; diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 7eafaf2662b9..d3a2d1c6438e 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -267,8 +267,9 @@ asmlinkage void execve_tail(void) /* * sys_execve() executes a new program. */ -SYSCALL_DEFINE3(execve, const char __user *, name, char __user * __user *, argv, - char __user * __user *, envp) +SYSCALL_DEFINE3(execve, const char __user *, name, + const char __user *const __user *, argv, + const char __user *const __user *, envp) { struct pt_regs *regs = task_pt_regs(current); char *filename; diff --git a/arch/score/kernel/sys_score.c b/arch/score/kernel/sys_score.c index 651096ff8db4..e478bf9a7e91 100644 --- a/arch/score/kernel/sys_score.c +++ b/arch/score/kernel/sys_score.c @@ -99,8 +99,10 @@ score_execve(struct pt_regs *regs) if (IS_ERR(filename)) return error; - error = do_execve(filename, (char __user *__user*)regs->regs[5], - (char __user *__user *) regs->regs[6], regs); + error = do_execve(filename, + (const char __user *const __user *)regs->regs[5], + (const char __user *const __user *)regs->regs[6], + regs); putname(filename); return error; @@ -110,7 +112,9 @@ score_execve(struct pt_regs *regs) * Do a system call from kernel instead of calling sys_execve so we * end up with proper pt_regs. */ -int kernel_execve(const char *filename, char *const argv[], char *const envp[]) +int kernel_execve(const char *filename, + const char *const argv[], + const char *const envp[]) { register unsigned long __r4 asm("r4") = (unsigned long) filename; register unsigned long __r5 asm("r5") = (unsigned long) argv; diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c index 052981972ae6..762a13984bbd 100644 --- a/arch/sh/kernel/process_32.c +++ b/arch/sh/kernel/process_32.c @@ -296,9 +296,10 @@ asmlinkage int sys_vfork(unsigned long r4, unsigned long r5, /* * sys_execve() executes a new program. */ -asmlinkage int sys_execve(char __user *ufilename, char __user * __user *uargv, - char __user * __user *uenvp, unsigned long r7, - struct pt_regs __regs) +asmlinkage int sys_execve(const char __user *ufilename, + const char __user *const __user *uargv, + const char __user *const __user *uenvp, + unsigned long r7, struct pt_regs __regs) { struct pt_regs *regs = RELOC_HIDE(&__regs, 0); int error; diff --git a/arch/sh/kernel/process_64.c b/arch/sh/kernel/process_64.c index 68d128d651b3..210c1cabcb7f 100644 --- a/arch/sh/kernel/process_64.c +++ b/arch/sh/kernel/process_64.c @@ -497,8 +497,8 @@ asmlinkage int sys_execve(const char *ufilename, char **uargv, goto out; error = do_execve(filename, - (char __user * __user *)uargv, - (char __user * __user *)uenvp, + (const char __user *const __user *)uargv, + (const char __user *const __user *)uenvp, pregs); putname(filename); out: diff --git a/arch/sh/kernel/sys_sh32.c b/arch/sh/kernel/sys_sh32.c index eb68bfdd86e6..f56b6fe5c5d0 100644 --- a/arch/sh/kernel/sys_sh32.c +++ b/arch/sh/kernel/sys_sh32.c @@ -71,7 +71,9 @@ asmlinkage int sys_fadvise64_64_wrapper(int fd, u32 offset0, u32 offset1, * Do a system call from kernel instead of calling sys_execve so we * end up with proper pt_regs. */ -int kernel_execve(const char *filename, char *const argv[], char *const envp[]) +int kernel_execve(const char *filename, + const char *const argv[], + const char *const envp[]) { register long __sc0 __asm__ ("r3") = __NR_execve; register long __sc4 __asm__ ("r4") = (long) filename; diff --git a/arch/sh/kernel/sys_sh64.c b/arch/sh/kernel/sys_sh64.c index 287235768bc5..c5a38c4bf410 100644 --- a/arch/sh/kernel/sys_sh64.c +++ b/arch/sh/kernel/sys_sh64.c @@ -33,7 +33,9 @@ * Do a system call from kernel instead of calling sys_execve so we * end up with proper pt_regs. */ -int kernel_execve(const char *filename, char *const argv[], char *const envp[]) +int kernel_execve(const char *filename, + const char *const argv[], + const char *const envp[]) { register unsigned long __sc0 __asm__ ("r9") = ((0x13 << 16) | __NR_execve); register unsigned long __sc2 __asm__ ("r2") = (unsigned long) filename; diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index 40e29fc8a4d6..17529298c50a 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -633,8 +633,10 @@ asmlinkage int sparc_execve(struct pt_regs *regs) if(IS_ERR(filename)) goto out; error = do_execve(filename, - (char __user * __user *)regs->u_regs[base + UREG_I1], - (char __user * __user *)regs->u_regs[base + UREG_I2], + (const char __user *const __user *) + regs->u_regs[base + UREG_I1], + (const char __user *const __user *) + regs->u_regs[base + UREG_I2], regs); putname(filename); out: diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index dbe81a368b45..485f54748384 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -739,9 +739,9 @@ asmlinkage int sparc_execve(struct pt_regs *regs) if (IS_ERR(filename)) goto out; error = do_execve(filename, - (char __user * __user *) + (const char __user *const __user *) regs->u_regs[base + UREG_I1], - (char __user * __user *) + (const char __user *const __user *) regs->u_regs[base + UREG_I2], regs); putname(filename); if (!error) { diff --git a/arch/sparc/kernel/sys_sparc_32.c b/arch/sparc/kernel/sys_sparc_32.c index ee995b7dae7e..50794137d710 100644 --- a/arch/sparc/kernel/sys_sparc_32.c +++ b/arch/sparc/kernel/sys_sparc_32.c @@ -282,7 +282,9 @@ out: * Do a system call from kernel instead of calling sys_execve so we * end up with proper pt_regs. */ -int kernel_execve(const char *filename, char *const argv[], char *const envp[]) +int kernel_execve(const char *filename, + const char *const argv[], + const char *const envp[]) { long __res; register long __g1 __asm__ ("g1") = __NR_execve; diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c index 3d435c42e6db..f836f4e93afe 100644 --- a/arch/sparc/kernel/sys_sparc_64.c +++ b/arch/sparc/kernel/sys_sparc_64.c @@ -758,7 +758,9 @@ SYSCALL_DEFINE5(rt_sigaction, int, sig, const struct sigaction __user *, act, * Do a system call from kernel instead of calling sys_execve so we * end up with proper pt_regs. */ -int kernel_execve(const char *filename, char *const argv[], char *const envp[]) +int kernel_execve(const char *filename, + const char *const argv[], + const char *const envp[]) { long __res; register long __g1 __asm__ ("g1") = __NR_execve; diff --git a/arch/tile/kernel/process.c b/arch/tile/kernel/process.c index ed590ad0acdc..985cc28c74c5 100644 --- a/arch/tile/kernel/process.c +++ b/arch/tile/kernel/process.c @@ -543,8 +543,9 @@ long _sys_vfork(struct pt_regs *regs) /* * sys_execve() executes a new program. */ -long _sys_execve(char __user *path, char __user *__user *argv, - char __user *__user *envp, struct pt_regs *regs) +long _sys_execve(const char __user *path, + const char __user *const __user *argv, + const char __user *const __user *envp, struct pt_regs *regs) { long error; char *filename; diff --git a/arch/um/kernel/exec.c b/arch/um/kernel/exec.c index 59b20d93b6d4..cd145eda3579 100644 --- a/arch/um/kernel/exec.c +++ b/arch/um/kernel/exec.c @@ -44,8 +44,9 @@ void start_thread(struct pt_regs *regs, unsigned long eip, unsigned long esp) PT_REGS_SP(regs) = esp; } -static long execve1(const char *file, char __user * __user *argv, - char __user *__user *env) +static long execve1(const char *file, + const char __user *const __user *argv, + const char __user *const __user *env) { long error; diff --git a/arch/um/kernel/syscall.c b/arch/um/kernel/syscall.c index 7427c0b1930c..5ddb246626db 100644 --- a/arch/um/kernel/syscall.c +++ b/arch/um/kernel/syscall.c @@ -51,7 +51,9 @@ long old_mmap(unsigned long addr, unsigned long len, return err; } -int kernel_execve(const char *filename, char *const argv[], char *const envp[]) +int kernel_execve(const char *filename, + const char *const argv[], + const char *const envp[]) { mm_segment_t fs; int ret; diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index feb2ff9bfc2d..f1d8b441fc77 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -23,8 +23,9 @@ long sys_iopl(unsigned int, struct pt_regs *); /* kernel/process.c */ int sys_fork(struct pt_regs *); int sys_vfork(struct pt_regs *); -long sys_execve(const char __user *, char __user * __user *, - char __user * __user *, struct pt_regs *); +long sys_execve(const char __user *, + const char __user *const __user *, + const char __user *const __user *, struct pt_regs *); long sys_clone(unsigned long, unsigned long, void __user *, void __user *, struct pt_regs *); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 64ecaf0af9af..57d1868a86aa 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -301,8 +301,9 @@ EXPORT_SYMBOL(kernel_thread); /* * sys_execve() executes a new program. */ -long sys_execve(const char __user *name, char __user * __user *argv, - char __user * __user *envp, struct pt_regs *regs) +long sys_execve(const char __user *name, + const char __user *const __user *argv, + const char __user *const __user *envp, struct pt_regs *regs) { long error; char *filename; diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c index 196552bb412c..d5e06624e34a 100644 --- a/arch/x86/kernel/sys_i386_32.c +++ b/arch/x86/kernel/sys_i386_32.c @@ -28,7 +28,9 @@ * Do a system call from kernel instead of calling sys_execve so we * end up with proper pt_regs. */ -int kernel_execve(const char *filename, char *const argv[], char *const envp[]) +int kernel_execve(const char *filename, + const char *const argv[], + const char *const envp[]) { long __res; asm volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx" diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c index 7c2f38f68ebb..e3558b9a58ba 100644 --- a/arch/xtensa/kernel/process.c +++ b/arch/xtensa/kernel/process.c @@ -318,8 +318,9 @@ long xtensa_clone(unsigned long clone_flags, unsigned long newsp, */ asmlinkage -long xtensa_execve(const char __user *name, char __user * __user *argv, - char __user * __user *envp, +long xtensa_execve(const char __user *name, + const char __user *const __user *argv, + const char __user *const __user *envp, long a3, long a4, long a5, struct pt_regs *regs) { diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 9e60fd201716..a7528b913936 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -108,7 +108,7 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs) Node *fmt; struct file * interp_file = NULL; char iname[BINPRM_BUF_SIZE]; - char *iname_addr = iname; + const char *iname_addr = iname; int retval; int fd_binary = -1; diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c index aca9d55afb22..396a9884591f 100644 --- a/fs/binfmt_script.c +++ b/fs/binfmt_script.c @@ -16,7 +16,8 @@ static int load_script(struct linux_binprm *bprm,struct pt_regs *regs) { - char *cp, *i_name, *i_arg; + const char *i_arg, *i_name; + char *cp; struct file *file; char interp[BINPRM_BUF_SIZE]; int retval; diff --git a/fs/exec.c b/fs/exec.c index 7761837e4500..05c7d6b84df7 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -361,13 +361,13 @@ err: /* * count() counts the number of strings in array ARGV. */ -static int count(char __user * __user * argv, int max) +static int count(const char __user * const __user * argv, int max) { int i = 0; if (argv != NULL) { for (;;) { - char __user * p; + const char __user * p; if (get_user(p, argv)) return -EFAULT; @@ -387,7 +387,7 @@ static int count(char __user * __user * argv, int max) * processes's memory to the new process's stack. The call to get_user_pages() * ensures the destination page is created and not swapped out. */ -static int copy_strings(int argc, char __user * __user * argv, +static int copy_strings(int argc, const char __user *const __user *argv, struct linux_binprm *bprm) { struct page *kmapped_page = NULL; @@ -396,7 +396,7 @@ static int copy_strings(int argc, char __user * __user * argv, int ret; while (argc-- > 0) { - char __user *str; + const char __user *str; int len; unsigned long pos; @@ -470,12 +470,13 @@ out: /* * Like copy_strings, but get argv and its values from kernel memory. */ -int copy_strings_kernel(int argc,char ** argv, struct linux_binprm *bprm) +int copy_strings_kernel(int argc, const char *const *argv, + struct linux_binprm *bprm) { int r; mm_segment_t oldfs = get_fs(); set_fs(KERNEL_DS); - r = copy_strings(argc, (char __user * __user *)argv, bprm); + r = copy_strings(argc, (const char __user *const __user *)argv, bprm); set_fs(oldfs); return r; } @@ -997,7 +998,7 @@ EXPORT_SYMBOL(flush_old_exec); void setup_new_exec(struct linux_binprm * bprm) { int i, ch; - char * name; + const char *name; char tcomm[sizeof(current->comm)]; arch_pick_mmap_layout(current->mm); @@ -1316,9 +1317,9 @@ EXPORT_SYMBOL(search_binary_handler); /* * sys_execve() executes a new program. */ -int do_execve(char * filename, - char __user *__user *argv, - char __user *__user *envp, +int do_execve(const char * filename, + const char __user *const __user *argv, + const char __user *const __user *envp, struct pt_regs * regs) { struct linux_binprm *bprm; diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index c809e286d213..a065612fc928 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -50,8 +50,8 @@ struct linux_binprm{ int unsafe; /* how unsafe this exec is (mask of LSM_UNSAFE_*) */ unsigned int per_clear; /* bits to clear in current->personality */ int argc, envc; - char * filename; /* Name of binary as seen by procps */ - char * interp; /* Name of the binary really executed. Most + const char * filename; /* Name of binary as seen by procps */ + const char * interp; /* Name of the binary really executed. Most of the time same as filename, but could be different for binfmt_{misc,script} */ unsigned interp_flags; @@ -126,7 +126,8 @@ extern int setup_arg_pages(struct linux_binprm * bprm, unsigned long stack_top, int executable_stack); extern int bprm_mm_init(struct linux_binprm *bprm); -extern int copy_strings_kernel(int argc,char ** argv,struct linux_binprm *bprm); +extern int copy_strings_kernel(int argc, const char *const *argv, + struct linux_binprm *bprm); extern int prepare_bprm_creds(struct linux_binprm *bprm); extern void install_exec_creds(struct linux_binprm *bprm); extern void do_coredump(long signr, int exit_code, struct pt_regs *regs); diff --git a/include/linux/sched.h b/include/linux/sched.h index ce160d68f5e7..1e2a6db2d7dd 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2109,7 +2109,9 @@ extern void daemonize(const char *, ...); extern int allow_signal(int); extern int disallow_signal(int); -extern int do_execve(char *, char __user * __user *, char __user * __user *, struct pt_regs *); +extern int do_execve(const char *, + const char __user * const __user *, + const char __user * const __user *, struct pt_regs *); extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *); struct task_struct *fork_idle(int); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 6e5d19788634..e6319d18a55d 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -820,7 +820,7 @@ asmlinkage long sys_fanotify_mark(int fanotify_fd, unsigned int flags, u64 mask, int fd, const char __user *pathname); -int kernel_execve(const char *filename, char *const argv[], char *const envp[]); +int kernel_execve(const char *filename, const char *const argv[], const char *const envp[]); asmlinkage long sys_perf_event_open( diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c index 2b108538d0d9..3098a38f3ae1 100644 --- a/init/do_mounts_initrd.c +++ b/init/do_mounts_initrd.c @@ -24,10 +24,11 @@ static int __init no_initrd(char *str) __setup("noinitrd", no_initrd); -static int __init do_linuxrc(void * shell) +static int __init do_linuxrc(void *_shell) { - static char *argv[] = { "linuxrc", NULL, }; - extern char * envp_init[]; + static const char *argv[] = { "linuxrc", NULL, }; + extern const char *envp_init[]; + const char *shell = _shell; sys_close(old_fd);sys_close(root_fd); sys_setsid(); diff --git a/init/main.c b/init/main.c index 22d61cb06f98..94ab488039aa 100644 --- a/init/main.c +++ b/init/main.c @@ -197,8 +197,8 @@ static int __init set_reset_devices(char *str) __setup("reset_devices", set_reset_devices); -static char * argv_init[MAX_INIT_ARGS+2] = { "init", NULL, }; -char * envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, }; +static const char * argv_init[MAX_INIT_ARGS+2] = { "init", NULL, }; +const char * envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, }; static const char *panic_later, *panic_param; extern const struct obs_kernel_param __setup_start[], __setup_end[]; @@ -809,7 +809,7 @@ static void __init do_pre_smp_initcalls(void) do_one_initcall(*fn); } -static void run_init_process(char *init_filename) +static void run_init_process(const char *init_filename) { argv_init[0] = init_filename; kernel_execve(init_filename, argv_init, envp_init); diff --git a/kernel/kmod.c b/kernel/kmod.c index 6e9b19667a8d..9cd0591c96a2 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -153,7 +153,9 @@ static int ____call_usermodehelper(void *data) goto fail; } - retval = kernel_execve(sub_info->path, sub_info->argv, sub_info->envp); + retval = kernel_execve(sub_info->path, + (const char *const *)sub_info->argv, + (const char *const *)sub_info->envp); /* Exec failed? */ fail: diff --git a/security/commoncap.c b/security/commoncap.c index 4e015996dd4d..9d172e6e330c 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -40,7 +40,7 @@ * * Warn if that happens, once per boot. */ -static void warn_setuid_and_fcaps_mixed(char *fname) +static void warn_setuid_and_fcaps_mixed(const char *fname) { static int warned; if (!warned) { -- cgit v1.2.3 From f4ae2faa40199b97b12f508234640bc565d166f8 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 11 Aug 2010 14:07:01 +0300 Subject: fix reiserfs_evict_inode end_writeback second call reiserfs_evict_inode calls end_writeback two times hitting kernel BUG at fs/inode.c:298 becase inode->i_state is I_CLEAR already. Signed-off-by: Sergey Senozhatsky Signed-off-by: Al Viro --- fs/reiserfs/inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index ae35413dcbe1..caa758377d66 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -83,6 +83,7 @@ void reiserfs_evict_inode(struct inode *inode) dquot_drop(inode); inode->i_blocks = 0; reiserfs_write_unlock_once(inode->i_sb, depth); + return; no_delete: end_writeback(inode); -- cgit v1.2.3 From b845ff8f3ea2988ad5041315e2d35298e85cbc2f Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 17 Aug 2010 17:08:35 +0300 Subject: cramfs: only unlock new inodes Commit 77b8a75f5bb introduced a warning at fs/inode.c:692 unlock_new_inode(), caused by unlock_new_inode() being called on existing inodes as well. This patch changes setup_inode() to only call unlock_new_inode() for I_NEW inodes. Signed-off-by: Alexander Shishkin Signed-off-by: Al Viro --- fs/cramfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index a53b130b366c..1e7a33028d33 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -80,7 +80,7 @@ static struct inode *get_cramfs_inode(struct super_block *sb, } } else { inode = iget_locked(sb, CRAMINO(cramfs_inode)); - if (inode) { + if (inode && (inode->i_state & I_NEW)) { setup_inode(inode, cramfs_inode); unlock_new_inode(inode); } -- cgit v1.2.3 From dad5eb6daa7eeb63d4fc9d982892c59faa07e797 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 17 Aug 2010 12:42:13 +0200 Subject: vfs: update ctime when changing the file's permission by setfacl generic_acl_set didn't update the ctime of the file when its permission was changed. Steps to reproduce: # touch aaa # stat -c %Z aaa 1275289822 # setfacl -m 'u::x,g::x,o::x' aaa # stat -c %Z aaa 1275289822 <- unchanged But, according to the spec of the ctime, vfs must update it. Port of ext3 patch by Miao Xie . CC: Al Viro Signed-off-by: Jan Kara Signed-off-by: Al Viro --- fs/generic_acl.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/generic_acl.c b/fs/generic_acl.c index 99800e564157..6bc9e3a5a693 100644 --- a/fs/generic_acl.c +++ b/fs/generic_acl.c @@ -94,6 +94,7 @@ generic_acl_set(struct dentry *dentry, const char *name, const void *value, if (error < 0) goto failed; inode->i_mode = mode; + inode->i_ctime = CURRENT_TIME; if (error == 0) { posix_acl_release(acl); acl = NULL; -- cgit v1.2.3 From 87e99511ea54510ffb60b98001d108794d5037f8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Aug 2010 17:05:45 +0200 Subject: kill BH_Ordered flag Instead of abusing a buffer_head flag just add a variant of sync_dirty_buffer which allows passing the exact type of write flag required. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/buffer.c | 17 ++++++++-------- fs/jbd/commit.c | 49 +++++++++++++++++++++++---------------------- fs/jbd2/commit.c | 39 ++++++++++++++---------------------- fs/nilfs2/super.c | 28 +++++++++++++------------- include/linux/buffer_head.h | 3 +-- 5 files changed, 63 insertions(+), 73 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 50efa339e051..6c8ad977f3d4 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2911,13 +2911,6 @@ int submit_bh(int rw, struct buffer_head * bh) BUG_ON(buffer_delay(bh)); BUG_ON(buffer_unwritten(bh)); - /* - * Mask in barrier bit for a write (could be either a WRITE or a - * WRITE_SYNC - */ - if (buffer_ordered(bh) && (rw & WRITE)) - rw |= WRITE_BARRIER; - /* * Only clear out a write error when rewriting */ @@ -3021,7 +3014,7 @@ EXPORT_SYMBOL(ll_rw_block); * and then start new I/O and then wait upon it. The caller must have a ref on * the buffer_head. */ -int sync_dirty_buffer(struct buffer_head *bh) +int __sync_dirty_buffer(struct buffer_head *bh, int rw) { int ret = 0; @@ -3030,7 +3023,7 @@ int sync_dirty_buffer(struct buffer_head *bh) if (test_clear_buffer_dirty(bh)) { get_bh(bh); bh->b_end_io = end_buffer_write_sync; - ret = submit_bh(WRITE_SYNC, bh); + ret = submit_bh(rw, bh); wait_on_buffer(bh); if (buffer_eopnotsupp(bh)) { clear_buffer_eopnotsupp(bh); @@ -3043,6 +3036,12 @@ int sync_dirty_buffer(struct buffer_head *bh) } return ret; } +EXPORT_SYMBOL(__sync_dirty_buffer); + +int sync_dirty_buffer(struct buffer_head *bh) +{ + return __sync_dirty_buffer(bh, WRITE_SYNC); +} EXPORT_SYMBOL(sync_dirty_buffer); /* diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 28a9ddaa0c49..95d8c11c929e 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -119,7 +119,6 @@ static int journal_write_commit_record(journal_t *journal, struct buffer_head *bh; journal_header_t *header; int ret; - int barrier_done = 0; if (is_journal_aborted(journal)) return 0; @@ -137,34 +136,36 @@ static int journal_write_commit_record(journal_t *journal, JBUFFER_TRACE(descriptor, "write commit block"); set_buffer_dirty(bh); + if (journal->j_flags & JFS_BARRIER) { - set_buffer_ordered(bh); - barrier_done = 1; - } - ret = sync_dirty_buffer(bh); - if (barrier_done) - clear_buffer_ordered(bh); - /* is it possible for another commit to fail at roughly - * the same time as this one? If so, we don't want to - * trust the barrier flag in the super, but instead want - * to remember if we sent a barrier request - */ - if (ret == -EOPNOTSUPP && barrier_done) { - char b[BDEVNAME_SIZE]; + ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_BARRIER); - printk(KERN_WARNING - "JBD: barrier-based sync failed on %s - " - "disabling barriers\n", - bdevname(journal->j_dev, b)); - spin_lock(&journal->j_state_lock); - journal->j_flags &= ~JFS_BARRIER; - spin_unlock(&journal->j_state_lock); + /* + * Is it possible for another commit to fail at roughly + * the same time as this one? If so, we don't want to + * trust the barrier flag in the super, but instead want + * to remember if we sent a barrier request + */ + if (ret == -EOPNOTSUPP) { + char b[BDEVNAME_SIZE]; - /* And try again, without the barrier */ - set_buffer_uptodate(bh); - set_buffer_dirty(bh); + printk(KERN_WARNING + "JBD: barrier-based sync failed on %s - " + "disabling barriers\n", + bdevname(journal->j_dev, b)); + spin_lock(&journal->j_state_lock); + journal->j_flags &= ~JFS_BARRIER; + spin_unlock(&journal->j_state_lock); + + /* And try again, without the barrier */ + set_buffer_uptodate(bh); + set_buffer_dirty(bh); + ret = sync_dirty_buffer(bh); + } + } else { ret = sync_dirty_buffer(bh); } + put_bh(bh); /* One for getblk() */ journal_put_journal_head(descriptor); diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index f52e5e8049f1..7c068c189d80 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -101,7 +101,6 @@ static int journal_submit_commit_record(journal_t *journal, struct commit_header *tmp; struct buffer_head *bh; int ret; - int barrier_done = 0; struct timespec now = current_kernel_time(); if (is_journal_aborted(journal)) @@ -136,30 +135,22 @@ static int journal_submit_commit_record(journal_t *journal, if (journal->j_flags & JBD2_BARRIER && !JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { - set_buffer_ordered(bh); - barrier_done = 1; - } - ret = submit_bh(WRITE_SYNC_PLUG, bh); - if (barrier_done) - clear_buffer_ordered(bh); - - /* is it possible for another commit to fail at roughly - * the same time as this one? If so, we don't want to - * trust the barrier flag in the super, but instead want - * to remember if we sent a barrier request - */ - if (ret == -EOPNOTSUPP && barrier_done) { - printk(KERN_WARNING - "JBD2: Disabling barriers on %s, " - "not supported by device\n", journal->j_devname); - write_lock(&journal->j_state_lock); - journal->j_flags &= ~JBD2_BARRIER; - write_unlock(&journal->j_state_lock); + ret = submit_bh(WRITE_SYNC_PLUG | WRITE_BARRIER, bh); + if (ret == -EOPNOTSUPP) { + printk(KERN_WARNING + "JBD2: Disabling barriers on %s, " + "not supported by device\n", journal->j_devname); + write_lock(&journal->j_state_lock); + journal->j_flags &= ~JBD2_BARRIER; + write_unlock(&journal->j_state_lock); - /* And try again, without the barrier */ - lock_buffer(bh); - set_buffer_uptodate(bh); - clear_buffer_dirty(bh); + /* And try again, without the barrier */ + lock_buffer(bh); + set_buffer_uptodate(bh); + clear_buffer_dirty(bh); + ret = submit_bh(WRITE_SYNC_PLUG, bh); + } + } else { ret = submit_bh(WRITE_SYNC_PLUG, bh); } *cbh = bh; diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 1fa86b9df73b..68345430fb48 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -175,24 +175,24 @@ static int nilfs_sync_super(struct nilfs_sb_info *sbi, int flag) { struct the_nilfs *nilfs = sbi->s_nilfs; int err; - int barrier_done = 0; - if (nilfs_test_opt(sbi, BARRIER)) { - set_buffer_ordered(nilfs->ns_sbh[0]); - barrier_done = 1; - } retry: set_buffer_dirty(nilfs->ns_sbh[0]); - err = sync_dirty_buffer(nilfs->ns_sbh[0]); - if (err == -EOPNOTSUPP && barrier_done) { - nilfs_warning(sbi->s_super, __func__, - "barrier-based sync failed. " - "disabling barriers\n"); - nilfs_clear_opt(sbi, BARRIER); - barrier_done = 0; - clear_buffer_ordered(nilfs->ns_sbh[0]); - goto retry; + + if (nilfs_test_opt(sbi, BARRIER)) { + err = __sync_dirty_buffer(nilfs->ns_sbh[0], + WRITE_SYNC | WRITE_BARRIER); + if (err == -EOPNOTSUPP) { + nilfs_warning(sbi->s_super, __func__, + "barrier-based sync failed. " + "disabling barriers\n"); + nilfs_clear_opt(sbi, BARRIER); + goto retry; + } + } else { + err = sync_dirty_buffer(nilfs->ns_sbh[0]); } + if (unlikely(err)) { printk(KERN_ERR "NILFS: unable to write superblock (err=%d)\n", err); diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 43e649a72529..72c1cf83eb85 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -32,7 +32,6 @@ enum bh_state_bits { BH_Delay, /* Buffer is not yet allocated on disk */ BH_Boundary, /* Block is followed by a discontiguity */ BH_Write_EIO, /* I/O error on write */ - BH_Ordered, /* ordered write */ BH_Eopnotsupp, /* operation not supported (barrier) */ BH_Unwritten, /* Buffer is allocated on disk but not written */ BH_Quiet, /* Buffer Error Prinks to be quiet */ @@ -125,7 +124,6 @@ BUFFER_FNS(Async_Write, async_write) BUFFER_FNS(Delay, delay) BUFFER_FNS(Boundary, boundary) BUFFER_FNS(Write_EIO, write_io_error) -BUFFER_FNS(Ordered, ordered) BUFFER_FNS(Eopnotsupp, eopnotsupp) BUFFER_FNS(Unwritten, unwritten) @@ -183,6 +181,7 @@ void unlock_buffer(struct buffer_head *bh); void __lock_buffer(struct buffer_head *bh); void ll_rw_block(int, int, struct buffer_head * bh[]); int sync_dirty_buffer(struct buffer_head *bh); +int __sync_dirty_buffer(struct buffer_head *bh, int rw); int submit_bh(int, struct buffer_head *); void write_boundary_block(struct block_device *bdev, sector_t bblock, unsigned blocksize); -- cgit v1.2.3 From 9cb569d601e0b93e01c20a22872270ec663b75f6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Aug 2010 17:06:24 +0200 Subject: remove SWRITE* I/O types These flags aren't real I/O types, but tell ll_rw_block to always lock the buffer instead of giving up on a failed trylock. Instead add a new write_dirty_buffer helper that implements this semantic and use it from the existing SWRITE* callers. Note that the ll_rw_block code had a bug where it didn't promote WRITE_SYNC_PLUG properly, which this patch fixes. In the ufs code clean up the helper that used to call ll_rw_block to mirror sync_dirty_buffer, which is the function it implements for compound buffers. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/buffer.c | 52 +++++++++++++++++++++++++-------------------- fs/fat/misc.c | 4 +++- fs/jbd/checkpoint.c | 4 +++- fs/jbd/journal.c | 2 +- fs/jbd/revoke.c | 2 +- fs/jbd2/checkpoint.c | 4 +++- fs/jbd2/journal.c | 2 +- fs/jbd2/revoke.c | 2 +- fs/reiserfs/journal.c | 2 +- fs/ufs/balloc.c | 24 +++++++-------------- fs/ufs/ialloc.c | 18 ++++++---------- fs/ufs/truncate.c | 18 ++++++---------- fs/ufs/util.c | 20 +++++++---------- fs/ufs/util.h | 3 +-- include/linux/buffer_head.h | 1 + include/linux/fs.h | 9 -------- 16 files changed, 73 insertions(+), 94 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 6c8ad977f3d4..3e7dca279d1c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -770,11 +770,12 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) spin_unlock(lock); /* * Ensure any pending I/O completes so that - * ll_rw_block() actually writes the current - * contents - it is a noop if I/O is still in - * flight on potentially older contents. + * write_dirty_buffer() actually writes the + * current contents - it is a noop if I/O is + * still in flight on potentially older + * contents. */ - ll_rw_block(SWRITE_SYNC_PLUG, 1, &bh); + write_dirty_buffer(bh, WRITE_SYNC_PLUG); /* * Kick off IO for the previous mapping. Note @@ -2949,22 +2950,21 @@ EXPORT_SYMBOL(submit_bh); /** * ll_rw_block: low-level access to block devices (DEPRECATED) - * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead) + * @rw: whether to %READ or %WRITE or maybe %READA (readahead) * @nr: number of &struct buffer_heads in the array * @bhs: array of pointers to &struct buffer_head * * ll_rw_block() takes an array of pointers to &struct buffer_heads, and * requests an I/O operation on them, either a %READ or a %WRITE. The third - * %SWRITE is like %WRITE only we make sure that the *current* data in buffers - * are sent to disk. The fourth %READA option is described in the documentation - * for generic_make_request() which ll_rw_block() calls. + * %READA option is described in the documentation for generic_make_request() + * which ll_rw_block() calls. * * This function drops any buffer that it cannot get a lock on (with the - * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be - * clean when doing a write request, and any buffer that appears to be - * up-to-date when doing read request. Further it marks as clean buffers that - * are processed for writing (the buffer cache won't assume that they are - * actually clean until the buffer gets unlocked). + * BH_Lock state bit), any buffer that appears to be clean when doing a write + * request, and any buffer that appears to be up-to-date when doing read + * request. Further it marks as clean buffers that are processed for + * writing (the buffer cache won't assume that they are actually clean + * until the buffer gets unlocked). * * ll_rw_block sets b_end_io to simple completion handler that marks * the buffer up-to-date (if approriate), unlocks the buffer and wakes @@ -2980,20 +2980,13 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[]) for (i = 0; i < nr; i++) { struct buffer_head *bh = bhs[i]; - if (rw == SWRITE || rw == SWRITE_SYNC || rw == SWRITE_SYNC_PLUG) - lock_buffer(bh); - else if (!trylock_buffer(bh)) + if (!trylock_buffer(bh)) continue; - - if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC || - rw == SWRITE_SYNC_PLUG) { + if (rw == WRITE) { if (test_clear_buffer_dirty(bh)) { bh->b_end_io = end_buffer_write_sync; get_bh(bh); - if (rw == SWRITE_SYNC) - submit_bh(WRITE_SYNC, bh); - else - submit_bh(WRITE, bh); + submit_bh(WRITE, bh); continue; } } else { @@ -3009,6 +3002,19 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[]) } EXPORT_SYMBOL(ll_rw_block); +void write_dirty_buffer(struct buffer_head *bh, int rw) +{ + lock_buffer(bh); + if (!test_clear_buffer_dirty(bh)) { + unlock_buffer(bh); + return; + } + bh->b_end_io = end_buffer_write_sync; + get_bh(bh); + submit_bh(rw, bh); +} +EXPORT_SYMBOL(write_dirty_buffer); + /* * For a data-integrity writeout, we need to wait upon any in-progress I/O * and then start new I/O and then wait upon it. The caller must have a ref on diff --git a/fs/fat/misc.c b/fs/fat/misc.c index 1fa23f6ffba5..1736f2356388 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -250,7 +250,9 @@ int fat_sync_bhs(struct buffer_head **bhs, int nr_bhs) { int i, err = 0; - ll_rw_block(SWRITE, nr_bhs, bhs); + for (i = 0; i < nr_bhs; i++) + write_dirty_buffer(bhs[i], WRITE); + for (i = 0; i < nr_bhs; i++) { wait_on_buffer(bhs[i]); if (buffer_eopnotsupp(bhs[i])) { diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c index b0435dd0654d..05a38b9c4c0e 100644 --- a/fs/jbd/checkpoint.c +++ b/fs/jbd/checkpoint.c @@ -254,7 +254,9 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) { int i; - ll_rw_block(SWRITE, *batch_count, bhs); + for (i = 0; i < *batch_count; i++) + write_dirty_buffer(bhs[i], WRITE); + for (i = 0; i < *batch_count; i++) { struct buffer_head *bh = bhs[i]; clear_buffer_jwrite(bh); diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index f19ce94693d8..2c4b1f109da9 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -1024,7 +1024,7 @@ void journal_update_superblock(journal_t *journal, int wait) if (wait) sync_dirty_buffer(bh); else - ll_rw_block(SWRITE, 1, &bh); + write_dirty_buffer(bh, WRITE); out: /* If we have just flushed the log (by marking s_start==0), then diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c index ad717328343a..d29018307e2e 100644 --- a/fs/jbd/revoke.c +++ b/fs/jbd/revoke.c @@ -617,7 +617,7 @@ static void flush_descriptor(journal_t *journal, set_buffer_jwrite(bh); BUFFER_TRACE(bh, "write"); set_buffer_dirty(bh); - ll_rw_block((write_op == WRITE) ? SWRITE : SWRITE_SYNC_PLUG, 1, &bh); + write_dirty_buffer(bh, write_op); } #endif diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 1c23a0f4e8a3..5247e7ffdcb4 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -255,7 +255,9 @@ __flush_batch(journal_t *journal, int *batch_count) { int i; - ll_rw_block(SWRITE, *batch_count, journal->j_chkpt_bhs); + for (i = 0; i < *batch_count; i++) + write_dirty_buffer(journal->j_chkpt_bhs[i], WRITE); + for (i = 0; i < *batch_count; i++) { struct buffer_head *bh = journal->j_chkpt_bhs[i]; clear_buffer_jwrite(bh); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index ad5866aaf0f9..0e8014ea6b94 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1124,7 +1124,7 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait) set_buffer_uptodate(bh); } } else - ll_rw_block(SWRITE, 1, &bh); + write_dirty_buffer(bh, WRITE); out: /* If we have just flushed the log (by marking s_start==0), then diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index a360b06af2e3..9ad321fd63fd 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -625,7 +625,7 @@ static void flush_descriptor(journal_t *journal, set_buffer_jwrite(bh); BUFFER_TRACE(bh, "write"); set_buffer_dirty(bh); - ll_rw_block((write_op == WRITE) ? SWRITE : SWRITE_SYNC_PLUG, 1, &bh); + write_dirty_buffer(bh, write_op); } #endif diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 1ec952b1f036..812e2c05aa29 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2311,7 +2311,7 @@ static int journal_read_transaction(struct super_block *sb, /* flush out the real blocks */ for (i = 0; i < get_desc_trans_len(desc); i++) { set_buffer_dirty(real_blocks[i]); - ll_rw_block(SWRITE, 1, real_blocks + i); + write_dirty_buffer(real_blocks[i], WRITE); } for (i = 0; i < get_desc_trans_len(desc); i++) { wait_on_buffer(real_blocks[i]); diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index 048484fb10d2..46f7a807bbc1 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -114,10 +114,8 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count) ubh_mark_buffer_dirty (USPI_UBH(uspi)); ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); - if (sb->s_flags & MS_SYNCHRONOUS) { - ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi)); - ubh_wait_on_buffer (UCPI_UBH(ucpi)); - } + if (sb->s_flags & MS_SYNCHRONOUS) + ubh_sync_block(UCPI_UBH(ucpi)); sb->s_dirt = 1; unlock_super (sb); @@ -207,10 +205,8 @@ do_more: ubh_mark_buffer_dirty (USPI_UBH(uspi)); ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); - if (sb->s_flags & MS_SYNCHRONOUS) { - ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi)); - ubh_wait_on_buffer (UCPI_UBH(ucpi)); - } + if (sb->s_flags & MS_SYNCHRONOUS) + ubh_sync_block(UCPI_UBH(ucpi)); if (overflow) { fragment += count; @@ -558,10 +554,8 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment, ubh_mark_buffer_dirty (USPI_UBH(uspi)); ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); - if (sb->s_flags & MS_SYNCHRONOUS) { - ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi)); - ubh_wait_on_buffer (UCPI_UBH(ucpi)); - } + if (sb->s_flags & MS_SYNCHRONOUS) + ubh_sync_block(UCPI_UBH(ucpi)); sb->s_dirt = 1; UFSD("EXIT, fragment %llu\n", (unsigned long long)fragment); @@ -680,10 +674,8 @@ cg_found: succed: ubh_mark_buffer_dirty (USPI_UBH(uspi)); ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); - if (sb->s_flags & MS_SYNCHRONOUS) { - ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi)); - ubh_wait_on_buffer (UCPI_UBH(ucpi)); - } + if (sb->s_flags & MS_SYNCHRONOUS) + ubh_sync_block(UCPI_UBH(ucpi)); sb->s_dirt = 1; result += cgno * uspi->s_fpg; diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index 428017e018fe..2eabf04af3de 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -113,10 +113,8 @@ void ufs_free_inode (struct inode * inode) ubh_mark_buffer_dirty (USPI_UBH(uspi)); ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); - if (sb->s_flags & MS_SYNCHRONOUS) { - ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi)); - ubh_wait_on_buffer (UCPI_UBH(ucpi)); - } + if (sb->s_flags & MS_SYNCHRONOUS) + ubh_sync_block(UCPI_UBH(ucpi)); sb->s_dirt = 1; unlock_super (sb); @@ -156,10 +154,8 @@ static void ufs2_init_inodes_chunk(struct super_block *sb, fs32_add(sb, &ucg->cg_u.cg_u2.cg_initediblk, uspi->s_inopb); ubh_mark_buffer_dirty(UCPI_UBH(ucpi)); - if (sb->s_flags & MS_SYNCHRONOUS) { - ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi)); - ubh_wait_on_buffer(UCPI_UBH(ucpi)); - } + if (sb->s_flags & MS_SYNCHRONOUS) + ubh_sync_block(UCPI_UBH(ucpi)); UFSD("EXIT\n"); } @@ -290,10 +286,8 @@ cg_found: } ubh_mark_buffer_dirty (USPI_UBH(uspi)); ubh_mark_buffer_dirty (UCPI_UBH(ucpi)); - if (sb->s_flags & MS_SYNCHRONOUS) { - ubh_ll_rw_block(SWRITE, UCPI_UBH(ucpi)); - ubh_wait_on_buffer (UCPI_UBH(ucpi)); - } + if (sb->s_flags & MS_SYNCHRONOUS) + ubh_sync_block(UCPI_UBH(ucpi)); sb->s_dirt = 1; inode->i_ino = cg * uspi->s_ipg + bit; diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c index 34d5cb135320..a58f9155fc9a 100644 --- a/fs/ufs/truncate.c +++ b/fs/ufs/truncate.c @@ -243,10 +243,8 @@ static int ufs_trunc_indirect(struct inode *inode, u64 offset, void *p) ubh_bforget(ind_ubh); ind_ubh = NULL; } - if (IS_SYNC(inode) && ind_ubh && ubh_buffer_dirty(ind_ubh)) { - ubh_ll_rw_block(SWRITE, ind_ubh); - ubh_wait_on_buffer (ind_ubh); - } + if (IS_SYNC(inode) && ind_ubh && ubh_buffer_dirty(ind_ubh)) + ubh_sync_block(ind_ubh); ubh_brelse (ind_ubh); UFSD("EXIT: ino %lu\n", inode->i_ino); @@ -307,10 +305,8 @@ static int ufs_trunc_dindirect(struct inode *inode, u64 offset, void *p) ubh_bforget(dind_bh); dind_bh = NULL; } - if (IS_SYNC(inode) && dind_bh && ubh_buffer_dirty(dind_bh)) { - ubh_ll_rw_block(SWRITE, dind_bh); - ubh_wait_on_buffer (dind_bh); - } + if (IS_SYNC(inode) && dind_bh && ubh_buffer_dirty(dind_bh)) + ubh_sync_block(dind_bh); ubh_brelse (dind_bh); UFSD("EXIT: ino %lu\n", inode->i_ino); @@ -367,10 +363,8 @@ static int ufs_trunc_tindirect(struct inode *inode) ubh_bforget(tind_bh); tind_bh = NULL; } - if (IS_SYNC(inode) && tind_bh && ubh_buffer_dirty(tind_bh)) { - ubh_ll_rw_block(SWRITE, tind_bh); - ubh_wait_on_buffer (tind_bh); - } + if (IS_SYNC(inode) && tind_bh && ubh_buffer_dirty(tind_bh)) + ubh_sync_block(tind_bh); ubh_brelse (tind_bh); UFSD("EXIT: ino %lu\n", inode->i_ino); diff --git a/fs/ufs/util.c b/fs/ufs/util.c index 85a7fc9e4a4e..d2c36d53fe66 100644 --- a/fs/ufs/util.c +++ b/fs/ufs/util.c @@ -113,21 +113,17 @@ void ubh_mark_buffer_uptodate (struct ufs_buffer_head * ubh, int flag) } } -void ubh_ll_rw_block(int rw, struct ufs_buffer_head *ubh) +void ubh_sync_block(struct ufs_buffer_head *ubh) { - if (!ubh) - return; + if (ubh) { + unsigned i; - ll_rw_block(rw, ubh->count, ubh->bh); -} + for (i = 0; i < ubh->count; i++) + write_dirty_buffer(ubh->bh[i], WRITE); -void ubh_wait_on_buffer (struct ufs_buffer_head * ubh) -{ - unsigned i; - if (!ubh) - return; - for ( i = 0; i < ubh->count; i++ ) - wait_on_buffer (ubh->bh[i]); + for (i = 0; i < ubh->count; i++) + wait_on_buffer(ubh->bh[i]); + } } void ubh_bforget (struct ufs_buffer_head * ubh) diff --git a/fs/ufs/util.h b/fs/ufs/util.h index 0466036912f1..9f8775ce381c 100644 --- a/fs/ufs/util.h +++ b/fs/ufs/util.h @@ -269,8 +269,7 @@ extern void ubh_brelse (struct ufs_buffer_head *); extern void ubh_brelse_uspi (struct ufs_sb_private_info *); extern void ubh_mark_buffer_dirty (struct ufs_buffer_head *); extern void ubh_mark_buffer_uptodate (struct ufs_buffer_head *, int); -extern void ubh_ll_rw_block(int, struct ufs_buffer_head *); -extern void ubh_wait_on_buffer (struct ufs_buffer_head *); +extern void ubh_sync_block(struct ufs_buffer_head *); extern void ubh_bforget (struct ufs_buffer_head *); extern int ubh_buffer_dirty (struct ufs_buffer_head *); #define ubh_ubhcpymem(mem,ubh,size) _ubh_ubhcpymem_(uspi,mem,ubh,size) diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 72c1cf83eb85..ec94c12f21da 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -182,6 +182,7 @@ void __lock_buffer(struct buffer_head *bh); void ll_rw_block(int, int, struct buffer_head * bh[]); int sync_dirty_buffer(struct buffer_head *bh); int __sync_dirty_buffer(struct buffer_head *bh, int rw); +void write_dirty_buffer(struct buffer_head *bh, int rw); int submit_bh(int, struct buffer_head *); void write_boundary_block(struct block_device *bdev, sector_t bblock, unsigned blocksize); diff --git a/include/linux/fs.h b/include/linux/fs.h index 9a96b4d83fc1..29f7c975304c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -125,9 +125,6 @@ struct inodes_stat_t { * block layer could (in theory) choose to ignore this * request if it runs into resource problems. * WRITE A normal async write. Device will be plugged. - * SWRITE Like WRITE, but a special case for ll_rw_block() that - * tells it to lock the buffer first. Normally a buffer - * must be locked before doing IO. * WRITE_SYNC_PLUG Synchronous write. Identical to WRITE, but passes down * the hint that someone will be waiting on this IO * shortly. The device must still be unplugged explicitly, @@ -138,9 +135,6 @@ struct inodes_stat_t { * immediately after submission. The write equivalent * of READ_SYNC. * WRITE_ODIRECT_PLUG Special case write for O_DIRECT only. - * SWRITE_SYNC - * SWRITE_SYNC_PLUG Like WRITE_SYNC/WRITE_SYNC_PLUG, but locks the buffer. - * See SWRITE. * WRITE_BARRIER Like WRITE_SYNC, but tells the block layer that all * previously submitted writes must be safely on storage * before this one is started. Also guarantees that when @@ -155,7 +149,6 @@ struct inodes_stat_t { #define READ 0 #define WRITE RW_MASK #define READA RWA_MASK -#define SWRITE (WRITE | READA) #define READ_SYNC (READ | REQ_SYNC | REQ_UNPLUG) #define READ_META (READ | REQ_META) @@ -165,8 +158,6 @@ struct inodes_stat_t { #define WRITE_META (WRITE | REQ_META) #define WRITE_BARRIER (WRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG | \ REQ_HARDBARRIER) -#define SWRITE_SYNC_PLUG (SWRITE | REQ_SYNC | REQ_NOIDLE) -#define SWRITE_SYNC (SWRITE | REQ_SYNC | REQ_NOIDLE | REQ_UNPLUG) /* * These aren't really reads or writes, they pass down information about -- cgit v1.2.3 From 850a496f969719b494cc972ab1d0e088737358d7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 18 Aug 2010 06:18:57 -0400 Subject: hostfs: dumb (and usually harmless) tpyo - strncpy instead of strlcpy ... not harmless in this case - we have a string in the end of buffer already. Signed-off-by: Al Viro --- fs/hostfs/hostfs_kern.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index dd1e55535a4e..77c4f6ee6c40 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -104,7 +104,7 @@ static char *__dentry_name(struct dentry *dentry, char *name) __putname(name); return NULL; } - strncpy(name, root, PATH_MAX); + strlcpy(name, root, PATH_MAX); if (len > p - name) { __putname(name); return NULL; -- cgit v1.2.3 From 3b6036d148bad5bb7928b021a49bb9e395361084 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 18 Aug 2010 06:21:10 -0400 Subject: hostfs ->follow_link() braino we want the assignment to err done inside the if () to be visible after it, so (re)declaring err inside if () body is wrong. Signed-off-by: Al Viro --- fs/hostfs/hostfs_kern.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 77c4f6ee6c40..f7dc9b5f9ef8 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -876,7 +876,7 @@ static void *hostfs_follow_link(struct dentry *dentry, struct nameidata *nd) char *path = dentry_name(dentry); int err = -ENOMEM; if (path) { - int err = hostfs_do_readlink(path, link, PATH_MAX); + err = hostfs_do_readlink(path, link, PATH_MAX); if (err == PATH_MAX) err = -E2BIG; __putname(path); -- cgit v1.2.3 From 3a48ee8a4ad26c3a538b6fc11a86a8f80c3dce18 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 16 Aug 2010 19:05:23 +0200 Subject: mbcache: Limit the maximum number of cache entries Limit the maximum number of mb_cache entries depending on the number of hash buckets: if the only limit to the number of cache entries is the available memory the hash chains can grow very long, taking a long time to search. At least partially solves https://bugzilla.lustre.org/show_bug.cgi?id=22771. Signed-off-by: Andreas Gruenbacher Signed-off-by: Al Viro --- fs/mbcache.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/mbcache.c b/fs/mbcache.c index cf4e6cdfd15b..93444747237b 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -80,6 +80,7 @@ struct mb_cache { struct list_head c_cache_list; const char *c_name; atomic_t c_entry_count; + int c_max_entries; int c_bucket_bits; struct kmem_cache *c_entry_cache; struct list_head *c_block_hash; @@ -243,6 +244,12 @@ mb_cache_create(const char *name, int bucket_bits) if (!cache->c_entry_cache) goto fail2; + /* + * Set an upper limit on the number of cache entries so that the hash + * chains won't grow too long. + */ + cache->c_max_entries = bucket_count << 4; + spin_lock(&mb_cache_spinlock); list_add(&cache->c_cache_list, &mb_cache_list); spin_unlock(&mb_cache_spinlock); @@ -333,7 +340,6 @@ mb_cache_destroy(struct mb_cache *cache) kfree(cache); } - /* * mb_cache_entry_alloc() * @@ -345,17 +351,29 @@ mb_cache_destroy(struct mb_cache *cache) struct mb_cache_entry * mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags) { - struct mb_cache_entry *ce; - - ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags); - if (ce) { + struct mb_cache_entry *ce = NULL; + + if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) { + spin_lock(&mb_cache_spinlock); + if (!list_empty(&mb_cache_lru_list)) { + ce = list_entry(mb_cache_lru_list.next, + struct mb_cache_entry, e_lru_list); + list_del_init(&ce->e_lru_list); + __mb_cache_entry_unhash(ce); + } + spin_unlock(&mb_cache_spinlock); + } + if (!ce) { + ce = kmem_cache_alloc(cache->c_entry_cache, gfp_flags); + if (!ce) + return NULL; atomic_inc(&cache->c_entry_count); INIT_LIST_HEAD(&ce->e_lru_list); INIT_LIST_HEAD(&ce->e_block_list); ce->e_cache = cache; - ce->e_used = 1 + MB_CACHE_WRITER; ce->e_queued = 0; } + ce->e_used = 1 + MB_CACHE_WRITER; return ce; } -- cgit v1.2.3 From 2e2e88ea8c3bd9e1bd6e42faf047a4ac3fbb3b2f Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 18 Aug 2010 04:37:30 +1000 Subject: fs: fix do_lookup false negative fs: fix do_lookup false negative In do_lookup, if we initially find no dentry, we take the directory i_mutex and re-check the lookup. If we find a dentry there, then we revalidate it if needed. However if that revalidate asks for the dentry to be invalidated, we return -ENOENT from do_lookup. What should happen instead is an attempt to allocate and lookup a new dentry. This is probably not noticed because it is rare. It is only reached if a concurrent create races in first (in which case, the dentry probably won't be invalidated anyway), or if the racy __d_lookup has failed due to a false-negative (which is very rare). Fix this by removing code and have it use the normal reval path. Signed-off-by: Nick Piggin Signed-off-by: Al Viro --- fs/namei.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 17ea76bf2fbe..c2742b7dec59 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -709,6 +709,7 @@ static int do_lookup(struct nameidata *nd, struct qstr *name, dentry = __d_lookup(nd->path.dentry, name); if (!dentry) goto need_lookup; +found: if (dentry->d_op && dentry->d_op->d_revalidate) goto need_revalidate; done: @@ -766,14 +767,7 @@ out_unlock: * we waited on the semaphore. Need to revalidate. */ mutex_unlock(&dir->i_mutex); - if (dentry->d_op && dentry->d_op->d_revalidate) { - dentry = do_revalidate(dentry, nd); - if (!dentry) - dentry = ERR_PTR(-ENOENT); - } - if (IS_ERR(dentry)) - goto fail; - goto done; + goto found; need_revalidate: dentry = do_revalidate(dentry, nd); -- cgit v1.2.3 From baa0389073eb7beb9d36f6d13df97e16c1bfa626 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 18 Aug 2010 04:37:31 +1000 Subject: fs: dentry allocation consolidation fs: dentry allocation consolidation There are 2 duplicate copies of code in dentry allocation in path lookup. Consolidate them into a single function. Signed-off-by: Nick Piggin Signed-off-by: Al Viro --- fs/namei.c | 70 +++++++++++++++++++++++++++++--------------------------------- 1 file changed, 33 insertions(+), 37 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index c2742b7dec59..b815a4d2e1d6 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -685,6 +685,35 @@ static __always_inline void follow_dotdot(struct nameidata *nd) follow_mount(&nd->path); } +/* + * Allocate a dentry with name and parent, and perform a parent + * directory ->lookup on it. Returns the new dentry, or ERR_PTR + * on error. parent->d_inode->i_mutex must be held. d_lookup must + * have verified that no child exists while under i_mutex. + */ +static struct dentry *d_alloc_and_lookup(struct dentry *parent, + struct qstr *name, struct nameidata *nd) +{ + struct inode *inode = parent->d_inode; + struct dentry *dentry; + struct dentry *old; + + /* Don't create child dentry for a dead directory. */ + if (unlikely(IS_DEADDIR(inode))) + return ERR_PTR(-ENOENT); + + dentry = d_alloc(parent, name); + if (unlikely(!dentry)) + return ERR_PTR(-ENOMEM); + + old = inode->i_op->lookup(inode, dentry, nd); + if (unlikely(old)) { + dput(dentry); + dentry = old; + } + return dentry; +} + /* * It's more convoluted than I'd like it to be, but... it's still fairly * small and for now I'd prefer to have fast path as straight as possible. @@ -738,30 +767,13 @@ need_lookup: * so doing d_lookup() (with seqlock), instead of lockfree __d_lookup */ dentry = d_lookup(parent, name); - if (!dentry) { - struct dentry *new; - - /* Don't create child dentry for a dead directory. */ - dentry = ERR_PTR(-ENOENT); - if (IS_DEADDIR(dir)) - goto out_unlock; - - new = d_alloc(parent, name); - dentry = ERR_PTR(-ENOMEM); - if (new) { - dentry = dir->i_op->lookup(dir, new, nd); - if (dentry) - dput(new); - else - dentry = new; - } -out_unlock: + if (likely(!dentry)) { + dentry = d_alloc_and_lookup(parent, name, nd); mutex_unlock(&dir->i_mutex); if (IS_ERR(dentry)) goto fail; goto done; } - /* * Uhhuh! Nasty case: the cache was re-populated while * we waited on the semaphore. Need to revalidate. @@ -1135,24 +1147,8 @@ static struct dentry *__lookup_hash(struct qstr *name, if (dentry && dentry->d_op && dentry->d_op->d_revalidate) dentry = do_revalidate(dentry, nd); - if (!dentry) { - struct dentry *new; - - /* Don't create child dentry for a dead directory. */ - dentry = ERR_PTR(-ENOENT); - if (IS_DEADDIR(inode)) - goto out; - - new = d_alloc(base, name); - dentry = ERR_PTR(-ENOMEM); - if (!new) - goto out; - dentry = inode->i_op->lookup(inode, new, nd); - if (!dentry) - dentry = new; - else - dput(new); - } + if (!dentry) + dentry = d_alloc_and_lookup(base, name, nd); out: return dentry; } -- cgit v1.2.3 From 2a4419b5b2a77f3f4537c14f7ad7df95770655dd Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 18 Aug 2010 04:37:33 +1000 Subject: fs: fs_struct rwlock to spinlock fs: fs_struct rwlock to spinlock struct fs_struct.lock is an rwlock with the read-side used to protect root and pwd members while taking references to them. Taking a reference to a path typically requires just 2 atomic ops, so the critical section is very small. Parallel read-side operations would have cacheline contention on the lock, the dentry, and the vfsmount cachelines, so the rwlock is unlikely to ever give a real parallelism increase. Replace it with a spinlock to avoid one or two atomic operations in typical path lookup fastpath. Signed-off-by: Nick Piggin Signed-off-by: Al Viro --- drivers/staging/pohmelfs/path_entry.c | 8 ++++---- fs/exec.c | 4 ++-- fs/fs_struct.c | 32 ++++++++++++++++---------------- include/linux/fs_struct.h | 14 +++++++------- kernel/fork.c | 10 +++++----- 5 files changed, 34 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/drivers/staging/pohmelfs/path_entry.c b/drivers/staging/pohmelfs/path_entry.c index cdc4dd50d638..8ec83d2dffb7 100644 --- a/drivers/staging/pohmelfs/path_entry.c +++ b/drivers/staging/pohmelfs/path_entry.c @@ -44,9 +44,9 @@ int pohmelfs_construct_path_string(struct pohmelfs_inode *pi, void *data, int le return -ENOENT; } - read_lock(¤t->fs->lock); + spin_lock(¤t->fs->lock); path.mnt = mntget(current->fs->root.mnt); - read_unlock(¤t->fs->lock); + spin_unlock(¤t->fs->lock); path.dentry = d; @@ -91,9 +91,9 @@ int pohmelfs_path_length(struct pohmelfs_inode *pi) return -ENOENT; } - read_lock(¤t->fs->lock); + spin_lock(¤t->fs->lock); root = dget(current->fs->root.dentry); - read_unlock(¤t->fs->lock); + spin_unlock(¤t->fs->lock); spin_lock(&dcache_lock); diff --git a/fs/exec.c b/fs/exec.c index 7761837e4500..5adab2c93eca 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1117,7 +1117,7 @@ int check_unsafe_exec(struct linux_binprm *bprm) bprm->unsafe = tracehook_unsafe_exec(p); n_fs = 1; - write_lock(&p->fs->lock); + spin_lock(&p->fs->lock); rcu_read_lock(); for (t = next_thread(p); t != p; t = next_thread(t)) { if (t->fs == p->fs) @@ -1134,7 +1134,7 @@ int check_unsafe_exec(struct linux_binprm *bprm) res = 1; } } - write_unlock(&p->fs->lock); + spin_unlock(&p->fs->lock); return res; } diff --git a/fs/fs_struct.c b/fs/fs_struct.c index 1ee40eb9a2c0..ed45a9cf5f3d 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c @@ -13,11 +13,11 @@ void set_fs_root(struct fs_struct *fs, struct path *path) { struct path old_root; - write_lock(&fs->lock); + spin_lock(&fs->lock); old_root = fs->root; fs->root = *path; path_get(path); - write_unlock(&fs->lock); + spin_unlock(&fs->lock); if (old_root.dentry) path_put(&old_root); } @@ -30,11 +30,11 @@ void set_fs_pwd(struct fs_struct *fs, struct path *path) { struct path old_pwd; - write_lock(&fs->lock); + spin_lock(&fs->lock); old_pwd = fs->pwd; fs->pwd = *path; path_get(path); - write_unlock(&fs->lock); + spin_unlock(&fs->lock); if (old_pwd.dentry) path_put(&old_pwd); @@ -51,7 +51,7 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root) task_lock(p); fs = p->fs; if (fs) { - write_lock(&fs->lock); + spin_lock(&fs->lock); if (fs->root.dentry == old_root->dentry && fs->root.mnt == old_root->mnt) { path_get(new_root); @@ -64,7 +64,7 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root) fs->pwd = *new_root; count++; } - write_unlock(&fs->lock); + spin_unlock(&fs->lock); } task_unlock(p); } while_each_thread(g, p); @@ -87,10 +87,10 @@ void exit_fs(struct task_struct *tsk) if (fs) { int kill; task_lock(tsk); - write_lock(&fs->lock); + spin_lock(&fs->lock); tsk->fs = NULL; kill = !--fs->users; - write_unlock(&fs->lock); + spin_unlock(&fs->lock); task_unlock(tsk); if (kill) free_fs_struct(fs); @@ -104,7 +104,7 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old) if (fs) { fs->users = 1; fs->in_exec = 0; - rwlock_init(&fs->lock); + spin_lock_init(&fs->lock); fs->umask = old->umask; get_fs_root_and_pwd(old, &fs->root, &fs->pwd); } @@ -121,10 +121,10 @@ int unshare_fs_struct(void) return -ENOMEM; task_lock(current); - write_lock(&fs->lock); + spin_lock(&fs->lock); kill = !--fs->users; current->fs = new_fs; - write_unlock(&fs->lock); + spin_unlock(&fs->lock); task_unlock(current); if (kill) @@ -143,7 +143,7 @@ EXPORT_SYMBOL(current_umask); /* to be mentioned only in INIT_TASK */ struct fs_struct init_fs = { .users = 1, - .lock = __RW_LOCK_UNLOCKED(init_fs.lock), + .lock = __SPIN_LOCK_UNLOCKED(init_fs.lock), .umask = 0022, }; @@ -156,14 +156,14 @@ void daemonize_fs_struct(void) task_lock(current); - write_lock(&init_fs.lock); + spin_lock(&init_fs.lock); init_fs.users++; - write_unlock(&init_fs.lock); + spin_unlock(&init_fs.lock); - write_lock(&fs->lock); + spin_lock(&fs->lock); current->fs = &init_fs; kill = !--fs->users; - write_unlock(&fs->lock); + spin_unlock(&fs->lock); task_unlock(current); if (kill) diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h index eca3d5202138..a42b5bf02f8b 100644 --- a/include/linux/fs_struct.h +++ b/include/linux/fs_struct.h @@ -5,7 +5,7 @@ struct fs_struct { int users; - rwlock_t lock; + spinlock_t lock; int umask; int in_exec; struct path root, pwd; @@ -23,29 +23,29 @@ extern int unshare_fs_struct(void); static inline void get_fs_root(struct fs_struct *fs, struct path *root) { - read_lock(&fs->lock); + spin_lock(&fs->lock); *root = fs->root; path_get(root); - read_unlock(&fs->lock); + spin_unlock(&fs->lock); } static inline void get_fs_pwd(struct fs_struct *fs, struct path *pwd) { - read_lock(&fs->lock); + spin_lock(&fs->lock); *pwd = fs->pwd; path_get(pwd); - read_unlock(&fs->lock); + spin_unlock(&fs->lock); } static inline void get_fs_root_and_pwd(struct fs_struct *fs, struct path *root, struct path *pwd) { - read_lock(&fs->lock); + spin_lock(&fs->lock); *root = fs->root; path_get(root); *pwd = fs->pwd; path_get(pwd); - read_unlock(&fs->lock); + spin_unlock(&fs->lock); } #endif /* _LINUX_FS_STRUCT_H */ diff --git a/kernel/fork.c b/kernel/fork.c index 98b450876f93..856eac3ec52e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -752,13 +752,13 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) struct fs_struct *fs = current->fs; if (clone_flags & CLONE_FS) { /* tsk->fs is already what we want */ - write_lock(&fs->lock); + spin_lock(&fs->lock); if (fs->in_exec) { - write_unlock(&fs->lock); + spin_unlock(&fs->lock); return -EAGAIN; } fs->users++; - write_unlock(&fs->lock); + spin_unlock(&fs->lock); return 0; } tsk->fs = copy_fs_struct(fs); @@ -1676,13 +1676,13 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) if (new_fs) { fs = current->fs; - write_lock(&fs->lock); + spin_lock(&fs->lock); current->fs = new_fs; if (--fs->users) new_fs = NULL; else new_fs = fs; - write_unlock(&fs->lock); + spin_unlock(&fs->lock); } if (new_mm) { -- cgit v1.2.3 From b04f784e5d19ed58892833dae845738972cea260 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 18 Aug 2010 04:37:34 +1000 Subject: fs: remove extra lookup in __lookup_hash fs: remove extra lookup in __lookup_hash Optimize lookup for create operations, where no dentry should often be common-case. In cases where it is not, such as unlink, the added overhead is much smaller than the removed. Also, move comments about __d_lookup racyness to the __d_lookup call site. d_lookup is intuitive; __d_lookup is what needs commenting. So in that same vein, add kerneldoc comments to __d_lookup and clean up some of the comments: - We are interested in how the RCU lookup works here, particularly with renames. Make that explicit, and point to the document where it is explained in more detail. - RCU is pretty standard now, and macros make implementations pretty mindless. If we want to know about RCU barrier details, we look in RCU code. - Delete some boring legacy comments because we don't care much about how the code used to work, more about the interesting parts of how it works now. So comments about lazy LRU may be interesting, but would better be done in the LRU or refcount management code. Signed-off-by: Nick Piggin Signed-off-by: Al Viro --- fs/dcache.c | 60 +++++++++++++++++++++++++++++++++++------------------------- fs/namei.c | 32 ++++++++++++++++---------------- 2 files changed, 51 insertions(+), 41 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 4d13bf50b7b1..d56a40b5a577 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1332,31 +1332,13 @@ EXPORT_SYMBOL(d_add_ci); * d_lookup - search for a dentry * @parent: parent dentry * @name: qstr of name we wish to find + * Returns: dentry, or NULL * - * Searches the children of the parent dentry for the name in question. If - * the dentry is found its reference count is incremented and the dentry - * is returned. The caller must use dput to free the entry when it has - * finished using it. %NULL is returned on failure. - * - * __d_lookup is dcache_lock free. The hash list is protected using RCU. - * Memory barriers are used while updating and doing lockless traversal. - * To avoid races with d_move while rename is happening, d_lock is used. - * - * Overflows in memcmp(), while d_move, are avoided by keeping the length - * and name pointer in one structure pointed by d_qstr. - * - * rcu_read_lock() and rcu_read_unlock() are used to disable preemption while - * lookup is going on. - * - * The dentry unused LRU is not updated even if lookup finds the required dentry - * in there. It is updated in places such as prune_dcache, shrink_dcache_sb, - * select_parent and __dget_locked. This laziness saves lookup from dcache_lock - * acquisition. - * - * d_lookup() is protected against the concurrent renames in some unrelated - * directory using the seqlockt_t rename_lock. + * d_lookup searches the children of the parent dentry for the name in + * question. If the dentry is found its reference count is incremented and the + * dentry is returned. The caller must use dput to free the entry when it has + * finished using it. %NULL is returned if the dentry does not exist. */ - struct dentry * d_lookup(struct dentry * parent, struct qstr * name) { struct dentry * dentry = NULL; @@ -1372,6 +1354,21 @@ struct dentry * d_lookup(struct dentry * parent, struct qstr * name) } EXPORT_SYMBOL(d_lookup); +/* + * __d_lookup - search for a dentry (racy) + * @parent: parent dentry + * @name: qstr of name we wish to find + * Returns: dentry, or NULL + * + * __d_lookup is like d_lookup, however it may (rarely) return a + * false-negative result due to unrelated rename activity. + * + * __d_lookup is slightly faster by avoiding rename_lock read seqlock, + * however it must be used carefully, eg. with a following d_lookup in + * the case of failure. + * + * __d_lookup callers must be commented. + */ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) { unsigned int len = name->len; @@ -1382,6 +1379,19 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) struct hlist_node *node; struct dentry *dentry; + /* + * The hash list is protected using RCU. + * + * Take d_lock when comparing a candidate dentry, to avoid races + * with d_move(). + * + * It is possible that concurrent renames can mess up our list + * walk here and result in missing our dentry, resulting in the + * false-negative result. d_lookup() protects against concurrent + * renames using rename_lock seqlock. + * + * See Documentation/vfs/dcache-locking.txt for more details. + */ rcu_read_lock(); hlist_for_each_entry_rcu(dentry, node, head, d_hash) { @@ -1396,8 +1406,8 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) /* * Recheck the dentry after taking the lock - d_move may have - * changed things. Don't bother checking the hash because we're - * about to compare the whole name anyway. + * changed things. Don't bother checking the hash because + * we're about to compare the whole name anyway. */ if (dentry->d_parent != parent) goto next; diff --git a/fs/namei.c b/fs/namei.c index b815a4d2e1d6..11de7c39ff76 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -735,6 +735,11 @@ static int do_lookup(struct nameidata *nd, struct qstr *name, return err; } + /* + * Rename seqlock is not required here because in the off chance + * of a false negative due to a concurrent rename, we're going to + * do the non-racy lookup, below. + */ dentry = __d_lookup(nd->path.dentry, name); if (!dentry) goto need_lookup; @@ -754,17 +759,13 @@ need_lookup: mutex_lock(&dir->i_mutex); /* * First re-do the cached lookup just in case it was created - * while we waited for the directory semaphore.. - * - * FIXME! This could use version numbering or similar to - * avoid unnecessary cache lookups. - * - * The "dcache_lock" is purely to protect the RCU list walker - * from concurrent renames at this point (we mustn't get false - * negatives from the RCU list walk here, unlike the optimistic - * fast walk). + * while we waited for the directory semaphore, or the first + * lookup failed due to an unrelated rename. * - * so doing d_lookup() (with seqlock), instead of lockfree __d_lookup + * This could use version numbering or similar to avoid unnecessary + * cache lookups, but then we'd have to do the first lookup in the + * non-racy way. However in the common case here, everything should + * be hot in cache, so would it be a big win? */ dentry = d_lookup(parent, name); if (likely(!dentry)) { @@ -1136,13 +1137,12 @@ static struct dentry *__lookup_hash(struct qstr *name, goto out; } - dentry = __d_lookup(base, name); - - /* lockess __d_lookup may fail due to concurrent d_move() - * in some unrelated directory, so try with d_lookup + /* + * Don't bother with __d_lookup: callers are for creat as + * well as unlink, so a lot of the time it would cost + * a double lookup. */ - if (!dentry) - dentry = d_lookup(base, name); + dentry = d_lookup(base, name); if (dentry && dentry->d_op && dentry->d_op->d_revalidate) dentry = do_revalidate(dentry, nd); -- cgit v1.2.3 From ee2ffa0dfdd2db19705f2ba1c6a4c0bfe8122dd8 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 18 Aug 2010 04:37:35 +1000 Subject: fs: cleanup files_lock locking fs: cleanup files_lock locking Lock tty_files with a new spinlock, tty_files_lock; provide helpers to manipulate the per-sb files list; unexport the files_lock spinlock. Cc: linux-kernel@vger.kernel.org Cc: Christoph Hellwig Cc: Alan Cox Acked-by: Andi Kleen Acked-by: Greg Kroah-Hartman Signed-off-by: Nick Piggin Signed-off-by: Al Viro --- drivers/char/pty.c | 6 +++++- drivers/char/tty_io.c | 26 ++++++++++++++++++-------- fs/file_table.c | 42 ++++++++++++++++++------------------------ fs/open.c | 4 ++-- include/linux/fs.h | 7 ++----- include/linux/tty.h | 1 + security/selinux/hooks.c | 4 ++-- 7 files changed, 48 insertions(+), 42 deletions(-) (limited to 'fs') diff --git a/drivers/char/pty.c b/drivers/char/pty.c index ad46eae1f9bb..2c64faa8efa4 100644 --- a/drivers/char/pty.c +++ b/drivers/char/pty.c @@ -676,7 +676,11 @@ static int ptmx_open(struct inode *inode, struct file *filp) set_bit(TTY_PTY_LOCK, &tty->flags); /* LOCK THE SLAVE */ filp->private_data = tty; - file_move(filp, &tty->tty_files); + + file_sb_list_del(filp); /* __dentry_open has put it on the sb list */ + spin_lock(&tty_files_lock); + list_add(&filp->f_u.fu_list, &tty->tty_files); + spin_unlock(&tty_files_lock); retval = devpts_pty_new(inode, tty->link); if (retval) diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c index 0350c42375a2..cd5b829634ea 100644 --- a/drivers/char/tty_io.c +++ b/drivers/char/tty_io.c @@ -136,6 +136,9 @@ LIST_HEAD(tty_drivers); /* linked list of tty drivers */ DEFINE_MUTEX(tty_mutex); EXPORT_SYMBOL(tty_mutex); +/* Spinlock to protect the tty->tty_files list */ +DEFINE_SPINLOCK(tty_files_lock); + static ssize_t tty_read(struct file *, char __user *, size_t, loff_t *); static ssize_t tty_write(struct file *, const char __user *, size_t, loff_t *); ssize_t redirected_tty_write(struct file *, const char __user *, @@ -235,11 +238,11 @@ static int check_tty_count(struct tty_struct *tty, const char *routine) struct list_head *p; int count = 0; - file_list_lock(); + spin_lock(&tty_files_lock); list_for_each(p, &tty->tty_files) { count++; } - file_list_unlock(); + spin_unlock(&tty_files_lock); if (tty->driver->type == TTY_DRIVER_TYPE_PTY && tty->driver->subtype == PTY_TYPE_SLAVE && tty->link && tty->link->count) @@ -519,7 +522,7 @@ void __tty_hangup(struct tty_struct *tty) workqueue with the lock held */ check_tty_count(tty, "tty_hangup"); - file_list_lock(); + spin_lock(&tty_files_lock); /* This breaks for file handles being sent over AF_UNIX sockets ? */ list_for_each_entry(filp, &tty->tty_files, f_u.fu_list) { if (filp->f_op->write == redirected_tty_write) @@ -530,7 +533,7 @@ void __tty_hangup(struct tty_struct *tty) __tty_fasync(-1, filp, 0); /* can't block */ filp->f_op = &hung_up_tty_fops; } - file_list_unlock(); + spin_unlock(&tty_files_lock); tty_ldisc_hangup(tty); @@ -1424,9 +1427,9 @@ static void release_one_tty(struct work_struct *work) tty_driver_kref_put(driver); module_put(driver->owner); - file_list_lock(); + spin_lock(&tty_files_lock); list_del_init(&tty->tty_files); - file_list_unlock(); + spin_unlock(&tty_files_lock); put_pid(tty->pgrp); put_pid(tty->session); @@ -1671,7 +1674,10 @@ int tty_release(struct inode *inode, struct file *filp) * - do_tty_hangup no longer sees this file descriptor as * something that needs to be handled for hangups. */ - file_kill(filp); + spin_lock(&tty_files_lock); + BUG_ON(list_empty(&filp->f_u.fu_list)); + list_del_init(&filp->f_u.fu_list); + spin_unlock(&tty_files_lock); filp->private_data = NULL; /* @@ -1840,7 +1846,11 @@ got_driver: } filp->private_data = tty; - file_move(filp, &tty->tty_files); + BUG_ON(list_empty(&filp->f_u.fu_list)); + file_sb_list_del(filp); /* __dentry_open has put it on the sb list */ + spin_lock(&tty_files_lock); + list_add(&filp->f_u.fu_list, &tty->tty_files); + spin_unlock(&tty_files_lock); check_tty_count(tty, "tty_open"); if (tty->driver->type == TTY_DRIVER_TYPE_PTY && tty->driver->subtype == PTY_TYPE_MASTER) diff --git a/fs/file_table.c b/fs/file_table.c index edecd36fed9b..6f0e62ecfddd 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -32,8 +32,7 @@ struct files_stat_struct files_stat = { .max_files = NR_FILE }; -/* public. Not pretty! */ -__cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock); +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock); /* SLAB cache for file structures */ static struct kmem_cache *filp_cachep __read_mostly; @@ -249,7 +248,7 @@ static void __fput(struct file *file) cdev_put(inode->i_cdev); fops_put(file->f_op); put_pid(file->f_owner.pid); - file_kill(file); + file_sb_list_del(file); if (file->f_mode & FMODE_WRITE) drop_file_write_access(file); file->f_path.dentry = NULL; @@ -328,31 +327,29 @@ struct file *fget_light(unsigned int fd, int *fput_needed) return file; } - void put_filp(struct file *file) { if (atomic_long_dec_and_test(&file->f_count)) { security_file_free(file); - file_kill(file); + file_sb_list_del(file); file_free(file); } } -void file_move(struct file *file, struct list_head *list) +void file_sb_list_add(struct file *file, struct super_block *sb) { - if (!list) - return; - file_list_lock(); - list_move(&file->f_u.fu_list, list); - file_list_unlock(); + spin_lock(&files_lock); + BUG_ON(!list_empty(&file->f_u.fu_list)); + list_add(&file->f_u.fu_list, &sb->s_files); + spin_unlock(&files_lock); } -void file_kill(struct file *file) +void file_sb_list_del(struct file *file) { if (!list_empty(&file->f_u.fu_list)) { - file_list_lock(); + spin_lock(&files_lock); list_del_init(&file->f_u.fu_list); - file_list_unlock(); + spin_unlock(&files_lock); } } @@ -361,7 +358,7 @@ int fs_may_remount_ro(struct super_block *sb) struct file *file; /* Check that no files are currently opened for writing. */ - file_list_lock(); + spin_lock(&files_lock); list_for_each_entry(file, &sb->s_files, f_u.fu_list) { struct inode *inode = file->f_path.dentry->d_inode; @@ -373,10 +370,10 @@ int fs_may_remount_ro(struct super_block *sb) if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE)) goto too_bad; } - file_list_unlock(); + spin_unlock(&files_lock); return 1; /* Tis' cool bro. */ too_bad: - file_list_unlock(); + spin_unlock(&files_lock); return 0; } @@ -392,7 +389,7 @@ void mark_files_ro(struct super_block *sb) struct file *f; retry: - file_list_lock(); + spin_lock(&files_lock); list_for_each_entry(f, &sb->s_files, f_u.fu_list) { struct vfsmount *mnt; if (!S_ISREG(f->f_path.dentry->d_inode->i_mode)) @@ -408,16 +405,13 @@ retry: continue; file_release_write(f); mnt = mntget(f->f_path.mnt); - file_list_unlock(); - /* - * This can sleep, so we can't hold - * the file_list_lock() spinlock. - */ + /* This can sleep, so we can't hold the spinlock. */ + spin_unlock(&files_lock); mnt_drop_write(mnt); mntput(mnt); goto retry; } - file_list_unlock(); + spin_unlock(&files_lock); } void __init files_init(unsigned long mempages) diff --git a/fs/open.c b/fs/open.c index 630715f9f73d..d74e1983e8dc 100644 --- a/fs/open.c +++ b/fs/open.c @@ -675,7 +675,7 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, f->f_path.mnt = mnt; f->f_pos = 0; f->f_op = fops_get(inode->i_fop); - file_move(f, &inode->i_sb->s_files); + file_sb_list_add(f, inode->i_sb); error = security_dentry_open(f, cred); if (error) @@ -721,7 +721,7 @@ cleanup_all: mnt_drop_write(mnt); } } - file_kill(f); + file_sb_list_del(f); f->f_path.dentry = NULL; f->f_path.mnt = NULL; cleanup_file: diff --git a/include/linux/fs.h b/include/linux/fs.h index 29f7c975304c..5a9a9e5a3705 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -944,9 +944,6 @@ struct file { unsigned long f_mnt_write_state; #endif }; -extern spinlock_t files_lock; -#define file_list_lock() spin_lock(&files_lock); -#define file_list_unlock() spin_unlock(&files_lock); #define get_file(x) atomic_long_inc(&(x)->f_count) #define fput_atomic(x) atomic_long_add_unless(&(x)->f_count, -1, 1) @@ -2188,8 +2185,8 @@ static inline void insert_inode_hash(struct inode *inode) { __insert_inode_hash(inode, inode->i_ino); } -extern void file_move(struct file *f, struct list_head *list); -extern void file_kill(struct file *f); +extern void file_sb_list_add(struct file *f, struct super_block *sb); +extern void file_sb_list_del(struct file *f); #ifdef CONFIG_BLOCK extern void submit_bio(int, struct bio *); extern int bdev_read_only(struct block_device *); diff --git a/include/linux/tty.h b/include/linux/tty.h index 1437da3ddc62..f6b371a2514e 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -470,6 +470,7 @@ extern struct tty_struct *tty_pair_get_tty(struct tty_struct *tty); extern struct tty_struct *tty_pair_get_pty(struct tty_struct *tty); extern struct mutex tty_mutex; +extern spinlock_t tty_files_lock; extern void tty_write_unlock(struct tty_struct *tty); extern int tty_write_lock(struct tty_struct *tty, int ndelay); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 42043f96e54f..bd7da0f0ccf3 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2170,7 +2170,7 @@ static inline void flush_unauthorized_files(const struct cred *cred, tty = get_current_tty(); if (tty) { - file_list_lock(); + spin_lock(&tty_files_lock); if (!list_empty(&tty->tty_files)) { struct inode *inode; @@ -2186,7 +2186,7 @@ static inline void flush_unauthorized_files(const struct cred *cred, drop_tty = 1; } } - file_list_unlock(); + spin_unlock(&tty_files_lock); tty_kref_put(tty); } /* Reset controlling tty. */ -- cgit v1.2.3 From d996b62a8df1d935b01319bf8defb95b5709f7b8 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 18 Aug 2010 04:37:36 +1000 Subject: tty: fix fu_list abuse tty: fix fu_list abuse tty code abuses fu_list, which causes a bug in remount,ro handling. If a tty device node is opened on a filesystem, then the last link to the inode removed, the filesystem will be allowed to be remounted readonly. This is because fs_may_remount_ro does not find the 0 link tty inode on the file sb list (because the tty code incorrectly removed it to use for its own purpose). This can result in a filesystem with errors after it is marked "clean". Taking idea from Christoph's initial patch, allocate a tty private struct at file->private_data and put our required list fields in there, linking file and tty. This makes tty nodes behave the same way as other device nodes and avoid meddling with the vfs, and avoids this bug. The error handling is not trivial in the tty code, so for this bugfix, I take the simple approach of using __GFP_NOFAIL and don't worry about memory errors. This is not a problem because our allocator doesn't fail small allocs as a rule anyway. So proper error handling is left as an exercise for tty hackers. [ Arguably filesystem's device inode would ideally be divorced from the driver's pseudo inode when it is opened, but in practice it's not clear whether that will ever be worth implementing. ] Cc: linux-kernel@vger.kernel.org Cc: Christoph Hellwig Cc: Alan Cox Cc: Greg Kroah-Hartman Signed-off-by: Nick Piggin Signed-off-by: Al Viro --- drivers/char/pty.c | 6 +--- drivers/char/tty_io.c | 84 +++++++++++++++++++++++++++++++----------------- fs/internal.h | 2 ++ include/linux/fs.h | 2 -- include/linux/tty.h | 8 +++++ security/selinux/hooks.c | 5 ++- 6 files changed, 69 insertions(+), 38 deletions(-) (limited to 'fs') diff --git a/drivers/char/pty.c b/drivers/char/pty.c index 2c64faa8efa4..c350d01716bd 100644 --- a/drivers/char/pty.c +++ b/drivers/char/pty.c @@ -675,12 +675,8 @@ static int ptmx_open(struct inode *inode, struct file *filp) } set_bit(TTY_PTY_LOCK, &tty->flags); /* LOCK THE SLAVE */ - filp->private_data = tty; - file_sb_list_del(filp); /* __dentry_open has put it on the sb list */ - spin_lock(&tty_files_lock); - list_add(&filp->f_u.fu_list, &tty->tty_files); - spin_unlock(&tty_files_lock); + tty_add_file(tty, filp); retval = devpts_pty_new(inode, tty->link); if (retval) diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c index cd5b829634ea..949067a0bd47 100644 --- a/drivers/char/tty_io.c +++ b/drivers/char/tty_io.c @@ -188,6 +188,41 @@ void free_tty_struct(struct tty_struct *tty) kfree(tty); } +static inline struct tty_struct *file_tty(struct file *file) +{ + return ((struct tty_file_private *)file->private_data)->tty; +} + +/* Associate a new file with the tty structure */ +void tty_add_file(struct tty_struct *tty, struct file *file) +{ + struct tty_file_private *priv; + + /* XXX: must implement proper error handling in callers */ + priv = kmalloc(sizeof(*priv), GFP_KERNEL|__GFP_NOFAIL); + + priv->tty = tty; + priv->file = file; + file->private_data = priv; + + spin_lock(&tty_files_lock); + list_add(&priv->list, &tty->tty_files); + spin_unlock(&tty_files_lock); +} + +/* Delete file from its tty */ +void tty_del_file(struct file *file) +{ + struct tty_file_private *priv = file->private_data; + + spin_lock(&tty_files_lock); + list_del(&priv->list); + spin_unlock(&tty_files_lock); + file->private_data = NULL; + kfree(priv); +} + + #define TTY_NUMBER(tty) ((tty)->index + (tty)->driver->name_base) /** @@ -500,6 +535,7 @@ void __tty_hangup(struct tty_struct *tty) struct file *cons_filp = NULL; struct file *filp, *f = NULL; struct task_struct *p; + struct tty_file_private *priv; int closecount = 0, n; unsigned long flags; int refs = 0; @@ -509,7 +545,7 @@ void __tty_hangup(struct tty_struct *tty) spin_lock(&redirect_lock); - if (redirect && redirect->private_data == tty) { + if (redirect && file_tty(redirect) == tty) { f = redirect; redirect = NULL; } @@ -524,7 +560,8 @@ void __tty_hangup(struct tty_struct *tty) spin_lock(&tty_files_lock); /* This breaks for file handles being sent over AF_UNIX sockets ? */ - list_for_each_entry(filp, &tty->tty_files, f_u.fu_list) { + list_for_each_entry(priv, &tty->tty_files, list) { + filp = priv->file; if (filp->f_op->write == redirected_tty_write) cons_filp = filp; if (filp->f_op->write != tty_write) @@ -892,12 +929,10 @@ static ssize_t tty_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { int i; - struct tty_struct *tty; - struct inode *inode; + struct inode *inode = file->f_path.dentry->d_inode; + struct tty_struct *tty = file_tty(file); struct tty_ldisc *ld; - tty = file->private_data; - inode = file->f_path.dentry->d_inode; if (tty_paranoia_check(tty, inode, "tty_read")) return -EIO; if (!tty || (test_bit(TTY_IO_ERROR, &tty->flags))) @@ -1068,12 +1103,11 @@ void tty_write_message(struct tty_struct *tty, char *msg) static ssize_t tty_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct tty_struct *tty; struct inode *inode = file->f_path.dentry->d_inode; + struct tty_struct *tty = file_tty(file); + struct tty_ldisc *ld; ssize_t ret; - struct tty_ldisc *ld; - tty = file->private_data; if (tty_paranoia_check(tty, inode, "tty_write")) return -EIO; if (!tty || !tty->ops->write || @@ -1510,13 +1544,13 @@ static void release_tty(struct tty_struct *tty, int idx) int tty_release(struct inode *inode, struct file *filp) { - struct tty_struct *tty, *o_tty; + struct tty_struct *tty = file_tty(filp); + struct tty_struct *o_tty; int pty_master, tty_closing, o_tty_closing, do_sleep; int devpts; int idx; char buf[64]; - tty = filp->private_data; if (tty_paranoia_check(tty, inode, "tty_release_dev")) return 0; @@ -1674,11 +1708,7 @@ int tty_release(struct inode *inode, struct file *filp) * - do_tty_hangup no longer sees this file descriptor as * something that needs to be handled for hangups. */ - spin_lock(&tty_files_lock); - BUG_ON(list_empty(&filp->f_u.fu_list)); - list_del_init(&filp->f_u.fu_list); - spin_unlock(&tty_files_lock); - filp->private_data = NULL; + tty_del_file(filp); /* * Perform some housekeeping before deciding whether to return. @@ -1845,12 +1875,8 @@ got_driver: return PTR_ERR(tty); } - filp->private_data = tty; - BUG_ON(list_empty(&filp->f_u.fu_list)); - file_sb_list_del(filp); /* __dentry_open has put it on the sb list */ - spin_lock(&tty_files_lock); - list_add(&filp->f_u.fu_list, &tty->tty_files); - spin_unlock(&tty_files_lock); + tty_add_file(tty, filp); + check_tty_count(tty, "tty_open"); if (tty->driver->type == TTY_DRIVER_TYPE_PTY && tty->driver->subtype == PTY_TYPE_MASTER) @@ -1926,11 +1952,10 @@ got_driver: static unsigned int tty_poll(struct file *filp, poll_table *wait) { - struct tty_struct *tty; + struct tty_struct *tty = file_tty(filp); struct tty_ldisc *ld; int ret = 0; - tty = filp->private_data; if (tty_paranoia_check(tty, filp->f_path.dentry->d_inode, "tty_poll")) return 0; @@ -1943,11 +1968,10 @@ static unsigned int tty_poll(struct file *filp, poll_table *wait) static int __tty_fasync(int fd, struct file *filp, int on) { - struct tty_struct *tty; + struct tty_struct *tty = file_tty(filp); unsigned long flags; int retval = 0; - tty = filp->private_data; if (tty_paranoia_check(tty, filp->f_path.dentry->d_inode, "tty_fasync")) goto out; @@ -2501,13 +2525,13 @@ EXPORT_SYMBOL(tty_pair_get_pty); */ long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - struct tty_struct *tty, *real_tty; + struct tty_struct *tty = file_tty(file); + struct tty_struct *real_tty; void __user *p = (void __user *)arg; int retval; struct tty_ldisc *ld; struct inode *inode = file->f_dentry->d_inode; - tty = file->private_data; if (tty_paranoia_check(tty, inode, "tty_ioctl")) return -EINVAL; @@ -2629,7 +2653,7 @@ static long tty_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct inode *inode = file->f_dentry->d_inode; - struct tty_struct *tty = file->private_data; + struct tty_struct *tty = file_tty(file); struct tty_ldisc *ld; int retval = -ENOIOCTLCMD; @@ -2721,7 +2745,7 @@ void __do_SAK(struct tty_struct *tty) if (!filp) continue; if (filp->f_op->read == tty_read && - filp->private_data == tty) { + file_tty(filp) == tty) { printk(KERN_NOTICE "SAK: killed process %d" " (%s): fd#%d opened to the tty\n", task_pid_nr(p), p->comm, i); diff --git a/fs/internal.h b/fs/internal.h index 6b706bc60a66..6a5c13a80660 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -80,6 +80,8 @@ extern void chroot_fs_refs(struct path *, struct path *); /* * file_table.c */ +extern void file_sb_list_add(struct file *f, struct super_block *sb); +extern void file_sb_list_del(struct file *f); extern void mark_files_ro(struct super_block *); extern struct file *get_empty_filp(void); diff --git a/include/linux/fs.h b/include/linux/fs.h index 5a9a9e5a3705..5e65add0f163 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2185,8 +2185,6 @@ static inline void insert_inode_hash(struct inode *inode) { __insert_inode_hash(inode, inode->i_ino); } -extern void file_sb_list_add(struct file *f, struct super_block *sb); -extern void file_sb_list_del(struct file *f); #ifdef CONFIG_BLOCK extern void submit_bio(int, struct bio *); extern int bdev_read_only(struct block_device *); diff --git a/include/linux/tty.h b/include/linux/tty.h index f6b371a2514e..67d64e6efe7a 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -329,6 +329,13 @@ struct tty_struct { struct tty_port *port; }; +/* Each of a tty's open files has private_data pointing to tty_file_private */ +struct tty_file_private { + struct tty_struct *tty; + struct file *file; + struct list_head list; +}; + /* tty magic number */ #define TTY_MAGIC 0x5401 @@ -458,6 +465,7 @@ extern void proc_clear_tty(struct task_struct *p); extern struct tty_struct *get_current_tty(void); extern void tty_default_fops(struct file_operations *fops); extern struct tty_struct *alloc_tty_struct(void); +extern void tty_add_file(struct tty_struct *tty, struct file *file); extern void free_tty_struct(struct tty_struct *tty); extern void initialize_tty_struct(struct tty_struct *tty, struct tty_driver *driver, int idx); diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index bd7da0f0ccf3..4796ddd4e721 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2172,6 +2172,7 @@ static inline void flush_unauthorized_files(const struct cred *cred, if (tty) { spin_lock(&tty_files_lock); if (!list_empty(&tty->tty_files)) { + struct tty_file_private *file_priv; struct inode *inode; /* Revalidate access to controlling tty. @@ -2179,7 +2180,9 @@ static inline void flush_unauthorized_files(const struct cred *cred, than using file_has_perm, as this particular open file may belong to another process and we are only interested in the inode-based check here. */ - file = list_first_entry(&tty->tty_files, struct file, f_u.fu_list); + file_priv = list_first_entry(&tty->tty_files, + struct tty_file_private, list); + file = file_priv->file; inode = file->f_path.dentry->d_inode; if (inode_has_perm(cred, inode, FILE__READ | FILE__WRITE, NULL)) { -- cgit v1.2.3 From 6416ccb7899960868f5016751fb81bf25213d24f Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 18 Aug 2010 04:37:38 +1000 Subject: fs: scale files_lock fs: scale files_lock Improve scalability of files_lock by adding per-cpu, per-sb files lists, protected with an lglock. The lglock provides fast access to the per-cpu lists to add and remove files. It also provides a snapshot of all the per-cpu lists (although this is very slow). One difficulty with this approach is that a file can be removed from the list by another CPU. We must track which per-cpu list the file is on with a new variale in the file struct (packed into a hole on 64-bit archs). Scalability could suffer if files are frequently removed from different cpu's list. However loads with frequent removal of files imply short interval between adding and removing the files, and the scheduler attempts to avoid moving processes too far away. Also, even in the case of cross-CPU removal, the hardware has much more opportunity to parallelise cacheline transfers with N cachelines than with 1. A worst-case test of 1 CPU allocating files subsequently being freed by N CPUs degenerates to contending on a single lock, which is no worse than before. When more than one CPU are allocating files, even if they are always freed by different CPUs, there will be more parallelism than the single-lock case. Testing results: On a 2 socket, 8 core opteron, I measure the number of times the lock is taken to remove the file, the number of times it is removed by the same CPU that added it, and the number of times it is removed by the same node that added it. Booting: locks= 25049 cpu-hits= 23174 (92.5%) node-hits= 23945 (95.6%) kbuild -j16 locks=2281913 cpu-hits=2208126 (96.8%) node-hits=2252674 (98.7%) dbench 64 locks=4306582 cpu-hits=4287247 (99.6%) node-hits=4299527 (99.8%) So a file is removed from the same CPU it was added by over 90% of the time. It remains within the same node 95% of the time. Tim Chen ran some numbers for a 64 thread Nehalem system performing a compile. throughput 2.6.34-rc2 24.5 +patch 24.9 us sys idle IO wait (in %) 2.6.34-rc2 51.25 28.25 17.25 3.25 +patch 53.75 18.5 19 8.75 So significantly less CPU time spent in kernel code, higher idle time and slightly higher throughput. Single threaded performance difference was within the noise of microbenchmarks. That is not to say penalty does not exist, the code is larger and more memory accesses required so it will be slightly slower. Cc: linux-kernel@vger.kernel.org Cc: Tim Chen Cc: Andi Kleen Signed-off-by: Nick Piggin Signed-off-by: Al Viro --- fs/file_table.c | 108 ++++++++++++++++++++++++++++++++++++++++++++--------- fs/super.c | 18 +++++++++ include/linux/fs.h | 7 ++++ 3 files changed, 115 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/file_table.c b/fs/file_table.c index 6f0e62ecfddd..a04bdd81c11c 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -20,7 +20,9 @@ #include #include #include +#include #include +#include #include #include @@ -32,7 +34,8 @@ struct files_stat_struct files_stat = { .max_files = NR_FILE }; -static __cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock); +DECLARE_LGLOCK(files_lglock); +DEFINE_LGLOCK(files_lglock); /* SLAB cache for file structures */ static struct kmem_cache *filp_cachep __read_mostly; @@ -336,30 +339,98 @@ void put_filp(struct file *file) } } +static inline int file_list_cpu(struct file *file) +{ +#ifdef CONFIG_SMP + return file->f_sb_list_cpu; +#else + return smp_processor_id(); +#endif +} + +/* helper for file_sb_list_add to reduce ifdefs */ +static inline void __file_sb_list_add(struct file *file, struct super_block *sb) +{ + struct list_head *list; +#ifdef CONFIG_SMP + int cpu; + cpu = smp_processor_id(); + file->f_sb_list_cpu = cpu; + list = per_cpu_ptr(sb->s_files, cpu); +#else + list = &sb->s_files; +#endif + list_add(&file->f_u.fu_list, list); +} + +/** + * file_sb_list_add - add a file to the sb's file list + * @file: file to add + * @sb: sb to add it to + * + * Use this function to associate a file with the superblock of the inode it + * refers to. + */ void file_sb_list_add(struct file *file, struct super_block *sb) { - spin_lock(&files_lock); - BUG_ON(!list_empty(&file->f_u.fu_list)); - list_add(&file->f_u.fu_list, &sb->s_files); - spin_unlock(&files_lock); + lg_local_lock(files_lglock); + __file_sb_list_add(file, sb); + lg_local_unlock(files_lglock); } +/** + * file_sb_list_del - remove a file from the sb's file list + * @file: file to remove + * @sb: sb to remove it from + * + * Use this function to remove a file from its superblock. + */ void file_sb_list_del(struct file *file) { if (!list_empty(&file->f_u.fu_list)) { - spin_lock(&files_lock); + lg_local_lock_cpu(files_lglock, file_list_cpu(file)); list_del_init(&file->f_u.fu_list); - spin_unlock(&files_lock); + lg_local_unlock_cpu(files_lglock, file_list_cpu(file)); } } +#ifdef CONFIG_SMP + +/* + * These macros iterate all files on all CPUs for a given superblock. + * files_lglock must be held globally. + */ +#define do_file_list_for_each_entry(__sb, __file) \ +{ \ + int i; \ + for_each_possible_cpu(i) { \ + struct list_head *list; \ + list = per_cpu_ptr((__sb)->s_files, i); \ + list_for_each_entry((__file), list, f_u.fu_list) + +#define while_file_list_for_each_entry \ + } \ +} + +#else + +#define do_file_list_for_each_entry(__sb, __file) \ +{ \ + struct list_head *list; \ + list = &(sb)->s_files; \ + list_for_each_entry((__file), list, f_u.fu_list) + +#define while_file_list_for_each_entry \ +} + +#endif + int fs_may_remount_ro(struct super_block *sb) { struct file *file; - /* Check that no files are currently opened for writing. */ - spin_lock(&files_lock); - list_for_each_entry(file, &sb->s_files, f_u.fu_list) { + lg_global_lock(files_lglock); + do_file_list_for_each_entry(sb, file) { struct inode *inode = file->f_path.dentry->d_inode; /* File with pending delete? */ @@ -369,11 +440,11 @@ int fs_may_remount_ro(struct super_block *sb) /* Writeable file? */ if (S_ISREG(inode->i_mode) && (file->f_mode & FMODE_WRITE)) goto too_bad; - } - spin_unlock(&files_lock); + } while_file_list_for_each_entry; + lg_global_unlock(files_lglock); return 1; /* Tis' cool bro. */ too_bad: - spin_unlock(&files_lock); + lg_global_unlock(files_lglock); return 0; } @@ -389,8 +460,8 @@ void mark_files_ro(struct super_block *sb) struct file *f; retry: - spin_lock(&files_lock); - list_for_each_entry(f, &sb->s_files, f_u.fu_list) { + lg_global_lock(files_lglock); + do_file_list_for_each_entry(sb, f) { struct vfsmount *mnt; if (!S_ISREG(f->f_path.dentry->d_inode->i_mode)) continue; @@ -406,12 +477,12 @@ retry: file_release_write(f); mnt = mntget(f->f_path.mnt); /* This can sleep, so we can't hold the spinlock. */ - spin_unlock(&files_lock); + lg_global_unlock(files_lglock); mnt_drop_write(mnt); mntput(mnt); goto retry; - } - spin_unlock(&files_lock); + } while_file_list_for_each_entry; + lg_global_unlock(files_lglock); } void __init files_init(unsigned long mempages) @@ -431,5 +502,6 @@ void __init files_init(unsigned long mempages) if (files_stat.max_files < NR_FILE) files_stat.max_files = NR_FILE; files_defer_init(); + lg_lock_init(files_lglock); percpu_counter_init(&nr_files, 0); } diff --git a/fs/super.c b/fs/super.c index 9674ab2c8718..8819e3a7ff20 100644 --- a/fs/super.c +++ b/fs/super.c @@ -54,7 +54,22 @@ static struct super_block *alloc_super(struct file_system_type *type) s = NULL; goto out; } +#ifdef CONFIG_SMP + s->s_files = alloc_percpu(struct list_head); + if (!s->s_files) { + security_sb_free(s); + kfree(s); + s = NULL; + goto out; + } else { + int i; + + for_each_possible_cpu(i) + INIT_LIST_HEAD(per_cpu_ptr(s->s_files, i)); + } +#else INIT_LIST_HEAD(&s->s_files); +#endif INIT_LIST_HEAD(&s->s_instances); INIT_HLIST_HEAD(&s->s_anon); INIT_LIST_HEAD(&s->s_inodes); @@ -108,6 +123,9 @@ out: */ static inline void destroy_super(struct super_block *s) { +#ifdef CONFIG_SMP + free_percpu(s->s_files); +#endif security_sb_free(s); kfree(s->s_subtype); kfree(s->s_options); diff --git a/include/linux/fs.h b/include/linux/fs.h index 5e65add0f163..76041b614758 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -920,6 +920,9 @@ struct file { #define f_vfsmnt f_path.mnt const struct file_operations *f_op; spinlock_t f_lock; /* f_ep_links, f_flags, no IRQ */ +#ifdef CONFIG_SMP + int f_sb_list_cpu; +#endif atomic_long_t f_count; unsigned int f_flags; fmode_t f_mode; @@ -1334,7 +1337,11 @@ struct super_block { struct list_head s_inodes; /* all inodes */ struct hlist_head s_anon; /* anonymous dentries for (nfs) exporting */ +#ifdef CONFIG_SMP + struct list_head __percpu *s_files; +#else struct list_head s_files; +#endif /* s_dentry_lru and s_nr_dentry_unused are protected by dcache_lock */ struct list_head s_dentry_lru; /* unused dentry lru */ int s_nr_dentry_unused; /* # of dentry on lru */ -- cgit v1.2.3 From 99b7db7b8ffd6bb755eb0a175596421a0b581cb2 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 18 Aug 2010 04:37:39 +1000 Subject: fs: brlock vfsmount_lock fs: brlock vfsmount_lock Use a brlock for the vfsmount lock. It must be taken for write whenever modifying the mount hash or associated fields, and may be taken for read when performing mount hash lookups. A new lock is added for the mnt-id allocator, so it doesn't need to take the heavy vfsmount write-lock. The number of atomics should remain the same for fastpath rlock cases, though code would be slightly slower due to per-cpu access. Scalability is not not be much improved in common cases yet, due to other locks (ie. dcache_lock) getting in the way. However path lookups crossing mountpoints should be one case where scalability is improved (currently requiring the global lock). The slowpath is slower due to use of brlock. On a 64 core, 64 socket, 32 node Altix system (high latency to remote nodes), a simple umount microbenchmark (mount --bind mnt mnt2 ; umount mnt2 loop 1000 times), before this patch it took 6.8s, afterwards took 7.1s, about 5% slower. Cc: Al Viro Signed-off-by: Nick Piggin Signed-off-by: Al Viro --- fs/dcache.c | 11 ++-- fs/internal.h | 5 +- fs/namei.c | 7 ++- fs/namespace.c | 177 ++++++++++++++++++++++++++++++++++++--------------------- fs/pnode.c | 11 +++- 5 files changed, 134 insertions(+), 77 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index d56a40b5a577..83293be48149 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1935,7 +1935,7 @@ static int prepend_path(const struct path *path, struct path *root, bool slash = false; int error = 0; - spin_lock(&vfsmount_lock); + br_read_lock(vfsmount_lock); while (dentry != root->dentry || vfsmnt != root->mnt) { struct dentry * parent; @@ -1964,7 +1964,7 @@ out: if (!error && !slash) error = prepend(buffer, buflen, "/", 1); - spin_unlock(&vfsmount_lock); + br_read_unlock(vfsmount_lock); return error; global_root: @@ -2302,11 +2302,12 @@ int path_is_under(struct path *path1, struct path *path2) struct vfsmount *mnt = path1->mnt; struct dentry *dentry = path1->dentry; int res; - spin_lock(&vfsmount_lock); + + br_read_lock(vfsmount_lock); if (mnt != path2->mnt) { for (;;) { if (mnt->mnt_parent == mnt) { - spin_unlock(&vfsmount_lock); + br_read_unlock(vfsmount_lock); return 0; } if (mnt->mnt_parent == path2->mnt) @@ -2316,7 +2317,7 @@ int path_is_under(struct path *path1, struct path *path2) dentry = mnt->mnt_mountpoint; } res = is_subdir(dentry, path2->dentry); - spin_unlock(&vfsmount_lock); + br_read_unlock(vfsmount_lock); return res; } EXPORT_SYMBOL(path_is_under); diff --git a/fs/internal.h b/fs/internal.h index 6a5c13a80660..a6910e91cee8 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -9,6 +9,8 @@ * 2 of the License, or (at your option) any later version. */ +#include + struct super_block; struct linux_binprm; struct path; @@ -70,7 +72,8 @@ extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int); extern void __init mnt_init(void); -extern spinlock_t vfsmount_lock; +DECLARE_BRLOCK(vfsmount_lock); + /* * fs_struct.c diff --git a/fs/namei.c b/fs/namei.c index 11de7c39ff76..24896e833565 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -595,15 +595,16 @@ int follow_up(struct path *path) { struct vfsmount *parent; struct dentry *mountpoint; - spin_lock(&vfsmount_lock); + + br_read_lock(vfsmount_lock); parent = path->mnt->mnt_parent; if (parent == path->mnt) { - spin_unlock(&vfsmount_lock); + br_read_unlock(vfsmount_lock); return 0; } mntget(parent); mountpoint = dget(path->mnt->mnt_mountpoint); - spin_unlock(&vfsmount_lock); + br_read_unlock(vfsmount_lock); dput(path->dentry); path->dentry = mountpoint; mntput(path->mnt); diff --git a/fs/namespace.c b/fs/namespace.c index 2e10cb19c5b0..de402eb6eafb 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -11,6 +11,8 @@ #include #include #include +#include +#include #include #include #include @@ -38,12 +40,10 @@ #define HASH_SHIFT ilog2(PAGE_SIZE / sizeof(struct list_head)) #define HASH_SIZE (1UL << HASH_SHIFT) -/* spinlock for vfsmount related operations, inplace of dcache_lock */ -__cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock); - static int event; static DEFINE_IDA(mnt_id_ida); static DEFINE_IDA(mnt_group_ida); +static DEFINE_SPINLOCK(mnt_id_lock); static int mnt_id_start = 0; static int mnt_group_start = 1; @@ -55,6 +55,16 @@ static struct rw_semaphore namespace_sem; struct kobject *fs_kobj; EXPORT_SYMBOL_GPL(fs_kobj); +/* + * vfsmount lock may be taken for read to prevent changes to the + * vfsmount hash, ie. during mountpoint lookups or walking back + * up the tree. + * + * It should be taken for write in all cases where the vfsmount + * tree or hash is modified or when a vfsmount structure is modified. + */ +DEFINE_BRLOCK(vfsmount_lock); + static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry) { unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES); @@ -65,18 +75,21 @@ static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry) #define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16) -/* allocation is serialized by namespace_sem */ +/* + * allocation is serialized by namespace_sem, but we need the spinlock to + * serialize with freeing. + */ static int mnt_alloc_id(struct vfsmount *mnt) { int res; retry: ida_pre_get(&mnt_id_ida, GFP_KERNEL); - spin_lock(&vfsmount_lock); + spin_lock(&mnt_id_lock); res = ida_get_new_above(&mnt_id_ida, mnt_id_start, &mnt->mnt_id); if (!res) mnt_id_start = mnt->mnt_id + 1; - spin_unlock(&vfsmount_lock); + spin_unlock(&mnt_id_lock); if (res == -EAGAIN) goto retry; @@ -86,11 +99,11 @@ retry: static void mnt_free_id(struct vfsmount *mnt) { int id = mnt->mnt_id; - spin_lock(&vfsmount_lock); + spin_lock(&mnt_id_lock); ida_remove(&mnt_id_ida, id); if (mnt_id_start > id) mnt_id_start = id; - spin_unlock(&vfsmount_lock); + spin_unlock(&mnt_id_lock); } /* @@ -348,7 +361,7 @@ static int mnt_make_readonly(struct vfsmount *mnt) { int ret = 0; - spin_lock(&vfsmount_lock); + br_write_lock(vfsmount_lock); mnt->mnt_flags |= MNT_WRITE_HOLD; /* * After storing MNT_WRITE_HOLD, we'll read the counters. This store @@ -382,15 +395,15 @@ static int mnt_make_readonly(struct vfsmount *mnt) */ smp_wmb(); mnt->mnt_flags &= ~MNT_WRITE_HOLD; - spin_unlock(&vfsmount_lock); + br_write_unlock(vfsmount_lock); return ret; } static void __mnt_unmake_readonly(struct vfsmount *mnt) { - spin_lock(&vfsmount_lock); + br_write_lock(vfsmount_lock); mnt->mnt_flags &= ~MNT_READONLY; - spin_unlock(&vfsmount_lock); + br_write_unlock(vfsmount_lock); } void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb) @@ -414,6 +427,7 @@ void free_vfsmnt(struct vfsmount *mnt) /* * find the first or last mount at @dentry on vfsmount @mnt depending on * @dir. If @dir is set return the first mount else return the last mount. + * vfsmount_lock must be held for read or write. */ struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry, int dir) @@ -443,10 +457,11 @@ struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry, struct vfsmount *lookup_mnt(struct path *path) { struct vfsmount *child_mnt; - spin_lock(&vfsmount_lock); + + br_read_lock(vfsmount_lock); if ((child_mnt = __lookup_mnt(path->mnt, path->dentry, 1))) mntget(child_mnt); - spin_unlock(&vfsmount_lock); + br_read_unlock(vfsmount_lock); return child_mnt; } @@ -455,6 +470,9 @@ static inline int check_mnt(struct vfsmount *mnt) return mnt->mnt_ns == current->nsproxy->mnt_ns; } +/* + * vfsmount lock must be held for write + */ static void touch_mnt_namespace(struct mnt_namespace *ns) { if (ns) { @@ -463,6 +481,9 @@ static void touch_mnt_namespace(struct mnt_namespace *ns) } } +/* + * vfsmount lock must be held for write + */ static void __touch_mnt_namespace(struct mnt_namespace *ns) { if (ns && ns->event != event) { @@ -471,6 +492,9 @@ static void __touch_mnt_namespace(struct mnt_namespace *ns) } } +/* + * vfsmount lock must be held for write + */ static void detach_mnt(struct vfsmount *mnt, struct path *old_path) { old_path->dentry = mnt->mnt_mountpoint; @@ -482,6 +506,9 @@ static void detach_mnt(struct vfsmount *mnt, struct path *old_path) old_path->dentry->d_mounted--; } +/* + * vfsmount lock must be held for write + */ void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry, struct vfsmount *child_mnt) { @@ -490,6 +517,9 @@ void mnt_set_mountpoint(struct vfsmount *mnt, struct dentry *dentry, dentry->d_mounted++; } +/* + * vfsmount lock must be held for write + */ static void attach_mnt(struct vfsmount *mnt, struct path *path) { mnt_set_mountpoint(path->mnt, path->dentry, mnt); @@ -499,7 +529,7 @@ static void attach_mnt(struct vfsmount *mnt, struct path *path) } /* - * the caller must hold vfsmount_lock + * vfsmount lock must be held for write */ static void commit_tree(struct vfsmount *mnt) { @@ -623,39 +653,43 @@ static inline void __mntput(struct vfsmount *mnt) void mntput_no_expire(struct vfsmount *mnt) { repeat: - if (atomic_dec_and_lock(&mnt->mnt_count, &vfsmount_lock)) { - if (likely(!mnt->mnt_pinned)) { - spin_unlock(&vfsmount_lock); - __mntput(mnt); - return; - } - atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count); - mnt->mnt_pinned = 0; - spin_unlock(&vfsmount_lock); - acct_auto_close_mnt(mnt); - goto repeat; + if (atomic_add_unless(&mnt->mnt_count, -1, 1)) + return; + br_write_lock(vfsmount_lock); + if (!atomic_dec_and_test(&mnt->mnt_count)) { + br_write_unlock(vfsmount_lock); + return; + } + if (likely(!mnt->mnt_pinned)) { + br_write_unlock(vfsmount_lock); + __mntput(mnt); + return; } + atomic_add(mnt->mnt_pinned + 1, &mnt->mnt_count); + mnt->mnt_pinned = 0; + br_write_unlock(vfsmount_lock); + acct_auto_close_mnt(mnt); + goto repeat; } - EXPORT_SYMBOL(mntput_no_expire); void mnt_pin(struct vfsmount *mnt) { - spin_lock(&vfsmount_lock); + br_write_lock(vfsmount_lock); mnt->mnt_pinned++; - spin_unlock(&vfsmount_lock); + br_write_unlock(vfsmount_lock); } EXPORT_SYMBOL(mnt_pin); void mnt_unpin(struct vfsmount *mnt) { - spin_lock(&vfsmount_lock); + br_write_lock(vfsmount_lock); if (mnt->mnt_pinned) { atomic_inc(&mnt->mnt_count); mnt->mnt_pinned--; } - spin_unlock(&vfsmount_lock); + br_write_unlock(vfsmount_lock); } EXPORT_SYMBOL(mnt_unpin); @@ -746,12 +780,12 @@ int mnt_had_events(struct proc_mounts *p) struct mnt_namespace *ns = p->ns; int res = 0; - spin_lock(&vfsmount_lock); + br_read_lock(vfsmount_lock); if (p->event != ns->event) { p->event = ns->event; res = 1; } - spin_unlock(&vfsmount_lock); + br_read_unlock(vfsmount_lock); return res; } @@ -952,12 +986,12 @@ int may_umount_tree(struct vfsmount *mnt) int minimum_refs = 0; struct vfsmount *p; - spin_lock(&vfsmount_lock); + br_read_lock(vfsmount_lock); for (p = mnt; p; p = next_mnt(p, mnt)) { actual_refs += atomic_read(&p->mnt_count); minimum_refs += 2; } - spin_unlock(&vfsmount_lock); + br_read_unlock(vfsmount_lock); if (actual_refs > minimum_refs) return 0; @@ -984,10 +1018,10 @@ int may_umount(struct vfsmount *mnt) { int ret = 1; down_read(&namespace_sem); - spin_lock(&vfsmount_lock); + br_read_lock(vfsmount_lock); if (propagate_mount_busy(mnt, 2)) ret = 0; - spin_unlock(&vfsmount_lock); + br_read_unlock(vfsmount_lock); up_read(&namespace_sem); return ret; } @@ -1003,13 +1037,14 @@ void release_mounts(struct list_head *head) if (mnt->mnt_parent != mnt) { struct dentry *dentry; struct vfsmount *m; - spin_lock(&vfsmount_lock); + + br_write_lock(vfsmount_lock); dentry = mnt->mnt_mountpoint; m = mnt->mnt_parent; mnt->mnt_mountpoint = mnt->mnt_root; mnt->mnt_parent = mnt; m->mnt_ghosts--; - spin_unlock(&vfsmount_lock); + br_write_unlock(vfsmount_lock); dput(dentry); mntput(m); } @@ -1017,6 +1052,10 @@ void release_mounts(struct list_head *head) } } +/* + * vfsmount lock must be held for write + * namespace_sem must be held for write + */ void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill) { struct vfsmount *p; @@ -1107,7 +1146,7 @@ static int do_umount(struct vfsmount *mnt, int flags) } down_write(&namespace_sem); - spin_lock(&vfsmount_lock); + br_write_lock(vfsmount_lock); event++; if (!(flags & MNT_DETACH)) @@ -1119,7 +1158,7 @@ static int do_umount(struct vfsmount *mnt, int flags) umount_tree(mnt, 1, &umount_list); retval = 0; } - spin_unlock(&vfsmount_lock); + br_write_unlock(vfsmount_lock); up_write(&namespace_sem); release_mounts(&umount_list); return retval; @@ -1231,19 +1270,19 @@ struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry, q = clone_mnt(p, p->mnt_root, flag); if (!q) goto Enomem; - spin_lock(&vfsmount_lock); + br_write_lock(vfsmount_lock); list_add_tail(&q->mnt_list, &res->mnt_list); attach_mnt(q, &path); - spin_unlock(&vfsmount_lock); + br_write_unlock(vfsmount_lock); } } return res; Enomem: if (res) { LIST_HEAD(umount_list); - spin_lock(&vfsmount_lock); + br_write_lock(vfsmount_lock); umount_tree(res, 0, &umount_list); - spin_unlock(&vfsmount_lock); + br_write_unlock(vfsmount_lock); release_mounts(&umount_list); } return NULL; @@ -1262,9 +1301,9 @@ void drop_collected_mounts(struct vfsmount *mnt) { LIST_HEAD(umount_list); down_write(&namespace_sem); - spin_lock(&vfsmount_lock); + br_write_lock(vfsmount_lock); umount_tree(mnt, 0, &umount_list); - spin_unlock(&vfsmount_lock); + br_write_unlock(vfsmount_lock); up_write(&namespace_sem); release_mounts(&umount_list); } @@ -1392,7 +1431,7 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt, if (err) goto out_cleanup_ids; - spin_lock(&vfsmount_lock); + br_write_lock(vfsmount_lock); if (IS_MNT_SHARED(dest_mnt)) { for (p = source_mnt; p; p = next_mnt(p, source_mnt)) @@ -1411,7 +1450,8 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt, list_del_init(&child->mnt_hash); commit_tree(child); } - spin_unlock(&vfsmount_lock); + br_write_unlock(vfsmount_lock); + return 0; out_cleanup_ids: @@ -1466,10 +1506,10 @@ static int do_change_type(struct path *path, int flag) goto out_unlock; } - spin_lock(&vfsmount_lock); + br_write_lock(vfsmount_lock); for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL)) change_mnt_propagation(m, type); - spin_unlock(&vfsmount_lock); + br_write_unlock(vfsmount_lock); out_unlock: up_write(&namespace_sem); @@ -1513,9 +1553,10 @@ static int do_loopback(struct path *path, char *old_name, err = graft_tree(mnt, path); if (err) { LIST_HEAD(umount_list); - spin_lock(&vfsmount_lock); + + br_write_lock(vfsmount_lock); umount_tree(mnt, 0, &umount_list); - spin_unlock(&vfsmount_lock); + br_write_unlock(vfsmount_lock); release_mounts(&umount_list); } @@ -1568,16 +1609,16 @@ static int do_remount(struct path *path, int flags, int mnt_flags, else err = do_remount_sb(sb, flags, data, 0); if (!err) { - spin_lock(&vfsmount_lock); + br_write_lock(vfsmount_lock); mnt_flags |= path->mnt->mnt_flags & MNT_PROPAGATION_MASK; path->mnt->mnt_flags = mnt_flags; - spin_unlock(&vfsmount_lock); + br_write_unlock(vfsmount_lock); } up_write(&sb->s_umount); if (!err) { - spin_lock(&vfsmount_lock); + br_write_lock(vfsmount_lock); touch_mnt_namespace(path->mnt->mnt_ns); - spin_unlock(&vfsmount_lock); + br_write_unlock(vfsmount_lock); } return err; } @@ -1754,7 +1795,7 @@ void mark_mounts_for_expiry(struct list_head *mounts) return; down_write(&namespace_sem); - spin_lock(&vfsmount_lock); + br_write_lock(vfsmount_lock); /* extract from the expiration list every vfsmount that matches the * following criteria: @@ -1773,7 +1814,7 @@ void mark_mounts_for_expiry(struct list_head *mounts) touch_mnt_namespace(mnt->mnt_ns); umount_tree(mnt, 1, &umounts); } - spin_unlock(&vfsmount_lock); + br_write_unlock(vfsmount_lock); up_write(&namespace_sem); release_mounts(&umounts); @@ -1830,6 +1871,8 @@ resume: /* * process a list of expirable mountpoints with the intent of discarding any * submounts of a specific parent mountpoint + * + * vfsmount_lock must be held for write */ static void shrink_submounts(struct vfsmount *mnt, struct list_head *umounts) { @@ -2048,9 +2091,9 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns, kfree(new_ns); return ERR_PTR(-ENOMEM); } - spin_lock(&vfsmount_lock); + br_write_lock(vfsmount_lock); list_add_tail(&new_ns->list, &new_ns->root->mnt_list); - spin_unlock(&vfsmount_lock); + br_write_unlock(vfsmount_lock); /* * Second pass: switch the tsk->fs->* elements and mark new vfsmounts @@ -2244,7 +2287,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, goto out2; /* not attached */ /* make sure we can reach put_old from new_root */ tmp = old.mnt; - spin_lock(&vfsmount_lock); + br_write_lock(vfsmount_lock); if (tmp != new.mnt) { for (;;) { if (tmp->mnt_parent == tmp) @@ -2264,7 +2307,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, /* mount new_root on / */ attach_mnt(new.mnt, &root_parent); touch_mnt_namespace(current->nsproxy->mnt_ns); - spin_unlock(&vfsmount_lock); + br_write_unlock(vfsmount_lock); chroot_fs_refs(&root, &new); error = 0; path_put(&root_parent); @@ -2279,7 +2322,7 @@ out1: out0: return error; out3: - spin_unlock(&vfsmount_lock); + br_write_unlock(vfsmount_lock); goto out2; } @@ -2326,6 +2369,8 @@ void __init mnt_init(void) for (u = 0; u < HASH_SIZE; u++) INIT_LIST_HEAD(&mount_hashtable[u]); + br_lock_init(vfsmount_lock); + err = sysfs_init(); if (err) printk(KERN_WARNING "%s: sysfs_init error: %d\n", @@ -2344,9 +2389,9 @@ void put_mnt_ns(struct mnt_namespace *ns) if (!atomic_dec_and_test(&ns->count)) return; down_write(&namespace_sem); - spin_lock(&vfsmount_lock); + br_write_lock(vfsmount_lock); umount_tree(ns->root, 0, &umount_list); - spin_unlock(&vfsmount_lock); + br_write_unlock(vfsmount_lock); up_write(&namespace_sem); release_mounts(&umount_list); kfree(ns); diff --git a/fs/pnode.c b/fs/pnode.c index 5cc564a83149..8066b8dd748f 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -126,6 +126,9 @@ static int do_make_slave(struct vfsmount *mnt) return 0; } +/* + * vfsmount lock must be held for write + */ void change_mnt_propagation(struct vfsmount *mnt, int type) { if (type == MS_SHARED) { @@ -270,12 +273,12 @@ int propagate_mnt(struct vfsmount *dest_mnt, struct dentry *dest_dentry, prev_src_mnt = child; } out: - spin_lock(&vfsmount_lock); + br_write_lock(vfsmount_lock); while (!list_empty(&tmp_list)) { child = list_first_entry(&tmp_list, struct vfsmount, mnt_hash); umount_tree(child, 0, &umount_list); } - spin_unlock(&vfsmount_lock); + br_write_unlock(vfsmount_lock); release_mounts(&umount_list); return ret; } @@ -296,6 +299,8 @@ static inline int do_refcount_check(struct vfsmount *mnt, int count) * other mounts its parent propagates to. * Check if any of these mounts that **do not have submounts** * have more references than 'refcnt'. If so return busy. + * + * vfsmount lock must be held for read or write */ int propagate_mount_busy(struct vfsmount *mnt, int refcnt) { @@ -353,6 +358,8 @@ static void __propagate_umount(struct vfsmount *mnt) * collect all mounts that receive propagation from the mount in @list, * and return these additional mounts in the same list. * @list: the list of mounts to be unmounted. + * + * vfsmount lock must be held for write */ int propagate_umount(struct list_head *list) { -- cgit v1.2.3 From 0a377cff9428af2da2b293d11e07bc4dbf064ee5 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 18 Aug 2010 09:25:42 -0400 Subject: NFS: Fix an Oops in the NFSv4 atomic open code Adam Lackorzynski reports: with 2.6.35.2 I'm getting this reproducible Oops: [ 110.825396] BUG: unable to handle kernel NULL pointer dereference at (null) [ 110.828638] IP: [] encode_attrs+0x1a/0x2a4 [ 110.828638] PGD be89f067 PUD bf18f067 PMD 0 [ 110.828638] Oops: 0000 [#1] SMP [ 110.828638] last sysfs file: /sys/class/net/lo/operstate [ 110.828638] CPU 2 [ 110.828638] Modules linked in: rtc_cmos rtc_core rtc_lib amd64_edac_mod i2c_amd756 edac_core i2c_core dm_mirror dm_region_hash dm_log dm_snapshot sg sr_mod usb_storage ohci_hcd mptspi tg3 mptscsih mptbase usbcore nls_base [last unloaded: scsi_wait_scan] [ 110.828638] [ 110.828638] Pid: 11264, comm: setchecksum Not tainted 2.6.35.2 #1 [ 110.828638] RIP: 0010:[] [] encode_attrs+0x1a/0x2a4 [ 110.828638] RSP: 0000:ffff88003bf5b878 EFLAGS: 00010296 [ 110.828638] RAX: ffff8800bddb48a8 RBX: ffff88003bf5bb18 RCX: 0000000000000000 [ 110.828638] RDX: ffff8800be258800 RSI: 0000000000000000 RDI: ffff88003bf5b9f8 [ 110.828638] RBP: 0000000000000000 R08: ffff8800bddb48a8 R09: 0000000000000004 [ 110.828638] R10: 0000000000000003 R11: ffff8800be779000 R12: ffff8800be258800 [ 110.828638] R13: ffff88003bf5b9f8 R14: ffff88003bf5bb20 R15: ffff8800be258800 [ 110.828638] FS: 0000000000000000(0000) GS:ffff880041e00000(0063) knlGS:00000000556bd6b0 [ 110.828638] CS: 0010 DS: 002b ES: 002b CR0: 000000008005003b [ 110.828638] CR2: 0000000000000000 CR3: 00000000be8ef000 CR4: 00000000000006e0 [ 110.828638] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 110.828638] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 110.828638] Process setchecksum (pid: 11264, threadinfo ffff88003bf5a000, task ffff88003f232210) [ 110.828638] Stack: [ 110.828638] 0000000000000000 ffff8800bfbcf920 0000000000000000 0000000000000ffe [ 110.828638] <0> 0000000000000000 0000000000000000 0000000000000000 0000000000000000 [ 110.828638] <0> 0000000000000000 0000000000000000 0000000000000000 0000000000000000 [ 110.828638] Call Trace: [ 110.828638] [] ? nfs4_xdr_enc_setattr+0x90/0xb4 [ 110.828638] [] ? call_transmit+0x1c3/0x24a [ 110.828638] [] ? __rpc_execute+0x78/0x22a [ 110.828638] [] ? rpc_run_task+0x21/0x2b [ 110.828638] [] ? rpc_call_sync+0x3d/0x5d [ 110.828638] [] ? _nfs4_do_setattr+0x11b/0x147 [ 110.828638] [] ? nfs_init_locked+0x0/0x32 [ 110.828638] [] ? ifind+0x4e/0x90 [ 110.828638] [] ? nfs4_do_setattr+0x4b/0x6e [ 110.828638] [] ? nfs4_do_open+0x291/0x3a6 [ 110.828638] [] ? nfs4_open_revalidate+0x63/0x14a [ 110.828638] [] ? nfs_open_revalidate+0xd7/0x161 [ 110.828638] [] ? do_lookup+0x1a4/0x201 [ 110.828638] [] ? link_path_walk+0x6a/0x9d5 [ 110.828638] [] ? do_last+0x17b/0x58e [ 110.828638] [] ? do_filp_open+0x1bd/0x56e [ 110.828638] [] ? _atomic_dec_and_lock+0x30/0x48 [ 110.828638] [] ? dput+0x37/0x152 [ 110.828638] [] ? alloc_fd+0x69/0x10a [ 110.828638] [] ? do_sys_open+0x56/0x100 [ 110.828638] [] ? ia32_sysret+0x0/0x5 [ 110.828638] Code: 83 f1 01 e8 f5 ca ff ff 48 83 c4 50 5b 5d 41 5c c3 41 57 41 56 41 55 49 89 fd 41 54 49 89 d4 55 48 89 f5 53 48 81 ec 18 01 00 00 <8b> 06 89 c2 83 e2 08 83 fa 01 19 db 83 e3 f8 83 c3 18 a8 01 8d [ 110.828638] RIP [] encode_attrs+0x1a/0x2a4 [ 110.828638] RSP [ 110.828638] CR2: 0000000000000000 [ 112.840396] ---[ end trace 95282e83fd77358f ]--- We need to ensure that the O_EXCL flag is turned off if the user doesn't set O_CREAT. Cc: stable@kernel.org Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 2 +- fs/nfs/nfs4proc.c | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index bd91b2778315..e257172d438c 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1110,7 +1110,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd) if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL)) goto no_open_dput; /* We can't create new files, or truncate existing ones here */ - openflags &= ~(O_CREAT|O_TRUNC); + openflags &= ~(O_CREAT|O_EXCL|O_TRUNC); /* * Note: we're not holding inode->i_mutex and so may be racing with diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 6b44bbfb7d8a..089da5b5d20a 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2036,7 +2036,8 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd) struct rpc_cred *cred; struct nfs4_state *state; struct dentry *res; - fmode_t fmode = nd->intent.open.flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC); + int open_flags = nd->intent.open.flags; + fmode_t fmode = open_flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC); if (nd->flags & LOOKUP_CREATE) { attr.ia_mode = nd->intent.open.create_mode; @@ -2044,8 +2045,9 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd) if (!IS_POSIXACL(dir)) attr.ia_mode &= ~current_umask(); } else { + open_flags &= ~O_EXCL; attr.ia_valid = 0; - BUG_ON(nd->intent.open.flags & O_CREAT); + BUG_ON(open_flags & O_CREAT); } cred = rpc_lookup_cred(); @@ -2054,7 +2056,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd) parent = dentry->d_parent; /* Protect against concurrent sillydeletes */ nfs_block_sillyrename(parent); - state = nfs4_do_open(dir, &path, fmode, nd->intent.open.flags, &attr, cred); + state = nfs4_do_open(dir, &path, fmode, open_flags, &attr, cred); put_rpccred(cred); if (IS_ERR(state)) { if (PTR_ERR(state) == -ENOENT) { -- cgit v1.2.3 From 1cb0c924fa2d616e5e3b5bc62d97191aac9ff442 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Wed, 18 Aug 2010 21:11:11 +0900 Subject: nilfs2: wait for discard to finish nilfs_discard_segment() doesn't wait for completion of discard requests. This specifies BLKDEV_IFL_WAIT flag when calling blkdev_issue_discard() in order to fix the sync failure. Reported-by: Christoph Hellwig Signed-off-by: Ryusuke Konishi Cc: Christoph Hellwig --- fs/nilfs2/the_nilfs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 6af1c0073e9e..4317f177ea7c 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -775,6 +775,7 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump, start * sects_per_block, nblocks * sects_per_block, GFP_NOFS, + BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER); if (ret < 0) return ret; @@ -785,7 +786,8 @@ int nilfs_discard_segments(struct the_nilfs *nilfs, __u64 *segnump, ret = blkdev_issue_discard(nilfs->ns_bdev, start * sects_per_block, nblocks * sects_per_block, - GFP_NOFS, BLKDEV_IFL_BARRIER); + GFP_NOFS, + BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER); return ret; } -- cgit v1.2.3 From fc87a40677bbe0937e2ff0642c7e83c9a4813f3d Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 18 Aug 2010 13:13:39 -0400 Subject: cifs: fix NULL pointer dereference in cifs_find_smb_ses cifs_find_smb_ses assumes that the vol->password field is a valid pointer, but that's only the case if a password was passed in via the options string. It's possible that one won't be if there is no mount helper on the box. Reported-by: diabel Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 95c2ea67edfb..446e2486d5f0 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1673,7 +1673,8 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol) MAX_USERNAME_SIZE)) continue; if (strlen(vol->username) != 0 && - strncmp(ses->password, vol->password, + strncmp(ses->password, + vol->password ? vol->password : "", MAX_PASSWORD_SIZE)) continue; } -- cgit v1.2.3 From bf4f12113812ac5be76c5590c6f50c8346f784a4 Mon Sep 17 00:00:00 2001 From: Igor Druzhinin Date: Fri, 20 Aug 2010 00:27:12 +0400 Subject: cifs: correction of unicode header files This patch corrects a problem of compilation errors at removal of UNIUPR_NOLOWER definition and adds include guards to cifs_unicode.h. Signed-off-by: Igor Druzhinin Acked-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifs_unicode.h | 18 +++++++++++------- fs/cifs/cifs_uniupr.h | 16 ++++++++-------- 2 files changed, 19 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h index 650638275a6f..7fe6b52df507 100644 --- a/fs/cifs/cifs_unicode.h +++ b/fs/cifs/cifs_unicode.h @@ -30,6 +30,8 @@ * This is a compressed table of upper and lower case conversion. * */ +#ifndef _CIFS_UNICODE_H +#define _CIFS_UNICODE_H #include #include @@ -67,8 +69,8 @@ extern const struct UniCaseRange CifsUniUpperRange[]; #endif /* UNIUPR_NOUPPER */ #ifndef UNIUPR_NOLOWER -extern signed char UniLowerTable[512]; -extern struct UniCaseRange UniLowerRange[]; +extern signed char CifsUniLowerTable[512]; +extern const struct UniCaseRange CifsUniLowerRange[]; #endif /* UNIUPR_NOLOWER */ #ifdef __KERNEL__ @@ -337,15 +339,15 @@ UniStrupr(register wchar_t *upin) * UniTolower: Convert a unicode character to lower case */ static inline wchar_t -UniTolower(wchar_t uc) +UniTolower(register wchar_t uc) { - register struct UniCaseRange *rp; + register const struct UniCaseRange *rp; - if (uc < sizeof(UniLowerTable)) { + if (uc < sizeof(CifsUniLowerTable)) { /* Latin characters */ - return uc + UniLowerTable[uc]; /* Use base tables */ + return uc + CifsUniLowerTable[uc]; /* Use base tables */ } else { - rp = UniLowerRange; /* Use range tables */ + rp = CifsUniLowerRange; /* Use range tables */ while (rp->start) { if (uc < rp->start) /* Before start of range */ return uc; /* Uppercase = input */ @@ -374,3 +376,5 @@ UniStrlwr(register wchar_t *upin) } #endif + +#endif /* _CIFS_UNICODE_H */ diff --git a/fs/cifs/cifs_uniupr.h b/fs/cifs/cifs_uniupr.h index 18a9d978e519..0ac7c5a8633a 100644 --- a/fs/cifs/cifs_uniupr.h +++ b/fs/cifs/cifs_uniupr.h @@ -140,7 +140,7 @@ const struct UniCaseRange CifsUniUpperRange[] = { /* * Latin lower case */ -static signed char CifsUniLowerTable[512] = { +signed char CifsUniLowerTable[512] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 000-00f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 010-01f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 020-02f */ @@ -242,12 +242,12 @@ static signed char UniCaseRangeLff20[27] = { /* * Lower Case Range */ -static const struct UniCaseRange CifsUniLowerRange[] = { - 0x0380, 0x03ab, UniCaseRangeL0380, - 0x0400, 0x042f, UniCaseRangeL0400, - 0x0490, 0x04cb, UniCaseRangeL0490, - 0x1e00, 0x1ff7, UniCaseRangeL1e00, - 0xff20, 0xff3a, UniCaseRangeLff20, - 0, 0, 0 +const struct UniCaseRange CifsUniLowerRange[] = { + {0x0380, 0x03ab, UniCaseRangeL0380}, + {0x0400, 0x042f, UniCaseRangeL0400}, + {0x0490, 0x04cb, UniCaseRangeL0490}, + {0x1e00, 0x1ff7, UniCaseRangeL1e00}, + {0xff20, 0xff3a, UniCaseRangeLff20}, + {0} }; #endif -- cgit v1.2.3 From 9fbc590860e75785bdaf8b83e48fabfe4d4f7d58 Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 20 Aug 2010 20:42:26 +0000 Subject: [CIFS] Fix ntlmv2 auth with ntlmssp Make ntlmv2 as an authentication mechanism within ntlmssp instead of ntlmv1. Parse type 2 response in ntlmssp negotiation to pluck AV pairs and use them to calculate ntlmv2 response token. Also, assign domain name from the sever response in type 2 packet of ntlmssp and use that (netbios) domain name in calculation of response. Enable cifs/smb signing using rc4 and md5. Changed name of the structure mac_key to session_key to reflect the type of key it holds. Use kernel crypto_shash_* APIs instead of the equivalent cifs functions. Signed-off-by: Shirish Pargaonkar Acked-by: Herbert Xu Signed-off-by: Steve French --- fs/cifs/Kconfig | 2 + fs/cifs/asn1.c | 6 +- fs/cifs/cifsencrypt.c | 416 +++++++++++++++++++++++++++++++++++--------------- fs/cifs/cifsglob.h | 18 ++- fs/cifs/cifspdu.h | 7 +- fs/cifs/cifsproto.h | 12 +- fs/cifs/cifssmb.c | 13 +- fs/cifs/connect.c | 13 +- fs/cifs/ntlmssp.h | 13 ++ fs/cifs/sess.c | 118 ++++++++++---- fs/cifs/transport.c | 6 +- 11 files changed, 452 insertions(+), 172 deletions(-) (limited to 'fs') diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index 917b7d449bb2..0da1debd499d 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -2,6 +2,8 @@ config CIFS tristate "CIFS support (advanced network filesystem, SMBFS successor)" depends on INET select NLS + select CRYPTO_MD5 + select CRYPTO_ARC4 help This is the client VFS module for the Common Internet File System (CIFS) protocol which is the successor to the Server Message Block diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c index cfd1ce34e0bc..21f0fbd86989 100644 --- a/fs/cifs/asn1.c +++ b/fs/cifs/asn1.c @@ -597,13 +597,13 @@ decode_negTokenInit(unsigned char *security_blob, int length, if (compare_oid(oid, oidlen, MSKRB5_OID, MSKRB5_OID_LEN)) server->sec_mskerberos = true; - else if (compare_oid(oid, oidlen, KRB5U2U_OID, + if (compare_oid(oid, oidlen, KRB5U2U_OID, KRB5U2U_OID_LEN)) server->sec_kerberosu2u = true; - else if (compare_oid(oid, oidlen, KRB5_OID, + if (compare_oid(oid, oidlen, KRB5_OID, KRB5_OID_LEN)) server->sec_kerberos = true; - else if (compare_oid(oid, oidlen, NTLMSSP_OID, + if (compare_oid(oid, oidlen, NTLMSSP_OID, NTLMSSP_OID_LEN)) server->sec_ntlmssp = true; diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 847628dfdc44..051d00011ca3 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -27,6 +27,7 @@ #include "md5.h" #include "cifs_unicode.h" #include "cifsproto.h" +#include "ntlmssp.h" #include #include @@ -42,21 +43,44 @@ extern void SMBencrypt(unsigned char *passwd, const unsigned char *c8, unsigned char *p24); static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu, - const struct mac_key *key, char *signature) + struct TCP_Server_Info *server, char *signature) { - struct MD5Context context; + int rc = 0; + struct { + struct shash_desc shash; + char ctx[crypto_shash_descsize(server->ntlmssp.md5)]; + } sdesc; - if ((cifs_pdu == NULL) || (signature == NULL) || (key == NULL)) + if (cifs_pdu == NULL || server == NULL || signature == NULL) return -EINVAL; - cifs_MD5_init(&context); - cifs_MD5_update(&context, (char *)&key->data, key->len); - cifs_MD5_update(&context, cifs_pdu->Protocol, cifs_pdu->smb_buf_length); + sdesc.shash.tfm = server->ntlmssp.md5; + sdesc.shash.flags = 0x0; + + rc = crypto_shash_init(&sdesc.shash); + if (rc) { + cERROR(1, "could not initialize master crypto API hmacmd5\n"); + return rc; + } + + if (server->secType == RawNTLMSSP) + crypto_shash_update(&sdesc.shash, + server->session_key.data.ntlmv2.key, + CIFS_NTLMV2_SESSKEY_SIZE); + else + crypto_shash_update(&sdesc.shash, + (char *)&server->session_key.data, + server->session_key.len); + + crypto_shash_update(&sdesc.shash, + cifs_pdu->Protocol, cifs_pdu->smb_buf_length); + + rc = crypto_shash_final(&sdesc.shash, signature); - cifs_MD5_final(signature, &context); return 0; } + int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server, __u32 *pexpected_response_sequence_number) { @@ -78,8 +102,7 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server, server->sequence_number++; spin_unlock(&GlobalMid_Lock); - rc = cifs_calculate_signature(cifs_pdu, &server->mac_signing_key, - smb_signature); + rc = cifs_calculate_signature(cifs_pdu, server, smb_signature); if (rc) memset(cifs_pdu->Signature.SecuritySignature, 0, 8); else @@ -89,16 +112,36 @@ int cifs_sign_smb(struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server, } static int cifs_calc_signature2(const struct kvec *iov, int n_vec, - const struct mac_key *key, char *signature) + struct TCP_Server_Info *server, char *signature) { - struct MD5Context context; int i; + int rc = 0; + struct { + struct shash_desc shash; + char ctx[crypto_shash_descsize(server->ntlmssp.md5)]; + } sdesc; - if ((iov == NULL) || (signature == NULL) || (key == NULL)) + if (iov == NULL || server == NULL || signature == NULL) return -EINVAL; - cifs_MD5_init(&context); - cifs_MD5_update(&context, (char *)&key->data, key->len); + sdesc.shash.tfm = server->ntlmssp.md5; + sdesc.shash.flags = 0x0; + + rc = crypto_shash_init(&sdesc.shash); + if (rc) { + cERROR(1, "could not initialize master crypto API hmacmd5\n"); + return rc; + } + + if (server->secType == RawNTLMSSP) + crypto_shash_update(&sdesc.shash, + server->session_key.data.ntlmv2.key, + CIFS_NTLMV2_SESSKEY_SIZE); + else + crypto_shash_update(&sdesc.shash, + (char *)&server->session_key.data, + server->session_key.len); + for (i = 0; i < n_vec; i++) { if (iov[i].iov_len == 0) continue; @@ -111,18 +154,18 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec, if (i == 0) { if (iov[0].iov_len <= 8) /* cmd field at offset 9 */ break; /* nothing to sign or corrupt header */ - cifs_MD5_update(&context, iov[0].iov_base+4, - iov[0].iov_len-4); + crypto_shash_update(&sdesc.shash, + iov[i].iov_base + 4, iov[i].iov_len - 4); } else - cifs_MD5_update(&context, iov[i].iov_base, iov[i].iov_len); + crypto_shash_update(&sdesc.shash, + iov[i].iov_base, iov[i].iov_len); } - cifs_MD5_final(signature, &context); + rc = crypto_shash_final(&sdesc.shash, signature); return 0; } - int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, __u32 *pexpected_response_sequence_number) { @@ -145,8 +188,7 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, server->sequence_number++; spin_unlock(&GlobalMid_Lock); - rc = cifs_calc_signature2(iov, n_vec, &server->mac_signing_key, - smb_signature); + rc = cifs_calc_signature2(iov, n_vec, server, smb_signature); if (rc) memset(cifs_pdu->Signature.SecuritySignature, 0, 8); else @@ -156,14 +198,14 @@ int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, } int cifs_verify_signature(struct smb_hdr *cifs_pdu, - const struct mac_key *mac_key, + struct TCP_Server_Info *server, __u32 expected_sequence_number) { - unsigned int rc; + int rc; char server_response_sig[8]; char what_we_think_sig_should_be[20]; - if ((cifs_pdu == NULL) || (mac_key == NULL)) + if (cifs_pdu == NULL || server == NULL) return -EINVAL; if (cifs_pdu->Command == SMB_COM_NEGOTIATE) @@ -192,7 +234,7 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu, cpu_to_le32(expected_sequence_number); cifs_pdu->Signature.Sequence.Reserved = 0; - rc = cifs_calculate_signature(cifs_pdu, mac_key, + rc = cifs_calculate_signature(cifs_pdu, server, what_we_think_sig_should_be); if (rc) @@ -209,7 +251,7 @@ int cifs_verify_signature(struct smb_hdr *cifs_pdu, } /* We fill in key by putting in 40 byte array which was allocated by caller */ -int cifs_calculate_mac_key(struct mac_key *key, const char *rn, +int cifs_calculate_session_key(struct session_key *key, const char *rn, const char *password) { char temp_key[16]; @@ -223,63 +265,6 @@ int cifs_calculate_mac_key(struct mac_key *key, const char *rn, return 0; } -int CalcNTLMv2_partial_mac_key(struct cifsSesInfo *ses, - const struct nls_table *nls_info) -{ - char temp_hash[16]; - struct HMACMD5Context ctx; - char *ucase_buf; - __le16 *unicode_buf; - unsigned int i, user_name_len, dom_name_len; - - if (ses == NULL) - return -EINVAL; - - E_md4hash(ses->password, temp_hash); - - hmac_md5_init_limK_to_64(temp_hash, 16, &ctx); - user_name_len = strlen(ses->userName); - if (user_name_len > MAX_USERNAME_SIZE) - return -EINVAL; - if (ses->domainName == NULL) - return -EINVAL; /* BB should we use CIFS_LINUX_DOM */ - dom_name_len = strlen(ses->domainName); - if (dom_name_len > MAX_USERNAME_SIZE) - return -EINVAL; - - ucase_buf = kmalloc((MAX_USERNAME_SIZE+1), GFP_KERNEL); - if (ucase_buf == NULL) - return -ENOMEM; - unicode_buf = kmalloc((MAX_USERNAME_SIZE+1)*4, GFP_KERNEL); - if (unicode_buf == NULL) { - kfree(ucase_buf); - return -ENOMEM; - } - - for (i = 0; i < user_name_len; i++) - ucase_buf[i] = nls_info->charset2upper[(int)ses->userName[i]]; - ucase_buf[i] = 0; - user_name_len = cifs_strtoUCS(unicode_buf, ucase_buf, - MAX_USERNAME_SIZE*2, nls_info); - unicode_buf[user_name_len] = 0; - user_name_len++; - - for (i = 0; i < dom_name_len; i++) - ucase_buf[i] = nls_info->charset2upper[(int)ses->domainName[i]]; - ucase_buf[i] = 0; - dom_name_len = cifs_strtoUCS(unicode_buf+user_name_len, ucase_buf, - MAX_USERNAME_SIZE*2, nls_info); - - unicode_buf[user_name_len + dom_name_len] = 0; - hmac_md5_update((const unsigned char *) unicode_buf, - (user_name_len+dom_name_len)*2, &ctx); - - hmac_md5_final(ses->server->ntlmv2_hash, &ctx); - kfree(ucase_buf); - kfree(unicode_buf); - return 0; -} - #ifdef CONFIG_CIFS_WEAK_PW_HASH void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt, char *lnm_session_key) @@ -324,21 +309,29 @@ static int calc_ntlmv2_hash(struct cifsSesInfo *ses, { int rc = 0; int len; - char nt_hash[16]; - struct HMACMD5Context *pctxt; + char nt_hash[CIFS_NTHASH_SIZE]; wchar_t *user; wchar_t *domain; - - pctxt = kmalloc(sizeof(struct HMACMD5Context), GFP_KERNEL); - - if (pctxt == NULL) - return -ENOMEM; + wchar_t *server; + struct { + struct shash_desc shash; + char ctx[crypto_shash_descsize(ses->server->ntlmssp.hmacmd5)]; + } sdesc; /* calculate md4 hash of password */ E_md4hash(ses->password, nt_hash); - /* convert Domainname to unicode and uppercase */ - hmac_md5_init_limK_to_64(nt_hash, 16, pctxt); + sdesc.shash.tfm = ses->server->ntlmssp.hmacmd5; + sdesc.shash.flags = 0x0; + + crypto_shash_setkey(ses->server->ntlmssp.hmacmd5, nt_hash, + CIFS_NTHASH_SIZE); + + rc = crypto_shash_init(&sdesc.shash); + if (rc) { + cERROR(1, "could not initialize master crypto API hmacmd5\n"); + return rc; + } /* convert ses->userName to unicode and uppercase */ len = strlen(ses->userName); @@ -347,7 +340,8 @@ static int calc_ntlmv2_hash(struct cifsSesInfo *ses, goto calc_exit_2; len = cifs_strtoUCS((__le16 *)user, ses->userName, len, nls_cp); UniStrupr(user); - hmac_md5_update((char *)user, 2*len, pctxt); + + crypto_shash_update(&sdesc.shash, (char *)user, 2 * len); /* convert ses->domainName to unicode and uppercase */ if (ses->domainName) { @@ -363,65 +357,243 @@ static int calc_ntlmv2_hash(struct cifsSesInfo *ses, Maybe converting the domain name earlier makes sense */ /* UniStrupr(domain); */ - hmac_md5_update((char *)domain, 2*len, pctxt); + crypto_shash_update(&sdesc.shash, (char *)domain, 2 * len); kfree(domain); + } else if (ses->serverName) { + len = strlen(ses->serverName); + + server = kmalloc(2 + (len * 2), GFP_KERNEL); + if (server == NULL) + goto calc_exit_1; + len = cifs_strtoUCS((__le16 *)server, ses->serverName, len, + nls_cp); + /* the following line was removed since it didn't work well + with lower cased domain name that passed as an option. + Maybe converting the domain name earlier makes sense */ + /* UniStrupr(domain); */ + + crypto_shash_update(&sdesc.shash, (char *)server, 2 * len); + + kfree(server); } calc_exit_1: kfree(user); calc_exit_2: /* BB FIXME what about bytes 24 through 40 of the signing key? compare with the NTLM example */ - hmac_md5_final(ses->server->ntlmv2_hash, pctxt); + rc = crypto_shash_final(&sdesc.shash, ses->server->ntlmv2_hash); - kfree(pctxt); return rc; } -void setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf, - const struct nls_table *nls_cp) +static int +find_domain_name(struct cifsSesInfo *ses) +{ + int rc = 0; + unsigned int attrsize; + unsigned int type; + unsigned char *blobptr; + struct ntlmssp2_name *attrptr; + + if (ses->server->tiblob) { + blobptr = ses->server->tiblob; + attrptr = (struct ntlmssp2_name *) blobptr; + + while ((type = attrptr->type) != 0) { + blobptr += 2; /* advance attr type */ + attrsize = attrptr->length; + blobptr += 2; /* advance attr size */ + if (type == NTLMSSP_AV_NB_DOMAIN_NAME) { + if (!ses->domainName) { + ses->domainName = + kmalloc(attrptr->length + 1, + GFP_KERNEL); + if (!ses->domainName) + return -ENOMEM; + cifs_from_ucs2(ses->domainName, + (__le16 *)blobptr, + attrptr->length, + attrptr->length, + load_nls_default(), false); + } + } + blobptr += attrsize; /* advance attr value */ + attrptr = (struct ntlmssp2_name *) blobptr; + } + } else { + ses->server->tilen = 2 * sizeof(struct ntlmssp2_name); + ses->server->tiblob = kmalloc(ses->server->tilen, GFP_KERNEL); + if (!ses->server->tiblob) { + ses->server->tilen = 0; + cERROR(1, "Challenge target info allocation failure"); + return -ENOMEM; + } + memset(ses->server->tiblob, 0x0, ses->server->tilen); + attrptr = (struct ntlmssp2_name *) ses->server->tiblob; + attrptr->type = cpu_to_le16(NTLMSSP_DOMAIN_TYPE); + } + + return rc; +} + +static int +CalcNTLMv2_response(const struct TCP_Server_Info *server, + char *v2_session_response) { int rc; + struct { + struct shash_desc shash; + char ctx[crypto_shash_descsize(server->ntlmssp.hmacmd5)]; + } sdesc; + + sdesc.shash.tfm = server->ntlmssp.hmacmd5; + sdesc.shash.flags = 0x0; + + crypto_shash_setkey(server->ntlmssp.hmacmd5, server->ntlmv2_hash, + CIFS_HMAC_MD5_HASH_SIZE); + + rc = crypto_shash_init(&sdesc.shash); + if (rc) { + cERROR(1, "could not initialize master crypto API hmacmd5\n"); + return rc; + } + + memcpy(v2_session_response + CIFS_SERVER_CHALLENGE_SIZE, + server->cryptKey, CIFS_SERVER_CHALLENGE_SIZE); + crypto_shash_update(&sdesc.shash, + v2_session_response + CIFS_SERVER_CHALLENGE_SIZE, + sizeof(struct ntlmv2_resp) - CIFS_SERVER_CHALLENGE_SIZE); + + if (server->tilen) + crypto_shash_update(&sdesc.shash, + server->tiblob, server->tilen); + + rc = crypto_shash_final(&sdesc.shash, v2_session_response); + + return rc; +} + +int +setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf, + const struct nls_table *nls_cp) +{ + int rc = 0; struct ntlmv2_resp *buf = (struct ntlmv2_resp *)resp_buf; - struct HMACMD5Context context; + struct { + struct shash_desc shash; + char ctx[crypto_shash_descsize(ses->server->ntlmssp.hmacmd5)]; + } sdesc; buf->blob_signature = cpu_to_le32(0x00000101); buf->reserved = 0; buf->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); get_random_bytes(&buf->client_chal, sizeof(buf->client_chal)); buf->reserved2 = 0; - buf->names[0].type = cpu_to_le16(NTLMSSP_DOMAIN_TYPE); - buf->names[0].length = 0; - buf->names[1].type = 0; - buf->names[1].length = 0; + + if (!ses->domainName) { + rc = find_domain_name(ses); + if (rc) { + cERROR(1, "could not get domain/server name rc %d", rc); + return rc; + } + } /* calculate buf->ntlmv2_hash */ rc = calc_ntlmv2_hash(ses, nls_cp); - if (rc) + if (rc) { + cERROR(1, "could not get v2 hash rc %d", rc); + return rc; + } + rc = CalcNTLMv2_response(ses->server, resp_buf); + if (rc) { cERROR(1, "could not get v2 hash rc %d", rc); - CalcNTLMv2_response(ses, resp_buf); + return rc; + } + + crypto_shash_setkey(ses->server->ntlmssp.hmacmd5, + ses->server->ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); + + sdesc.shash.tfm = ses->server->ntlmssp.hmacmd5; + sdesc.shash.flags = 0x0; + + rc = crypto_shash_init(&sdesc.shash); + if (rc) { + cERROR(1, "could not initialize master crypto API hmacmd5\n"); + return rc; + } + + crypto_shash_update(&sdesc.shash, resp_buf, CIFS_HMAC_MD5_HASH_SIZE); - /* now calculate the MAC key for NTLMv2 */ - hmac_md5_init_limK_to_64(ses->server->ntlmv2_hash, 16, &context); - hmac_md5_update(resp_buf, 16, &context); - hmac_md5_final(ses->server->mac_signing_key.data.ntlmv2.key, &context); + rc = crypto_shash_final(&sdesc.shash, + ses->server->session_key.data.ntlmv2.key); - memcpy(&ses->server->mac_signing_key.data.ntlmv2.resp, resp_buf, - sizeof(struct ntlmv2_resp)); - ses->server->mac_signing_key.len = 16 + sizeof(struct ntlmv2_resp); + memcpy(&ses->server->session_key.data.ntlmv2.resp, resp_buf, + sizeof(struct ntlmv2_resp)); + ses->server->session_key.len = 16 + sizeof(struct ntlmv2_resp); + + return rc; } -void CalcNTLMv2_response(const struct cifsSesInfo *ses, - char *v2_session_response) +int +calc_seckey(struct TCP_Server_Info *server) { - struct HMACMD5Context context; - /* rest of v2 struct already generated */ - memcpy(v2_session_response + 8, ses->server->cryptKey, 8); - hmac_md5_init_limK_to_64(ses->server->ntlmv2_hash, 16, &context); + int rc; + unsigned char sec_key[CIFS_NTLMV2_SESSKEY_SIZE]; + struct crypto_blkcipher *tfm_arc4; + struct scatterlist sgin, sgout; + struct blkcipher_desc desc; + + get_random_bytes(sec_key, CIFS_NTLMV2_SESSKEY_SIZE); + + tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", + 0, CRYPTO_ALG_ASYNC); + if (!tfm_arc4 || IS_ERR(tfm_arc4)) { + cERROR(1, "could not allocate " "master crypto API arc4\n"); + return 1; + } + + crypto_blkcipher_setkey(tfm_arc4, + server->session_key.data.ntlmv2.key, CIFS_CPHTXT_SIZE); + sg_init_one(&sgin, sec_key, CIFS_CPHTXT_SIZE); + sg_init_one(&sgout, server->ntlmssp.ciphertext, CIFS_CPHTXT_SIZE); + rc = crypto_blkcipher_encrypt(&desc, &sgout, &sgin, CIFS_CPHTXT_SIZE); + + if (!rc) + memcpy(server->session_key.data.ntlmv2.key, + sec_key, CIFS_NTLMV2_SESSKEY_SIZE); - hmac_md5_update(v2_session_response+8, - sizeof(struct ntlmv2_resp) - 8, &context); + crypto_free_blkcipher(tfm_arc4); - hmac_md5_final(v2_session_response, &context); -/* cifs_dump_mem("v2_sess_rsp: ", v2_session_response, 32); */ + return 0; +} + +void +cifs_crypto_shash_release(struct TCP_Server_Info *server) +{ + if (server->ntlmssp.md5) + crypto_free_shash(server->ntlmssp.md5); + + if (server->ntlmssp.hmacmd5) + crypto_free_shash(server->ntlmssp.hmacmd5); +} + +int +cifs_crypto_shash_allocate(struct TCP_Server_Info *server) +{ + server->ntlmssp.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0); + if (!server->ntlmssp.hmacmd5 || + IS_ERR(server->ntlmssp.hmacmd5)) { + cERROR(1, "could not allocate master crypto API hmacmd5\n"); + return 1; + } + + server->ntlmssp.md5 = crypto_alloc_shash("md5", 0, 0); + if (!server->ntlmssp.md5 || IS_ERR(server->ntlmssp.md5)) { + crypto_free_shash(server->ntlmssp.hmacmd5); + cERROR(1, "could not allocate master crypto API md5\n"); + return 1; + } + + return 0; } diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 0cdfb8c32ac6..49563e0c1725 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -25,6 +25,9 @@ #include #include "cifs_fs_sb.h" #include "cifsacl.h" +#include +#include + /* * The sizes of various internal tables and strings */ @@ -97,7 +100,7 @@ enum protocolEnum { /* Netbios frames protocol not supported at this time */ }; -struct mac_key { +struct session_key { unsigned int len; union { char ntlm[CIFS_SESS_KEY_SIZE + 16]; @@ -120,6 +123,14 @@ struct cifs_cred { struct cifs_ace *aces; }; +struct ntlmssp_auth { + __u32 client_flags; + __u32 server_flags; + unsigned char ciphertext[CIFS_CPHTXT_SIZE]; + struct crypto_shash *hmacmd5; + struct crypto_shash *md5; +}; + /* ***************************************************************** * Except the CIFS PDUs themselves all the @@ -182,11 +193,14 @@ struct TCP_Server_Info { /* 16th byte of RFC1001 workstation name is always null */ char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL]; __u32 sequence_number; /* needed for CIFS PDU signature */ - struct mac_key mac_signing_key; + struct session_key session_key; char ntlmv2_hash[16]; unsigned long lstrp; /* when we got last response from this server */ u16 dialect; /* dialect index that server chose */ /* extended security flavors that server supports */ + unsigned int tilen; /* length of the target info blob */ + unsigned char *tiblob; /* target info blob in challenge response */ + struct ntlmssp_auth ntlmssp; /* various keys, ciphers, flags */ bool sec_kerberos; /* supports plain Kerberos */ bool sec_mskerberos; /* supports legacy MS Kerberos */ bool sec_kerberosu2u; /* supports U2U Kerberos */ diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index 14d036d8db11..320e0fd0ba7b 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -134,6 +134,12 @@ * Size of the session key (crypto key encrypted with the password */ #define CIFS_SESS_KEY_SIZE (24) +#define CIFS_CLIENT_CHALLENGE_SIZE (8) +#define CIFS_SERVER_CHALLENGE_SIZE (8) +#define CIFS_HMAC_MD5_HASH_SIZE (16) +#define CIFS_CPHTXT_SIZE (16) +#define CIFS_NTLMV2_SESSKEY_SIZE (16) +#define CIFS_NTHASH_SIZE (16) /* * Maximum user name length @@ -663,7 +669,6 @@ struct ntlmv2_resp { __le64 time; __u64 client_chal; /* random */ __u32 reserved2; - struct ntlmssp2_name names[2]; /* array of name entries could follow ending in minimum 4 byte struct */ } __attribute__((packed)); diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 1f5450814087..1378d9133844 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -361,15 +361,15 @@ extern int cifs_sign_smb(struct smb_hdr *, struct TCP_Server_Info *, __u32 *); extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *, __u32 *); extern int cifs_verify_signature(struct smb_hdr *, - const struct mac_key *mac_key, + struct TCP_Server_Info *server, __u32 expected_sequence_number); -extern int cifs_calculate_mac_key(struct mac_key *key, const char *rn, +extern int cifs_calculate_session_key(struct session_key *key, const char *rn, const char *pass); -extern int CalcNTLMv2_partial_mac_key(struct cifsSesInfo *, - const struct nls_table *); -extern void CalcNTLMv2_response(const struct cifsSesInfo *, char *); -extern void setup_ntlmv2_rsp(struct cifsSesInfo *, char *, +extern int setup_ntlmv2_rsp(struct cifsSesInfo *, char *, const struct nls_table *); +extern int cifs_crypto_shash_allocate(struct TCP_Server_Info *); +extern void cifs_crypto_shash_release(struct TCP_Server_Info *); +extern int calc_seckey(struct TCP_Server_Info *); #ifdef CONFIG_CIFS_WEAK_PW_HASH extern void calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt, char *lnm_session_key); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index c65c3419dd37..4bda920d1f75 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -604,11 +604,14 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses) else rc = -EINVAL; - if (server->sec_kerberos || server->sec_mskerberos) - server->secType = Kerberos; - else if (server->sec_ntlmssp) - server->secType = RawNTLMSSP; - else + if (server->secType == Kerberos) { + if (!server->sec_kerberos && + !server->sec_mskerberos) + rc = -EOPNOTSUPP; + } else if (server->secType == RawNTLMSSP) { + if (!server->sec_ntlmssp) + rc = -EOPNOTSUPP; + } else rc = -EOPNOTSUPP; } } else diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 446e2486d5f0..18af707f00f1 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1707,6 +1707,7 @@ cifs_put_smb_ses(struct cifsSesInfo *ses) CIFSSMBLogoff(xid, ses); _FreeXid(xid); } + cifs_crypto_shash_release(server); sesInfoFree(ses); cifs_put_tcp_session(server); } @@ -1786,13 +1787,23 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) ses->linux_uid = volume_info->linux_uid; ses->overrideSecFlg = volume_info->secFlg; + rc = cifs_crypto_shash_allocate(server); + if (rc) { + cERROR(1, "could not setup hash structures rc %d", rc); + goto get_ses_fail; + } + server->tilen = 0; + server->tiblob = NULL; + mutex_lock(&ses->session_mutex); rc = cifs_negotiate_protocol(xid, ses); if (!rc) rc = cifs_setup_session(xid, ses, volume_info->local_nls); mutex_unlock(&ses->session_mutex); - if (rc) + if (rc) { + cifs_crypto_shash_release(ses->server); goto get_ses_fail; + } /* success, put it on the list */ write_lock(&cifs_tcp_ses_lock); diff --git a/fs/cifs/ntlmssp.h b/fs/cifs/ntlmssp.h index 49c9a4e75319..1db0f0746a5b 100644 --- a/fs/cifs/ntlmssp.h +++ b/fs/cifs/ntlmssp.h @@ -61,6 +61,19 @@ #define NTLMSSP_NEGOTIATE_KEY_XCH 0x40000000 #define NTLMSSP_NEGOTIATE_56 0x80000000 +/* Define AV Pair Field IDs */ +#define NTLMSSP_AV_EOL 0 +#define NTLMSSP_AV_NB_COMPUTER_NAME 1 +#define NTLMSSP_AV_NB_DOMAIN_NAME 2 +#define NTLMSSP_AV_DNS_COMPUTER_NAME 3 +#define NTLMSSP_AV_DNS_DOMAIN_NAME 4 +#define NTLMSSP_AV_DNS_TREE_NAME 5 +#define NTLMSSP_AV_FLAGS 6 +#define NTLMSSP_AV_TIMESTAMP 7 +#define NTLMSSP_AV_RESTRICTION 8 +#define NTLMSSP_AV_TARGET_NAME 9 +#define NTLMSSP_AV_CHANNEL_BINDINGS 10 + /* Although typedefs are not commonly used for structure definitions */ /* in the Linux kernel, in this particular case they are useful */ /* to more closely match the standards document for NTLMSSP from */ diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 0a57cb7db5dd..41fc5328120d 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -383,6 +383,9 @@ static int decode_ascii_ssetup(char **pbcc_area, int bleft, static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len, struct cifsSesInfo *ses) { + unsigned int tioffset; /* challeng message target info area */ + unsigned int tilen; /* challeng message target info area length */ + CHALLENGE_MESSAGE *pblob = (CHALLENGE_MESSAGE *)bcc_ptr; if (blob_len < sizeof(CHALLENGE_MESSAGE)) { @@ -405,6 +408,18 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len, /* BB spec says that if AvId field of MsvAvTimestamp is populated then we must set the MIC field of the AUTHENTICATE_MESSAGE */ + tioffset = cpu_to_le16(pblob->TargetInfoArray.BufferOffset); + tilen = cpu_to_le16(pblob->TargetInfoArray.Length); + ses->server->tilen = tilen; + if (tilen) { + ses->server->tiblob = kmalloc(tilen, GFP_KERNEL); + if (!ses->server->tiblob) { + cERROR(1, "Challenge target info allocation failure"); + return -ENOMEM; + } + memcpy(ses->server->tiblob, bcc_ptr + tioffset, tilen); + } + return 0; } @@ -451,10 +466,12 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer, struct cifsSesInfo *ses, const struct nls_table *nls_cp, bool first) { + int rc; + unsigned int size; AUTHENTICATE_MESSAGE *sec_blob = (AUTHENTICATE_MESSAGE *)pbuffer; __u32 flags; unsigned char *tmp; - char ntlm_session_key[CIFS_SESS_KEY_SIZE]; + struct ntlmv2_resp ntlmv2_response = {}; memcpy(sec_blob->Signature, NTLMSSP_SIGNATURE, 8); sec_blob->MessageType = NtLmAuthenticate; @@ -477,19 +494,25 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer, sec_blob->LmChallengeResponse.Length = 0; sec_blob->LmChallengeResponse.MaximumLength = 0; - /* calculate session key, BB what about adding similar ntlmv2 path? */ - SMBNTencrypt(ses->password, ses->server->cryptKey, ntlm_session_key); - if (first) - cifs_calculate_mac_key(&ses->server->mac_signing_key, - ntlm_session_key, ses->password); - - memcpy(tmp, ntlm_session_key, CIFS_SESS_KEY_SIZE); sec_blob->NtChallengeResponse.BufferOffset = cpu_to_le32(tmp - pbuffer); - sec_blob->NtChallengeResponse.Length = cpu_to_le16(CIFS_SESS_KEY_SIZE); - sec_blob->NtChallengeResponse.MaximumLength = - cpu_to_le16(CIFS_SESS_KEY_SIZE); + rc = setup_ntlmv2_rsp(ses, (char *)&ntlmv2_response, nls_cp); + if (rc) { + cERROR(1, "error rc: %d during ntlmssp ntlmv2 setup", rc); + goto setup_ntlmv2_ret; + } + size = sizeof(struct ntlmv2_resp); + memcpy(tmp, (char *)&ntlmv2_response, size); + tmp += size; + if (ses->server->tilen > 0) { + memcpy(tmp, ses->server->tiblob, ses->server->tilen); + tmp += ses->server->tilen; + } else + ses->server->tilen = 0; - tmp += CIFS_SESS_KEY_SIZE; + sec_blob->NtChallengeResponse.Length = cpu_to_le16(size + + ses->server->tilen); + sec_blob->NtChallengeResponse.MaximumLength = + cpu_to_le16(size + ses->server->tilen); if (ses->domainName == NULL) { sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer); @@ -501,7 +524,6 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer, len = cifs_strtoUCS((__le16 *)tmp, ses->domainName, MAX_USERNAME_SIZE, nls_cp); len *= 2; /* unicode is 2 bytes each */ - len += 2; /* trailing null */ sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer); sec_blob->DomainName.Length = cpu_to_le16(len); sec_blob->DomainName.MaximumLength = cpu_to_le16(len); @@ -518,7 +540,6 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer, len = cifs_strtoUCS((__le16 *)tmp, ses->userName, MAX_USERNAME_SIZE, nls_cp); len *= 2; /* unicode is 2 bytes each */ - len += 2; /* trailing null */ sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer); sec_blob->UserName.Length = cpu_to_le16(len); sec_blob->UserName.MaximumLength = cpu_to_le16(len); @@ -530,9 +551,26 @@ static int build_ntlmssp_auth_blob(unsigned char *pbuffer, sec_blob->WorkstationName.MaximumLength = 0; tmp += 2; - sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer); - sec_blob->SessionKey.Length = 0; - sec_blob->SessionKey.MaximumLength = 0; + if ((ses->server->ntlmssp.server_flags & NTLMSSP_NEGOTIATE_KEY_XCH) && + !calc_seckey(ses->server)) { + memcpy(tmp, ses->server->ntlmssp.ciphertext, CIFS_CPHTXT_SIZE); + sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->SessionKey.Length = cpu_to_le16(CIFS_CPHTXT_SIZE); + sec_blob->SessionKey.MaximumLength = + cpu_to_le16(CIFS_CPHTXT_SIZE); + tmp += CIFS_CPHTXT_SIZE; + } else { + sec_blob->SessionKey.BufferOffset = cpu_to_le32(tmp - pbuffer); + sec_blob->SessionKey.Length = 0; + sec_blob->SessionKey.MaximumLength = 0; + } + + ses->server->sequence_number = 0; + +setup_ntlmv2_ret: + if (ses->server->tilen > 0) + kfree(ses->server->tiblob); + return tmp - pbuffer; } @@ -546,15 +584,14 @@ static void setup_ntlmssp_neg_req(SESSION_SETUP_ANDX *pSMB, return; } -static int setup_ntlmssp_auth_req(SESSION_SETUP_ANDX *pSMB, +static int setup_ntlmssp_auth_req(char *ntlmsspblob, struct cifsSesInfo *ses, const struct nls_table *nls, bool first_time) { int bloblen; - bloblen = build_ntlmssp_auth_blob(&pSMB->req.SecurityBlob[0], ses, nls, + bloblen = build_ntlmssp_auth_blob(ntlmsspblob, ses, nls, first_time); - pSMB->req.SecurityBlobLength = cpu_to_le16(bloblen); return bloblen; } @@ -580,6 +617,7 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, struct key *spnego_key = NULL; __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */ bool first_time; + char *ntlmsspblob; if (ses == NULL) return -EINVAL; @@ -690,7 +728,7 @@ ssetup_ntlmssp_authenticate: if (first_time) /* should this be moved into common code with similar ntlmv2 path? */ - cifs_calculate_mac_key(&ses->server->mac_signing_key, + cifs_calculate_session_key(&ses->server->session_key, ntlm_session_key, ses->password); /* copy session key */ @@ -729,12 +767,21 @@ ssetup_ntlmssp_authenticate: cpu_to_le16(sizeof(struct ntlmv2_resp)); /* calculate session key */ - setup_ntlmv2_rsp(ses, v2_sess_key, nls_cp); + rc = setup_ntlmv2_rsp(ses, v2_sess_key, nls_cp); + if (rc) { + kfree(v2_sess_key); + goto ssetup_exit; + } /* FIXME: calculate MAC key */ memcpy(bcc_ptr, (char *)v2_sess_key, sizeof(struct ntlmv2_resp)); bcc_ptr += sizeof(struct ntlmv2_resp); kfree(v2_sess_key); + if (ses->server->tilen > 0) { + memcpy(bcc_ptr, ses->server->tiblob, + ses->server->tilen); + bcc_ptr += ses->server->tilen; + } if (ses->capabilities & CAP_UNICODE) { if (iov[0].iov_len % 2) { *bcc_ptr = 0; @@ -765,15 +812,15 @@ ssetup_ntlmssp_authenticate: } /* bail out if key is too long */ if (msg->sesskey_len > - sizeof(ses->server->mac_signing_key.data.krb5)) { + sizeof(ses->server->session_key.data.krb5)) { cERROR(1, "Kerberos signing key too long (%u bytes)", msg->sesskey_len); rc = -EOVERFLOW; goto ssetup_exit; } if (first_time) { - ses->server->mac_signing_key.len = msg->sesskey_len; - memcpy(ses->server->mac_signing_key.data.krb5, + ses->server->session_key.len = msg->sesskey_len; + memcpy(ses->server->session_key.data.krb5, msg->data, msg->sesskey_len); } pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; @@ -815,12 +862,26 @@ ssetup_ntlmssp_authenticate: if (phase == NtLmNegotiate) { setup_ntlmssp_neg_req(pSMB, ses); iov[1].iov_len = sizeof(NEGOTIATE_MESSAGE); + iov[1].iov_base = &pSMB->req.SecurityBlob[0]; } else if (phase == NtLmAuthenticate) { int blob_len; - blob_len = setup_ntlmssp_auth_req(pSMB, ses, - nls_cp, - first_time); + ntlmsspblob = kmalloc(5 * + sizeof(struct _AUTHENTICATE_MESSAGE), + GFP_KERNEL); + if (!ntlmsspblob) { + cERROR(1, "Can't allocate NTLMSSP"); + rc = -ENOMEM; + goto ssetup_exit; + } + + blob_len = setup_ntlmssp_auth_req(ntlmsspblob, + ses, + nls_cp, + first_time); iov[1].iov_len = blob_len; + iov[1].iov_base = ntlmsspblob; + pSMB->req.SecurityBlobLength = + cpu_to_le16(blob_len); /* Make sure that we tell the server that we are using the uid that it just gave us back on the response (challenge) */ @@ -830,7 +891,6 @@ ssetup_ntlmssp_authenticate: rc = -ENOSYS; goto ssetup_exit; } - iov[1].iov_base = &pSMB->req.SecurityBlob[0]; /* unicode strings must be word aligned */ if ((iov[0].iov_len + iov[1].iov_len) % 2) { *bcc_ptr = 0; diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 82f78c4d6978..e0588cdf4cc5 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -543,7 +543,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses, (ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))) { rc = cifs_verify_signature(midQ->resp_buf, - &ses->server->mac_signing_key, + ses->server, midQ->sequence_number+1); if (rc) { cERROR(1, "Unexpected SMB signature"); @@ -731,7 +731,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses, (ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))) { rc = cifs_verify_signature(out_buf, - &ses->server->mac_signing_key, + ses->server, midQ->sequence_number+1); if (rc) { cERROR(1, "Unexpected SMB signature"); @@ -981,7 +981,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifsTconInfo *tcon, (ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED))) { rc = cifs_verify_signature(out_buf, - &ses->server->mac_signing_key, + ses->server, midQ->sequence_number+1); if (rc) { cERROR(1, "Unexpected SMB signature"); -- cgit v1.2.3 From f3c60c5918f26ea16761ddc8b12d8401a3db626b Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 11 Aug 2010 14:51:23 -0700 Subject: ceph: fix multiple mds session shutdown The use of a completion when waiting for session shutdown during umount is inappropriate, given the complexity of the condition. For multiple MDS's, this resulted in the umount thread spinning, often preventing the session close message from being processed in some cases. Switch to a waitqueue and defined a condition helper. This cleans things up nicely. Signed-off-by: Sage Weil --- fs/ceph/mds_client.c | 68 +++++++++++++++++++++++++++------------------------- fs/ceph/mds_client.h | 3 ++- 2 files changed, 37 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index a75ddbf9fe37..397a47b696ce 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2208,7 +2208,7 @@ static void handle_session(struct ceph_mds_session *session, pr_info("mds%d reconnect denied\n", session->s_mds); remove_session_caps(session); wake = 1; /* for good measure */ - complete_all(&mdsc->session_close_waiters); + wake_up_all(&mdsc->session_close_wq); kick_requests(mdsc, mds); break; @@ -2876,7 +2876,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client) return -ENOMEM; init_completion(&mdsc->safe_umount_waiters); - init_completion(&mdsc->session_close_waiters); + init_waitqueue_head(&mdsc->session_close_wq); INIT_LIST_HEAD(&mdsc->waiting_for_map); mdsc->sessions = NULL; mdsc->max_sessions = 0; @@ -3021,6 +3021,23 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush)); } +/* + * true if all sessions are closed, or we force unmount + */ +bool done_closing_sessions(struct ceph_mds_client *mdsc) +{ + int i, n = 0; + + if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN) + return true; + + mutex_lock(&mdsc->mutex); + for (i = 0; i < mdsc->max_sessions; i++) + if (mdsc->sessions[i]) + n++; + mutex_unlock(&mdsc->mutex); + return n == 0; +} /* * called after sb is ro. @@ -3029,45 +3046,32 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) { struct ceph_mds_session *session; int i; - int n; struct ceph_client *client = mdsc->client; - unsigned long started, timeout = client->mount_args->mount_timeout * HZ; + unsigned long timeout = client->mount_args->mount_timeout * HZ; dout("close_sessions\n"); - mutex_lock(&mdsc->mutex); - /* close sessions */ - started = jiffies; - while (time_before(jiffies, started + timeout)) { - dout("closing sessions\n"); - n = 0; - for (i = 0; i < mdsc->max_sessions; i++) { - session = __ceph_lookup_mds_session(mdsc, i); - if (!session) - continue; - mutex_unlock(&mdsc->mutex); - mutex_lock(&session->s_mutex); - __close_session(mdsc, session); - mutex_unlock(&session->s_mutex); - ceph_put_mds_session(session); - mutex_lock(&mdsc->mutex); - n++; - } - if (n == 0) - break; - - if (client->mount_state == CEPH_MOUNT_SHUTDOWN) - break; - - dout("waiting for sessions to close\n"); + mutex_lock(&mdsc->mutex); + for (i = 0; i < mdsc->max_sessions; i++) { + session = __ceph_lookup_mds_session(mdsc, i); + if (!session) + continue; mutex_unlock(&mdsc->mutex); - wait_for_completion_timeout(&mdsc->session_close_waiters, - timeout); + mutex_lock(&session->s_mutex); + __close_session(mdsc, session); + mutex_unlock(&session->s_mutex); + ceph_put_mds_session(session); mutex_lock(&mdsc->mutex); } + mutex_unlock(&mdsc->mutex); + + dout("waiting for sessions to close\n"); + wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc), + timeout); /* tear down remaining sessions */ + mutex_lock(&mdsc->mutex); for (i = 0; i < mdsc->max_sessions; i++) { if (mdsc->sessions[i]) { session = get_session(mdsc->sessions[i]); @@ -3080,9 +3084,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) mutex_lock(&mdsc->mutex); } } - WARN_ON(!list_empty(&mdsc->cap_delay_list)); - mutex_unlock(&mdsc->mutex); ceph_cleanup_empty_realms(mdsc); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index ab7e89f5e344..c98267ce6d2a 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -234,7 +234,8 @@ struct ceph_mds_client { struct mutex mutex; /* all nested structures */ struct ceph_mdsmap *mdsmap; - struct completion safe_umount_waiters, session_close_waiters; + struct completion safe_umount_waiters; + wait_queue_head_t session_close_wq; struct list_head waiting_for_map; struct ceph_mds_session **sessions; /* NULL for mds if no session */ -- cgit v1.2.3 From 082afec92d1052305af1195f591602f4d0f44277 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sun, 22 Aug 2010 15:16:41 -0700 Subject: ceph: fix xattr cap writeback We should include the xattr metadata blob in the cap update message any time we are flushing dirty state, NOT just when we are also dropping the cap. This fixes async xattr writeback. Also, clean up the code slightly to avoid duplicating the bit test. Signed-off-by: Sage Weil --- fs/ceph/caps.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 7bf182b03973..0ac2703f3bdf 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1082,6 +1082,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, gid_t gid; struct ceph_mds_session *session; u64 xattr_version = 0; + struct ceph_buffer *xattr_blob = NULL; int delayed = 0; u64 flush_tid = 0; int i; @@ -1160,9 +1161,10 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, gid = inode->i_gid; mode = inode->i_mode; - if (dropping & CEPH_CAP_XATTR_EXCL) { + if (flushing & CEPH_CAP_XATTR_EXCL) { __ceph_build_xattrs_blob(ci); - xattr_version = ci->i_xattrs.version + 1; + xattr_blob = ci->i_xattrs.blob; + xattr_version = ci->i_xattrs.version; } spin_unlock(&inode->i_lock); @@ -1170,9 +1172,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id, op, keep, want, flushing, seq, flush_tid, issue_seq, mseq, size, max_size, &mtime, &atime, time_warp_seq, - uid, gid, mode, - xattr_version, - (flushing & CEPH_CAP_XATTR_EXCL) ? ci->i_xattrs.blob : NULL, + uid, gid, mode, xattr_version, xattr_blob, follows); if (ret < 0) { dout("error sending cap msg, must requeue %p\n", inode); -- cgit v1.2.3 From 4a625be47243e0e07dedd0a1a6b94c66c2ab93ba Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sun, 22 Aug 2010 15:03:56 -0700 Subject: ceph: include dirty xattrs state in snapped caps When we snapshot dirty metadata that needs to be written back to the MDS, include dirty xattr metadata. Make the capsnap reference the encoded xattr blob so that it will be written back in the FLUSHSNAP op. Also fix the capsnap creation guard to include dirty auth or file bits, not just tests specific to dirty file data or file writes in progress (this fixes auth metadata writeback). Signed-off-by: Sage Weil --- fs/ceph/caps.c | 2 +- fs/ceph/snap.c | 23 ++++++++++++++++------- fs/ceph/super.h | 8 +++++--- fs/ceph/xattr.c | 1 + 4 files changed, 23 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 0ac2703f3bdf..ba5bbf318fe1 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1282,7 +1282,7 @@ retry: &capsnap->mtime, &capsnap->atime, capsnap->time_warp_seq, capsnap->uid, capsnap->gid, capsnap->mode, - 0, NULL, + capsnap->xattr_version, capsnap->xattr_blob, capsnap->follows); next_follows = capsnap->follows + 1; diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index c0b26b6badba..2cb190c2bd99 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -435,7 +435,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) { struct inode *inode = &ci->vfs_inode; struct ceph_cap_snap *capsnap; - int used; + int used, dirty; capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS); if (!capsnap) { @@ -445,6 +445,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) spin_lock(&inode->i_lock); used = __ceph_caps_used(ci); + dirty = __ceph_caps_dirty(ci); if (__ceph_have_pending_cap_snap(ci)) { /* there is no point in queuing multiple "pending" cap_snaps, as no new writes are allowed to start when pending, so any @@ -452,11 +453,13 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) cap_snap. lucky us. */ dout("queue_cap_snap %p already pending\n", inode); kfree(capsnap); - } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR)) { + } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR) || + (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL| + CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR))) { struct ceph_snap_context *snapc = ci->i_head_snapc; igrab(inode); - + atomic_set(&capsnap->nref, 1); capsnap->ci = ci; INIT_LIST_HEAD(&capsnap->ci_item); @@ -464,15 +467,21 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) capsnap->follows = snapc->seq - 1; capsnap->issued = __ceph_caps_issued(ci, NULL); - capsnap->dirty = __ceph_caps_dirty(ci); + capsnap->dirty = dirty; capsnap->mode = inode->i_mode; capsnap->uid = inode->i_uid; capsnap->gid = inode->i_gid; - /* fixme? */ - capsnap->xattr_blob = NULL; - capsnap->xattr_len = 0; + if (dirty & CEPH_CAP_XATTR_EXCL) { + __ceph_build_xattrs_blob(ci); + capsnap->xattr_blob = + ceph_buffer_get(ci->i_xattrs.blob); + capsnap->xattr_version = ci->i_xattrs.version; + } else { + capsnap->xattr_blob = NULL; + capsnap->xattr_version = 0; + } /* dirty page count moved from _head to this cap_snap; all subsequent writes page dirties occur _after_ this diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 2482d696f0de..b33929d8f287 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -216,8 +216,7 @@ struct ceph_cap_snap { uid_t uid; gid_t gid; - void *xattr_blob; - int xattr_len; + struct ceph_buffer *xattr_blob; u64 xattr_version; u64 size; @@ -229,8 +228,11 @@ struct ceph_cap_snap { static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap) { - if (atomic_dec_and_test(&capsnap->nref)) + if (atomic_dec_and_test(&capsnap->nref)) { + if (capsnap->xattr_blob) + ceph_buffer_put(capsnap->xattr_blob); kfree(capsnap); + } } /* diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 097a2654c00f..9578af610b73 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -485,6 +485,7 @@ void __ceph_build_xattrs_blob(struct ceph_inode_info *ci) ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob; ci->i_xattrs.prealloc_blob = NULL; ci->i_xattrs.dirty = false; + ci->i_xattrs.version++; } } -- cgit v1.2.3 From ed326044489ed89c740c50a3df5dffc9c3b20b96 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 16 Aug 2010 13:37:31 -0700 Subject: ceph: queue cap snap writeback for realm children on snap update When a realm is updated, we need to queue writeback on inodes in that realm _and_ its children. Otherwise, if the inode gets cowed on the server, we can get a hang later due to out-of-sync cap/snap state. Signed-off-by: Sage Weil --- fs/ceph/snap.c | 60 ++++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 2cb190c2bd99..6bdbf3ae7082 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -548,6 +548,41 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, return 1; /* caller may want to ceph_flush_snaps */ } +/* + * Queue cap_snaps for snap writeback for this realm and its children. + * Called under snap_rwsem, so realm topology won't change. + */ +static void queue_realm_cap_snaps(struct ceph_snap_realm *realm) +{ + struct ceph_inode_info *ci; + struct inode *lastinode = NULL; + struct ceph_snap_realm *child; + + dout("queue_realm_cap_snaps %p %llx inodes\n", realm, realm->ino); + + spin_lock(&realm->inodes_with_caps_lock); + list_for_each_entry(ci, &realm->inodes_with_caps, + i_snap_realm_item) { + struct inode *inode = igrab(&ci->vfs_inode); + if (!inode) + continue; + spin_unlock(&realm->inodes_with_caps_lock); + if (lastinode) + iput(lastinode); + lastinode = inode; + ceph_queue_cap_snap(ci); + spin_lock(&realm->inodes_with_caps_lock); + } + spin_unlock(&realm->inodes_with_caps_lock); + if (lastinode) + iput(lastinode); + + dout("queue_realm_cap_snaps %p %llx children\n", realm, realm->ino); + list_for_each_entry(child, &realm->children, child_item) + queue_realm_cap_snaps(child); + + dout("queue_realm_cap_snaps %p %llx done\n", realm, realm->ino); +} /* * Parse and apply a snapblob "snap trace" from the MDS. This specifies @@ -598,29 +633,8 @@ more: * * ...unless it's a snap deletion! */ - if (!deletion) { - struct ceph_inode_info *ci; - struct inode *lastinode = NULL; - - spin_lock(&realm->inodes_with_caps_lock); - list_for_each_entry(ci, &realm->inodes_with_caps, - i_snap_realm_item) { - struct inode *inode = igrab(&ci->vfs_inode); - if (!inode) - continue; - spin_unlock(&realm->inodes_with_caps_lock); - if (lastinode) - iput(lastinode); - lastinode = inode; - ceph_queue_cap_snap(ci); - spin_lock(&realm->inodes_with_caps_lock); - } - spin_unlock(&realm->inodes_with_caps_lock); - if (lastinode) - iput(lastinode); - dout("update_snap_trace cap_snaps queued\n"); - } - + if (!deletion) + queue_realm_cap_snaps(realm); } else { dout("update_snap_trace %llx %p seq %lld unchanged\n", realm->ino, realm, realm->seq); -- cgit v1.2.3 From eb6bb1c5bdc6e455a9d16cb845cc65afc9b0a617 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 16 Aug 2010 09:21:27 -0700 Subject: ceph: direct requests in snapped namespace based on nonsnap parent When making a request in the virtual snapdir or a snapped portion of the namespace, we should choose the MDS based on the first nonsnap parent (and its caps). If that is not the best place, we will get forward hints to find the right MDS in the cluster. This fixes ESTALE errors when using the .snap directory and namespace with multiple MDSs. Signed-off-by: Sage Weil --- fs/ceph/mds_client.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 397a47b696ce..8d1f11c7a5a2 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -560,6 +560,13 @@ static void __unregister_request(struct ceph_mds_client *mdsc, * * Called under mdsc->mutex. */ +struct dentry *get_nonsnap_parent(struct dentry *dentry) +{ + while (!IS_ROOT(dentry) && ceph_snap(dentry->d_inode) != CEPH_NOSNAP) + dentry = dentry->d_parent; + return dentry; +} + static int __choose_mds(struct ceph_mds_client *mdsc, struct ceph_mds_request *req) { @@ -590,14 +597,29 @@ static int __choose_mds(struct ceph_mds_client *mdsc, if (req->r_inode) { inode = req->r_inode; } else if (req->r_dentry) { - if (req->r_dentry->d_inode) { + struct inode *dir = req->r_dentry->d_parent->d_inode; + + if (dir->i_sb != mdsc->client->sb) { + /* not this fs! */ + inode = req->r_dentry->d_inode; + } else if (ceph_snap(dir) != CEPH_NOSNAP) { + /* direct snapped/virtual snapdir requests + * based on parent dir inode */ + struct dentry *dn = + get_nonsnap_parent(req->r_dentry->d_parent); + inode = dn->d_inode; + dout("__choose_mds using nonsnap parent %p\n", inode); + } else if (req->r_dentry->d_inode) { + /* dentry target */ inode = req->r_dentry->d_inode; } else { - inode = req->r_dentry->d_parent->d_inode; + /* dir + name */ + inode = dir; hash = req->r_dentry->d_name.hash; is_hash = true; } } + dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash, (int)hash, mode); if (!inode) -- cgit v1.2.3 From 679ceace848e9fd570678396ffe1ef034e00e82d Mon Sep 17 00:00:00 2001 From: Michael Rubin Date: Fri, 20 Aug 2010 02:31:26 -0700 Subject: mm: exporting account_page_dirty This allows code outside of the mm core to safely manipulate page state and not worry about the other accounting. Not using these routines means that some code will lose track of the accounting and we get bugs. This has happened once already. Signed-off-by: Michael Rubin Signed-off-by: Sage Weil --- fs/ceph/addr.c | 8 +------- mm/page-writeback.c | 1 + 2 files changed, 2 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 5598a0d02295..420d46974ec8 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -105,13 +105,7 @@ static int ceph_set_page_dirty(struct page *page) spin_lock_irq(&mapping->tree_lock); if (page->mapping) { /* Race with truncate? */ WARN_ON_ONCE(!PageUptodate(page)); - - if (mapping_cap_account_dirty(mapping)) { - __inc_zone_page_state(page, NR_FILE_DIRTY); - __inc_bdi_stat(mapping->backing_dev_info, - BDI_RECLAIMABLE); - task_io_account_write(PAGE_CACHE_SIZE); - } + account_page_dirtied(page, page->mapping); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 37498ef61548..849d0ccbe914 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1096,6 +1096,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) task_io_account_write(PAGE_CACHE_SIZE); } } +EXPORT_SYMBOL(account_page_dirtied); /* * For address_spaces which do not use buffers. Just tag the page as dirty in -- cgit v1.2.3 From faa9560ae76ef50a3cbfb1a6afc0343fd8172374 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Wed, 18 Aug 2010 12:25:49 -0400 Subject: fanotify: do not dereference inode_mark when it is unset The fanotify code is supposed to get the group from the mark. It accidentally only used the inode_mark. If the vfsmount_mark was set but not the inode_mark it would deref the NULL inode_mark. Get the group from the correct place. Reported-by: Tvrtko Ursulin Signed-off-by: Eric Paris --- fs/notify/fsnotify.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 3970392b2722..f3e3b355ba7f 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -148,13 +148,14 @@ static int send_to_group(struct inode *to_tell, struct vfsmount *mnt, const unsigned char *file_name, struct fsnotify_event **event) { - struct fsnotify_group *group = inode_mark->group; + struct fsnotify_group *group = NULL; __u32 inode_test_mask = (mask & ~FS_EVENT_ON_CHILD); __u32 vfsmount_test_mask = (mask & ~FS_EVENT_ON_CHILD); - pr_debug("%s: group=%p to_tell=%p mnt=%p mark=%p mask=%x data=%p" - " data_is=%d cookie=%d event=%p\n", __func__, group, to_tell, - mnt, inode_mark, mask, data, data_is, cookie, *event); + if (unlikely(!inode_mark && !vfsmount_mark)) { + BUG(); + return 0; + } /* clear ignored on inode modification */ if (mask & FS_MODIFY) { @@ -168,18 +169,24 @@ static int send_to_group(struct inode *to_tell, struct vfsmount *mnt, /* does the inode mark tell us to do something? */ if (inode_mark) { + group = inode_mark->group; inode_test_mask &= inode_mark->mask; inode_test_mask &= ~inode_mark->ignored_mask; } /* does the vfsmount_mark tell us to do something? */ if (vfsmount_mark) { + group = vfsmount_mark->group; vfsmount_test_mask &= vfsmount_mark->mask; vfsmount_test_mask &= ~vfsmount_mark->ignored_mask; if (inode_mark) vfsmount_test_mask &= ~inode_mark->ignored_mask; } + pr_debug("%s: group=%p to_tell=%p mnt=%p mark=%p mask=%x data=%p" + " data_is=%d cookie=%d event=%p\n", __func__, group, to_tell, + mnt, inode_mark, mask, data, data_is, cookie, *event); + if (!inode_test_mask && !vfsmount_test_mask) return 0; -- cgit v1.2.3 From 5f3f259fa8f1d7969360acfad5307d03c2f53d63 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Wed, 18 Aug 2010 12:25:49 -0400 Subject: fsnotify: reset used_inode and used_vfsmount on each pass The fsnotify main loop has 2 booleans which tell if a particular mark was sent to the listeners or if it should be processed in the next pass. The problem is that the booleans were not reset on each traversal of the loop. So marks could get skipped even when they were not sent to the notifiers. Reported-by: Tvrtko Ursulin Signed-off-by: Eric Paris --- fs/notify/fsnotify.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index f3e3b355ba7f..59dc7a02bd0c 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -220,7 +220,7 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, struct fsnotify_event *event = NULL; struct vfsmount *mnt; int idx, ret = 0; - bool used_inode = false, used_vfsmount = false; + bool used_inode, used_vfsmount; /* global tests shouldn't care about events on child only the specific event */ __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD); @@ -261,6 +261,8 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, } while (inode_node || vfsmount_node) { + used_inode = used_vfsmount = false; + if (inode_node) { inode_mark = hlist_entry(srcu_dereference(inode_node, &fsnotify_mark_srcu), struct fsnotify_mark, i.i_list); -- cgit v1.2.3 From 84e1ab4d875922c034db7f4f814ac445a20a14bd Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Wed, 18 Aug 2010 12:25:50 -0400 Subject: fsnotify: fix ignored mask handling between inode and vfsmount marks The interesting 2 list lockstep walking didn't quite work out if the inode marks only had ignores and the vfsmount list requested events. The code to shortcut list traversal would not run the inode list since it didn't have real event requests. This code forces inode list traversal when a vfsmount mark matches the event type. Maybe we could add an i_fsnotify_ignored_mask field to struct inode to get the shortcut back, but it doesn't seem worth it to grow struct inode again. I bet with the recent changes to lock the way we do now it would actually not be a major perf hit to just drop i_fsnotify_mark_mask altogether. But that is for another day. Signed-off-by: Eric Paris --- fs/notify/fsnotify.c | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 59dc7a02bd0c..6f2777ce87a1 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -149,8 +149,8 @@ static int send_to_group(struct inode *to_tell, struct vfsmount *mnt, struct fsnotify_event **event) { struct fsnotify_group *group = NULL; - __u32 inode_test_mask = (mask & ~FS_EVENT_ON_CHILD); - __u32 vfsmount_test_mask = (mask & ~FS_EVENT_ON_CHILD); + __u32 inode_test_mask = 0; + __u32 vfsmount_test_mask = 0; if (unlikely(!inode_mark && !vfsmount_mark)) { BUG(); @@ -170,12 +170,14 @@ static int send_to_group(struct inode *to_tell, struct vfsmount *mnt, /* does the inode mark tell us to do something? */ if (inode_mark) { group = inode_mark->group; + inode_test_mask = (mask & ~FS_EVENT_ON_CHILD); inode_test_mask &= inode_mark->mask; inode_test_mask &= ~inode_mark->ignored_mask; } /* does the vfsmount_mark tell us to do something? */ if (vfsmount_mark) { + vfsmount_test_mask = (mask & ~FS_EVENT_ON_CHILD); group = vfsmount_mark->group; vfsmount_test_mask &= vfsmount_mark->mask; vfsmount_test_mask &= ~vfsmount_mark->ignored_mask; @@ -183,9 +185,12 @@ static int send_to_group(struct inode *to_tell, struct vfsmount *mnt, vfsmount_test_mask &= ~inode_mark->ignored_mask; } - pr_debug("%s: group=%p to_tell=%p mnt=%p mark=%p mask=%x data=%p" - " data_is=%d cookie=%d event=%p\n", __func__, group, to_tell, - mnt, inode_mark, mask, data, data_is, cookie, *event); + pr_debug("%s: group=%p to_tell=%p mnt=%p mask=%x inode_mark=%p" + " inode_test_mask=%x vfsmount_mark=%p vfsmount_test_mask=%x" + " data=%p data_is=%d cookie=%d event=%p\n", + __func__, group, to_tell, mnt, mask, inode_mark, + inode_test_mask, vfsmount_mark, vfsmount_test_mask, data, + data_is, cookie, *event); if (!inode_test_mask && !vfsmount_test_mask) return 0; @@ -214,7 +219,7 @@ static int send_to_group(struct inode *to_tell, struct vfsmount *mnt, int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const unsigned char *file_name, u32 cookie) { - struct hlist_node *inode_node, *vfsmount_node; + struct hlist_node *inode_node = NULL, *vfsmount_node = NULL; struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL; struct fsnotify_group *inode_group, *vfsmount_group; struct fsnotify_event *event = NULL; @@ -245,19 +250,13 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, (test_mask & to_tell->i_fsnotify_mask)) inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first, &fsnotify_mark_srcu); - else - inode_node = NULL; - if (mnt) { - if ((mask & FS_MODIFY) || - (test_mask & mnt->mnt_fsnotify_mask)) - vfsmount_node = srcu_dereference(mnt->mnt_fsnotify_marks.first, - &fsnotify_mark_srcu); - else - vfsmount_node = NULL; - } else { - mnt = NULL; - vfsmount_node = NULL; + if (mnt && ((mask & FS_MODIFY) || + (test_mask & mnt->mnt_fsnotify_mask))) { + vfsmount_node = srcu_dereference(mnt->mnt_fsnotify_marks.first, + &fsnotify_mark_srcu); + inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first, + &fsnotify_mark_srcu); } while (inode_node || vfsmount_node) { -- cgit v1.2.3 From 2eebf582c9b3106abb9c33f4fc0a347fb9391037 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Wed, 18 Aug 2010 12:25:50 -0400 Subject: fanotify: flush outstanding perm requests on group destroy When an fanotify listener is closing it may cause a deadlock between the listener and the original task doing an fs operation. If the original task is waiting for a permissions response it will be holding the srcu lock. The listener cannot clean up and exit until after that srcu lock is syncronized. Thus deadlock. The fix introduced here is to stop accepting new permissions events when a listener is shutting down and to grant permission for all outstanding events. Thus the original task will eventually release the srcu lock and the listener can complete shutdown. Reported-by: Andreas Gruenbacher Cc: Andreas Gruenbacher Signed-off-by: Eric Paris --- fs/notify/fanotify/fanotify_user.c | 27 +++++++++++++++++++++++++++ include/linux/fanotify.h | 7 ------- include/linux/fsnotify_backend.h | 1 + 3 files changed, 28 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 032b837fcd11..b966b7230f47 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -195,6 +195,14 @@ static int prepare_for_access_response(struct fsnotify_group *group, re->fd = fd; mutex_lock(&group->fanotify_data.access_mutex); + + if (group->fanotify_data.bypass_perm) { + mutex_unlock(&group->fanotify_data.access_mutex); + kmem_cache_free(fanotify_response_event_cache, re); + event->response = FAN_ALLOW; + return 0; + } + list_add_tail(&re->list, &group->fanotify_data.access_list); mutex_unlock(&group->fanotify_data.access_mutex); @@ -364,9 +372,28 @@ static ssize_t fanotify_write(struct file *file, const char __user *buf, size_t static int fanotify_release(struct inode *ignored, struct file *file) { struct fsnotify_group *group = file->private_data; + struct fanotify_response_event *re, *lre; pr_debug("%s: file=%p group=%p\n", __func__, file, group); +#ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS + mutex_lock(&group->fanotify_data.access_mutex); + + group->fanotify_data.bypass_perm = true; + + list_for_each_entry_safe(re, lre, &group->fanotify_data.access_list, list) { + pr_debug("%s: found group=%p re=%p event=%p\n", __func__, group, + re, re->event); + + list_del_init(&re->list); + re->event->response = FAN_ALLOW; + + kmem_cache_free(fanotify_response_event_cache, re); + } + mutex_unlock(&group->fanotify_data.access_mutex); + + wake_up(&group->fanotify_data.access_waitq); +#endif /* matches the fanotify_init->fsnotify_alloc_group */ fsnotify_put_group(group); diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h index f0949a57ca9d..985435622ecd 100644 --- a/include/linux/fanotify.h +++ b/include/linux/fanotify.h @@ -95,11 +95,4 @@ struct fanotify_response { (long)(meta)->event_len >= (long)FAN_EVENT_METADATA_LEN && \ (long)(meta)->event_len <= (long)(len)) -#ifdef __KERNEL__ - -struct fanotify_wait { - struct fsnotify_event *event; - __s32 fd; -}; -#endif /* __KERNEL__ */ #endif /* _LINUX_FANOTIFY_H */ diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index ed36fb57c426..e40190d16878 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -156,6 +156,7 @@ struct fsnotify_group { struct mutex access_mutex; struct list_head access_list; wait_queue_head_t access_waitq; + bool bypass_perm; /* protected by access_mutex */ #endif /* CONFIG_FANOTIFY_ACCESS_PERMISSIONS */ int f_flags; } fanotify_data; -- cgit v1.2.3 From ff8d6e983185ce19fa92bb836eb52b589957be65 Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Fri, 20 Aug 2010 10:24:18 +0100 Subject: fanotify: drop duplicate pr_debug statement This reminded me... you have two pr_debugs in fanotify_should_send_event which output redundant information. Maybe you intended it like that so it is selectable how much log spam you want, or if not you may want to apply this patch. Signed-off-by: Tvrtko Ursulin Signed-off-by: Eric Paris --- fs/notify/fanotify/fanotify.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 756566fe8449..85366c78cc37 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -165,9 +165,6 @@ static bool fanotify_should_send_event(struct fsnotify_group *group, "mask=%x data=%p data_type=%d\n", __func__, group, to_tell, inode_mark, vfsmnt_mark, event_mask, data, data_type); - pr_debug("%s: group=%p vfsmount_mark=%p inode_mark=%p mask=%x\n", - __func__, group, vfsmnt_mark, inode_mark, event_mask); - /* sorry, fanotify only gives a damn about files and dirs */ if (!S_ISREG(to_tell->i_mode) && !S_ISDIR(to_tell->i_mode)) -- cgit v1.2.3 From 124514918b030d74f1f3e15483b7bf3b85268082 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sun, 22 Aug 2010 21:33:32 -0700 Subject: ceph: don't improperly set dir complete when holding EXCL cap If we hold the EXCL cap, we cannot trust the dir stats from the MDS (num files, subdirs) and must not incorrectly conclude that the directory is empty. If we do, we get can bad results from lookup (bad ENOENT) and bad readdir results. Signed-off-by: Sage Weil --- fs/ceph/inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 5d893d31e399..3e6b52cb5ee8 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -677,6 +677,7 @@ static int fill_inode(struct inode *inode, if (ci->i_files == 0 && ci->i_subdirs == 0 && ceph_snap(inode) == CEPH_NOSNAP && (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) && + (issued & CEPH_CAP_FILE_EXCL) == 0 && (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) { dout(" marking %p complete (empty)\n", inode); ci->i_ceph_flags |= CEPH_I_COMPLETE; -- cgit v1.2.3 From 07a27e226d1ed210d2d4218bd0642b40f5405c6a Mon Sep 17 00:00:00 2001 From: Henry C Chang Date: Sun, 22 Aug 2010 21:34:27 -0700 Subject: ceph: fix osd request lru adjustment when sending request Fix argument order. We want to move the item to the end of the list, not change the position of the head. Signed-off-by: Henry C Chang Signed-off-by: Sage Weil --- fs/ceph/osd_client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c index bed6391e52c7..dfced1dacbcd 100644 --- a/fs/ceph/osd_client.c +++ b/fs/ceph/osd_client.c @@ -661,7 +661,7 @@ static int __send_request(struct ceph_osd_client *osdc, reqhead->reassert_version = req->r_reassert_version; req->r_stamp = jiffies; - list_move_tail(&osdc->req_lru, &req->r_req_lru_item); + list_move_tail(&req->r_req_lru_item, &osdc->req_lru); ceph_msg_get(req->r_request); /* send consumes a ref */ ceph_con_send(&req->r_osd->o_con, req->r_request); -- cgit v1.2.3 From 3ec6bbcdb4e85403f2c5958876ca9492afdf4031 Mon Sep 17 00:00:00 2001 From: Shirish Pargaonkar Date: Mon, 23 Aug 2010 11:04:07 -0500 Subject: missing changes during ntlmv2/ntlmssp auth and sign Signed-off-by: Shirish Pargaonkar Signed-off-by: Steve French --- fs/cifs/cifsencrypt.c | 2 ++ fs/cifs/sess.c | 13 ++++++++----- 2 files changed, 10 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 051d00011ca3..eef78c24e0cc 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -553,6 +553,8 @@ calc_seckey(struct TCP_Server_Info *server) return 1; } + desc.tfm = tfm_arc4; + crypto_blkcipher_setkey(tfm_arc4, server->session_key.data.ntlmv2.key, CIFS_CPHTXT_SIZE); sg_init_one(&sgin, sec_key, CIFS_CPHTXT_SIZE); diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 41fc5328120d..4788e16a02cc 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -408,6 +408,8 @@ static int decode_ntlmssp_challenge(char *bcc_ptr, int blob_len, /* BB spec says that if AvId field of MsvAvTimestamp is populated then we must set the MIC field of the AUTHENTICATE_MESSAGE */ + ses->server->ntlmssp.server_flags = le32_to_cpu(pblob->NegotiateFlags); + tioffset = cpu_to_le16(pblob->TargetInfoArray.BufferOffset); tilen = cpu_to_le16(pblob->TargetInfoArray.Length); ses->server->tilen = tilen; @@ -440,12 +442,13 @@ static void build_ntlmssp_negotiate_blob(unsigned char *pbuffer, /* BB is NTLMV2 session security format easier to use here? */ flags = NTLMSSP_NEGOTIATE_56 | NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_128 | NTLMSSP_NEGOTIATE_UNICODE | - NTLMSSP_NEGOTIATE_NT_ONLY | NTLMSSP_NEGOTIATE_NTLM; + NTLMSSP_NEGOTIATE_NTLM; if (ses->server->secMode & - (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) - flags |= NTLMSSP_NEGOTIATE_SIGN; - if (ses->server->secMode & SECMODE_SIGN_REQUIRED) - flags |= NTLMSSP_NEGOTIATE_ALWAYS_SIGN; + (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) { + flags |= NTLMSSP_NEGOTIATE_SIGN | + NTLMSSP_NEGOTIATE_KEY_XCH | + NTLMSSP_NEGOTIATE_EXTENDED_SEC; + } sec_blob->NegotiateFlags |= cpu_to_le32(flags); -- cgit v1.2.3 From 24e6cf92fde1f140d8eb0bf7cd24c2c78149b6b2 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 23 Aug 2010 11:38:04 -0400 Subject: cifs: check for NULL session password It's possible for a cifsSesInfo struct to have a NULL password, so we need to check for that prior to running strncmp on it. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 18af707f00f1..ec0ea4a43bdb 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1673,6 +1673,7 @@ cifs_find_smb_ses(struct TCP_Server_Info *server, struct smb_vol *vol) MAX_USERNAME_SIZE)) continue; if (strlen(vol->username) != 0 && + ses->password != NULL && strncmp(ses->password, vol->password ? vol->password : "", MAX_PASSWORD_SIZE)) -- cgit v1.2.3 From d17c701ce6a548a92f7f8a3cec20299465f36ee3 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 24 Aug 2010 11:42:52 +1000 Subject: xfs: unlock items before allowing the CIL to commit When we commit a transaction using delayed logging, we need to unlock the items in the transaciton before we unlock the CIL context and allow it to be checkpointed. If we unlock them after we release the CIl context lock, the CIL can checkpoint and complete before we free the log items. This breaks stale buffer item unlock and unpin processing as there is an implicit assumption that the unlock will occur before the unpin. Also, some log items need to store the LSN of the transaction commit in the item (inodes and EFIs) and so can race with other transaction completions if we don't prevent the CIL from checkpointing before the unlock occurs. Cc: Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_log_cil.c | 14 ++++++++++++++ fs/xfs/xfs_trans.c | 5 +---- fs/xfs/xfs_trans_priv.h | 3 ++- 3 files changed, 17 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 31e4ea2d19ac..ef8e7d9f445d 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -377,9 +377,23 @@ xfs_log_commit_cil( xfs_log_done(mp, tp->t_ticket, NULL, log_flags); xfs_trans_unreserve_and_mod_sb(tp); + /* + * Once all the items of the transaction have been copied to the CIL, + * the items can be unlocked and freed. + * + * This needs to be done before we drop the CIL context lock because we + * have to update state in the log items and unlock them before they go + * to disk. If we don't, then the CIL checkpoint can race with us and + * we can run checkpoint completion before we've updated and unlocked + * the log items. This affects (at least) processing of stale buffers, + * inodes and EFIs. + */ + xfs_trans_free_items(tp, *commit_lsn, 0); + /* check for background commit before unlock */ if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log)) push = 1; + up_read(&log->l_cilp->xc_ctx_lock); /* diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index fdca7416c754..1c47edaea0d2 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -1167,7 +1167,7 @@ xfs_trans_del_item( * Unlock all of the items of a transaction and free all the descriptors * of that transaction. */ -STATIC void +void xfs_trans_free_items( struct xfs_trans *tp, xfs_lsn_t commit_lsn, @@ -1653,9 +1653,6 @@ xfs_trans_commit_cil( return error; current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); - - /* xfs_trans_free_items() unlocks them first */ - xfs_trans_free_items(tp, *commit_lsn, 0); xfs_trans_free(tp); return 0; } diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index e2d93d8ead7b..62da86c90de5 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -25,7 +25,8 @@ struct xfs_trans; void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); void xfs_trans_del_item(struct xfs_log_item *); - +void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn, + int flags); void xfs_trans_item_committed(struct xfs_log_item *lip, xfs_lsn_t commit_lsn, int aborted); void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); -- cgit v1.2.3 From 5b3eed756cd37255cad1181bd86bfd0977e97953 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 24 Aug 2010 11:42:41 +1000 Subject: xfs: ensure we mark all inodes in a freed cluster XFS_ISTALE Under heavy load parallel metadata loads (e.g. dbench), we can fail to mark all the inodes in a cluster being freed as XFS_ISTALE as we skip inodes we cannot get the XFS_ILOCK_EXCL or the flush lock on. When this happens and the inode cluster buffer has already been marked stale and freed, inode reclaim can try to write the inode out as it is dirty and not marked stale. This can result in writing th metadata to an freed extent, or in the case it has already been overwritten trigger a magic number check failure and return an EUCLEAN error such as: Filesystem "ram0": inode 0x442ba1 background reclaim flush failed with 117 Fix this by ensuring that we hoover up all in memory inodes in the cluster and mark them XFS_ISTALE when freeing the cluster. Cc: Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_inode.c | 49 ++++++++++++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 23 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 68415cb4f23c..34798f391c49 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1914,6 +1914,11 @@ xfs_iunlink_remove( return 0; } +/* + * A big issue when freeing the inode cluster is is that we _cannot_ skip any + * inodes that are in memory - they all must be marked stale and attached to + * the cluster buffer. + */ STATIC void xfs_ifree_cluster( xfs_inode_t *free_ip, @@ -1945,8 +1950,6 @@ xfs_ifree_cluster( } for (j = 0; j < nbufs; j++, inum += ninodes) { - int found = 0; - blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), XFS_INO_TO_AGBNO(mp, inum)); @@ -1965,7 +1968,9 @@ xfs_ifree_cluster( /* * Walk the inodes already attached to the buffer and mark them * stale. These will all have the flush locks held, so an - * in-memory inode walk can't lock them. + * in-memory inode walk can't lock them. By marking them all + * stale first, we will not attempt to lock them in the loop + * below as the XFS_ISTALE flag will be set. */ lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); while (lip) { @@ -1977,11 +1982,11 @@ xfs_ifree_cluster( &iip->ili_flush_lsn, &iip->ili_item.li_lsn); xfs_iflags_set(iip->ili_inode, XFS_ISTALE); - found++; } lip = lip->li_bio_list; } + /* * For each inode in memory attempt to add it to the inode * buffer and set it up for being staled on buffer IO @@ -1993,6 +1998,7 @@ xfs_ifree_cluster( * even trying to lock them. */ for (i = 0; i < ninodes; i++) { +retry: read_lock(&pag->pag_ici_lock); ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, (inum + i))); @@ -2003,38 +2009,36 @@ xfs_ifree_cluster( continue; } - /* don't try to lock/unlock the current inode */ + /* + * Don't try to lock/unlock the current inode, but we + * _cannot_ skip the other inodes that we did not find + * in the list attached to the buffer and are not + * already marked stale. If we can't lock it, back off + * and retry. + */ if (ip != free_ip && !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { read_unlock(&pag->pag_ici_lock); - continue; + delay(1); + goto retry; } read_unlock(&pag->pag_ici_lock); - if (!xfs_iflock_nowait(ip)) { - if (ip != free_ip) - xfs_iunlock(ip, XFS_ILOCK_EXCL); - continue; - } - + xfs_iflock(ip); xfs_iflags_set(ip, XFS_ISTALE); - if (xfs_inode_clean(ip)) { - ASSERT(ip != free_ip); - xfs_ifunlock(ip); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - continue; - } + /* + * we don't need to attach clean inodes or those only + * with unlogged changes (which we throw away, anyway). + */ iip = ip->i_itemp; - if (!iip) { - /* inode with unlogged changes only */ + if (!iip || xfs_inode_clean(ip)) { ASSERT(ip != free_ip); ip->i_update_core = 0; xfs_ifunlock(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); continue; } - found++; iip->ili_last_fields = iip->ili_format.ilf_fields; iip->ili_format.ilf_fields = 0; @@ -2049,8 +2053,7 @@ xfs_ifree_cluster( xfs_iunlock(ip, XFS_ILOCK_EXCL); } - if (found) - xfs_trans_stale_inode_buf(tp, bp); + xfs_trans_stale_inode_buf(tp, bp); xfs_trans_binval(tp, bp); } -- cgit v1.2.3 From 4536f2ad8b330453d7ebec0746c4374eadd649b1 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 24 Aug 2010 11:42:30 +1000 Subject: xfs: fix untrusted inode number lookup Commit 7124fe0a5b619d65b739477b3b55a20bf805b06d ("xfs: validate untrusted inode numbers during lookup") changes the inode lookup code to do btree lookups for untrusted inode numbers. This change made an invalid assumption about the alignment of inodes and hence incorrectly calculated the first inode in the cluster. As a result, some inode numbers were being incorrectly considered invalid when they were actually valid. The issue was not picked up by the xfstests suite because it always runs fsr and dump (the two utilities that utilise the bulkstat interface) on cache hot inodes and hence the lookup code in the cold cache path was not sufficiently exercised to uncover this intermittent problem. Fix the issue by relaxing the btree lookup criteria and then checking if the record returned contains the inode number we are lookup for. If it we get an incorrect record, then the inode number is invalid. Cc: Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_ialloc.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index abf80ae1e95b..5371d2dc360e 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -1213,7 +1213,6 @@ xfs_imap_lookup( struct xfs_inobt_rec_incore rec; struct xfs_btree_cur *cur; struct xfs_buf *agbp; - xfs_agino_t startino; int error; int i; @@ -1227,13 +1226,13 @@ xfs_imap_lookup( } /* - * derive and lookup the exact inode record for the given agino. If the - * record cannot be found, then it's an invalid inode number and we - * should abort. + * Lookup the inode record for the given agino. If the record cannot be + * found, then it's an invalid inode number and we should abort. Once + * we have a record, we need to ensure it contains the inode number + * we are looking up. */ cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); - startino = agino & ~(XFS_IALLOC_INODES(mp) - 1); - error = xfs_inobt_lookup(cur, startino, XFS_LOOKUP_EQ, &i); + error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i); if (!error) { if (i) error = xfs_inobt_get_rec(cur, &rec, &i); @@ -1246,6 +1245,11 @@ xfs_imap_lookup( if (error) return error; + /* check that the returned record contains the required inode */ + if (rec.ir_startino > agino || + rec.ir_startino + XFS_IALLOC_INODES(mp) <= agino) + return EINVAL; + /* for untrusted inodes check it is allocated first */ if ((flags & XFS_IGET_UNTRUSTED) && (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino))) -- cgit v1.2.3 From efceab1d563153a2b1a6e7d35376241a48126989 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 24 Aug 2010 11:44:56 +1000 Subject: xfs: handle negative wbc->nr_to_write during sync writeback During data integrity (WB_SYNC_ALL) writeback, wbc->nr_to_write will go negative on inodes with more than 1024 dirty pages due to implementation details of write_cache_pages(). Currently XFS will abort page clustering in writeback once nr_to_write drops below zero, and so for data integrity writeback we will do very inefficient page at a time allocation and IO submission for inodes with large numbers of dirty pages. Fix this by only aborting the page clustering code when wbc->nr_to_write is negative and the sync mode is WB_SYNC_NONE. Cc: Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/linux-2.6/xfs_aops.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 15412fe15c3a..528be1ba1402 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -852,8 +852,8 @@ xfs_convert_page( SetPageUptodate(page); if (count) { - wbc->nr_to_write--; - if (wbc->nr_to_write <= 0) + if (--wbc->nr_to_write <= 0 && + wbc->sync_mode == WB_SYNC_NONE) done = 1; } xfs_start_page_writeback(page, !page_dirty, count); -- cgit v1.2.3 From 2fe33661fcd79d4c53022509f7223d526b5fa233 Mon Sep 17 00:00:00 2001 From: Stuart Brodsky Date: Tue, 24 Aug 2010 11:46:05 +1000 Subject: xfs: ensure f_ffree returned by statfs() is non-negative Because of delayed updates to sb_icount field in the super block, it is possible to allocate over maxicount number of inodes. This causes the arithmetic to calculate a negative number of free inodes in user commands like df or stat -f. Since maxicount is a somewhat arbitrary number, a slight over allocation is not critical but user commands should be displayed as 0 or greater and never go negative. To do this the value in the stats buffer f_ffree is capped to never go negative. [ Modified to use max_t as per Christoph's comment. ] Signed-off-by: Stu Brodsky Signed-off-by: Dave Chinner --- fs/xfs/linux-2.6/xfs_super.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 15c35b62ff14..c6b24e7c308a 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1226,6 +1226,7 @@ xfs_fs_statfs( struct xfs_inode *ip = XFS_I(dentry->d_inode); __uint64_t fakeinos, id; xfs_extlen_t lsize; + __int64_t ffree; statp->f_type = XFS_SB_MAGIC; statp->f_namelen = MAXNAMELEN - 1; @@ -1249,7 +1250,11 @@ xfs_fs_statfs( statp->f_files = min_t(typeof(statp->f_files), statp->f_files, mp->m_maxicount); - statp->f_ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree); + + /* make sure statp->f_ffree does not underflow */ + ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree); + statp->f_ffree = max_t(__int64_t, ffree, 0); + spin_unlock(&mp->m_sb_lock); if ((ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) || -- cgit v1.2.3 From 1a387d3be2b30c90f20d49a3497a8fc0693a9d18 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 24 Aug 2010 11:46:31 +1000 Subject: xfs: dummy transactions should not dirty VFS state When we need to cover the log, we issue dummy transactions to ensure the current log tail is on disk. Unfortunately we currently use the root inode in the dummy transaction, and the act of committing the transaction dirties the inode at the VFS level. As a result, the VFS writeback of the dirty inode will prevent the filesystem from idling long enough for the log covering state machine to complete. The state machine gets stuck in a loop issuing new dummy transactions to cover the log and never makes progress. To avoid this problem, the dummy transactions should not cause externally visible state changes. To ensure this occurs, make sure that dummy transactions log an unchanging field in the superblock as it's state is never propagated outside the filesystem. This allows the log covering state machine to complete successfully and the filesystem now correctly enters a fully idle state about 90s after the last modification was made. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/linux-2.6/xfs_super.c | 2 +- fs/xfs/linux-2.6/xfs_sync.c | 42 ++++++------------------------------------ fs/xfs/xfs_fsops.c | 31 ++++++++++++++++++------------- fs/xfs/xfs_fsops.h | 2 +- 4 files changed, 26 insertions(+), 51 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index c6b24e7c308a..a4e07974955b 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1407,7 +1407,7 @@ xfs_fs_freeze( xfs_save_resvblks(mp); xfs_quiesce_attr(mp); - return -xfs_fs_log_dummy(mp); + return -xfs_fs_log_dummy(mp, SYNC_WAIT); } STATIC int diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index dfcbd98d1599..d59c4a65d492 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -34,6 +34,7 @@ #include "xfs_inode_item.h" #include "xfs_quota.h" #include "xfs_trace.h" +#include "xfs_fsops.h" #include #include @@ -340,38 +341,6 @@ xfs_sync_attr( XFS_ICI_NO_TAG, 0, NULL); } -STATIC int -xfs_commit_dummy_trans( - struct xfs_mount *mp, - uint flags) -{ - struct xfs_inode *ip = mp->m_rootip; - struct xfs_trans *tp; - int error; - - /* - * Put a dummy transaction in the log to tell recovery - * that all others are OK. - */ - tp = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1); - error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0); - if (error) { - xfs_trans_cancel(tp, 0); - return error; - } - - xfs_ilock(ip, XFS_ILOCK_EXCL); - - xfs_trans_ijoin(tp, ip); - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - error = xfs_trans_commit(tp, 0); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - - /* the log force ensures this transaction is pushed to disk */ - xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0); - return error; -} - STATIC int xfs_sync_fsdata( struct xfs_mount *mp) @@ -432,7 +401,7 @@ xfs_quiesce_data( /* mark the log as covered if needed */ if (xfs_log_need_covered(mp)) - error2 = xfs_commit_dummy_trans(mp, SYNC_WAIT); + error2 = xfs_fs_log_dummy(mp, SYNC_WAIT); /* flush data-only devices */ if (mp->m_rtdev_targp) @@ -563,7 +532,7 @@ xfs_flush_inodes( /* * Every sync period we need to unpin all items, reclaim inodes and sync * disk quotas. We might need to cover the log to indicate that the - * filesystem is idle. + * filesystem is idle and not frozen. */ STATIC void xfs_sync_worker( @@ -577,8 +546,9 @@ xfs_sync_worker( xfs_reclaim_inodes(mp, 0); /* dgc: errors ignored here */ error = xfs_qm_sync(mp, SYNC_TRYLOCK); - if (xfs_log_need_covered(mp)) - error = xfs_commit_dummy_trans(mp, 0); + if (mp->m_super->s_frozen == SB_UNFROZEN && + xfs_log_need_covered(mp)) + error = xfs_fs_log_dummy(mp, 0); } mp->m_sync_seq++; wake_up(&mp->m_wait_single_sync_task); diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index dbca5f5c37ba..43b1d5699335 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -604,31 +604,36 @@ out: return 0; } +/* + * Dump a transaction into the log that contains no real change. This is needed + * to be able to make the log dirty or stamp the current tail LSN into the log + * during the covering operation. + * + * We cannot use an inode here for this - that will push dirty state back up + * into the VFS and then periodic inode flushing will prevent log covering from + * making progress. Hence we log a field in the superblock instead. + */ int xfs_fs_log_dummy( - xfs_mount_t *mp) + xfs_mount_t *mp, + int flags) { xfs_trans_t *tp; - xfs_inode_t *ip; int error; tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP); - error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0); + error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, + XFS_DEFAULT_LOG_COUNT); if (error) { xfs_trans_cancel(tp, 0); return error; } - ip = mp->m_rootip; - xfs_ilock(ip, XFS_ILOCK_EXCL); - - xfs_trans_ijoin(tp, ip); - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0); - - xfs_iunlock(ip, XFS_ILOCK_EXCL); - return error; + /* log the UUID because it is an unchanging field */ + xfs_mod_sb(tp, XFS_SB_UUID); + if (flags & SYNC_WAIT) + xfs_trans_set_sync(tp); + return xfs_trans_commit(tp, 0); } int diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index 88435e0a77c9..a786c5212c1e 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h @@ -25,6 +25,6 @@ extern int xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt); extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval, xfs_fsop_resblks_t *outval); extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags); -extern int xfs_fs_log_dummy(xfs_mount_t *mp); +extern int xfs_fs_log_dummy(xfs_mount_t *mp, int flags); #endif /* __XFS_FSOPS_H__ */ -- cgit v1.2.3 From a44f13edf0ebb4e41942d0f16ca80489dcf6659d Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 24 Aug 2010 11:40:03 +1000 Subject: xfs: Reduce log force overhead for delayed logging Delayed logging adds some serialisation to the log force process to ensure that it does not deference a bad commit context structure when determining if a CIL push is necessary or not. It does this by grabing the CIL context lock exclusively, then dropping it before pushing the CIL if necessary. This causes serialisation of all log forces and pushes regardless of whether a force is necessary or not. As a result fsync heavy workloads (like dbench) can be significantly slower with delayed logging than without. To avoid this penalty, copy the current sequence from the context to the CIL structure when they are swapped. This allows us to do unlocked checks on the current sequence without having to worry about dereferencing context structures that may have already been freed. Hence we can remove the CIL context locking in the forcing code and only call into the push code if the current context matches the sequence we need to force. By passing the sequence into the push code, we can check the sequence again once we have the CIL lock held exclusive and abort if the sequence has already been pushed. This avoids a lock round-trip and unnecessary CIL pushes when we have racing push calls. The result is that the regression in dbench performance goes away - this change improves dbench performance on a ramdisk from ~2100MB/s to ~2500MB/s. This compares favourably to not using delayed logging which retuns ~2500MB/s for the same workload. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_log.c | 7 +- fs/xfs/xfs_log_cil.c | 245 +++++++++++++++++++++++++++----------------------- fs/xfs/xfs_log_priv.h | 13 ++- 3 files changed, 147 insertions(+), 118 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 925d572bf0f4..33f718f92a48 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -3015,7 +3015,8 @@ _xfs_log_force( XFS_STATS_INC(xs_log_force); - xlog_cil_push(log, 1); + if (log->l_cilp) + xlog_cil_force(log); spin_lock(&log->l_icloglock); @@ -3167,7 +3168,7 @@ _xfs_log_force_lsn( XFS_STATS_INC(xs_log_force); if (log->l_cilp) { - lsn = xlog_cil_push_lsn(log, lsn); + lsn = xlog_cil_force_lsn(log, lsn); if (lsn == NULLCOMMITLSN) return 0; } @@ -3724,7 +3725,7 @@ xfs_log_force_umount( * call below. */ if (!logerror && (mp->m_flags & XFS_MOUNT_DELAYLOG)) - xlog_cil_push(log, 1); + xlog_cil_force(log); /* * We must hold both the GRANT lock and the LOG lock, diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index ef8e7d9f445d..9768f2437bb3 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -68,6 +68,7 @@ xlog_cil_init( ctx->sequence = 1; ctx->cil = cil; cil->xc_ctx = ctx; + cil->xc_current_sequence = ctx->sequence; cil->xc_log = log; log->l_cilp = cil; @@ -320,94 +321,6 @@ xlog_cil_free_logvec( } } -/* - * Commit a transaction with the given vector to the Committed Item List. - * - * To do this, we need to format the item, pin it in memory if required and - * account for the space used by the transaction. Once we have done that we - * need to release the unused reservation for the transaction, attach the - * transaction to the checkpoint context so we carry the busy extents through - * to checkpoint completion, and then unlock all the items in the transaction. - * - * For more specific information about the order of operations in - * xfs_log_commit_cil() please refer to the comments in - * xfs_trans_commit_iclog(). - * - * Called with the context lock already held in read mode to lock out - * background commit, returns without it held once background commits are - * allowed again. - */ -int -xfs_log_commit_cil( - struct xfs_mount *mp, - struct xfs_trans *tp, - struct xfs_log_vec *log_vector, - xfs_lsn_t *commit_lsn, - int flags) -{ - struct log *log = mp->m_log; - int log_flags = 0; - int push = 0; - - if (flags & XFS_TRANS_RELEASE_LOG_RES) - log_flags = XFS_LOG_REL_PERM_RESERV; - - if (XLOG_FORCED_SHUTDOWN(log)) { - xlog_cil_free_logvec(log_vector); - return XFS_ERROR(EIO); - } - - /* lock out background commit */ - down_read(&log->l_cilp->xc_ctx_lock); - xlog_cil_format_items(log, log_vector, tp->t_ticket, commit_lsn); - - /* check we didn't blow the reservation */ - if (tp->t_ticket->t_curr_res < 0) - xlog_print_tic_res(log->l_mp, tp->t_ticket); - - /* attach the transaction to the CIL if it has any busy extents */ - if (!list_empty(&tp->t_busy)) { - spin_lock(&log->l_cilp->xc_cil_lock); - list_splice_init(&tp->t_busy, - &log->l_cilp->xc_ctx->busy_extents); - spin_unlock(&log->l_cilp->xc_cil_lock); - } - - tp->t_commit_lsn = *commit_lsn; - xfs_log_done(mp, tp->t_ticket, NULL, log_flags); - xfs_trans_unreserve_and_mod_sb(tp); - - /* - * Once all the items of the transaction have been copied to the CIL, - * the items can be unlocked and freed. - * - * This needs to be done before we drop the CIL context lock because we - * have to update state in the log items and unlock them before they go - * to disk. If we don't, then the CIL checkpoint can race with us and - * we can run checkpoint completion before we've updated and unlocked - * the log items. This affects (at least) processing of stale buffers, - * inodes and EFIs. - */ - xfs_trans_free_items(tp, *commit_lsn, 0); - - /* check for background commit before unlock */ - if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log)) - push = 1; - - up_read(&log->l_cilp->xc_ctx_lock); - - /* - * We need to push CIL every so often so we don't cache more than we - * can fit in the log. The limit really is that a checkpoint can't be - * more than half the log (the current checkpoint is not allowed to - * overwrite the previous checkpoint), but commit latency and memory - * usage limit this to a smaller size in most cases. - */ - if (push) - xlog_cil_push(log, 0); - return 0; -} - /* * Mark all items committed and clear busy extents. We free the log vector * chains in a separate pass so that we unpin the log items as quickly as @@ -441,13 +354,23 @@ xlog_cil_committed( } /* - * Push the Committed Item List to the log. If the push_now flag is not set, - * then it is a background flush and so we can chose to ignore it. + * Push the Committed Item List to the log. If @push_seq flag is zero, then it + * is a background flush and so we can chose to ignore it. Otherwise, if the + * current sequence is the same as @push_seq we need to do a flush. If + * @push_seq is less than the current sequence, then it has already been + * flushed and we don't need to do anything - the caller will wait for it to + * complete if necessary. + * + * @push_seq is a value rather than a flag because that allows us to do an + * unlocked check of the sequence number for a match. Hence we can allows log + * forces to run racily and not issue pushes for the same sequence twice. If we + * get a race between multiple pushes for the same sequence they will block on + * the first one and then abort, hence avoiding needless pushes. */ -int +STATIC int xlog_cil_push( struct log *log, - int push_now) + xfs_lsn_t push_seq) { struct xfs_cil *cil = log->l_cilp; struct xfs_log_vec *lv; @@ -467,12 +390,14 @@ xlog_cil_push( if (!cil) return 0; + ASSERT(!push_seq || push_seq <= cil->xc_ctx->sequence); + new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS); new_ctx->ticket = xlog_cil_ticket_alloc(log); /* lock out transaction commit, but don't block on background push */ if (!down_write_trylock(&cil->xc_ctx_lock)) { - if (!push_now) + if (!push_seq) goto out_free_ticket; down_write(&cil->xc_ctx_lock); } @@ -483,7 +408,11 @@ xlog_cil_push( goto out_skip; /* check for spurious background flush */ - if (!push_now && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) + if (!push_seq && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) + goto out_skip; + + /* check for a previously pushed seqeunce */ + if (push_seq < cil->xc_ctx->sequence) goto out_skip; /* @@ -528,6 +457,13 @@ xlog_cil_push( new_ctx->cil = cil; cil->xc_ctx = new_ctx; + /* + * mirror the new sequence into the cil structure so that we can do + * unlocked checks against the current sequence in log forces without + * risking deferencing a freed context pointer. + */ + cil->xc_current_sequence = new_ctx->sequence; + /* * The switch is now done, so we can drop the context lock and move out * of a shared context. We can't just go straight to the commit record, @@ -639,6 +575,94 @@ out_abort: return XFS_ERROR(EIO); } +/* + * Commit a transaction with the given vector to the Committed Item List. + * + * To do this, we need to format the item, pin it in memory if required and + * account for the space used by the transaction. Once we have done that we + * need to release the unused reservation for the transaction, attach the + * transaction to the checkpoint context so we carry the busy extents through + * to checkpoint completion, and then unlock all the items in the transaction. + * + * For more specific information about the order of operations in + * xfs_log_commit_cil() please refer to the comments in + * xfs_trans_commit_iclog(). + * + * Called with the context lock already held in read mode to lock out + * background commit, returns without it held once background commits are + * allowed again. + */ +int +xfs_log_commit_cil( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_log_vec *log_vector, + xfs_lsn_t *commit_lsn, + int flags) +{ + struct log *log = mp->m_log; + int log_flags = 0; + int push = 0; + + if (flags & XFS_TRANS_RELEASE_LOG_RES) + log_flags = XFS_LOG_REL_PERM_RESERV; + + if (XLOG_FORCED_SHUTDOWN(log)) { + xlog_cil_free_logvec(log_vector); + return XFS_ERROR(EIO); + } + + /* lock out background commit */ + down_read(&log->l_cilp->xc_ctx_lock); + xlog_cil_format_items(log, log_vector, tp->t_ticket, commit_lsn); + + /* check we didn't blow the reservation */ + if (tp->t_ticket->t_curr_res < 0) + xlog_print_tic_res(log->l_mp, tp->t_ticket); + + /* attach the transaction to the CIL if it has any busy extents */ + if (!list_empty(&tp->t_busy)) { + spin_lock(&log->l_cilp->xc_cil_lock); + list_splice_init(&tp->t_busy, + &log->l_cilp->xc_ctx->busy_extents); + spin_unlock(&log->l_cilp->xc_cil_lock); + } + + tp->t_commit_lsn = *commit_lsn; + xfs_log_done(mp, tp->t_ticket, NULL, log_flags); + xfs_trans_unreserve_and_mod_sb(tp); + + /* + * Once all the items of the transaction have been copied to the CIL, + * the items can be unlocked and freed. + * + * This needs to be done before we drop the CIL context lock because we + * have to update state in the log items and unlock them before they go + * to disk. If we don't, then the CIL checkpoint can race with us and + * we can run checkpoint completion before we've updated and unlocked + * the log items. This affects (at least) processing of stale buffers, + * inodes and EFIs. + */ + xfs_trans_free_items(tp, *commit_lsn, 0); + + /* check for background commit before unlock */ + if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log)) + push = 1; + + up_read(&log->l_cilp->xc_ctx_lock); + + /* + * We need to push CIL every so often so we don't cache more than we + * can fit in the log. The limit really is that a checkpoint can't be + * more than half the log (the current checkpoint is not allowed to + * overwrite the previous checkpoint), but commit latency and memory + * usage limit this to a smaller size in most cases. + */ + if (push) + xlog_cil_push(log, 0); + return 0; +} + /* * Conditionally push the CIL based on the sequence passed in. * @@ -653,39 +677,34 @@ out_abort: * commit lsn is there. It'll be empty, so this is broken for now. */ xfs_lsn_t -xlog_cil_push_lsn( +xlog_cil_force_lsn( struct log *log, - xfs_lsn_t push_seq) + xfs_lsn_t sequence) { struct xfs_cil *cil = log->l_cilp; struct xfs_cil_ctx *ctx; xfs_lsn_t commit_lsn = NULLCOMMITLSN; -restart: - down_write(&cil->xc_ctx_lock); - ASSERT(push_seq <= cil->xc_ctx->sequence); - - /* check to see if we need to force out the current context */ - if (push_seq == cil->xc_ctx->sequence) { - up_write(&cil->xc_ctx_lock); - xlog_cil_push(log, 1); - goto restart; - } + ASSERT(sequence <= cil->xc_current_sequence); + + /* + * check to see if we need to force out the current context. + * xlog_cil_push() handles racing pushes for the same sequence, + * so no need to deal with it here. + */ + if (sequence == cil->xc_current_sequence) + xlog_cil_push(log, sequence); /* * See if we can find a previous sequence still committing. - * We can drop the flush lock as soon as we have the cil lock - * because we are now only comparing contexts protected by - * the cil lock. - * * We need to wait for all previous sequence commits to complete * before allowing the force of push_seq to go ahead. Hence block * on commits for those as well. */ +restart: spin_lock(&cil->xc_cil_lock); - up_write(&cil->xc_ctx_lock); list_for_each_entry(ctx, &cil->xc_committing, committing) { - if (ctx->sequence > push_seq) + if (ctx->sequence > sequence) continue; if (!ctx->commit_lsn) { /* @@ -695,7 +714,7 @@ restart: sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0); goto restart; } - if (ctx->sequence != push_seq) + if (ctx->sequence != sequence) continue; /* found it! */ commit_lsn = ctx->commit_lsn; diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 8c072618965c..ced52b98b322 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -422,6 +422,7 @@ struct xfs_cil { struct rw_semaphore xc_ctx_lock; struct list_head xc_committing; sv_t xc_commit_wait; + xfs_lsn_t xc_current_sequence; }; /* @@ -562,8 +563,16 @@ int xlog_cil_init(struct log *log); void xlog_cil_init_post_recovery(struct log *log); void xlog_cil_destroy(struct log *log); -int xlog_cil_push(struct log *log, int push_now); -xfs_lsn_t xlog_cil_push_lsn(struct log *log, xfs_lsn_t push_sequence); +/* + * CIL force routines + */ +xfs_lsn_t xlog_cil_force_lsn(struct log *log, xfs_lsn_t sequence); + +static inline void +xlog_cil_force(struct log *log) +{ + xlog_cil_force_lsn(log, log->l_cilp->xc_current_sequence); +} /* * Unmount record type is used as a pseudo transaction type for the ticket. -- cgit v1.2.3 From 3b93c7aaefc05ee2a75e2726929b01a321402984 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 24 Aug 2010 11:45:53 +1000 Subject: xfs: don't do memory allocation under the CIL context lock Formatting items requires memory allocation when using delayed logging. Currently that memory allocation is done while holding the CIL context lock in read mode. This means that if memory allocation takes some time (e.g. enters reclaim), we cannot push on the CIL until the allocation(s) required by formatting complete. This can stall CIL pushes for some time, and once a push is stalled so are all new transaction commits. Fix this splitting the item formatting into two steps. The first step which does the allocation and memcpy() into the allocated buffer is now done outside the CIL context lock, and only the CIL insert is done inside the CIL context lock. This avoids the stall issue. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_log_cil.c | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 9768f2437bb3..ed575fb4b495 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -270,15 +270,10 @@ xlog_cil_insert( static void xlog_cil_format_items( struct log *log, - struct xfs_log_vec *log_vector, - struct xlog_ticket *ticket, - xfs_lsn_t *start_lsn) + struct xfs_log_vec *log_vector) { struct xfs_log_vec *lv; - if (start_lsn) - *start_lsn = log->l_cilp->xc_ctx->sequence; - ASSERT(log_vector); for (lv = log_vector; lv; lv = lv->lv_next) { void *ptr; @@ -302,9 +297,24 @@ xlog_cil_format_items( ptr += vec->i_len; } ASSERT(ptr == lv->lv_buf + lv->lv_buf_len); + } +} +static void +xlog_cil_insert_items( + struct log *log, + struct xfs_log_vec *log_vector, + struct xlog_ticket *ticket, + xfs_lsn_t *start_lsn) +{ + struct xfs_log_vec *lv; + + if (start_lsn) + *start_lsn = log->l_cilp->xc_ctx->sequence; + + ASSERT(log_vector); + for (lv = log_vector; lv; lv = lv->lv_next) xlog_cil_insert(log, ticket, lv->lv_item, lv); - } } static void @@ -612,9 +622,17 @@ xfs_log_commit_cil( return XFS_ERROR(EIO); } + /* + * do all the hard work of formatting items (including memory + * allocation) outside the CIL context lock. This prevents stalling CIL + * pushes when we are low on memory and a transaction commit spends a + * lot of time in memory reclaim. + */ + xlog_cil_format_items(log, log_vector); + /* lock out background commit */ down_read(&log->l_cilp->xc_ctx_lock); - xlog_cil_format_items(log, log_vector, tp->t_ticket, commit_lsn); + xlog_cil_insert_items(log, log_vector, tp->t_ticket, commit_lsn); /* check we didn't blow the reservation */ if (tp->t_ticket->t_curr_res < 0) -- cgit v1.2.3 From b5420f235953448eeae615b3361584dc5e414f34 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Aug 2010 11:47:51 +1000 Subject: xfs: do not discard page cache data on EAGAIN If xfs_map_blocks returns EAGAIN because of lock contention we must redirty the page and not disard the pagecache content and return an error from writepage. We used to do this correctly, but the logic got lost during the recent reshuffle of the writepage code. Signed-off-by: Christoph Hellwig Reported-by: Mike Gao Tested-by: Mike Gao Reviewed-by: Dave Chinner Signed-off-by: Dave Chinner --- fs/xfs/linux-2.6/xfs_aops.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 528be1ba1402..b552f816de15 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -1068,7 +1068,7 @@ xfs_vm_writepage( * by themselves. */ if ((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == PF_MEMALLOC) - goto out_fail; + goto redirty; /* * We need a transaction if there are delalloc or unwritten buffers @@ -1080,7 +1080,7 @@ xfs_vm_writepage( */ xfs_count_page_state(page, &delalloc, &unwritten); if ((current->flags & PF_FSTRANS) && (delalloc || unwritten)) - goto out_fail; + goto redirty; /* Is this page beyond the end of the file? */ offset = i_size_read(inode); @@ -1245,12 +1245,15 @@ error: if (iohead) xfs_cancel_ioend(iohead); + if (err == -EAGAIN) + goto redirty; + xfs_aops_discard_page(page); ClearPageUptodate(page); unlock_page(page); return err; -out_fail: +redirty: redirty_page_for_writepage(wbc, page); unlock_page(page); return 0; -- cgit v1.2.3 From 2d20ca835867d93ead6ce61780d883a4b128106d Mon Sep 17 00:00:00 2001 From: "shirishpargaonkar@gmail.com" Date: Tue, 24 Aug 2010 11:53:48 -0500 Subject: Eliminate sparse warning - bad constant expression Eliminiate sparse warning during usage of crypto_shash_* APIs error: bad constant expression Allocate memory for shash descriptors once, so that we do not kmalloc/kfree it for every signature generation (shash descriptor for md5 hash). From ed7538619817777decc44b5660b52268077b74f3 Mon Sep 17 00:00:00 2001 From: Shirish Pargaonkar Date: Tue, 24 Aug 2010 11:47:43 -0500 Subject: [PATCH] eliminate sparse warnings during crypto_shash_* APis usage Signed-off-by: Shirish Pargaonkar Signed-off-by: Steve French --- fs/cifs/cifsencrypt.c | 193 +++++++++++++++++++++++++++++++------------------- fs/cifs/cifsglob.h | 7 ++ 2 files changed, 128 insertions(+), 72 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index eef78c24e0cc..709f2296bdb4 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -45,39 +45,38 @@ extern void SMBencrypt(unsigned char *passwd, const unsigned char *c8, static int cifs_calculate_signature(const struct smb_hdr *cifs_pdu, struct TCP_Server_Info *server, char *signature) { - int rc = 0; - struct { - struct shash_desc shash; - char ctx[crypto_shash_descsize(server->ntlmssp.md5)]; - } sdesc; + int rc; if (cifs_pdu == NULL || server == NULL || signature == NULL) return -EINVAL; - sdesc.shash.tfm = server->ntlmssp.md5; - sdesc.shash.flags = 0x0; + if (!server->ntlmssp.sdescmd5) { + cERROR(1, + "cifs_calculate_signature: can't generate signature\n"); + return -1; + } - rc = crypto_shash_init(&sdesc.shash); + rc = crypto_shash_init(&server->ntlmssp.sdescmd5->shash); if (rc) { - cERROR(1, "could not initialize master crypto API hmacmd5\n"); + cERROR(1, "cifs_calculate_signature: oould not init md5\n"); return rc; } if (server->secType == RawNTLMSSP) - crypto_shash_update(&sdesc.shash, + crypto_shash_update(&server->ntlmssp.sdescmd5->shash, server->session_key.data.ntlmv2.key, CIFS_NTLMV2_SESSKEY_SIZE); else - crypto_shash_update(&sdesc.shash, + crypto_shash_update(&server->ntlmssp.sdescmd5->shash, (char *)&server->session_key.data, server->session_key.len); - crypto_shash_update(&sdesc.shash, + crypto_shash_update(&server->ntlmssp.sdescmd5->shash, cifs_pdu->Protocol, cifs_pdu->smb_buf_length); - rc = crypto_shash_final(&sdesc.shash, signature); + rc = crypto_shash_final(&server->ntlmssp.sdescmd5->shash, signature); - return 0; + return rc; } @@ -115,30 +114,28 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec, struct TCP_Server_Info *server, char *signature) { int i; - int rc = 0; - struct { - struct shash_desc shash; - char ctx[crypto_shash_descsize(server->ntlmssp.md5)]; - } sdesc; + int rc; if (iov == NULL || server == NULL || signature == NULL) return -EINVAL; - sdesc.shash.tfm = server->ntlmssp.md5; - sdesc.shash.flags = 0x0; + if (!server->ntlmssp.sdescmd5) { + cERROR(1, "cifs_calc_signature2: can't generate signature\n"); + return -1; + } - rc = crypto_shash_init(&sdesc.shash); + rc = crypto_shash_init(&server->ntlmssp.sdescmd5->shash); if (rc) { - cERROR(1, "could not initialize master crypto API hmacmd5\n"); + cERROR(1, "cifs_calc_signature2: oould not init md5\n"); return rc; } if (server->secType == RawNTLMSSP) - crypto_shash_update(&sdesc.shash, + crypto_shash_update(&server->ntlmssp.sdescmd5->shash, server->session_key.data.ntlmv2.key, CIFS_NTLMV2_SESSKEY_SIZE); else - crypto_shash_update(&sdesc.shash, + crypto_shash_update(&server->ntlmssp.sdescmd5->shash, (char *)&server->session_key.data, server->session_key.len); @@ -146,7 +143,7 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec, if (iov[i].iov_len == 0) continue; if (iov[i].iov_base == NULL) { - cERROR(1, "null iovec entry"); + cERROR(1, "cifs_calc_signature2: null iovec entry"); return -EIO; } /* The first entry includes a length field (which does not get @@ -154,16 +151,16 @@ static int cifs_calc_signature2(const struct kvec *iov, int n_vec, if (i == 0) { if (iov[0].iov_len <= 8) /* cmd field at offset 9 */ break; /* nothing to sign or corrupt header */ - crypto_shash_update(&sdesc.shash, + crypto_shash_update(&server->ntlmssp.sdescmd5->shash, iov[i].iov_base + 4, iov[i].iov_len - 4); } else - crypto_shash_update(&sdesc.shash, + crypto_shash_update(&server->ntlmssp.sdescmd5->shash, iov[i].iov_base, iov[i].iov_len); } - rc = crypto_shash_final(&sdesc.shash, signature); + rc = crypto_shash_final(&server->ntlmssp.sdescmd5->shash, signature); - return 0; + return rc; } int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *server, @@ -313,43 +310,48 @@ static int calc_ntlmv2_hash(struct cifsSesInfo *ses, wchar_t *user; wchar_t *domain; wchar_t *server; - struct { - struct shash_desc shash; - char ctx[crypto_shash_descsize(ses->server->ntlmssp.hmacmd5)]; - } sdesc; + + if (!ses->server->ntlmssp.sdeschmacmd5) { + cERROR(1, "calc_ntlmv2_hash: can't generate ntlmv2 hash\n"); + return -1; + } /* calculate md4 hash of password */ E_md4hash(ses->password, nt_hash); - sdesc.shash.tfm = ses->server->ntlmssp.hmacmd5; - sdesc.shash.flags = 0x0; - crypto_shash_setkey(ses->server->ntlmssp.hmacmd5, nt_hash, CIFS_NTHASH_SIZE); - rc = crypto_shash_init(&sdesc.shash); + rc = crypto_shash_init(&ses->server->ntlmssp.sdeschmacmd5->shash); if (rc) { - cERROR(1, "could not initialize master crypto API hmacmd5\n"); + cERROR(1, "calc_ntlmv2_hash: could not init hmacmd5\n"); return rc; } /* convert ses->userName to unicode and uppercase */ len = strlen(ses->userName); user = kmalloc(2 + (len * 2), GFP_KERNEL); - if (user == NULL) + if (user == NULL) { + cERROR(1, "calc_ntlmv2_hash: user mem alloc failure\n"); + rc = -ENOMEM; goto calc_exit_2; + } len = cifs_strtoUCS((__le16 *)user, ses->userName, len, nls_cp); UniStrupr(user); - crypto_shash_update(&sdesc.shash, (char *)user, 2 * len); + crypto_shash_update(&ses->server->ntlmssp.sdeschmacmd5->shash, + (char *)user, 2 * len); /* convert ses->domainName to unicode and uppercase */ if (ses->domainName) { len = strlen(ses->domainName); domain = kmalloc(2 + (len * 2), GFP_KERNEL); - if (domain == NULL) + if (domain == NULL) { + cERROR(1, "calc_ntlmv2_hash: domain mem alloc failure"); + rc = -ENOMEM; goto calc_exit_1; + } len = cifs_strtoUCS((__le16 *)domain, ses->domainName, len, nls_cp); /* the following line was removed since it didn't work well @@ -357,15 +359,19 @@ static int calc_ntlmv2_hash(struct cifsSesInfo *ses, Maybe converting the domain name earlier makes sense */ /* UniStrupr(domain); */ - crypto_shash_update(&sdesc.shash, (char *)domain, 2 * len); + crypto_shash_update(&ses->server->ntlmssp.sdeschmacmd5->shash, + (char *)domain, 2 * len); kfree(domain); } else if (ses->serverName) { len = strlen(ses->serverName); server = kmalloc(2 + (len * 2), GFP_KERNEL); - if (server == NULL) + if (server == NULL) { + cERROR(1, "calc_ntlmv2_hash: server mem alloc failure"); + rc = -ENOMEM; goto calc_exit_1; + } len = cifs_strtoUCS((__le16 *)server, ses->serverName, len, nls_cp); /* the following line was removed since it didn't work well @@ -373,16 +379,20 @@ static int calc_ntlmv2_hash(struct cifsSesInfo *ses, Maybe converting the domain name earlier makes sense */ /* UniStrupr(domain); */ - crypto_shash_update(&sdesc.shash, (char *)server, 2 * len); + crypto_shash_update(&ses->server->ntlmssp.sdeschmacmd5->shash, + (char *)server, 2 * len); kfree(server); } + + rc = crypto_shash_final(&ses->server->ntlmssp.sdeschmacmd5->shash, + ses->server->ntlmv2_hash); + calc_exit_1: kfree(user); calc_exit_2: /* BB FIXME what about bytes 24 through 40 of the signing key? compare with the NTLM example */ - rc = crypto_shash_final(&sdesc.shash, ses->server->ntlmv2_hash); return rc; } @@ -442,34 +452,33 @@ CalcNTLMv2_response(const struct TCP_Server_Info *server, char *v2_session_response) { int rc; - struct { - struct shash_desc shash; - char ctx[crypto_shash_descsize(server->ntlmssp.hmacmd5)]; - } sdesc; - sdesc.shash.tfm = server->ntlmssp.hmacmd5; - sdesc.shash.flags = 0x0; + if (!server->ntlmssp.sdeschmacmd5) { + cERROR(1, "calc_ntlmv2_hash: can't generate ntlmv2 hash\n"); + return -1; + } crypto_shash_setkey(server->ntlmssp.hmacmd5, server->ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); - rc = crypto_shash_init(&sdesc.shash); + rc = crypto_shash_init(&server->ntlmssp.sdeschmacmd5->shash); if (rc) { - cERROR(1, "could not initialize master crypto API hmacmd5\n"); + cERROR(1, "CalcNTLMv2_response: could not init hmacmd5"); return rc; } memcpy(v2_session_response + CIFS_SERVER_CHALLENGE_SIZE, server->cryptKey, CIFS_SERVER_CHALLENGE_SIZE); - crypto_shash_update(&sdesc.shash, + crypto_shash_update(&server->ntlmssp.sdeschmacmd5->shash, v2_session_response + CIFS_SERVER_CHALLENGE_SIZE, sizeof(struct ntlmv2_resp) - CIFS_SERVER_CHALLENGE_SIZE); if (server->tilen) - crypto_shash_update(&sdesc.shash, + crypto_shash_update(&server->ntlmssp.sdeschmacmd5->shash, server->tiblob, server->tilen); - rc = crypto_shash_final(&sdesc.shash, v2_session_response); + rc = crypto_shash_final(&server->ntlmssp.sdeschmacmd5->shash, + v2_session_response); return rc; } @@ -480,10 +489,6 @@ setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf, { int rc = 0; struct ntlmv2_resp *buf = (struct ntlmv2_resp *)resp_buf; - struct { - struct shash_desc shash; - char ctx[crypto_shash_descsize(ses->server->ntlmssp.hmacmd5)]; - } sdesc; buf->blob_signature = cpu_to_le32(0x00000101); buf->reserved = 0; @@ -511,21 +516,24 @@ setup_ntlmv2_rsp(struct cifsSesInfo *ses, char *resp_buf, return rc; } + if (!ses->server->ntlmssp.sdeschmacmd5) { + cERROR(1, "calc_ntlmv2_hash: can't generate ntlmv2 hash\n"); + return -1; + } + crypto_shash_setkey(ses->server->ntlmssp.hmacmd5, ses->server->ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); - sdesc.shash.tfm = ses->server->ntlmssp.hmacmd5; - sdesc.shash.flags = 0x0; - - rc = crypto_shash_init(&sdesc.shash); + rc = crypto_shash_init(&ses->server->ntlmssp.sdeschmacmd5->shash); if (rc) { - cERROR(1, "could not initialize master crypto API hmacmd5\n"); + cERROR(1, "setup_ntlmv2_rsp: could not init hmacmd5\n"); return rc; } - crypto_shash_update(&sdesc.shash, resp_buf, CIFS_HMAC_MD5_HASH_SIZE); + crypto_shash_update(&ses->server->ntlmssp.sdeschmacmd5->shash, + resp_buf, CIFS_HMAC_MD5_HASH_SIZE); - rc = crypto_shash_final(&sdesc.shash, + rc = crypto_shash_final(&ses->server->ntlmssp.sdeschmacmd5->shash, ses->server->session_key.data.ntlmv2.key); memcpy(&ses->server->session_key.data.ntlmv2.resp, resp_buf, @@ -578,24 +586,65 @@ cifs_crypto_shash_release(struct TCP_Server_Info *server) if (server->ntlmssp.hmacmd5) crypto_free_shash(server->ntlmssp.hmacmd5); + + kfree(server->ntlmssp.sdeschmacmd5); + + kfree(server->ntlmssp.sdescmd5); } int cifs_crypto_shash_allocate(struct TCP_Server_Info *server) { + int rc; + unsigned int size; + server->ntlmssp.hmacmd5 = crypto_alloc_shash("hmac(md5)", 0, 0); if (!server->ntlmssp.hmacmd5 || IS_ERR(server->ntlmssp.hmacmd5)) { - cERROR(1, "could not allocate master crypto API hmacmd5\n"); + cERROR(1, "could not allocate crypto hmacmd5\n"); return 1; } server->ntlmssp.md5 = crypto_alloc_shash("md5", 0, 0); if (!server->ntlmssp.md5 || IS_ERR(server->ntlmssp.md5)) { - crypto_free_shash(server->ntlmssp.hmacmd5); - cERROR(1, "could not allocate master crypto API md5\n"); - return 1; + cERROR(1, "could not allocate crypto md5\n"); + rc = 1; + goto cifs_crypto_shash_allocate_ret1; } + size = sizeof(struct shash_desc) + + crypto_shash_descsize(server->ntlmssp.hmacmd5); + server->ntlmssp.sdeschmacmd5 = kmalloc(size, GFP_KERNEL); + if (!server->ntlmssp.sdeschmacmd5) { + cERROR(1, "cifs_crypto_shash_allocate: can't alloc hmacmd5\n"); + rc = -ENOMEM; + goto cifs_crypto_shash_allocate_ret2; + } + server->ntlmssp.sdeschmacmd5->shash.tfm = server->ntlmssp.hmacmd5; + server->ntlmssp.sdeschmacmd5->shash.flags = 0x0; + + + size = sizeof(struct shash_desc) + + crypto_shash_descsize(server->ntlmssp.md5); + server->ntlmssp.sdescmd5 = kmalloc(size, GFP_KERNEL); + if (!server->ntlmssp.sdescmd5) { + cERROR(1, "cifs_crypto_shash_allocate: can't alloc md5\n"); + rc = -ENOMEM; + goto cifs_crypto_shash_allocate_ret3; + } + server->ntlmssp.sdescmd5->shash.tfm = server->ntlmssp.md5; + server->ntlmssp.sdescmd5->shash.flags = 0x0; + return 0; + +cifs_crypto_shash_allocate_ret3: + kfree(server->ntlmssp.sdeschmacmd5); + +cifs_crypto_shash_allocate_ret2: + crypto_free_shash(server->ntlmssp.md5); + +cifs_crypto_shash_allocate_ret1: + crypto_free_shash(server->ntlmssp.hmacmd5); + + return rc; } diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 49563e0c1725..c9d0cfc086eb 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -123,12 +123,19 @@ struct cifs_cred { struct cifs_ace *aces; }; +struct sdesc { + struct shash_desc shash; + char ctx[]; +}; + struct ntlmssp_auth { __u32 client_flags; __u32 server_flags; unsigned char ciphertext[CIFS_CPHTXT_SIZE]; struct crypto_shash *hmacmd5; struct crypto_shash *md5; + struct sdesc *sdeschmacmd5; + struct sdesc *sdescmd5; }; /* -- cgit v1.2.3 From 7d8cb26d7dcb911f110b7762bd5941e8f009d6c3 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 24 Aug 2010 08:44:16 -0700 Subject: ceph: maintain i_head_snapc when any caps are dirty, not just for data We used to use i_head_snapc to keep track of which snapc the current epoch of dirty data was dirtied under. It is used by queue_cap_snap to set up the cap_snap. However, since we queue cap snaps for any dirty caps, not just for dirty file data, we need to keep a valid i_head_snapc anytime we have dirty|flushing caps. This fixes a NULL pointer deref in queue_cap_snap when writing back dirty caps without data (e.g., snaptest-authwb.sh). Signed-off-by: Sage Weil --- fs/ceph/addr.c | 4 ++-- fs/ceph/caps.c | 20 +++++++++++++++++--- fs/ceph/snap.c | 6 +++++- fs/ceph/super.h | 3 ++- 4 files changed, 26 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 420d46974ec8..4cfce1ee31fa 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -87,7 +87,7 @@ static int ceph_set_page_dirty(struct page *page) /* dirty the head */ spin_lock(&inode->i_lock); - if (ci->i_wrbuffer_ref_head == 0) + if (ci->i_head_snapc == NULL) ci->i_head_snapc = ceph_get_snap_context(snapc); ++ci->i_wrbuffer_ref_head; if (ci->i_wrbuffer_ref == 0) @@ -346,7 +346,7 @@ static struct ceph_snap_context *get_oldest_context(struct inode *inode, break; } } - if (!snapc && ci->i_head_snapc) { + if (!snapc && ci->i_wrbuffer_ref_head) { snapc = ceph_get_snap_context(ci->i_head_snapc); dout(" head snapc %p has %d dirty pages\n", snapc, ci->i_wrbuffer_ref_head); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index ba5bbf318fe1..a2069b6680ae 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1143,6 +1143,10 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, for (i = 0; i < CEPH_CAP_BITS; i++) if (flushing & (1 << i)) ci->i_cap_flush_tid[i] = flush_tid; + + follows = ci->i_head_snapc->seq; + } else { + follows = 0; } keep = cap->implemented; @@ -1156,7 +1160,6 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, mtime = inode->i_mtime; atime = inode->i_atime; time_warp_seq = ci->i_time_warp_seq; - follows = ci->i_snap_realm->cached_context->seq; uid = inode->i_uid; gid = inode->i_gid; mode = inode->i_mode; @@ -1332,7 +1335,11 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) ceph_cap_string(was | mask)); ci->i_dirty_caps |= mask; if (was == 0) { - dout(" inode %p now dirty\n", &ci->vfs_inode); + if (!ci->i_head_snapc) + ci->i_head_snapc = ceph_get_snap_context( + ci->i_snap_realm->cached_context); + dout(" inode %p now dirty snapc %p\n", &ci->vfs_inode, + ci->i_head_snapc); BUG_ON(!list_empty(&ci->i_dirty_item)); spin_lock(&mdsc->cap_dirty_lock); list_add(&ci->i_dirty_item, &mdsc->cap_dirty); @@ -2190,7 +2197,9 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, if (ci->i_head_snapc == snapc) { ci->i_wrbuffer_ref_head -= nr; - if (!ci->i_wrbuffer_ref_head) { + if (ci->i_wrbuffer_ref_head == 0 && + ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) { + BUG_ON(!ci->i_head_snapc); ceph_put_snap_context(ci->i_head_snapc); ci->i_head_snapc = NULL; } @@ -2483,6 +2492,11 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, dout(" inode %p now clean\n", inode); BUG_ON(!list_empty(&ci->i_dirty_item)); drop = 1; + if (ci->i_wrbuffer_ref_head == 0) { + BUG_ON(!ci->i_head_snapc); + ceph_put_snap_context(ci->i_head_snapc); + ci->i_head_snapc = NULL; + } } else { BUG_ON(list_empty(&ci->i_dirty_item)); } diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 6bdbf3ae7082..4868b9dcac5a 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -458,6 +458,8 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR))) { struct ceph_snap_context *snapc = ci->i_head_snapc; + dout("queue_cap_snap %p cap_snap %p queuing under %p\n", inode, + capsnap, snapc); igrab(inode); atomic_set(&capsnap->nref, 1); @@ -489,7 +491,9 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) capsnap->dirty_pages = ci->i_wrbuffer_ref_head; ci->i_wrbuffer_ref_head = 0; capsnap->context = snapc; - ci->i_head_snapc = NULL; + ci->i_head_snapc = + ceph_get_snap_context(ci->i_snap_realm->cached_context); + dout(" new snapc is %p\n", ci->i_head_snapc); list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); if (used & CEPH_CAP_FILE_WR) { diff --git a/fs/ceph/super.h b/fs/ceph/super.h index b33929d8f287..c33897ae5725 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -344,7 +344,8 @@ struct ceph_inode_info { unsigned i_cap_exporting_issued; struct ceph_cap_reservation i_cap_migration_resv; struct list_head i_cap_snaps; /* snapped state pending flush to mds */ - struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 */ + struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or + dirty|flushing caps */ unsigned i_snap_caps; /* cap bits for snapped files */ int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */ -- cgit v1.2.3 From 36e21687e6e51c4225c42e6291938363f7bbfa7c Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 24 Aug 2010 16:23:48 -0700 Subject: ceph: initialize fields on new dentry_infos Signed-off-by: Sage Weil --- fs/ceph/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 67bbb41d5526..6e4f43ff23ec 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -46,7 +46,7 @@ int ceph_init_dentry(struct dentry *dentry) else dentry->d_op = &ceph_snap_dentry_ops; - di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS); + di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO); if (!di) return -ENOMEM; /* oh well */ -- cgit v1.2.3 From ac1f12ef569d49b013c3db86e11be7e15d66b1c3 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 25 Aug 2010 09:11:35 +0200 Subject: ceph: ceph_get_inode() returns an ERR_PTR ceph_get_inode() returns an ERR_PTR and it doesn't return a NULL. Signed-off-by: Dan Carpenter Signed-off-by: Sage Weil --- fs/ceph/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 3e6b52cb5ee8..e7cca414da03 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1230,11 +1230,11 @@ retry_lookup: in = dn->d_inode; } else { in = ceph_get_inode(parent->d_sb, vino); - if (in == NULL) { + if (IS_ERR(in)) { dout("new_inode badness\n"); d_delete(dn); dput(dn); - err = -ENOMEM; + err = PTR_ERR(in); goto out; } dn = splice_dentry(dn, in, NULL); -- cgit v1.2.3 From ad8453ab0a5b98884074302ba3cc37664791e261 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Wed, 25 Aug 2010 13:26:32 +0100 Subject: ceph: Fix warnings Just scrubbing some warnings so I can see real problem ones in the build noise. For 32bit we need to coax gcc politely into believing we really honestly intend to the casts. Using (u64)(unsigned long) means we cast from a pointer to a type of the right size and then extend it. This stops the warning spew. Signed-off-by: Alan Cox Signed-off-by: Sage Weil --- fs/ceph/locks.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index ae85af06454f..ff4e753aae92 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -82,7 +82,8 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) length = fl->fl_end - fl->fl_start + 1; err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, - (u64)fl->fl_pid, (u64)fl->fl_nspid, + (u64)fl->fl_pid, + (u64)(unsigned long)fl->fl_nspid, lock_cmd, fl->fl_start, length, wait); if (!err) { @@ -92,7 +93,8 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) /* undo! This should only happen if the kernel detects * local deadlock. */ ceph_lock_message(CEPH_LOCK_FCNTL, op, file, - (u64)fl->fl_pid, (u64)fl->fl_nspid, + (u64)fl->fl_pid, + (u64)(unsigned long)fl->fl_nspid, CEPH_LOCK_UNLOCK, fl->fl_start, length, 0); dout("got %d on posix_lock_file, undid lock", err); @@ -132,7 +134,8 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) length = fl->fl_end - fl->fl_start + 1; err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, - file, (u64)fl->fl_pid, (u64)fl->fl_nspid, + file, (u64)fl->fl_pid, + (u64)(unsigned long)fl->fl_nspid, lock_cmd, fl->fl_start, length, wait); if (!err) { @@ -141,7 +144,7 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, file, (u64)fl->fl_pid, - (u64)fl->fl_nspid, + (u64)(unsigned long)fl->fl_nspid, CEPH_LOCK_UNLOCK, fl->fl_start, length, 0); dout("got %d on flock_lock_file_wait, undid lock", err); @@ -235,7 +238,8 @@ int lock_to_ceph_filelock(struct file_lock *lock, cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1); cephlock->client = cpu_to_le64(0); cephlock->pid = cpu_to_le64(lock->fl_pid); - cephlock->pid_namespace = cpu_to_le64((u64)lock->fl_nspid); + cephlock->pid_namespace = + cpu_to_le64((u64)(unsigned long)lock->fl_nspid); switch (lock->fl_type) { case F_RDLCK: -- cgit v1.2.3 From c89e5198b26a869ce2842bad8519264f3394dee9 Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 26 Aug 2010 02:11:54 +0000 Subject: [CIFS] Eliminate unused variable warning CC: Shirish Pargaonkar Signed-off-by: Steve French --- fs/cifs/sess.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 4788e16a02cc..795095f4eac6 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -620,7 +620,6 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, struct key *spnego_key = NULL; __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */ bool first_time; - char *ntlmsspblob; if (ses == NULL) return -EINVAL; @@ -868,6 +867,8 @@ ssetup_ntlmssp_authenticate: iov[1].iov_base = &pSMB->req.SecurityBlob[0]; } else if (phase == NtLmAuthenticate) { int blob_len; + char *ntlmsspblob; + ntlmsspblob = kmalloc(5 * sizeof(struct _AUTHENTICATE_MESSAGE), GFP_KERNEL); -- cgit v1.2.3 From f44c3890d9fd6e4284518ff3bb16879fee194a3a Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 26 Aug 2010 11:07:24 +0200 Subject: ceph: ceph_mdsc_build_path() returns an ERR_PTR ceph_mdsc_build_path() returns an ERR_PTR but this code is set up to handle NULL returns. Signed-off-by: Dan Carpenter Signed-off-by: Sage Weil --- fs/ceph/debugfs.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 360c4f22718d..6fd8b20a8611 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -171,6 +171,8 @@ static int mdsc_show(struct seq_file *s, void *p) } else if (req->r_dentry) { path = ceph_mdsc_build_path(req->r_dentry, &pathlen, &pathbase, 0); + if (IS_ERR(path)) + path = NULL; spin_lock(&req->r_dentry->d_lock); seq_printf(s, " #%llx/%.*s (%s)", ceph_ino(req->r_dentry->d_parent->d_inode), @@ -187,6 +189,8 @@ static int mdsc_show(struct seq_file *s, void *p) if (req->r_old_dentry) { path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen, &pathbase, 0); + if (IS_ERR(path)) + path = NULL; spin_lock(&req->r_old_dentry->d_lock); seq_printf(s, " #%llx/%.*s (%s)", ceph_ino(req->r_old_dentry->d_parent->d_inode), -- cgit v1.2.3 From e072f8aa3587710cd35cce0f6b6efd7b4276c327 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 26 Aug 2010 09:26:37 -0700 Subject: ceph: don't BUG on ENOMEM during mds reconnect We are in a position to return an error; do that instead. Signed-off-by: Sage Weil --- fs/ceph/mds_client.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 8d1f11c7a5a2..f091b1351786 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2324,7 +2324,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0); if (IS_ERR(path)) { err = PTR_ERR(path); - BUG_ON(err); + goto out_dput; } } else { path = NULL; @@ -2332,7 +2332,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, } err = ceph_pagelist_encode_string(pagelist, path, pathlen); if (err) - goto out; + goto out_free; spin_lock(&inode->i_lock); cap->seq = 0; /* reset cap seq */ @@ -2376,8 +2376,9 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, unlock_kernel(); } -out: +out_free: kfree(path); +out_dput: dput(dentry); return err; } -- cgit v1.2.3 From b545787dbb00a041c541a4759d938ddb0108295a Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 26 Aug 2010 11:12:38 +0200 Subject: ceph: fix get_ticket_handler() error handling get_ticket_handler() returns a valid pointer or it returns ERR_PTR(-ENOMEM) if kzalloc() fails. Signed-off-by: Dan Carpenter Signed-off-by: Sage Weil --- fs/ceph/auth_x.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c index 582e0b2caf8a..a2d002cbdec2 100644 --- a/fs/ceph/auth_x.c +++ b/fs/ceph/auth_x.c @@ -376,7 +376,7 @@ static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed) th = get_ticket_handler(ac, service); - if (!th) { + if (IS_ERR(th)) { *pneed |= service; continue; } @@ -399,6 +399,9 @@ static int ceph_x_build_request(struct ceph_auth_client *ac, struct ceph_x_ticket_handler *th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH); + if (IS_ERR(th)) + return PTR_ERR(th); + ceph_x_validate_tickets(ac, &need); dout("build_request want %x have %x need %x\n", @@ -450,7 +453,6 @@ static int ceph_x_build_request(struct ceph_auth_client *ac, return -ERANGE; head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY); - BUG_ON(!th); ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer); if (ret) return ret; @@ -505,7 +507,8 @@ static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result, case CEPHX_GET_PRINCIPAL_SESSION_KEY: th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH); - BUG_ON(!th); + if (IS_ERR(th)) + return PTR_ERR(th); ret = ceph_x_proc_ticket_reply(ac, &th->session_key, buf + sizeof(*head), end); break; @@ -563,8 +566,8 @@ static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac, void *end = p + sizeof(au->reply_buf); th = get_ticket_handler(ac, au->service); - if (!th) - return -EIO; /* hrm! */ + if (IS_ERR(th)) + return PTR_ERR(th); ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply)); if (ret < 0) return ret; @@ -626,7 +629,7 @@ static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac, struct ceph_x_ticket_handler *th; th = get_ticket_handler(ac, peer_type); - if (th && !IS_ERR(th)) + if (!IS_ERR(th)) remove_ticket_handler(ac, th); } -- cgit v1.2.3 From f0138a79d74e1e942970ea163be268cd2e4bbcfc Mon Sep 17 00:00:00 2001 From: Suresh Jayaraman Date: Thu, 26 Aug 2010 14:46:09 +0530 Subject: Cannot allocate memory error on mount MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On 08/26/2010 01:56 AM, joe hefner wrote: > On a recent Fedora (13), I am seeing a mount failure message that I can not explain. I have a Windows Server 2003ýa with a share set up for access only for a specific username (say userfoo). If I try to mount it from Linux,ýusing userfoo and the correct password all is well. If I try with a bad password or with some other username (userbar), it fails with "Permission denied" as expected. If I try to mount as username = administrator, and give the correct administrator password, I would also expect "Permission denied", but I see "Cannot allocate memory" instead. > ýfs/cifs/netmisc.c: Mapping smb error code 5 to POSIX err -13 > ýfs/cifs/cifssmb.c: Send error in QPathInfo = -13 > ýCIFS VFS: cifs_read_super: get root inode failed Looks like the commit 0b8f18e3 assumed that cifs_get_inode_info() and friends fail only due to memory allocation error when the inode is NULL which is not the case if CIFSSMBQPathInfo() fails and returns an error. Fix this by propagating the actual error code back. Acked-by: Jeff Layton Signed-off-by: Suresh Jayaraman Signed-off-by: Steve French --- fs/cifs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 4bc47e5b5f29..86a164f08a74 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -834,7 +834,7 @@ struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino) xid, NULL); if (!inode) - return ERR_PTR(-ENOMEM); + return ERR_PTR(rc); #ifdef CONFIG_CIFS_FSCACHE /* populate tcon->resource_id */ -- cgit v1.2.3 From 30c0e1ef0a8a6cab4e0f9357698c81a2f7f73cc5 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Tue, 17 Aug 2010 18:46:33 -0400 Subject: nfsd4: bad BUG() in preprocess_stateid_op It's OK for this function to return without setting filp--we do it in the special-stateid case. And there's a legitimate case where we can hit this, since we do permit reads on write-only stateid's. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 0a024917f052..b990eadb799c 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2983,7 +2983,6 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, *filpp = find_readable_file(stp->st_file); else *filpp = find_writeable_file(stp->st_file); - BUG_ON(!*filpp); /* assured by check_openmode */ } } status = nfs_ok; -- cgit v1.2.3 From 18608ad49cffa430cfd0b4e027dedfe3114f916e Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 20 Aug 2010 18:06:26 -0400 Subject: nfsd4: typo fix in find_any_file Signed-off-by: J. Bruce Fields --- fs/nfsd/state.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 7731a75971dd..84579c86b13d 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -379,7 +379,7 @@ static inline struct file *find_any_file(struct nfs4_file *f) { if (f->fi_fds[O_RDWR]) return f->fi_fds[O_RDWR]; - else if (f->fi_fds[O_RDWR]) + else if (f->fi_fds[O_WRONLY]) return f->fi_fds[O_WRONLY]; else return f->fi_fds[O_RDONLY]; -- cgit v1.2.3 From 7d94784293096c0a46897acdb83be5abd9278ece Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 20 Aug 2010 18:09:31 -0400 Subject: nfsd4: fix downgrade/lock logic If we already had a RW open for a file, and get a readonly open, we were piggybacking on the existing RW open. That's inconsistent with the downgrade logic which blows away the RW open assuming you'll still have a readonly open. Also, make sure there is a readonly or writeonly open available for locking, again to prevent bad behavior in downgrade cases when any RW open may be lost. Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4state.c | 25 +++++++++++++++---------- fs/nfsd/state.h | 12 ++++++------ 2 files changed, 21 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index b990eadb799c..69b266db7f5c 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2450,14 +2450,13 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh, static __be32 nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_stateid *stp, struct nfsd4_open *open) { - u32 op_share_access, new_access; + u32 op_share_access = open->op_share_access & ~NFS4_SHARE_WANT_MASK; + bool new_access; __be32 status; - set_access(&new_access, stp->st_access_bmap); - new_access = (~new_access) & open->op_share_access & ~NFS4_SHARE_WANT_MASK; - + new_access = !test_bit(op_share_access, &stp->st_access_bmap); if (new_access) { - status = nfs4_get_vfs_file(rqstp, fp, cur_fh, new_access); + status = nfs4_get_vfs_file(rqstp, fp, cur_fh, op_share_access); if (status) return status; } @@ -2470,7 +2469,6 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *c return status; } /* remember the open */ - op_share_access = open->op_share_access & ~NFS4_SHARE_WANT_MASK; __set_bit(op_share_access, &stp->st_access_bmap); __set_bit(open->op_share_deny, &stp->st_deny_bmap); @@ -3560,7 +3558,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfs4_stateowner *open_sop = NULL; struct nfs4_stateowner *lock_sop = NULL; struct nfs4_stateid *lock_stp; - struct file *filp; + struct nfs4_file *fp; + struct file *filp = NULL; struct file_lock file_lock; struct file_lock conflock; __be32 status = 0; @@ -3590,7 +3589,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * lock stateid. */ struct nfs4_stateid *open_stp = NULL; - struct nfs4_file *fp; status = nfserr_stale_clientid; if (!nfsd4_has_session(cstate) && @@ -3633,6 +3631,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (status) goto out; lock_sop = lock->lk_replay_owner; + fp = lock_stp->st_file; } /* lock->lk_replay_owner and lock_stp have been created or found */ @@ -3647,13 +3646,19 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, switch (lock->lk_type) { case NFS4_READ_LT: case NFS4_READW_LT: - filp = find_readable_file(lock_stp->st_file); + if (find_readable_file(lock_stp->st_file)) { + nfs4_get_vfs_file(rqstp, fp, &cstate->current_fh, NFS4_SHARE_ACCESS_READ); + filp = find_readable_file(lock_stp->st_file); + } file_lock.fl_type = F_RDLCK; cmd = F_SETLK; break; case NFS4_WRITE_LT: case NFS4_WRITEW_LT: - filp = find_writeable_file(lock_stp->st_file); + if (find_writeable_file(lock_stp->st_file)) { + nfs4_get_vfs_file(rqstp, fp, &cstate->current_fh, NFS4_SHARE_ACCESS_WRITE); + filp = find_writeable_file(lock_stp->st_file); + } file_lock.fl_type = F_WRLCK; cmd = F_SETLK; break; diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 84579c86b13d..322518c88e4b 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -363,16 +363,16 @@ struct nfs4_file { * at all? */ static inline struct file *find_writeable_file(struct nfs4_file *f) { - if (f->fi_fds[O_RDWR]) - return f->fi_fds[O_RDWR]; - return f->fi_fds[O_WRONLY]; + if (f->fi_fds[O_WRONLY]) + return f->fi_fds[O_WRONLY]; + return f->fi_fds[O_RDWR]; } static inline struct file *find_readable_file(struct nfs4_file *f) { - if (f->fi_fds[O_RDWR]) - return f->fi_fds[O_RDWR]; - return f->fi_fds[O_RDONLY]; + if (f->fi_fds[O_RDONLY]) + return f->fi_fds[O_RDONLY]; + return f->fi_fds[O_RDWR]; } static inline struct file *find_any_file(struct nfs4_file *f) -- cgit v1.2.3 From f6360efb83cd6dd1476cd758834c8277508c1f15 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 13 Aug 2010 15:53:49 +0200 Subject: nfsd: fix NULL dereference in nfsd_statfs() The commit ebabe9a9001af0af56c0c2780ca1576246e7a74b pass a struct path to vfs_statfs introduced the struct path initialization, and this seems to trigger an Oops on my machine. fh_dentry field may be NULL and set later in fh_verify(), thus the initialization of path must be after fh_verify(). Signed-off-by: Takashi Iwai Reviewed-by: Christoph Hellwig Reviewed-by: Minchan Kim Signed-off-by: J. Bruce Fields --- fs/nfsd/vfs.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 96360a83cb91..661a6cf8e826 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -2033,15 +2033,17 @@ out: __be32 nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat, int access) { - struct path path = { - .mnt = fhp->fh_export->ex_path.mnt, - .dentry = fhp->fh_dentry, - }; __be32 err; err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access); - if (!err && vfs_statfs(&path, stat)) - err = nfserr_io; + if (!err) { + struct path path = { + .mnt = fhp->fh_export->ex_path.mnt, + .dentry = fhp->fh_dentry, + }; + if (vfs_statfs(&path, stat)) + err = nfserr_io; + } return err; } -- cgit v1.2.3 From f137f15072411618e37b338aa13e5ae43583bcf2 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Wed, 11 Aug 2010 12:11:41 +0200 Subject: fs/ecryptfs: Return -ENOMEM on memory allocation failure In this code, 0 is returned on memory allocation failure, even though other failures return -ENOMEM or other similar values. A simplified version of the semantic match that finds this problem is as follows: (http://coccinelle.lip6.fr/) // @@ expression ret; expression x,e1,e2,e3; @@ ret = 0 ... when != ret = e1 *x = \(kmalloc\|kcalloc\|kzalloc\)(...) ... when != ret = e2 if (x == NULL) { ... when != ret = e3 return ret; } // Signed-off-by: Julia Lawall Signed-off-by: Tyler Hicks --- fs/ecryptfs/keystore.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 89c5476506ef..73811cfa2ea4 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -515,6 +515,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, if (!s) { printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc " "[%zd] bytes of kernel memory\n", __func__, sizeof(*s)); + rc = -ENOMEM; goto out; } s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; @@ -806,6 +807,7 @@ ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, if (!s) { printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc " "[%zd] bytes of kernel memory\n", __func__, sizeof(*s)); + rc = -ENOMEM; goto out; } s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; -- cgit v1.2.3 From 7371a38201d04124a9ff2cf05059731d7c1e35a5 Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Tue, 17 Aug 2010 17:24:05 +0200 Subject: ecryptfs: properly mark init functions Some ecryptfs init functions are not prefixed by __init and thus not freed after initialization. This patch saved about 1kB in ecryptfs module. Signed-off-by: Jerome Marchand Signed-off-by: Tyler Hicks --- fs/ecryptfs/crypto.c | 2 +- fs/ecryptfs/kthread.c | 2 +- fs/ecryptfs/messaging.c | 2 +- fs/ecryptfs/miscdev.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index a2e3b562e65d..13ff48b3eacb 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1793,7 +1793,7 @@ struct kmem_cache *ecryptfs_key_tfm_cache; static struct list_head key_tfm_list; struct mutex key_tfm_list_mutex; -int ecryptfs_init_crypto(void) +int __init ecryptfs_init_crypto(void) { mutex_init(&key_tfm_list_mutex); INIT_LIST_HEAD(&key_tfm_list); diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c index d8c3a373aafa..0851ab6980f5 100644 --- a/fs/ecryptfs/kthread.c +++ b/fs/ecryptfs/kthread.c @@ -86,7 +86,7 @@ out: return 0; } -int ecryptfs_init_kthread(void) +int __init ecryptfs_init_kthread(void) { int rc = 0; diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index bcb68c0cb1f0..ab2248090515 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c @@ -473,7 +473,7 @@ sleep: return rc; } -int ecryptfs_init_messaging(void) +int __init ecryptfs_init_messaging(void) { int i; int rc = 0; diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c index 3745f612bcd4..00208c3d7e92 100644 --- a/fs/ecryptfs/miscdev.c +++ b/fs/ecryptfs/miscdev.c @@ -500,7 +500,7 @@ static struct miscdevice ecryptfs_miscdev = { * * Returns zero on success; non-zero otherwise */ -int ecryptfs_init_ecryptfs_miscdev(void) +int __init ecryptfs_init_ecryptfs_miscdev(void) { int rc; -- cgit v1.2.3 From 93c3fe40c279f002906ad14584c30671097d4394 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Wed, 25 Aug 2010 10:26:37 -0500 Subject: eCryptfs: Fix encrypted file name lookup regression Fixes a regression caused by 21edad32205e97dc7ccb81a85234c77e760364c8 When file name encryption was enabled, ecryptfs_lookup() failed to use the encrypted and encoded version of the upper, plaintext, file name when performing a lookup in the lower file system. This made it impossible to lookup existing encrypted file names and any newly created files would have plaintext file names in the lower file system. https://bugs.launchpad.net/ecryptfs/+bug/623087 Signed-off-by: Tyler Hicks --- fs/ecryptfs/crypto.c | 1 - fs/ecryptfs/inode.c | 31 ++++++++++++++++++++++++------- 2 files changed, 24 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 13ff48b3eacb..cbadc1bee6e7 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -2169,7 +2169,6 @@ int ecryptfs_encrypt_and_encode_filename( (ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE + encoded_name_no_prefix_size); (*encoded_name)[(*encoded_name_size)] = '\0'; - (*encoded_name_size)++; } else { rc = -EOPNOTSUPP; } diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 6c55113e7222..3fbc94203380 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -349,7 +349,7 @@ out: /** * ecryptfs_new_lower_dentry - * @ename: The name of the new dentry. + * @name: The name of the new dentry. * @lower_dir_dentry: Parent directory of the new dentry. * @nd: nameidata from last lookup. * @@ -386,20 +386,19 @@ ecryptfs_new_lower_dentry(struct qstr *name, struct dentry *lower_dir_dentry, * ecryptfs_lookup_one_lower * @ecryptfs_dentry: The eCryptfs dentry that we are looking up * @lower_dir_dentry: lower parent directory + * @name: lower file name * * Get the lower dentry from vfs. If lower dentry does not exist yet, * create it. */ static struct dentry * ecryptfs_lookup_one_lower(struct dentry *ecryptfs_dentry, - struct dentry *lower_dir_dentry) + struct dentry *lower_dir_dentry, struct qstr *name) { struct nameidata nd; struct vfsmount *lower_mnt; - struct qstr *name; int err; - name = &ecryptfs_dentry->d_name; lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt( ecryptfs_dentry->d_parent)); err = vfs_path_lookup(lower_dir_dentry, lower_mnt, name->name , 0, &nd); @@ -434,6 +433,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, size_t encrypted_and_encoded_name_size; struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL; struct dentry *lower_dir_dentry, *lower_dentry; + struct qstr lower_name; int rc = 0; ecryptfs_dentry->d_op = &ecryptfs_dops; @@ -444,9 +444,17 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, goto out_d_drop; } lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent); - + lower_name.name = ecryptfs_dentry->d_name.name; + lower_name.len = ecryptfs_dentry->d_name.len; + lower_name.hash = ecryptfs_dentry->d_name.hash; + if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) { + rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry, + &lower_name); + if (rc < 0) + goto out_d_drop; + } lower_dentry = ecryptfs_lookup_one_lower(ecryptfs_dentry, - lower_dir_dentry); + lower_dir_dentry, &lower_name); if (IS_ERR(lower_dentry)) { rc = PTR_ERR(lower_dentry); ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_lower() returned " @@ -471,8 +479,17 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, "filename; rc = [%d]\n", __func__, rc); goto out_d_drop; } + lower_name.name = encrypted_and_encoded_name; + lower_name.len = encrypted_and_encoded_name_size; + lower_name.hash = full_name_hash(lower_name.name, lower_name.len); + if (lower_dir_dentry->d_op && lower_dir_dentry->d_op->d_hash) { + rc = lower_dir_dentry->d_op->d_hash(lower_dir_dentry, + &lower_name); + if (rc < 0) + goto out_d_drop; + } lower_dentry = ecryptfs_lookup_one_lower(ecryptfs_dentry, - lower_dir_dentry); + lower_dir_dentry, &lower_name); if (IS_ERR(lower_dentry)) { rc = PTR_ERR(lower_dentry); ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_lower() returned " -- cgit v1.2.3 From a2f13ad0ba5d94b9768c28469b45ca1e81a2b895 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 24 Aug 2010 12:58:54 +0200 Subject: fanotify: Return EPERM when a process is not privileged The appropriate error code when privileged operations are denied is EPERM, not EACCES. Signed-off-by: Andreas Gruenbacher Signed-off-by: Eric Paris --- fs/notify/fanotify/fanotify_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index b966b7230f47..5ed8e58d7bfc 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -641,7 +641,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) __func__, flags, event_f_flags); if (!capable(CAP_SYS_ADMIN)) - return -EACCES; + return -EPERM; if (flags & ~FAN_ALL_INIT_FLAGS) return -EINVAL; -- cgit v1.2.3 From f72adfd540bacc4f6ff57a7d708b1a6c8906bdb4 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Fri, 27 Aug 2010 21:24:24 -0400 Subject: fsnotify: fix list walk order Marks were stored on the inode and vfsmonut mark list in order from highest memory address to lowest memory address. The code to walk those lists thought they were in order from lowest to highest with unpredictable results when trying to match up marks from each. It was possible that extra events would be sent to userspace when inode marks ignoring events wouldn't get matched with the vfsmount marks. This problem only affected fanotify when using both vfsmount and inode marks simultaneously. Signed-off-by: Eric Paris --- fs/notify/fsnotify.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 6f2777ce87a1..2169aa593d5f 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -261,27 +261,26 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, while (inode_node || vfsmount_node) { used_inode = used_vfsmount = false; + inode_group = vfsmount_group = NULL; if (inode_node) { inode_mark = hlist_entry(srcu_dereference(inode_node, &fsnotify_mark_srcu), struct fsnotify_mark, i.i_list); inode_group = inode_mark->group; - } else - inode_group = (void *)-1; + } if (vfsmount_node) { vfsmount_mark = hlist_entry(srcu_dereference(vfsmount_node, &fsnotify_mark_srcu), struct fsnotify_mark, m.m_list); vfsmount_group = vfsmount_mark->group; - } else - vfsmount_group = (void *)-1; + } - if (inode_group < vfsmount_group) { + if (inode_group > vfsmount_group) { /* handle inode */ send_to_group(to_tell, NULL, inode_mark, NULL, mask, data, data_is, cookie, file_name, &event); used_inode = true; - } else if (vfsmount_group < inode_group) { + } else if (vfsmount_group > inode_group) { send_to_group(to_tell, mnt, NULL, vfsmount_mark, mask, data, data_is, cookie, file_name, &event); used_vfsmount = true; -- cgit v1.2.3 From 92b4678efa8ce0de9b1e01a74e3d13c4002a4136 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Fri, 27 Aug 2010 21:42:11 -0400 Subject: fsnotify: drop two useless bools in the fnsotify main loop The fsnotify main loop has 2 bools which indicated if we processed the inode or vfsmount mark in that particular pass through the loop. These bool can we replaced with the inode_group and vfsmount_group variables and actually make the code a little easier to understand. Signed-off-by: Eric Paris --- fs/notify/fsnotify.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 2169aa593d5f..36802420d69a 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -225,7 +225,6 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, struct fsnotify_event *event = NULL; struct vfsmount *mnt; int idx, ret = 0; - bool used_inode, used_vfsmount; /* global tests shouldn't care about events on child only the specific event */ __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD); @@ -260,7 +259,6 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, } while (inode_node || vfsmount_node) { - used_inode = used_vfsmount = false; inode_group = vfsmount_group = NULL; if (inode_node) { @@ -279,23 +277,22 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, /* handle inode */ send_to_group(to_tell, NULL, inode_mark, NULL, mask, data, data_is, cookie, file_name, &event); - used_inode = true; + /* we didn't use the vfsmount_mark */ + vfsmount_group = NULL; } else if (vfsmount_group > inode_group) { send_to_group(to_tell, mnt, NULL, vfsmount_mark, mask, data, data_is, cookie, file_name, &event); - used_vfsmount = true; + inode_group = NULL; } else { send_to_group(to_tell, mnt, inode_mark, vfsmount_mark, mask, data, data_is, cookie, file_name, &event); - used_vfsmount = true; - used_inode = true; } - if (used_inode) + if (inode_group) inode_node = srcu_dereference(inode_node->next, &fsnotify_mark_srcu); - if (used_vfsmount) + if (vfsmount_group) vfsmount_node = srcu_dereference(vfsmount_node->next, &fsnotify_mark_srcu); } -- cgit v1.2.3 From 4afc31345e5f543e5d89a47aeadaaad1d91a5bc8 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Sun, 29 Aug 2010 01:55:38 +0900 Subject: nilfs2: fix leak of shadow dat inode in error path of load_nilfs If load_nilfs() gets an error while doing recovery, it will fail to free the shadow inode of dat (nilfs->ns_gc_dat). This fixes the leak issue. Signed-off-by: Ryusuke Konishi --- fs/nilfs2/the_nilfs.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 4317f177ea7c..ba7c10c917fc 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -446,6 +446,7 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) nilfs_mdt_destroy(nilfs->ns_cpfile); nilfs_mdt_destroy(nilfs->ns_sufile); nilfs_mdt_destroy(nilfs->ns_dat); + nilfs_mdt_destroy(nilfs->ns_gc_dat); failed: nilfs_clear_recovery_info(&ri); -- cgit v1.2.3 From 8f587df479c3cea14ba1a9b9d58f34fd2fd6d58b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 4 Aug 2010 16:27:45 +0000 Subject: 9p: potential ERR_PTR() dereference p9_client_walk() can return error values if we run out of space or there is a problem with the network. Signed-off-by: Dan Carpenter Signed-off-by: Eric Van Hensbergen --- fs/9p/fid.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/9p/fid.c b/fs/9p/fid.c index 358563689064..6406f896bf95 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -242,7 +242,8 @@ struct p9_fid *v9fs_fid_lookup(struct dentry *dentry) } kfree(wnames); fid_out: - v9fs_fid_add(dentry, fid); + if (!IS_ERR(fid)) + v9fs_fid_add(dentry, fid); err_out: up_read(&v9ses->rename_sem); return fid; -- cgit v1.2.3 From 9bc08a45fb117c696e4940cfa1208cb1cc7a2f25 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 2 Sep 2010 15:14:38 +1000 Subject: xfs: improve buffer cache hash scalability When doing large parallel file creates on a 16p machines, large amounts of time is being spent in _xfs_buf_find(). A system wide profile with perf top shows this: 1134740.00 19.3% _xfs_buf_find 733142.00 12.5% __ticket_spin_lock The problem is that the hash contains 45,000 buffers, and the hash table width is only 256 buffers. That means we've got around 200 buffers per chain, and searching it is quite expensive. The hash table size needs to increase. Secondly, every time we do a lookup, we promote the buffer we find to the head of the hash chain. This is causing cachelines to be dirtied and causes invalidation of cachelines across all CPUs that may have walked the hash chain recently. hence every walk of the hash chain is effectively a cold cache walk. Remove the promotion to avoid this invalidation. The results are: 1045043.00 21.2% __ticket_spin_lock 326184.00 6.6% _xfs_buf_find A 70% drop in the CPU usage when looking up buffers. Unfortunately that does not result in an increase in performance underthis workload as contention on the inode_lock soaks up most of the reduction in CPU usage. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/linux-2.6/xfs_buf.c | 8 +------- fs/xfs/linux-2.6/xfs_buf.h | 1 - 2 files changed, 1 insertion(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index ea79072f5210..d72cf2bb054a 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -440,12 +440,7 @@ _xfs_buf_find( ASSERT(btp == bp->b_target); if (bp->b_file_offset == range_base && bp->b_buffer_length == range_length) { - /* - * If we look at something, bring it to the - * front of the list for next time. - */ atomic_inc(&bp->b_hold); - list_move(&bp->b_hash_list, &hash->bh_list); goto found; } } @@ -1443,8 +1438,7 @@ xfs_alloc_bufhash( { unsigned int i; - btp->bt_hashshift = external ? 3 : 8; /* 8 or 256 buckets */ - btp->bt_hashmask = (1 << btp->bt_hashshift) - 1; + btp->bt_hashshift = external ? 3 : 12; /* 8 or 4096 buckets */ btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) * sizeof(xfs_bufhash_t)); for (i = 0; i < (1 << btp->bt_hashshift); i++) { diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index d072e5ff923b..2a05614f0b92 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -137,7 +137,6 @@ typedef struct xfs_buftarg { size_t bt_smask; /* per device buffer hash table */ - uint bt_hashmask; uint bt_hashshift; xfs_bufhash_t *bt_hash; -- cgit v1.2.3 From 23963e54ce187ca6e907c83176c15508b0f6e60d Mon Sep 17 00:00:00 2001 From: Arkadiusz Mi?kiewicz Date: Thu, 26 Aug 2010 10:19:43 +0000 Subject: xfs: Disallow 32bit project quota id Currently on-disk structure is able to keep only 16bit project quota id, so disallow 32bit ones. This fixes a problem where parts of kernel structures holding project quota id are 32bit while parts (on-disk) are 16bit variables which causes project quota member files to be inaccessible for some operations (like mv/rm). Signed-off-by: Arkadiusz Mi?kiewicz Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_ioctl.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index 237f5ffb2ee8..4fec427b83ef 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -906,6 +906,13 @@ xfs_ioctl_setattr( if (XFS_FORCED_SHUTDOWN(mp)) return XFS_ERROR(EIO); + /* + * Disallow 32bit project ids because on-disk structure + * is 16bit only. + */ + if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1)) + return XFS_ERROR(EINVAL); + /* * If disk quotas is on, we make sure that the dquots do exist on disk, * before we start any other transactions. Trying to do this later -- cgit v1.2.3 From 72656c46f50b8dfe50e15793692982e636e3df20 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Fri, 3 Sep 2010 12:19:33 +1000 Subject: xfs: prevent 32bit overflow in space reservation If we attempt to preallocate more than 2^32 blocks of space in a single syscall, the transaction block reservation will overflow leading to a hangs in the superblock block accounting code. This is trivially reproduced with xfs_io. Fix the problem by capping the allocation reservation to the maximum number of blocks a single xfs_bmapi() call can allocate (2^21 blocks). Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_vnodeops.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 66d585c6917c..4c7c7bfb2b2f 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -2299,15 +2299,22 @@ xfs_alloc_file_space( e = allocatesize_fsb; } + /* + * The transaction reservation is limited to a 32-bit block + * count, hence we need to limit the number of blocks we are + * trying to reserve to avoid an overflow. We can't allocate + * more than @nimaps extents, and an extent is limited on disk + * to MAXEXTLEN (21 bits), so use that to enforce the limit. + */ + resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps)); if (unlikely(rt)) { - resrtextents = qblocks = (uint)(e - s); + resrtextents = qblocks = resblks; resrtextents /= mp->m_sb.sb_rextsize; resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0); quota_flag = XFS_QMOPT_RES_RTBLKS; } else { resrtextents = 0; - resblks = qblocks = \ - XFS_DIOSTRAT_SPACE_RES(mp, (uint)(e - s)); + resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks); quota_flag = XFS_QMOPT_RES_REGBLKS; } -- cgit v1.2.3 From 9af25465081480a75824fd7a16a37a5cfebeede9 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Mon, 30 Aug 2010 02:44:03 +0000 Subject: xfs: Make fiemap work with sparse files In xfs_vn_fiemap, we set bvm_count to fi_extent_max + 1 and want to return fi_extent_max extents, but actually it won't work for a sparse file. The reason is that in xfs_getbmap we will calculate holes and set it in 'out', while out is malloced by bmv_count(fi_extent_max+1) which didn't consider holes. So in the worst case, if 'out' vector looks like [hole, extent, hole, extent, hole, ... hole, extent, hole], we will only return half of fi_extent_max extents. This patch add a new parameter BMV_IF_NO_HOLES for bvm_iflags. So with this flags, we don't use our 'out' in xfs_getbmap for a hole. The solution is a bit ugly by just don't increasing index of 'out' vector. I felt that it is not easy to skip it at the very beginning since we have the complicated check and some function like xfs_getbmapx_fix_eof_hole to adjust 'out'. Cc: Dave Chinner Signed-off-by: Tao Ma Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_iops.c | 2 +- fs/xfs/xfs_bmap.c | 14 +++++++++++++- fs/xfs/xfs_fs.h | 4 +++- 3 files changed, 17 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 68be25dcd301..b1fc2a6bfe83 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -664,7 +664,7 @@ xfs_vn_fiemap( fieinfo->fi_extents_max + 1; bm.bmv_count = min_t(__s32, bm.bmv_count, (PAGE_SIZE * 16 / sizeof(struct getbmapx))); - bm.bmv_iflags = BMV_IF_PREALLOC; + bm.bmv_iflags = BMV_IF_PREALLOC | BMV_IF_NO_HOLES; if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) bm.bmv_iflags |= BMV_IF_ATTRFORK; if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC)) diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 23f14e595c18..f90dadd5a968 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -5533,12 +5533,24 @@ xfs_getbmap( map[i].br_startblock)) goto out_free_map; - nexleft--; bmv->bmv_offset = out[cur_ext].bmv_offset + out[cur_ext].bmv_length; bmv->bmv_length = max_t(__int64_t, 0, bmvend - bmv->bmv_offset); + + /* + * In case we don't want to return the hole, + * don't increase cur_ext so that we can reuse + * it in the next loop. + */ + if ((iflags & BMV_IF_NO_HOLES) && + map[i].br_startblock == HOLESTARTBLOCK) { + memset(&out[cur_ext], 0, sizeof(out[cur_ext])); + continue; + } + + nexleft--; bmv->bmv_entries++; cur_ext++; } diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index 7cf7220e7d5f..87c2e9d02288 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h @@ -114,8 +114,10 @@ struct getbmapx { #define BMV_IF_NO_DMAPI_READ 0x2 /* Do not generate DMAPI read event */ #define BMV_IF_PREALLOC 0x4 /* rtn status BMV_OF_PREALLOC if req */ #define BMV_IF_DELALLOC 0x8 /* rtn status BMV_OF_DELALLOC if req */ +#define BMV_IF_NO_HOLES 0x10 /* Do not return holes */ #define BMV_IF_VALID \ - (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC|BMV_IF_DELALLOC) + (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC| \ + BMV_IF_DELALLOC|BMV_IF_NO_HOLES) /* bmv_oflags values - returned for each non-header segment */ #define BMV_OF_PREALLOC 0x1 /* segment = unwritten pre-allocation */ -- cgit v1.2.3 From 57f9bdac2510cd7fda58e4a111d250861eb1ebeb Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 25 Aug 2010 09:12:29 +0200 Subject: sysfs: checking for NULL instead of ERR_PTR d_path() returns an ERR_PTR and it doesn't return NULL. Signed-off-by: Dan Carpenter Cc: stable Reviewed-by: "Eric W. Biederman" Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 1b27b5688f62..da3fefe91a8f 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -340,7 +340,7 @@ static int sysfs_open_file(struct inode *inode, struct file *file) char *p; p = d_path(&file->f_path, last_sysfs_file, sizeof(last_sysfs_file)); - if (p) + if (!IS_ERR(p)) memmove(last_sysfs_file, p, strlen(p) + 1); /* need attr_sd for attr and ops, its parent for kobj */ -- cgit v1.2.3 From 7a2e8a8faab76386d8eaae9ded739ee5615be174 Mon Sep 17 00:00:00 2001 From: Valerie Aurora Date: Thu, 26 Aug 2010 11:07:22 -0700 Subject: VFS: Sanity check mount flags passed to change_mnt_propagation() Sanity check the flags passed to change_mnt_propagation(). Exactly one flag should be set. Return EINVAL otherwise. Userspace can pass in arbitrary combinations of MS_* flags to mount(). do_change_type() is called if any of MS_SHARED, MS_PRIVATE, MS_SLAVE, or MS_UNBINDABLE is set. do_change_type() clears MS_REC and then calls change_mnt_propagation() with the rest of the user-supplied flags. change_mnt_propagation() clearly assumes only one flag is set but do_change_type() does not check that this is true. For example, mount() with flags MS_SHARED | MS_RDONLY does not actually make the mount shared or read-only but does clear MNT_UNBINDABLE. Signed-off-by: Valerie Aurora Signed-off-by: Linus Torvalds --- fs/namespace.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index de402eb6eafb..a72eaabfe8f2 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1483,6 +1483,23 @@ out_unlock: return err; } +/* + * Sanity check the flags to change_mnt_propagation. + */ + +static int flags_to_propagation_type(int flags) +{ + int type = flags & ~MS_REC; + + /* Fail if any non-propagation flags are set */ + if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) + return 0; + /* Only one propagation flag should be set */ + if (!is_power_of_2(type)) + return 0; + return type; +} + /* * recursively change the type of the mountpoint. */ @@ -1490,7 +1507,7 @@ static int do_change_type(struct path *path, int flag) { struct vfsmount *m, *mnt = path->mnt; int recurse = flag & MS_REC; - int type = flag & ~MS_REC; + int type; int err = 0; if (!capable(CAP_SYS_ADMIN)) @@ -1499,6 +1516,10 @@ static int do_change_type(struct path *path, int flag) if (path->dentry != path->mnt->mnt_root) return -EINVAL; + type = flags_to_propagation_type(flag); + if (!type) + return -EINVAL; + down_write(&namespace_sem); if (type == MS_SHARED) { err = invent_group_ids(mnt, recurse); -- cgit v1.2.3