From c89681ed7d0e4a61d35bdc12c06c6733b718b2cb Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 22 Jun 2006 14:47:22 -0700 Subject: [PATCH] remove steal_locks() This patch removes the steal_locks() function. steal_locks() doesn't work correctly with any filesystem that does it's own lock management, including NFS, CIFS, etc. In addition it has weird semantics on local filesystems in case tasks sharing file-descriptor tables are doing POSIX locking operations in parallel to execve(). The steal_locks() function has an effect on applications doing: clone(CLONE_FILES) /* in child */ lock execve lock POSIX locks acquired before execve (by "child", "parent" or any further task sharing files_struct) will after the execve be owned exclusively by "child". According to Chris Wright some LSB/LTP kind of suite triggers without the stealing behavior, but there's no known real-world application that would also fail. Apps using NPTL are not affected, since all other threads are killed before execve. Apps using LinuxThreads are only affected if they - have multiple threads during exec (LinuxThreads doesn't kill other threads, the app may do it with pthread_kill_other_threads_np()) - rely on POSIX locks being inherited across exec Both conditions are documented, but not their interaction. Apps using clone() natively are affected if they - use clone(CLONE_FILES) - rely on POSIX locks being inherited across exec The above scenarios are unlikely, but possible. If the patch is vetoed, there's a plan B, that involves mostly keeping the weird stealing semantics, but changing the way lock ownership is handled so that network and local filesystems work consistently. That would add more complexity though, so this solution seems to be preferred by most people. Signed-off-by: Miklos Szeredi Cc: Trond Myklebust Cc: Matthew Wilcox Cc: Chris Wright Cc: Christoph Hellwig Cc: Steven French Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/locks.c | 57 --------------------------------------------------------- 1 file changed, 57 deletions(-) (limited to 'fs/locks.c') diff --git a/fs/locks.c b/fs/locks.c index ab61a8b54829..69435c68c1ed 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2206,63 +2206,6 @@ int lock_may_write(struct inode *inode, loff_t start, unsigned long len) EXPORT_SYMBOL(lock_may_write); -static inline void __steal_locks(struct file *file, fl_owner_t from) -{ - struct inode *inode = file->f_dentry->d_inode; - struct file_lock *fl = inode->i_flock; - - while (fl) { - if (fl->fl_file == file && fl->fl_owner == from) - fl->fl_owner = current->files; - fl = fl->fl_next; - } -} - -/* When getting ready for executing a binary, we make sure that current - * has a files_struct on its own. Before dropping the old files_struct, - * we take over ownership of all locks for all file descriptors we own. - * Note that we may accidentally steal a lock for a file that a sibling - * has created since the unshare_files() call. - */ -void steal_locks(fl_owner_t from) -{ - struct files_struct *files = current->files; - int i, j; - struct fdtable *fdt; - - if (from == files) - return; - - lock_kernel(); - j = 0; - - /* - * We are not taking a ref to the file structures, so - * we need to acquire ->file_lock. - */ - spin_lock(&files->file_lock); - fdt = files_fdtable(files); - for (;;) { - unsigned long set; - i = j * __NFDBITS; - if (i >= fdt->max_fdset || i >= fdt->max_fds) - break; - set = fdt->open_fds->fds_bits[j++]; - while (set) { - if (set & 1) { - struct file *file = fdt->fd[i]; - if (file) - __steal_locks(file, from); - } - i++; - set >>= 1; - } - } - spin_unlock(&files->file_lock); - unlock_kernel(); -} -EXPORT_SYMBOL(steal_locks); - static int __init filelock_init(void) { filelock_cache = kmem_cache_create("file_lock_cache", -- cgit v1.2.3 From 0d9a490abe1f69fda220f7866f6f23af41daa128 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 23 Jun 2006 02:05:09 -0700 Subject: [PATCH] locks: don't unnecessarily fail posix lock operations posix_lock_file() was too cautious, failing operations on OOM, even if they didn't actually require an allocation. This has the disadvantage, that a failing unlock on process exit could lead to a memory leak. There are two possibilites for this: - filesystem implements .lock() and calls back to posix_lock_file(). On cleanup of files_struct locks_remove_posix() is called which should remove all locks belonging to files_struct. However if filesystem calls posix_lock_file() which fails, then those locks will never be freed. - if a file is closed while a lock is blocked, then after acquiring fcntl_setlk() will undo the lock. But this unlock itself might fail on OOM, again possibly leaking the lock. The solution is to move the checking of the allocations until after it is sure that they will be needed. This will solve the above problem since unlock will always succeed unless it splits an existing region. Signed-off-by: Miklos Szeredi Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/locks.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) (limited to 'fs/locks.c') diff --git a/fs/locks.c b/fs/locks.c index 69435c68c1ed..c5ac6b40e766 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -834,14 +834,7 @@ static int __posix_lock_file_conf(struct inode *inode, struct file_lock *request if (request->fl_flags & FL_ACCESS) goto out; - error = -ENOLCK; /* "no luck" */ - if (!(new_fl && new_fl2)) - goto out; - /* - * We've allocated the new locks in advance, so there are no - * errors possible (and no blocking operations) from here on. - * * Find the first old lock with the same owner as the new lock. */ @@ -938,10 +931,25 @@ static int __posix_lock_file_conf(struct inode *inode, struct file_lock *request before = &fl->fl_next; } + /* + * The above code only modifies existing locks in case of + * merging or replacing. If new lock(s) need to be inserted + * all modifications are done bellow this, so it's safe yet to + * bail out. + */ + error = -ENOLCK; /* "no luck" */ + if (right && left == right && !new_fl2) + goto out; + error = 0; if (!added) { if (request->fl_type == F_UNLCK) goto out; + + if (!new_fl) { + error = -ENOLCK; + goto out; + } locks_copy_lock(new_fl, request); locks_insert_lock(before, new_fl); new_fl = NULL; -- cgit v1.2.3 From 39005d022ad221b76dc2de0ac62ef475a796433b Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 23 Jun 2006 02:05:10 -0700 Subject: [PATCH] locks: don't do unnecessary allocations posix_lock_file() always allocates new locks in advance, even if it's easy to determine that no allocations will be needed. Optimize these cases: - FL_ACCESS flag is set - Unlocking the whole range Signed-off-by: Miklos Szeredi Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/locks.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'fs/locks.c') diff --git a/fs/locks.c b/fs/locks.c index c5ac6b40e766..2344f241c687 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -794,7 +794,8 @@ out: static int __posix_lock_file_conf(struct inode *inode, struct file_lock *request, struct file_lock *conflock) { struct file_lock *fl; - struct file_lock *new_fl, *new_fl2; + struct file_lock *new_fl = NULL; + struct file_lock *new_fl2 = NULL; struct file_lock *left = NULL; struct file_lock *right = NULL; struct file_lock **before; @@ -803,9 +804,15 @@ static int __posix_lock_file_conf(struct inode *inode, struct file_lock *request /* * We may need two file_lock structures for this operation, * so we get them in advance to avoid races. + * + * In some cases we can be sure, that no new locks will be needed */ - new_fl = locks_alloc_lock(); - new_fl2 = locks_alloc_lock(); + if (!(request->fl_flags & FL_ACCESS) && + (request->fl_type != F_UNLCK || + request->fl_start != 0 || request->fl_end != OFFSET_MAX)) { + new_fl = locks_alloc_lock(); + new_fl2 = locks_alloc_lock(); + } lock_kernel(); if (request->fl_type != F_UNLCK) { -- cgit v1.2.3 From ff7b86b82083f24b8637dff1528c7101c18c7f39 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 23 Jun 2006 02:05:11 -0700 Subject: [PATCH] locks: clean up locks_remove_posix() locks_remove_posix() can use posix_lock_file() instead of doing the lock removal by hand. posix_lock_file() now does exacly the same. The comment about pids no longer applies, posix_lock_file() takes only the owner into account. Signed-off-by: Miklos Szeredi Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/locks.c | 25 +++++-------------------- 1 file changed, 5 insertions(+), 20 deletions(-) (limited to 'fs/locks.c') diff --git a/fs/locks.c b/fs/locks.c index 2344f241c687..e588e1c265f7 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1896,15 +1896,14 @@ out: */ void locks_remove_posix(struct file *filp, fl_owner_t owner) { - struct file_lock lock, **before; + struct file_lock lock; /* * If there are no locks held on this file, we don't need to call * posix_lock_file(). Another process could be setting a lock on this * file at the same time, but we wouldn't remove that lock anyway. */ - before = &filp->f_dentry->d_inode->i_flock; - if (*before == NULL) + if (!filp->f_dentry->d_inode->i_flock) return; lock.fl_type = F_UNLCK; @@ -1917,25 +1916,11 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner) lock.fl_ops = NULL; lock.fl_lmops = NULL; - if (filp->f_op && filp->f_op->lock != NULL) { + if (filp->f_op && filp->f_op->lock != NULL) filp->f_op->lock(filp, F_SETLK, &lock); - goto out; - } + else + posix_lock_file(filp, &lock); - /* Can't use posix_lock_file here; we need to remove it no matter - * which pid we have. - */ - lock_kernel(); - while (*before != NULL) { - struct file_lock *fl = *before; - if (IS_POSIX(fl) && posix_same_owner(fl, &lock)) { - locks_delete_lock(before); - continue; - } - before = &fl->fl_next; - } - unlock_kernel(); -out: if (lock.fl_ops && lock.fl_ops->fl_release_private) lock.fl_ops->fl_release_private(&lock); } -- cgit v1.2.3 From 75e1fcc0b18df0a65ab113198e9dc0e98999a08c Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Fri, 23 Jun 2006 02:05:12 -0700 Subject: [PATCH] vfs: add lock owner argument to flush operation Pass the POSIX lock owner ID to the flush operation. This is useful for filesystems which don't want to store any locking state in inode->i_flock but want to handle locking/unlocking POSIX locks internally. FUSE is one such filesystem but I think it possible that some network filesystems would need this also. Also add a flag to indicate that a POSIX locking request was generated by close(), so filesystems using the above feature won't send an extra locking request in this case. Signed-off-by: Miklos Szeredi Cc: Trond Myklebust Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/perfmon.c | 3 +-- drivers/input/evdev.c | 2 +- drivers/scsi/osst.c | 2 +- drivers/scsi/st.c | 2 +- fs/cifs/cifsfs.h | 2 +- fs/cifs/file.c | 2 +- fs/coda/file.c | 2 +- fs/fuse/file.c | 2 +- fs/locks.c | 2 +- fs/nfs/file.c | 4 ++-- fs/open.c | 2 +- include/linux/coda_linux.h | 2 +- include/linux/fs.h | 3 ++- ipc/mqueue.c | 2 +- 14 files changed, 16 insertions(+), 16 deletions(-) (limited to 'fs/locks.c') diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 2359e2809f50..6d7bc8ff7b3a 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -532,7 +532,6 @@ static ctl_table pfm_sysctl_root[] = { static struct ctl_table_header *pfm_sysctl_header; static int pfm_context_unload(pfm_context_t *ctx, void *arg, int count, struct pt_regs *regs); -static int pfm_flush(struct file *filp); #define pfm_get_cpu_var(v) __ia64_per_cpu_var(v) #define pfm_get_cpu_data(a,b) per_cpu(a, b) @@ -1774,7 +1773,7 @@ pfm_syswide_cleanup_other_cpu(pfm_context_t *ctx) * When caller is self-monitoring, the context is unloaded. */ static int -pfm_flush(struct file *filp) +pfm_flush(struct file *filp, fl_owner_t id) { pfm_context_t *ctx; struct task_struct *task; diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c index ba325f16d077..5f561fce32d8 100644 --- a/drivers/input/evdev.c +++ b/drivers/input/evdev.c @@ -82,7 +82,7 @@ static int evdev_fasync(int fd, struct file *file, int on) return retval < 0 ? retval : 0; } -static int evdev_flush(struct file * file) +static int evdev_flush(struct file * file, fl_owner_t id) { struct evdev_list *list = file->private_data; if (!list->evdev->exist) return -ENODEV; diff --git a/drivers/scsi/osst.c b/drivers/scsi/osst.c index ce0ba3a174f9..4a2fed350d4e 100644 --- a/drivers/scsi/osst.c +++ b/drivers/scsi/osst.c @@ -4724,7 +4724,7 @@ err_out: /* Flush the tape buffer before close */ -static int os_scsi_tape_flush(struct file * filp) +static int os_scsi_tape_flush(struct file * filp, fl_owner_t id) { int result = 0, result2; struct osst_tape * STp = filp->private_data; diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index ad87d73f88ee..1272dd249af3 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -1193,7 +1193,7 @@ static int st_open(struct inode *inode, struct file *filp) /* Flush the tape buffer before close */ -static int st_flush(struct file *filp) +static int st_flush(struct file *filp, fl_owner_t id) { int result = 0, result2; unsigned char cmd[MAX_COMMAND_SIZE]; diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index c98755dca868..d56c0577c710 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -74,7 +74,7 @@ extern ssize_t cifs_user_write(struct file *file, const char __user *write_data, size_t write_size, loff_t * poffset); extern int cifs_lock(struct file *, int, struct file_lock *); extern int cifs_fsync(struct file *, struct dentry *, int); -extern int cifs_flush(struct file *); +extern int cifs_flush(struct file *, fl_owner_t id); extern int cifs_file_mmap(struct file * , struct vm_area_struct *); extern const struct file_operations cifs_dir_ops; extern int cifs_dir_open(struct inode *inode, struct file *file); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 487ea8b3baaa..b4a18c1cab0a 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1417,7 +1417,7 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync) * As file closes, flush all cached write data for this inode checking * for write behind errors. */ -int cifs_flush(struct file *file) +int cifs_flush(struct file *file, fl_owner_t id) { struct inode * inode = file->f_dentry->d_inode; int rc = 0; diff --git a/fs/coda/file.c b/fs/coda/file.c index 7c2642431fa5..cc66c681bd11 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -164,7 +164,7 @@ int coda_open(struct inode *coda_inode, struct file *coda_file) return 0; } -int coda_flush(struct file *coda_file) +int coda_flush(struct file *coda_file, fl_owner_t id) { unsigned short flags = coda_file->f_flags & ~O_EXCL; unsigned short coda_flags = coda_flags_to_cflags(flags); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index fc342cf7c2cc..087f3b734f40 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -169,7 +169,7 @@ static int fuse_release(struct inode *inode, struct file *file) return fuse_release_common(inode, file, 0); } -static int fuse_flush(struct file *file) +static int fuse_flush(struct file *file, fl_owner_t id) { struct inode *inode = file->f_dentry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); diff --git a/fs/locks.c b/fs/locks.c index e588e1c265f7..f8a634ac1121 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1907,7 +1907,7 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner) return; lock.fl_type = F_UNLCK; - lock.fl_flags = FL_POSIX; + lock.fl_flags = FL_POSIX | FL_CLOSE; lock.fl_start = 0; lock.fl_end = OFFSET_MAX; lock.fl_owner = owner; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index fade02c15e6e..fa05c027ea11 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -43,7 +43,7 @@ static int nfs_file_mmap(struct file *, struct vm_area_struct *); static ssize_t nfs_file_sendfile(struct file *, loff_t *, size_t, read_actor_t, void *); static ssize_t nfs_file_read(struct kiocb *, char __user *, size_t, loff_t); static ssize_t nfs_file_write(struct kiocb *, const char __user *, size_t, loff_t); -static int nfs_file_flush(struct file *); +static int nfs_file_flush(struct file *, fl_owner_t id); static int nfs_fsync(struct file *, struct dentry *dentry, int datasync); static int nfs_check_flags(int flags); static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl); @@ -188,7 +188,7 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) * */ static int -nfs_file_flush(struct file *file) +nfs_file_flush(struct file *file, fl_owner_t id) { struct nfs_open_context *ctx = (struct nfs_open_context *)file->private_data; struct inode *inode = file->f_dentry->d_inode; diff --git a/fs/open.c b/fs/open.c index a37ff861108f..5fb16e5267dc 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1152,7 +1152,7 @@ int filp_close(struct file *filp, fl_owner_t id) } if (filp->f_op && filp->f_op->flush) - retval = filp->f_op->flush(filp); + retval = filp->f_op->flush(filp, id); dnotify_flush(filp, id); locks_remove_posix(filp, id); diff --git a/include/linux/coda_linux.h b/include/linux/coda_linux.h index b3ecf8f71d97..7b5c5df5cb69 100644 --- a/include/linux/coda_linux.h +++ b/include/linux/coda_linux.h @@ -36,7 +36,7 @@ extern const struct file_operations coda_ioctl_operations; /* operations shared over more than one file */ int coda_open(struct inode *i, struct file *f); -int coda_flush(struct file *f); +int coda_flush(struct file *f, fl_owner_t id); int coda_release(struct inode *i, struct file *f); int coda_permission(struct inode *inode, int mask, struct nameidata *nd); int coda_revalidate_inode(struct dentry *); diff --git a/include/linux/fs.h b/include/linux/fs.h index e917403f4d58..56d8bf0d0a77 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -683,6 +683,7 @@ extern spinlock_t files_lock; #define FL_FLOCK 2 #define FL_ACCESS 8 /* not trying to lock, just looking */ #define FL_LEASE 32 /* lease held on this file */ +#define FL_CLOSE 64 /* unlock on close */ #define FL_SLEEP 128 /* A blocking lock */ /* @@ -1025,7 +1026,7 @@ struct file_operations { long (*compat_ioctl) (struct file *, unsigned int, unsigned long); int (*mmap) (struct file *, struct vm_area_struct *); int (*open) (struct inode *, struct file *); - int (*flush) (struct file *); + int (*flush) (struct file *, fl_owner_t id); int (*release) (struct inode *, struct file *); int (*fsync) (struct file *, struct dentry *, int datasync); int (*aio_fsync) (struct kiocb *, int datasync); diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 0a2a24b6ebe4..02e6f6798972 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -359,7 +359,7 @@ static ssize_t mqueue_read_file(struct file *filp, char __user *u_data, return count; } -static int mqueue_flush_file(struct file *filp) +static int mqueue_flush_file(struct file *filp, fl_owner_t id) { struct mqueue_inode_info *info = MQUEUE_I(filp->f_dentry->d_inode); -- cgit v1.2.3 From b0904e147f7cbe4be3b4dae49ddccd627bb66f16 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Fri, 23 Jun 2006 02:05:13 -0700 Subject: [PATCH] fs/locks.c: make posix_locks_deadlock() static We can now make posix_locks_deadlock() static. Signed-off-by: Adrian Bunk Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/locks.c | 4 +--- include/linux/fs.h | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'fs/locks.c') diff --git a/fs/locks.c b/fs/locks.c index f8a634ac1121..1ad29c9b6252 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -703,7 +703,7 @@ EXPORT_SYMBOL(posix_test_lock); * from a broken NFS client. But broken NFS clients have a lot more to * worry about than proper deadlock detection anyway... --okir */ -int posix_locks_deadlock(struct file_lock *caller_fl, +static int posix_locks_deadlock(struct file_lock *caller_fl, struct file_lock *block_fl) { struct list_head *tmp; @@ -722,8 +722,6 @@ next_task: return 0; } -EXPORT_SYMBOL(posix_locks_deadlock); - /* Try to create a FLOCK lock on filp. We always insert new FLOCK locks * at the head of the list, but that's secret knowledge known only to * flock_lock_file and posix_lock_file. diff --git a/include/linux/fs.h b/include/linux/fs.h index 56d8bf0d0a77..dba4cbd157ee 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -776,7 +776,6 @@ extern int posix_lock_file_conf(struct file *, struct file_lock *, struct file_l extern int posix_lock_file(struct file *, struct file_lock *); extern int posix_lock_file_wait(struct file *, struct file_lock *); extern int posix_unblock_lock(struct file *, struct file_lock *); -extern int posix_locks_deadlock(struct file_lock *, struct file_lock *); extern int flock_lock_file_wait(struct file *filp, struct file_lock *fl); extern int __break_lease(struct inode *inode, unsigned int flags); extern void lease_get_mtime(struct inode *, struct timespec *time); -- cgit v1.2.3