diff options
Diffstat (limited to 'fs')
45 files changed, 1301 insertions, 411 deletions
diff --git a/fs/9p/fid.c b/fs/9p/fid.c index dfebdbe7440e..3031e3233dd6 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -26,7 +26,6 @@ #include <linux/fs.h> #include <linux/sched.h> #include <linux/idr.h> -#include <asm/semaphore.h> #include <net/9p/9p.h> #include <net/9p/client.h> @@ -444,22 +444,27 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len, struct bio_map_data { struct bio_vec *iovecs; - void __user *userptr; + int nr_sgvecs; + struct sg_iovec *sgvecs; }; -static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio) +static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio, + struct sg_iovec *iov, int iov_count) { memcpy(bmd->iovecs, bio->bi_io_vec, sizeof(struct bio_vec) * bio->bi_vcnt); + memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count); + bmd->nr_sgvecs = iov_count; bio->bi_private = bmd; } static void bio_free_map_data(struct bio_map_data *bmd) { kfree(bmd->iovecs); + kfree(bmd->sgvecs); kfree(bmd); } -static struct bio_map_data *bio_alloc_map_data(int nr_segs) +static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count) { struct bio_map_data *bmd = kmalloc(sizeof(*bmd), GFP_KERNEL); @@ -467,13 +472,71 @@ static struct bio_map_data *bio_alloc_map_data(int nr_segs) return NULL; bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, GFP_KERNEL); - if (bmd->iovecs) + if (!bmd->iovecs) { + kfree(bmd); + return NULL; + } + + bmd->sgvecs = kmalloc(sizeof(struct sg_iovec) * iov_count, GFP_KERNEL); + if (bmd->sgvecs) return bmd; + kfree(bmd->iovecs); kfree(bmd); return NULL; } +static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count, + int uncopy) +{ + int ret = 0, i; + struct bio_vec *bvec; + int iov_idx = 0; + unsigned int iov_off = 0; + int read = bio_data_dir(bio) == READ; + + __bio_for_each_segment(bvec, bio, i, 0) { + char *bv_addr = page_address(bvec->bv_page); + unsigned int bv_len = bvec->bv_len; + + while (bv_len && iov_idx < iov_count) { + unsigned int bytes; + char *iov_addr; + + bytes = min_t(unsigned int, + iov[iov_idx].iov_len - iov_off, bv_len); + iov_addr = iov[iov_idx].iov_base + iov_off; + + if (!ret) { + if (!read && !uncopy) + ret = copy_from_user(bv_addr, iov_addr, + bytes); + if (read && uncopy) + ret = copy_to_user(iov_addr, bv_addr, + bytes); + + if (ret) + ret = -EFAULT; + } + + bv_len -= bytes; + bv_addr += bytes; + iov_addr += bytes; + iov_off += bytes; + + if (iov[iov_idx].iov_len == iov_off) { + iov_idx++; + iov_off = 0; + } + } + + if (uncopy) + __free_page(bvec->bv_page); + } + + return ret; +} + /** * bio_uncopy_user - finish previously mapped bio * @bio: bio being terminated @@ -484,55 +547,56 @@ static struct bio_map_data *bio_alloc_map_data(int nr_segs) int bio_uncopy_user(struct bio *bio) { struct bio_map_data *bmd = bio->bi_private; - const int read = bio_data_dir(bio) == READ; - struct bio_vec *bvec; - int i, ret = 0; + int ret; - __bio_for_each_segment(bvec, bio, i, 0) { - char *addr = page_address(bvec->bv_page); - unsigned int len = bmd->iovecs[i].bv_len; + ret = __bio_copy_iov(bio, bmd->sgvecs, bmd->nr_sgvecs, 1); - if (read && !ret && copy_to_user(bmd->userptr, addr, len)) - ret = -EFAULT; - - __free_page(bvec->bv_page); - bmd->userptr += len; - } bio_free_map_data(bmd); bio_put(bio); return ret; } /** - * bio_copy_user - copy user data to bio + * bio_copy_user_iov - copy user data to bio * @q: destination block queue - * @uaddr: start of user address - * @len: length in bytes + * @iov: the iovec. + * @iov_count: number of elements in the iovec * @write_to_vm: bool indicating writing to pages or not * * Prepares and returns a bio for indirect user io, bouncing data * to/from kernel pages as necessary. Must be paired with * call bio_uncopy_user() on io completion. */ -struct bio *bio_copy_user(struct request_queue *q, unsigned long uaddr, - unsigned int len, int write_to_vm) +struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov, + int iov_count, int write_to_vm) { - unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - unsigned long start = uaddr >> PAGE_SHIFT; struct bio_map_data *bmd; struct bio_vec *bvec; struct page *page; struct bio *bio; int i, ret; + int nr_pages = 0; + unsigned int len = 0; - bmd = bio_alloc_map_data(end - start); + for (i = 0; i < iov_count; i++) { + unsigned long uaddr; + unsigned long end; + unsigned long start; + + uaddr = (unsigned long)iov[i].iov_base; + end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT; + start = uaddr >> PAGE_SHIFT; + + nr_pages += end - start; + len += iov[i].iov_len; + } + + bmd = bio_alloc_map_data(nr_pages, iov_count); if (!bmd) return ERR_PTR(-ENOMEM); - bmd->userptr = (void __user *) uaddr; - ret = -ENOMEM; - bio = bio_alloc(GFP_KERNEL, end - start); + bio = bio_alloc(GFP_KERNEL, nr_pages); if (!bio) goto out_bmd; @@ -564,22 +628,12 @@ struct bio *bio_copy_user(struct request_queue *q, unsigned long uaddr, * success */ if (!write_to_vm) { - char __user *p = (char __user *) uaddr; - - /* - * for a write, copy in data to kernel pages - */ - ret = -EFAULT; - bio_for_each_segment(bvec, bio, i) { - char *addr = page_address(bvec->bv_page); - - if (copy_from_user(addr, p, bvec->bv_len)) - goto cleanup; - p += bvec->bv_len; - } + ret = __bio_copy_iov(bio, iov, iov_count, 0); + if (ret) + goto cleanup; } - bio_set_map_data(bmd, bio); + bio_set_map_data(bmd, bio, iov, iov_count); return bio; cleanup: bio_for_each_segment(bvec, bio, i) @@ -591,6 +645,28 @@ out_bmd: return ERR_PTR(ret); } +/** + * bio_copy_user - copy user data to bio + * @q: destination block queue + * @uaddr: start of user address + * @len: length in bytes + * @write_to_vm: bool indicating writing to pages or not + * + * Prepares and returns a bio for indirect user io, bouncing data + * to/from kernel pages as necessary. Must be paired with + * call bio_uncopy_user() on io completion. + */ +struct bio *bio_copy_user(struct request_queue *q, unsigned long uaddr, + unsigned int len, int write_to_vm) +{ + struct sg_iovec iov; + + iov.iov_base = (void __user *)uaddr; + iov.iov_len = len; + + return bio_copy_user_iov(q, &iov, 1, write_to_vm); +} + static struct bio *__bio_map_user_iov(struct request_queue *q, struct block_device *bdev, struct sg_iovec *iov, int iov_count, diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 350680fd7da7..0c3b618c15b3 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -23,7 +23,6 @@ #include <linux/buffer_head.h> #include <linux/vfs.h> #include <linux/mutex.h> -#include <asm/semaphore.h> #include <asm/uaccess.h> diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index d30ea8b433a2..7a8824f475f2 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -37,7 +37,6 @@ #include <linux/jhash.h> #include <linux/miscdevice.h> #include <linux/mutex.h> -#include <asm/semaphore.h> #include <asm/uaccess.h> #include <linux/dlm.h> diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 5deb8b74e649..08f647d8188d 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -253,7 +253,7 @@ static int find_group_dir(struct super_block *sb, struct inode *parent) * it has too few free inodes left (min_inodes) or * it has too few free blocks left (min_blocks) or * it's already running too large debt (max_debt). - * Parent's group is prefered, if it doesn't satisfy these + * Parent's group is preferred, if it doesn't satisfy these * conditions we search cyclically through the rest. If none * of the groups look good we just look for a group with more * free inodes than average (starting at parent's group). diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index c62006805427..b8a2990bab83 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -239,7 +239,7 @@ no_block: * @inode: owner * @ind: descriptor of indirect block. * - * This function returns the prefered place for block allocation. + * This function returns the preferred place for block allocation. * It is used when heuristic for sequential allocation fails. * Rules are: * + if there is a block to the left of our position - allocate near it. @@ -283,7 +283,7 @@ static unsigned long ext2_find_near(struct inode *inode, Indirect *ind) } /** - * ext2_find_goal - find a prefered place for allocation. + * ext2_find_goal - find a preferred place for allocation. * @inode: owner * @block: block we want * @partial: pointer to the last triple within a chain diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index b8ea11fee5c6..de876fa793e1 100644 --- a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c @@ -12,6 +12,7 @@ #include <linux/time.h> #include <linux/sched.h> #include <linux/compat.h> +#include <linux/mount.h> #include <linux/smp_lock.h> #include <asm/current.h> #include <asm/uaccess.h> @@ -23,6 +24,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) struct ext2_inode_info *ei = EXT2_I(inode); unsigned int flags; unsigned short rsv_window_size; + int ret; ext2_debug ("cmd = %u, arg = %lu\n", cmd, arg); @@ -34,14 +36,19 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case EXT2_IOC_SETFLAGS: { unsigned int oldflags; - if (IS_RDONLY(inode)) - return -EROFS; + ret = mnt_want_write(filp->f_path.mnt); + if (ret) + return ret; - if (!is_owner_or_cap(inode)) - return -EACCES; + if (!is_owner_or_cap(inode)) { + ret = -EACCES; + goto setflags_out; + } - if (get_user(flags, (int __user *) arg)) - return -EFAULT; + if (get_user(flags, (int __user *) arg)) { + ret = -EFAULT; + goto setflags_out; + } if (!S_ISDIR(inode->i_mode)) flags &= ~EXT2_DIRSYNC_FL; @@ -50,7 +57,8 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) /* Is it quota file? Do not allow user to mess with it */ if (IS_NOQUOTA(inode)) { mutex_unlock(&inode->i_mutex); - return -EPERM; + ret = -EPERM; + goto setflags_out; } oldflags = ei->i_flags; @@ -63,7 +71,8 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if ((flags ^ oldflags) & (EXT2_APPEND_FL | EXT2_IMMUTABLE_FL)) { if (!capable(CAP_LINUX_IMMUTABLE)) { mutex_unlock(&inode->i_mutex); - return -EPERM; + ret = -EPERM; + goto setflags_out; } } @@ -75,20 +84,26 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) ext2_set_inode_flags(inode); inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); - return 0; +setflags_out: + mnt_drop_write(filp->f_path.mnt); + return ret; } case EXT2_IOC_GETVERSION: return put_user(inode->i_generation, (int __user *) arg); case EXT2_IOC_SETVERSION: if (!is_owner_or_cap(inode)) return -EPERM; - if (IS_RDONLY(inode)) - return -EROFS; - if (get_user(inode->i_generation, (int __user *) arg)) - return -EFAULT; - inode->i_ctime = CURRENT_TIME_SEC; - mark_inode_dirty(inode); - return 0; + ret = mnt_want_write(filp->f_path.mnt); + if (ret) + return ret; + if (get_user(inode->i_generation, (int __user *) arg)) { + ret = -EFAULT; + } else { + inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); + } + mnt_drop_write(filp->f_path.mnt); + return ret; case EXT2_IOC_GETRSVSZ: if (test_opt(inode->i_sb, RESERVATION) && S_ISREG(inode->i_mode) @@ -102,15 +117,16 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode)) return -ENOTTY; - if (IS_RDONLY(inode)) - return -EROFS; - - if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER)) + if (!is_owner_or_cap(inode)) return -EACCES; if (get_user(rsv_window_size, (int __user *)arg)) return -EFAULT; + ret = mnt_want_write(filp->f_path.mnt); + if (ret) + return ret; + if (rsv_window_size > EXT2_MAX_RESERVE_BLOCKS) rsv_window_size = EXT2_MAX_RESERVE_BLOCKS; @@ -131,6 +147,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) rsv->rsv_goal_size = rsv_window_size; } mutex_unlock(&ei->truncate_mutex); + mnt_drop_write(filp->f_path.mnt); return 0; } default: diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index 4f4020c54683..96dd5573e49b 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -239,7 +239,7 @@ static int find_group_dir(struct super_block *sb, struct inode *parent) * it has too few free inodes left (min_inodes) or * it has too few free blocks left (min_blocks) or * it's already running too large debt (max_debt). - * Parent's group is prefered, if it doesn't satisfy these + * Parent's group is preferred, if it doesn't satisfy these * conditions we search cyclically through the rest. If none * of the groups look good we just look for a group with more * free inodes than average (starting at parent's group). diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index eb95670a27eb..c683609b0e3a 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -392,7 +392,7 @@ no_block: * @inode: owner * @ind: descriptor of indirect block. * - * This function returns the prefered place for block allocation. + * This function returns the preferred place for block allocation. * It is used when heuristic for sequential allocation fails. * Rules are: * + if there is a block to the left of our position - allocate near it. @@ -436,12 +436,12 @@ static ext3_fsblk_t ext3_find_near(struct inode *inode, Indirect *ind) } /** - * ext3_find_goal - find a prefered place for allocation. + * ext3_find_goal - find a preferred place for allocation. * @inode: owner * @block: block we want * @partial: pointer to the last triple within a chain * - * Normally this function find the prefered place for block allocation, + * Normally this function find the preferred place for block allocation, * returns it. */ diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c index 023a070f55f1..0d0c70151642 100644 --- a/fs/ext3/ioctl.c +++ b/fs/ext3/ioctl.c @@ -12,6 +12,7 @@ #include <linux/capability.h> #include <linux/ext3_fs.h> #include <linux/ext3_jbd.h> +#include <linux/mount.h> #include <linux/time.h> #include <linux/compat.h> #include <linux/smp_lock.h> @@ -38,14 +39,19 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, unsigned int oldflags; unsigned int jflag; - if (IS_RDONLY(inode)) - return -EROFS; + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; - if (!is_owner_or_cap(inode)) - return -EACCES; + if (!is_owner_or_cap(inode)) { + err = -EACCES; + goto flags_out; + } - if (get_user(flags, (int __user *) arg)) - return -EFAULT; + if (get_user(flags, (int __user *) arg)) { + err = -EFAULT; + goto flags_out; + } if (!S_ISDIR(inode->i_mode)) flags &= ~EXT3_DIRSYNC_FL; @@ -54,7 +60,8 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, /* Is it quota file? Do not allow user to mess with it */ if (IS_NOQUOTA(inode)) { mutex_unlock(&inode->i_mutex); - return -EPERM; + err = -EPERM; + goto flags_out; } oldflags = ei->i_flags; @@ -70,7 +77,8 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) { if (!capable(CAP_LINUX_IMMUTABLE)) { mutex_unlock(&inode->i_mutex); - return -EPERM; + err = -EPERM; + goto flags_out; } } @@ -81,7 +89,8 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) { if (!capable(CAP_SYS_RESOURCE)) { mutex_unlock(&inode->i_mutex); - return -EPERM; + err = -EPERM; + goto flags_out; } } @@ -89,7 +98,8 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, handle = ext3_journal_start(inode, 1); if (IS_ERR(handle)) { mutex_unlock(&inode->i_mutex); - return PTR_ERR(handle); + err = PTR_ERR(handle); + goto flags_out; } if (IS_SYNC(inode)) handle->h_sync = 1; @@ -115,6 +125,8 @@ flags_err: if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) err = ext3_change_inode_journal_flag(inode, jflag); mutex_unlock(&inode->i_mutex); +flags_out: + mnt_drop_write(filp->f_path.mnt); return err; } case EXT3_IOC_GETVERSION: @@ -129,14 +141,18 @@ flags_err: if (!is_owner_or_cap(inode)) return -EPERM; - if (IS_RDONLY(inode)) - return -EROFS; - if (get_user(generation, (int __user *) arg)) - return -EFAULT; - + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + if (get_user(generation, (int __user *) arg)) { + err = -EFAULT; + goto setversion_out; + } handle = ext3_journal_start(inode, 1); - if (IS_ERR(handle)) - return PTR_ERR(handle); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto setversion_out; + } err = ext3_reserve_inode_write(handle, inode, &iloc); if (err == 0) { inode->i_ctime = CURRENT_TIME_SEC; @@ -144,6 +160,8 @@ flags_err: err = ext3_mark_iloc_dirty(handle, inode, &iloc); } ext3_journal_stop(handle); +setversion_out: + mnt_drop_write(filp->f_path.mnt); return err; } #ifdef CONFIG_JBD_DEBUG @@ -179,18 +197,24 @@ flags_err: } return -ENOTTY; case EXT3_IOC_SETRSVSZ: { + int err; if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode)) return -ENOTTY; - if (IS_RDONLY(inode)) - return -EROFS; + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; - if (!is_owner_or_cap(inode)) - return -EACCES; + if (!is_owner_or_cap(inode)) { + err = -EACCES; + goto setrsvsz_out; + } - if (get_user(rsv_window_size, (int __user *)arg)) - return -EFAULT; + if (get_user(rsv_window_size, (int __user *)arg)) { + err = -EFAULT; + goto setrsvsz_out; + } if (rsv_window_size > EXT3_MAX_RESERVE_BLOCKS) rsv_window_size = EXT3_MAX_RESERVE_BLOCKS; @@ -208,7 +232,9 @@ flags_err: rsv->rsv_goal_size = rsv_window_size; } mutex_unlock(&ei->truncate_mutex); - return 0; +setrsvsz_out: + mnt_drop_write(filp->f_path.mnt); + return err; } case EXT3_IOC_GROUP_EXTEND: { ext3_fsblk_t n_blocks_count; @@ -218,17 +244,20 @@ flags_err: if (!capable(CAP_SYS_RESOURCE)) return -EPERM; - if (IS_RDONLY(inode)) - return -EROFS; - - if (get_user(n_blocks_count, (__u32 __user *)arg)) - return -EFAULT; + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + if (get_user(n_blocks_count, (__u32 __user *)arg)) { + err = -EFAULT; + goto group_extend_out; + } err = ext3_group_extend(sb, EXT3_SB(sb)->s_es, n_blocks_count); journal_lock_updates(EXT3_SB(sb)->s_journal); journal_flush(EXT3_SB(sb)->s_journal); journal_unlock_updates(EXT3_SB(sb)->s_journal); - +group_extend_out: + mnt_drop_write(filp->f_path.mnt); return err; } case EXT3_IOC_GROUP_ADD: { @@ -239,18 +268,22 @@ flags_err: if (!capable(CAP_SYS_RESOURCE)) return -EPERM; - if (IS_RDONLY(inode)) - return -EROFS; + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; if (copy_from_user(&input, (struct ext3_new_group_input __user *)arg, - sizeof(input))) - return -EFAULT; + sizeof(input))) { + err = -EFAULT; + goto group_add_out; + } err = ext3_group_add(sb, &input); journal_lock_updates(EXT3_SB(sb)->s_journal); journal_flush(EXT3_SB(sb)->s_journal); journal_unlock_updates(EXT3_SB(sb)->s_journal); - +group_add_out: + mnt_drop_write(filp->f_path.mnt); return err; } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 8036b9b5376b..486e46a3918d 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -305,7 +305,7 @@ static int find_group_dir(struct super_block *sb, struct inode *parent, * it has too few free inodes left (min_inodes) or * it has too few free blocks left (min_blocks) or * it's already running too large debt (max_debt). - * Parent's group is prefered, if it doesn't satisfy these + * Parent's group is preferred, if it doesn't satisfy these * conditions we search cyclically through the rest. If none * of the groups look good we just look for a group with more * free inodes than average (starting at parent's group). diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 945cbf6cb1fc..8fab233cb05f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -382,7 +382,7 @@ no_block: * @inode: owner * @ind: descriptor of indirect block. * - * This function returns the prefered place for block allocation. + * This function returns the preferred place for block allocation. * It is used when heuristic for sequential allocation fails. * Rules are: * + if there is a block to the left of our position - allocate near it. @@ -432,12 +432,12 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) } /** - * ext4_find_goal - find a prefered place for allocation. + * ext4_find_goal - find a preferred place for allocation. * @inode: owner * @block: block we want * @partial: pointer to the last triple within a chain * - * Normally this function find the prefered place for block allocation, + * Normally this function find the preferred place for block allocation, * returns it. */ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 2ed7c37f897e..25b13ede8086 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -15,6 +15,7 @@ #include <linux/time.h> #include <linux/compat.h> #include <linux/smp_lock.h> +#include <linux/mount.h> #include <asm/uaccess.h> int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, @@ -38,24 +39,25 @@ int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, unsigned int oldflags; unsigned int jflag; - if (IS_RDONLY(inode)) - return -EROFS; - if (!is_owner_or_cap(inode)) return -EACCES; if (get_user(flags, (int __user *) arg)) return -EFAULT; + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + if (!S_ISDIR(inode->i_mode)) flags &= ~EXT4_DIRSYNC_FL; + err = -EPERM; mutex_lock(&inode->i_mutex); /* Is it quota file? Do not allow user to mess with it */ - if (IS_NOQUOTA(inode)) { - mutex_unlock(&inode->i_mutex); - return -EPERM; - } + if (IS_NOQUOTA(inode)) + goto flags_out; + oldflags = ei->i_flags; /* The JOURNAL_DATA flag is modifiable only by root */ @@ -68,10 +70,8 @@ int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, * This test looks nicer. Thanks to Pauline Middelink */ if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) { - if (!capable(CAP_LINUX_IMMUTABLE)) { - mutex_unlock(&inode->i_mutex); - return -EPERM; - } + if (!capable(CAP_LINUX_IMMUTABLE)) + goto flags_out; } /* @@ -79,17 +79,14 @@ int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, * the relevant capability. */ if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { - if (!capable(CAP_SYS_RESOURCE)) { - mutex_unlock(&inode->i_mutex); - return -EPERM; - } + if (!capable(CAP_SYS_RESOURCE)) + goto flags_out; } - handle = ext4_journal_start(inode, 1); if (IS_ERR(handle)) { - mutex_unlock(&inode->i_mutex); - return PTR_ERR(handle); + err = PTR_ERR(handle); + goto flags_out; } if (IS_SYNC(inode)) handle->h_sync = 1; @@ -107,14 +104,14 @@ int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, err = ext4_mark_iloc_dirty(handle, inode, &iloc); flags_err: ext4_journal_stop(handle); - if (err) { - mutex_unlock(&inode->i_mutex); - return err; - } + if (err) + goto flags_out; if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) err = ext4_change_inode_journal_flag(inode, jflag); +flags_out: mutex_unlock(&inode->i_mutex); + mnt_drop_write(filp->f_path.mnt); return err; } case EXT4_IOC_GETVERSION: @@ -129,14 +126,20 @@ flags_err: if (!is_owner_or_cap(inode)) return -EPERM; - if (IS_RDONLY(inode)) - return -EROFS; - if (get_user(generation, (int __user *) arg)) - return -EFAULT; + + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + if (get_user(generation, (int __user *) arg)) { + err = -EFAULT; + goto setversion_out; + } handle = ext4_journal_start(inode, 1); - if (IS_ERR(handle)) - return PTR_ERR(handle); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto setversion_out; + } err = ext4_reserve_inode_write(handle, inode, &iloc); if (err == 0) { inode->i_ctime = ext4_current_time(inode); @@ -144,6 +147,8 @@ flags_err: err = ext4_mark_iloc_dirty(handle, inode, &iloc); } ext4_journal_stop(handle); +setversion_out: + mnt_drop_write(filp->f_path.mnt); return err; } #ifdef CONFIG_JBD2_DEBUG @@ -179,19 +184,21 @@ flags_err: } return -ENOTTY; case EXT4_IOC_SETRSVSZ: { + int err; if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode)) return -ENOTTY; - if (IS_RDONLY(inode)) - return -EROFS; - if (!is_owner_or_cap(inode)) return -EACCES; if (get_user(rsv_window_size, (int __user *)arg)) return -EFAULT; + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + if (rsv_window_size > EXT4_MAX_RESERVE_BLOCKS) rsv_window_size = EXT4_MAX_RESERVE_BLOCKS; @@ -208,6 +215,7 @@ flags_err: rsv->rsv_goal_size = rsv_window_size; } up_write(&ei->i_data_sem); + mnt_drop_write(filp->f_path.mnt); return 0; } case EXT4_IOC_GROUP_EXTEND: { @@ -218,16 +226,18 @@ flags_err: if (!capable(CAP_SYS_RESOURCE)) return -EPERM; - if (IS_RDONLY(inode)) - return -EROFS; - if (get_user(n_blocks_count, (__u32 __user *)arg)) return -EFAULT; + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count); jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); jbd2_journal_flush(EXT4_SB(sb)->s_journal); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + mnt_drop_write(filp->f_path.mnt); return err; } @@ -239,17 +249,19 @@ flags_err: if (!capable(CAP_SYS_RESOURCE)) return -EPERM; - if (IS_RDONLY(inode)) - return -EROFS; - if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg, sizeof(input))) return -EFAULT; + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + err = ext4_group_add(sb, &input); jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); jbd2_journal_flush(EXT4_SB(sb)->s_journal); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + mnt_drop_write(filp->f_path.mnt); return err; } diff --git a/fs/fat/file.c b/fs/fat/file.c index c614175876e0..2a3bed967041 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -8,6 +8,7 @@ #include <linux/capability.h> #include <linux/module.h> +#include <linux/mount.h> #include <linux/time.h> #include <linux/msdos_fs.h> #include <linux/smp_lock.h> @@ -46,10 +47,9 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp, mutex_lock(&inode->i_mutex); - if (IS_RDONLY(inode)) { - err = -EROFS; - goto up; - } + err = mnt_want_write(filp->f_path.mnt); + if (err) + goto up_no_drop_write; /* * ATTR_VOLUME and ATTR_DIR cannot be changed; this also @@ -105,7 +105,9 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp, MSDOS_I(inode)->i_attrs = attr & ATTR_UNUSED; mark_inode_dirty(inode); - up: +up: + mnt_drop_write(filp->f_path.mnt); +up_no_drop_write: mutex_unlock(&inode->i_mutex); return err; } diff --git a/fs/file_table.c b/fs/file_table.c index 986ff4ed0a7c..7a0a9b872251 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -42,6 +42,7 @@ static inline void file_free_rcu(struct rcu_head *head) static inline void file_free(struct file *f) { percpu_counter_dec(&nr_files); + file_check_state(f); call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); } @@ -199,6 +200,18 @@ int init_file(struct file *file, struct vfsmount *mnt, struct dentry *dentry, file->f_mapping = dentry->d_inode->i_mapping; file->f_mode = mode; file->f_op = fop; + + /* + * These mounts don't really matter in practice + * for r/o bind mounts. They aren't userspace- + * visible. We do this for consistency, and so + * that we can do debugging checks at __fput() + */ + if ((mode & FMODE_WRITE) && !special_file(dentry->d_inode->i_mode)) { + file_take_write(file); + error = mnt_want_write(mnt); + WARN_ON(error); + } return error; } EXPORT_SYMBOL(init_file); @@ -211,6 +224,31 @@ void fput(struct file *file) EXPORT_SYMBOL(fput); +/** + * drop_file_write_access - give up ability to write to a file + * @file: the file to which we will stop writing + * + * This is a central place which will give up the ability + * to write to @file, along with access to write through + * its vfsmount. + */ +void drop_file_write_access(struct file *file) +{ + struct vfsmount *mnt = file->f_path.mnt; + struct dentry *dentry = file->f_path.dentry; + struct inode *inode = dentry->d_inode; + + put_write_access(inode); + + if (special_file(inode->i_mode)) + return; + if (file_check_writeable(file) != 0) + return; + mnt_drop_write(mnt); + file_release_write(file); +} +EXPORT_SYMBOL_GPL(drop_file_write_access); + /* __fput is called from task context when aio completion releases the last * last use of a struct file *. Do not use otherwise. */ @@ -236,10 +274,10 @@ void __fput(struct file *file) if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL)) cdev_put(inode->i_cdev); fops_put(file->f_op); - if (file->f_mode & FMODE_WRITE) - put_write_access(inode); put_pid(file->f_owner.pid); file_kill(file); + if (file->f_mode & FMODE_WRITE) + drop_file_write_access(file); file->f_path.dentry = NULL; file->f_path.mnt = NULL; file_free(file); diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c index b60c0affbec5..f457d2ca51ab 100644 --- a/fs/hfsplus/ioctl.c +++ b/fs/hfsplus/ioctl.c @@ -14,6 +14,7 @@ #include <linux/capability.h> #include <linux/fs.h> +#include <linux/mount.h> #include <linux/sched.h> #include <linux/xattr.h> #include <asm/uaccess.h> @@ -35,25 +36,32 @@ int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, flags |= FS_NODUMP_FL; /* EXT2_NODUMP_FL */ return put_user(flags, (int __user *)arg); case HFSPLUS_IOC_EXT2_SETFLAGS: { - if (IS_RDONLY(inode)) - return -EROFS; - - if (!is_owner_or_cap(inode)) - return -EACCES; - - if (get_user(flags, (int __user *)arg)) - return -EFAULT; - + int err = 0; + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + + if (!is_owner_or_cap(inode)) { + err = -EACCES; + goto setflags_out; + } + if (get_user(flags, (int __user *)arg)) { + err = -EFAULT; + goto setflags_out; + } if (flags & (FS_IMMUTABLE_FL|FS_APPEND_FL) || HFSPLUS_I(inode).rootflags & (HFSPLUS_FLG_IMMUTABLE|HFSPLUS_FLG_APPEND)) { - if (!capable(CAP_LINUX_IMMUTABLE)) - return -EPERM; + if (!capable(CAP_LINUX_IMMUTABLE)) { + err = -EPERM; + goto setflags_out; + } } /* don't silently ignore unsupported ext2 flags */ - if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) - return -EOPNOTSUPP; - + if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) { + err = -EOPNOTSUPP; + goto setflags_out; + } if (flags & FS_IMMUTABLE_FL) { /* EXT2_IMMUTABLE_FL */ inode->i_flags |= S_IMMUTABLE; HFSPLUS_I(inode).rootflags |= HFSPLUS_FLG_IMMUTABLE; @@ -75,7 +83,9 @@ int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); - return 0; +setflags_out: + mnt_drop_write(filp->f_path.mnt); + return err; } default: return -ENOTTY; diff --git a/fs/inode.c b/fs/inode.c index 53245ffcf93d..27ee1af50d02 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1199,42 +1199,37 @@ void touch_atime(struct vfsmount *mnt, struct dentry *dentry) struct inode *inode = dentry->d_inode; struct timespec now; - if (inode->i_flags & S_NOATIME) + if (mnt_want_write(mnt)) return; + if (inode->i_flags & S_NOATIME) + goto out; if (IS_NOATIME(inode)) - return; + goto out; if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)) - return; + goto out; - /* - * We may have a NULL vfsmount when coming from NFSD - */ - if (mnt) { - if (mnt->mnt_flags & MNT_NOATIME) - return; - if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)) - return; - - if (mnt->mnt_flags & MNT_RELATIME) { - /* - * With relative atime, only update atime if the - * previous atime is earlier than either the ctime or - * mtime. - */ - if (timespec_compare(&inode->i_mtime, - &inode->i_atime) < 0 && - timespec_compare(&inode->i_ctime, - &inode->i_atime) < 0) - return; - } + if (mnt->mnt_flags & MNT_NOATIME) + goto out; + if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)) + goto out; + if (mnt->mnt_flags & MNT_RELATIME) { + /* + * With relative atime, only update atime if the previous + * atime is earlier than either the ctime or mtime. + */ + if (timespec_compare(&inode->i_mtime, &inode->i_atime) < 0 && + timespec_compare(&inode->i_ctime, &inode->i_atime) < 0) + goto out; } now = current_fs_time(inode->i_sb); if (timespec_equal(&inode->i_atime, &now)) - return; + goto out; inode->i_atime = now; mark_inode_dirty_sync(inode); +out: + mnt_drop_write(mnt); } EXPORT_SYMBOL(touch_atime); @@ -1255,10 +1250,13 @@ void file_update_time(struct file *file) struct inode *inode = file->f_path.dentry->d_inode; struct timespec now; int sync_it = 0; + int err; if (IS_NOCMTIME(inode)) return; - if (IS_RDONLY(inode)) + + err = mnt_want_write(file->f_path.mnt); + if (err) return; now = current_fs_time(inode->i_sb); @@ -1279,6 +1277,7 @@ void file_update_time(struct file *file) if (sync_it) mark_inode_dirty_sync(inode); + mnt_drop_write(file->f_path.mnt); } EXPORT_SYMBOL(file_update_time); diff --git a/fs/jffs2/jffs2_fs_i.h b/fs/jffs2/jffs2_fs_i.h index 0b78fdc9773b..a841f4973a74 100644 --- a/fs/jffs2/jffs2_fs_i.h +++ b/fs/jffs2/jffs2_fs_i.h @@ -15,7 +15,7 @@ #include <linux/version.h> #include <linux/rbtree.h> #include <linux/posix_acl.h> -#include <asm/semaphore.h> +#include <linux/semaphore.h> struct jffs2_inode_info { /* We need an internal mutex similar to inode->i_mutex. diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h index 3a2197f3c812..18fca2b9e531 100644 --- a/fs/jffs2/jffs2_fs_sb.h +++ b/fs/jffs2/jffs2_fs_sb.h @@ -16,7 +16,7 @@ #include <linux/spinlock.h> #include <linux/workqueue.h> #include <linux/completion.h> -#include <asm/semaphore.h> +#include <linux/semaphore.h> #include <linux/timer.h> #include <linux/wait.h> #include <linux/list.h> diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c index a1f8e375ad21..afe222bf300f 100644 --- a/fs/jfs/ioctl.c +++ b/fs/jfs/ioctl.c @@ -8,6 +8,7 @@ #include <linux/fs.h> #include <linux/ctype.h> #include <linux/capability.h> +#include <linux/mount.h> #include <linux/time.h> #include <linux/sched.h> #include <asm/current.h> @@ -65,23 +66,30 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return put_user(flags, (int __user *) arg); case JFS_IOC_SETFLAGS: { unsigned int oldflags; + int err; - if (IS_RDONLY(inode)) - return -EROFS; + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; - if (!is_owner_or_cap(inode)) - return -EACCES; - - if (get_user(flags, (int __user *) arg)) - return -EFAULT; + if (!is_owner_or_cap(inode)) { + err = -EACCES; + goto setflags_out; + } + if (get_user(flags, (int __user *) arg)) { + err = -EFAULT; + goto setflags_out; + } flags = jfs_map_ext2(flags, 1); if (!S_ISDIR(inode->i_mode)) flags &= ~JFS_DIRSYNC_FL; /* Is it quota file? Do not allow user to mess with it */ - if (IS_NOQUOTA(inode)) - return -EPERM; + if (IS_NOQUOTA(inode)) { + err = -EPERM; + goto setflags_out; + } /* Lock against other parallel changes of flags */ mutex_lock(&inode->i_mutex); @@ -98,7 +106,8 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) (JFS_APPEND_FL | JFS_IMMUTABLE_FL))) { if (!capable(CAP_LINUX_IMMUTABLE)) { mutex_unlock(&inode->i_mutex); - return -EPERM; + err = -EPERM; + goto setflags_out; } } @@ -110,7 +119,9 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) mutex_unlock(&inode->i_mutex); inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); - return 0; +setflags_out: + mnt_drop_write(filp->f_path.mnt); + return err; } default: return -ENOTTY; diff --git a/fs/locks.c b/fs/locks.c index 43c0af21a0c5..592faadbcec1 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -127,7 +127,6 @@ #include <linux/rcupdate.h> #include <linux/pid_namespace.h> -#include <asm/semaphore.h> #include <asm/uaccess.h> #define IS_POSIX(fl) (fl->fl_flags & FL_POSIX) diff --git a/fs/namei.c b/fs/namei.c index 8cf9bb9c2fc0..e179f71bfcb0 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1623,8 +1623,7 @@ int may_open(struct nameidata *nd, int acc_mode, int flag) return -EACCES; flag &= ~O_TRUNC; - } else if (IS_RDONLY(inode) && (acc_mode & MAY_WRITE)) - return -EROFS; + } error = vfs_permission(nd, acc_mode); if (error) @@ -1677,7 +1676,12 @@ int may_open(struct nameidata *nd, int acc_mode, int flag) return 0; } -static int open_namei_create(struct nameidata *nd, struct path *path, +/* + * Be careful about ever adding any more callers of this + * function. Its flags must be in the namei format, not + * what get passed to sys_open(). + */ +static int __open_namei_create(struct nameidata *nd, struct path *path, int flag, int mode) { int error; @@ -1696,26 +1700,56 @@ static int open_namei_create(struct nameidata *nd, struct path *path, } /* - * open_namei() + * Note that while the flag value (low two bits) for sys_open means: + * 00 - read-only + * 01 - write-only + * 10 - read-write + * 11 - special + * it is changed into + * 00 - no permissions needed + * 01 - read-permission + * 10 - write-permission + * 11 - read-write + * for the internal routines (ie open_namei()/follow_link() etc) + * This is more logical, and also allows the 00 "no perm needed" + * to be used for symlinks (where the permissions are checked + * later). * - * namei for open - this is in fact almost the whole open-routine. - * - * Note that the low bits of "flag" aren't the same as in the open - * system call - they are 00 - no permissions needed - * 01 - read permission needed - * 10 - write permission needed - * 11 - read/write permissions needed - * which is a lot more logical, and also allows the "no perm" needed - * for symlinks (where the permissions are checked later). - * SMP-safe +*/ +static inline int open_to_namei_flags(int flag) +{ + if ((flag+1) & O_ACCMODE) + flag++; + return flag; +} + +static int open_will_write_to_fs(int flag, struct inode *inode) +{ + /* + * We'll never write to the fs underlying + * a device file. + */ + if (special_file(inode->i_mode)) + return 0; + return (flag & O_TRUNC); +} + +/* + * Note that the low bits of the passed in "open_flag" + * are not the same as in the local variable "flag". See + * open_to_namei_flags() for more details. */ -int open_namei(int dfd, const char *pathname, int flag, - int mode, struct nameidata *nd) +struct file *do_filp_open(int dfd, const char *pathname, + int open_flag, int mode) { + struct file *filp; + struct nameidata nd; int acc_mode, error; struct path path; struct dentry *dir; int count = 0; + int will_write; + int flag = open_to_namei_flags(open_flag); acc_mode = ACC_MODE(flag); @@ -1733,18 +1767,19 @@ int open_namei(int dfd, const char *pathname, int flag, */ if (!(flag & O_CREAT)) { error = path_lookup_open(dfd, pathname, lookup_flags(flag), - nd, flag); + &nd, flag); if (error) - return error; + return ERR_PTR(error); goto ok; } /* * Create - we need to know the parent. */ - error = path_lookup_create(dfd,pathname,LOOKUP_PARENT,nd,flag,mode); + error = path_lookup_create(dfd, pathname, LOOKUP_PARENT, + &nd, flag, mode); if (error) - return error; + return ERR_PTR(error); /* * We have the parent and last component. First of all, check @@ -1752,14 +1787,14 @@ int open_namei(int dfd, const char *pathname, int flag, * will not do. */ error = -EISDIR; - if (nd->last_type != LAST_NORM || nd->last.name[nd->last.len]) + if (nd.last_type != LAST_NORM || nd.last.name[nd.last.len]) goto exit; - dir = nd->path.dentry; - nd->flags &= ~LOOKUP_PARENT; + dir = nd.path.dentry; + nd.flags &= ~LOOKUP_PARENT; mutex_lock(&dir->d_inode->i_mutex); - path.dentry = lookup_hash(nd); - path.mnt = nd->path.mnt; + path.dentry = lookup_hash(&nd); + path.mnt = nd.path.mnt; do_last: error = PTR_ERR(path.dentry); @@ -1768,18 +1803,31 @@ do_last: goto exit; } - if (IS_ERR(nd->intent.open.file)) { - mutex_unlock(&dir->d_inode->i_mutex); - error = PTR_ERR(nd->intent.open.file); - goto exit_dput; + if (IS_ERR(nd.intent.open.file)) { + error = PTR_ERR(nd.intent.open.file); + goto exit_mutex_unlock; } /* Negative dentry, just create the file */ if (!path.dentry->d_inode) { - error = open_namei_create(nd, &path, flag, mode); + /* + * This write is needed to ensure that a + * ro->rw transition does not occur between + * the time when the file is created and when + * a permanent write count is taken through + * the 'struct file' in nameidata_to_filp(). + */ + error = mnt_want_write(nd.path.mnt); if (error) + goto exit_mutex_unlock; + error = __open_namei_create(&nd, &path, flag, mode); + if (error) { + mnt_drop_write(nd.path.mnt); goto exit; - return 0; + } + filp = nameidata_to_filp(&nd, open_flag); + mnt_drop_write(nd.path.mnt); + return filp; } /* @@ -1804,23 +1852,52 @@ do_last: if (path.dentry->d_inode->i_op && path.dentry->d_inode->i_op->follow_link) goto do_link; - path_to_nameidata(&path, nd); + path_to_nameidata(&path, &nd); error = -EISDIR; if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode)) goto exit; ok: - error = may_open(nd, acc_mode, flag); - if (error) + /* + * Consider: + * 1. may_open() truncates a file + * 2. a rw->ro mount transition occurs + * 3. nameidata_to_filp() fails due to + * the ro mount. + * That would be inconsistent, and should + * be avoided. Taking this mnt write here + * ensures that (2) can not occur. + */ + will_write = open_will_write_to_fs(flag, nd.path.dentry->d_inode); + if (will_write) { + error = mnt_want_write(nd.path.mnt); + if (error) + goto exit; + } + error = may_open(&nd, acc_mode, flag); + if (error) { + if (will_write) + mnt_drop_write(nd.path.mnt); goto exit; - return 0; + } + filp = nameidata_to_filp(&nd, open_flag); + /* + * It is now safe to drop the mnt write + * because the filp has had a write taken + * on its behalf. + */ + if (will_write) + mnt_drop_write(nd.path.mnt); + return filp; +exit_mutex_unlock: + mutex_unlock(&dir->d_inode->i_mutex); exit_dput: - path_put_conditional(&path, nd); + path_put_conditional(&path, &nd); exit: - if (!IS_ERR(nd->intent.open.file)) - release_open_intent(nd); - path_put(&nd->path); - return error; + if (!IS_ERR(nd.intent.open.file)) + release_open_intent(&nd); + path_put(&nd.path); + return ERR_PTR(error); do_link: error = -ELOOP; @@ -1836,43 +1913,60 @@ do_link: * stored in nd->last.name and we will have to putname() it when we * are done. Procfs-like symlinks just set LAST_BIND. */ - nd->flags |= LOOKUP_PARENT; - error = security_inode_follow_link(path.dentry, nd); + nd.flags |= LOOKUP_PARENT; + error = security_inode_follow_link(path.dentry, &nd); if (error) goto exit_dput; - error = __do_follow_link(&path, nd); + error = __do_follow_link(&path, &nd); if (error) { /* Does someone understand code flow here? Or it is only * me so stupid? Anathema to whoever designed this non-sense * with "intent.open". */ - release_open_intent(nd); - return error; + release_open_intent(&nd); + return ERR_PTR(error); } - nd->flags &= ~LOOKUP_PARENT; - if (nd->last_type == LAST_BIND) + nd.flags &= ~LOOKUP_PARENT; + if (nd.last_type == LAST_BIND) goto ok; error = -EISDIR; - if (nd->last_type != LAST_NORM) + if (nd.last_type != LAST_NORM) goto exit; - if (nd->last.name[nd->last.len]) { - __putname(nd->last.name); + if (nd.last.name[nd.last.len]) { + __putname(nd.last.name); goto exit; } error = -ELOOP; if (count++==32) { - __putname(nd->last.name); + __putname(nd.last.name); goto exit; } - dir = nd->path.dentry; + dir = nd.path.dentry; mutex_lock(&dir->d_inode->i_mutex); - path.dentry = lookup_hash(nd); - path.mnt = nd->path.mnt; - __putname(nd->last.name); + path.dentry = lookup_hash(&nd); + path.mnt = nd.path.mnt; + __putname(nd.last.name); goto do_last; } /** + * filp_open - open file and return file pointer + * + * @filename: path to open + * @flags: open flags as per the open(2) second argument + * @mode: mode for the new file if O_CREAT is set, else ignored + * + * This is the helper to open a file from kernelspace if you really + * have to. But in generally you should not do this, so please move + * along, nothing to see here.. + */ +struct file *filp_open(const char *filename, int flags, int mode) +{ + return do_filp_open(AT_FDCWD, filename, flags, mode); +} +EXPORT_SYMBOL(filp_open); + +/** * lookup_create - lookup a dentry, creating it if it doesn't exist * @nd: nameidata info * @is_dir: directory flag @@ -1945,6 +2039,23 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) return error; } +static int may_mknod(mode_t mode) +{ + switch (mode & S_IFMT) { + case S_IFREG: + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + case 0: /* zero mode translates to S_IFREG */ + return 0; + case S_IFDIR: + return -EPERM; + default: + return -EINVAL; + } +} + asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode, unsigned dev) { @@ -1963,12 +2074,19 @@ asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode, if (error) goto out; dentry = lookup_create(&nd, 0); - error = PTR_ERR(dentry); - + if (IS_ERR(dentry)) { + error = PTR_ERR(dentry); + goto out_unlock; + } if (!IS_POSIXACL(nd.path.dentry->d_inode)) mode &= ~current->fs->umask; - if (!IS_ERR(dentry)) { - switch (mode & S_IFMT) { + error = may_mknod(mode); + if (error) + goto out_dput; + error = mnt_want_write(nd.path.mnt); + if (error) + goto out_dput; + switch (mode & S_IFMT) { case 0: case S_IFREG: error = vfs_create(nd.path.dentry->d_inode,dentry,mode,&nd); break; @@ -1979,14 +2097,11 @@ asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode, case S_IFIFO: case S_IFSOCK: error = vfs_mknod(nd.path.dentry->d_inode,dentry,mode,0); break; - case S_IFDIR: - error = -EPERM; - break; - default: - error = -EINVAL; - } - dput(dentry); } + mnt_drop_write(nd.path.mnt); +out_dput: + dput(dentry); +out_unlock: mutex_unlock(&nd.path.dentry->d_inode->i_mutex); path_put(&nd.path); out: @@ -2044,7 +2159,12 @@ asmlinkage long sys_mkdirat(int dfd, const char __user *pathname, int mode) if (!IS_POSIXACL(nd.path.dentry->d_inode)) mode &= ~current->fs->umask; + error = mnt_want_write(nd.path.mnt); + if (error) + goto out_dput; error = vfs_mkdir(nd.path.dentry->d_inode, dentry, mode); + mnt_drop_write(nd.path.mnt); +out_dput: dput(dentry); out_unlock: mutex_unlock(&nd.path.dentry->d_inode->i_mutex); @@ -2151,7 +2271,12 @@ static long do_rmdir(int dfd, const char __user *pathname) error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto exit2; + error = mnt_want_write(nd.path.mnt); + if (error) + goto exit3; error = vfs_rmdir(nd.path.dentry->d_inode, dentry); + mnt_drop_write(nd.path.mnt); +exit3: dput(dentry); exit2: mutex_unlock(&nd.path.dentry->d_inode->i_mutex); @@ -2232,7 +2357,11 @@ static long do_unlinkat(int dfd, const char __user *pathname) inode = dentry->d_inode; if (inode) atomic_inc(&inode->i_count); + error = mnt_want_write(nd.path.mnt); + if (error) + goto exit2; error = vfs_unlink(nd.path.dentry->d_inode, dentry); + mnt_drop_write(nd.path.mnt); exit2: dput(dentry); } @@ -2313,7 +2442,12 @@ asmlinkage long sys_symlinkat(const char __user *oldname, if (IS_ERR(dentry)) goto out_unlock; + error = mnt_want_write(nd.path.mnt); + if (error) + goto out_dput; error = vfs_symlink(nd.path.dentry->d_inode, dentry, from, S_IALLUGO); + mnt_drop_write(nd.path.mnt); +out_dput: dput(dentry); out_unlock: mutex_unlock(&nd.path.dentry->d_inode->i_mutex); @@ -2408,7 +2542,12 @@ asmlinkage long sys_linkat(int olddfd, const char __user *oldname, error = PTR_ERR(new_dentry); if (IS_ERR(new_dentry)) goto out_unlock; + error = mnt_want_write(nd.path.mnt); + if (error) + goto out_dput; error = vfs_link(old_nd.path.dentry, nd.path.dentry->d_inode, new_dentry); + mnt_drop_write(nd.path.mnt); +out_dput: dput(new_dentry); out_unlock: mutex_unlock(&nd.path.dentry->d_inode->i_mutex); @@ -2634,8 +2773,12 @@ static int do_rename(int olddfd, const char *oldname, if (new_dentry == trap) goto exit5; + error = mnt_want_write(oldnd.path.mnt); + if (error) + goto exit5; error = vfs_rename(old_dir->d_inode, old_dentry, new_dir->d_inode, new_dentry); + mnt_drop_write(oldnd.path.mnt); exit5: dput(new_dentry); exit4: diff --git a/fs/namespace.c b/fs/namespace.c index 94f026ec990a..678f7ce060f2 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -17,6 +17,7 @@ #include <linux/quotaops.h> #include <linux/acct.h> #include <linux/capability.h> +#include <linux/cpumask.h> #include <linux/module.h> #include <linux/sysfs.h> #include <linux/seq_file.h> @@ -55,6 +56,8 @@ static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry) return tmp & (HASH_SIZE - 1); } +#define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16) + struct vfsmount *alloc_vfsmnt(const char *name) { struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); @@ -68,6 +71,7 @@ struct vfsmount *alloc_vfsmnt(const char *name) INIT_LIST_HEAD(&mnt->mnt_share); INIT_LIST_HEAD(&mnt->mnt_slave_list); INIT_LIST_HEAD(&mnt->mnt_slave); + atomic_set(&mnt->__mnt_writers, 0); if (name) { int size = strlen(name) + 1; char *newname = kmalloc(size, GFP_KERNEL); @@ -80,6 +84,263 @@ struct vfsmount *alloc_vfsmnt(const char *name) return mnt; } +/* + * Most r/o checks on a fs are for operations that take + * discrete amounts of time, like a write() or unlink(). + * We must keep track of when those operations start + * (for permission checks) and when they end, so that + * we can determine when writes are able to occur to + * a filesystem. + */ +/* + * __mnt_is_readonly: check whether a mount is read-only + * @mnt: the mount to check for its write status + * + * This shouldn't be used directly ouside of the VFS. + * It does not guarantee that the filesystem will stay + * r/w, just that it is right *now*. This can not and + * should not be used in place of IS_RDONLY(inode). + * mnt_want/drop_write() will _keep_ the filesystem + * r/w. + */ +int __mnt_is_readonly(struct vfsmount *mnt) +{ + if (mnt->mnt_flags & MNT_READONLY) + return 1; + if (mnt->mnt_sb->s_flags & MS_RDONLY) + return 1; + return 0; +} +EXPORT_SYMBOL_GPL(__mnt_is_readonly); + +struct mnt_writer { + /* + * If holding multiple instances of this lock, they + * must be ordered by cpu number. + */ + spinlock_t lock; + struct lock_class_key lock_class; /* compiles out with !lockdep */ + unsigned long count; + struct vfsmount *mnt; +} ____cacheline_aligned_in_smp; +static DEFINE_PER_CPU(struct mnt_writer, mnt_writers); + +static int __init init_mnt_writers(void) +{ + int cpu; + for_each_possible_cpu(cpu) { + struct mnt_writer *writer = &per_cpu(mnt_writers, cpu); + spin_lock_init(&writer->lock); + lockdep_set_class(&writer->lock, &writer->lock_class); + writer->count = 0; + } + return 0; +} +fs_initcall(init_mnt_writers); + +static void unlock_mnt_writers(void) +{ + int cpu; + struct mnt_writer *cpu_writer; + + for_each_possible_cpu(cpu) { + cpu_writer = &per_cpu(mnt_writers, cpu); + spin_unlock(&cpu_writer->lock); + } +} + +static inline void __clear_mnt_count(struct mnt_writer *cpu_writer) +{ + if (!cpu_writer->mnt) + return; + /* + * This is in case anyone ever leaves an invalid, + * old ->mnt and a count of 0. + */ + if (!cpu_writer->count) + return; + atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers); + cpu_writer->count = 0; +} + /* + * must hold cpu_writer->lock + */ +static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer, + struct vfsmount *mnt) +{ + if (cpu_writer->mnt == mnt) + return; + __clear_mnt_count(cpu_writer); + cpu_writer->mnt = mnt; +} + +/* + * Most r/o checks on a fs are for operations that take + * discrete amounts of time, like a write() or unlink(). + * We must keep track of when those operations start + * (for permission checks) and when they end, so that + * we can determine when writes are able to occur to + * a filesystem. + */ +/** + * mnt_want_write - get write access to a mount + * @mnt: the mount on which to take a write + * + * This tells the low-level filesystem that a write is + * about to be performed to it, and makes sure that + * writes are allowed before returning success. When + * the write operation is finished, mnt_drop_write() + * must be called. This is effectively a refcount. + */ +int mnt_want_write(struct vfsmount *mnt) +{ + int ret = 0; + struct mnt_writer *cpu_writer; + + cpu_writer = &get_cpu_var(mnt_writers); + spin_lock(&cpu_writer->lock); + if (__mnt_is_readonly(mnt)) { + ret = -EROFS; + goto out; + } + use_cpu_writer_for_mount(cpu_writer, mnt); + cpu_writer->count++; +out: + spin_unlock(&cpu_writer->lock); + put_cpu_var(mnt_writers); + return ret; +} +EXPORT_SYMBOL_GPL(mnt_want_write); + +static void lock_mnt_writers(void) +{ + int cpu; + struct mnt_writer *cpu_writer; + + for_each_possible_cpu(cpu) { + cpu_writer = &per_cpu(mnt_writers, cpu); + spin_lock(&cpu_writer->lock); + __clear_mnt_count(cpu_writer); + cpu_writer->mnt = NULL; + } +} + +/* + * These per-cpu write counts are not guaranteed to have + * matched increments and decrements on any given cpu. + * A file open()ed for write on one cpu and close()d on + * another cpu will imbalance this count. Make sure it + * does not get too far out of whack. + */ +static void handle_write_count_underflow(struct vfsmount *mnt) +{ + if (atomic_read(&mnt->__mnt_writers) >= + MNT_WRITER_UNDERFLOW_LIMIT) + return; + /* + * It isn't necessary to hold all of the locks + * at the same time, but doing it this way makes + * us share a lot more code. + */ + lock_mnt_writers(); + /* + * vfsmount_lock is for mnt_flags. + */ + spin_lock(&vfsmount_lock); + /* + * If coalescing the per-cpu writer counts did not + * get us back to a positive writer count, we have + * a bug. + */ + if ((atomic_read(&mnt->__mnt_writers) < 0) && + !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) { + printk(KERN_DEBUG "leak detected on mount(%p) writers " + "count: %d\n", + mnt, atomic_read(&mnt->__mnt_writers)); + WARN_ON(1); + /* use the flag to keep the dmesg spam down */ + mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT; + } + spin_unlock(&vfsmount_lock); + unlock_mnt_writers(); +} + +/** + * mnt_drop_write - give up write access to a mount + * @mnt: the mount on which to give up write access + * + * Tells the low-level filesystem that we are done + * performing writes to it. Must be matched with + * mnt_want_write() call above. + */ +void mnt_drop_write(struct vfsmount *mnt) +{ + int must_check_underflow = 0; + struct mnt_writer *cpu_writer; + + cpu_writer = &get_cpu_var(mnt_writers); + spin_lock(&cpu_writer->lock); + + use_cpu_writer_for_mount(cpu_writer, mnt); + if (cpu_writer->count > 0) { + cpu_writer->count--; + } else { + must_check_underflow = 1; + atomic_dec(&mnt->__mnt_writers); + } + + spin_unlock(&cpu_writer->lock); + /* + * Logically, we could call this each time, + * but the __mnt_writers cacheline tends to + * be cold, and makes this expensive. + */ + if (must_check_underflow) + handle_write_count_underflow(mnt); + /* + * This could be done right after the spinlock + * is taken because the spinlock keeps us on + * the cpu, and disables preemption. However, + * putting it here bounds the amount that + * __mnt_writers can underflow. Without it, + * we could theoretically wrap __mnt_writers. + */ + put_cpu_var(mnt_writers); +} +EXPORT_SYMBOL_GPL(mnt_drop_write); + +static int mnt_make_readonly(struct vfsmount *mnt) +{ + int ret = 0; + + lock_mnt_writers(); + /* + * With all the locks held, this value is stable + */ + if (atomic_read(&mnt->__mnt_writers) > 0) { + ret = -EBUSY; + goto out; + } + /* + * nobody can do a successful mnt_want_write() with all + * of the counts in MNT_DENIED_WRITE and the locks held. + */ + spin_lock(&vfsmount_lock); + if (!ret) + mnt->mnt_flags |= MNT_READONLY; + spin_unlock(&vfsmount_lock); +out: + unlock_mnt_writers(); + return ret; +} + +static void __mnt_unmake_readonly(struct vfsmount *mnt) +{ + spin_lock(&vfsmount_lock); + mnt->mnt_flags &= ~MNT_READONLY; + spin_unlock(&vfsmount_lock); +} + int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb) { mnt->mnt_sb = sb; @@ -271,7 +532,36 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root, static inline void __mntput(struct vfsmount *mnt) { + int cpu; struct super_block *sb = mnt->mnt_sb; + /* + * We don't have to hold all of the locks at the + * same time here because we know that we're the + * last reference to mnt and that no new writers + * can come in. + */ + for_each_possible_cpu(cpu) { + struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu); + if (cpu_writer->mnt != mnt) + continue; + spin_lock(&cpu_writer->lock); + atomic_add(cpu_writer->count, &mnt->__mnt_writers); + cpu_writer->count = 0; + /* + * Might as well do this so that no one + * ever sees the pointer and expects + * it to be valid. + */ + cpu_writer->mnt = NULL; + spin_unlock(&cpu_writer->lock); + } + /* + * This probably indicates that somebody messed + * up a mnt_want/drop_write() pair. If this + * happens, the filesystem was probably unable + * to make r/w->r/o transitions. + */ + WARN_ON(atomic_read(&mnt->__mnt_writers)); dput(mnt->mnt_root); free_vfsmnt(mnt); deactivate_super(sb); @@ -417,7 +707,7 @@ static int show_vfsmnt(struct seq_file *m, void *v) seq_putc(m, '.'); mangle(m, mnt->mnt_sb->s_subtype); } - seq_puts(m, mnt->mnt_sb->s_flags & MS_RDONLY ? " ro" : " rw"); + seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw"); for (fs_infop = fs_info; fs_infop->flag; fs_infop++) { if (mnt->mnt_sb->s_flags & fs_infop->flag) seq_puts(m, fs_infop->str); @@ -1019,6 +1309,23 @@ out: return err; } +static int change_mount_flags(struct vfsmount *mnt, int ms_flags) +{ + int error = 0; + int readonly_request = 0; + + if (ms_flags & MS_RDONLY) + readonly_request = 1; + if (readonly_request == __mnt_is_readonly(mnt)) + return 0; + + if (readonly_request) + error = mnt_make_readonly(mnt); + else + __mnt_unmake_readonly(mnt); + return error; +} + /* * change filesystem flags. dir should be a physical root of filesystem. * If you've mounted a non-root directory somewhere and want to do remount @@ -1041,7 +1348,10 @@ static noinline int do_remount(struct nameidata *nd, int flags, int mnt_flags, return -EINVAL; down_write(&sb->s_umount); - err = do_remount_sb(sb, flags, data, 0); + if (flags & MS_BIND) + err = change_mount_flags(nd->path.mnt, flags); + else + err = do_remount_sb(sb, flags, data, 0); if (!err) nd->path.mnt->mnt_flags = mnt_flags; up_write(&sb->s_umount); @@ -1425,6 +1735,8 @@ long do_mount(char *dev_name, char *dir_name, char *type_page, mnt_flags |= MNT_NODIRATIME; if (flags & MS_RELATIME) mnt_flags |= MNT_RELATIME; + if (flags & MS_RDONLY) + mnt_flags |= MNT_READONLY; flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT); diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c index c67b4bdcf719..ad8f167e54bc 100644 --- a/fs/ncpfs/ioctl.c +++ b/fs/ncpfs/ioctl.c @@ -14,6 +14,7 @@ #include <linux/ioctl.h> #include <linux/time.h> #include <linux/mm.h> +#include <linux/mount.h> #include <linux/highuid.h> #include <linux/smp_lock.h> #include <linux/vmalloc.h> @@ -261,7 +262,7 @@ ncp_get_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg) } #endif /* CONFIG_NCPFS_NLS */ -int ncp_ioctl(struct inode *inode, struct file *filp, +static int __ncp_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg) { struct ncp_server *server = NCP_SERVER(inode); @@ -822,6 +823,57 @@ outrel: return -EINVAL; } +static int ncp_ioctl_need_write(unsigned int cmd) +{ + switch (cmd) { + case NCP_IOC_GET_FS_INFO: + case NCP_IOC_GET_FS_INFO_V2: + case NCP_IOC_NCPREQUEST: + case NCP_IOC_SETDENTRYTTL: + case NCP_IOC_SIGN_INIT: + case NCP_IOC_LOCKUNLOCK: + case NCP_IOC_SET_SIGN_WANTED: + return 1; + case NCP_IOC_GETOBJECTNAME: + case NCP_IOC_SETOBJECTNAME: + case NCP_IOC_GETPRIVATEDATA: + case NCP_IOC_SETPRIVATEDATA: + case NCP_IOC_SETCHARSETS: + case NCP_IOC_GETCHARSETS: + case NCP_IOC_CONN_LOGGED_IN: + case NCP_IOC_GETDENTRYTTL: + case NCP_IOC_GETMOUNTUID2: + case NCP_IOC_SIGN_WANTED: + case NCP_IOC_GETROOT: + case NCP_IOC_SETROOT: + return 0; + default: + /* unkown IOCTL command, assume write */ + return 1; + } +} + +int ncp_ioctl(struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg) +{ + int ret; + + if (ncp_ioctl_need_write(cmd)) { + /* + * inside the ioctl(), any failures which + * are because of file_permission() are + * -EACCESS, so it seems consistent to keep + * that here. + */ + if (mnt_want_write(filp->f_path.mnt)) + return -EACCES; + } + ret = __ncp_ioctl(inode, filp, cmd, arg); + if (ncp_ioctl_need_write(cmd)) + mnt_drop_write(filp->f_path.mnt); + return ret; +} + #ifdef CONFIG_COMPAT long ncp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 6cea7479c5b4..d9e30ac2798d 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -967,7 +967,8 @@ static int is_atomic_open(struct inode *dir, struct nameidata *nd) if (nd->flags & LOOKUP_DIRECTORY) return 0; /* Are we trying to write to a read only partition? */ - if (IS_RDONLY(dir) && (nd->intent.open.flags & (O_CREAT|O_TRUNC|FMODE_WRITE))) + if (__mnt_is_readonly(nd->path.mnt) && + (nd->intent.open.flags & (O_CREAT|O_TRUNC|FMODE_WRITE))) return 0; return 1; } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index c593db047d8b..c309c881bd4e 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -658,14 +658,19 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return status; } } + status = mnt_want_write(cstate->current_fh.fh_export->ex_path.mnt); + if (status) + return status; status = nfs_ok; if (setattr->sa_acl != NULL) status = nfsd4_set_nfs4_acl(rqstp, &cstate->current_fh, setattr->sa_acl); if (status) - return status; + goto out; status = nfsd_setattr(rqstp, &cstate->current_fh, &setattr->sa_iattr, 0, (time_t)0); +out: + mnt_drop_write(cstate->current_fh.fh_export->ex_path.mnt); return status; } diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 1ff90625860f..145b3c877a27 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -46,6 +46,7 @@ #include <linux/scatterlist.h> #include <linux/crypto.h> #include <linux/sched.h> +#include <linux/mount.h> #define NFSDDBG_FACILITY NFSDDBG_PROC @@ -154,7 +155,11 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) dprintk("NFSD: nfsd4_create_clid_dir: DIRECTORY EXISTS\n"); goto out_put; } + status = mnt_want_write(rec_dir.path.mnt); + if (status) + goto out_put; status = vfs_mkdir(rec_dir.path.dentry->d_inode, dentry, S_IRWXU); + mnt_drop_write(rec_dir.path.mnt); out_put: dput(dentry); out_unlock: @@ -313,12 +318,17 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp) if (!rec_dir_init || !clp->cl_firststate) return; + status = mnt_want_write(rec_dir.path.mnt); + if (status) + goto out; clp->cl_firststate = 0; nfs4_save_user(&uid, &gid); status = nfsd4_unlink_clid_dir(clp->cl_recdir, HEXDIR_LEN-1); nfs4_reset_user(uid, gid); if (status == 0) nfsd4_sync_rec_dir(); + mnt_drop_write(rec_dir.path.mnt); +out: if (status) printk("NFSD: Failed to remove expired client state directory" " %.*s\n", HEXDIR_LEN, clp->cl_recdir); @@ -347,13 +357,17 @@ nfsd4_recdir_purge_old(void) { if (!rec_dir_init) return; + status = mnt_want_write(rec_dir.path.mnt); + if (status) + goto out; status = nfsd4_list_rec_dir(rec_dir.path.dentry, purge_old); if (status == 0) nfsd4_sync_rec_dir(); + mnt_drop_write(rec_dir.path.mnt); +out: if (status) printk("nfsd4: failed to purge old clients from recovery" " directory %s\n", rec_dir.path.dentry->d_name.name); - return; } static int diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index bcb97d8e8b8b..81a75f3081f4 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -41,6 +41,7 @@ #include <linux/sunrpc/svc.h> #include <linux/nfsd/nfsd.h> #include <linux/nfsd/cache.h> +#include <linux/file.h> #include <linux/mount.h> #include <linux/workqueue.h> #include <linux/smp_lock.h> @@ -1239,7 +1240,7 @@ static inline void nfs4_file_downgrade(struct file *filp, unsigned int share_access) { if (share_access & NFS4_SHARE_ACCESS_WRITE) { - put_write_access(filp->f_path.dentry->d_inode); + drop_file_write_access(filp); filp->f_mode = (filp->f_mode | FMODE_READ) & ~FMODE_WRITE; } } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 46f59d5365a0..304bf5f643c9 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -1255,23 +1255,35 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, err = 0; switch (type) { case S_IFREG: + host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); + if (host_err) + goto out_nfserr; host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL); break; case S_IFDIR: + host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); + if (host_err) + goto out_nfserr; host_err = vfs_mkdir(dirp, dchild, iap->ia_mode); break; case S_IFCHR: case S_IFBLK: case S_IFIFO: case S_IFSOCK: + host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); + if (host_err) + goto out_nfserr; host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev); break; default: printk("nfsd: bad file type %o in nfsd_create\n", type); host_err = -EINVAL; + goto out_nfserr; } - if (host_err < 0) + if (host_err < 0) { + mnt_drop_write(fhp->fh_export->ex_path.mnt); goto out_nfserr; + } if (EX_ISSYNC(fhp->fh_export)) { err = nfserrno(nfsd_sync_dir(dentry)); @@ -1282,6 +1294,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, err2 = nfsd_create_setattr(rqstp, resfhp, iap); if (err2) err = err2; + mnt_drop_write(fhp->fh_export->ex_path.mnt); /* * Update the file handle to get the new inode info. */ @@ -1359,6 +1372,9 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, v_atime = verifier[1]&0x7fffffff; } + host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); + if (host_err) + goto out_nfserr; if (dchild->d_inode) { err = 0; @@ -1390,12 +1406,15 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, case NFS3_CREATE_GUARDED: err = nfserr_exist; } + mnt_drop_write(fhp->fh_export->ex_path.mnt); goto out; } host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL); - if (host_err < 0) + if (host_err < 0) { + mnt_drop_write(fhp->fh_export->ex_path.mnt); goto out_nfserr; + } if (created) *created = 1; @@ -1420,6 +1439,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, if (err2) err = err2; + mnt_drop_write(fhp->fh_export->ex_path.mnt); /* * Update the filehandle to get the new inode info. */ @@ -1522,6 +1542,10 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, if (iap && (iap->ia_valid & ATTR_MODE)) mode = iap->ia_mode & S_IALLUGO; + host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); + if (host_err) + goto out_nfserr; + if (unlikely(path[plen] != 0)) { char *path_alloced = kmalloc(plen+1, GFP_KERNEL); if (path_alloced == NULL) @@ -1542,6 +1566,8 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, err = nfserrno(host_err); fh_unlock(fhp); + mnt_drop_write(fhp->fh_export->ex_path.mnt); + cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp); dput(dnew); if (err==0) err = cerr; @@ -1592,6 +1618,11 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, dold = tfhp->fh_dentry; dest = dold->d_inode; + host_err = mnt_want_write(tfhp->fh_export->ex_path.mnt); + if (host_err) { + err = nfserrno(host_err); + goto out_dput; + } host_err = vfs_link(dold, dirp, dnew); if (!host_err) { if (EX_ISSYNC(ffhp->fh_export)) { @@ -1605,7 +1636,8 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, else err = nfserrno(host_err); } - + mnt_drop_write(tfhp->fh_export->ex_path.mnt); +out_dput: dput(dnew); out_unlock: fh_unlock(ffhp); @@ -1678,13 +1710,20 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, if (ndentry == trap) goto out_dput_new; -#ifdef MSNFS - if ((ffhp->fh_export->ex_flags & NFSEXP_MSNFS) && + if (svc_msnfs(ffhp) && ((atomic_read(&odentry->d_count) > 1) || (atomic_read(&ndentry->d_count) > 1))) { host_err = -EPERM; - } else -#endif + goto out_dput_new; + } + + host_err = -EXDEV; + if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt) + goto out_dput_new; + host_err = mnt_want_write(ffhp->fh_export->ex_path.mnt); + if (host_err) + goto out_dput_new; + host_err = vfs_rename(fdir, odentry, tdir, ndentry); if (!host_err && EX_ISSYNC(tfhp->fh_export)) { host_err = nfsd_sync_dir(tdentry); @@ -1692,6 +1731,8 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, host_err = nfsd_sync_dir(fdentry); } + mnt_drop_write(ffhp->fh_export->ex_path.mnt); + out_dput_new: dput(ndentry); out_dput_old: @@ -1750,6 +1791,10 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, if (!type) type = rdentry->d_inode->i_mode & S_IFMT; + host_err = mnt_want_write(fhp->fh_export->ex_path.mnt); + if (host_err) + goto out_nfserr; + if (type != S_IFDIR) { /* It's UNLINK */ #ifdef MSNFS if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && @@ -1765,10 +1810,12 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, dput(rdentry); if (host_err) - goto out_nfserr; + goto out_drop; if (EX_ISSYNC(fhp->fh_export)) host_err = nfsd_sync_dir(dentry); +out_drop: + mnt_drop_write(fhp->fh_export->ex_path.mnt); out_nfserr: err = nfserrno(host_err); out: @@ -1865,7 +1912,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, inode->i_mode, IS_IMMUTABLE(inode)? " immut" : "", IS_APPEND(inode)? " append" : "", - IS_RDONLY(inode)? " ro" : ""); + __mnt_is_readonly(exp->ex_path.mnt)? " ro" : ""); dprintk(" owner %d/%d user %d/%d\n", inode->i_uid, inode->i_gid, current->fsuid, current->fsgid); #endif @@ -1876,7 +1923,8 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, */ if (!(acc & MAY_LOCAL_ACCESS)) if (acc & (MAY_WRITE | MAY_SATTR | MAY_TRUNC)) { - if (exp_rdonly(rqstp, exp) || IS_RDONLY(inode)) + if (exp_rdonly(rqstp, exp) || + __mnt_is_readonly(exp->ex_path.mnt)) return nfserr_rofs; if (/* (acc & MAY_WRITE) && */ IS_IMMUTABLE(inode)) return nfserr_perm; @@ -2039,6 +2087,9 @@ nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl) } else size = 0; + error = mnt_want_write(fhp->fh_export->ex_path.mnt); + if (error) + goto getout; if (size) error = vfs_setxattr(fhp->fh_dentry, name, value, size, 0); else { @@ -2050,6 +2101,7 @@ nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl) error = 0; } } + mnt_drop_write(fhp->fh_export->ex_path.mnt); getout: kfree(value); diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index b413166dd163..7b142f0ce995 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -60,10 +60,6 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, goto bail; } - status = -EROFS; - if (IS_RDONLY(inode)) - goto bail_unlock; - status = -EACCES; if (!is_owner_or_cap(inode)) goto bail_unlock; @@ -134,8 +130,13 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (get_user(flags, (int __user *) arg)) return -EFAULT; - return ocfs2_set_inode_attr(inode, flags, + status = mnt_want_write(filp->f_path.mnt); + if (status) + return status; + status = ocfs2_set_inode_attr(inode, flags, OCFS2_FL_MODIFIABLE); + mnt_drop_write(filp->f_path.mnt); + return status; case OCFS2_IOC_RESVSP: case OCFS2_IOC_RESVSP64: case OCFS2_IOC_UNRESVSP: diff --git a/fs/open.c b/fs/open.c index 3fa4e4ffce4c..b70e7666bb2c 100644 --- a/fs/open.c +++ b/fs/open.c @@ -244,21 +244,21 @@ static long do_sys_truncate(const char __user * path, loff_t length) if (!S_ISREG(inode->i_mode)) goto dput_and_out; - error = vfs_permission(&nd, MAY_WRITE); + error = mnt_want_write(nd.path.mnt); if (error) goto dput_and_out; - error = -EROFS; - if (IS_RDONLY(inode)) - goto dput_and_out; + error = vfs_permission(&nd, MAY_WRITE); + if (error) + goto mnt_drop_write_and_out; error = -EPERM; if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - goto dput_and_out; + goto mnt_drop_write_and_out; error = get_write_access(inode); if (error) - goto dput_and_out; + goto mnt_drop_write_and_out; /* * Make sure that there are no leases. get_write_access() protects @@ -276,6 +276,8 @@ static long do_sys_truncate(const char __user * path, loff_t length) put_write_and_out: put_write_access(inode); +mnt_drop_write_and_out: + mnt_drop_write(nd.path.mnt); dput_and_out: path_put(&nd.path); out: @@ -457,8 +459,17 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode) if(res || !(mode & S_IWOTH) || special_file(nd.path.dentry->d_inode->i_mode)) goto out_path_release; - - if(IS_RDONLY(nd.path.dentry->d_inode)) + /* + * This is a rare case where using __mnt_is_readonly() + * is OK without a mnt_want/drop_write() pair. Since + * no actual write to the fs is performed here, we do + * not need to telegraph to that to anyone. + * + * By doing this, we accept that this access is + * inherently racy and know that the fs may change + * state before we even see this result. + */ + if (__mnt_is_readonly(nd.path.mnt)) res = -EROFS; out_path_release: @@ -567,12 +578,12 @@ asmlinkage long sys_fchmod(unsigned int fd, mode_t mode) audit_inode(NULL, dentry); - err = -EROFS; - if (IS_RDONLY(inode)) + err = mnt_want_write(file->f_path.mnt); + if (err) goto out_putf; err = -EPERM; if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - goto out_putf; + goto out_drop_write; mutex_lock(&inode->i_mutex); if (mode == (mode_t) -1) mode = inode->i_mode; @@ -581,6 +592,8 @@ asmlinkage long sys_fchmod(unsigned int fd, mode_t mode) err = notify_change(dentry, &newattrs); mutex_unlock(&inode->i_mutex); +out_drop_write: + mnt_drop_write(file->f_path.mnt); out_putf: fput(file); out: @@ -600,13 +613,13 @@ asmlinkage long sys_fchmodat(int dfd, const char __user *filename, goto out; inode = nd.path.dentry->d_inode; - error = -EROFS; - if (IS_RDONLY(inode)) + error = mnt_want_write(nd.path.mnt); + if (error) goto dput_and_out; error = -EPERM; if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) - goto dput_and_out; + goto out_drop_write; mutex_lock(&inode->i_mutex); if (mode == (mode_t) -1) @@ -616,6 +629,8 @@ asmlinkage long sys_fchmodat(int dfd, const char __user *filename, error = notify_change(nd.path.dentry, &newattrs); mutex_unlock(&inode->i_mutex); +out_drop_write: + mnt_drop_write(nd.path.mnt); dput_and_out: path_put(&nd.path); out: @@ -638,9 +653,6 @@ static int chown_common(struct dentry * dentry, uid_t user, gid_t group) printk(KERN_ERR "chown_common: NULL inode\n"); goto out; } - error = -EROFS; - if (IS_RDONLY(inode)) - goto out; error = -EPERM; if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) goto out; @@ -671,7 +683,12 @@ asmlinkage long sys_chown(const char __user * filename, uid_t user, gid_t group) error = user_path_walk(filename, &nd); if (error) goto out; + error = mnt_want_write(nd.path.mnt); + if (error) + goto out_release; error = chown_common(nd.path.dentry, user, group); + mnt_drop_write(nd.path.mnt); +out_release: path_put(&nd.path); out: return error; @@ -691,7 +708,12 @@ asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user, error = __user_walk_fd(dfd, filename, follow, &nd); if (error) goto out; + error = mnt_want_write(nd.path.mnt); + if (error) + goto out_release; error = chown_common(nd.path.dentry, user, group); + mnt_drop_write(nd.path.mnt); +out_release: path_put(&nd.path); out: return error; @@ -705,7 +727,12 @@ asmlinkage long sys_lchown(const char __user * filename, uid_t user, gid_t group error = user_path_walk_link(filename, &nd); if (error) goto out; + error = mnt_want_write(nd.path.mnt); + if (error) + goto out_release; error = chown_common(nd.path.dentry, user, group); + mnt_drop_write(nd.path.mnt); +out_release: path_put(&nd.path); out: return error; @@ -722,14 +749,48 @@ asmlinkage long sys_fchown(unsigned int fd, uid_t user, gid_t group) if (!file) goto out; + error = mnt_want_write(file->f_path.mnt); + if (error) + goto out_fput; dentry = file->f_path.dentry; audit_inode(NULL, dentry); error = chown_common(dentry, user, group); + mnt_drop_write(file->f_path.mnt); +out_fput: fput(file); out: return error; } +/* + * You have to be very careful that these write + * counts get cleaned up in error cases and + * upon __fput(). This should probably never + * be called outside of __dentry_open(). + */ +static inline int __get_file_write_access(struct inode *inode, + struct vfsmount *mnt) +{ + int error; + error = get_write_access(inode); + if (error) + return error; + /* + * Do not take mount writer counts on + * special files since no writes to + * the mount itself will occur. + */ + if (!special_file(inode->i_mode)) { + /* + * Balanced in __fput() + */ + error = mnt_want_write(mnt); + if (error) + put_write_access(inode); + } + return error; +} + static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags, struct file *f, int (*open)(struct inode *, struct file *)) @@ -742,9 +803,11 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, FMODE_PREAD | FMODE_PWRITE; inode = dentry->d_inode; if (f->f_mode & FMODE_WRITE) { - error = get_write_access(inode); + error = __get_file_write_access(inode, mnt); if (error) goto cleanup_file; + if (!special_file(inode->i_mode)) + file_take_write(f); } f->f_mapping = inode->i_mapping; @@ -784,8 +847,19 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, cleanup_all: fops_put(f->f_op); - if (f->f_mode & FMODE_WRITE) + if (f->f_mode & FMODE_WRITE) { put_write_access(inode); + if (!special_file(inode->i_mode)) { + /* + * We don't consider this a real + * mnt_want/drop_write() pair + * because it all happenend right + * here, so just reset the state. + */ + file_reset_write(f); + mnt_drop_write(mnt); + } + } file_kill(f); f->f_path.dentry = NULL; f->f_path.mnt = NULL; @@ -796,43 +870,6 @@ cleanup_file: return ERR_PTR(error); } -/* - * Note that while the flag value (low two bits) for sys_open means: - * 00 - read-only - * 01 - write-only - * 10 - read-write - * 11 - special - * it is changed into - * 00 - no permissions needed - * 01 - read-permission - * 10 - write-permission - * 11 - read-write - * for the internal routines (ie open_namei()/follow_link() etc). 00 is - * used by symlinks. - */ -static struct file *do_filp_open(int dfd, const char *filename, int flags, - int mode) -{ - int namei_flags, error; - struct nameidata nd; - - namei_flags = flags; - if ((namei_flags+1) & O_ACCMODE) - namei_flags++; - - error = open_namei(dfd, filename, namei_flags, mode, &nd); - if (!error) - return nameidata_to_filp(&nd, flags); - - return ERR_PTR(error); -} - -struct file *filp_open(const char *filename, int flags, int mode) -{ - return do_filp_open(AT_FDCWD, filename, flags, mode); -} -EXPORT_SYMBOL(filp_open); - /** * lookup_instantiate_filp - instantiates the open intent filp * @nd: pointer to nameidata diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 03f808c5b79d..6149e4b58c88 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -473,6 +473,10 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev) return 0; if (IS_ERR(state)) /* I/O error reading the partition table */ return -EIO; + + /* tell userspace that the media / partition table may have changed */ + kobject_uevent(&disk->dev.kobj, KOBJ_CHANGE); + for (p = 1; p < state->limit; p++) { sector_t size = state->parts[p].size; sector_t from = state->parts[p].from; diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index e0f0f098a523..74363a7aacbc 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -4,6 +4,7 @@ #include <linux/capability.h> #include <linux/fs.h> +#include <linux/mount.h> #include <linux/reiserfs_fs.h> #include <linux/time.h> #include <asm/uaccess.h> @@ -25,6 +26,7 @@ int reiserfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg) { unsigned int flags; + int err = 0; switch (cmd) { case REISERFS_IOC_UNPACK: @@ -48,50 +50,67 @@ int reiserfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, if (!reiserfs_attrs(inode->i_sb)) return -ENOTTY; - if (IS_RDONLY(inode)) - return -EROFS; + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; - if (!is_owner_or_cap(inode)) - return -EPERM; - - if (get_user(flags, (int __user *)arg)) - return -EFAULT; - - /* Is it quota file? Do not allow user to mess with it. */ - if (IS_NOQUOTA(inode)) - return -EPERM; + if (!is_owner_or_cap(inode)) { + err = -EPERM; + goto setflags_out; + } + if (get_user(flags, (int __user *)arg)) { + err = -EFAULT; + goto setflags_out; + } + /* + * Is it quota file? Do not allow user to mess with it + */ + if (IS_NOQUOTA(inode)) { + err = -EPERM; + goto setflags_out; + } if (((flags ^ REISERFS_I(inode)-> i_attrs) & (REISERFS_IMMUTABLE_FL | REISERFS_APPEND_FL)) - && !capable(CAP_LINUX_IMMUTABLE)) - return -EPERM; - + && !capable(CAP_LINUX_IMMUTABLE)) { + err = -EPERM; + goto setflags_out; + } if ((flags & REISERFS_NOTAIL_FL) && S_ISREG(inode->i_mode)) { int result; result = reiserfs_unpack(inode, filp); - if (result) - return result; + if (result) { + err = result; + goto setflags_out; + } } sd_attrs_to_i_attrs(flags, inode); REISERFS_I(inode)->i_attrs = flags; inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); - return 0; +setflags_out: + mnt_drop_write(filp->f_path.mnt); + return err; } case REISERFS_IOC_GETVERSION: return put_user(inode->i_generation, (int __user *)arg); case REISERFS_IOC_SETVERSION: if (!is_owner_or_cap(inode)) return -EPERM; - if (IS_RDONLY(inode)) - return -EROFS; - if (get_user(inode->i_generation, (int __user *)arg)) - return -EFAULT; + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + if (get_user(inode->i_generation, (int __user *)arg)) { + err = -EFAULT; + goto setversion_out; + } inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); - return 0; +setversion_out: + mnt_drop_write(filp->f_path.mnt); + return err; default: return -ENOTTY; } diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index bb05a3e51b93..060eb3f598e7 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -38,7 +38,7 @@ #include <asm/system.h> #include <linux/time.h> -#include <asm/semaphore.h> +#include <linux/semaphore.h> #include <linux/vmalloc.h> #include <linux/reiserfs_fs.h> diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 344b9b96cc56..d7c4935c1034 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -44,7 +44,6 @@ #include <net/checksum.h> #include <linux/smp_lock.h> #include <linux/stat.h> -#include <asm/semaphore.h> #define FL_READONLY 128 #define FL_DIR_SEM_HELD 256 diff --git a/fs/select.c b/fs/select.c index 5633fe980781..00f58c5c7e05 100644 --- a/fs/select.c +++ b/fs/select.c @@ -260,7 +260,7 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout) wait = NULL; if (retval || !*timeout || signal_pending(current)) break; - if(table.error) { + if (table.error) { retval = table.error; break; } diff --git a/fs/super.c b/fs/super.c index 09008dbd264e..1f8f05ede437 100644 --- a/fs/super.c +++ b/fs/super.c @@ -37,6 +37,7 @@ #include <linux/idr.h> #include <linux/kobject.h> #include <linux/mutex.h> +#include <linux/file.h> #include <asm/uaccess.h> @@ -567,10 +568,29 @@ static void mark_files_ro(struct super_block *sb) { struct file *f; +retry: file_list_lock(); list_for_each_entry(f, &sb->s_files, f_u.fu_list) { - if (S_ISREG(f->f_path.dentry->d_inode->i_mode) && file_count(f)) - f->f_mode &= ~FMODE_WRITE; + struct vfsmount *mnt; + if (!S_ISREG(f->f_path.dentry->d_inode->i_mode)) + continue; + if (!file_count(f)) + continue; + if (!(f->f_mode & FMODE_WRITE)) + continue; + f->f_mode &= ~FMODE_WRITE; + if (file_check_writeable(f) != 0) + continue; + file_release_write(f); + mnt = mntget(f->f_path.mnt); + file_list_unlock(); + /* + * This can sleep, so we can't hold + * the file_list_lock() spinlock. + */ + mnt_drop_write(mnt); + mntput(mnt); + goto retry; } file_list_unlock(); } diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 4948d9bc405d..a1c3a1fab7f0 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -20,6 +20,7 @@ #include <linux/idr.h> #include <linux/completion.h> #include <linux/mutex.h> +#include <linux/slab.h> #include "sysfs.h" DEFINE_MUTEX(sysfs_mutex); diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index baa663e69388..ade9a7e6a757 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -13,6 +13,7 @@ #include <linux/module.h> #include <linux/kobject.h> #include <linux/kallsyms.h> +#include <linux/slab.h> #include <linux/namei.h> #include <linux/poll.h> #include <linux/list.h> @@ -128,7 +129,7 @@ sysfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) ssize_t retval = 0; mutex_lock(&buffer->mutex); - if (buffer->needs_read_fill) { + if (buffer->needs_read_fill || *ppos == 0) { retval = fill_read_buffer(file->f_path.dentry,buffer); if (retval) goto out; @@ -409,8 +410,7 @@ static int sysfs_release(struct inode *inode, struct file *filp) * return POLLERR|POLLPRI, and select will return the fd whether * it is waiting for read, write, or exceptions. * Once poll/select indicates that the value has changed, you - * need to close and re-open the file, as simply seeking and reading - * again will not get new data, or reset the state of 'poll'. + * need to close and re-open the file, or seek to 0 and read again. * Reminder: this only works for attributes which actively support * it, and it is not possible to test an attribute from userspace * to see if it supports poll (Neither 'poll' nor 'select' return diff --git a/fs/utimes.c b/fs/utimes.c index b18da9c0b97f..a2bef77dc9c9 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -2,6 +2,7 @@ #include <linux/file.h> #include <linux/fs.h> #include <linux/linkage.h> +#include <linux/mount.h> #include <linux/namei.h> #include <linux/sched.h> #include <linux/stat.h> @@ -59,6 +60,7 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags struct inode *inode; struct iattr newattrs; struct file *f = NULL; + struct vfsmount *mnt; error = -EINVAL; if (times && (!nsec_valid(times[0].tv_nsec) || @@ -79,18 +81,20 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags if (!f) goto out; dentry = f->f_path.dentry; + mnt = f->f_path.mnt; } else { error = __user_walk_fd(dfd, filename, (flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW, &nd); if (error) goto out; dentry = nd.path.dentry; + mnt = nd.path.mnt; } inode = dentry->d_inode; - error = -EROFS; - if (IS_RDONLY(inode)) + error = mnt_want_write(mnt); + if (error) goto dput_and_out; /* Don't worry, the checks are done in inode_change_ok() */ @@ -98,7 +102,7 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags if (times) { error = -EPERM; if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) - goto dput_and_out; + goto mnt_drop_write_and_out; if (times[0].tv_nsec == UTIME_OMIT) newattrs.ia_valid &= ~ATTR_ATIME; @@ -118,22 +122,24 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags } else { error = -EACCES; if (IS_IMMUTABLE(inode)) - goto dput_and_out; + goto mnt_drop_write_and_out; if (!is_owner_or_cap(inode)) { if (f) { if (!(f->f_mode & FMODE_WRITE)) - goto dput_and_out; + goto mnt_drop_write_and_out; } else { error = vfs_permission(&nd, MAY_WRITE); if (error) - goto dput_and_out; + goto mnt_drop_write_and_out; } } } mutex_lock(&inode->i_mutex); error = notify_change(dentry, &newattrs); mutex_unlock(&inode->i_mutex); +mnt_drop_write_and_out: + mnt_drop_write(mnt); dput_and_out: if (f) fput(f); diff --git a/fs/xattr.c b/fs/xattr.c index 3acab1615460..f7062da505d4 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -11,6 +11,7 @@ #include <linux/slab.h> #include <linux/file.h> #include <linux/xattr.h> +#include <linux/mount.h> #include <linux/namei.h> #include <linux/security.h> #include <linux/syscalls.h> @@ -32,8 +33,6 @@ xattr_permission(struct inode *inode, const char *name, int mask) * filesystem or on an immutable / append-only inode. */ if (mask & MAY_WRITE) { - if (IS_RDONLY(inode)) - return -EROFS; if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) return -EPERM; } @@ -262,7 +261,11 @@ sys_setxattr(char __user *path, char __user *name, void __user *value, error = user_path_walk(path, &nd); if (error) return error; - error = setxattr(nd.path.dentry, name, value, size, flags); + error = mnt_want_write(nd.path.mnt); + if (!error) { + error = setxattr(nd.path.dentry, name, value, size, flags); + mnt_drop_write(nd.path.mnt); + } path_put(&nd.path); return error; } @@ -277,7 +280,11 @@ sys_lsetxattr(char __user *path, char __user *name, void __user *value, error = user_path_walk_link(path, &nd); if (error) return error; - error = setxattr(nd.path.dentry, name, value, size, flags); + error = mnt_want_write(nd.path.mnt); + if (!error) { + error = setxattr(nd.path.dentry, name, value, size, flags); + mnt_drop_write(nd.path.mnt); + } path_put(&nd.path); return error; } @@ -295,7 +302,12 @@ sys_fsetxattr(int fd, char __user *name, void __user *value, return error; dentry = f->f_path.dentry; audit_inode(NULL, dentry); - error = setxattr(dentry, name, value, size, flags); + error = mnt_want_write(f->f_path.mnt); + if (!error) { + error = setxattr(dentry, name, value, size, flags); + mnt_drop_write(f->f_path.mnt); + } +out_fput: fput(f); return error; } @@ -482,7 +494,11 @@ sys_removexattr(char __user *path, char __user *name) error = user_path_walk(path, &nd); if (error) return error; - error = removexattr(nd.path.dentry, name); + error = mnt_want_write(nd.path.mnt); + if (!error) { + error = removexattr(nd.path.dentry, name); + mnt_drop_write(nd.path.mnt); + } path_put(&nd.path); return error; } @@ -496,7 +512,11 @@ sys_lremovexattr(char __user *path, char __user *name) error = user_path_walk_link(path, &nd); if (error) return error; - error = removexattr(nd.path.dentry, name); + error = mnt_want_write(nd.path.mnt); + if (!error) { + error = removexattr(nd.path.dentry, name); + mnt_drop_write(nd.path.mnt); + } path_put(&nd.path); return error; } @@ -513,7 +533,11 @@ sys_fremovexattr(int fd, char __user *name) return error; dentry = f->f_path.dentry; audit_inode(NULL, dentry); - error = removexattr(dentry, name); + error = mnt_want_write(f->f_path.mnt); + if (!error) { + error = removexattr(dentry, name); + mnt_drop_write(f->f_path.mnt); + } fput(f); return error; } diff --git a/fs/xfs/linux-2.6/sema.h b/fs/xfs/linux-2.6/sema.h index 2009e6d922ce..3abe7e9ceb33 100644 --- a/fs/xfs/linux-2.6/sema.h +++ b/fs/xfs/linux-2.6/sema.h @@ -20,8 +20,8 @@ #include <linux/time.h> #include <linux/wait.h> +#include <linux/semaphore.h> #include <asm/atomic.h> -#include <asm/semaphore.h> /* * sema_t structure just maps to struct semaphore in Linux kernel. diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index bf7759793856..4ddb86b73c6b 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -535,8 +535,6 @@ xfs_attrmulti_attr_set( char *kbuf; int error = EFAULT; - if (IS_RDONLY(inode)) - return -EROFS; if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) return EPERM; if (len > XATTR_SIZE_MAX) @@ -562,8 +560,6 @@ xfs_attrmulti_attr_remove( char *name, __uint32_t flags) { - if (IS_RDONLY(inode)) - return -EROFS; if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) return EPERM; return xfs_attr_remove(XFS_I(inode), name, flags); @@ -573,6 +569,7 @@ STATIC int xfs_attrmulti_by_handle( xfs_mount_t *mp, void __user *arg, + struct file *parfilp, struct inode *parinode) { int error; @@ -626,13 +623,21 @@ xfs_attrmulti_by_handle( &ops[i].am_length, ops[i].am_flags); break; case ATTR_OP_SET: + ops[i].am_error = mnt_want_write(parfilp->f_path.mnt); + if (ops[i].am_error) + break; ops[i].am_error = xfs_attrmulti_attr_set(inode, attr_name, ops[i].am_attrvalue, ops[i].am_length, ops[i].am_flags); + mnt_drop_write(parfilp->f_path.mnt); break; case ATTR_OP_REMOVE: + ops[i].am_error = mnt_want_write(parfilp->f_path.mnt); + if (ops[i].am_error) + break; ops[i].am_error = xfs_attrmulti_attr_remove(inode, attr_name, ops[i].am_flags); + mnt_drop_write(parfilp->f_path.mnt); break; default: ops[i].am_error = EINVAL; @@ -1133,7 +1138,7 @@ xfs_ioctl( return xfs_attrlist_by_handle(mp, arg, inode); case XFS_IOC_ATTRMULTI_BY_HANDLE: - return xfs_attrmulti_by_handle(mp, arg, inode); + return xfs_attrmulti_by_handle(mp, arg, filp, inode); case XFS_IOC_SWAPEXT: { error = xfs_swapext((struct xfs_swapext __user *)arg); diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 0c958cf77758..a1237dad6430 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -155,13 +155,6 @@ xfs_ichgtime_fast( */ ASSERT((flags & XFS_ICHGTIME_ACC) == 0); - /* - * We're not supposed to change timestamps in readonly-mounted - * filesystems. Throw it away if anyone asks us. - */ - if (unlikely(IS_RDONLY(inode))) - return; - if (flags & XFS_ICHGTIME_MOD) { tvp = &inode->i_mtime; ip->i_d.di_mtime.t_sec = (__int32_t)tvp->tv_sec; diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 21c0dbc74093..1ebd8004469c 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -51,6 +51,7 @@ #include "xfs_vnodeops.h" #include <linux/capability.h> +#include <linux/mount.h> #include <linux/writeback.h> @@ -670,10 +671,16 @@ start: if (new_size > xip->i_size) xip->i_new_size = new_size; - if (likely(!(ioflags & IO_INVIS))) { + /* + * We're not supposed to change timestamps in readonly-mounted + * filesystems. Throw it away if anyone asks us. + */ + if (likely(!(ioflags & IO_INVIS) && + !mnt_want_write(file->f_path.mnt))) { file_update_time(file); xfs_ichgtime_fast(xip, inode, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); + mnt_drop_write(file->f_path.mnt); } /* |