From 676a0675cf9200ac047fb50825f80867b3bb733b Mon Sep 17 00:00:00 2001 From: Jim Somerville Date: Thu, 21 Feb 2013 16:41:59 -0800 Subject: inotify: remove broken mask checks causing unmount to be EINVAL Running the command: inotifywait -e unmount /mnt/disk immediately aborts with a -EINVAL return code. This is however a valid parameter. This abort occurs only if unmount is the sole event parameter. If other event parameters are supplied, then the unmount event wait will work. The problem was introduced by commit 44b350fc23e ("inotify: Fix mask checks"). In that commit, it states: The mask checks in inotify_update_existing_watch() and inotify_new_watch() are useless because inotify_arg_to_mask() sets FS_IN_IGNORED and FS_EVENT_ON_CHILD bits anyway. But instead of removing the useless checks, it did this: mask = inotify_arg_to_mask(arg); - if (unlikely(!mask)) + if (unlikely(!(mask & IN_ALL_EVENTS))) return -EINVAL; The problem is that IN_ALL_EVENTS doesn't include IN_UNMOUNT, and other parts of the code keep IN_UNMOUNT separate from IN_ALL_EVENTS. So the check should be: if (unlikely(!(mask & (IN_ALL_EVENTS | IN_UNMOUNT)))) But inotify_arg_to_mask(arg) always sets the IN_UNMOUNT bit in the mask anyway, so the check is always going to pass and thus should simply be removed. Also note that inotify_arg_to_mask completely controls what mask bits get set from arg, there's no way for invalid bits to get enabled there. Lets fix it by simply removing the useless broken checks. Signed-off-by: Jim Somerville Signed-off-by: Paul Gortmaker Cc: Jerome Marchand Cc: John McCutchan Cc: Robert Love Cc: Eric Paris Cc: [2.6.37+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/inotify/inotify_user.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 228a2c2ad8d7..07f7a92fe88e 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -576,8 +576,6 @@ static int inotify_update_existing_watch(struct fsnotify_group *group, /* don't allow invalid bits: we don't want flags set */ mask = inotify_arg_to_mask(arg); - if (unlikely(!(mask & IN_ALL_EVENTS))) - return -EINVAL; fsn_mark = fsnotify_find_inode_mark(group, inode); if (!fsn_mark) @@ -629,8 +627,6 @@ static int inotify_new_watch(struct fsnotify_group *group, /* don't allow invalid bits: we don't want flags set */ mask = inotify_arg_to_mask(arg); - if (unlikely(!(mask & IN_ALL_EVENTS))) - return -EINVAL; tmp_i_mark = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL); if (unlikely(!tmp_i_mark)) -- cgit v1.2.3 From 7630b661da330b35dd57b6f5d6d62b386f2dd751 Mon Sep 17 00:00:00 2001 From: MITSUNARI Shigeo Date: Thu, 21 Feb 2013 16:42:01 -0800 Subject: fs/block_dev.c: page cache wrongly left invalidated after revalidate_disk() We found that bdev->bd_invalidated was left set once revalidate_disk() is called, which results in page cache flush every time that device is open. Specifically, we found this problem in MD block device. Once we resize a MD device, mdadm --monitor periodically flush all page cache for that device every 60 or 1000 seconds when it opens the device. This bug lies since at least 3.2.0 till the latest kernel(3.6.2). Patch is attached. The following steps will reproduce the problem. 1. prepair a block device (eg /dev/sdb). 2. create two partitions: sudo parted /dev/sdb mklabel gpt mkpart primary 0% 50% mkpart primary 50% 100% 3. create a md device. sudo mdadm -C /dev/md/hoge -l 1 -n 2 -e 1.2 --assume-clean --auto=md --symlink=no /dev/sdb1 /dev/sdb2 4. create file system and mount it sudo mkfs.ext3 /dev/md/hoge sudo mkdir /mnt/test sudo mount /dev/md/hoge /mnt/test 5. try to resize the device sudo mdadm -G /dev/md/hoge --size=max 6. create a file to fill file cache. sudo dd if=/dev/urandom of=/mnt/test/data bs=1M count=10 and verify the current status of file by free command. 7. mdadm monitor will open the md device every 1000 seconds and you will find all file cache on the device are cleared. The timing can be reduced by the following steps. a) kill mdadm and restart it with --delay option /sbin/mdadm --monitor --delay=30 --pid-file /var/run/mdadm/monitor.pid --daemonise --scan --syslog or open the md device directly. sudo dd if=/dev/md/hoge of=/dev/null bs=4096 count=1 Signed-off-by: MITSUNARI Shigeo Cc: Al Viro Cc: Jeff Moyer Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/block_dev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index 172f8491a2bd..78333a37f49d 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -994,6 +994,7 @@ int revalidate_disk(struct gendisk *disk) mutex_lock(&bdev->bd_mutex); check_disk_size_change(disk, bdev); + bdev->bd_invalidated = 0; mutex_unlock(&bdev->bd_mutex); bdput(bdev); return ret; -- cgit v1.2.3 From 49deb4bc227cb9db5b8ebf9434367f8bed057c7a Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 21 Feb 2013 16:42:43 -0800 Subject: configfs: move the dereference below the NULL test The dereference should be moved below the NULL test. spatch with a semantic match is used to found this. (http://coccinelle.lip6.fr/) Signed-off-by: Wei Yongjun Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/configfs/dir.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 712b10f64c70..e9dcfa3c208c 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1037,10 +1037,11 @@ static int configfs_dump(struct configfs_dirent *sd, int level) static int configfs_depend_prep(struct dentry *origin, struct config_item *target) { - struct configfs_dirent *child_sd, *sd = origin->d_fsdata; + struct configfs_dirent *child_sd, *sd; int ret = 0; - BUG_ON(!origin || !sd); + BUG_ON(!origin || !origin->d_fsdata); + sd = origin->d_fsdata; if (sd->s_element == target) /* Boo-yah */ goto out; -- cgit v1.2.3 From d787ab0977c58e2c421b8d0ab49e363893ddb814 Mon Sep 17 00:00:00 2001 From: Tim Gardner Date: Thu, 21 Feb 2013 16:42:44 -0800 Subject: ocfs2: remove kfree() redundant null checks smatch analysis indicates a number of redundant NULL checks before calling kfree(), eg: fs/ocfs2/alloc.c:6138 ocfs2_begin_truncate_log_recovery() info: redundant null check on *tl_copy calling kfree() fs/ocfs2/alloc.c:6755 ocfs2_zero_range_for_truncate() info: redundant null check on pages calling kfree() etc.... [akpm@linux-foundation.org: revert dubious change in ocfs2_begin_truncate_log_recovery()] Signed-off-by: Tim Gardner Cc: Mark Fasheh Acked-by: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 3 +-- fs/ocfs2/cluster/heartbeat.c | 6 ++---- fs/ocfs2/cluster/tcp.c | 6 ++---- fs/ocfs2/dlm/dlmdomain.c | 4 +--- fs/ocfs2/extent_map.c | 3 +-- fs/ocfs2/journal.c | 10 +++------- fs/ocfs2/localalloc.c | 8 +++----- fs/ocfs2/stack_o2cb.c | 2 +- fs/ocfs2/super.c | 6 ++---- fs/ocfs2/sysfile.c | 3 +-- 10 files changed, 17 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 31b9463fba1f..b8a9d87231b1 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6751,8 +6751,7 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, mlog_errno(ret); out: - if (pages) - kfree(pages); + kfree(pages); return ret; } diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index f7c648d7d6bf..42252bf64b51 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1471,8 +1471,7 @@ static void o2hb_region_release(struct config_item *item) mlog(ML_HEARTBEAT, "hb region release (%s)\n", reg->hr_dev_name); - if (reg->hr_tmp_block) - kfree(reg->hr_tmp_block); + kfree(reg->hr_tmp_block); if (reg->hr_slot_data) { for (i = 0; i < reg->hr_num_pages; i++) { @@ -1486,8 +1485,7 @@ static void o2hb_region_release(struct config_item *item) if (reg->hr_bdev) blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE); - if (reg->hr_slots) - kfree(reg->hr_slots); + kfree(reg->hr_slots); kfree(reg->hr_db_regnum); kfree(reg->hr_db_livenodes); diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 1bfe8802cc1e..f0edbd84f7bc 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1165,10 +1165,8 @@ out: o2net_debug_del_nst(&nst); /* must be before dropping sc and node */ if (sc) sc_put(sc); - if (vec) - kfree(vec); - if (msg) - kfree(msg); + kfree(vec); + kfree(msg); o2net_complete_nsw(nn, &nsw, 0, 0, 0); return ret; } diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 9e89d70df337..dbb17c07656a 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -319,9 +319,7 @@ static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm) if (dlm->master_hash) dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES); - if (dlm->name) - kfree(dlm->name); - + kfree(dlm->name); kfree(dlm); } diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index f487aa343442..1c39efb71bab 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -282,8 +282,7 @@ search: spin_unlock(&oi->ip_lock); out: - if (new_emi) - kfree(new_emi); + kfree(new_emi); } static int ocfs2_last_eb_is_empty(struct inode *inode, diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 2dd36af79e26..8eccfabcd12e 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1234,11 +1234,8 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, /* Though we wish to avoid it, we are in fact safe in * skipping local alloc cleanup as fsck.ocfs2 is more * than capable of reclaiming unused space. */ - if (la_dinode) - kfree(la_dinode); - - if (tl_dinode) - kfree(tl_dinode); + kfree(la_dinode); + kfree(tl_dinode); if (qrec) ocfs2_free_quota_recovery(qrec); @@ -1408,8 +1405,7 @@ bail: mutex_unlock(&osb->recovery_lock); - if (rm_quota) - kfree(rm_quota); + kfree(rm_quota); /* no one is callint kthread_stop() for us so the kthread() api * requires that we call do_exit(). And it isn't exported, but diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index a9f78c74d687..aebeacd807c3 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -476,8 +476,7 @@ out: if (local_alloc_inode) iput(local_alloc_inode); - if (alloc_copy) - kfree(alloc_copy); + kfree(alloc_copy); } /* @@ -534,7 +533,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, mlog_errno(status); bail: - if ((status < 0) && (*alloc_copy)) { + if (status < 0) { kfree(*alloc_copy); *alloc_copy = NULL; } @@ -1290,8 +1289,7 @@ bail: if (main_bm_inode) iput(main_bm_inode); - if (alloc_copy) - kfree(alloc_copy); + kfree(alloc_copy); if (ac) ocfs2_free_alloc_context(ac); diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index 94368017edb3..bf1f8930456f 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c @@ -376,7 +376,7 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn) dlm_register_eviction_cb(dlm, &priv->op_eviction_cb); out_free: - if (rc && conn->cc_private) + if (rc) kfree(conn->cc_private); out: diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 0e91ec22a940..9b6910dec4ba 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2525,8 +2525,7 @@ static int ocfs2_check_volume(struct ocfs2_super *osb) mlog_errno(status); finally: - if (local_alloc) - kfree(local_alloc); + kfree(local_alloc); if (status) mlog_errno(status); @@ -2553,8 +2552,7 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb) * we free it here. */ kfree(osb->journal); - if (osb->local_alloc_copy) - kfree(osb->local_alloc_copy); + kfree(osb->local_alloc_copy); kfree(osb->uuid_str); ocfs2_put_dlm_debug(osb->osb_dlm_debug); memset(osb, 0, sizeof(struct ocfs2_super)); diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c index 3d635f4bbb20..f053688d22a3 100644 --- a/fs/ocfs2/sysfile.c +++ b/fs/ocfs2/sysfile.c @@ -91,8 +91,7 @@ static struct inode **get_local_system_inode(struct ocfs2_super *osb, } else osb->local_system_inodes = local_system_inodes; spin_unlock(&osb->osb_lock); - if (unlikely(free)) - kfree(free); + kfree(free); } index = (slot * NUM_LOCAL_SYSTEM_INODES) + -- cgit v1.2.3 From 3278bb748d2437eb1464765f36429e5d6aa91c38 Mon Sep 17 00:00:00 2001 From: Junxiao Bi Date: Thu, 21 Feb 2013 16:42:45 -0800 Subject: ocfs2: unlock super lock if lockres refresh failed If lockres refresh failed, the super lock will never be released which will cause some processes on other cluster nodes hung forever. Signed-off-by: Junxiao Bi Cc: Joel Becker Cc: Mark Fasheh Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlmglue.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 4f7795fb5fc0..88577eb5d712 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2545,6 +2545,7 @@ int ocfs2_super_lock(struct ocfs2_super *osb, * everything is up to the caller :) */ status = ocfs2_should_refresh_lock_res(lockres); if (status < 0) { + ocfs2_cluster_unlock(osb, lockres, level); mlog_errno(status); goto bail; } @@ -2553,8 +2554,10 @@ int ocfs2_super_lock(struct ocfs2_super *osb, ocfs2_complete_lock_res_refresh(lockres, status); - if (status < 0) + if (status < 0) { + ocfs2_cluster_unlock(osb, lockres, level); mlog_errno(status); + } ocfs2_track_lock_refresh(lockres); } bail: -- cgit v1.2.3 From 1d1d1a767206fbe5d4c69493b7e6d2a8d08cc0a0 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 21 Feb 2013 16:42:51 -0800 Subject: mm: only enforce stable page writes if the backing device requires it Create a helper function to check if a backing device requires stable page writes and, if so, performs the necessary wait. Then, make it so that all points in the memory manager that handle making pages writable use the helper function. This should provide stable page write support to most filesystems, while eliminating unnecessary waiting for devices that don't require the feature. Before this patchset, all filesystems would block, regardless of whether or not it was necessary. ext3 would wait, but still generate occasional checksum errors. The network filesystems were left to do their own thing, so they'd wait too. After this patchset, all the disk filesystems except ext3 and btrfs will wait only if the hardware requires it. ext3 (if necessary) snapshots pages instead of blocking, and btrfs provides its own bdi so the mm will never wait. Network filesystems haven't been touched, so either they provide their own stable page guarantees or they don't block at all. The blocking behavior is back to what it was before 3.0 if you don't have a disk requiring stable page writes. Here's the result of using dbench to test latency on ext2: 3.8.0-rc3: Operation Count AvgLat MaxLat ---------------------------------------- WriteX 109347 0.028 59.817 ReadX 347180 0.004 3.391 Flush 15514 29.828 287.283 Throughput 57.429 MB/sec 4 clients 4 procs max_latency=287.290 ms 3.8.0-rc3 + patches: WriteX 105556 0.029 4.273 ReadX 335004 0.005 4.112 Flush 14982 30.540 298.634 Throughput 55.4496 MB/sec 4 clients 4 procs max_latency=298.650 ms As you can see, the maximum write latency drops considerably with this patch enabled. The other filesystems (ext3/ext4/xfs/btrfs) behave similarly, but see the cover letter for those results. Signed-off-by: Darrick J. Wong Acked-by: Steven Whitehouse Reviewed-by: Jan Kara Cc: Adrian Hunter Cc: Andy Lutomirski Cc: Artem Bityutskiy Cc: Joel Becker Cc: Mark Fasheh Cc: Jens Axboe Cc: Eric Van Hensbergen Cc: Ron Minnich Cc: Latchesar Ionkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 2 +- fs/ext4/inode.c | 2 +- fs/gfs2/file.c | 2 +- fs/nilfs2/file.c | 2 +- include/linux/pagemap.h | 1 + mm/filemap.c | 3 ++- mm/page-writeback.c | 20 ++++++++++++++++++++ 7 files changed, 27 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 7a75c3e0fd58..2ea9cd44aeae 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2359,7 +2359,7 @@ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, if (unlikely(ret < 0)) goto out_unlock; set_page_dirty(page); - wait_on_page_writeback(page); + wait_for_stable_page(page); return 0; out_unlock: unlock_page(page); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index cbfe13bf5b2a..cd818d8bb221 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4968,7 +4968,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) 0, len, NULL, ext4_bh_unmapped)) { /* Wait so that we don't change page under IO */ - wait_on_page_writeback(page); + wait_for_stable_page(page); ret = VM_FAULT_LOCKED; goto out; } diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 06b7092a3f25..2687f50d98cb 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -483,7 +483,7 @@ out: gfs2_holder_uninit(&gh); if (ret == 0) { set_page_dirty(page); - wait_on_page_writeback(page); + wait_for_stable_page(page); } sb_end_pagefault(inode->i_sb); return block_page_mkwrite_return(ret); diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index 61946883025c..bec4af6eab13 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -126,7 +126,7 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) nilfs_transaction_commit(inode->i_sb); mapped: - wait_on_page_writeback(page); + wait_for_stable_page(page); out: sb_end_pagefault(inode->i_sb); return block_page_mkwrite_return(ret); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 6da609d14c15..0e38e13eb249 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -414,6 +414,7 @@ static inline void wait_on_page_writeback(struct page *page) } extern void end_page_writeback(struct page *page); +void wait_for_stable_page(struct page *page); /* * Add an arbitrary waiter to a page's wait queue diff --git a/mm/filemap.c b/mm/filemap.c index 24a7ea583f0c..c610076c30e1 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1728,6 +1728,7 @@ int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) * see the dirty page and writeprotect it again. */ set_page_dirty(page); + wait_for_stable_page(page); out: sb_end_pagefault(inode->i_sb); return ret; @@ -2274,7 +2275,7 @@ repeat: return NULL; } found: - wait_on_page_writeback(page); + wait_for_stable_page(page); return page; } EXPORT_SYMBOL(grab_cache_page_write_begin); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 66a0024becd9..355d5ee69058 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2290,3 +2290,23 @@ int mapping_tagged(struct address_space *mapping, int tag) return radix_tree_tagged(&mapping->page_tree, tag); } EXPORT_SYMBOL(mapping_tagged); + +/** + * wait_for_stable_page() - wait for writeback to finish, if necessary. + * @page: The page to wait on. + * + * This function determines if the given page is related to a backing device + * that requires page contents to be held stable during writeback. If so, then + * it will wait for any pending writeback to complete. + */ +void wait_for_stable_page(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + struct backing_dev_info *bdi = mapping->backing_dev_info; + + if (!bdi_cap_stable_pages_required(bdi)) + return; + + wait_on_page_writeback(page); +} +EXPORT_SYMBOL_GPL(wait_for_stable_page); -- cgit v1.2.3 From 13575ca14fcdacd1ad914d00bc63eb4d96280986 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 21 Feb 2013 16:42:53 -0800 Subject: 9pfs: fix filesystem to wait for stable page writeback Fix up the ->page_mkwrite handler to provide stable page writes if necessary. Signed-off-by: Darrick J. Wong Cc: Adrian Hunter Cc: Andy Lutomirski Cc: Artem Bityutskiy Cc: Jan Kara Cc: Joel Becker Cc: Mark Fasheh Cc: Steven Whitehouse Cc: Jens Axboe Cc: Eric Van Hensbergen Cc: Ron Minnich Cc: Latchesar Ionkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/vfs_file.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 7a3399767570..c921ac92ea4c 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -616,6 +616,7 @@ v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) lock_page(page); if (page->mapping != inode->i_mapping) goto out_unlock; + wait_for_stable_page(page); return VM_FAULT_LOCKED; out_unlock: -- cgit v1.2.3 From ffecfd1a72fccfcee3dabb99b9ecba9735318f90 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 21 Feb 2013 16:42:55 -0800 Subject: block: optionally snapshot page contents to provide stable pages during write This provides a band-aid to provide stable page writes on jbd without needing to backport the fixed locking and page writeback bit handling schemes of jbd2. The band-aid works by using bounce buffers to snapshot page contents instead of waiting. For those wondering about the ext3 bandage -- fixing the jbd locking (which was done as part of ext4dev years ago) is a lot of surgery, and setting PG_writeback on data pages when we actually hold the page lock dropped ext3 performance by nearly an order of magnitude. If we're going to migrate iscsi and raid to use stable page writes, the complaints about high latency will likely return. We might as well centralize their page snapshotting thing to one place. Signed-off-by: Darrick J. Wong Tested-by: Andy Lutomirski Cc: Adrian Hunter Cc: Artem Bityutskiy Reviewed-by: Jan Kara Cc: Joel Becker Cc: Mark Fasheh Cc: Steven Whitehouse Cc: Jens Axboe Cc: Eric Van Hensbergen Cc: Ron Minnich Cc: Latchesar Ionkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/tile/Kconfig | 6 ------ block/blk-core.c | 8 +++++--- fs/ext3/super.c | 1 + include/uapi/linux/fs.h | 3 +++ mm/Kconfig | 13 +++++++++++++ mm/bounce.c | 48 ++++++++++++++++++++++++++++++++++++++++++++---- mm/page-writeback.c | 4 ++++ 7 files changed, 70 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index 1bb7ad4aeff4..b1e68f52029c 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -412,12 +412,6 @@ config TILE_USB Provides USB host adapter support for the built-in EHCI and OHCI interfaces on TILE-Gx chips. -# USB OHCI needs the bounce pool since tilegx will often have more -# than 4GB of memory, but we don't currently use the IOTLB to present -# a 32-bit address to OHCI. So we need to use a bounce pool instead. -config NEED_BOUNCE_POOL - def_bool USB_OHCI_HCD - source "drivers/pci/hotplug/Kconfig" endmenu diff --git a/block/blk-core.c b/block/blk-core.c index c973249d68cd..277134cb5d32 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1474,6 +1474,11 @@ void blk_queue_bio(struct request_queue *q, struct bio *bio) */ blk_queue_bounce(q, &bio); + if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { + bio_endio(bio, -EIO); + return; + } + if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) { spin_lock_irq(q->queue_lock); where = ELEVATOR_INSERT_FLUSH; @@ -1714,9 +1719,6 @@ generic_make_request_checks(struct bio *bio) */ blk_partition_remap(bio); - if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) - goto end_io; - if (bio_check_eod(bio, nr_sectors)) goto end_io; diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 6e50223b3299..4ba2683c1d44 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -2065,6 +2065,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal": test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered": "writeback"); + sb->s_flags |= MS_SNAP_STABLE; return 0; diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 780d4c6093eb..c7fc1e6517c3 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -86,6 +86,9 @@ struct inodes_stat_t { #define MS_KERNMOUNT (1<<22) /* this is a kern_mount call */ #define MS_I_VERSION (1<<23) /* Update inode I_version field */ #define MS_STRICTATIME (1<<24) /* Always perform atime updates */ + +/* These sb flags are internal to the kernel */ +#define MS_SNAP_STABLE (1<<27) /* Snapshot pages during writeback, if needed */ #define MS_NOSEC (1<<28) #define MS_BORN (1<<29) #define MS_ACTIVE (1<<30) diff --git a/mm/Kconfig b/mm/Kconfig index 278e3ab1f169..7901d839aab2 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -258,6 +258,19 @@ config BOUNCE def_bool y depends on BLOCK && MMU && (ZONE_DMA || HIGHMEM) +# On the 'tile' arch, USB OHCI needs the bounce pool since tilegx will often +# have more than 4GB of memory, but we don't currently use the IOTLB to present +# a 32-bit address to OHCI. So we need to use a bounce pool instead. +# +# We also use the bounce pool to provide stable page writes for jbd. jbd +# initiates buffer writeback without locking the page or setting PG_writeback, +# and fixing that behavior (a second time; jbd2 doesn't have this problem) is +# a major rework effort. Instead, use the bounce buffer to snapshot pages +# (until jbd goes away). The only jbd user is ext3. +config NEED_BOUNCE_POOL + bool + default y if (TILE && USB_OHCI_HCD) || (BLK_DEV_INTEGRITY && JBD) + config NR_QUICK int depends on QUICKLIST diff --git a/mm/bounce.c b/mm/bounce.c index 042086775561..5f8901768602 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -178,8 +178,45 @@ static void bounce_end_io_read_isa(struct bio *bio, int err) __bounce_end_io_read(bio, isa_page_pool, err); } +#ifdef CONFIG_NEED_BOUNCE_POOL +static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio) +{ + struct page *page; + struct backing_dev_info *bdi; + struct address_space *mapping; + struct bio_vec *from; + int i; + + if (bio_data_dir(bio) != WRITE) + return 0; + + if (!bdi_cap_stable_pages_required(&q->backing_dev_info)) + return 0; + + /* + * Based on the first page that has a valid mapping, decide whether or + * not we have to employ bounce buffering to guarantee stable pages. + */ + bio_for_each_segment(from, bio, i) { + page = from->bv_page; + mapping = page_mapping(page); + if (!mapping) + continue; + bdi = mapping->backing_dev_info; + return mapping->host->i_sb->s_flags & MS_SNAP_STABLE; + } + + return 0; +} +#else +static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio) +{ + return 0; +} +#endif /* CONFIG_NEED_BOUNCE_POOL */ + static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, - mempool_t *pool) + mempool_t *pool, int force) { struct page *page; struct bio *bio = NULL; @@ -192,7 +229,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, /* * is destination page below bounce pfn? */ - if (page_to_pfn(page) <= queue_bounce_pfn(q)) + if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force) continue; /* @@ -270,6 +307,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) { + int must_bounce; mempool_t *pool; /* @@ -278,13 +316,15 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) if (!bio_has_data(*bio_orig)) return; + must_bounce = must_snapshot_stable_pages(q, *bio_orig); + /* * for non-isa bounce case, just check if the bounce pfn is equal * to or bigger than the highest pfn in the system -- in that case, * don't waste time iterating over bio segments */ if (!(q->bounce_gfp & GFP_DMA)) { - if (queue_bounce_pfn(q) >= blk_max_pfn) + if (queue_bounce_pfn(q) >= blk_max_pfn && !must_bounce) return; pool = page_pool; } else { @@ -295,7 +335,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) /* * slow path */ - __blk_queue_bounce(q, bio_orig, pool); + __blk_queue_bounce(q, bio_orig, pool, must_bounce); } EXPORT_SYMBOL(blk_queue_bounce); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 355d5ee69058..7300c9d5e1d9 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2306,6 +2306,10 @@ void wait_for_stable_page(struct page *page) if (!bdi_cap_stable_pages_required(bdi)) return; +#ifdef CONFIG_NEED_BOUNCE_POOL + if (mapping->host->i_sb->s_flags & MS_SNAP_STABLE) + return; +#endif /* CONFIG_NEED_BOUNCE_POOL */ wait_on_page_writeback(page); } -- cgit v1.2.3 From 1269529bda27823ed42798762e896c059ea5e486 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 21 Feb 2013 16:42:57 -0800 Subject: ocfs2: wait for page writeback to provide stable pages When stable pages are required, we have to wait if the page is just going to disk and we want to modify it. Add proper callback to ocfs2_grab_pages_for_write(). Signed-off-by: Jan Kara Signed-off-by: Darrick J. Wong Acked-by: Joel Becker Cc: Mark Fasheh Cc: Adrian Hunter Cc: Andy Lutomirski Cc: Artem Bityutskiy Cc: Steven Whitehouse Cc: Jens Axboe Cc: Eric Van Hensbergen Cc: Ron Minnich Cc: Latchesar Ionkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/aops.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 657743254eb9..9796330d8f04 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1194,6 +1194,7 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping, goto out; } } + wait_for_stable_page(wc->w_pages[i]); if (index == target_index) wc->w_target_page = wc->w_pages[i]; -- cgit v1.2.3 From 182dcfd648aef0705fd04c42cbe507c09e00c25d Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 21 Feb 2013 16:42:59 -0800 Subject: ubifs: wait for page writeback to provide stable pages When stable pages are required, we have to wait if the page is just going to disk and we want to modify it. Add proper callback to ubifs_vm_page_mkwrite(). Signed-off-by: Jan Kara Signed-off-by: Darrick J. Wong Cc: Artem Bityutskiy Cc: Adrian Hunter Cc: Andy Lutomirski Cc: Joel Becker Cc: Mark Fasheh Cc: Steven Whitehouse Cc: Jens Axboe Cc: Eric Van Hensbergen Cc: Ron Minnich Cc: Latchesar Ionkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ubifs/file.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 5bc77817f382..4f6493c130e0 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1522,6 +1522,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, ubifs_release_dirty_inode_budget(c, ui); } + wait_for_stable_page(page); unlock_page(page); return 0; -- cgit v1.2.3 From d3330cf08ccf9aef28a3e5740fd5cc7ac536db84 Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Thu, 21 Feb 2013 16:44:20 -0800 Subject: binfmt_elf: remove unused argument in fill_elf_header In fill_elf_header(), elf->e_ident[EI_OSABI] is always set to ELF_OSABI, so remove the unused argument 'osabi'. Signed-off-by: Zhang Yanfei Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 49d0b43458b7..ff9dbc630efa 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1249,7 +1249,7 @@ static int writenote(struct memelfnote *men, struct file *file, #undef DUMP_WRITE static void fill_elf_header(struct elfhdr *elf, int segs, - u16 machine, u32 flags, u8 osabi) + u16 machine, u32 flags) { memset(elf, 0, sizeof(*elf)); @@ -1634,7 +1634,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, * Initialize the ELF file header. */ fill_elf_header(elf, phdrs, - view->e_machine, view->e_flags, view->ei_osabi); + view->e_machine, view->e_flags); /* * Allocate a structure for each thread. @@ -1874,7 +1874,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, elf_core_copy_regs(&info->prstatus->pr_reg, regs); /* Set up header */ - fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS, ELF_OSABI); + fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS); /* * Set up the notes in similar form to SVR4 core dumps made -- cgit v1.2.3