From bd5fe6c5eb9c548d7f07fe8f89a150bb6705e8e3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Jun 2011 14:29:43 -0400 Subject: fs: kill i_alloc_sem i_alloc_sem is a rather special rw_semaphore. It's the last one that may be released by a non-owner, and its write side is always mirrored by real exclusion. Its intended use is to wait for all pending direct I/O requests to finish before starting a truncate. Replace it with a hand-grown construct: - exclusion for truncates is already guaranteed by i_mutex, so it can simply fall away - the reader side is replaced by an i_dio_count member in struct inode that counts the number of pending direct I/O requests. Truncate can't proceed as long as it's non-zero - when i_dio_count reaches zero we wake up a pending truncate using wake_up_bit on a new bit in i_flags - new references to i_dio_count can't appear while we are waiting for it to read zero because the direct I/O count always needs i_mutex (or an equivalent like XFS's i_iolock) for starting a new operation. This scheme is much simpler, and saves the space of a spinlock_t and a struct list_head in struct inode (typically 160 bits on a non-debug 64-bit system). Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/ocfs2/aops.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs/ocfs2/aops.c') diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index ac97bca282d2..de1d3953599d 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -551,9 +551,8 @@ bail: /* * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're - particularly interested in the aio/dio case. Like the core uses - i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from - truncation on another. + particularly interested in the aio/dio case. We use the rw_lock DLM lock + to protect io on one node from truncation on another. */ static void ocfs2_dio_end_io(struct kiocb *iocb, loff_t offset, @@ -569,7 +568,7 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); if (ocfs2_iocb_is_sem_locked(iocb)) { - up_read(&inode->i_alloc_sem); + inode_dio_done(inode); ocfs2_iocb_clear_sem_locked(iocb); } -- cgit v1.2.3 From df2d6f26586f12a24f3ae5df4e236dc5c08d6eb4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Jun 2011 14:29:46 -0400 Subject: fs: always maintain i_dio_count Maintain i_dio_count for all filesystems, not just those using DIO_LOCKING. This allows these filesystems to also protect truncate against direct I/O requests by using common code. Right now the only non-DIO_LOCKING filesystem that appears to do so is XFS, which uses an opencoded variant of the i_dio_count scheme. Behaviour doesn't change for filesystems never calling inode_dio_wait. For ext4 behaviour changes when using the dioread_nolock option, which previously was missing any protection between truncate and direct I/O reads. For ocfs2 the handcrafted i_dio_count manipulations are replaced with the common code now enabled.
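For illustration, the i_dio_count scheme described above boils down to an atomic counter paired with a bit-waitqueue. The sketch below is simplified and is not the verbatim fs/inode.c code added by this series (the helpers there are inode_dio_done() and inode_dio_wait()); in particular, the wait-bit plumbing shown and the assumption that the wake-up bit (called __I_DIO_WAKEUP here) lives in inode->i_state are mine:

/* Completion side: drop one direct I/O reference, wake a waiting truncate. */
static void dio_ref_drop_sketch(struct inode *inode)
{
	if (atomic_dec_and_test(&inode->i_dio_count))
		wake_up_bit(&inode->i_state, __I_DIO_WAKEUP);
}

/*
 * Truncate side: called with i_mutex (or the filesystem's equivalent)
 * held, so no new direct I/O request can bump the counter while we
 * sleep; we only wait for requests that are already in flight.
 */
static void dio_ref_wait_sketch(struct inode *inode)
{
	wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP);
	DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP);

	while (atomic_read(&inode->i_dio_count)) {
		prepare_to_wait(wq, &q.wait, TASK_UNINTERRUPTIBLE);
		/* re-check after queueing ourselves to avoid a lost wake-up */
		if (atomic_read(&inode->i_dio_count))
			schedule();
		finish_wait(wq, &q.wait);
	}
}

Compared to i_alloc_sem this needs no wait list or spinlock in struct inode, which is where the space saving quoted above comes from.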
Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/direct-io.c | 25 +++++++++++++------------ fs/ocfs2/aops.c | 4 +--- fs/ocfs2/file.c | 12 +++--------- 3 files changed, 17 insertions(+), 24 deletions(-) (limited to 'fs/ocfs2/aops.c') diff --git a/fs/direct-io.c b/fs/direct-io.c index 354cbdbc14bd..0a073c7125a6 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -297,8 +297,7 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is aio_complete(dio->iocb, ret, 0); } - if (dio->flags & DIO_LOCKING) - inode_dio_done(dio->inode); + inode_dio_done(dio->inode); return ret; } @@ -1185,14 +1184,16 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, * For writes this function is called under i_mutex and returns with * i_mutex held, for reads, i_mutex is not held on entry, but it is * taken and dropped again before returning. - * The i_dio_count counter keeps track of the number of outstanding - * direct I/O requests, and truncate waits for it to reach zero. - * New references to i_dio_count must only be grabbed with i_mutex - * held. - * * - if the flags value does NOT contain DIO_LOCKING we don't use any * internal locking but rather rely on the filesystem to synchronize * direct I/O reads/writes versus each other and truncate. + * + * To help with locking against truncate we incremented the i_dio_count + * counter before starting direct I/O, and decrement it once we are done. + * Truncate can wait for it to reach zero to provide exclusion. It is + * expected that filesystem provide exclusion between new direct I/O + * and truncates. For DIO_LOCKING filesystems this is done by i_mutex, + * but other filesystems need to take care of this on their own. */ ssize_t __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, @@ -1270,13 +1271,13 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, goto out; } } - - /* - * Will be decremented at I/O completion time. - */ - atomic_inc(&inode->i_dio_count); } + /* + * Will be decremented at I/O completion time. + */ + atomic_inc(&inode->i_dio_count); + /* * For file extending writes updating i_size before data * writeouts complete can expose uninitialized blocks. 
So diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index de1d3953599d..524d6167fb63 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -567,10 +567,8 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, /* this io's submitter should not have unlocked this before we could */ BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); - if (ocfs2_iocb_is_sem_locked(iocb)) { - inode_dio_done(inode); + if (ocfs2_iocb_is_sem_locked(iocb)) ocfs2_iocb_clear_sem_locked(iocb); - } ocfs2_iocb_clear_rw_locked(iocb); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 736283ca4a4c..22d604601957 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2240,7 +2240,6 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, relock: /* to match setattr's i_mutex -> rw_lock ordering */ if (direct_io) { - atomic_inc(&inode->i_dio_count); have_alloc_sem = 1; /* communicate with ocfs2_dio_end_io */ ocfs2_iocb_set_sem_locked(iocb); @@ -2292,7 +2291,6 @@ relock: */ if (direct_io && !can_do_direct) { ocfs2_rw_unlock(inode, rw_level); - inode_dio_done(inode); have_alloc_sem = 0; rw_level = -1; @@ -2379,10 +2377,8 @@ out: ocfs2_rw_unlock(inode, rw_level); out_sems: - if (have_alloc_sem) { - inode_dio_done(inode); + if (have_alloc_sem) ocfs2_iocb_clear_sem_locked(iocb); - } mutex_unlock(&inode->i_mutex); @@ -2533,7 +2529,6 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, */ if (filp->f_flags & O_DIRECT) { have_alloc_sem = 1; - atomic_inc(&inode->i_dio_count); ocfs2_iocb_set_sem_locked(iocb); ret = ocfs2_rw_lock(inode, 0); @@ -2575,10 +2570,9 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, } bail: - if (have_alloc_sem) { - inode_dio_done(inode); + if (have_alloc_sem) ocfs2_iocb_clear_sem_locked(iocb); - } + if (rw_level != -1) ocfs2_rw_unlock(inode, rw_level); -- cgit v1.2.3 From 72c5052ddc3956d847f21c2b8d55c93664a51b2c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Jun 2011 14:29:48 -0400 Subject: fs: move inode_dio_done to the end_io handler For filesystems that delay their end_io processing we should keep our i_dio_count until the processing is done. Enable this by moving the inode_dio_done call to the end_io handler if one exists. Note that the actual move to the workqueue for ext4 and XFS is not done in this patch yet, but left to the filesystem maintainers. At least for XFS it's not needed yet either as XFS has an internal equivalent to i_dio_count.
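As a rough idea of what the deferred completion would eventually look like, consider the hedged sketch below. All names (my_wq, my_ioend, my_end_io) are made up for illustration and do not correspond to ext4, XFS or ocfs2 code; the point is only that inode_dio_done() runs from the deferred work item, so a pending truncate keeps waiting until the real completion work (e.g. unwritten extent conversion) has finished:

/* Workqueue assumed to have been allocated elsewhere (e.g. at mount time). */
static struct workqueue_struct *my_wq;

struct my_ioend {
	struct inode		*inode;	/* recorded when the I/O was submitted */
	struct work_struct	work;
};

static void my_dio_complete_work(struct work_struct *work)
{
	struct my_ioend *ioend = container_of(work, struct my_ioend, work);

	/* ... convert unwritten extents, update i_size, etc. ... */

	/* Only now may a truncate waiting in inode_dio_wait() proceed. */
	inode_dio_done(ioend->inode);
	kfree(ioend);
}

/* ->end_io callback handed to __blockdev_direct_IO() at submission time. */
static void my_end_io(struct kiocb *iocb, loff_t offset, ssize_t size,
		      void *private, int ret, bool is_async)
{
	struct my_ioend *ioend = private;	/* set up by the submission path */

	if (is_async)
		aio_complete(iocb, ret, 0);

	INIT_WORK(&ioend->work, my_dio_complete_work);
	queue_work(my_wq, &ioend->work);	/* i_dio_count still elevated */
}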
Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/direct-io.c | 7 ++++--- fs/ext4/inode.c | 5 +++++ fs/ocfs2/aops.c | 1 + fs/xfs/linux-2.6/xfs_aops.c | 3 +++ 4 files changed, 13 insertions(+), 3 deletions(-) (limited to 'fs/ocfs2/aops.c') diff --git a/fs/direct-io.c b/fs/direct-io.c index 0a073c7125a6..01d2d9ef609c 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -293,11 +293,12 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is if (dio->end_io && dio->result) { dio->end_io(dio->iocb, offset, transferred, dio->map_bh.b_private, ret, is_async); - } else if (is_async) { - aio_complete(dio->iocb, ret, 0); + } else { + if (is_async) + aio_complete(dio->iocb, ret, 0); + inode_dio_done(dio->inode); } - inode_dio_done(dio->inode); return ret; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1f35573a34e1..678cde834f19 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3573,6 +3573,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, ssize_t size, void *private, int ret, bool is_async) { + struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; ext4_io_end_t *io_end = iocb->private; struct workqueue_struct *wq; unsigned long flags; @@ -3594,6 +3595,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, out: if (is_async) aio_complete(iocb, ret, 0); + inode_dio_done(inode); return; } @@ -3614,6 +3616,9 @@ out: /* queue the work to convert unwritten extents to written */ queue_work(wq, &io_end->work); iocb->private = NULL; + + /* XXX: probably should move into the real I/O completion handler */ + inode_dio_done(inode); } static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 524d6167fb63..c1efe939c774 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -577,6 +577,7 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, if (is_async) aio_complete(iocb, ret, 0); + inode_dio_done(inode); } /* diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 79ce38be15a1..b3b418f519f3 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -1339,6 +1339,9 @@ xfs_end_io_direct_write( } else { xfs_finish_ioend_sync(ioend); } + + /* XXX: probably should move into the real I/O completion handler */ + inode_dio_done(ioend->io_inode); } STATIC ssize_t -- cgit v1.2.3 From 5cffff9e29866a3de98c2c25135b3199491f93b0 Mon Sep 17 00:00:00 2001 From: Wengang Wang Date: Sun, 24 Jul 2011 10:36:54 -0700 Subject: ocfs2: Fix ocfs2_page_mkwrite() This patch addresses two shortcomings in ocfs2_page_mkwrite(): 1. Makes the function return better VM_FAULT_* errors. 2. It handles an error that is triggered when a page is dropped from the mapping due to memory pressure. This patch locks the page to prevent that. [Patch was cleaned up by Sunil Mushran.] Signed-off-by: Wengang Wang Signed-off-by: Sunil Mushran --- fs/ocfs2/aops.c | 51 +++++++++++++++++++++++++++++++++++++++++++-------- fs/ocfs2/mmap.c | 53 ++++++++++++++++++++++++----------------------------- 2 files changed, 67 insertions(+), 37 deletions(-) (limited to 'fs/ocfs2/aops.c') diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index c1efe939c774..ff98c169b631 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -862,6 +862,12 @@ struct ocfs2_write_ctxt { struct page *w_pages[OCFS2_MAX_CTXT_PAGES]; struct page *w_target_page; + /* + * w_target_locked is used for page_mkwrite path indicating no unlocking against w_target_page in ocfs2_write_end_nolock.
+ */ + unsigned int w_target_locked:1; + /* * ocfs2_write_end() uses this to know what the real range to * write in the target should be. @@ -895,6 +901,24 @@ void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages) static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) { + int i; + + /* + * w_target_locked is only set to true in the page_mkwrite() case. + * The intent is to allow us to lock the target page from write_begin() + * to write_end(). The caller must hold a ref on w_target_page. + */ + if (wc->w_target_locked) { + BUG_ON(!wc->w_target_page); + for (i = 0; i < wc->w_num_pages; i++) { + if (wc->w_target_page == wc->w_pages[i]) { + wc->w_pages[i] = NULL; + break; + } + } + mark_page_accessed(wc->w_target_page); + page_cache_release(wc->w_target_page); + } ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages); brelse(wc->w_di_bh); @@ -1132,20 +1156,17 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping, */ lock_page(mmap_page); + /* Exit and let the caller retry */ if (mmap_page->mapping != mapping) { + WARN_ON(mmap_page->mapping); unlock_page(mmap_page); - /* - * Sanity check - the locking in - * ocfs2_pagemkwrite() should ensure - * that this code doesn't trigger. - */ - ret = -EINVAL; - mlog_errno(ret); + ret = -EAGAIN; goto out; } page_cache_get(mmap_page); wc->w_pages[i] = mmap_page; + wc->w_target_locked = true; } else { wc->w_pages[i] = find_or_create_page(mapping, index, GFP_NOFS); @@ -1160,6 +1181,8 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping, wc->w_target_page = wc->w_pages[i]; } out: + if (ret) + wc->w_target_locked = false; return ret; } @@ -1817,11 +1840,23 @@ try_again: */ ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len, cluster_of_pages, mmap_page); - if (ret) { + if (ret && ret != -EAGAIN) { mlog_errno(ret); goto out_quota; } + /* + * ocfs2_grab_pages_for_write() returns -EAGAIN if it could not lock + * the target page. In this case, we exit with no error and no target + * page. This will trigger the caller, page_mkwrite(), to re-try + * the operation. + */ + if (ret == -EAGAIN) { + BUG_ON(wc->w_target_page); + ret = 0; + goto out_quota; + } + ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos, len); if (ret) { diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 3e9393ca39eb..9cd41083e991 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -61,7 +61,7 @@ static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf) static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, struct page *page) { - int ret; + int ret = VM_FAULT_NOPAGE; struct inode *inode = file->f_path.dentry->d_inode; struct address_space *mapping = inode->i_mapping; loff_t pos = page_offset(page); @@ -71,32 +71,25 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, void *fsdata; loff_t size = i_size_read(inode); - /* - * Another node might have truncated while we were waiting on - * cluster locks. - * We don't check size == 0 before the shift. This is borrowed - * from do_generic_file_read. - */ last_index = (size - 1) >> PAGE_CACHE_SHIFT; - if (unlikely(!size || page->index > last_index)) { - ret = -EINVAL; - goto out; - } /* - * The i_size check above doesn't catch the case where nodes - * truncated and then re-extended the file. We'll re-check the - * page mapping after taking the page lock inside of - * ocfs2_write_begin_nolock(). + * There are cases that lead to the page no longer bebongs to the + * mapping. 
+ * 1) pagecache truncates locally due to memory pressure. + * 2) pagecache truncates when another is taking EX lock against + * inode lock. see ocfs2_data_convert_worker. + * + * The i_size check doesn't catch the case where nodes truncated and + * then re-extended the file. We'll re-check the page mapping after + * taking the page lock inside of ocfs2_write_begin_nolock(). + * + * Let VM retry with these cases. */ - if (!PageUptodate(page) || page->mapping != inode->i_mapping) { - /* - * the page has been umapped in ocfs2_data_downconvert_worker. - * So return 0 here and let VFS retry. - */ - ret = 0; + if ((page->mapping != inode->i_mapping) || + (!PageUptodate(page)) || + (page_offset(page) >= size)) goto out; - } /* * Call ocfs2_write_begin() and ocfs2_write_end() to take @@ -116,17 +109,21 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, if (ret) { if (ret != -ENOSPC) mlog_errno(ret); + if (ret == -ENOMEM) + ret = VM_FAULT_OOM; + else + ret = VM_FAULT_SIGBUS; goto out; } - ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page, - fsdata); - if (ret < 0) { - mlog_errno(ret); + if (!locked_page) { + ret = VM_FAULT_NOPAGE; goto out; } + ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page, + fsdata); BUG_ON(ret != len); - ret = 0; + ret = VM_FAULT_LOCKED; out: return ret; } @@ -168,8 +165,6 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) out: ocfs2_unblock_signals(&oldset); - if (ret) - ret = VM_FAULT_SIGBUS; return ret; } -- cgit v1.2.3
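For reference, the ->page_mkwrite return-value convention the fix above adopts can be sketched generically as follows. This is not ocfs2 code and the prepare_write_sketch() helper is hypothetical; the pattern is simply: retry the fault with VM_FAULT_NOPAGE when the page was truncated or invalidated under us, report VM_FAULT_OOM or VM_FAULT_SIGBUS on errors, and return VM_FAULT_LOCKED on success with the page still locked:

static int page_mkwrite_sketch(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct page *page = vmf->page;
	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
	loff_t size;
	int err;

	lock_page(page);
	size = i_size_read(inode);

	/* Truncated or dropped from the mapping while we took locks: retry. */
	if (page->mapping != inode->i_mapping ||
	    page_offset(page) >= size) {
		unlock_page(page);
		return VM_FAULT_NOPAGE;
	}

	err = prepare_write_sketch(page);	/* hypothetical helper */
	if (err) {
		unlock_page(page);
		return err == -ENOMEM ? VM_FAULT_OOM : VM_FAULT_SIGBUS;
	}

	/* Success: the VM expects the page to still be locked here. */
	return VM_FAULT_LOCKED;
}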