From ae54870a1dc978a88377ae8af0780648f2ccd4dc Mon Sep 17 00:00:00 2001 From: Jan Kara <jack@suse.cz> Date: Thu, 21 Apr 2011 23:47:16 +0200 Subject: ext3: Fix lock inversion in ext3_symlink() ext3_symlink() cannot call __page_symlink() with transaction open. __page_symlink() calls ext3_write_begin() which gets page lock which ranks above transaction start (thus lock ordering is violated) and and also ext3_write_begin() waits for a transaction commit when we run out of space which never happens if we hold transaction open. Fix the problem by stopping a transaction before calling __page_symlink() (we have to be careful and put inode to orphan list so that it gets deleted in case of crash) and starting another one after __page_symlink() returns for addition of symlink into a directory. Signed-off-by: Jan Kara <jack@suse.cz> --- fs/ext3/namei.c | 67 +++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 56 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 32f3b8695859..f6ce3e79d315 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -2189,6 +2189,7 @@ static int ext3_symlink (struct inode * dir, handle_t *handle; struct inode * inode; int l, err, retries = 0; + int credits; l = strlen(symname)+1; if (l > dir->i_sb->s_blocksize) @@ -2196,10 +2197,26 @@ static int ext3_symlink (struct inode * dir, dquot_initialize(dir); + if (l > EXT3_N_BLOCKS * 4) { + /* + * For non-fast symlinks, we just allocate inode and put it on + * orphan list in the first transaction => we need bitmap, + * group descriptor, sb, inode block, quota blocks. + */ + credits = 4 + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); + } else { + /* + * Fast symlink. We have to add entry to directory + * (EXT3_DATA_TRANS_BLOCKS + EXT3_INDEX_EXTRA_TRANS_BLOCKS), + * allocate new inode (bitmap, group descriptor, inode block, + * quota blocks, sb is already counted in previous macros). + */ + credits = EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb); + } retry: - handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + - EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 + - EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); + handle = ext3_journal_start(dir, credits); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -2211,21 +2228,45 @@ retry: if (IS_ERR(inode)) goto out_stop; - if (l > sizeof (EXT3_I(inode)->i_data)) { + if (l > EXT3_N_BLOCKS * 4) { inode->i_op = &ext3_symlink_inode_operations; ext3_set_aops(inode); /* - * page_symlink() calls into ext3_prepare/commit_write. - * We have a transaction open. All is sweetness. It also sets - * i_size in generic_commit_write(). + * We cannot call page_symlink() with transaction started + * because it calls into ext3_write_begin() which acquires page + * lock which ranks below transaction start (and it can also + * wait for journal commit if we are running out of space). So + * we have to stop transaction now and restart it when symlink + * contents is written. + * + * To keep fs consistent in case of crash, we have to put inode + * to orphan list in the mean time. */ + drop_nlink(inode); + err = ext3_orphan_add(handle, inode); + ext3_journal_stop(handle); + if (err) + goto err_drop_inode; err = __page_symlink(inode, symname, l, 1); + if (err) + goto err_drop_inode; + /* + * Now inode is being linked into dir (EXT3_DATA_TRANS_BLOCKS + * + EXT3_INDEX_EXTRA_TRANS_BLOCKS), inode is also modified + */ + handle = ext3_journal_start(dir, + EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto err_drop_inode; + } + inc_nlink(inode); + err = ext3_orphan_del(handle, inode); if (err) { + ext3_journal_stop(handle); drop_nlink(inode); - unlock_new_inode(inode); - ext3_mark_inode_dirty(handle, inode); - iput (inode); - goto out_stop; + goto err_drop_inode; } } else { inode->i_op = &ext3_fast_symlink_inode_operations; @@ -2239,6 +2280,10 @@ out_stop: if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries)) goto retry; return err; +err_drop_inode: + unlock_new_inode(inode); + iput (inode); + return err; } static int ext3_link (struct dentry * old_dentry, -- cgit v1.2.3 From 86c4f6d85595cd7da635dc6985d27bfa43b1ae10 Mon Sep 17 00:00:00 2001 From: Jan Kara <jack@suse.cz> Date: Wed, 27 Apr 2011 18:20:44 +0200 Subject: ext3: Fix fs corruption when make_indexed_dir() fails When make_indexed_dir() fails (e.g. because of ENOSPC) after it has allocated block for index tree root, we did not properly mark all changed buffers dirty. This lead to only some of these buffers being written out and thus effectively corrupting the directory. Fix the issue by marking all changed data dirty even in the error failure case. CC: stable@kernel.org Signed-off-by: Jan Kara <jack@suse.cz> --- fs/ext3/namei.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index f6ce3e79d315..34b6d9bfc48a 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1416,10 +1416,19 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, frame->at = entries; frame->bh = bh; bh = bh2; + /* + * Mark buffers dirty here so that if do_split() fails we write a + * consistent set of buffers to disk. + */ + ext3_journal_dirty_metadata(handle, frame->bh); + ext3_journal_dirty_metadata(handle, bh); de = do_split(handle,dir, &bh, frame, &hinfo, &retval); - dx_release (frames); - if (!(de)) + if (!de) { + ext3_mark_inode_dirty(handle, dir); + dx_release(frames); return retval; + } + dx_release(frames); return add_dirent_to_buf(handle, dentry, inode, de, bh); } @@ -2282,7 +2291,7 @@ out_stop: return err; err_drop_inode: unlock_new_inode(inode); - iput (inode); + iput(inode); return err; } -- cgit v1.2.3 From d9b01934d56a96d9f4ae2d6204d4ea78a36f5f36 Mon Sep 17 00:00:00 2001 From: Ted Ts'o <tytso@mit.edu> Date: Sat, 30 Apr 2011 13:17:11 -0400 Subject: jbd: fix fsync() tid wraparound bug If an application program does not make any changes to the indirect blocks or extent tree, i_datasync_tid will not get updated. If there are enough commits (i.e., 2**31) such that tid_geq()'s calculations wrap, and there isn't a currently active transaction at the time of the fdatasync() call, this can end up triggering a BUG_ON in fs/jbd/commit.c: J_ASSERT(journal->j_running_transaction != NULL); It's pretty rare that this can happen, since it requires the use of fdatasync() plus *very* frequent and excessive use of fsync(). But with the right workload, it can. We fix this by replacing the use of tid_geq() with an equality test, since there's only one valid transaction id that is valid for us to start: namely, the currently running transaction (if it exists). CC: stable@kernel.org Reported-by: Martin_Zielinski@McAfee.com Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> Signed-off-by: Jan Kara <jack@suse.cz> --- fs/jbd/journal.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index b3713afaaa9e..e2d4285fbe90 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -437,9 +437,12 @@ int __log_space_left(journal_t *journal) int __log_start_commit(journal_t *journal, tid_t target) { /* - * Are we already doing a recent enough commit? + * The only transaction we can possibly wait upon is the + * currently running transaction (if it exists). Otherwise, + * the target tid must be an old one. */ - if (!tid_geq(journal->j_commit_request, target)) { + if (journal->j_running_transaction && + journal->j_running_transaction->t_tid == target) { /* * We want a new commit: OK, mark the request and wakeup the * commit thread. We do _not_ do the commit ourselves. @@ -451,7 +454,14 @@ int __log_start_commit(journal_t *journal, tid_t target) journal->j_commit_sequence); wake_up(&journal->j_wait_commit); return 1; - } + } else if (!tid_geq(journal->j_commit_request, target)) + /* This should never happen, but if it does, preserve + the evidence before kjournald goes into a loop and + increments j_commit_sequence beyond all recognition. */ + WARN_ONCE(1, "jbd: bad log_start_commit: %u %u %u %u\n", + journal->j_commit_request, journal->j_commit_sequence, + target, journal->j_running_transaction ? + journal->j_running_transaction->t_tid : 0); return 0; } -- cgit v1.2.3 From 4e299c1d9113b5c1c3845c0d4e78d09dd60a4fe8 Mon Sep 17 00:00:00 2001 From: Robin Dong <hao.bigrat@gmail.com> Date: Thu, 5 May 2011 10:44:04 +0800 Subject: ext2: fix error msg when mounting fs with too-large blocksize When ext2 mounts a filesystem, it attempts to set the block device blocksize with a call to sb_set_blocksize, which can fail for several reasons. The current failure message in ext2 prints: EXT2-fs (loop1): error: blocksize is too small which is not correct in all cases. This can be demonstrated by creating a filesystem with # mkfs.ext2 -b 8192 on a 4k page system, and attempting to mount it. Change the error message to a more generic: EXT2-fs (loop1): bad blocksize 8192 to match the error message in ext3. Signed-off-by: Robin Dong <sanbai@taobao.com> Reviewed-by: Coly Li <bosong.ly@taobao.com> Reviewed-by: Eric Sandeen <sandeen@redhat.com> Signed-off-by: Jan Kara <jack@suse.cz> --- fs/ext2/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 0a78dae7e2cb..1dd62ed35b85 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -898,7 +898,8 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) brelse(bh); if (!sb_set_blocksize(sb, blocksize)) { - ext2_msg(sb, KERN_ERR, "error: blocksize is too small"); + ext2_msg(sb, KERN_ERR, + "error: bad blocksize %d", blocksize); goto failed_sbi; } -- cgit v1.2.3 From 2842bb20eed2e25cde5114298edc62c8883a1d9a Mon Sep 17 00:00:00 2001 From: Jan Kara <jack@suse.cz> Date: Thu, 5 May 2011 13:59:35 +0200 Subject: jbd: Fix forever sleeping process in do_get_write_access() In do_get_write_access() we wait on BH_Unshadow bit for buffer to get from shadow state. The waking code in journal_commit_transaction() has a bug because it does not issue a memory barrier after the buffer is moved from the shadow state and before wake_up_bit() is called. Thus a waitqueue check can happen before the buffer is actually moved from the shadow state and waiting process may never be woken. Fix the problem by issuing proper barrier. CC: stable@kernel.org Reported-by: Tao Ma <boyu.mt@taobao.com> Signed-off-by: Jan Kara <jack@suse.cz> --- fs/jbd/commit.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 69b180459463..f486ff697781 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -722,8 +722,13 @@ wait_for_iobuf: required. */ JBUFFER_TRACE(jh, "file as BJ_Forget"); journal_file_buffer(jh, commit_transaction, BJ_Forget); - /* Wake up any transactions which were waiting for this - IO to complete */ + /* + * Wake up any transactions which were waiting for this + * IO to complete. The barrier must be here so that changes + * by journal_file_buffer() take effect before wake_up_bit() + * does the waitqueue check. + */ + smp_mb(); wake_up_bit(&bh->b_state, BH_Unshadow); JBUFFER_TRACE(jh, "brelse shadowed buffer"); __brelse(bh); -- cgit v1.2.3 From 9199e66528f61a06abe09f0589bbe1eecaa301a7 Mon Sep 17 00:00:00 2001 From: Tao Ma <boyu.mt@taobao.com> Date: Thu, 5 May 2011 23:54:19 +0800 Subject: jbd/jbd2: remove obsolete summarise_journal_usage. summarise_journal_usage seems to be obsolete for a long time, so remove it. Cc: Jan Kara <jack@suse.cz> Signed-off-by: Tao Ma <boyu.mt@taobao.com> Signed-off-by: Jan Kara <jack@suse.cz> --- fs/jbd/commit.c | 6 ------ fs/jbd2/commit.c | 6 ------ 2 files changed, 12 deletions(-) (limited to 'fs') diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index f486ff697781..72ffa974b0b8 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -302,12 +302,6 @@ void journal_commit_transaction(journal_t *journal) * all outstanding updates to complete. */ -#ifdef COMMIT_STATS - spin_lock(&journal->j_list_lock); - summarise_journal_usage(journal); - spin_unlock(&journal->j_list_lock); -#endif - /* Do we need to erase the effects of a prior journal_flush? */ if (journal->j_flags & JFS_FLUSHED) { jbd_debug(3, "super block updated\n"); diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 6e28000a4b21..29148a81c783 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -338,12 +338,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) * all outstanding updates to complete. */ -#ifdef COMMIT_STATS - spin_lock(&journal->j_list_lock); - summarise_journal_usage(journal); - spin_unlock(&journal->j_list_lock); -#endif - /* Do we need to erase the effects of a prior jbd2_journal_flush? */ if (journal->j_flags & JBD2_FLUSHED) { jbd_debug(3, "super block updated\n"); -- cgit v1.2.3 From c2b67735e5d3a419e90047b3f4179e54a39637bc Mon Sep 17 00:00:00 2001 From: Eryu Guan <guaneryu@gmail.com> Date: Tue, 24 May 2011 00:17:34 +0800 Subject: jbd: Fix comment to match the code in journal_start() journal_start returns an ERR_PTR() value rather than NULL on failure. Cc: Jan Kara <jack@suse.cz> Signed-off-by: Eryu Guan <guaneryu@gmail.com> Signed-off-by: Jan Kara <jack@suse.cz> --- fs/jbd/transaction.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 60d2319651b2..f7ee81a065da 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -266,7 +266,8 @@ static handle_t *new_handle(int nblocks) * This function is visible to journal users (like ext3fs), so is not * called with the journal already locked. * - * Return a pointer to a newly allocated handle, or NULL on failure + * Return a pointer to a newly allocated handle, or an ERR_PTR() value + * on failure. */ handle_t *journal_start(journal_t *journal, int nblocks) { -- cgit v1.2.3