From aaddea812cb0a2dc38b55ba557b68999bc2f6203 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Wed, 16 Jan 2013 20:21:26 -0500 Subject: ext4: add tracepoint in punching hole This patch adds a tracepoint in ext4_punch_hole. CC: Lukas Czerner Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- include/trace/events/ext4.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'include/trace') diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 7e8c36bc7082..6080ea1380b8 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -1324,6 +1324,31 @@ TRACE_EVENT(ext4_fallocate_exit, __entry->ret) ); +TRACE_EVENT(ext4_punch_hole, + TP_PROTO(struct inode *inode, loff_t offset, loff_t len), + + TP_ARGS(inode, offset, len), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( loff_t, offset ) + __field( loff_t, len ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->offset = offset; + __entry->len = len; + ), + + TP_printk("dev %d,%d ino %lu offset %lld len %lld", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, + __entry->offset, __entry->len) +); + TRACE_EVENT(ext4_unlink_enter, TP_PROTO(struct inode *parent, struct dentry *dentry), -- cgit v1.2.3 From 9fff24aa2c5c504aadead1ff9599e813604c2e53 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 6 Feb 2013 22:30:23 -0500 Subject: jbd2: track request delay statistics Track the delay between when we first request that the commit begin and when it actually begins, so we can see how much of a gap exists. In theory, this should just be the remaining scheduling quantuum of the thread which requested the commit (assuming it was not a synchronous operation which triggered the commit request) plus scheduling overhead; however, it's possible that real time processes might get in the way of letting the kjournald thread from executing. Signed-off-by: "Theodore Ts'o" --- fs/jbd2/commit.c | 8 ++++++++ fs/jbd2/journal.c | 12 +++++++++--- fs/jbd2/transaction.c | 1 + include/linux/jbd2.h | 7 +++++++ include/trace/events/jbd2.h | 8 ++++++-- 5 files changed, 31 insertions(+), 5 deletions(-) (limited to 'include/trace') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 3091d42992f0..750c70148eff 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -435,7 +435,12 @@ void jbd2_journal_commit_transaction(journal_t *journal) trace_jbd2_commit_locking(journal, commit_transaction); stats.run.rs_wait = commit_transaction->t_max_wait; + stats.run.rs_request_delay = 0; stats.run.rs_locked = jiffies; + if (commit_transaction->t_requested) + stats.run.rs_request_delay = + jbd2_time_diff(commit_transaction->t_requested, + stats.run.rs_locked); stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start, stats.run.rs_locked); @@ -1116,7 +1121,10 @@ restart_loop: */ spin_lock(&journal->j_history_lock); journal->j_stats.ts_tid++; + if (commit_transaction->t_requested) + journal->j_stats.ts_requested++; journal->j_stats.run.rs_wait += stats.run.rs_wait; + journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay; journal->j_stats.run.rs_running += stats.run.rs_running; journal->j_stats.run.rs_locked += stats.run.rs_locked; journal->j_stats.run.rs_flushing += stats.run.rs_flushing; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 1a80e3146a59..4ba2e81e35ac 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -533,6 +533,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t target) jbd_debug(1, "JBD2: requesting commit %d/%d\n", journal->j_commit_request, journal->j_commit_sequence); + journal->j_running_transaction->t_requested = jiffies; wake_up(&journal->j_wait_commit); return 1; } else if (!tid_geq(journal->j_commit_request, target)) @@ -898,13 +899,18 @@ static int jbd2_seq_info_show(struct seq_file *seq, void *v) if (v != SEQ_START_TOKEN) return 0; - seq_printf(seq, "%lu transaction, each up to %u blocks\n", - s->stats->ts_tid, - s->journal->j_max_transaction_buffers); + seq_printf(seq, "%lu transactions (%lu requested), " + "each up to %u blocks\n", + s->stats->ts_tid, s->stats->ts_requested, + s->journal->j_max_transaction_buffers); if (s->stats->ts_tid == 0) return 0; seq_printf(seq, "average: \n %ums waiting for transaction\n", jiffies_to_msecs(s->stats->run.rs_wait / s->stats->ts_tid)); + seq_printf(seq, " %ums request delay\n", + (s->stats->ts_requested == 0) ? 0 : + jiffies_to_msecs(s->stats->run.rs_request_delay / + s->stats->ts_requested)); seq_printf(seq, " %ums running transaction\n", jiffies_to_msecs(s->stats->run.rs_running / s->stats->ts_tid)); seq_printf(seq, " %ums transaction was being locked\n", diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index df9f29760efa..735609e2d636 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -100,6 +100,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) journal->j_running_transaction = transaction; transaction->t_max_wait = 0; transaction->t_start = jiffies; + transaction->t_requested = 0; return transaction; } diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index e30b66346942..e0aafc46064f 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -580,6 +580,11 @@ struct transaction_s */ unsigned long t_start; + /* + * When commit was requested + */ + unsigned long t_requested; + /* * Checkpointing stats [j_checkpoint_sem] */ @@ -637,6 +642,7 @@ struct transaction_s struct transaction_run_stats_s { unsigned long rs_wait; + unsigned long rs_request_delay; unsigned long rs_running; unsigned long rs_locked; unsigned long rs_flushing; @@ -649,6 +655,7 @@ struct transaction_run_stats_s { struct transaction_stats_s { unsigned long ts_tid; + unsigned long ts_requested; struct transaction_run_stats_s run; }; diff --git a/include/trace/events/jbd2.h b/include/trace/events/jbd2.h index 127993dbf322..5419f57beb1f 100644 --- a/include/trace/events/jbd2.h +++ b/include/trace/events/jbd2.h @@ -142,6 +142,7 @@ TRACE_EVENT(jbd2_run_stats, __field( dev_t, dev ) __field( unsigned long, tid ) __field( unsigned long, wait ) + __field( unsigned long, request_delay ) __field( unsigned long, running ) __field( unsigned long, locked ) __field( unsigned long, flushing ) @@ -155,6 +156,7 @@ TRACE_EVENT(jbd2_run_stats, __entry->dev = dev; __entry->tid = tid; __entry->wait = stats->rs_wait; + __entry->request_delay = stats->rs_request_delay; __entry->running = stats->rs_running; __entry->locked = stats->rs_locked; __entry->flushing = stats->rs_flushing; @@ -164,10 +166,12 @@ TRACE_EVENT(jbd2_run_stats, __entry->blocks_logged = stats->rs_blocks_logged; ), - TP_printk("dev %d,%d tid %lu wait %u running %u locked %u flushing %u " - "logging %u handle_count %u blocks %u blocks_logged %u", + TP_printk("dev %d,%d tid %lu wait %u request_delay %u running %u " + "locked %u flushing %u logging %u handle_count %u " + "blocks %u blocks_logged %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid, jiffies_to_msecs(__entry->wait), + jiffies_to_msecs(__entry->request_delay), jiffies_to_msecs(__entry->running), jiffies_to_msecs(__entry->locked), jiffies_to_msecs(__entry->flushing), -- cgit v1.2.3 From 343d9c283c9847da043fda3e76e3197f27b667dd Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 8 Feb 2013 13:00:22 -0500 Subject: jbd2: add tracepoints which provide per-handle statistics Handles which stay open a long time are problematic when it comes time to close down a transaction so it can be committed. These tracepoints will help us determine which ones are the problematic ones, and to validate whether changes makes things better or worse. Signed-off-by: "Theodore Ts'o" --- fs/jbd2/transaction.c | 28 ++++++++++++- include/linux/jbd2.h | 8 +++- include/trace/events/jbd2.h | 98 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 131 insertions(+), 3 deletions(-) (limited to 'include/trace') diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 735609e2d636..b7e2385c6e92 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -30,6 +30,8 @@ #include #include +#include + static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); static void __jbd2_journal_unfile_buffer(struct journal_head *jh); @@ -307,6 +309,8 @@ repeat: */ update_t_max_wait(transaction, ts); handle->h_transaction = transaction; + handle->h_requested_credits = nblocks; + handle->h_start_jiffies = jiffies; atomic_inc(&transaction->t_updates); atomic_inc(&transaction->t_handle_count); jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n", @@ -353,7 +357,8 @@ static handle_t *new_handle(int nblocks) * Return a pointer to a newly allocated handle, or an ERR_PTR() value * on failure. */ -handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask) +handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask, + unsigned int type, unsigned int line_no) { handle_t *handle = journal_current_handle(); int err; @@ -379,6 +384,11 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, gfp_t gfp_mask) current->journal_info = NULL; handle = ERR_PTR(err); } + handle->h_type = type; + handle->h_line_no = line_no; + trace_jbd2_handle_start(journal->j_fs_dev->bd_dev, + handle->h_transaction->t_tid, type, + line_no, nblocks); return handle; } EXPORT_SYMBOL(jbd2__journal_start); @@ -386,7 +396,7 @@ EXPORT_SYMBOL(jbd2__journal_start); handle_t *jbd2_journal_start(journal_t *journal, int nblocks) { - return jbd2__journal_start(journal, nblocks, GFP_NOFS); + return jbd2__journal_start(journal, nblocks, GFP_NOFS, 0, 0); } EXPORT_SYMBOL(jbd2_journal_start); @@ -448,7 +458,14 @@ int jbd2_journal_extend(handle_t *handle, int nblocks) goto unlock; } + trace_jbd2_handle_extend(journal->j_fs_dev->bd_dev, + handle->h_transaction->t_tid, + handle->h_type, handle->h_line_no, + handle->h_buffer_credits, + nblocks); + handle->h_buffer_credits += nblocks; + handle->h_requested_credits += nblocks; atomic_add(nblocks, &transaction->t_outstanding_credits); result = 0; @@ -1377,6 +1394,13 @@ int jbd2_journal_stop(handle_t *handle) } jbd_debug(4, "Handle %p going down\n", handle); + trace_jbd2_handle_stats(journal->j_fs_dev->bd_dev, + handle->h_transaction->t_tid, + handle->h_type, handle->h_line_no, + jiffies - handle->h_start_jiffies, + handle->h_sync, handle->h_requested_credits, + (handle->h_requested_credits - + handle->h_buffer_credits)); /* * Implement synchronous transaction batching. If the handle diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 24db7256a5ff..fa5fea17b619 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -400,6 +400,11 @@ struct jbd2_journal_handle unsigned int h_sync: 1; /* sync-on-close */ unsigned int h_jdata: 1; /* force data journaling */ unsigned int h_aborted: 1; /* fatal error on handle */ + unsigned int h_type: 8; /* for handle statistics */ + unsigned int h_line_no: 16; /* for handle statistics */ + + unsigned long h_start_jiffies; + unsigned int h_requested_credits; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map h_lockdep_map; @@ -1071,7 +1076,8 @@ static inline handle_t *journal_current_handle(void) */ extern handle_t *jbd2_journal_start(journal_t *, int nblocks); -extern handle_t *jbd2__journal_start(journal_t *, int nblocks, gfp_t gfp_mask); +extern handle_t *jbd2__journal_start(journal_t *, int nblocks, gfp_t gfp_mask, + unsigned int type, unsigned int line_no); extern int jbd2_journal_restart(handle_t *, int nblocks); extern int jbd2__journal_restart(handle_t *, int nblocks, gfp_t gfp_mask); extern int jbd2_journal_extend (handle_t *, int nblocks); diff --git a/include/trace/events/jbd2.h b/include/trace/events/jbd2.h index 5419f57beb1f..070df49e4a1d 100644 --- a/include/trace/events/jbd2.h +++ b/include/trace/events/jbd2.h @@ -132,6 +132,104 @@ TRACE_EVENT(jbd2_submit_inode_data, (unsigned long) __entry->ino) ); +TRACE_EVENT(jbd2_handle_start, + TP_PROTO(dev_t dev, unsigned long tid, unsigned int type, + unsigned int line_no, int requested_blocks), + + TP_ARGS(dev, tid, type, line_no, requested_blocks), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( unsigned long, tid ) + __field( unsigned int, type ) + __field( unsigned int, line_no ) + __field( int, requested_blocks) + ), + + TP_fast_assign( + __entry->dev = dev; + __entry->tid = tid; + __entry->type = type; + __entry->line_no = line_no; + __entry->requested_blocks = requested_blocks; + ), + + TP_printk("dev %d,%d tid %lu type %u line_no %u " + "requested_blocks %d", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid, + __entry->type, __entry->line_no, __entry->requested_blocks) +); + +TRACE_EVENT(jbd2_handle_extend, + TP_PROTO(dev_t dev, unsigned long tid, unsigned int type, + unsigned int line_no, int buffer_credits, + int requested_blocks), + + TP_ARGS(dev, tid, type, line_no, buffer_credits, requested_blocks), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( unsigned long, tid ) + __field( unsigned int, type ) + __field( unsigned int, line_no ) + __field( int, buffer_credits ) + __field( int, requested_blocks) + ), + + TP_fast_assign( + __entry->dev = dev; + __entry->tid = tid; + __entry->type = type; + __entry->line_no = line_no; + __entry->buffer_credits = buffer_credits; + __entry->requested_blocks = requested_blocks; + ), + + TP_printk("dev %d,%d tid %lu type %u line_no %u " + "buffer_credits %d requested_blocks %d", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid, + __entry->type, __entry->line_no, __entry->buffer_credits, + __entry->requested_blocks) +); + +TRACE_EVENT(jbd2_handle_stats, + TP_PROTO(dev_t dev, unsigned long tid, unsigned int type, + unsigned int line_no, int interval, int sync, + int requested_blocks, int dirtied_blocks), + + TP_ARGS(dev, tid, type, line_no, interval, sync, + requested_blocks, dirtied_blocks), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( unsigned long, tid ) + __field( unsigned int, type ) + __field( unsigned int, line_no ) + __field( int, interval ) + __field( int, sync ) + __field( int, requested_blocks) + __field( int, dirtied_blocks ) + ), + + TP_fast_assign( + __entry->dev = dev; + __entry->tid = tid; + __entry->type = type; + __entry->line_no = line_no; + __entry->interval = interval; + __entry->sync = sync; + __entry->requested_blocks = requested_blocks; + __entry->dirtied_blocks = dirtied_blocks; + ), + + TP_printk("dev %d,%d tid %lu type %u line_no %u interval %d " + "sync %d requested_blocks %d dirtied_blocks %d", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tid, + __entry->type, __entry->line_no, __entry->interval, + __entry->sync, __entry->requested_blocks, + __entry->dirtied_blocks) +); + TRACE_EVENT(jbd2_run_stats, TP_PROTO(dev_t dev, unsigned long tid, struct transaction_run_stats_s *stats), -- cgit v1.2.3 From 06b0c886214a223dde7b21cbfc3008fd20a8ce16 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Mon, 18 Feb 2013 00:26:51 -0500 Subject: ext4: refine extent status tree This commit refines the extent status tree code. 1) A prefix 'es_' is added to to the extent status tree structure members. 2) Refactored es_remove_extent() so that __es_remove_extent() can be used by es_insert_extent() to remove the old extent entry(-ies) before inserting a new one. 3) Rename extent_status_end() to ext4_es_end() 4) ext4_es_can_be_merged() is define to check whether two extents can be merged or not. 5) Update and clarified comments. Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" Reviewed-by: Jan Kara --- fs/ext4/extents.c | 21 +-- fs/ext4/extents_status.c | 322 +++++++++++++++++++++++++------------------- fs/ext4/extents_status.h | 8 +- fs/ext4/file.c | 12 +- include/trace/events/ext4.h | 40 +++--- 5 files changed, 221 insertions(+), 182 deletions(-) (limited to 'include/trace') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index b6b54d658dc2..37f94a751ad7 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3528,13 +3528,14 @@ static int ext4_find_delalloc_range(struct inode *inode, { struct extent_status es; - es.start = lblk_start; - ext4_es_find_extent(inode, &es); - if (es.len == 0) + es.es_lblk = lblk_start; + (void)ext4_es_find_extent(inode, &es); + if (es.es_len == 0) return 0; /* there is no delay extent in this tree */ - else if (es.start <= lblk_start && lblk_start < es.start + es.len) + else if (es.es_lblk <= lblk_start && + lblk_start < es.es_lblk + es.es_len) return 1; - else if (lblk_start <= es.start && es.start <= lblk_end) + else if (lblk_start <= es.es_lblk && es.es_lblk <= lblk_end) return 1; else return 0; @@ -4569,7 +4570,7 @@ static int ext4_find_delayed_extent(struct inode *inode, struct extent_status es; ext4_lblk_t next_del; - es.start = newex->ec_block; + es.es_lblk = newex->ec_block; next_del = ext4_es_find_extent(inode, &es); if (newex->ec_start == 0) { @@ -4577,18 +4578,18 @@ static int ext4_find_delayed_extent(struct inode *inode, * No extent in extent-tree contains block @newex->ec_start, * then the block may stay in 1)a hole or 2)delayed-extent. */ - if (es.len == 0) + if (es.es_len == 0) /* A hole found. */ return 0; - if (es.start > newex->ec_block) { + if (es.es_lblk > newex->ec_block) { /* A hole found. */ - newex->ec_len = min(es.start - newex->ec_block, + newex->ec_len = min(es.es_lblk - newex->ec_block, newex->ec_len); return 0; } - newex->ec_len = es.start + es.len - newex->ec_block; + newex->ec_len = es.es_lblk + es.es_len - newex->ec_block; } return next_del; diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 564d981a2fcc..c9921cf4f817 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -23,40 +23,53 @@ * (e.g. Reservation space warning), and provide extent-level locking. * Delay extent tree is the first step to achieve this goal. It is * original built by Yongqiang Yang. At that time it is called delay - * extent tree, whose goal is only track delay extent in memory to + * extent tree, whose goal is only track delayed extents in memory to * simplify the implementation of fiemap and bigalloc, and introduce * lseek SEEK_DATA/SEEK_HOLE support. That is why it is still called - * delay extent tree at the following comment. But for better - * understand what it does, it has been rename to extent status tree. + * delay extent tree at the first commit. But for better understand + * what it does, it has been rename to extent status tree. * - * Currently the first step has been done. All delay extents are - * tracked in the tree. It maintains the delay extent when a delay - * allocation is issued, and the delay extent is written out or + * Step1: + * Currently the first step has been done. All delayed extents are + * tracked in the tree. It maintains the delayed extent when a delayed + * allocation is issued, and the delayed extent is written out or * invalidated. Therefore the implementation of fiemap and bigalloc * are simplified, and SEEK_DATA/SEEK_HOLE are introduced. * * The following comment describes the implemenmtation of extent * status tree and future works. + * + * Step2: + * In this step all extent status are tracked by extent status tree. + * Thus, we can first try to lookup a block mapping in this tree before + * finding it in extent tree. Hence, single extent cache can be removed + * because extent status tree can do a better job. Extents in status + * tree are loaded on-demand. Therefore, the extent status tree may not + * contain all of the extents in a file. Meanwhile we define a shrinker + * to reclaim memory from extent status tree because fragmented extent + * tree will make status tree cost too much memory. written/unwritten/- + * hole extents in the tree will be reclaimed by this shrinker when we + * are under high memory pressure. Delayed extents will not be + * reclimed because fiemap, bigalloc, and seek_data/hole need it. */ /* - * extents status tree implementation for ext4. + * Extent status tree implementation for ext4. * * * ========================================================================== - * Extents status encompass delayed extents and extent locks + * Extent status tree tracks all extent status. * - * 1. Why delayed extent implementation ? + * 1. Why we need to implement extent status tree? * - * Without delayed extent, ext4 identifies a delayed extent by looking + * Without extent status tree, ext4 identifies a delayed extent by looking * up page cache, this has several deficiencies - complicated, buggy, * and inefficient code. * - * FIEMAP, SEEK_HOLE/DATA, bigalloc, punch hole and writeout all need - * to know if a block or a range of blocks are belonged to a delayed - * extent. + * FIEMAP, SEEK_HOLE/DATA, bigalloc, and writeout all need to know if a + * block or a range of blocks are belonged to a delayed extent. * - * Let us have a look at how they do without delayed extents implementation. + * Let us have a look at how they do without extent status tree. * -- FIEMAP * FIEMAP looks up page cache to identify delayed allocations from holes. * @@ -68,47 +81,48 @@ * already under delayed allocation or not to determine whether * quota reserving is needed for the cluster. * - * -- punch hole - * punch hole looks up page cache to identify a delayed extent. - * * -- writeout * Writeout looks up whole page cache to see if a buffer is * mapped, If there are not very many delayed buffers, then it is * time comsuming. * - * With delayed extents implementation, FIEMAP, SEEK_HOLE/DATA, + * With extent status tree implementation, FIEMAP, SEEK_HOLE/DATA, * bigalloc and writeout can figure out if a block or a range of * blocks is under delayed allocation(belonged to a delayed extent) or - * not by searching the delayed extent tree. + * not by searching the extent tree. * * * ========================================================================== - * 2. ext4 delayed extents impelmentation + * 2. Ext4 extent status tree impelmentation + * + * -- extent + * A extent is a range of blocks which are contiguous logically and + * physically. Unlike extent in extent tree, this extent in ext4 is + * a in-memory struct, there is no corresponding on-disk data. There + * is no limit on length of extent, so an extent can contain as many + * blocks as they are contiguous logically and physically. * - * -- delayed extent - * A delayed extent is a range of blocks which are contiguous - * logically and under delayed allocation. Unlike extent in - * ext4, delayed extent in ext4 is a in-memory struct, there is - * no corresponding on-disk data. There is no limit on length of - * delayed extent, so a delayed extent can contain as many blocks - * as they are contiguous logically. + * -- extent status tree + * Every inode has an extent status tree and all allocation blocks + * are added to the tree with different status. The extent in the + * tree are ordered by logical block no. * - * -- delayed extent tree - * Every inode has a delayed extent tree and all under delayed - * allocation blocks are added to the tree as delayed extents. - * Delayed extents in the tree are ordered by logical block no. + * -- operations on a extent status tree + * There are three important operations on a delayed extent tree: find + * next extent, adding a extent(a range of blocks) and removing a extent. * - * -- operations on a delayed extent tree - * There are three operations on a delayed extent tree: find next - * delayed extent, adding a space(a range of blocks) and removing - * a space. + * -- race on a extent status tree + * Extent status tree is protected by inode->i_es_lock. * - * -- race on a delayed extent tree - * Delayed extent tree is protected inode->i_es_lock. + * -- memory consumption + * Fragmented extent tree will make extent status tree cost too much + * memory. Hence, we will reclaim written/unwritten/hole extents from + * the tree under a heavy memory pressure. * * * ========================================================================== - * 3. performance analysis + * 3. Performance analysis + * * -- overhead * 1. There is a cache extent for write access, so if writes are * not very random, adding space operaions are in O(1) time. @@ -120,15 +134,19 @@ * * ========================================================================== * 4. TODO list - * -- Track all extent status * - * -- Improve get block process + * -- Refactor delayed space reservation * * -- Extent-level locking */ static struct kmem_cache *ext4_es_cachep; +static int __es_insert_extent(struct ext4_es_tree *tree, + struct extent_status *newes); +static int __es_remove_extent(struct ext4_es_tree *tree, ext4_lblk_t lblk, + ext4_lblk_t end); + int __init ext4_init_es(void) { ext4_es_cachep = KMEM_CACHE(extent_status, SLAB_RECLAIM_ACCOUNT); @@ -161,7 +179,7 @@ static void ext4_es_print_tree(struct inode *inode) while (node) { struct extent_status *es; es = rb_entry(node, struct extent_status, rb_node); - printk(KERN_DEBUG " [%u/%u)", es->start, es->len); + printk(KERN_DEBUG " [%u/%u)", es->es_lblk, es->es_len); node = rb_next(node); } printk(KERN_DEBUG "\n"); @@ -170,10 +188,10 @@ static void ext4_es_print_tree(struct inode *inode) #define ext4_es_print_tree(inode) #endif -static inline ext4_lblk_t extent_status_end(struct extent_status *es) +static inline ext4_lblk_t ext4_es_end(struct extent_status *es) { - BUG_ON(es->start + es->len < es->start); - return es->start + es->len - 1; + BUG_ON(es->es_lblk + es->es_len < es->es_lblk); + return es->es_lblk + es->es_len - 1; } /* @@ -181,25 +199,25 @@ static inline ext4_lblk_t extent_status_end(struct extent_status *es) * it can't be found, try to find next extent. */ static struct extent_status *__es_tree_search(struct rb_root *root, - ext4_lblk_t offset) + ext4_lblk_t lblk) { struct rb_node *node = root->rb_node; struct extent_status *es = NULL; while (node) { es = rb_entry(node, struct extent_status, rb_node); - if (offset < es->start) + if (lblk < es->es_lblk) node = node->rb_left; - else if (offset > extent_status_end(es)) + else if (lblk > ext4_es_end(es)) node = node->rb_right; else return es; } - if (es && offset < es->start) + if (es && lblk < es->es_lblk) return es; - if (es && offset > extent_status_end(es)) { + if (es && lblk > ext4_es_end(es)) { node = rb_next(&es->rb_node); return node ? rb_entry(node, struct extent_status, rb_node) : NULL; @@ -209,8 +227,8 @@ static struct extent_status *__es_tree_search(struct rb_root *root, } /* - * ext4_es_find_extent: find the 1st delayed extent covering @es->start - * if it exists, otherwise, the next extent after @es->start. + * ext4_es_find_extent: find the 1st delayed extent covering @es->lblk + * if it exists, otherwise, the next extent after @es->lblk. * * @inode: the inode which owns delayed extents * @es: delayed extent that we found @@ -226,7 +244,7 @@ ext4_lblk_t ext4_es_find_extent(struct inode *inode, struct extent_status *es) struct rb_node *node; ext4_lblk_t ret = EXT_MAX_BLOCKS; - trace_ext4_es_find_extent_enter(inode, es->start); + trace_ext4_es_find_extent_enter(inode, es->es_lblk); read_lock(&EXT4_I(inode)->i_es_lock); tree = &EXT4_I(inode)->i_es_tree; @@ -234,25 +252,25 @@ ext4_lblk_t ext4_es_find_extent(struct inode *inode, struct extent_status *es) /* find delay extent in cache firstly */ if (tree->cache_es) { es1 = tree->cache_es; - if (in_range(es->start, es1->start, es1->len)) { + if (in_range(es->es_lblk, es1->es_lblk, es1->es_len)) { es_debug("%u cached by [%u/%u)\n", - es->start, es1->start, es1->len); + es->es_lblk, es1->es_lblk, es1->es_len); goto out; } } - es->len = 0; - es1 = __es_tree_search(&tree->root, es->start); + es->es_len = 0; + es1 = __es_tree_search(&tree->root, es->es_lblk); out: if (es1) { tree->cache_es = es1; - es->start = es1->start; - es->len = es1->len; + es->es_lblk = es1->es_lblk; + es->es_len = es1->es_len; node = rb_next(&es1->rb_node); if (node) { es1 = rb_entry(node, struct extent_status, rb_node); - ret = es1->start; + ret = es1->es_lblk; } } @@ -263,14 +281,14 @@ out: } static struct extent_status * -ext4_es_alloc_extent(ext4_lblk_t start, ext4_lblk_t len) +ext4_es_alloc_extent(ext4_lblk_t lblk, ext4_lblk_t len) { struct extent_status *es; es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC); if (es == NULL) return NULL; - es->start = start; - es->len = len; + es->es_lblk = lblk; + es->es_len = len; return es; } @@ -279,6 +297,20 @@ static void ext4_es_free_extent(struct extent_status *es) kmem_cache_free(ext4_es_cachep, es); } +/* + * Check whether or not two extents can be merged + * Condition: + * - logical block number is contiguous + */ +static int ext4_es_can_be_merged(struct extent_status *es1, + struct extent_status *es2) +{ + if (es1->es_lblk + es1->es_len != es2->es_lblk) + return 0; + + return 1; +} + static struct extent_status * ext4_es_try_to_merge_left(struct ext4_es_tree *tree, struct extent_status *es) { @@ -290,8 +322,8 @@ ext4_es_try_to_merge_left(struct ext4_es_tree *tree, struct extent_status *es) return es; es1 = rb_entry(node, struct extent_status, rb_node); - if (es->start == extent_status_end(es1) + 1) { - es1->len += es->len; + if (ext4_es_can_be_merged(es1, es)) { + es1->es_len += es->es_len; rb_erase(&es->rb_node, &tree->root); ext4_es_free_extent(es); es = es1; @@ -311,8 +343,8 @@ ext4_es_try_to_merge_right(struct ext4_es_tree *tree, struct extent_status *es) return es; es1 = rb_entry(node, struct extent_status, rb_node); - if (es1->start == extent_status_end(es) + 1) { - es->len += es1->len; + if (ext4_es_can_be_merged(es, es1)) { + es->es_len += es1->es_len; rb_erase(node, &tree->root); ext4_es_free_extent(es1); } @@ -320,60 +352,43 @@ ext4_es_try_to_merge_right(struct ext4_es_tree *tree, struct extent_status *es) return es; } -static int __es_insert_extent(struct ext4_es_tree *tree, ext4_lblk_t offset, - ext4_lblk_t len) +static int __es_insert_extent(struct ext4_es_tree *tree, + struct extent_status *newes) { struct rb_node **p = &tree->root.rb_node; struct rb_node *parent = NULL; struct extent_status *es; - ext4_lblk_t end = offset + len - 1; - - BUG_ON(end < offset); - es = tree->cache_es; - if (es && offset == (extent_status_end(es) + 1)) { - es_debug("cached by [%u/%u)\n", es->start, es->len); - es->len += len; - es = ext4_es_try_to_merge_right(tree, es); - goto out; - } else if (es && es->start == end + 1) { - es_debug("cached by [%u/%u)\n", es->start, es->len); - es->start = offset; - es->len += len; - es = ext4_es_try_to_merge_left(tree, es); - goto out; - } else if (es && es->start <= offset && - end <= extent_status_end(es)) { - es_debug("cached by [%u/%u)\n", es->start, es->len); - goto out; - } while (*p) { parent = *p; es = rb_entry(parent, struct extent_status, rb_node); - if (offset < es->start) { - if (es->start == end + 1) { - es->start = offset; - es->len += len; + if (newes->es_lblk < es->es_lblk) { + if (ext4_es_can_be_merged(newes, es)) { + /* + * Here we can modify es_lblk directly + * because it isn't overlapped. + */ + es->es_lblk = newes->es_lblk; + es->es_len += newes->es_len; es = ext4_es_try_to_merge_left(tree, es); goto out; } p = &(*p)->rb_left; - } else if (offset > extent_status_end(es)) { - if (offset == extent_status_end(es) + 1) { - es->len += len; + } else if (newes->es_lblk > ext4_es_end(es)) { + if (ext4_es_can_be_merged(es, newes)) { + es->es_len += newes->es_len; es = ext4_es_try_to_merge_right(tree, es); goto out; } p = &(*p)->rb_right; } else { - if (extent_status_end(es) <= end) - es->len = offset - es->start + len; - goto out; + BUG_ON(1); + return -EINVAL; } } - es = ext4_es_alloc_extent(offset, len); + es = ext4_es_alloc_extent(newes->es_lblk, newes->es_len); if (!es) return -ENOMEM; rb_link_node(&es->rb_node, parent, p); @@ -385,27 +400,38 @@ out: } /* - * ext4_es_insert_extent() adds a space to a delayed extent tree. - * Caller holds inode->i_es_lock. + * ext4_es_insert_extent() adds a space to a extent status tree. * * ext4_es_insert_extent is called by ext4_da_write_begin and * ext4_es_remove_extent. * * Return 0 on success, error code on failure. */ -int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t offset, +int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len) { struct ext4_es_tree *tree; + struct extent_status newes; + ext4_lblk_t end = lblk + len - 1; int err = 0; - trace_ext4_es_insert_extent(inode, offset, len); + trace_ext4_es_insert_extent(inode, lblk, len); es_debug("add [%u/%u) to extent status tree of inode %lu\n", - offset, len, inode->i_ino); + lblk, len, inode->i_ino); + + BUG_ON(end < lblk); + + newes.es_lblk = lblk; + newes.es_len = len; write_lock(&EXT4_I(inode)->i_es_lock); tree = &EXT4_I(inode)->i_es_tree; - err = __es_insert_extent(tree, offset, len); + err = __es_remove_extent(tree, lblk, end); + if (err != 0) + goto error; + err = __es_insert_extent(tree, &newes); + +error: write_unlock(&EXT4_I(inode)->i_es_lock); ext4_es_print_tree(inode); @@ -413,57 +439,45 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t offset, return err; } -/* - * ext4_es_remove_extent() removes a space from a delayed extent tree. - * Caller holds inode->i_es_lock. - * - * Return 0 on success, error code on failure. - */ -int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset, - ext4_lblk_t len) +static int __es_remove_extent(struct ext4_es_tree *tree, ext4_lblk_t lblk, + ext4_lblk_t end) { struct rb_node *node; - struct ext4_es_tree *tree; struct extent_status *es; struct extent_status orig_es; - ext4_lblk_t len1, len2, end; + ext4_lblk_t len1, len2; int err = 0; - trace_ext4_es_remove_extent(inode, offset, len); - es_debug("remove [%u/%u) from extent status tree of inode %lu\n", - offset, len, inode->i_ino); - - end = offset + len - 1; - BUG_ON(end < offset); - write_lock(&EXT4_I(inode)->i_es_lock); - tree = &EXT4_I(inode)->i_es_tree; - es = __es_tree_search(&tree->root, offset); + es = __es_tree_search(&tree->root, lblk); if (!es) goto out; - if (es->start > end) + if (es->es_lblk > end) goto out; /* Simply invalidate cache_es. */ tree->cache_es = NULL; - orig_es.start = es->start; - orig_es.len = es->len; - len1 = offset > es->start ? offset - es->start : 0; - len2 = extent_status_end(es) > end ? - extent_status_end(es) - end : 0; + orig_es.es_lblk = es->es_lblk; + orig_es.es_len = es->es_len; + len1 = lblk > es->es_lblk ? lblk - es->es_lblk : 0; + len2 = ext4_es_end(es) > end ? ext4_es_end(es) - end : 0; if (len1 > 0) - es->len = len1; + es->es_len = len1; if (len2 > 0) { if (len1 > 0) { - err = __es_insert_extent(tree, end + 1, len2); + struct extent_status newes; + + newes.es_lblk = end + 1; + newes.es_len = len2; + err = __es_insert_extent(tree, &newes); if (err) { - es->start = orig_es.start; - es->len = orig_es.len; + es->es_lblk = orig_es.es_lblk; + es->es_len = orig_es.es_len; goto out; } } else { - es->start = end + 1; - es->len = len2; + es->es_lblk = end + 1; + es->es_len = len2; } goto out; } @@ -476,7 +490,7 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset, es = NULL; } - while (es && extent_status_end(es) <= end) { + while (es && ext4_es_end(es) <= end) { node = rb_next(&es->rb_node); rb_erase(&es->rb_node, &tree->root); ext4_es_free_extent(es); @@ -487,13 +501,39 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t offset, es = rb_entry(node, struct extent_status, rb_node); } - if (es && es->start < end + 1) { - len1 = extent_status_end(es) - end; - es->start = end + 1; - es->len = len1; + if (es && es->es_lblk < end + 1) { + len1 = ext4_es_end(es) - end; + es->es_lblk = end + 1; + es->es_len = len1; } out: + return err; +} + +/* + * ext4_es_remove_extent() removes a space from a extent status tree. + * + * Return 0 on success, error code on failure. + */ +int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, + ext4_lblk_t len) +{ + struct ext4_es_tree *tree; + ext4_lblk_t end; + int err = 0; + + trace_ext4_es_remove_extent(inode, lblk, len); + es_debug("remove [%u/%u) from extent status tree of inode %lu\n", + lblk, len, inode->i_ino); + + end = lblk + len - 1; + BUG_ON(end < lblk); + + tree = &EXT4_I(inode)->i_es_tree; + + write_lock(&EXT4_I(inode)->i_es_lock); + err = __es_remove_extent(tree, lblk, end); write_unlock(&EXT4_I(inode)->i_es_lock); ext4_es_print_tree(inode); return err; diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 077f82db092a..81e9339f23f1 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -22,8 +22,8 @@ struct extent_status { struct rb_node rb_node; - ext4_lblk_t start; /* first block extent covers */ - ext4_lblk_t len; /* length of extent in block */ + ext4_lblk_t es_lblk; /* first logical block extent covers */ + ext4_lblk_t es_len; /* length of extent in block */ }; struct ext4_es_tree { @@ -35,9 +35,9 @@ extern int __init ext4_init_es(void); extern void ext4_exit_es(void); extern void ext4_es_init_tree(struct ext4_es_tree *tree); -extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t start, +extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len); -extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t start, +extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len); extern ext4_lblk_t ext4_es_find_extent(struct inode *inode, struct extent_status *es); diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 2cf8ab810687..2df9354b105e 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -464,10 +464,9 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) * If there is a delay extent at this offset, * it will be as a data. */ - es.start = last; + es.es_lblk = last; (void)ext4_es_find_extent(inode, &es); - if (last >= es.start && - last < es.start + es.len) { + if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { if (last != start) dataoff = last << blkbits; break; @@ -549,11 +548,10 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) * If there is a delay extent at this offset, * we will skip this extent. */ - es.start = last; + es.es_lblk = last; (void)ext4_es_find_extent(inode, &es); - if (last >= es.start && - last < es.start + es.len) { - last = es.start + es.len; + if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { + last = es.es_lblk + es.es_len; holeoff = last << blkbits; continue; } diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 6080ea1380b8..52c923851959 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -2093,75 +2093,75 @@ TRACE_EVENT(ext4_ext_remove_space_done, ); TRACE_EVENT(ext4_es_insert_extent, - TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t len), + TP_PROTO(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len), - TP_ARGS(inode, start, len), + TP_ARGS(inode, lblk, len), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) - __field( loff_t, start ) + __field( loff_t, lblk ) __field( loff_t, len ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->start = start; + __entry->lblk = lblk; __entry->len = len; ), TP_printk("dev %d,%d ino %lu es [%lld/%lld)", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, - __entry->start, __entry->len) + __entry->lblk, __entry->len) ); TRACE_EVENT(ext4_es_remove_extent, - TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t len), + TP_PROTO(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len), - TP_ARGS(inode, start, len), + TP_ARGS(inode, lblk, len), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) - __field( loff_t, start ) + __field( loff_t, lblk ) __field( loff_t, len ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->start = start; + __entry->lblk = lblk; __entry->len = len; ), TP_printk("dev %d,%d ino %lu es [%lld/%lld)", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, - __entry->start, __entry->len) + __entry->lblk, __entry->len) ); TRACE_EVENT(ext4_es_find_extent_enter, - TP_PROTO(struct inode *inode, ext4_lblk_t start), + TP_PROTO(struct inode *inode, ext4_lblk_t lblk), - TP_ARGS(inode, start), + TP_ARGS(inode, lblk), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) - __field( ext4_lblk_t, start ) + __field( ext4_lblk_t, lblk ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->start = start; + __entry->lblk = lblk; ), - TP_printk("dev %d,%d ino %lu start %u", + TP_printk("dev %d,%d ino %lu lblk %u", MAJOR(__entry->dev), MINOR(__entry->dev), - (unsigned long) __entry->ino, __entry->start) + (unsigned long) __entry->ino, __entry->lblk) ); TRACE_EVENT(ext4_es_find_extent_exit, @@ -2173,7 +2173,7 @@ TRACE_EVENT(ext4_es_find_extent_exit, TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) - __field( ext4_lblk_t, start ) + __field( ext4_lblk_t, lblk ) __field( ext4_lblk_t, len ) __field( ext4_lblk_t, ret ) ), @@ -2181,15 +2181,15 @@ TRACE_EVENT(ext4_es_find_extent_exit, TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->start = es->start; - __entry->len = es->len; + __entry->lblk = es->es_lblk; + __entry->len = es->es_len; __entry->ret = ret; ), TP_printk("dev %d,%d ino %lu es [%u/%u) ret %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, - __entry->start, __entry->len, __entry->ret) + __entry->lblk, __entry->len, __entry->ret) ); #endif /* _TRACE_EXT4_H */ -- cgit v1.2.3 From fdc0212e86ca15c5cfed77088af7cc5eb79ccbc7 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Mon, 18 Feb 2013 00:26:51 -0500 Subject: ext4: add physical block and status member into extent status tree This commit adds two members in extent_status structure to let it record physical block and extent status. Here es_pblk is used to record both of them because physical block only has 48 bits. So extent status could be stashed into it so that we can save some memory. Now written, unwritten, delayed and hole are defined as status. Due to new member is added into extent status tree, all interfaces need to be adjusted. Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" Reviewed-by: Jan Kara --- fs/ext4/extents_status.c | 67 +++++++++++++++++++++++++++++++++++++-------- fs/ext4/extents_status.h | 64 ++++++++++++++++++++++++++++++++++++++++++- fs/ext4/inode.c | 3 +- include/trace/events/ext4.h | 34 +++++++++++++++-------- 4 files changed, 142 insertions(+), 26 deletions(-) (limited to 'include/trace') diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index c9921cf4f817..1f5fd44993e9 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -179,7 +179,9 @@ static void ext4_es_print_tree(struct inode *inode) while (node) { struct extent_status *es; es = rb_entry(node, struct extent_status, rb_node); - printk(KERN_DEBUG " [%u/%u)", es->es_lblk, es->es_len); + printk(KERN_DEBUG " [%u/%u) %llu %llx", + es->es_lblk, es->es_len, + ext4_es_pblock(es), ext4_es_status(es)); node = rb_next(node); } printk(KERN_DEBUG "\n"); @@ -234,7 +236,7 @@ static struct extent_status *__es_tree_search(struct rb_root *root, * @es: delayed extent that we found * * Returns the first block of the next extent after es, otherwise - * EXT_MAX_BLOCKS if no delay extent is found. + * EXT_MAX_BLOCKS if no extent is found. * Delayed extent is returned via @es. */ ext4_lblk_t ext4_es_find_extent(struct inode *inode, struct extent_status *es) @@ -249,17 +251,18 @@ ext4_lblk_t ext4_es_find_extent(struct inode *inode, struct extent_status *es) read_lock(&EXT4_I(inode)->i_es_lock); tree = &EXT4_I(inode)->i_es_tree; - /* find delay extent in cache firstly */ + /* find extent in cache firstly */ + es->es_len = es->es_pblk = 0; if (tree->cache_es) { es1 = tree->cache_es; if (in_range(es->es_lblk, es1->es_lblk, es1->es_len)) { - es_debug("%u cached by [%u/%u)\n", - es->es_lblk, es1->es_lblk, es1->es_len); + es_debug("%u cached by [%u/%u) %llu %llx\n", + es->es_lblk, es1->es_lblk, es1->es_len, + ext4_es_pblock(es1), ext4_es_status(es1)); goto out; } } - es->es_len = 0; es1 = __es_tree_search(&tree->root, es->es_lblk); out: @@ -267,6 +270,7 @@ out: tree->cache_es = es1; es->es_lblk = es1->es_lblk; es->es_len = es1->es_len; + es->es_pblk = es1->es_pblk; node = rb_next(&es1->rb_node); if (node) { es1 = rb_entry(node, struct extent_status, rb_node); @@ -281,7 +285,7 @@ out: } static struct extent_status * -ext4_es_alloc_extent(ext4_lblk_t lblk, ext4_lblk_t len) +ext4_es_alloc_extent(ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk) { struct extent_status *es; es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC); @@ -289,6 +293,7 @@ ext4_es_alloc_extent(ext4_lblk_t lblk, ext4_lblk_t len) return NULL; es->es_lblk = lblk; es->es_len = len; + es->es_pblk = pblk; return es; } @@ -301,6 +306,8 @@ static void ext4_es_free_extent(struct extent_status *es) * Check whether or not two extents can be merged * Condition: * - logical block number is contiguous + * - physical block number is contiguous + * - status is equal */ static int ext4_es_can_be_merged(struct extent_status *es1, struct extent_status *es2) @@ -308,6 +315,13 @@ static int ext4_es_can_be_merged(struct extent_status *es1, if (es1->es_lblk + es1->es_len != es2->es_lblk) return 0; + if (ext4_es_status(es1) != ext4_es_status(es2)) + return 0; + + if ((ext4_es_is_written(es1) || ext4_es_is_unwritten(es1)) && + (ext4_es_pblock(es1) + es1->es_len != ext4_es_pblock(es2))) + return 0; + return 1; } @@ -371,6 +385,10 @@ static int __es_insert_extent(struct ext4_es_tree *tree, */ es->es_lblk = newes->es_lblk; es->es_len += newes->es_len; + if (ext4_es_is_written(es) || + ext4_es_is_unwritten(es)) + ext4_es_store_pblock(es, + newes->es_pblk); es = ext4_es_try_to_merge_left(tree, es); goto out; } @@ -388,7 +406,8 @@ static int __es_insert_extent(struct ext4_es_tree *tree, } } - es = ext4_es_alloc_extent(newes->es_lblk, newes->es_len); + es = ext4_es_alloc_extent(newes->es_lblk, newes->es_len, + newes->es_pblk); if (!es) return -ENOMEM; rb_link_node(&es->rb_node, parent, p); @@ -408,21 +427,24 @@ out: * Return 0 on success, error code on failure. */ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, - ext4_lblk_t len) + ext4_lblk_t len, ext4_fsblk_t pblk, + unsigned long long status) { struct ext4_es_tree *tree; struct extent_status newes; ext4_lblk_t end = lblk + len - 1; int err = 0; - trace_ext4_es_insert_extent(inode, lblk, len); - es_debug("add [%u/%u) to extent status tree of inode %lu\n", - lblk, len, inode->i_ino); + es_debug("add [%u/%u) %llu %llx to extent status tree of inode %lu\n", + lblk, len, pblk, status, inode->i_ino); BUG_ON(end < lblk); newes.es_lblk = lblk; newes.es_len = len; + ext4_es_store_pblock(&newes, pblk); + ext4_es_store_status(&newes, status); + trace_ext4_es_insert_extent(inode, &newes); write_lock(&EXT4_I(inode)->i_es_lock); tree = &EXT4_I(inode)->i_es_tree; @@ -446,6 +468,7 @@ static int __es_remove_extent(struct ext4_es_tree *tree, ext4_lblk_t lblk, struct extent_status *es; struct extent_status orig_es; ext4_lblk_t len1, len2; + ext4_fsblk_t block; int err = 0; es = __es_tree_search(&tree->root, lblk); @@ -459,6 +482,8 @@ static int __es_remove_extent(struct ext4_es_tree *tree, ext4_lblk_t lblk, orig_es.es_lblk = es->es_lblk; orig_es.es_len = es->es_len; + orig_es.es_pblk = es->es_pblk; + len1 = lblk > es->es_lblk ? lblk - es->es_lblk : 0; len2 = ext4_es_end(es) > end ? ext4_es_end(es) - end : 0; if (len1 > 0) @@ -469,6 +494,13 @@ static int __es_remove_extent(struct ext4_es_tree *tree, ext4_lblk_t lblk, newes.es_lblk = end + 1; newes.es_len = len2; + if (ext4_es_is_written(&orig_es) || + ext4_es_is_unwritten(&orig_es)) { + block = ext4_es_pblock(&orig_es) + + orig_es.es_len - len2; + ext4_es_store_pblock(&newes, block); + } + ext4_es_store_status(&newes, ext4_es_status(&orig_es)); err = __es_insert_extent(tree, &newes); if (err) { es->es_lblk = orig_es.es_lblk; @@ -478,6 +510,11 @@ static int __es_remove_extent(struct ext4_es_tree *tree, ext4_lblk_t lblk, } else { es->es_lblk = end + 1; es->es_len = len2; + if (ext4_es_is_written(es) || + ext4_es_is_unwritten(es)) { + block = orig_es.es_pblk + orig_es.es_len - len2; + ext4_es_store_pblock(es, block); + } } goto out; } @@ -502,9 +539,15 @@ static int __es_remove_extent(struct ext4_es_tree *tree, ext4_lblk_t lblk, } if (es && es->es_lblk < end + 1) { + ext4_lblk_t orig_len = es->es_len; + len1 = ext4_es_end(es) - end; es->es_lblk = end + 1; es->es_len = len1; + if (ext4_es_is_written(es) || ext4_es_is_unwritten(es)) { + block = es->es_pblk + orig_len - len1; + ext4_es_store_pblock(es, block); + } } out: diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 81e9339f23f1..3cad83303adb 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -20,10 +20,21 @@ #define es_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif +#define EXTENT_STATUS_WRITTEN 0x80000000 /* written extent */ +#define EXTENT_STATUS_UNWRITTEN 0x40000000 /* unwritten extent */ +#define EXTENT_STATUS_DELAYED 0x20000000 /* delayed extent */ +#define EXTENT_STATUS_HOLE 0x10000000 /* hole */ + +#define EXTENT_STATUS_FLAGS (EXTENT_STATUS_WRITTEN | \ + EXTENT_STATUS_UNWRITTEN | \ + EXTENT_STATUS_DELAYED | \ + EXTENT_STATUS_HOLE) + struct extent_status { struct rb_node rb_node; ext4_lblk_t es_lblk; /* first logical block extent covers */ ext4_lblk_t es_len; /* length of extent in block */ + ext4_fsblk_t es_pblk; /* first physical block */ }; struct ext4_es_tree { @@ -36,10 +47,61 @@ extern void ext4_exit_es(void); extern void ext4_es_init_tree(struct ext4_es_tree *tree); extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, - ext4_lblk_t len); + ext4_lblk_t len, ext4_fsblk_t pblk, + unsigned long long status); extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len); extern ext4_lblk_t ext4_es_find_extent(struct inode *inode, struct extent_status *es); +static inline int ext4_es_is_written(struct extent_status *es) +{ + return (es->es_pblk & EXTENT_STATUS_WRITTEN); +} + +static inline int ext4_es_is_unwritten(struct extent_status *es) +{ + return (es->es_pblk & EXTENT_STATUS_UNWRITTEN); +} + +static inline int ext4_es_is_delayed(struct extent_status *es) +{ + return (es->es_pblk & EXTENT_STATUS_DELAYED); +} + +static inline int ext4_es_is_hole(struct extent_status *es) +{ + return (es->es_pblk & EXTENT_STATUS_HOLE); +} + +static inline ext4_fsblk_t ext4_es_status(struct extent_status *es) +{ + return (es->es_pblk & EXTENT_STATUS_FLAGS); +} + +static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es) +{ + return (es->es_pblk & ~EXTENT_STATUS_FLAGS); +} + +static inline void ext4_es_store_pblock(struct extent_status *es, + ext4_fsblk_t pb) +{ + ext4_fsblk_t block; + + block = (pb & ~EXTENT_STATUS_FLAGS) | + (es->es_pblk & EXTENT_STATUS_FLAGS); + es->es_pblk = block; +} + +static inline void ext4_es_store_status(struct extent_status *es, + unsigned long long status) +{ + ext4_fsblk_t block; + + block = (status & EXTENT_STATUS_FLAGS) | + (es->es_pblk & ~EXTENT_STATUS_FLAGS); + es->es_pblk = block; +} + #endif /* _EXT4_EXTENTS_STATUS_H */ diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f4466c3650dc..e0e1cb0863f4 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1784,7 +1784,8 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, goto out_unlock; } - retval = ext4_es_insert_extent(inode, map->m_lblk, map->m_len); + retval = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + ~0, EXTENT_STATUS_DELAYED); if (retval) goto out_unlock; diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 52c923851959..0ee507ff216d 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -2093,28 +2093,33 @@ TRACE_EVENT(ext4_ext_remove_space_done, ); TRACE_EVENT(ext4_es_insert_extent, - TP_PROTO(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len), + TP_PROTO(struct inode *inode, struct extent_status *es), - TP_ARGS(inode, lblk, len), + TP_ARGS(inode, es), TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) - __field( loff_t, lblk ) - __field( loff_t, len ) + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( ext4_lblk_t, lblk ) + __field( ext4_lblk_t, len ) + __field( ext4_fsblk_t, pblk ) + __field( unsigned long long, status ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->lblk = lblk; - __entry->len = len; + __entry->lblk = es->es_lblk; + __entry->len = es->es_len; + __entry->pblk = ext4_es_pblock(es); + __entry->status = ext4_es_status(es); ), - TP_printk("dev %d,%d ino %lu es [%lld/%lld)", + TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %llx", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, - __entry->lblk, __entry->len) + __entry->lblk, __entry->len, + __entry->pblk, __entry->status) ); TRACE_EVENT(ext4_es_remove_extent, @@ -2175,6 +2180,8 @@ TRACE_EVENT(ext4_es_find_extent_exit, __field( ino_t, ino ) __field( ext4_lblk_t, lblk ) __field( ext4_lblk_t, len ) + __field( ext4_fsblk_t, pblk ) + __field( unsigned long long, status ) __field( ext4_lblk_t, ret ) ), @@ -2183,13 +2190,16 @@ TRACE_EVENT(ext4_es_find_extent_exit, __entry->ino = inode->i_ino; __entry->lblk = es->es_lblk; __entry->len = es->es_len; + __entry->pblk = ext4_es_pblock(es); + __entry->status = ext4_es_status(es); __entry->ret = ret; ), - TP_printk("dev %d,%d ino %lu es [%u/%u) ret %u", + TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %llx ret %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, - __entry->lblk, __entry->len, __entry->ret) + __entry->lblk, __entry->len, + __entry->pblk, __entry->status, __entry->ret) ); #endif /* _TRACE_EXT4_H */ -- cgit v1.2.3 From be401363ac5ec652c706263a59b0bd0acc3612e8 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Mon, 18 Feb 2013 00:27:26 -0500 Subject: ext4: rename and improbe ext4_es_find_extent() This commit renames ext4_es_find_extent with ext4_es_find_delayed_extent and improve this function. First, we split input and output parameter. Second, this function never return the first block of the next delayed extent after 'es'. Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" Cc: Jan kara --- fs/ext4/extents.c | 15 ++++++++++----- fs/ext4/extents_status.c | 40 ++++++++++++++++++++-------------------- fs/ext4/extents_status.h | 4 ++-- fs/ext4/file.c | 6 ++---- include/trace/events/ext4.h | 15 ++++++--------- 5 files changed, 40 insertions(+), 40 deletions(-) (limited to 'include/trace') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 37f94a751ad7..895c19595ecf 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3528,8 +3528,7 @@ static int ext4_find_delalloc_range(struct inode *inode, { struct extent_status es; - es.es_lblk = lblk_start; - (void)ext4_es_find_extent(inode, &es); + ext4_es_find_delayed_extent(inode, lblk_start, &es); if (es.es_len == 0) return 0; /* there is no delay extent in this tree */ else if (es.es_lblk <= lblk_start && @@ -4568,10 +4567,9 @@ static int ext4_find_delayed_extent(struct inode *inode, struct ext4_ext_cache *newex) { struct extent_status es; - ext4_lblk_t next_del; + ext4_lblk_t block, next_del; - es.es_lblk = newex->ec_block; - next_del = ext4_es_find_extent(inode, &es); + ext4_es_find_delayed_extent(inode, newex->ec_block, &es); if (newex->ec_start == 0) { /* @@ -4592,6 +4590,13 @@ static int ext4_find_delayed_extent(struct inode *inode, newex->ec_len = es.es_lblk + es.es_len - newex->ec_block; } + block = newex->ec_block + newex->ec_len; + ext4_es_find_delayed_extent(inode, block, &es); + if (es.es_len == 0) + next_del = EXT_MAX_BLOCKS; + else + next_del = es.es_lblk; + return next_del; } /* fiemap flags we can handle specified here */ diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 1f5fd44993e9..76f4351ea821 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -229,59 +229,59 @@ static struct extent_status *__es_tree_search(struct rb_root *root, } /* - * ext4_es_find_extent: find the 1st delayed extent covering @es->lblk + * ext4_es_find_delayed_extent: find the 1st delayed extent covering @es->lblk * if it exists, otherwise, the next extent after @es->lblk. * * @inode: the inode which owns delayed extents + * @lblk: the offset where we start to search * @es: delayed extent that we found - * - * Returns the first block of the next extent after es, otherwise - * EXT_MAX_BLOCKS if no extent is found. - * Delayed extent is returned via @es. */ -ext4_lblk_t ext4_es_find_extent(struct inode *inode, struct extent_status *es) +void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk, + struct extent_status *es) { struct ext4_es_tree *tree = NULL; struct extent_status *es1 = NULL; struct rb_node *node; - ext4_lblk_t ret = EXT_MAX_BLOCKS; - trace_ext4_es_find_extent_enter(inode, es->es_lblk); + BUG_ON(es == NULL); + trace_ext4_es_find_delayed_extent_enter(inode, lblk); read_lock(&EXT4_I(inode)->i_es_lock); tree = &EXT4_I(inode)->i_es_tree; /* find extent in cache firstly */ - es->es_len = es->es_pblk = 0; + es->es_lblk = es->es_len = es->es_pblk = 0; if (tree->cache_es) { es1 = tree->cache_es; - if (in_range(es->es_lblk, es1->es_lblk, es1->es_len)) { + if (in_range(lblk, es1->es_lblk, es1->es_len)) { es_debug("%u cached by [%u/%u) %llu %llx\n", - es->es_lblk, es1->es_lblk, es1->es_len, + lblk, es1->es_lblk, es1->es_len, ext4_es_pblock(es1), ext4_es_status(es1)); goto out; } } - es1 = __es_tree_search(&tree->root, es->es_lblk); + es1 = __es_tree_search(&tree->root, lblk); out: - if (es1) { + if (es1 && !ext4_es_is_delayed(es1)) { + while ((node = rb_next(&es1->rb_node)) != NULL) { + es1 = rb_entry(node, struct extent_status, rb_node); + if (ext4_es_is_delayed(es1)) + break; + } + } + + if (es1 && ext4_es_is_delayed(es1)) { tree->cache_es = es1; es->es_lblk = es1->es_lblk; es->es_len = es1->es_len; es->es_pblk = es1->es_pblk; - node = rb_next(&es1->rb_node); - if (node) { - es1 = rb_entry(node, struct extent_status, rb_node); - ret = es1->es_lblk; - } } read_unlock(&EXT4_I(inode)->i_es_lock); - trace_ext4_es_find_extent_exit(inode, es, ret); - return ret; + trace_ext4_es_find_delayed_extent_exit(inode, es); } static struct extent_status * diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 3cad83303adb..3f69d097c6e7 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -51,8 +51,8 @@ extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, unsigned long long status); extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len); -extern ext4_lblk_t ext4_es_find_extent(struct inode *inode, - struct extent_status *es); +extern void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk, + struct extent_status *es); static inline int ext4_es_is_written(struct extent_status *es) { diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 2df9354b105e..7e85a10a6f4f 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -464,8 +464,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) * If there is a delay extent at this offset, * it will be as a data. */ - es.es_lblk = last; - (void)ext4_es_find_extent(inode, &es); + ext4_es_find_delayed_extent(inode, last, &es); if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { if (last != start) dataoff = last << blkbits; @@ -548,8 +547,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) * If there is a delay extent at this offset, * we will skip this extent. */ - es.es_lblk = last; - (void)ext4_es_find_extent(inode, &es); + ext4_es_find_delayed_extent(inode, last, &es); if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { last = es.es_lblk + es.es_len; holeoff = last << blkbits; diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 0ee507ff216d..c121cdf55ab3 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -2147,7 +2147,7 @@ TRACE_EVENT(ext4_es_remove_extent, __entry->lblk, __entry->len) ); -TRACE_EVENT(ext4_es_find_extent_enter, +TRACE_EVENT(ext4_es_find_delayed_extent_enter, TP_PROTO(struct inode *inode, ext4_lblk_t lblk), TP_ARGS(inode, lblk), @@ -2169,11 +2169,10 @@ TRACE_EVENT(ext4_es_find_extent_enter, (unsigned long) __entry->ino, __entry->lblk) ); -TRACE_EVENT(ext4_es_find_extent_exit, - TP_PROTO(struct inode *inode, struct extent_status *es, - ext4_lblk_t ret), +TRACE_EVENT(ext4_es_find_delayed_extent_exit, + TP_PROTO(struct inode *inode, struct extent_status *es), - TP_ARGS(inode, es, ret), + TP_ARGS(inode, es), TP_STRUCT__entry( __field( dev_t, dev ) @@ -2182,7 +2181,6 @@ TRACE_EVENT(ext4_es_find_extent_exit, __field( ext4_lblk_t, len ) __field( ext4_fsblk_t, pblk ) __field( unsigned long long, status ) - __field( ext4_lblk_t, ret ) ), TP_fast_assign( @@ -2192,14 +2190,13 @@ TRACE_EVENT(ext4_es_find_extent_exit, __entry->len = es->es_len; __entry->pblk = ext4_es_pblock(es); __entry->status = ext4_es_status(es); - __entry->ret = ret; ), - TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %llx ret %u", + TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %llx", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->lblk, __entry->len, - __entry->pblk, __entry->status, __entry->ret) + __entry->pblk, __entry->status) ); #endif /* _TRACE_EXT4_H */ -- cgit v1.2.3 From d100eef2440fea13e4f09e88b1c8bcbca64beb9f Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Mon, 18 Feb 2013 00:29:59 -0500 Subject: ext4: lookup block mapping in extent status tree After tracking all extent status, we already have a extent cache in memory. Every time we want to lookup a block mapping, we can first try to lookup it in extent status tree to avoid a potential disk I/O. A new function called ext4_es_lookup_extent is defined to finish this work. When we try to lookup a block mapping, we always call ext4_map_blocks and/or ext4_da_map_blocks. So in these functions we first try to lookup a block mapping in extent status tree. A new flag EXT4_GET_BLOCKS_NO_PUT_HOLE is used in ext4_da_map_blocks in order not to put a hole into extent status tree because this hole will be converted to delayed extent in the tree immediately. Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" Cc: Jan kara --- fs/ext4/ext4.h | 2 ++ fs/ext4/extents.c | 9 ++++++- fs/ext4/extents_status.c | 60 +++++++++++++++++++++++++++++++++++++++++ fs/ext4/extents_status.h | 2 ++ fs/ext4/inode.c | 66 +++++++++++++++++++++++++++++++++++++++++++-- include/trace/events/ext4.h | 56 ++++++++++++++++++++++++++++++++++++++ 6 files changed, 192 insertions(+), 3 deletions(-) (limited to 'include/trace') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 5c31d6ac9500..329e7fba47d6 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -579,6 +579,8 @@ enum { #define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080 /* Do not take i_data_sem locking in ext4_map_blocks */ #define EXT4_GET_BLOCKS_NO_LOCK 0x0100 + /* Do not put hole in extent cache */ +#define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200 /* * Flags used by ext4_free_blocks diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index be0b1b3eed97..b9d7a2363736 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2167,6 +2167,9 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, block, le32_to_cpu(ex->ee_block), ext4_ext_get_actual_len(ex)); + if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1)) + ext4_es_insert_extent(inode, lblock, len, ~0, + EXTENT_STATUS_HOLE); } else if (block >= le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex)) { ext4_lblk_t next; @@ -2180,6 +2183,9 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, block); BUG_ON(next == lblock); len = next - lblock; + if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1)) + ext4_es_insert_extent(inode, lblock, len, ~0, + EXTENT_STATUS_HOLE); } else { lblock = len = 0; BUG(); @@ -4018,7 +4024,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, * put just found gap into cache to speed up * subsequent requests */ - ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); + if ((flags & EXT4_GET_BLOCKS_NO_PUT_HOLE) == 0) + ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); goto out2; } diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 76f4351ea821..eeb893122d8d 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -461,6 +461,66 @@ error: return err; } +/* + * ext4_es_lookup_extent() looks up an extent in extent status tree. + * + * ext4_es_lookup_extent is called by ext4_map_blocks/ext4_da_map_blocks. + * + * Return: 1 on found, 0 on not + */ +int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, + struct extent_status *es) +{ + struct ext4_es_tree *tree; + struct extent_status *es1 = NULL; + struct rb_node *node; + int found = 0; + + trace_ext4_es_lookup_extent_enter(inode, lblk); + es_debug("lookup extent in block %u\n", lblk); + + tree = &EXT4_I(inode)->i_es_tree; + read_lock(&EXT4_I(inode)->i_es_lock); + + /* find extent in cache firstly */ + es->es_lblk = es->es_len = es->es_pblk = 0; + if (tree->cache_es) { + es1 = tree->cache_es; + if (in_range(lblk, es1->es_lblk, es1->es_len)) { + es_debug("%u cached by [%u/%u)\n", + lblk, es1->es_lblk, es1->es_len); + found = 1; + goto out; + } + } + + node = tree->root.rb_node; + while (node) { + es1 = rb_entry(node, struct extent_status, rb_node); + if (lblk < es1->es_lblk) + node = node->rb_left; + else if (lblk > ext4_es_end(es1)) + node = node->rb_right; + else { + found = 1; + break; + } + } + +out: + if (found) { + BUG_ON(!es1); + es->es_lblk = es1->es_lblk; + es->es_len = es1->es_len; + es->es_pblk = es1->es_pblk; + } + + read_unlock(&EXT4_I(inode)->i_es_lock); + + trace_ext4_es_lookup_extent_exit(inode, es, found); + return found; +} + static int __es_remove_extent(struct ext4_es_tree *tree, ext4_lblk_t lblk, ext4_lblk_t end) { diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 3f69d097c6e7..8ffc90c784fa 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -53,6 +53,8 @@ extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len); extern void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk, struct extent_status *es); +extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, + struct extent_status *es); static inline int ext4_es_is_written(struct extent_status *es) { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 576b586b61aa..95a0c62c5683 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -507,12 +507,33 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx, int ext4_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags) { + struct extent_status es; int retval; map->m_flags = 0; ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u," "logical block %lu\n", inode->i_ino, flags, map->m_len, (unsigned long) map->m_lblk); + + /* Lookup extent status tree firstly */ + if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { + if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { + map->m_pblk = ext4_es_pblock(&es) + + map->m_lblk - es.es_lblk; + map->m_flags |= ext4_es_is_written(&es) ? + EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN; + retval = es.es_len - (map->m_lblk - es.es_lblk); + if (retval > map->m_len) + retval = map->m_len; + map->m_len = retval; + } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) { + retval = 0; + } else { + BUG_ON(1); + } + goto found; + } + /* * Try to see if we can get the block without requesting a new * file system block. @@ -544,6 +565,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) up_read((&EXT4_I(inode)->i_data_sem)); +found: if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { int ret = check_block_validity(inode, map); if (ret != 0) @@ -1743,6 +1765,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, struct ext4_map_blocks *map, struct buffer_head *bh) { + struct extent_status es; int retval; sector_t invalid_block = ~((sector_t) 0xffff); @@ -1753,6 +1776,42 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u," "logical block %lu\n", inode->i_ino, map->m_len, (unsigned long) map->m_lblk); + + /* Lookup extent status tree firstly */ + if (ext4_es_lookup_extent(inode, iblock, &es)) { + + if (ext4_es_is_hole(&es)) { + retval = 0; + down_read((&EXT4_I(inode)->i_data_sem)); + goto add_delayed; + } + + /* + * Delayed extent could be allocated by fallocate. + * So we need to check it. + */ + if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) { + map_bh(bh, inode->i_sb, invalid_block); + set_buffer_new(bh); + set_buffer_delay(bh); + return 0; + } + + map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk; + retval = es.es_len - (iblock - es.es_lblk); + if (retval > map->m_len) + retval = map->m_len; + map->m_len = retval; + if (ext4_es_is_written(&es)) + map->m_flags |= EXT4_MAP_MAPPED; + else if (ext4_es_is_unwritten(&es)) + map->m_flags |= EXT4_MAP_UNWRITTEN; + else + BUG_ON(1); + + return retval; + } + /* * Try to see if we can get the block without requesting a new * file system block. @@ -1771,10 +1830,13 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, map->m_flags |= EXT4_MAP_FROM_CLUSTER; retval = 0; } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - retval = ext4_ext_map_blocks(NULL, inode, map, 0); + retval = ext4_ext_map_blocks(NULL, inode, map, + EXT4_GET_BLOCKS_NO_PUT_HOLE); else - retval = ext4_ind_map_blocks(NULL, inode, map, 0); + retval = ext4_ind_map_blocks(NULL, inode, map, + EXT4_GET_BLOCKS_NO_PUT_HOLE); +add_delayed: if (retval == 0) { int ret; /* diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index c121cdf55ab3..1e590b68cec4 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -2199,6 +2199,62 @@ TRACE_EVENT(ext4_es_find_delayed_extent_exit, __entry->pblk, __entry->status) ); +TRACE_EVENT(ext4_es_lookup_extent_enter, + TP_PROTO(struct inode *inode, ext4_lblk_t lblk), + + TP_ARGS(inode, lblk), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( ext4_lblk_t, lblk ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->lblk = lblk; + ), + + TP_printk("dev %d,%d ino %lu lblk %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, __entry->lblk) +); + +TRACE_EVENT(ext4_es_lookup_extent_exit, + TP_PROTO(struct inode *inode, struct extent_status *es, + int found), + + TP_ARGS(inode, es, found), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( ino_t, ino ) + __field( ext4_lblk_t, lblk ) + __field( ext4_lblk_t, len ) + __field( ext4_fsblk_t, pblk ) + __field( unsigned long long, status ) + __field( int, found ) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->lblk = es->es_lblk; + __entry->len = es->es_len; + __entry->pblk = ext4_es_pblock(es); + __entry->status = ext4_es_status(es); + __entry->found = found; + ), + + TP_printk("dev %d,%d ino %lu found %d [%u/%u) %llu %llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long) __entry->ino, __entry->found, + __entry->lblk, __entry->len, + __entry->found ? __entry->pblk : 0, + __entry->found ? __entry->status : 0) +); + #endif /* _TRACE_EXT4_H */ /* This part must be outside protection */ -- cgit v1.2.3 From 74cd15cd02708c7188581f279f33a98b2ae8d322 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Mon, 18 Feb 2013 00:32:55 -0500 Subject: ext4: reclaim extents from extent status tree Although extent status is loaded on-demand, we also need to reclaim extent from the tree when we are under a heavy memory pressure because in some cases fragmented extent tree causes status tree costs too much memory. Here we maintain a lru list in super_block. When the extent status of an inode is accessed and changed, this inode will be move to the tail of the list. The inode will be dropped from this list when it is cleared. In the inode, a counter is added to count the number of cached objects in extent status tree. Here only written/unwritten/hole extent is counted because delayed extent doesn't be reclaimed due to fiemap, bigalloc and seek_data/hole need it. The counter will be increased as a new extent is allocated, and it will be decreased as a extent is freed. In this commit we use normal shrinker framework to reclaim memory from the status tree. ext4_es_reclaim_extents_count() traverses the lru list to count the number of reclaimable extents. ext4_es_shrink() tries to reclaim written/unwritten/hole extents from extent status tree. The inode that has been shrunk is moved to the tail of lru list. Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" Cc: Jan kara --- fs/ext4/ext4.h | 7 ++ fs/ext4/extents_status.c | 156 ++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/extents_status.h | 5 ++ fs/ext4/super.c | 7 ++ include/trace/events/ext4.h | 60 +++++++++++++++++ 5 files changed, 235 insertions(+) (limited to 'include/trace') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 0c565c941f7a..6e16c1867959 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -888,6 +888,8 @@ struct ext4_inode_info { /* extents status tree */ struct ext4_es_tree i_es_tree; rwlock_t i_es_lock; + struct list_head i_es_lru; + unsigned int i_es_lru_nr; /* protected by i_es_lock */ /* ialloc */ ext4_group_t i_last_alloc_group; @@ -1303,6 +1305,11 @@ struct ext4_sb_info { /* Precomputed FS UUID checksum for seeding other checksums */ __u32 s_csum_seed; + + /* Reclaim extents from extent status tree */ + struct shrinker s_es_shrinker; + struct list_head s_es_lru; + spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp; }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index cce152c3c8dc..9f1380e05474 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -145,6 +145,9 @@ static struct kmem_cache *ext4_es_cachep; static int __es_insert_extent(struct inode *inode, struct extent_status *newes); static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t end); +static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, + int nr_to_scan); +static int ext4_es_reclaim_extents_count(struct super_block *sb); int __init ext4_init_es(void) { @@ -280,6 +283,7 @@ out: read_unlock(&EXT4_I(inode)->i_es_lock); + ext4_es_lru_add(inode); trace_ext4_es_find_delayed_extent_exit(inode, es); } @@ -294,11 +298,24 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, es->es_lblk = lblk; es->es_len = len; es->es_pblk = pblk; + + /* + * We don't count delayed extent because we never try to reclaim them + */ + if (!ext4_es_is_delayed(es)) + EXT4_I(inode)->i_es_lru_nr++; + return es; } static void ext4_es_free_extent(struct inode *inode, struct extent_status *es) { + /* Decrease the lru counter when this es is not delayed */ + if (!ext4_es_is_delayed(es)) { + BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0); + EXT4_I(inode)->i_es_lru_nr--; + } + kmem_cache_free(ext4_es_cachep, es); } @@ -456,6 +473,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, error: write_unlock(&EXT4_I(inode)->i_es_lock); + ext4_es_lru_add(inode); ext4_es_print_tree(inode); return err; @@ -517,6 +535,7 @@ out: read_unlock(&EXT4_I(inode)->i_es_lock); + ext4_es_lru_add(inode); trace_ext4_es_lookup_extent_exit(inode, es, found); return found; } @@ -639,3 +658,140 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_es_print_tree(inode); return err; } + +static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) +{ + struct ext4_sb_info *sbi = container_of(shrink, + struct ext4_sb_info, s_es_shrinker); + struct ext4_inode_info *ei; + struct list_head *cur, *tmp, scanned; + int nr_to_scan = sc->nr_to_scan; + int ret, nr_shrunk = 0; + + trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan); + + if (!nr_to_scan) + return ext4_es_reclaim_extents_count(sbi->s_sb); + + INIT_LIST_HEAD(&scanned); + + spin_lock(&sbi->s_es_lru_lock); + list_for_each_safe(cur, tmp, &sbi->s_es_lru) { + list_move_tail(cur, &scanned); + + ei = list_entry(cur, struct ext4_inode_info, i_es_lru); + + read_lock(&ei->i_es_lock); + if (ei->i_es_lru_nr == 0) { + read_unlock(&ei->i_es_lock); + continue; + } + read_unlock(&ei->i_es_lock); + + write_lock(&ei->i_es_lock); + ret = __es_try_to_reclaim_extents(ei, nr_to_scan); + write_unlock(&ei->i_es_lock); + + nr_shrunk += ret; + nr_to_scan -= ret; + if (nr_to_scan == 0) + break; + } + list_splice_tail(&scanned, &sbi->s_es_lru); + spin_unlock(&sbi->s_es_lru_lock); + trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk); + + return ext4_es_reclaim_extents_count(sbi->s_sb); +} + +void ext4_es_register_shrinker(struct super_block *sb) +{ + struct ext4_sb_info *sbi; + + sbi = EXT4_SB(sb); + INIT_LIST_HEAD(&sbi->s_es_lru); + spin_lock_init(&sbi->s_es_lru_lock); + sbi->s_es_shrinker.shrink = ext4_es_shrink; + sbi->s_es_shrinker.seeks = DEFAULT_SEEKS; + register_shrinker(&sbi->s_es_shrinker); +} + +void ext4_es_unregister_shrinker(struct super_block *sb) +{ + unregister_shrinker(&EXT4_SB(sb)->s_es_shrinker); +} + +void ext4_es_lru_add(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + + spin_lock(&sbi->s_es_lru_lock); + if (list_empty(&ei->i_es_lru)) + list_add_tail(&ei->i_es_lru, &sbi->s_es_lru); + else + list_move_tail(&ei->i_es_lru, &sbi->s_es_lru); + spin_unlock(&sbi->s_es_lru_lock); +} + +void ext4_es_lru_del(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + + spin_lock(&sbi->s_es_lru_lock); + if (!list_empty(&ei->i_es_lru)) + list_del_init(&ei->i_es_lru); + spin_unlock(&sbi->s_es_lru_lock); +} + +static int ext4_es_reclaim_extents_count(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_inode_info *ei; + struct list_head *cur; + int nr_cached = 0; + + spin_lock(&sbi->s_es_lru_lock); + list_for_each(cur, &sbi->s_es_lru) { + ei = list_entry(cur, struct ext4_inode_info, i_es_lru); + read_lock(&ei->i_es_lock); + nr_cached += ei->i_es_lru_nr; + read_unlock(&ei->i_es_lock); + } + spin_unlock(&sbi->s_es_lru_lock); + trace_ext4_es_reclaim_extents_count(sb, nr_cached); + return nr_cached; +} + +static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, + int nr_to_scan) +{ + struct inode *inode = &ei->vfs_inode; + struct ext4_es_tree *tree = &ei->i_es_tree; + struct rb_node *node; + struct extent_status *es; + int nr_shrunk = 0; + + if (ei->i_es_lru_nr == 0) + return 0; + + node = rb_first(&tree->root); + while (node != NULL) { + es = rb_entry(node, struct extent_status, rb_node); + node = rb_next(&es->rb_node); + /* + * We can't reclaim delayed extent from status tree because + * fiemap, bigallic, and seek_data/hole need to use it. + */ + if (!ext4_es_is_delayed(es)) { + rb_erase(&es->rb_node, &tree->root); + ext4_es_free_extent(inode, es); + nr_shrunk++; + if (--nr_to_scan == 0) + break; + } + } + tree->cache_es = NULL; + return nr_shrunk; +} diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index 8ffc90c784fa..cf83e77b16cb 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -106,4 +106,9 @@ static inline void ext4_es_store_status(struct extent_status *es, es->es_pblk = block; } +extern void ext4_es_register_shrinker(struct super_block *sb); +extern void ext4_es_unregister_shrinker(struct super_block *sb); +extern void ext4_es_lru_add(struct inode *inode); +extern void ext4_es_lru_del(struct inode *inode); + #endif /* _EXT4_EXTENTS_STATUS_H */ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index d80bfe5ac11c..373d46cd5d3f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -755,6 +755,7 @@ static void ext4_put_super(struct super_block *sb) ext4_abort(sb, "Couldn't clean up the journal"); } + ext4_es_unregister_shrinker(sb); del_timer(&sbi->s_err_report); ext4_release_system_zone(sb); ext4_mb_release(sb); @@ -840,6 +841,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) spin_lock_init(&ei->i_prealloc_lock); ext4_es_init_tree(&ei->i_es_tree); rwlock_init(&ei->i_es_lock); + INIT_LIST_HEAD(&ei->i_es_lru); + ei->i_es_lru_nr = 0; ei->i_reserved_data_blocks = 0; ei->i_reserved_meta_blocks = 0; ei->i_allocated_meta_blocks = 0; @@ -928,6 +931,7 @@ void ext4_clear_inode(struct inode *inode) dquot_drop(inode); ext4_discard_preallocations(inode); ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); + ext4_es_lru_del(inode); if (EXT4_I(inode)->jinode) { jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), EXT4_I(inode)->jinode); @@ -3693,6 +3697,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_max_writeback_mb_bump = 128; sbi->s_extent_max_zeroout_kb = 32; + /* Register extent status tree shrinker */ + ext4_es_register_shrinker(sb); + /* * set up enough so that it can read an inode */ diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 1e590b68cec4..c0457c0d1a68 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -2255,6 +2255,66 @@ TRACE_EVENT(ext4_es_lookup_extent_exit, __entry->found ? __entry->status : 0) ); +TRACE_EVENT(ext4_es_reclaim_extents_count, + TP_PROTO(struct super_block *sb, int nr_cached), + + TP_ARGS(sb, nr_cached), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( int, nr_cached ) + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->nr_cached = nr_cached; + ), + + TP_printk("dev %d,%d cached objects nr %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->nr_cached) +); + +TRACE_EVENT(ext4_es_shrink_enter, + TP_PROTO(struct super_block *sb, int nr_to_scan), + + TP_ARGS(sb, nr_to_scan), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( int, nr_to_scan ) + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->nr_to_scan = nr_to_scan; + ), + + TP_printk("dev %d,%d nr to scan %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->nr_to_scan) +); + +TRACE_EVENT(ext4_es_shrink_exit, + TP_PROTO(struct super_block *sb, int shrunk_nr), + + TP_ARGS(sb, shrunk_nr), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( int, shrunk_nr ) + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->shrunk_nr = shrunk_nr; + ), + + TP_printk("dev %d,%d nr to scan %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->shrunk_nr) +); + #endif /* _TRACE_EXT4_H */ /* This part must be outside protection */ -- cgit v1.2.3