diff options
Diffstat (limited to 'fs/logfs/gc.c')
-rw-r--r-- | fs/logfs/gc.c | 732 |
1 files changed, 0 insertions, 732 deletions
diff --git a/fs/logfs/gc.c b/fs/logfs/gc.c deleted file mode 100644 index d4efb061bdc5..000000000000 --- a/fs/logfs/gc.c +++ /dev/null @@ -1,732 +0,0 @@ -/* - * fs/logfs/gc.c - garbage collection code - * - * As should be obvious for Linux kernel code, license is GPLv2 - * - * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> - */ -#include "logfs.h" -#include <linux/sched.h> -#include <linux/slab.h> - -/* - * Wear leveling needs to kick in when the difference between low erase - * counts and high erase counts gets too big. A good value for "too big" - * may be somewhat below 10% of maximum erase count for the device. - * Why not 397, to pick a nice round number with no specific meaning? :) - * - * WL_RATELIMIT is the minimum time between two wear level events. A huge - * number of segments may fulfil the requirements for wear leveling at the - * same time. If that happens we don't want to cause a latency from hell, - * but just gently pick one segment every so often and minimize overhead. - */ -#define WL_DELTA 397 -#define WL_RATELIMIT 100 -#define MAX_OBJ_ALIASES 2600 -#define SCAN_RATIO 512 /* number of scanned segments per gc'd segment */ -#define LIST_SIZE 64 /* base size of candidate lists */ -#define SCAN_ROUNDS 128 /* maximum number of complete medium scans */ -#define SCAN_ROUNDS_HIGH 4 /* maximum number of higher-level scans */ - -static int no_free_segments(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - - return super->s_free_list.count; -} - -/* journal has distance -1, top-most ifile layer distance 0 */ -static u8 root_distance(struct super_block *sb, gc_level_t __gc_level) -{ - struct logfs_super *super = logfs_super(sb); - u8 gc_level = (__force u8)__gc_level; - - switch (gc_level) { - case 0: /* fall through */ - case 1: /* fall through */ - case 2: /* fall through */ - case 3: - /* file data or indirect blocks */ - return super->s_ifile_levels + super->s_iblock_levels - gc_level; - case 6: /* fall through */ - case 7: /* fall through */ - case 8: /* fall through */ - case 9: - /* inode file data or indirect blocks */ - return super->s_ifile_levels - (gc_level - 6); - default: - printk(KERN_ERR"LOGFS: segment of unknown level %x found\n", - gc_level); - WARN_ON(1); - return super->s_ifile_levels + super->s_iblock_levels; - } -} - -static int segment_is_reserved(struct super_block *sb, u32 segno) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_area *area; - void *reserved; - int i; - - /* Some segments are reserved. Just pretend they were all valid */ - reserved = btree_lookup32(&super->s_reserved_segments, segno); - if (reserved) - return 1; - - /* Currently open segments */ - for_each_area(i) { - area = super->s_area[i]; - if (area->a_is_open && area->a_segno == segno) - return 1; - } - - return 0; -} - -static void logfs_mark_segment_bad(struct super_block *sb, u32 segno) -{ - BUG(); -} - -/* - * Returns the bytes consumed by valid objects in this segment. Object headers - * are counted, the segment header is not. - */ -static u32 logfs_valid_bytes(struct super_block *sb, u32 segno, u32 *ec, - gc_level_t *gc_level) -{ - struct logfs_segment_entry se; - u32 ec_level; - - logfs_get_segment_entry(sb, segno, &se); - if (se.ec_level == cpu_to_be32(BADSEG) || - se.valid == cpu_to_be32(RESERVED)) - return RESERVED; - - ec_level = be32_to_cpu(se.ec_level); - *ec = ec_level >> 4; - *gc_level = GC_LEVEL(ec_level & 0xf); - return be32_to_cpu(se.valid); -} - -static void logfs_cleanse_block(struct super_block *sb, u64 ofs, u64 ino, - u64 bix, gc_level_t gc_level) -{ - struct inode *inode; - int err, cookie; - - inode = logfs_safe_iget(sb, ino, &cookie); - err = logfs_rewrite_block(inode, bix, ofs, gc_level, 0); - BUG_ON(err); - logfs_safe_iput(inode, cookie); -} - -static u32 logfs_gc_segment(struct super_block *sb, u32 segno) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_segment_header sh; - struct logfs_object_header oh; - u64 ofs, ino, bix; - u32 seg_ofs, logical_segno, cleaned = 0; - int err, len, valid; - gc_level_t gc_level; - - LOGFS_BUG_ON(segment_is_reserved(sb, segno), sb); - - btree_insert32(&super->s_reserved_segments, segno, (void *)1, GFP_NOFS); - err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh); - BUG_ON(err); - gc_level = GC_LEVEL(sh.level); - logical_segno = be32_to_cpu(sh.segno); - if (sh.crc != logfs_crc32(&sh, sizeof(sh), 4)) { - logfs_mark_segment_bad(sb, segno); - cleaned = -1; - goto out; - } - - for (seg_ofs = LOGFS_SEGMENT_HEADERSIZE; - seg_ofs + sizeof(oh) < super->s_segsize; ) { - ofs = dev_ofs(sb, logical_segno, seg_ofs); - err = wbuf_read(sb, dev_ofs(sb, segno, seg_ofs), sizeof(oh), - &oh); - BUG_ON(err); - - if (!memchr_inv(&oh, 0xff, sizeof(oh))) - break; - - if (oh.crc != logfs_crc32(&oh, sizeof(oh) - 4, 4)) { - logfs_mark_segment_bad(sb, segno); - cleaned = super->s_segsize - 1; - goto out; - } - - ino = be64_to_cpu(oh.ino); - bix = be64_to_cpu(oh.bix); - len = sizeof(oh) + be16_to_cpu(oh.len); - valid = logfs_is_valid_block(sb, ofs, ino, bix, gc_level); - if (valid == 1) { - logfs_cleanse_block(sb, ofs, ino, bix, gc_level); - cleaned += len; - } else if (valid == 2) { - /* Will be invalid upon journal commit */ - cleaned += len; - } - seg_ofs += len; - } -out: - btree_remove32(&super->s_reserved_segments, segno); - return cleaned; -} - -static struct gc_candidate *add_list(struct gc_candidate *cand, - struct candidate_list *list) -{ - struct rb_node **p = &list->rb_tree.rb_node; - struct rb_node *parent = NULL; - struct gc_candidate *cur; - int comp; - - cand->list = list; - while (*p) { - parent = *p; - cur = rb_entry(parent, struct gc_candidate, rb_node); - - if (list->sort_by_ec) - comp = cand->erase_count < cur->erase_count; - else - comp = cand->valid < cur->valid; - - if (comp) - p = &parent->rb_left; - else - p = &parent->rb_right; - } - rb_link_node(&cand->rb_node, parent, p); - rb_insert_color(&cand->rb_node, &list->rb_tree); - - if (list->count <= list->maxcount) { - list->count++; - return NULL; - } - cand = rb_entry(rb_last(&list->rb_tree), struct gc_candidate, rb_node); - rb_erase(&cand->rb_node, &list->rb_tree); - cand->list = NULL; - return cand; -} - -static void remove_from_list(struct gc_candidate *cand) -{ - struct candidate_list *list = cand->list; - - rb_erase(&cand->rb_node, &list->rb_tree); - list->count--; -} - -static void free_candidate(struct super_block *sb, struct gc_candidate *cand) -{ - struct logfs_super *super = logfs_super(sb); - - btree_remove32(&super->s_cand_tree, cand->segno); - kfree(cand); -} - -u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec) -{ - struct gc_candidate *cand; - u32 segno; - - BUG_ON(list->count == 0); - - cand = rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node); - remove_from_list(cand); - segno = cand->segno; - if (ec) - *ec = cand->erase_count; - free_candidate(sb, cand); - return segno; -} - -/* - * We have several lists to manage segments with. The reserve_list is used to - * deal with bad blocks. We try to keep the best (lowest ec) segments on this - * list. - * The free_list contains free segments for normal usage. It usually gets the - * second pick after the reserve_list. But when the free_list is running short - * it is more important to keep the free_list full than to keep a reserve. - * - * Segments that are not free are put onto a per-level low_list. If we have - * to run garbage collection, we pick a candidate from there. All segments on - * those lists should have at least some free space so GC will make progress. - * - * And last we have the ec_list, which is used to pick segments for wear - * leveling. - * - * If all appropriate lists are full, we simply free the candidate and forget - * about that segment for a while. We have better candidates for each purpose. - */ -static void __add_candidate(struct super_block *sb, struct gc_candidate *cand) -{ - struct logfs_super *super = logfs_super(sb); - u32 full = super->s_segsize - LOGFS_SEGMENT_RESERVE; - - if (cand->valid == 0) { - /* 100% free segments */ - log_gc_noisy("add reserve segment %x (ec %x) at %llx\n", - cand->segno, cand->erase_count, - dev_ofs(sb, cand->segno, 0)); - cand = add_list(cand, &super->s_reserve_list); - if (cand) { - log_gc_noisy("add free segment %x (ec %x) at %llx\n", - cand->segno, cand->erase_count, - dev_ofs(sb, cand->segno, 0)); - cand = add_list(cand, &super->s_free_list); - } - } else { - /* good candidates for Garbage Collection */ - if (cand->valid < full) - cand = add_list(cand, &super->s_low_list[cand->dist]); - /* good candidates for wear leveling, - * segments that were recently written get ignored */ - if (cand) - cand = add_list(cand, &super->s_ec_list); - } - if (cand) - free_candidate(sb, cand); -} - -static int add_candidate(struct super_block *sb, u32 segno, u32 valid, u32 ec, - u8 dist) -{ - struct logfs_super *super = logfs_super(sb); - struct gc_candidate *cand; - - cand = kmalloc(sizeof(*cand), GFP_NOFS); - if (!cand) - return -ENOMEM; - - cand->segno = segno; - cand->valid = valid; - cand->erase_count = ec; - cand->dist = dist; - - btree_insert32(&super->s_cand_tree, segno, cand, GFP_NOFS); - __add_candidate(sb, cand); - return 0; -} - -static void remove_segment_from_lists(struct super_block *sb, u32 segno) -{ - struct logfs_super *super = logfs_super(sb); - struct gc_candidate *cand; - - cand = btree_lookup32(&super->s_cand_tree, segno); - if (cand) { - remove_from_list(cand); - free_candidate(sb, cand); - } -} - -static void scan_segment(struct super_block *sb, u32 segno) -{ - u32 valid, ec = 0; - gc_level_t gc_level = 0; - u8 dist; - - if (segment_is_reserved(sb, segno)) - return; - - remove_segment_from_lists(sb, segno); - valid = logfs_valid_bytes(sb, segno, &ec, &gc_level); - if (valid == RESERVED) - return; - - dist = root_distance(sb, gc_level); - add_candidate(sb, segno, valid, ec, dist); -} - -static struct gc_candidate *first_in_list(struct candidate_list *list) -{ - if (list->count == 0) - return NULL; - return rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node); -} - -/* - * Find the best segment for garbage collection. Main criterion is - * the segment requiring the least effort to clean. Secondary - * criterion is to GC on the lowest level available. - * - * So we search the least effort segment on the lowest level first, - * then move up and pick another segment iff is requires significantly - * less effort. Hence the LOGFS_MAX_OBJECTSIZE in the comparison. - */ -static struct gc_candidate *get_candidate(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - int i, max_dist; - struct gc_candidate *cand = NULL, *this; - - max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS - 1); - - for (i = max_dist; i >= 0; i--) { - this = first_in_list(&super->s_low_list[i]); - if (!this) - continue; - if (!cand) - cand = this; - if (this->valid + LOGFS_MAX_OBJECTSIZE <= cand->valid) - cand = this; - } - return cand; -} - -static int __logfs_gc_once(struct super_block *sb, struct gc_candidate *cand) -{ - struct logfs_super *super = logfs_super(sb); - gc_level_t gc_level; - u32 cleaned, valid, segno, ec; - u8 dist; - - if (!cand) { - log_gc("GC attempted, but no candidate found\n"); - return 0; - } - - segno = cand->segno; - dist = cand->dist; - valid = logfs_valid_bytes(sb, segno, &ec, &gc_level); - free_candidate(sb, cand); - log_gc("GC segment #%02x at %llx, %x required, %x free, %x valid, %llx free\n", - segno, (u64)segno << super->s_segshift, - dist, no_free_segments(sb), valid, - super->s_free_bytes); - cleaned = logfs_gc_segment(sb, segno); - log_gc("GC segment #%02x complete - now %x valid\n", segno, - valid - cleaned); - BUG_ON(cleaned != valid); - return 1; -} - -static int logfs_gc_once(struct super_block *sb) -{ - struct gc_candidate *cand; - - cand = get_candidate(sb); - if (cand) - remove_from_list(cand); - return __logfs_gc_once(sb, cand); -} - -/* returns 1 if a wrap occurs, 0 otherwise */ -static int logfs_scan_some(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - u32 segno; - int i, ret = 0; - - segno = super->s_sweeper; - for (i = SCAN_RATIO; i > 0; i--) { - segno++; - if (segno >= super->s_no_segs) { - segno = 0; - ret = 1; - /* Break out of the loop. We want to read a single - * block from the segment size on next invocation if - * SCAN_RATIO is set to match block size - */ - break; - } - - scan_segment(sb, segno); - } - super->s_sweeper = segno; - return ret; -} - -/* - * In principle, this function should loop forever, looking for GC candidates - * and moving data. LogFS is designed in such a way that this loop is - * guaranteed to terminate. - * - * Limiting the loop to some iterations serves purely to catch cases when - * these guarantees have failed. An actual endless loop is an obvious bug - * and should be reported as such. - */ -static void __logfs_gc_pass(struct super_block *sb, int target) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_block *block; - int round, progress, last_progress = 0; - - /* - * Doing too many changes to the segfile at once would result - * in a large number of aliases. Write the journal before - * things get out of hand. - */ - if (super->s_shadow_tree.no_shadowed_segments >= MAX_OBJ_ALIASES) - logfs_write_anchor(sb); - - if (no_free_segments(sb) >= target && - super->s_no_object_aliases < MAX_OBJ_ALIASES) - return; - - log_gc("__logfs_gc_pass(%x)\n", target); - for (round = 0; round < SCAN_ROUNDS; ) { - if (no_free_segments(sb) >= target) - goto write_alias; - - /* Sync in-memory state with on-medium state in case they - * diverged */ - logfs_write_anchor(sb); - round += logfs_scan_some(sb); - if (no_free_segments(sb) >= target) - goto write_alias; - progress = logfs_gc_once(sb); - if (progress) - last_progress = round; - else if (round - last_progress > 2) - break; - continue; - - /* - * The goto logic is nasty, I just don't know a better way to - * code it. GC is supposed to ensure two things: - * 1. Enough free segments are available. - * 2. The number of aliases is bounded. - * When 1. is achieved, we take a look at 2. and write back - * some alias-containing blocks, if necessary. However, after - * each such write we need to go back to 1., as writes can - * consume free segments. - */ -write_alias: - if (super->s_no_object_aliases < MAX_OBJ_ALIASES) - return; - if (list_empty(&super->s_object_alias)) { - /* All aliases are still in btree */ - return; - } - log_gc("Write back one alias\n"); - block = list_entry(super->s_object_alias.next, - struct logfs_block, alias_list); - block->ops->write_block(block); - /* - * To round off the nasty goto logic, we reset round here. It - * is a safety-net for GC not making any progress and limited - * to something reasonably small. If incremented it for every - * single alias, the loop could terminate rather quickly. - */ - round = 0; - } - LOGFS_BUG(sb); -} - -static int wl_ratelimit(struct super_block *sb, u64 *next_event) -{ - struct logfs_super *super = logfs_super(sb); - - if (*next_event < super->s_gec) { - *next_event = super->s_gec + WL_RATELIMIT; - return 0; - } - return 1; -} - -static void logfs_wl_pass(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - struct gc_candidate *wl_cand, *free_cand; - - if (wl_ratelimit(sb, &super->s_wl_gec_ostore)) - return; - - wl_cand = first_in_list(&super->s_ec_list); - if (!wl_cand) - return; - free_cand = first_in_list(&super->s_free_list); - if (!free_cand) - return; - - if (wl_cand->erase_count < free_cand->erase_count + WL_DELTA) { - remove_from_list(wl_cand); - __logfs_gc_once(sb, wl_cand); - } -} - -/* - * The journal needs wear leveling as well. But moving the journal is an - * expensive operation so we try to avoid it as much as possible. And if we - * have to do it, we move the whole journal, not individual segments. - * - * Ratelimiting is not strictly necessary here, it mainly serves to avoid the - * calculations. First we check whether moving the journal would be a - * significant improvement. That means that a) the current journal segments - * have more wear than the future journal segments and b) the current journal - * segments have more wear than normal ostore segments. - * Rationale for b) is that we don't have to move the journal if it is aging - * less than the ostore, even if the reserve segments age even less (they are - * excluded from wear leveling, after all). - * Next we check that the superblocks have less wear than the journal. Since - * moving the journal requires writing the superblocks, we have to protect the - * superblocks even more than the journal. - * - * Also we double the acceptable wear difference, compared to ostore wear - * leveling. Journal data is read and rewritten rapidly, comparatively. So - * soft errors have much less time to accumulate and we allow the journal to - * be a bit worse than the ostore. - */ -static void logfs_journal_wl_pass(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - struct gc_candidate *cand; - u32 min_journal_ec = -1, max_reserve_ec = 0; - int i; - - if (wl_ratelimit(sb, &super->s_wl_gec_journal)) - return; - - if (super->s_reserve_list.count < super->s_no_journal_segs) { - /* Reserve is not full enough to move complete journal */ - return; - } - - journal_for_each(i) - if (super->s_journal_seg[i]) - min_journal_ec = min(min_journal_ec, - super->s_journal_ec[i]); - cand = rb_entry(rb_first(&super->s_free_list.rb_tree), - struct gc_candidate, rb_node); - max_reserve_ec = cand->erase_count; - for (i = 0; i < 2; i++) { - struct logfs_segment_entry se; - u32 segno = seg_no(sb, super->s_sb_ofs[i]); - u32 ec; - - logfs_get_segment_entry(sb, segno, &se); - ec = be32_to_cpu(se.ec_level) >> 4; - max_reserve_ec = max(max_reserve_ec, ec); - } - - if (min_journal_ec > max_reserve_ec + 2 * WL_DELTA) { - do_logfs_journal_wl_pass(sb); - } -} - -void logfs_gc_pass(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - - //BUG_ON(mutex_trylock(&logfs_super(sb)->s_w_mutex)); - /* Write journal before free space is getting saturated with dirty - * objects. - */ - if (super->s_dirty_used_bytes + super->s_dirty_free_bytes - + LOGFS_MAX_OBJECTSIZE >= super->s_free_bytes) - logfs_write_anchor(sb); - __logfs_gc_pass(sb, super->s_total_levels); - logfs_wl_pass(sb); - logfs_journal_wl_pass(sb); -} - -static int check_area(struct super_block *sb, int i) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_area *area = super->s_area[i]; - gc_level_t gc_level; - u32 cleaned, valid, ec; - u32 segno = area->a_segno; - u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes); - - if (!area->a_is_open) - return 0; - - if (super->s_devops->can_write_buf(sb, ofs) == 0) - return 0; - - printk(KERN_INFO"LogFS: Possibly incomplete write at %llx\n", ofs); - /* - * The device cannot write back the write buffer. Most likely the - * wbuf was already written out and the system crashed at some point - * before the journal commit happened. In that case we wouldn't have - * to do anything. But if the crash happened before the wbuf was - * written out correctly, we must GC this segment. So assume the - * worst and always do the GC run. - */ - area->a_is_open = 0; - valid = logfs_valid_bytes(sb, segno, &ec, &gc_level); - cleaned = logfs_gc_segment(sb, segno); - if (cleaned != valid) - return -EIO; - return 0; -} - -int logfs_check_areas(struct super_block *sb) -{ - int i, err; - - for_each_area(i) { - err = check_area(sb, i); - if (err) - return err; - } - return 0; -} - -static void logfs_init_candlist(struct candidate_list *list, int maxcount, - int sort_by_ec) -{ - list->count = 0; - list->maxcount = maxcount; - list->sort_by_ec = sort_by_ec; - list->rb_tree = RB_ROOT; -} - -int logfs_init_gc(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - int i; - - btree_init_mempool32(&super->s_cand_tree, super->s_btree_pool); - logfs_init_candlist(&super->s_free_list, LIST_SIZE + SCAN_RATIO, 1); - logfs_init_candlist(&super->s_reserve_list, - super->s_bad_seg_reserve, 1); - for_each_area(i) - logfs_init_candlist(&super->s_low_list[i], LIST_SIZE, 0); - logfs_init_candlist(&super->s_ec_list, LIST_SIZE, 1); - return 0; -} - -static void logfs_cleanup_list(struct super_block *sb, - struct candidate_list *list) -{ - struct gc_candidate *cand; - - while (list->count) { - cand = rb_entry(list->rb_tree.rb_node, struct gc_candidate, - rb_node); - remove_from_list(cand); - free_candidate(sb, cand); - } - BUG_ON(list->rb_tree.rb_node); -} - -void logfs_cleanup_gc(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - int i; - - if (!super->s_free_list.count) - return; - - /* - * FIXME: The btree may still contain a single empty node. So we - * call the grim visitor to clean up that mess. Btree code should - * do it for us, really. - */ - btree_grim_visitor32(&super->s_cand_tree, 0, NULL); - logfs_cleanup_list(sb, &super->s_free_list); - logfs_cleanup_list(sb, &super->s_reserve_list); - for_each_area(i) - logfs_cleanup_list(sb, &super->s_low_list[i]); - logfs_cleanup_list(sb, &super->s_ec_list); -} |