diff options
127 files changed, 5543 insertions, 1980 deletions
diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt index 7f7413e597f3..27c1b7b78504 100644 --- a/Documentation/ioctl/ioctl-number.txt +++ b/Documentation/ioctl/ioctl-number.txt @@ -296,7 +296,8 @@ Code Seq#(hex) Include File Comments 0x90 00 drivers/cdrom/sbpcd.h 0x92 00-0F drivers/usb/mon/mon_bin.c 0x93 60-7F linux/auto_fs.h -0x94 all fs/btrfs/ioctl.h +0x94 all fs/btrfs/ioctl.h Btrfs filesystem + and linux/fs.h some lifted to vfs/generic 0x97 00-7F fs/ceph/ioctl.h Ceph file system 0x99 00-0F 537-Addinboard driver <mailto:buk@buks.ipn.de> diff --git a/block/bio.c b/block/bio.c index 595663e0281a..5f7563598b1c 100644 --- a/block/bio.c +++ b/block/bio.c @@ -774,7 +774,7 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page return 0; } - if (bio->bi_vcnt >= bio->bi_max_vecs) + if (bio_full(bio)) return 0; /* @@ -822,52 +822,82 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page EXPORT_SYMBOL(bio_add_pc_page); /** - * bio_add_page - attempt to add page to bio - * @bio: destination bio - * @page: page to add - * @len: vec entry length - * @offset: vec entry offset + * __bio_try_merge_page - try appending data to an existing bvec. + * @bio: destination bio + * @page: page to add + * @len: length of the data to add + * @off: offset of the data in @page * - * Attempt to add a page to the bio_vec maplist. This will only fail - * if either bio->bi_vcnt == bio->bi_max_vecs or it's a cloned bio. + * Try to add the data at @page + @off to the last bvec of @bio. This is a + * a useful optimisation for file systems with a block size smaller than the + * page size. + * + * Return %true on success or %false on failure. */ -int bio_add_page(struct bio *bio, struct page *page, - unsigned int len, unsigned int offset) +bool __bio_try_merge_page(struct bio *bio, struct page *page, + unsigned int len, unsigned int off) { - struct bio_vec *bv; - - /* - * cloned bio must not modify vec list - */ if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) - return 0; + return false; - /* - * For filesystems with a blocksize smaller than the pagesize - * we will often be called with the same page as last time and - * a consecutive offset. Optimize this special case. - */ if (bio->bi_vcnt > 0) { - bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; + struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; - if (page == bv->bv_page && - offset == bv->bv_offset + bv->bv_len) { + if (page == bv->bv_page && off == bv->bv_offset + bv->bv_len) { bv->bv_len += len; - goto done; + bio->bi_iter.bi_size += len; + return true; } } + return false; +} +EXPORT_SYMBOL_GPL(__bio_try_merge_page); - if (bio->bi_vcnt >= bio->bi_max_vecs) - return 0; +/** + * __bio_add_page - add page to a bio in a new segment + * @bio: destination bio + * @page: page to add + * @len: length of the data to add + * @off: offset of the data in @page + * + * Add the data at @page + @off to @bio as a new bvec. The caller must ensure + * that @bio has space for another bvec. + */ +void __bio_add_page(struct bio *bio, struct page *page, + unsigned int len, unsigned int off) +{ + struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt]; - bv = &bio->bi_io_vec[bio->bi_vcnt]; - bv->bv_page = page; - bv->bv_len = len; - bv->bv_offset = offset; + WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); + WARN_ON_ONCE(bio_full(bio)); + + bv->bv_page = page; + bv->bv_offset = off; + bv->bv_len = len; - bio->bi_vcnt++; -done: bio->bi_iter.bi_size += len; + bio->bi_vcnt++; +} +EXPORT_SYMBOL_GPL(__bio_add_page); + +/** + * bio_add_page - attempt to add page to bio + * @bio: destination bio + * @page: page to add + * @len: vec entry length + * @offset: vec entry offset + * + * Attempt to add a page to the bio_vec maplist. This will only fail + * if either bio->bi_vcnt == bio->bi_max_vecs or it's a cloned bio. + */ +int bio_add_page(struct bio *bio, struct page *page, + unsigned int len, unsigned int offset) +{ + if (!__bio_try_merge_page(bio, page, len, offset)) { + if (bio_full(bio)) + return 0; + __bio_add_page(bio, page, len, offset); + } return len; } EXPORT_SYMBOL(bio_add_page); diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 2b2332b605e4..1d7bd96511f0 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -74,42 +74,42 @@ EXPORT_SYMBOL_GPL(fs_dax_get_by_bdev); /** * __bdev_dax_supported() - Check if the device supports dax for filesystem - * @sb: The superblock of the device + * @bdev: block device to check * @blocksize: The block size of the device * * This is a library function for filesystems to check if the block device * can be mounted with dax option. * - * Return: negative errno if unsupported, 0 if supported. + * Return: true if supported, false if unsupported */ -int __bdev_dax_supported(struct super_block *sb, int blocksize) +bool __bdev_dax_supported(struct block_device *bdev, int blocksize) { - struct block_device *bdev = sb->s_bdev; struct dax_device *dax_dev; pgoff_t pgoff; int err, id; void *kaddr; pfn_t pfn; long len; + char buf[BDEVNAME_SIZE]; if (blocksize != PAGE_SIZE) { - pr_debug("VFS (%s): error: unsupported blocksize for dax\n", - sb->s_id); - return -EINVAL; + pr_debug("%s: error: unsupported blocksize for dax\n", + bdevname(bdev, buf)); + return false; } err = bdev_dax_pgoff(bdev, 0, PAGE_SIZE, &pgoff); if (err) { - pr_debug("VFS (%s): error: unaligned partition for dax\n", - sb->s_id); - return err; + pr_debug("%s: error: unaligned partition for dax\n", + bdevname(bdev, buf)); + return false; } dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); if (!dax_dev) { - pr_debug("VFS (%s): error: device does not support dax\n", - sb->s_id); - return -EOPNOTSUPP; + pr_debug("%s: error: device does not support dax\n", + bdevname(bdev, buf)); + return false; } id = dax_read_lock(); @@ -119,9 +119,9 @@ int __bdev_dax_supported(struct super_block *sb, int blocksize) put_dax(dax_dev); if (len < 1) { - pr_debug("VFS (%s): error: dax access failed (%ld)\n", - sb->s_id, len); - return len < 0 ? len : -EIO; + pr_debug("%s: error: dax access failed (%ld)\n", + bdevname(bdev, buf), len); + return false; } if (IS_ENABLED(CONFIG_FS_DAX_LIMITED) && pfn_t_special(pfn)) { @@ -137,12 +137,12 @@ int __bdev_dax_supported(struct super_block *sb, int blocksize) } else if (pfn_t_devmap(pfn)) { /* pass */; } else { - pr_debug("VFS (%s): error: dax support not enabled\n", - sb->s_id); - return -EOPNOTSUPP; + pr_debug("%s: error: dax support not enabled\n", + bdevname(bdev, buf)); + return false; } - return 0; + return true; } EXPORT_SYMBOL_GPL(__bdev_dax_supported); #endif diff --git a/fs/buffer.c b/fs/buffer.c index 249b83fafe48..cabc045f483d 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3427,120 +3427,6 @@ int bh_submit_read(struct buffer_head *bh) } EXPORT_SYMBOL(bh_submit_read); -/* - * Seek for SEEK_DATA / SEEK_HOLE within @page, starting at @lastoff. - * - * Returns the offset within the file on success, and -ENOENT otherwise. - */ -static loff_t -page_seek_hole_data(struct page *page, loff_t lastoff, int whence) -{ - loff_t offset = page_offset(page); - struct buffer_head *bh, *head; - bool seek_data = whence == SEEK_DATA; - - if (lastoff < offset) - lastoff = offset; - - bh = head = page_buffers(page); - do { - offset += bh->b_size; - if (lastoff >= offset) - continue; - - /* - * Unwritten extents that have data in the page cache covering - * them can be identified by the BH_Unwritten state flag. - * Pages with multiple buffers might have a mix of holes, data - * and unwritten extents - any buffer with valid data in it - * should have BH_Uptodate flag set on it. - */ - - if ((buffer_unwritten(bh) || buffer_uptodate(bh)) == seek_data) - return lastoff; - - lastoff = offset; - } while ((bh = bh->b_this_page) != head); - return -ENOENT; -} - -/* - * Seek for SEEK_DATA / SEEK_HOLE in the page cache. - * - * Within unwritten extents, the page cache determines which parts are holes - * and which are data: unwritten and uptodate buffer heads count as data; - * everything else counts as a hole. - * - * Returns the resulting offset on successs, and -ENOENT otherwise. - */ -loff_t -page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length, - int whence) -{ - pgoff_t index = offset >> PAGE_SHIFT; - pgoff_t end = DIV_ROUND_UP(offset + length, PAGE_SIZE); - loff_t lastoff = offset; - struct pagevec pvec; - - if (length <= 0) - return -ENOENT; - - pagevec_init(&pvec); - - do { - unsigned nr_pages, i; - - nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &index, - end - 1); - if (nr_pages == 0) - break; - - for (i = 0; i < nr_pages; i++) { - struct page *page = pvec.pages[i]; - - /* - * At this point, the page may be truncated or - * invalidated (changing page->mapping to NULL), or - * even swizzled back from swapper_space to tmpfs file - * mapping. However, page->index will not change - * because we have a reference on the page. - * - * If current page offset is beyond where we've ended, - * we've found a hole. - */ - if (whence == SEEK_HOLE && - lastoff < page_offset(page)) - goto check_range; - - lock_page(page); - if (likely(page->mapping == inode->i_mapping) && - page_has_buffers(page)) { - lastoff = page_seek_hole_data(page, lastoff, whence); - if (lastoff >= 0) { - unlock_page(page); - goto check_range; - } - } - unlock_page(page); - lastoff = page_offset(page) + PAGE_SIZE; - } - pagevec_release(&pvec); - } while (index < end); - - /* When no page at lastoff and we are not done, we found a hole. */ - if (whence != SEEK_HOLE) - goto not_found; - -check_range: - if (lastoff < offset + length) - goto out; -not_found: - lastoff = -ENOENT; -out: - pagevec_release(&pvec); - return lastoff; -} - void __init buffer_init(void) { unsigned long nrpages; diff --git a/fs/ext2/super.c b/fs/ext2/super.c index de1694512f1f..c09289a42dc5 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -961,8 +961,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); if (sbi->s_mount_opt & EXT2_MOUNT_DAX) { - err = bdev_dax_supported(sb, blocksize); - if (err) { + if (!bdev_dax_supported(sb->s_bdev, blocksize)) { ext2_msg(sb, KERN_ERR, "DAX unsupported by block device. Turning off DAX."); sbi->s_mount_opt &= ~EXT2_MOUNT_DAX; diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 44b4fcdc3755..285ed1588730 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1841,8 +1841,8 @@ int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap) iomap->offset = 0; iomap->length = min_t(loff_t, ext4_get_inline_size(inode), i_size_read(inode)); - iomap->type = 0; - iomap->flags = IOMAP_F_DATA_INLINE; + iomap->type = IOMAP_INLINE; + iomap->flags = 0; out: up_read(&EXT4_I(inode)->xattr_sem); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c1c5c8775ae7..7022d843c667 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3762,8 +3762,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) " that may contain inline data"); sbi->s_mount_opt &= ~EXT4_MOUNT_DAX; } - err = bdev_dax_supported(sb, blocksize); - if (err) { + if (!bdev_dax_supported(sb->s_bdev, blocksize)) { ext4_msg(sb, KERN_ERR, "DAX unsupported by block device. Turning off DAX."); sbi->s_mount_opt &= ~EXT4_MOUNT_DAX; diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index a7b586e02693..ed6699705c13 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -767,10 +767,11 @@ static void gfs2_stuffed_iomap(struct inode *inode, struct iomap *iomap) sizeof(struct gfs2_dinode); iomap->offset = 0; iomap->length = i_size_read(inode); - iomap->type = IOMAP_MAPPED; - iomap->flags = IOMAP_F_DATA_INLINE; + iomap->type = IOMAP_INLINE; } +#define IOMAP_F_GFS2_BOUNDARY IOMAP_F_PRIVATE + /** * gfs2_iomap_get - Map blocks from an inode to disk blocks * @inode: The inode @@ -846,7 +847,7 @@ static int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length, iomap->type = IOMAP_MAPPED; iomap->flags = IOMAP_F_MERGED; if (eob) - iomap->flags |= IOMAP_F_BOUNDARY; + iomap->flags |= IOMAP_F_GFS2_BOUNDARY; out: iomap->bdev = inode->i_sb->s_bdev; @@ -952,12 +953,12 @@ int gfs2_block_map(struct inode *inode, sector_t lblock, if (iomap.length > bh_map->b_size) { iomap.length = bh_map->b_size; - iomap.flags &= ~IOMAP_F_BOUNDARY; + iomap.flags &= ~IOMAP_F_GFS2_BOUNDARY; } if (iomap.addr != IOMAP_NULL_ADDR) map_bh(bh_map, inode->i_sb, iomap.addr >> inode->i_blkbits); bh_map->b_size = iomap.length; - if (iomap.flags & IOMAP_F_BOUNDARY) + if (iomap.flags & IOMAP_F_GFS2_BOUNDARY) set_buffer_boundary(bh_map); if (iomap.flags & IOMAP_F_NEW) set_buffer_new(bh_map); diff --git a/fs/iomap.c b/fs/iomap.c index afd163586aa0..206539d369a8 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -20,6 +20,7 @@ #include <linux/mm.h> #include <linux/swap.h> #include <linux/pagemap.h> +#include <linux/pagevec.h> #include <linux/file.h> #include <linux/uio.h> #include <linux/backing-dev.h> @@ -27,6 +28,7 @@ #include <linux/task_io_accounting_ops.h> #include <linux/dax.h> #include <linux/sched/signal.h> +#include <linux/swap.h> #include "internal.h" @@ -95,6 +97,12 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, return written ? written : ret; } +static sector_t +iomap_sector(struct iomap *iomap, loff_t pos) +{ + return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT; +} + static void iomap_write_failed(struct inode *inode, loff_t pos, unsigned len) { @@ -352,11 +360,8 @@ static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset, static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes, struct iomap *iomap) { - sector_t sector = (iomap->addr + - (pos & PAGE_MASK) - iomap->offset) >> 9; - - return __dax_zero_page_range(iomap->bdev, iomap->dax_dev, sector, - offset, bytes); + return __dax_zero_page_range(iomap->bdev, iomap->dax_dev, + iomap_sector(iomap, pos & PAGE_MASK), offset, bytes); } static loff_t @@ -501,10 +506,13 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi, case IOMAP_DELALLOC: flags |= FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN; break; + case IOMAP_MAPPED: + break; case IOMAP_UNWRITTEN: flags |= FIEMAP_EXTENT_UNWRITTEN; break; - case IOMAP_MAPPED: + case IOMAP_INLINE: + flags |= FIEMAP_EXTENT_DATA_INLINE; break; } @@ -512,8 +520,6 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi, flags |= FIEMAP_EXTENT_MERGED; if (iomap->flags & IOMAP_F_SHARED) flags |= FIEMAP_EXTENT_SHARED; - if (iomap->flags & IOMAP_F_DATA_INLINE) - flags |= FIEMAP_EXTENT_DATA_INLINE; return fiemap_fill_next_extent(fi, iomap->offset, iomap->addr != IOMAP_NULL_ADDR ? iomap->addr : 0, @@ -587,6 +593,113 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, } EXPORT_SYMBOL_GPL(iomap_fiemap); +/* + * Seek for SEEK_DATA / SEEK_HOLE within @page, starting at @lastoff. + * Returns true if found and updates @lastoff to the offset in file. + */ +static bool +page_seek_hole_data(struct inode *inode, struct page *page, loff_t *lastoff, + int whence) +{ + const struct address_space_operations *ops = inode->i_mapping->a_ops; + unsigned int bsize = i_blocksize(inode), off; + bool seek_data = whence == SEEK_DATA; + loff_t poff = page_offset(page); + + if (WARN_ON_ONCE(*lastoff >= poff + PAGE_SIZE)) + return false; + + if (*lastoff < poff) { + /* + * Last offset smaller than the start of the page means we found + * a hole: + */ + if (whence == SEEK_HOLE) + return true; + *lastoff = poff; + } + + /* + * Just check the page unless we can and should check block ranges: + */ + if (bsize == PAGE_SIZE || !ops->is_partially_uptodate) + return PageUptodate(page) == seek_data; + + lock_page(page); + if (unlikely(page->mapping != inode->i_mapping)) + goto out_unlock_not_found; + + for (off = 0; off < PAGE_SIZE; off += bsize) { + if ((*lastoff & ~PAGE_MASK) >= off + bsize) + continue; + if (ops->is_partially_uptodate(page, off, bsize) == seek_data) { + unlock_page(page); + return true; + } + *lastoff = poff + off + bsize; + } + +out_unlock_not_found: + unlock_page(page); + return false; +} + +/* + * Seek for SEEK_DATA / SEEK_HOLE in the page cache. + * + * Within unwritten extents, the page cache determines which parts are holes + * and which are data: uptodate buffer heads count as data; everything else + * counts as a hole. + * + * Returns the resulting offset on successs, and -ENOENT otherwise. + */ +static loff_t +page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length, + int whence) +{ + pgoff_t index = offset >> PAGE_SHIFT; + pgoff_t end = DIV_ROUND_UP(offset + length, PAGE_SIZE); + loff_t lastoff = offset; + struct pagevec pvec; + + if (length <= 0) + return -ENOENT; + + pagevec_init(&pvec); + + do { + unsigned nr_pages, i; + + nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &index, + end - 1); + if (nr_pages == 0) + break; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + if (page_seek_hole_data(inode, page, &lastoff, whence)) + goto check_range; + lastoff = page_offset(page) + PAGE_SIZE; + } + pagevec_release(&pvec); + } while (index < end); + + /* When no page at lastoff and we are not done, we found a hole. */ + if (whence != SEEK_HOLE) + goto not_found; + +check_range: + if (lastoff < offset + length) + goto out; +not_found: + lastoff = -ENOENT; +out: + pagevec_release(&pvec); + return lastoff; +} + + static loff_t iomap_seek_hole_actor(struct inode *inode, loff_t offset, loff_t length, void *data, struct iomap *iomap) @@ -685,6 +798,8 @@ EXPORT_SYMBOL_GPL(iomap_seek_data); * Private flags for iomap_dio, must not overlap with the public ones in * iomap.h: */ +#define IOMAP_DIO_WRITE_FUA (1 << 28) +#define IOMAP_DIO_NEED_SYNC (1 << 29) #define IOMAP_DIO_WRITE (1 << 30) #define IOMAP_DIO_DIRTY (1 << 31) @@ -759,6 +874,13 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio) dio_warn_stale_pagecache(iocb->ki_filp); } + /* + * If this is a DSYNC write, make sure we push it to stable storage now + * that we've written data. + */ + if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC)) + ret = generic_write_sync(iocb, ret); + inode_dio_end(file_inode(iocb->ki_filp)); kfree(dio); @@ -769,13 +891,8 @@ static void iomap_dio_complete_work(struct work_struct *work) { struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work); struct kiocb *iocb = dio->iocb; - bool is_write = (dio->flags & IOMAP_DIO_WRITE); - ssize_t ret; - ret = iomap_dio_complete(dio); - if (is_write && ret > 0) - ret = generic_write_sync(iocb, ret); - iocb->ki_complete(iocb, ret, 0); + iocb->ki_complete(iocb, iomap_dio_complete(dio), 0); } /* @@ -833,14 +950,12 @@ iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos, bio = bio_alloc(GFP_KERNEL, 1); bio_set_dev(bio, iomap->bdev); - bio->bi_iter.bi_sector = - (iomap->addr + pos - iomap->offset) >> 9; + bio->bi_iter.bi_sector = iomap_sector(iomap, pos); bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; get_page(page); - if (bio_add_page(bio, page, len, 0) != len) - BUG(); + __bio_add_page(bio, page, len, 0); bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE); atomic_inc(&dio->ref); @@ -858,6 +973,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, struct iov_iter iter; struct bio *bio; bool need_zeroout = false; + bool use_fua = false; int nr_pages, ret; size_t copied = 0; @@ -881,8 +997,20 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, case IOMAP_MAPPED: if (iomap->flags & IOMAP_F_SHARED) dio->flags |= IOMAP_DIO_COW; - if (iomap->flags & IOMAP_F_NEW) + if (iomap->flags & IOMAP_F_NEW) { need_zeroout = true; + } else { + /* + * Use a FUA write if we need datasync semantics, this + * is a pure data IO that doesn't require any metadata + * updates and the underlying device supports FUA. This + * allows us to avoid cache flushes on IO completion. + */ + if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) && + (dio->flags & IOMAP_DIO_WRITE_FUA) && + blk_queue_fua(bdev_get_queue(iomap->bdev))) + use_fua = true; + } break; default: WARN_ON_ONCE(1); @@ -916,8 +1044,7 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, bio = bio_alloc(GFP_KERNEL, nr_pages); bio_set_dev(bio, iomap->bdev); - bio->bi_iter.bi_sector = - (iomap->addr + pos - iomap->offset) >> 9; + bio->bi_iter.bi_sector = iomap_sector(iomap, pos); bio->bi_write_hint = dio->iocb->ki_hint; bio->bi_private = dio; bio->bi_end_io = iomap_dio_bio_end_io; @@ -930,10 +1057,14 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, n = bio->bi_iter.bi_size; if (dio->flags & IOMAP_DIO_WRITE) { - bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE); + bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE; + if (use_fua) + bio->bi_opf |= REQ_FUA; + else + dio->flags &= ~IOMAP_DIO_WRITE_FUA; task_io_account_write(n); } else { - bio_set_op_attrs(bio, REQ_OP_READ, 0); + bio->bi_opf = REQ_OP_READ; if (dio->flags & IOMAP_DIO_DIRTY) bio_set_pages_dirty(bio); } @@ -961,6 +1092,15 @@ iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, return copied; } +/* + * iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO + * is being issued as AIO or not. This allows us to optimise pure data writes + * to use REQ_FUA rather than requiring generic_write_sync() to issue a + * REQ_FLUSH post write. This is slightly tricky because a single request here + * can be mapped into multiple disjoint IOs and only a subset of the IOs issued + * may be pure data writes. In that case, we still need to do a full data sync + * completion. + */ ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, iomap_dio_end_io_t end_io) @@ -1005,8 +1145,21 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (iter->type == ITER_IOVEC) dio->flags |= IOMAP_DIO_DIRTY; } else { - dio->flags |= IOMAP_DIO_WRITE; flags |= IOMAP_WRITE; + dio->flags |= IOMAP_DIO_WRITE; + + /* for data sync or sync, we need sync completion processing */ + if (iocb->ki_flags & IOCB_DSYNC) + dio->flags |= IOMAP_DIO_NEED_SYNC; + + /* + * For datasync only writes, we optimistically try using FUA for + * this IO. Any non-FUA write that occurs will clear this flag, + * hence we know before completion whether a cache flush is + * necessary. + */ + if ((iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) == IOCB_DSYNC) + dio->flags |= IOMAP_DIO_WRITE_FUA; } if (iocb->ki_flags & IOCB_NOWAIT) { @@ -1062,6 +1215,13 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (ret < 0) iomap_dio_set_error(dio, ret); + /* + * If all the writes we issued were FUA, we don't need to flush the + * cache on IO completion. Clear the sync flag for this case. + */ + if (dio->flags & IOMAP_DIO_WRITE_FUA) + dio->flags &= ~IOMAP_DIO_NEED_SYNC; + if (!atomic_dec_and_test(&dio->ref)) { if (!is_sync_kiocb(iocb)) return -EIOCBQUEUED; @@ -1089,3 +1249,203 @@ out_free_dio: return ret; } EXPORT_SYMBOL_GPL(iomap_dio_rw); + +/* Swapfile activation */ + +#ifdef CONFIG_SWAP +struct iomap_swapfile_info { + struct iomap iomap; /* accumulated iomap */ + struct swap_info_struct *sis; + uint64_t lowest_ppage; /* lowest physical addr seen (pages) */ + uint64_t highest_ppage; /* highest physical addr seen (pages) */ + unsigned long nr_pages; /* number of pages collected */ + int nr_extents; /* extent count */ +}; + +/* + * Collect physical extents for this swap file. Physical extents reported to + * the swap code must be trimmed to align to a page boundary. The logical + * offset within the file is irrelevant since the swapfile code maps logical + * page numbers of the swap device to the physical page-aligned extents. + */ +static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi) +{ + struct iomap *iomap = &isi->iomap; + unsigned long nr_pages; + uint64_t first_ppage; + uint64_t first_ppage_reported; + uint64_t next_ppage; + int error; + + /* + * Round the start up and the end down so that the physical + * extent aligns to a page boundary. + */ + first_ppage = ALIGN(iomap->addr, PAGE_SIZE) >> PAGE_SHIFT; + next_ppage = ALIGN_DOWN(iomap->addr + iomap->length, PAGE_SIZE) >> + PAGE_SHIFT; + + /* Skip too-short physical extents. */ + if (first_ppage >= next_ppage) + return 0; + nr_pages = next_ppage - first_ppage; + + /* + * Calculate how much swap space we're adding; the first page contains + * the swap header and doesn't count. The mm still wants that first + * page fed to add_swap_extent, however. + */ + first_ppage_reported = first_ppage; + if (iomap->offset == 0) + first_ppage_reported++; + if (isi->lowest_ppage > first_ppage_reported) + isi->lowest_ppage = first_ppage_reported; + if (isi->highest_ppage < (next_ppage - 1)) + isi->highest_ppage = next_ppage - 1; + + /* Add extent, set up for the next call. */ + error = add_swap_extent(isi->sis, isi->nr_pages, nr_pages, first_ppage); + if (error < 0) + return error; + isi->nr_extents += error; + isi->nr_pages += nr_pages; + return 0; +} + +/* + * Accumulate iomaps for this swap file. We have to accumulate iomaps because + * swap only cares about contiguous page-aligned physical extents and makes no + * distinction between written and unwritten extents. + */ +static loff_t iomap_swapfile_activate_actor(struct inode *inode, loff_t pos, + loff_t count, void *data, struct iomap *iomap) +{ + struct iomap_swapfile_info *isi = data; + int error; + + switch (iomap->type) { + case IOMAP_MAPPED: + case IOMAP_UNWRITTEN: + /* Only real or unwritten extents. */ + break; + case IOMAP_INLINE: + /* No inline data. */ + pr_err("swapon: file is inline\n"); + return -EINVAL; + default: + pr_err("swapon: file has unallocated extents\n"); + return -EINVAL; + } + + /* No uncommitted metadata or shared blocks. */ + if (iomap->flags & IOMAP_F_DIRTY) { + pr_err("swapon: file is not committed\n"); + return -EINVAL; + } + if (iomap->flags & IOMAP_F_SHARED) { + pr_err("swapon: file has shared extents\n"); + return -EINVAL; + } + + /* Only one bdev per swap file. */ + if (iomap->bdev != isi->sis->bdev) { + pr_err("swapon: file is on multiple devices\n"); + return -EINVAL; + } + + if (isi->iomap.length == 0) { + /* No accumulated extent, so just store it. */ + memcpy(&isi->iomap, iomap, sizeof(isi->iomap)); + } else if (isi->iomap.addr + isi->iomap.length == iomap->addr) { + /* Append this to the accumulated extent. */ + isi->iomap.length += iomap->length; + } else { + /* Otherwise, add the retained iomap and store this one. */ + error = iomap_swapfile_add_extent(isi); + if (error) + return error; + memcpy(&isi->iomap, iomap, sizeof(isi->iomap)); + } + return count; +} + +/* + * Iterate a swap file's iomaps to construct physical extents that can be + * passed to the swapfile subsystem. + */ +int iomap_swapfile_activate(struct swap_info_struct *sis, + struct file *swap_file, sector_t *pagespan, + const struct iomap_ops *ops) +{ + struct iomap_swapfile_info isi = { + .sis = sis, + .lowest_ppage = (sector_t)-1ULL, + }; + struct address_space *mapping = swap_file->f_mapping; + struct inode *inode = mapping->host; + loff_t pos = 0; + loff_t len = ALIGN_DOWN(i_size_read(inode), PAGE_SIZE); + loff_t ret; + + ret = filemap_write_and_wait(inode->i_mapping); + if (ret) + return ret; + + while (len > 0) { + ret = iomap_apply(inode, pos, len, IOMAP_REPORT, + ops, &isi, iomap_swapfile_activate_actor); + if (ret <= 0) + return ret; + + pos += ret; + len -= ret; + } + + if (isi.iomap.length) { + ret = iomap_swapfile_add_extent(&isi); + if (ret) + return ret; + } + + *pagespan = 1 + isi.highest_ppage - isi.lowest_ppage; + sis->max = isi.nr_pages; + sis->pages = isi.nr_pages - 1; + sis->highest_bit = isi.nr_pages - 1; + return isi.nr_extents; +} +EXPORT_SYMBOL_GPL(iomap_swapfile_activate); +#endif /* CONFIG_SWAP */ + +static loff_t +iomap_bmap_actor(struct inode *inode, loff_t pos, loff_t length, + void *data, struct iomap *iomap) +{ + sector_t *bno = data, addr; + + if (iomap->type == IOMAP_MAPPED) { + addr = (pos - iomap->offset + iomap->addr) >> inode->i_blkbits; + if (addr > INT_MAX) + WARN(1, "would truncate bmap result\n"); + else + *bno = addr; + } + return 0; +} + +/* legacy ->bmap interface. 0 is the error return (!) */ +sector_t +iomap_bmap(struct address_space *mapping, sector_t bno, + const struct iomap_ops *ops) +{ + struct inode *inode = mapping->host; + loff_t pos = bno >> inode->i_blkbits; + unsigned blocksize = i_blocksize(inode); + + if (filemap_write_and_wait(mapping)) + return 0; + + bno = 0; + iomap_apply(inode, pos, blocksize, 0, ops, &bno, iomap_bmap_actor); + return bno; +} +EXPORT_SYMBOL_GPL(iomap_bmap); diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 46bcf0e649f5..457ac9f97377 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -85,6 +85,24 @@ config XFS_ONLINE_SCRUB If unsure, say N. +config XFS_ONLINE_REPAIR + bool "XFS online metadata repair support" + default n + depends on XFS_FS && XFS_ONLINE_SCRUB + help + If you say Y here you will be able to repair metadata on a + mounted XFS filesystem. This feature is intended to reduce + filesystem downtime by fixing minor problems before they cause the + filesystem to go down. However, it requires that the filesystem be + formatted with secondary metadata, such as reverse mappings and inode + parent pointers. + + This feature is considered EXPERIMENTAL. Use with caution! + + See the xfs_scrub man page in section 8 for additional information. + + If unsure, say N. + config XFS_WARN bool "XFS Verbose Warnings" depends on XFS_FS && !XFS_DEBUG diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 7ceb41a9786a..e8d67a443bd7 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -28,6 +28,7 @@ xfs-y += xfs_trace.o # build the libxfs code first xfs-y += $(addprefix libxfs/, \ + xfs_ag.o \ xfs_alloc.o \ xfs_alloc_btree.o \ xfs_attr.o \ @@ -163,4 +164,12 @@ xfs-y += $(addprefix scrub/, \ xfs-$(CONFIG_XFS_RT) += scrub/rtbitmap.o xfs-$(CONFIG_XFS_QUOTA) += scrub/quota.o + +# online repair +ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y) +xfs-y += $(addprefix scrub/, \ + agheader_repair.o \ + repair.o \ + ) +endif endif diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c new file mode 100644 index 000000000000..9345802c99f7 --- /dev/null +++ b/fs/xfs/libxfs/xfs_ag.c @@ -0,0 +1,464 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2000-2005 Silicon Graphics, Inc. + * Copyright (c) 2018 Red Hat, Inc. + * All rights reserved. + */ + +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_rmap_btree.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_rmap.h" +#include "xfs_ag.h" + +static struct xfs_buf * +xfs_get_aghdr_buf( + struct xfs_mount *mp, + xfs_daddr_t blkno, + size_t numblks, + int flags, + const struct xfs_buf_ops *ops) +{ + struct xfs_buf *bp; + + bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, flags); + if (!bp) + return NULL; + + xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); + bp->b_bn = blkno; + bp->b_maps[0].bm_bn = blkno; + bp->b_ops = ops; + + return bp; +} + +/* + * Generic btree root block init function + */ +static void +xfs_btroot_init( + struct xfs_mount *mp, + struct xfs_buf *bp, + struct aghdr_init_data *id) +{ + xfs_btree_init_block(mp, bp, id->type, 0, 0, id->agno, 0); +} + +/* + * Alloc btree root block init functions + */ +static void +xfs_bnoroot_init( + struct xfs_mount *mp, + struct xfs_buf *bp, + struct aghdr_init_data *id) +{ + struct xfs_alloc_rec *arec; + + xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 1, id->agno, 0); + arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); + arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks); + arec->ar_blockcount = cpu_to_be32(id->agsize - + be32_to_cpu(arec->ar_startblock)); +} + +static void +xfs_cntroot_init( + struct xfs_mount *mp, + struct xfs_buf *bp, + struct aghdr_init_data *id) +{ + struct xfs_alloc_rec *arec; + + xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 1, id->agno, 0); + arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); + arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks); + arec->ar_blockcount = cpu_to_be32(id->agsize - + be32_to_cpu(arec->ar_startblock)); +} + +/* + * Reverse map root block init + */ +static void +xfs_rmaproot_init( + struct xfs_mount *mp, + struct xfs_buf *bp, + struct aghdr_init_data *id) +{ + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_rmap_rec *rrec; + + xfs_btree_init_block(mp, bp, XFS_BTNUM_RMAP, 0, 4, id->agno, 0); + + /* + * mark the AG header regions as static metadata The BNO + * btree block is the first block after the headers, so + * it's location defines the size of region the static + * metadata consumes. + * + * Note: unlike mkfs, we never have to account for log + * space when growing the data regions + */ + rrec = XFS_RMAP_REC_ADDR(block, 1); + rrec->rm_startblock = 0; + rrec->rm_blockcount = cpu_to_be32(XFS_BNO_BLOCK(mp)); + rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_FS); + rrec->rm_offset = 0; + + /* account freespace btree root blocks */ + rrec = XFS_RMAP_REC_ADDR(block, 2); + rrec->rm_startblock = cpu_to_be32(XFS_BNO_BLOCK(mp)); + rrec->rm_blockcount = cpu_to_be32(2); + rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG); + rrec->rm_offset = 0; + + /* account inode btree root blocks */ + rrec = XFS_RMAP_REC_ADDR(block, 3); + rrec->rm_startblock = cpu_to_be32(XFS_IBT_BLOCK(mp)); + rrec->rm_blockcount = cpu_to_be32(XFS_RMAP_BLOCK(mp) - + XFS_IBT_BLOCK(mp)); + rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_INOBT); + rrec->rm_offset = 0; + + /* account for rmap btree root */ + rrec = XFS_RMAP_REC_ADDR(block, 4); + rrec->rm_startblock = cpu_to_be32(XFS_RMAP_BLOCK(mp)); + rrec->rm_blockcount = cpu_to_be32(1); + rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG); + rrec->rm_offset = 0; + + /* account for refc btree root */ + if (xfs_sb_version_hasreflink(&mp->m_sb)) { + rrec = XFS_RMAP_REC_ADDR(block, 5); + rrec->rm_startblock = cpu_to_be32(xfs_refc_block(mp)); + rrec->rm_blockcount = cpu_to_be32(1); + rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_REFC); + rrec->rm_offset = 0; + be16_add_cpu(&block->bb_numrecs, 1); + } +} + +/* + * Initialise new secondary superblocks with the pre-grow geometry, but mark + * them as "in progress" so we know they haven't yet been activated. This will + * get cleared when the update with the new geometry information is done after + * changes to the primary are committed. This isn't strictly necessary, but we + * get it for free with the delayed buffer write lists and it means we can tell + * if a grow operation didn't complete properly after the fact. + */ +static void +xfs_sbblock_init( + struct xfs_mount *mp, + struct xfs_buf *bp, + struct aghdr_init_data *id) +{ + struct xfs_dsb *dsb = XFS_BUF_TO_SBP(bp); + + xfs_sb_to_disk(dsb, &mp->m_sb); + dsb->sb_inprogress = 1; +} + +static void +xfs_agfblock_init( + struct xfs_mount *mp, + struct xfs_buf *bp, + struct aghdr_init_data *id) +{ + struct xfs_agf *agf = XFS_BUF_TO_AGF(bp); + xfs_extlen_t tmpsize; + + agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); + agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION); + agf->agf_seqno = cpu_to_be32(id->agno); + agf->agf_length = cpu_to_be32(id->agsize); + agf->agf_roots[XFS_BTNUM_BNOi] = cpu_to_be32(XFS_BNO_BLOCK(mp)); + agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp)); + agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1); + agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1); + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { + agf->agf_roots[XFS_BTNUM_RMAPi] = + cpu_to_be32(XFS_RMAP_BLOCK(mp)); + agf->agf_levels[XFS_BTNUM_RMAPi] = cpu_to_be32(1); + agf->agf_rmap_blocks = cpu_to_be32(1); + } + + agf->agf_flfirst = cpu_to_be32(1); + agf->agf_fllast = 0; + agf->agf_flcount = 0; + tmpsize = id->agsize - mp->m_ag_prealloc_blocks; + agf->agf_freeblks = cpu_to_be32(tmpsize); + agf->agf_longest = cpu_to_be32(tmpsize); + if (xfs_sb_version_hascrc(&mp->m_sb)) + uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid); + if (xfs_sb_version_hasreflink(&mp->m_sb)) { + agf->agf_refcount_root = cpu_to_be32( + xfs_refc_block(mp)); + agf->agf_refcount_level = cpu_to_be32(1); + agf->agf_refcount_blocks = cpu_to_be32(1); + } +} + +static void +xfs_agflblock_init( + struct xfs_mount *mp, + struct xfs_buf *bp, + struct aghdr_init_data *id) +{ + struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp); + __be32 *agfl_bno; + int bucket; + + if (xfs_sb_version_hascrc(&mp->m_sb)) { + agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC); + agfl->agfl_seqno = cpu_to_be32(id->agno); + uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid); + } + + agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp); + for (bucket = 0; bucket < xfs_agfl_size(mp); bucket++) + agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK); +} + +static void +xfs_agiblock_init( + struct xfs_mount *mp, + struct xfs_buf *bp, + struct aghdr_init_data *id) +{ + struct xfs_agi *agi = XFS_BUF_TO_AGI(bp); + int bucket; + + agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); + agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION); + agi->agi_seqno = cpu_to_be32(id->agno); + agi->agi_length = cpu_to_be32(id->agsize); + agi->agi_count = 0; + agi->agi_root = cpu_to_be32(XFS_IBT_BLOCK(mp)); + agi->agi_level = cpu_to_be32(1); + agi->agi_freecount = 0; + agi->agi_newino = cpu_to_be32(NULLAGINO); + agi->agi_dirino = cpu_to_be32(NULLAGINO); + if (xfs_sb_version_hascrc(&mp->m_sb)) + uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid); + if (xfs_sb_version_hasfinobt(&mp->m_sb)) { + agi->agi_free_root = cpu_to_be32(XFS_FIBT_BLOCK(mp)); + agi->agi_free_level = cpu_to_be32(1); + } + for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) + agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); +} + +typedef void (*aghdr_init_work_f)(struct xfs_mount *mp, struct xfs_buf *bp, + struct aghdr_init_data *id); +static int +xfs_ag_init_hdr( + struct xfs_mount *mp, + struct aghdr_init_data *id, + aghdr_init_work_f work, + const struct xfs_buf_ops *ops) + +{ + struct xfs_buf *bp; + + bp = xfs_get_aghdr_buf(mp, id->daddr, id->numblks, 0, ops); + if (!bp) + return -ENOMEM; + + (*work)(mp, bp, id); + + xfs_buf_delwri_queue(bp, &id->buffer_list); + xfs_buf_relse(bp); + return 0; +} + +struct xfs_aghdr_grow_data { + xfs_daddr_t daddr; + size_t numblks; + const struct xfs_buf_ops *ops; + aghdr_init_work_f work; + xfs_btnum_t type; + bool need_init; +}; + +/* + * Prepare new AG headers to be written to disk. We use uncached buffers here, + * as it is assumed these new AG headers are currently beyond the currently + * valid filesystem address space. Using cached buffers would trip over EOFS + * corruption detection alogrithms in the buffer cache lookup routines. + * + * This is a non-transactional function, but the prepared buffers are added to a + * delayed write buffer list supplied by the caller so they can submit them to + * disk and wait on them as required. + */ +int +xfs_ag_init_headers( + struct xfs_mount *mp, + struct aghdr_init_data *id) + +{ + struct xfs_aghdr_grow_data aghdr_data[] = { + { /* SB */ + .daddr = XFS_AG_DADDR(mp, id->agno, XFS_SB_DADDR), + .numblks = XFS_FSS_TO_BB(mp, 1), + .ops = &xfs_sb_buf_ops, + .work = &xfs_sbblock_init, + .need_init = true + }, + { /* AGF */ + .daddr = XFS_AG_DADDR(mp, id->agno, XFS_AGF_DADDR(mp)), + .numblks = XFS_FSS_TO_BB(mp, 1), + .ops = &xfs_agf_buf_ops, + .work = &xfs_agfblock_init, + .need_init = true + }, + { /* AGFL */ + .daddr = XFS_AG_DADDR(mp, id->agno, XFS_AGFL_DADDR(mp)), + .numblks = XFS_FSS_TO_BB(mp, 1), + .ops = &xfs_agfl_buf_ops, + .work = &xfs_agflblock_init, + .need_init = true + }, + { /* AGI */ + .daddr = XFS_AG_DADDR(mp, id->agno, XFS_AGI_DADDR(mp)), + .numblks = XFS_FSS_TO_BB(mp, 1), + .ops = &xfs_agi_buf_ops, + .work = &xfs_agiblock_init, + .need_init = true + }, + { /* BNO root block */ + .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_BNO_BLOCK(mp)), + .numblks = BTOBB(mp->m_sb.sb_blocksize), + .ops = &xfs_allocbt_buf_ops, + .work = &xfs_bnoroot_init, + .need_init = true + }, + { /* CNT root block */ + .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_CNT_BLOCK(mp)), + .numblks = BTOBB(mp->m_sb.sb_blocksize), + .ops = &xfs_allocbt_buf_ops, + .work = &xfs_cntroot_init, + .need_init = true + }, + { /* INO root block */ + .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_IBT_BLOCK(mp)), + .numblks = BTOBB(mp->m_sb.sb_blocksize), + .ops = &xfs_inobt_buf_ops, + .work = &xfs_btroot_init, + .type = XFS_BTNUM_INO, + .need_init = true + }, + { /* FINO root block */ + .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_FIBT_BLOCK(mp)), + .numblks = BTOBB(mp->m_sb.sb_blocksize), + .ops = &xfs_inobt_buf_ops, + .work = &xfs_btroot_init, + .type = XFS_BTNUM_FINO, + .need_init = xfs_sb_version_hasfinobt(&mp->m_sb) + }, + { /* RMAP root block */ + .daddr = XFS_AGB_TO_DADDR(mp, id->agno, XFS_RMAP_BLOCK(mp)), + .numblks = BTOBB(mp->m_sb.sb_blocksize), + .ops = &xfs_rmapbt_buf_ops, + .work = &xfs_rmaproot_init, + .need_init = xfs_sb_version_hasrmapbt(&mp->m_sb) + }, + { /* REFC root block */ + .daddr = XFS_AGB_TO_DADDR(mp, id->agno, xfs_refc_block(mp)), + .numblks = BTOBB(mp->m_sb.sb_blocksize), + .ops = &xfs_refcountbt_buf_ops, + .work = &xfs_btroot_init, + .type = XFS_BTNUM_REFC, + .need_init = xfs_sb_version_hasreflink(&mp->m_sb) + }, + { /* NULL terminating block */ + .daddr = XFS_BUF_DADDR_NULL, + } + }; + struct xfs_aghdr_grow_data *dp; + int error = 0; + + /* Account for AG free space in new AG */ + id->nfree += id->agsize - mp->m_ag_prealloc_blocks; + for (dp = &aghdr_data[0]; dp->daddr != XFS_BUF_DADDR_NULL; dp++) { + if (!dp->need_init) + continue; + + id->daddr = dp->daddr; + id->numblks = dp->numblks; + id->type = dp->type; + error = xfs_ag_init_hdr(mp, id, dp->work, dp->ops); + if (error) + break; + } + return error; +} + +/* + * Extent the AG indicated by the @id by the length passed in + */ +int +xfs_ag_extend_space( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct aghdr_init_data *id, + xfs_extlen_t len) +{ + struct xfs_owner_info oinfo; + struct xfs_buf *bp; + struct xfs_agi *agi; + struct xfs_agf *agf; + int error; + + /* + * Change the agi length. + */ + error = xfs_ialloc_read_agi(mp, tp, id->agno, &bp); + if (error) + return error; + + agi = XFS_BUF_TO_AGI(bp); + be32_add_cpu(&agi->agi_length, len); + ASSERT(id->agno == mp->m_sb.sb_agcount - 1 || + be32_to_cpu(agi->agi_length) == mp->m_sb.sb_agblocks); + xfs_ialloc_log_agi(tp, bp, XFS_AGI_LENGTH); + + /* + * Change agf length. + */ + error = xfs_alloc_read_agf(mp, tp, id->agno, 0, &bp); + if (error) + return error; + + agf = XFS_BUF_TO_AGF(bp); + be32_add_cpu(&agf->agf_length, len); + ASSERT(agf->agf_length == agi->agi_length); + xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH); + + /* + * Free the new space. + * + * XFS_RMAP_OWN_NULL is used here to tell the rmap btree that + * this doesn't actually exist in the rmap btree. + */ + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_NULL); + error = xfs_rmap_free(tp, bp, id->agno, + be32_to_cpu(agf->agf_length) - len, + len, &oinfo); + if (error) + return error; + + return xfs_free_extent(tp, XFS_AGB_TO_FSB(mp, id->agno, + be32_to_cpu(agf->agf_length) - len), + len, &oinfo, XFS_AG_RESV_NONE); +} diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h new file mode 100644 index 000000000000..412702e23f61 --- /dev/null +++ b/fs/xfs/libxfs/xfs_ag.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2018 Red Hat, Inc. + * All rights reserved. + */ + +#ifndef __LIBXFS_AG_H +#define __LIBXFS_AG_H 1 + +struct xfs_mount; +struct xfs_trans; + +struct aghdr_init_data { + /* per ag data */ + xfs_agblock_t agno; /* ag to init */ + xfs_extlen_t agsize; /* new AG size */ + struct list_head buffer_list; /* buffer writeback list */ + xfs_rfsblock_t nfree; /* cumulative new free space */ + + /* per header data */ + xfs_daddr_t daddr; /* header location */ + size_t numblks; /* size of header */ + xfs_btnum_t type; /* type of btree root block */ +}; + +int xfs_ag_init_headers(struct xfs_mount *mp, struct aghdr_init_data *id); +int xfs_ag_extend_space(struct xfs_mount *mp, struct xfs_trans *tp, + struct aghdr_init_data *id, xfs_extlen_t len); + +#endif /* __LIBXFS_AG_H */ diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 4bcc095fe44a..dc9dd3805d97 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -39,6 +39,9 @@ #include "xfs_buf_item.h" #include "xfs_log.h" #include "xfs_ag_resv.h" +#include "xfs_bmap.h" + +extern kmem_zone_t *xfs_bmap_free_item_zone; struct workqueue_struct *xfs_alloc_wq; @@ -2060,6 +2063,30 @@ xfs_alloc_space_available( return true; } +int +xfs_free_agfl_block( + struct xfs_trans *tp, + xfs_agnumber_t agno, + xfs_agblock_t agbno, + struct xfs_buf *agbp, + struct xfs_owner_info *oinfo) +{ + int error; + struct xfs_buf *bp; + + error = xfs_free_ag_extent(tp, agbp, agno, agbno, 1, oinfo, + XFS_AG_RESV_AGFL); + if (error) + return error; + + bp = xfs_btree_get_bufs(tp->t_mountp, tp, agno, agbno, 0); + if (!bp) + return -EFSCORRUPTED; + xfs_trans_binval(tp, bp); + + return 0; +} + /* * Check the agfl fields of the agf for inconsistency or corruption. The purpose * is to detect an agfl header padding mismatch between current and early v5 @@ -2148,6 +2175,40 @@ xfs_agfl_reset( } /* + * Defer an AGFL block free. This is effectively equivalent to + * xfs_bmap_add_free() with some special handling particular to AGFL blocks. + * + * Deferring AGFL frees helps prevent log reservation overruns due to too many + * allocation operations in a transaction. AGFL frees are prone to this problem + * because for one they are always freed one at a time. Further, an immediate + * AGFL block free can cause a btree join and require another block free before + * the real allocation can proceed. Deferring the free disconnects freeing up + * the AGFL slot from freeing the block. + */ +STATIC void +xfs_defer_agfl_block( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + xfs_agnumber_t agno, + xfs_fsblock_t agbno, + struct xfs_owner_info *oinfo) +{ + struct xfs_extent_free_item *new; /* new element */ + + ASSERT(xfs_bmap_free_item_zone != NULL); + ASSERT(oinfo != NULL); + + new = kmem_zone_alloc(xfs_bmap_free_item_zone, KM_SLEEP); + new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno); + new->xefi_blockcount = 1; + new->xefi_oinfo = *oinfo; + + trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1); + + xfs_defer_add(dfops, XFS_DEFER_OPS_TYPE_AGFL_FREE, &new->xefi_list); +} + +/* * Decide whether to use this allocation group for this allocation. * If so, fix up the btree freelist's size. */ @@ -2247,21 +2308,20 @@ xfs_alloc_fix_freelist( else xfs_rmap_ag_owner(&targs.oinfo, XFS_RMAP_OWN_AG); while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) { - struct xfs_buf *bp; - error = xfs_alloc_get_freelist(tp, agbp, &bno, 0); if (error) goto out_agbp_relse; - error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, - &targs.oinfo, XFS_AG_RESV_AGFL); - if (error) - goto out_agbp_relse; - bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0); - if (!bp) { - error = -EFSCORRUPTED; - goto out_agbp_relse; + + /* defer agfl frees if dfops is provided */ + if (tp->t_agfl_dfops) { + xfs_defer_agfl_block(mp, tp->t_agfl_dfops, args->agno, + bno, &targs.oinfo); + } else { + error = xfs_free_agfl_block(tp, args->agno, bno, agbp, + &targs.oinfo); + if (error) + goto out_agbp_relse; } - xfs_trans_binval(tp, bp); } targs.tp = tp; @@ -2949,18 +3009,20 @@ out: * after fixing up the freelist. */ int /* error */ -xfs_free_extent( +__xfs_free_extent( struct xfs_trans *tp, /* transaction pointer */ xfs_fsblock_t bno, /* starting block number of extent */ xfs_extlen_t len, /* length of extent */ struct xfs_owner_info *oinfo, /* extent owner */ - enum xfs_ag_resv_type type) /* block reservation type */ + enum xfs_ag_resv_type type, /* block reservation type */ + bool skip_discard) { struct xfs_mount *mp = tp->t_mountp; struct xfs_buf *agbp; xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, bno); xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, bno); int error; + unsigned int busy_flags = 0; ASSERT(len != 0); ASSERT(type != XFS_AG_RESV_AGFL); @@ -2984,7 +3046,9 @@ xfs_free_extent( if (error) goto err; - xfs_extent_busy_insert(tp, agno, agbno, len, 0); + if (skip_discard) + busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD; + xfs_extent_busy_insert(tp, agno, agbno, len, busy_flags); return 0; err: @@ -3116,3 +3180,40 @@ xfs_alloc_has_record( return xfs_btree_has_record(cur, &low, &high, exists); } + +/* + * Walk all the blocks in the AGFL. The @walk_fn can return any negative + * error code or XFS_BTREE_QUERY_RANGE_ABORT. + */ +int +xfs_agfl_walk( + struct xfs_mount *mp, + struct xfs_agf *agf, + struct xfs_buf *agflbp, + xfs_agfl_walk_fn walk_fn, + void *priv) +{ + __be32 *agfl_bno; + unsigned int i; + int error; + + agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, agflbp); + i = be32_to_cpu(agf->agf_flfirst); + + /* Nothing to walk in an empty AGFL. */ + if (agf->agf_flcount == cpu_to_be32(0)) + return 0; + + /* Otherwise, walk from first to last, wrapping as needed. */ + for (;;) { + error = walk_fn(mp, be32_to_cpu(agfl_bno[i]), priv); + if (error) + return error; + if (i == be32_to_cpu(agf->agf_fllast)) + break; + if (++i == xfs_agfl_size(mp)) + i = 0; + } + + return 0; +} diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index cbf789ea5a4e..0747adcd57d6 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -191,12 +191,24 @@ xfs_alloc_vextent( * Free an extent. */ int /* error */ -xfs_free_extent( +__xfs_free_extent( struct xfs_trans *tp, /* transaction pointer */ xfs_fsblock_t bno, /* starting block number of extent */ xfs_extlen_t len, /* length of extent */ struct xfs_owner_info *oinfo, /* extent owner */ - enum xfs_ag_resv_type type); /* block reservation type */ + enum xfs_ag_resv_type type, /* block reservation type */ + bool skip_discard); + +static inline int +xfs_free_extent( + struct xfs_trans *tp, + xfs_fsblock_t bno, + xfs_extlen_t len, + struct xfs_owner_info *oinfo, + enum xfs_ag_resv_type type) +{ + return __xfs_free_extent(tp, bno, len, oinfo, type, false); +} int /* error */ xfs_alloc_lookup_le( @@ -223,6 +235,8 @@ int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); int xfs_alloc_read_agfl(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, struct xfs_buf **bpp); +int xfs_free_agfl_block(struct xfs_trans *, xfs_agnumber_t, xfs_agblock_t, + struct xfs_buf *, struct xfs_owner_info *); int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags); int xfs_free_extent_fix_freelist(struct xfs_trans *tp, xfs_agnumber_t agno, struct xfs_buf **agbp); @@ -248,4 +262,9 @@ bool xfs_verify_fsbno(struct xfs_mount *mp, xfs_fsblock_t fsbno); int xfs_alloc_has_record(struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, bool *exist); +typedef int (*xfs_agfl_walk_fn)(struct xfs_mount *mp, xfs_agblock_t bno, + void *priv); +int xfs_agfl_walk(struct xfs_mount *mp, struct xfs_agf *agf, + struct xfs_buf *agflbp, xfs_agfl_walk_fn walk_fn, void *priv); + #endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index b451649ba176..18aec7a0e599 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -547,3 +547,12 @@ xfs_allocbt_maxrecs( return blocklen / sizeof(xfs_alloc_rec_t); return blocklen / (sizeof(xfs_alloc_key_t) + sizeof(xfs_alloc_ptr_t)); } + +/* Calculate the freespace btree size for some records. */ +xfs_extlen_t +xfs_allocbt_calc_size( + struct xfs_mount *mp, + unsigned long long len) +{ + return xfs_btree_calc_size(mp->m_alloc_mnr, len); +} diff --git a/fs/xfs/libxfs/xfs_alloc_btree.h b/fs/xfs/libxfs/xfs_alloc_btree.h index 45e189e7e81c..2fd54728871c 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.h +++ b/fs/xfs/libxfs/xfs_alloc_btree.h @@ -61,5 +61,7 @@ extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *, struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t, xfs_btnum_t); extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int); +extern xfs_extlen_t xfs_allocbt_calc_size(struct xfs_mount *mp, + unsigned long long len); #endif /* __XFS_ALLOC_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 35a124400d60..c3d02a66d39d 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -236,7 +236,7 @@ xfs_attr_set( args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; args.total = xfs_attr_calc_size(&args, &local); - error = xfs_qm_dqattach(dp, 0); + error = xfs_qm_dqattach(dp); if (error) return error; @@ -427,7 +427,7 @@ xfs_attr_remove( */ args.op_flags = XFS_DA_OP_OKNOENT; - error = xfs_qm_dqattach(dp, 0); + error = xfs_qm_dqattach(dp); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 21be186067a2..83a6d3c7f872 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -620,7 +620,7 @@ xfs_attr_rmtval_remove( /* * If the "remote" value is in the cache, remove it. */ - bp = xfs_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK); + bp = xfs_buf_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK); if (bp) { xfs_buf_stale(bp); xfs_buf_relse(bp); diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 040eeda8426f..7b0e2b551e23 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -246,7 +246,7 @@ xfs_bmap_get_bp( struct xfs_btree_cur *cur, xfs_fsblock_t bno) { - struct xfs_log_item_desc *lidp; + struct xfs_log_item *lip; int i; if (!cur) @@ -260,9 +260,9 @@ xfs_bmap_get_bp( } /* Chase down all the log items to see if the bp is there */ - list_for_each_entry(lidp, &cur->bc_tp->t_items, lid_trans) { - struct xfs_buf_log_item *bip; - bip = (struct xfs_buf_log_item *)lidp->lid_item; + list_for_each_entry(lip, &cur->bc_tp->t_items, li_trans) { + struct xfs_buf_log_item *bip = (struct xfs_buf_log_item *)lip; + if (bip->bli_item.li_type == XFS_LI_BUF && XFS_BUF_ADDR(bip->bli_buf) == bno) return bip->bli_buf; @@ -312,8 +312,9 @@ xfs_check_block( xfs_warn(mp, "%s: thispa(%d) == pp(%d) %Ld", __func__, j, i, (unsigned long long)be64_to_cpu(*thispa)); - panic("%s: ptrs are equal in node\n", + xfs_err(mp, "%s: ptrs are equal in node\n", __func__); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); } } } @@ -483,7 +484,8 @@ error0: error_norelse: xfs_warn(mp, "%s: BAD after btree leaves for %d extents", __func__, i); - panic("%s: CORRUPTED BTREE OR SOMETHING", __func__); + xfs_err(mp, "%s: CORRUPTED BTREE OR SOMETHING", __func__); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); return; } @@ -542,12 +544,13 @@ xfs_bmap_validate_ret( * The list is maintained sorted (by block number). */ void -xfs_bmap_add_free( +__xfs_bmap_add_free( struct xfs_mount *mp, struct xfs_defer_ops *dfops, xfs_fsblock_t bno, xfs_filblks_t len, - struct xfs_owner_info *oinfo) + struct xfs_owner_info *oinfo, + bool skip_discard) { struct xfs_extent_free_item *new; /* new element */ #ifdef DEBUG @@ -574,6 +577,7 @@ xfs_bmap_add_free( new->xefi_oinfo = *oinfo; else xfs_rmap_skip_owner_update(&new->xefi_oinfo); + new->xefi_skip_discard = skip_discard; trace_xfs_bmap_free_defer(mp, XFS_FSB_TO_AGNO(mp, bno), 0, XFS_FSB_TO_AGBNO(mp, bno), len); xfs_defer_add(dfops, XFS_DEFER_OPS_TYPE_FREE, &new->xefi_list); @@ -2001,10 +2005,13 @@ xfs_bmap_add_extent_delay_real( ASSERT(0); } - /* add reverse mapping */ - error = xfs_rmap_map_extent(mp, bma->dfops, bma->ip, whichfork, new); - if (error) - goto done; + /* add reverse mapping unless caller opted out */ + if (!(bma->flags & XFS_BMAPI_NORMAP)) { + error = xfs_rmap_map_extent(mp, bma->dfops, bma->ip, + whichfork, new); + if (error) + goto done; + } /* convert to a btree if necessary */ if (xfs_bmap_needs_btree(bma->ip, whichfork)) { @@ -2668,7 +2675,8 @@ xfs_bmap_add_extent_hole_real( struct xfs_bmbt_irec *new, xfs_fsblock_t *first, struct xfs_defer_ops *dfops, - int *logflagsp) + int *logflagsp, + int flags) { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); struct xfs_mount *mp = ip->i_mount; @@ -2845,10 +2853,12 @@ xfs_bmap_add_extent_hole_real( break; } - /* add reverse mapping */ - error = xfs_rmap_map_extent(mp, dfops, ip, whichfork, new); - if (error) - goto done; + /* add reverse mapping unless caller opted out */ + if (!(flags & XFS_BMAPI_NORMAP)) { + error = xfs_rmap_map_extent(mp, dfops, ip, whichfork, new); + if (error) + goto done; + } /* convert to a btree if necessary */ if (xfs_bmap_needs_btree(ip, whichfork)) { @@ -4123,7 +4133,8 @@ xfs_bmapi_allocate( else error = xfs_bmap_add_extent_hole_real(bma->tp, bma->ip, whichfork, &bma->icur, &bma->cur, &bma->got, - bma->firstblock, bma->dfops, &bma->logflags); + bma->firstblock, bma->dfops, &bma->logflags, + bma->flags); bma->logflags |= tmp_logflags; if (error) @@ -4509,30 +4520,37 @@ error0: return error; } -static int +int xfs_bmapi_remap( struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock, - struct xfs_defer_ops *dfops) + struct xfs_defer_ops *dfops, + int flags) { struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + struct xfs_ifork *ifp; struct xfs_btree_cur *cur = NULL; xfs_fsblock_t firstblock = NULLFSBLOCK; struct xfs_bmbt_irec got; struct xfs_iext_cursor icur; + int whichfork = xfs_bmapi_whichfork(flags); int logflags = 0, error; + ifp = XFS_IFORK_PTR(ip, whichfork); ASSERT(len > 0); ASSERT(len <= (xfs_filblks_t)MAXEXTLEN); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC | + XFS_BMAPI_NORMAP))); + ASSERT((flags & (XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC)) != + (XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC)); if (unlikely(XFS_TEST_ERROR( - (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE), + (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE), mp, XFS_ERRTAG_BMAPIFORMAT))) { XFS_ERROR_REPORT("xfs_bmapi_remap", XFS_ERRLEVEL_LOW, mp); return -EFSCORRUPTED; @@ -4542,7 +4560,7 @@ xfs_bmapi_remap( return -EIO; if (!(ifp->if_flags & XFS_IFEXTENTS)) { - error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); + error = xfs_iread_extents(tp, ip, whichfork); if (error) return error; } @@ -4557,7 +4575,7 @@ xfs_bmapi_remap( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); if (ifp->if_flags & XFS_IFBROOT) { - cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK); + cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); cur->bc_private.b.firstblock = firstblock; cur->bc_private.b.dfops = dfops; cur->bc_private.b.flags = 0; @@ -4566,18 +4584,21 @@ xfs_bmapi_remap( got.br_startoff = bno; got.br_startblock = startblock; got.br_blockcount = len; - got.br_state = XFS_EXT_NORM; + if (flags & XFS_BMAPI_PREALLOC) + got.br_state = XFS_EXT_UNWRITTEN; + else + got.br_state = XFS_EXT_NORM; - error = xfs_bmap_add_extent_hole_real(tp, ip, XFS_DATA_FORK, &icur, - &cur, &got, &firstblock, dfops, &logflags); + error = xfs_bmap_add_extent_hole_real(tp, ip, whichfork, &icur, + &cur, &got, &firstblock, dfops, &logflags, flags); if (error) goto error0; - if (xfs_bmap_wants_extents(ip, XFS_DATA_FORK)) { + if (xfs_bmap_wants_extents(ip, whichfork)) { int tmp_logflags = 0; error = xfs_bmap_btree_to_extents(tp, ip, cur, - &tmp_logflags, XFS_DATA_FORK); + &tmp_logflags, whichfork); logflags |= tmp_logflags; } @@ -5104,9 +5125,12 @@ xfs_bmap_del_extent_real( error = xfs_refcount_decrease_extent(mp, dfops, del); if (error) goto done; - } else - xfs_bmap_add_free(mp, dfops, del->br_startblock, - del->br_blockcount, NULL); + } else { + __xfs_bmap_add_free(mp, dfops, del->br_startblock, + del->br_blockcount, NULL, + (bflags & XFS_BMAPI_NODISCARD) || + del->br_state == XFS_EXT_UNWRITTEN); + } } /* @@ -6148,7 +6172,7 @@ xfs_bmap_finish_one( switch (type) { case XFS_BMAP_MAP: error = xfs_bmapi_remap(tp, ip, startoff, *blockcount, - startblock, dfops); + startblock, dfops, 0); *blockcount = 0; break; case XFS_BMAP_UNMAP: diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 2b766b37096d..2c233f9f1a26 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -68,6 +68,7 @@ struct xfs_extent_free_item xfs_extlen_t xefi_blockcount;/* number of blocks in extent */ struct list_head xefi_list; struct xfs_owner_info xefi_oinfo; /* extent owner */ + bool xefi_skip_discard; }; #define XFS_BMAP_MAX_NMAP 4 @@ -116,6 +117,12 @@ struct xfs_extent_free_item /* Only convert unwritten extents, don't allocate new blocks */ #define XFS_BMAPI_CONVERT_ONLY 0x800 +/* Skip online discard of freed extents */ +#define XFS_BMAPI_NODISCARD 0x1000 + +/* Do not update the rmap btree. Used for reconstructing bmbt from rmapbt. */ +#define XFS_BMAPI_NORMAP 0x2000 + #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ { XFS_BMAPI_METADATA, "METADATA" }, \ @@ -128,7 +135,9 @@ struct xfs_extent_free_item { XFS_BMAPI_REMAP, "REMAP" }, \ { XFS_BMAPI_COWFORK, "COWFORK" }, \ { XFS_BMAPI_DELALLOC, "DELALLOC" }, \ - { XFS_BMAPI_CONVERT_ONLY, "CONVERT_ONLY" } + { XFS_BMAPI_CONVERT_ONLY, "CONVERT_ONLY" }, \ + { XFS_BMAPI_NODISCARD, "NODISCARD" }, \ + { XFS_BMAPI_NORMAP, "NORMAP" } static inline int xfs_bmapi_aflag(int w) @@ -192,9 +201,9 @@ void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno, void xfs_trim_extent_eof(struct xfs_bmbt_irec *, struct xfs_inode *); int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd); void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork); -void xfs_bmap_add_free(struct xfs_mount *mp, struct xfs_defer_ops *dfops, +void __xfs_bmap_add_free(struct xfs_mount *mp, struct xfs_defer_ops *dfops, xfs_fsblock_t bno, xfs_filblks_t len, - struct xfs_owner_info *oinfo); + struct xfs_owner_info *oinfo, bool skip_discard); void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork); int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip, xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork); @@ -240,6 +249,17 @@ int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur, int eof); +static inline void +xfs_bmap_add_free( + struct xfs_mount *mp, + struct xfs_defer_ops *dfops, + xfs_fsblock_t bno, + xfs_filblks_t len, + struct xfs_owner_info *oinfo) +{ + __xfs_bmap_add_free(mp, dfops, bno, len, oinfo, false); +} + enum xfs_bmap_intent_type { XFS_BMAP_MAP = 1, XFS_BMAP_UNMAP, @@ -277,4 +297,8 @@ static inline int xfs_bmap_fork_to_state(int whichfork) xfs_failaddr_t xfs_bmap_validate_extent(struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *irec); +int xfs_bmapi_remap(struct xfs_trans *tp, struct xfs_inode *ip, + xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock, + struct xfs_defer_ops *dfops, int flags); + #endif /* __XFS_BMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index d89d06bea6e3..ac9d4aeedb09 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -660,3 +660,12 @@ xfs_bmbt_change_owner( xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); return error; } + +/* Calculate the bmap btree size for some records. */ +unsigned long long +xfs_bmbt_calc_size( + struct xfs_mount *mp, + unsigned long long len) +{ + return xfs_btree_calc_size(mp->m_bmap_dmnr, len); +} diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h index e4505746ccaa..fb3cd2d9e0f8 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.h +++ b/fs/xfs/libxfs/xfs_bmap_btree.h @@ -118,4 +118,7 @@ extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip, extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, int); +extern unsigned long long xfs_bmbt_calc_size(struct xfs_mount *mp, + unsigned long long len); + #endif /* __XFS_BMAP_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index ac7d66427e42..c825c8182b30 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -4836,14 +4836,14 @@ xfs_btree_query_all( * Calculate the number of blocks needed to store a given number of records * in a short-format (per-AG metadata) btree. */ -xfs_extlen_t +unsigned long long xfs_btree_calc_size( uint *limits, unsigned long long len) { int level; int maxrecs; - xfs_extlen_t rval; + unsigned long long rval; maxrecs = limits[0]; for (level = 0, rval = 0; len > 1; level++) { @@ -4919,3 +4919,24 @@ xfs_btree_has_record( *exists = false; return error; } + +/* Are there more records in this btree? */ +bool +xfs_btree_has_more_records( + struct xfs_btree_cur *cur) +{ + struct xfs_btree_block *block; + struct xfs_buf *bp; + + block = xfs_btree_get_block(cur, 0, &bp); + + /* There are still records in this block. */ + if (cur->bc_ptrs[0] < xfs_btree_get_numrecs(block)) + return true; + + /* There are more record blocks. */ + if (cur->bc_flags & XFS_BTREE_LONG_PTRS) + return block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK); + else + return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK); +} diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 9227159a751e..d7911efee6dc 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -482,7 +482,7 @@ xfs_failaddr_t xfs_btree_lblock_verify(struct xfs_buf *bp, unsigned int max_recs); uint xfs_btree_compute_maxlevels(uint *limits, unsigned long len); -xfs_extlen_t xfs_btree_calc_size(uint *limits, unsigned long long len); +unsigned long long xfs_btree_calc_size(uint *limits, unsigned long long len); /* return codes */ #define XFS_BTREE_QUERY_RANGE_CONTINUE 0 /* keep iterating */ @@ -528,5 +528,6 @@ union xfs_btree_key *xfs_btree_high_key_from_key(struct xfs_btree_cur *cur, union xfs_btree_key *key); int xfs_btree_has_record(struct xfs_btree_cur *cur, union xfs_btree_irec *low, union xfs_btree_irec *high, bool *exists); +bool xfs_btree_has_more_records(struct xfs_btree_cur *cur); #endif /* __XFS_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 087fea02c389..3daf175e2535 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -220,7 +220,7 @@ xfs_defer_trans_abort( { struct xfs_defer_pending *dfp; - trace_xfs_defer_trans_abort(tp->t_mountp, dop); + trace_xfs_defer_trans_abort(tp->t_mountp, dop, _RET_IP_); /* Abort intent items that don't have a done item. */ list_for_each_entry(dfp, &dop->dop_pending, dfp_list) { @@ -253,7 +253,7 @@ xfs_defer_trans_roll( for (i = 0; i < XFS_DEFER_OPS_NR_BUFS && dop->dop_bufs[i]; i++) xfs_trans_dirty_buf(*tp, dop->dop_bufs[i]); - trace_xfs_defer_trans_roll((*tp)->t_mountp, dop); + trace_xfs_defer_trans_roll((*tp)->t_mountp, dop, _RET_IP_); /* Roll the transaction. */ error = xfs_trans_roll(tp); @@ -352,10 +352,21 @@ xfs_defer_finish( void *state; int error = 0; void (*cleanup_fn)(struct xfs_trans *, void *, int); + struct xfs_defer_ops *orig_dop; ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); - trace_xfs_defer_finish((*tp)->t_mountp, dop); + trace_xfs_defer_finish((*tp)->t_mountp, dop, _RET_IP_); + + /* + * Attach dfops to the transaction during deferred ops processing. This + * explicitly causes calls into the allocator to defer AGFL block frees. + * Note that this code can go away once all dfops users attach to the + * associated tp. + */ + ASSERT(!(*tp)->t_agfl_dfops || ((*tp)->t_agfl_dfops == dop)); + orig_dop = (*tp)->t_agfl_dfops; + (*tp)->t_agfl_dfops = dop; /* Until we run out of pending work to finish... */ while (xfs_defer_has_unfinished_work(dop)) { @@ -428,10 +439,11 @@ xfs_defer_finish( } out: + (*tp)->t_agfl_dfops = orig_dop; if (error) trace_xfs_defer_finish_error((*tp)->t_mountp, dop, error); else - trace_xfs_defer_finish_done((*tp)->t_mountp, dop); + trace_xfs_defer_finish_done((*tp)->t_mountp, dop, _RET_IP_); return error; } @@ -447,7 +459,7 @@ xfs_defer_cancel( struct list_head *pwi; struct list_head *n; - trace_xfs_defer_cancel(NULL, dop); + trace_xfs_defer_cancel(NULL, dop, _RET_IP_); /* * Free the pending items. Caller should already have arranged @@ -532,5 +544,5 @@ xfs_defer_init( *fbp = NULLFSBLOCK; INIT_LIST_HEAD(&dop->dop_intake); INIT_LIST_HEAD(&dop->dop_pending); - trace_xfs_defer_init(NULL, dop); + trace_xfs_defer_init(NULL, dop, _RET_IP_); } diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 045beacdd37d..e70725ba1f5f 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -55,6 +55,7 @@ enum xfs_defer_ops_type { XFS_DEFER_OPS_TYPE_REFCOUNT, XFS_DEFER_OPS_TYPE_RMAP, XFS_DEFER_OPS_TYPE_FREE, + XFS_DEFER_OPS_TYPE_AGFL_FREE, XFS_DEFER_OPS_TYPE_MAX, }; diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index 8b7a6c3cb599..cce520becee4 100644 --- a/fs/xfs/libxfs/xfs_dquot_buf.c +++ b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -41,14 +41,18 @@ xfs_calc_dquots_per_chunk( /* * Do some primitive error checking on ondisk dquot data structures. + * + * The xfs_dqblk structure /contains/ the xfs_disk_dquot structure; + * we verify them separately because at some points we have only the + * smaller xfs_disk_dquot structure available. */ + xfs_failaddr_t xfs_dquot_verify( struct xfs_mount *mp, xfs_disk_dquot_t *ddq, xfs_dqid_t id, - uint type, /* used only when IO_dorepair is true */ - uint flags) + uint type) /* used only during quotacheck */ { /* * We can encounter an uninitialized dquot buffer for 2 reasons: @@ -70,6 +74,8 @@ xfs_dquot_verify( if (ddq->d_version != XFS_DQUOT_VERSION) return __this_address; + if (type && ddq->d_flags != type) + return __this_address; if (ddq->d_flags != XFS_DQ_USER && ddq->d_flags != XFS_DQ_PROJ && ddq->d_flags != XFS_DQ_GROUP) @@ -99,33 +105,44 @@ xfs_dquot_verify( return NULL; } +xfs_failaddr_t +xfs_dqblk_verify( + struct xfs_mount *mp, + struct xfs_dqblk *dqb, + xfs_dqid_t id, + uint type) /* used only during quotacheck */ +{ + if (xfs_sb_version_hascrc(&mp->m_sb) && + !uuid_equal(&dqb->dd_uuid, &mp->m_sb.sb_meta_uuid)) + return __this_address; + + return xfs_dquot_verify(mp, &dqb->dd_diskdq, id, type); +} + /* * Do some primitive error checking on ondisk dquot data structures. */ int -xfs_dquot_repair( +xfs_dqblk_repair( struct xfs_mount *mp, - struct xfs_disk_dquot *ddq, + struct xfs_dqblk *dqb, xfs_dqid_t id, uint type) { - struct xfs_dqblk *d = (struct xfs_dqblk *)ddq; - - /* * Typically, a repair is only requested by quotacheck. */ ASSERT(id != -1); - memset(d, 0, sizeof(xfs_dqblk_t)); + memset(dqb, 0, sizeof(xfs_dqblk_t)); - d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); - d->dd_diskdq.d_version = XFS_DQUOT_VERSION; - d->dd_diskdq.d_flags = type; - d->dd_diskdq.d_id = cpu_to_be32(id); + dqb->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); + dqb->dd_diskdq.d_version = XFS_DQUOT_VERSION; + dqb->dd_diskdq.d_flags = type; + dqb->dd_diskdq.d_id = cpu_to_be32(id); if (xfs_sb_version_hascrc(&mp->m_sb)) { - uuid_copy(&d->dd_uuid, &mp->m_sb.sb_meta_uuid); - xfs_update_cksum((char *)d, sizeof(struct xfs_dqblk), + uuid_copy(&dqb->dd_uuid, &mp->m_sb.sb_meta_uuid); + xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF); } @@ -135,7 +152,8 @@ xfs_dquot_repair( STATIC bool xfs_dquot_buf_verify_crc( struct xfs_mount *mp, - struct xfs_buf *bp) + struct xfs_buf *bp, + bool readahead) { struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; int ndquots; @@ -156,10 +174,12 @@ xfs_dquot_buf_verify_crc( for (i = 0; i < ndquots; i++, d++) { if (!xfs_verify_cksum((char *)d, sizeof(struct xfs_dqblk), - XFS_DQUOT_CRC_OFF)) - return false; - if (!uuid_equal(&d->dd_uuid, &mp->m_sb.sb_meta_uuid)) + XFS_DQUOT_CRC_OFF)) { + if (!readahead) + xfs_buf_verifier_error(bp, -EFSBADCRC, __func__, + d, sizeof(*d), __this_address); return false; + } } return true; } @@ -167,9 +187,10 @@ xfs_dquot_buf_verify_crc( STATIC xfs_failaddr_t xfs_dquot_buf_verify( struct xfs_mount *mp, - struct xfs_buf *bp) + struct xfs_buf *bp, + bool readahead) { - struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; + struct xfs_dqblk *dqb = bp->b_addr; xfs_failaddr_t fa; xfs_dqid_t id = 0; int ndquots; @@ -195,14 +216,19 @@ xfs_dquot_buf_verify( for (i = 0; i < ndquots; i++) { struct xfs_disk_dquot *ddq; - ddq = &d[i].dd_diskdq; + ddq = &dqb[i].dd_diskdq; if (i == 0) id = be32_to_cpu(ddq->d_id); - fa = xfs_dquot_verify(mp, ddq, id + i, 0, 0); - if (fa) + fa = xfs_dqblk_verify(mp, &dqb[i], id + i, 0); + if (fa) { + if (!readahead) + xfs_buf_verifier_error(bp, -EFSCORRUPTED, + __func__, &dqb[i], + sizeof(struct xfs_dqblk), fa); return fa; + } } return NULL; @@ -214,7 +240,7 @@ xfs_dquot_buf_verify_struct( { struct xfs_mount *mp = bp->b_target->bt_mount; - return xfs_dquot_buf_verify(mp, bp); + return xfs_dquot_buf_verify(mp, bp, false); } static void @@ -222,15 +248,10 @@ xfs_dquot_buf_read_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; - xfs_failaddr_t fa; - if (!xfs_dquot_buf_verify_crc(mp, bp)) - xfs_verifier_error(bp, -EFSBADCRC, __this_address); - else { - fa = xfs_dquot_buf_verify(mp, bp); - if (fa) - xfs_verifier_error(bp, -EFSCORRUPTED, __this_address); - } + if (!xfs_dquot_buf_verify_crc(mp, bp, false)) + return; + xfs_dquot_buf_verify(mp, bp, false); } /* @@ -245,8 +266,8 @@ xfs_dquot_buf_readahead_verify( { struct xfs_mount *mp = bp->b_target->bt_mount; - if (!xfs_dquot_buf_verify_crc(mp, bp) || - xfs_dquot_buf_verify(mp, bp) != NULL) { + if (!xfs_dquot_buf_verify_crc(mp, bp, true) || + xfs_dquot_buf_verify(mp, bp, true) != NULL) { xfs_buf_ioerror(bp, -EIO); bp->b_flags &= ~XBF_DONE; } @@ -262,11 +283,8 @@ xfs_dquot_buf_write_verify( struct xfs_buf *bp) { struct xfs_mount *mp = bp->b_target->bt_mount; - xfs_failaddr_t fa; - fa = xfs_dquot_buf_verify(mp, bp); - if (fa) - xfs_verifier_error(bp, -EFSCORRUPTED, __this_address); + xfs_dquot_buf_verify(mp, bp, false); } const struct xfs_buf_ops xfs_dquot_buf_ops = { diff --git a/fs/xfs/libxfs/xfs_errortag.h b/fs/xfs/libxfs/xfs_errortag.h index bc1789d95152..d47b91625945 100644 --- a/fs/xfs/libxfs/xfs_errortag.h +++ b/fs/xfs/libxfs/xfs_errortag.h @@ -65,7 +65,8 @@ #define XFS_ERRTAG_LOG_BAD_CRC 29 #define XFS_ERRTAG_LOG_ITEM_PIN 30 #define XFS_ERRTAG_BUF_LRU_REF 31 -#define XFS_ERRTAG_MAX 32 +#define XFS_ERRTAG_FORCE_SCRUB_REPAIR 32 +#define XFS_ERRTAG_MAX 33 /* * Random factors for above tags, 1 means always, 2 means 1/2 time, etc. @@ -102,5 +103,6 @@ #define XFS_RANDOM_LOG_BAD_CRC 1 #define XFS_RANDOM_LOG_ITEM_PIN 1 #define XFS_RANDOM_BUF_LRU_REF 2 +#define XFS_RANDOM_FORCE_SCRUB_REPAIR 1 #endif /* __XFS_ERRORTAG_H_ */ diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 42956d8d95ed..c1cb29a5f4f6 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -98,6 +98,9 @@ struct xfs_ifork; XFS_SB_VERSION2_PROJID32BIT | \ XFS_SB_VERSION2_FTYPE) +/* Maximum size of the xfs filesystem label, no terminating NULL */ +#define XFSLABEL_MAX 12 + /* * Superblock - in core version. Must match the ondisk version below. * Must be padded to 64 bit alignment. @@ -122,7 +125,7 @@ typedef struct xfs_sb { uint16_t sb_sectsize; /* volume sector size, bytes */ uint16_t sb_inodesize; /* inode size, bytes */ uint16_t sb_inopblock; /* inodes per block */ - char sb_fname[12]; /* file system name */ + char sb_fname[XFSLABEL_MAX]; /* file system name */ uint8_t sb_blocklog; /* log2 of sb_blocksize */ uint8_t sb_sectlog; /* log2 of sb_sectsize */ uint8_t sb_inodelog; /* log2 of sb_inodesize */ @@ -213,7 +216,7 @@ typedef struct xfs_dsb { __be16 sb_sectsize; /* volume sector size, bytes */ __be16 sb_inodesize; /* inode size, bytes */ __be16 sb_inopblock; /* inodes per block */ - char sb_fname[12]; /* file system name */ + char sb_fname[XFSLABEL_MAX]; /* file system name */ __u8 sb_blocklog; /* log2 of sb_blocksize */ __u8 sb_sectlog; /* log2 of sb_sectsize */ __u8 sb_inodelog; /* log2 of sb_inodesize */ diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index faf1a4edd618..dddc75e4f1f6 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -542,13 +542,20 @@ struct xfs_scrub_metadata { /* o: Metadata object looked funny but isn't corrupt. */ #define XFS_SCRUB_OFLAG_WARNING (1 << 6) +/* + * o: IFLAG_REPAIR was set but metadata object did not need fixing or + * optimization and has therefore not been altered. + */ +#define XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED (1 << 7) + #define XFS_SCRUB_FLAGS_IN (XFS_SCRUB_IFLAG_REPAIR) #define XFS_SCRUB_FLAGS_OUT (XFS_SCRUB_OFLAG_CORRUPT | \ XFS_SCRUB_OFLAG_PREEN | \ XFS_SCRUB_OFLAG_XFAIL | \ XFS_SCRUB_OFLAG_XCORRUPT | \ XFS_SCRUB_OFLAG_INCOMPLETE | \ - XFS_SCRUB_OFLAG_WARNING) + XFS_SCRUB_OFLAG_WARNING | \ + XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED) #define XFS_SCRUB_FLAGS_ALL (XFS_SCRUB_FLAGS_IN | XFS_SCRUB_FLAGS_OUT) /* diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index de627fa19168..4ca4ff7a757d 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -148,7 +148,7 @@ xfs_inobt_get_rec( /* * Insert a single inobt record. Cursor must already point to desired location. */ -STATIC int +int xfs_inobt_insert_rec( struct xfs_btree_cur *cur, uint16_t holemask, diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index c5402bb4ce0c..77fffced8bac 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -176,6 +176,9 @@ int xfs_ialloc_has_inode_record(struct xfs_btree_cur *cur, xfs_agino_t low, xfs_agino_t high, bool *exists); int xfs_ialloc_count_inodes(struct xfs_btree_cur *cur, xfs_agino_t *count, xfs_agino_t *freecount); +int xfs_inobt_insert_rec(struct xfs_btree_cur *cur, uint16_t holemask, + uint8_t count, int32_t freecount, xfs_inofree_t free, + int *stat); int xfs_ialloc_cluster_alignment(struct xfs_mount *mp); void xfs_ialloc_agino_range(struct xfs_mount *mp, xfs_agnumber_t agno, diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 367e9a0726e6..b04c55512159 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -296,7 +296,7 @@ xfs_inobt_verify( case cpu_to_be32(XFS_FIBT_MAGIC): break; default: - return NULL; + return __this_address; } /* level verification */ @@ -608,3 +608,12 @@ xfs_finobt_calc_reserves( *used += tree_len; return 0; } + +/* Calculate the inobt btree size for some records. */ +xfs_extlen_t +xfs_iallocbt_calc_size( + struct xfs_mount *mp, + unsigned long long len) +{ + return xfs_btree_calc_size(mp->m_inobt_mnr, len); +} diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h index aa81e2e63f3f..4acdd5458d59 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.h +++ b/fs/xfs/libxfs/xfs_ialloc_btree.h @@ -74,5 +74,7 @@ int xfs_inobt_rec_check_count(struct xfs_mount *, int xfs_finobt_calc_reserves(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used); +extern xfs_extlen_t xfs_iallocbt_calc_size(struct xfs_mount *mp, + unsigned long long len); #endif /* __XFS_IALLOC_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h index bb1b13a9b5f4..d4af2804b178 100644 --- a/fs/xfs/libxfs/xfs_quota_defs.h +++ b/fs/xfs/libxfs/xfs_quota_defs.h @@ -107,14 +107,12 @@ typedef uint16_t xfs_qwarncnt_t; * to a single function. None of these XFS_QMOPT_* flags are meant to have * persistent values (ie. their values can and will change between versions) */ -#define XFS_QMOPT_DQALLOC 0x0000002 /* alloc dquot ondisk if needed */ #define XFS_QMOPT_UQUOTA 0x0000004 /* user dquot requested */ #define XFS_QMOPT_PQUOTA 0x0000008 /* project dquot requested */ #define XFS_QMOPT_FORCE_RES 0x0000010 /* ignore quota limits */ #define XFS_QMOPT_SBVERSION 0x0000040 /* change superblock version num */ #define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */ #define XFS_QMOPT_ENOSPC 0x0004000 /* enospc instead of edquot (prj) */ -#define XFS_QMOPT_DQNEXT 0x0008000 /* return next dquot >= this ID */ /* * flags to xfs_trans_mod_dquot to indicate which field needs to be @@ -152,10 +150,11 @@ typedef uint16_t xfs_qwarncnt_t; #define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS) extern xfs_failaddr_t xfs_dquot_verify(struct xfs_mount *mp, - struct xfs_disk_dquot *ddq, xfs_dqid_t id, uint type, - uint flags); + struct xfs_disk_dquot *ddq, xfs_dqid_t id, uint type); +extern xfs_failaddr_t xfs_dqblk_verify(struct xfs_mount *mp, + struct xfs_dqblk *dqb, xfs_dqid_t id, uint type); extern int xfs_calc_dquots_per_chunk(unsigned int nbblks); -extern int xfs_dquot_repair(struct xfs_mount *mp, struct xfs_disk_dquot *ddq, +extern int xfs_dqblk_repair(struct xfs_mount *mp, struct xfs_dqblk *dqb, xfs_dqid_t id, uint type); #endif /* __XFS_QUOTA_H__ */ diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c index 560e28473024..418d53295893 100644 --- a/fs/xfs/libxfs/xfs_refcount.c +++ b/fs/xfs/libxfs/xfs_refcount.c @@ -88,8 +88,25 @@ xfs_refcount_lookup_ge( return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); } +/* + * Look up the first record equal to [bno, len] in the btree + * given by cur. + */ +int +xfs_refcount_lookup_eq( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + int *stat) +{ + trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_private.a.agno, bno, + XFS_LOOKUP_LE); + cur->bc_rec.rc.rc_startblock = bno; + cur->bc_rec.rc.rc_blockcount = 0; + return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat); +} + /* Convert on-disk record to in-core format. */ -static inline void +void xfs_refcount_btrec_to_irec( union xfs_btree_rec *rec, struct xfs_refcount_irec *irec) @@ -149,7 +166,7 @@ xfs_refcount_update( * by [bno, len, refcount]. * This either works (return 0) or gets an EFSCORRUPTED error. */ -STATIC int +int xfs_refcount_insert( struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec, @@ -162,7 +179,10 @@ xfs_refcount_insert( cur->bc_rec.rc.rc_blockcount = irec->rc_blockcount; cur->bc_rec.rc.rc_refcount = irec->rc_refcount; error = xfs_btree_insert(cur, i); + if (error) + goto out_error; XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, *i == 1, out_error); + out_error: if (error) trace_xfs_refcount_insert_error(cur->bc_mp, diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h index 2a731ac68fe4..a92ad9078bc1 100644 --- a/fs/xfs/libxfs/xfs_refcount.h +++ b/fs/xfs/libxfs/xfs_refcount.h @@ -24,6 +24,8 @@ extern int xfs_refcount_lookup_le(struct xfs_btree_cur *cur, xfs_agblock_t bno, int *stat); extern int xfs_refcount_lookup_ge(struct xfs_btree_cur *cur, xfs_agblock_t bno, int *stat); +extern int xfs_refcount_lookup_eq(struct xfs_btree_cur *cur, + xfs_agblock_t bno, int *stat); extern int xfs_refcount_get_rec(struct xfs_btree_cur *cur, struct xfs_refcount_irec *irec, int *stat); @@ -85,5 +87,10 @@ static inline xfs_fileoff_t xfs_refcount_max_unmap(int log_res) extern int xfs_refcount_has_record(struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, bool *exists); +union xfs_btree_rec; +extern void xfs_refcount_btrec_to_irec(union xfs_btree_rec *rec, + struct xfs_refcount_irec *irec); +extern int xfs_refcount_insert(struct xfs_btree_cur *cur, + struct xfs_refcount_irec *irec, int *stat); #endif /* __XFS_REFCOUNT_H__ */ diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index fba8d2718017..c0644f1be8a8 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -1374,6 +1374,8 @@ xfs_rmap_convert_shared( */ error = xfs_rmap_lookup_le_range(cur, bno, owner, offset, flags, &PREV, &i); + if (error) + goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); ASSERT(PREV.rm_offset <= offset); @@ -2030,6 +2032,34 @@ out_error: return error; } +/* Insert a raw rmap into the rmapbt. */ +int +xfs_rmap_map_raw( + struct xfs_btree_cur *cur, + struct xfs_rmap_irec *rmap) +{ + struct xfs_owner_info oinfo; + + oinfo.oi_owner = rmap->rm_owner; + oinfo.oi_offset = rmap->rm_offset; + oinfo.oi_flags = 0; + if (rmap->rm_flags & XFS_RMAP_ATTR_FORK) + oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK; + if (rmap->rm_flags & XFS_RMAP_BMBT_BLOCK) + oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK; + + if (rmap->rm_flags || XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner)) + return xfs_rmap_map(cur, rmap->rm_startblock, + rmap->rm_blockcount, + rmap->rm_flags & XFS_RMAP_UNWRITTEN, + &oinfo); + + return xfs_rmap_map_shared(cur, rmap->rm_startblock, + rmap->rm_blockcount, + rmap->rm_flags & XFS_RMAP_UNWRITTEN, + &oinfo); +} + struct xfs_rmap_query_range_info { xfs_rmap_query_range_fn fn; void *priv; @@ -2453,3 +2483,56 @@ xfs_rmap_record_exists( irec.rm_startblock + irec.rm_blockcount >= bno + len); return 0; } + +struct xfs_rmap_key_state { + uint64_t owner; + uint64_t offset; + unsigned int flags; + bool has_rmap; +}; + +/* For each rmap given, figure out if it doesn't match the key we want. */ +STATIC int +xfs_rmap_has_other_keys_helper( + struct xfs_btree_cur *cur, + struct xfs_rmap_irec *rec, + void *priv) +{ + struct xfs_rmap_key_state *rks = priv; + + if (rks->owner == rec->rm_owner && rks->offset == rec->rm_offset && + ((rks->flags & rec->rm_flags) & XFS_RMAP_KEY_FLAGS) == rks->flags) + return 0; + rks->has_rmap = true; + return XFS_BTREE_QUERY_RANGE_ABORT; +} + +/* + * Given an extent and some owner info, can we find records overlapping + * the extent whose owner info does not match the given owner? + */ +int +xfs_rmap_has_other_keys( + struct xfs_btree_cur *cur, + xfs_agblock_t bno, + xfs_extlen_t len, + struct xfs_owner_info *oinfo, + bool *has_rmap) +{ + struct xfs_rmap_irec low = {0}; + struct xfs_rmap_irec high; + struct xfs_rmap_key_state rks; + int error; + + xfs_owner_info_unpack(oinfo, &rks.owner, &rks.offset, &rks.flags); + rks.has_rmap = false; + + low.rm_startblock = bno; + memset(&high, 0xFF, sizeof(high)); + high.rm_startblock = bno + len - 1; + + error = xfs_rmap_query_range(cur, &low, &high, + xfs_rmap_has_other_keys_helper, &rks); + *has_rmap = rks.has_rmap; + return error; +} diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index 380e53be98d5..43e506f67680 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -238,5 +238,9 @@ int xfs_rmap_has_record(struct xfs_btree_cur *cur, xfs_agblock_t bno, int xfs_rmap_record_exists(struct xfs_btree_cur *cur, xfs_agblock_t bno, xfs_extlen_t len, struct xfs_owner_info *oinfo, bool *has_rmap); +int xfs_rmap_has_other_keys(struct xfs_btree_cur *cur, xfs_agblock_t bno, + xfs_extlen_t len, struct xfs_owner_info *oinfo, + bool *has_rmap); +int xfs_rmap_map_raw(struct xfs_btree_cur *cur, struct xfs_rmap_irec *rmap); #endif /* __XFS_RMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 106be2d0bb88..369eeb7a52ec 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -90,6 +90,9 @@ xfs_rtbuf_get( if (error) return error; + if (nmap == 0 || !xfs_bmap_is_real_extent(&map)) + return -EFSCORRUPTED; + ASSERT(map.br_startblock != NULLFSBLOCK); error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, map.br_startblock), @@ -1033,14 +1036,17 @@ xfs_rtalloc_query_range( int is_free; int error = 0; - if (low_rec->ar_startblock > high_rec->ar_startblock) + if (low_rec->ar_startext > high_rec->ar_startext) return -EINVAL; - else if (low_rec->ar_startblock == high_rec->ar_startblock) + if (low_rec->ar_startext >= mp->m_sb.sb_rextents || + low_rec->ar_startext == high_rec->ar_startext) return 0; + if (high_rec->ar_startext >= mp->m_sb.sb_rextents) + high_rec->ar_startext = mp->m_sb.sb_rextents - 1; /* Iterate the bitmap, looking for discrepancies. */ - rtstart = low_rec->ar_startblock; - rem = high_rec->ar_startblock - rtstart; + rtstart = low_rec->ar_startext; + rem = high_rec->ar_startext - rtstart; while (rem) { /* Is the first block free? */ error = xfs_rtcheck_range(mp, tp, rtstart, 1, 1, &rtend, @@ -1050,13 +1056,13 @@ xfs_rtalloc_query_range( /* How long does the extent go for? */ error = xfs_rtfind_forw(mp, tp, rtstart, - high_rec->ar_startblock - 1, &rtend); + high_rec->ar_startext - 1, &rtend); if (error) break; if (is_free) { - rec.ar_startblock = rtstart; - rec.ar_blockcount = rtend - rtstart + 1; + rec.ar_startext = rtstart; + rec.ar_extcount = rtend - rtstart + 1; error = fn(tp, &rec, priv); if (error) @@ -1079,9 +1085,9 @@ xfs_rtalloc_query_all( { struct xfs_rtalloc_rec keys[2]; - keys[0].ar_startblock = 0; - keys[1].ar_startblock = tp->t_mountp->m_sb.sb_rblocks; - keys[0].ar_blockcount = keys[1].ar_blockcount = 0; + keys[0].ar_startext = 0; + keys[1].ar_startext = tp->t_mountp->m_sb.sb_rextents - 1; + keys[0].ar_extcount = keys[1].ar_extcount = 0; return xfs_rtalloc_query_range(tp, &keys[0], &keys[1], fn, priv); } diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index d9b94bd5f689..d485e14313c6 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -888,6 +888,109 @@ xfs_sync_sb( return xfs_trans_commit(tp); } +/* + * Update all the secondary superblocks to match the new state of the primary. + * Because we are completely overwriting all the existing fields in the + * secondary superblock buffers, there is no need to read them in from disk. + * Just get a new buffer, stamp it and write it. + * + * The sb buffers need to be cached here so that we serialise against other + * operations that access the secondary superblocks, but we don't want to keep + * them in memory once it is written so we mark it as a one-shot buffer. + */ +int +xfs_update_secondary_sbs( + struct xfs_mount *mp) +{ + xfs_agnumber_t agno; + int saved_error = 0; + int error = 0; + LIST_HEAD (buffer_list); + + /* update secondary superblocks. */ + for (agno = 1; agno < mp->m_sb.sb_agcount; agno++) { + struct xfs_buf *bp; + + bp = xfs_buf_get(mp->m_ddev_targp, + XFS_AG_DADDR(mp, agno, XFS_SB_DADDR), + XFS_FSS_TO_BB(mp, 1), 0); + /* + * If we get an error reading or writing alternate superblocks, + * continue. xfs_repair chooses the "best" superblock based + * on most matches; if we break early, we'll leave more + * superblocks un-updated than updated, and xfs_repair may + * pick them over the properly-updated primary. + */ + if (!bp) { + xfs_warn(mp, + "error allocating secondary superblock for ag %d", + agno); + if (!saved_error) + saved_error = -ENOMEM; + continue; + } + + bp->b_ops = &xfs_sb_buf_ops; + xfs_buf_oneshot(bp); + xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); + xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb); + xfs_buf_delwri_queue(bp, &buffer_list); + xfs_buf_relse(bp); + + /* don't hold too many buffers at once */ + if (agno % 16) + continue; + + error = xfs_buf_delwri_submit(&buffer_list); + if (error) { + xfs_warn(mp, + "write error %d updating a secondary superblock near ag %d", + error, agno); + if (!saved_error) + saved_error = error; + continue; + } + } + error = xfs_buf_delwri_submit(&buffer_list); + if (error) { + xfs_warn(mp, + "write error %d updating a secondary superblock near ag %d", + error, agno); + } + + return saved_error ? saved_error : error; +} + +/* + * Same behavior as xfs_sync_sb, except that it is always synchronous and it + * also writes the superblock buffer to disk sector 0 immediately. + */ +int +xfs_sync_sb_buf( + struct xfs_mount *mp) +{ + struct xfs_trans *tp; + int error; + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_sb, 0, 0, 0, &tp); + if (error) + return error; + + xfs_log_sb(tp); + xfs_trans_bhold(tp, mp->m_sb_bp); + xfs_trans_set_sync(tp); + error = xfs_trans_commit(tp); + if (error) + goto out; + /* + * write out the sb buffer to get the changes to disk + */ + error = xfs_bwrite(mp->m_sb_bp); +out: + xfs_buf_relse(mp->m_sb_bp); + return error; +} + int xfs_fs_geometry( struct xfs_sb *sbp, @@ -972,3 +1075,47 @@ xfs_fs_geometry( return 0; } + +/* Read a secondary superblock. */ +int +xfs_sb_read_secondary( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_agnumber_t agno, + struct xfs_buf **bpp) +{ + struct xfs_buf *bp; + int error; + + ASSERT(agno != 0 && agno != NULLAGNUMBER); + error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, + XFS_AG_DADDR(mp, agno, XFS_SB_BLOCK(mp)), + XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_sb_buf_ops); + if (error) + return error; + xfs_buf_set_ref(bp, XFS_SSB_REF); + *bpp = bp; + return 0; +} + +/* Get an uninitialised secondary superblock buffer. */ +int +xfs_sb_get_secondary( + struct xfs_mount *mp, + struct xfs_trans *tp, + xfs_agnumber_t agno, + struct xfs_buf **bpp) +{ + struct xfs_buf *bp; + + ASSERT(agno != 0 && agno != NULLAGNUMBER); + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, + XFS_AG_DADDR(mp, agno, XFS_SB_BLOCK(mp)), + XFS_FSS_TO_BB(mp, 1), 0); + if (!bp) + return -ENOMEM; + bp->b_ops = &xfs_sb_buf_ops; + xfs_buf_oneshot(bp); + *bpp = bp; + return 0; +} diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h index 63dcd2a1a657..244e0162c49e 100644 --- a/fs/xfs/libxfs/xfs_sb.h +++ b/fs/xfs/libxfs/xfs_sb.h @@ -18,6 +18,13 @@ #ifndef __XFS_SB_H__ #define __XFS_SB_H__ +struct xfs_mount; +struct xfs_sb; +struct xfs_dsb; +struct xfs_trans; +struct xfs_fsop_geom; +struct xfs_perag; + /* * perag get/put wrappers for ref counting */ @@ -29,13 +36,22 @@ extern int xfs_initialize_perag_data(struct xfs_mount *, xfs_agnumber_t); extern void xfs_log_sb(struct xfs_trans *tp); extern int xfs_sync_sb(struct xfs_mount *mp, bool wait); +extern int xfs_sync_sb_buf(struct xfs_mount *mp); extern void xfs_sb_mount_common(struct xfs_mount *mp, struct xfs_sb *sbp); extern void xfs_sb_from_disk(struct xfs_sb *to, struct xfs_dsb *from); extern void xfs_sb_to_disk(struct xfs_dsb *to, struct xfs_sb *from); extern void xfs_sb_quota_from_disk(struct xfs_sb *sbp); +extern int xfs_update_secondary_sbs(struct xfs_mount *mp); + #define XFS_FS_GEOM_MAX_STRUCT_VER (4) extern int xfs_fs_geometry(struct xfs_sb *sbp, struct xfs_fsop_geom *geo, int struct_version); +extern int xfs_sb_read_secondary(struct xfs_mount *mp, + struct xfs_trans *tp, xfs_agnumber_t agno, + struct xfs_buf **bpp); +extern int xfs_sb_get_secondary(struct xfs_mount *mp, + struct xfs_trans *tp, xfs_agnumber_t agno, + struct xfs_buf **bpp); #endif /* __XFS_SB_H__ */ diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index d0b84da0cb1e..ae99c260adb1 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -57,21 +57,6 @@ extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops; extern const struct xfs_buf_ops xfs_symlink_buf_ops; extern const struct xfs_buf_ops xfs_rtbuf_ops; -/* - * This structure is used to track log items associated with - * a transaction. It points to the log item and keeps some - * flags to track the state of the log item. It also tracks - * the amount of space needed to log the item it describes - * once we get to commit processing (see xfs_trans_commit()). - */ -struct xfs_log_item_desc { - struct xfs_log_item *lid_item; - struct list_head lid_trans; - unsigned char lid_flags; -}; - -#define XFS_LID_DIRTY 0x1 - /* log size calculation functions */ int xfs_log_calc_unit_res(struct xfs_mount *mp, int unit_bytes); int xfs_log_calc_minimum_size(struct xfs_mount *); @@ -127,6 +112,7 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp, #define XFS_ATTR_BTREE_REF 1 #define XFS_DQUOT_REF 1 #define XFS_REFC_BTREE_REF 1 +#define XFS_SSB_REF 0 /* * Flags for xfs_trans_ichgtime(). diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 3c560695c546..ea18449bd732 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -30,7 +30,7 @@ typedef int64_t xfs_fsize_t; /* bytes in a file */ typedef uint64_t xfs_ufsize_t; /* unsigned bytes in a file */ typedef int32_t xfs_suminfo_t; /* type of bitmap summary info */ -typedef int32_t xfs_rtword_t; /* word type for bitmap manipulations */ +typedef uint32_t xfs_rtword_t; /* word type for bitmap manipulations */ typedef int64_t xfs_lsn_t; /* log sequence number */ typedef int32_t xfs_tid_t; /* transaction identifier */ diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index 018aabbd9394..1f71793f7db4 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -38,68 +38,6 @@ #include "scrub/common.h" #include "scrub/trace.h" -/* - * Walk all the blocks in the AGFL. The fn function can return any negative - * error code or XFS_BTREE_QUERY_RANGE_ABORT. - */ -int -xfs_scrub_walk_agfl( - struct xfs_scrub_context *sc, - int (*fn)(struct xfs_scrub_context *, - xfs_agblock_t bno, void *), - void *priv) -{ - struct xfs_agf *agf; - __be32 *agfl_bno; - struct xfs_mount *mp = sc->mp; - unsigned int flfirst; - unsigned int fllast; - int i; - int error; - - agf = XFS_BUF_TO_AGF(sc->sa.agf_bp); - agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, sc->sa.agfl_bp); - flfirst = be32_to_cpu(agf->agf_flfirst); - fllast = be32_to_cpu(agf->agf_fllast); - - /* Nothing to walk in an empty AGFL. */ - if (agf->agf_flcount == cpu_to_be32(0)) - return 0; - - /* first to last is a consecutive list. */ - if (fllast >= flfirst) { - for (i = flfirst; i <= fllast; i++) { - error = fn(sc, be32_to_cpu(agfl_bno[i]), priv); - if (error) - return error; - if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - return error; - } - - return 0; - } - - /* first to the end */ - for (i = flfirst; i < xfs_agfl_size(mp); i++) { - error = fn(sc, be32_to_cpu(agfl_bno[i]), priv); - if (error) - return error; - if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - return error; - } - - /* the start to last. */ - for (i = 0; i <= fllast; i++) { - error = fn(sc, be32_to_cpu(agfl_bno[i]), priv); - if (error) - return error; - if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - return error; - } - - return 0; -} - /* Superblock */ /* Cross-reference with the other btrees. */ @@ -157,9 +95,7 @@ xfs_scrub_superblock( if (agno == 0) return 0; - error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp, - XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), - XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_sb_buf_ops); + error = xfs_sb_read_secondary(mp, sc->tp, agno, &bp); /* * The superblock verifier can return several different error codes * if it thinks the superblock doesn't look right. For a mount these @@ -680,6 +616,7 @@ struct xfs_scrub_agfl_info { unsigned int sz_entries; unsigned int nr_entries; xfs_agblock_t *entries; + struct xfs_scrub_context *sc; }; /* Cross-reference with the other btrees. */ @@ -701,12 +638,12 @@ xfs_scrub_agfl_block_xref( /* Scrub an AGFL block. */ STATIC int xfs_scrub_agfl_block( - struct xfs_scrub_context *sc, + struct xfs_mount *mp, xfs_agblock_t agbno, void *priv) { - struct xfs_mount *mp = sc->mp; struct xfs_scrub_agfl_info *sai = priv; + struct xfs_scrub_context *sc = sai->sc; xfs_agnumber_t agno = sc->sa.agno; if (xfs_verify_agbno(mp, agno, agbno) && @@ -717,6 +654,9 @@ xfs_scrub_agfl_block( xfs_scrub_agfl_block_xref(sc, agbno, priv); + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return XFS_BTREE_QUERY_RANGE_ABORT; + return 0; } @@ -796,8 +736,10 @@ xfs_scrub_agfl( goto out; } memset(&sai, 0, sizeof(sai)); + sai.sc = sc; sai.sz_entries = agflcount; - sai.entries = kmem_zalloc(sizeof(xfs_agblock_t) * agflcount, KM_NOFS); + sai.entries = kmem_zalloc(sizeof(xfs_agblock_t) * agflcount, + KM_MAYFAIL); if (!sai.entries) { error = -ENOMEM; goto out; @@ -805,7 +747,12 @@ xfs_scrub_agfl( /* Check the blocks in the AGFL. */ xfs_rmap_ag_owner(&sai.oinfo, XFS_RMAP_OWN_AG); - error = xfs_scrub_walk_agfl(sc, xfs_scrub_agfl_block, &sai); + error = xfs_agfl_walk(sc->mp, XFS_BUF_TO_AGF(sc->sa.agf_bp), + sc->sa.agfl_bp, xfs_scrub_agfl_block, &sai); + if (error == XFS_BTREE_QUERY_RANGE_ABORT) { + error = 0; + goto out_free; + } if (error) goto out_free; diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c new file mode 100644 index 000000000000..8b91e9ebe1e7 --- /dev/null +++ b/fs/xfs/scrub/agheader_repair.c @@ -0,0 +1,70 @@ +/* + * Copyright (C) 2018 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" +#include "xfs_rmap.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" + +/* Superblock */ + +/* Repair the superblock. */ +int +xfs_repair_superblock( + struct xfs_scrub_context *sc) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_buf *bp; + xfs_agnumber_t agno; + int error; + + /* Don't try to repair AG 0's sb; let xfs_repair deal with it. */ + agno = sc->sm->sm_agno; + if (agno == 0) + return -EOPNOTSUPP; + + error = xfs_sb_get_secondary(mp, sc->tp, agno, &bp); + if (error) + return error; + + /* Copy AG 0's superblock to this one. */ + xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); + xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb); + + /* Write this to disk. */ + xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF); + xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1); + return error; +} diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c index 517c079d3f68..941a0a55224e 100644 --- a/fs/xfs/scrub/alloc.c +++ b/fs/xfs/scrub/alloc.c @@ -70,7 +70,7 @@ xfs_scrub_allocbt_xref_other( pcur = &sc->sa.cnt_cur; else pcur = &sc->sa.bno_cur; - if (!*pcur) + if (!*pcur || xfs_scrub_skip_xref(sc->sm)) return; error = xfs_alloc_lookup_le(*pcur, agbno, len, &has_otherrec); @@ -172,7 +172,7 @@ xfs_scrub_xref_is_used_space( bool is_freesp; int error; - if (!sc->sa.bno_cur) + if (!sc->sa.bno_cur || xfs_scrub_skip_xref(sc->sm)) return; error = xfs_alloc_has_record(sc->sa.bno_cur, agbno, len, &is_freesp); diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c index 127575f0abfb..84b6d6b66578 100644 --- a/fs/xfs/scrub/attr.c +++ b/fs/xfs/scrub/attr.c @@ -126,8 +126,9 @@ xfs_scrub_xattr_listent( if (args.valuelen != valuelen) xfs_scrub_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno); - fail_xref: + if (sx->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + context->seen_enough = 1; return; } diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 639d14b51e90..eeadb33a701c 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -51,7 +51,6 @@ xfs_scrub_setup_inode_bmap( struct xfs_scrub_context *sc, struct xfs_inode *ip) { - struct xfs_mount *mp = sc->mp; int error; error = xfs_scrub_get_inode(sc, ip); @@ -75,7 +74,7 @@ xfs_scrub_setup_inode_bmap( } /* Got the inode, lock it and we're ready to go. */ - error = xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp); + error = xfs_scrub_trans_alloc(sc, 0); if (error) goto out; sc->ilock_flags |= XFS_ILOCK_EXCL; @@ -175,7 +174,7 @@ xfs_scrub_bmap_xref_rmap( unsigned long long rmap_end; uint64_t owner; - if (!info->sc->sa.rmap_cur) + if (!info->sc->sa.rmap_cur || xfs_scrub_skip_xref(info->sc->sm)) return; if (info->whichfork == XFS_COW_FORK) @@ -684,7 +683,8 @@ xfs_scrub_bmap( info.lastoff = 0; ifp = XFS_IFORK_PTR(ip, whichfork); for_each_xfs_iext(ifp, &icur, &irec) { - if (xfs_scrub_should_terminate(sc, &error)) + if (xfs_scrub_should_terminate(sc, &error) || + (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) break; if (isnullstartblock(irec.br_startblock)) continue; diff --git a/fs/xfs/scrub/btree.c b/fs/xfs/scrub/btree.c index 54218168c8f9..2d29dceaa00e 100644 --- a/fs/xfs/scrub/btree.c +++ b/fs/xfs/scrub/btree.c @@ -442,7 +442,7 @@ xfs_scrub_btree_check_owner( */ if (cur->bc_btnum == XFS_BTNUM_BNO || cur->bc_btnum == XFS_BTNUM_RMAP) { co = kmem_alloc(sizeof(struct check_owner), - KM_MAYFAIL | KM_NOFS); + KM_MAYFAIL); if (!co) return -ENOMEM; co->level = level; @@ -455,6 +455,44 @@ xfs_scrub_btree_check_owner( } /* + * Check that this btree block has at least minrecs records or is one of the + * special blocks that don't require that. + */ +STATIC void +xfs_scrub_btree_check_minrecs( + struct xfs_scrub_btree *bs, + int level, + struct xfs_btree_block *block) +{ + unsigned int numrecs; + int ok_level; + + numrecs = be16_to_cpu(block->bb_numrecs); + + /* More records than minrecs means the block is ok. */ + if (numrecs >= bs->cur->bc_ops->get_minrecs(bs->cur, level)) + return; + + /* + * Certain btree blocks /can/ have fewer than minrecs records. Any + * level greater than or equal to the level of the highest dedicated + * btree block are allowed to violate this constraint. + * + * For a btree rooted in a block, the btree root can have fewer than + * minrecs records. If the btree is rooted in an inode and does not + * store records in the root, the direct children of the root and the + * root itself can have fewer than minrecs records. + */ + ok_level = bs->cur->bc_nlevels - 1; + if (bs->cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) + ok_level--; + if (level >= ok_level) + return; + + xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, level); +} + +/* * Grab and scrub a btree block given a btree pointer. Returns block * and buffer pointers (if applicable) if they're ok to use. */ @@ -491,6 +529,8 @@ xfs_scrub_btree_get_block( if (*pbp) xfs_scrub_buffer_recheck(bs->sc, *pbp); + xfs_scrub_btree_check_minrecs(bs, level, *pblock); + /* * Check the block's owner; this function absorbs error codes * for us. diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c index 8ed91d5c868d..41198a5f872c 100644 --- a/fs/xfs/scrub/common.c +++ b/fs/xfs/scrub/common.c @@ -44,11 +44,14 @@ #include "xfs_rmap_btree.h" #include "xfs_log.h" #include "xfs_trans_priv.h" +#include "xfs_attr.h" +#include "xfs_reflink.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" #include "scrub/btree.h" +#include "scrub/repair.h" /* Common code for the metadata scrubbers. */ @@ -539,6 +542,10 @@ xfs_scrub_ag_free( xfs_trans_brelse(sc->tp, sa->agi_bp); sa->agi_bp = NULL; } + if (sa->pag) { + xfs_perag_put(sa->pag); + sa->pag = NULL; + } sa->agno = NULLAGNUMBER; } @@ -566,15 +573,53 @@ xfs_scrub_ag_init( return xfs_scrub_ag_btcur_init(sc, sa); } +/* + * Grab the per-ag structure if we haven't already gotten it. Teardown of the + * xfs_scrub_ag will release it for us. + */ +void +xfs_scrub_perag_get( + struct xfs_mount *mp, + struct xfs_scrub_ag *sa) +{ + if (!sa->pag) + sa->pag = xfs_perag_get(mp, sa->agno); +} + /* Per-scrubber setup functions */ +/* + * Grab an empty transaction so that we can re-grab locked buffers if + * one of our btrees turns out to be cyclic. + * + * If we're going to repair something, we need to ask for the largest possible + * log reservation so that we can handle the worst case scenario for metadata + * updates while rebuilding a metadata item. We also need to reserve as many + * blocks in the head transaction as we think we're going to need to rebuild + * the metadata object. + */ +int +xfs_scrub_trans_alloc( + struct xfs_scrub_context *sc, + uint resblks) +{ + if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) + return xfs_trans_alloc(sc->mp, &M_RES(sc->mp)->tr_itruncate, + resblks, 0, 0, &sc->tp); + + return xfs_trans_alloc_empty(sc->mp, &sc->tp); +} + /* Set us up with a transaction and an empty context. */ int xfs_scrub_setup_fs( struct xfs_scrub_context *sc, struct xfs_inode *ip) { - return xfs_scrub_trans_alloc(sc->sm, sc->mp, &sc->tp); + uint resblks; + + resblks = xfs_repair_calc_ag_resblks(sc); + return xfs_scrub_trans_alloc(sc, resblks); } /* Set us up with AG headers and btree cursors. */ @@ -695,7 +740,6 @@ xfs_scrub_setup_inode_contents( struct xfs_inode *ip, unsigned int resblks) { - struct xfs_mount *mp = sc->mp; int error; error = xfs_scrub_get_inode(sc, ip); @@ -705,7 +749,7 @@ xfs_scrub_setup_inode_contents( /* Got the inode, lock it and we're ready to go. */ sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; xfs_ilock(sc->ip, sc->ilock_flags); - error = xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp); + error = xfs_scrub_trans_alloc(sc, resblks); if (error) goto out; sc->ilock_flags |= XFS_ILOCK_EXCL; @@ -727,6 +771,10 @@ xfs_scrub_should_check_xref( int *error, struct xfs_btree_cur **curpp) { + /* No point in xref if we already know we're corrupt. */ + if (xfs_scrub_skip_xref(sc->sm)) + return false; + if (*error == 0) return true; @@ -773,3 +821,80 @@ xfs_scrub_buffer_recheck( sc->sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; trace_xfs_scrub_block_error(sc, bp->b_bn, fa); } + +/* + * Scrub the attr/data forks of a metadata inode. The metadata inode must be + * pointed to by sc->ip and the ILOCK must be held. + */ +int +xfs_scrub_metadata_inode_forks( + struct xfs_scrub_context *sc) +{ + __u32 smtype; + bool shared; + int error; + + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return 0; + + /* Metadata inodes don't live on the rt device. */ + if (sc->ip->i_d.di_flags & XFS_DIFLAG_REALTIME) { + xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino); + return 0; + } + + /* They should never participate in reflink. */ + if (xfs_is_reflink_inode(sc->ip)) { + xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino); + return 0; + } + + /* They also should never have extended attributes. */ + if (xfs_inode_hasattr(sc->ip)) { + xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino); + return 0; + } + + /* Invoke the data fork scrubber. */ + smtype = sc->sm->sm_type; + sc->sm->sm_type = XFS_SCRUB_TYPE_BMBTD; + error = xfs_scrub_bmap_data(sc); + sc->sm->sm_type = smtype; + if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + return error; + + /* Look for incorrect shared blocks. */ + if (xfs_sb_version_hasreflink(&sc->mp->m_sb)) { + error = xfs_reflink_inode_has_shared_extents(sc->tp, sc->ip, + &shared); + if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, + &error)) + return error; + if (shared) + xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino); + } + + return error; +} + +/* + * Try to lock an inode in violation of the usual locking order rules. For + * example, trying to get the IOLOCK while in transaction context, or just + * plain breaking AG-order or inode-order inode locking rules. Either way, + * the only way to avoid an ABBA deadlock is to use trylock and back off if + * we can't. + */ +int +xfs_scrub_ilock_inverted( + struct xfs_inode *ip, + uint lock_mode) +{ + int i; + + for (i = 0; i < 20; i++) { + if (xfs_ilock_nowait(ip, lock_mode)) + return 0; + delay(1); + } + return -EDEADLOCK; +} diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h index deaf60400981..76bb2d1d808c 100644 --- a/fs/xfs/scrub/common.h +++ b/fs/xfs/scrub/common.h @@ -38,19 +38,7 @@ xfs_scrub_should_terminate( return false; } -/* - * Grab an empty transaction so that we can re-grab locked buffers if - * one of our btrees turns out to be cyclic. - */ -static inline int -xfs_scrub_trans_alloc( - struct xfs_scrub_metadata *sm, - struct xfs_mount *mp, - struct xfs_trans **tpp) -{ - return xfs_trans_alloc_empty(mp, tpp); -} - +int xfs_scrub_trans_alloc(struct xfs_scrub_context *sc, uint resblks); bool xfs_scrub_process_error(struct xfs_scrub_context *sc, xfs_agnumber_t agno, xfs_agblock_t bno, int *error); bool xfs_scrub_fblock_process_error(struct xfs_scrub_context *sc, int whichfork, @@ -135,16 +123,13 @@ xfs_scrub_setup_quota(struct xfs_scrub_context *sc, struct xfs_inode *ip) void xfs_scrub_ag_free(struct xfs_scrub_context *sc, struct xfs_scrub_ag *sa); int xfs_scrub_ag_init(struct xfs_scrub_context *sc, xfs_agnumber_t agno, struct xfs_scrub_ag *sa); +void xfs_scrub_perag_get(struct xfs_mount *mp, struct xfs_scrub_ag *sa); int xfs_scrub_ag_read_headers(struct xfs_scrub_context *sc, xfs_agnumber_t agno, struct xfs_buf **agi, struct xfs_buf **agf, struct xfs_buf **agfl); void xfs_scrub_ag_btcur_free(struct xfs_scrub_ag *sa); int xfs_scrub_ag_btcur_init(struct xfs_scrub_context *sc, struct xfs_scrub_ag *sa); -int xfs_scrub_walk_agfl(struct xfs_scrub_context *sc, - int (*fn)(struct xfs_scrub_context *, xfs_agblock_t bno, - void *), - void *priv); int xfs_scrub_count_rmap_ownedby_ag(struct xfs_scrub_context *sc, struct xfs_btree_cur *cur, struct xfs_owner_info *oinfo, @@ -157,4 +142,17 @@ int xfs_scrub_setup_inode_contents(struct xfs_scrub_context *sc, struct xfs_inode *ip, unsigned int resblks); void xfs_scrub_buffer_recheck(struct xfs_scrub_context *sc, struct xfs_buf *bp); +/* + * Don't bother cross-referencing if we already found corruption or cross + * referencing discrepancies. + */ +static inline bool xfs_scrub_skip_xref(struct xfs_scrub_metadata *sm) +{ + return sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | + XFS_SCRUB_OFLAG_XCORRUPT); +} + +int xfs_scrub_metadata_inode_forks(struct xfs_scrub_context *sc); +int xfs_scrub_ilock_inverted(struct xfs_inode *ip, uint lock_mode); + #endif /* __XFS_SCRUB_COMMON_H__ */ diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index 38f29806eb54..1a4309b3e786 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -172,7 +172,7 @@ xfs_scrub_dir_actor( error = xfs_dir_lookup(sdc->sc->tp, ip, &xname, &lookup_ino, NULL); if (!xfs_scrub_fblock_process_error(sdc->sc, XFS_DATA_FORK, offset, &error)) - goto fail_xref; + goto out; if (lookup_ino != ino) { xfs_scrub_fblock_set_corrupt(sdc->sc, XFS_DATA_FORK, offset); goto out; @@ -183,8 +183,13 @@ xfs_scrub_dir_actor( if (error) goto out; out: - return error; -fail_xref: + /* + * A negative error code returned here is supposed to cause the + * dir_emit caller (xfs_readdir) to abort the directory iteration + * and return zero to xfs_scrub_directory. + */ + if (error == 0 && sdc->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + return -EFSCORRUPTED; return error; } @@ -240,6 +245,9 @@ xfs_scrub_dir_rec( } xfs_scrub_buffer_recheck(ds->sc, bp); + if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out_relse; + dent = (struct xfs_dir2_data_entry *)(((char *)bp->b_addr) + off); /* Make sure we got a real directory entry. */ @@ -357,6 +365,9 @@ xfs_scrub_directory_data_bestfree( /* XXX: Check xfs_dir3_data_hdr.pad is zero once we start setting it. */ + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out_buf; + /* Do the bestfrees correspond to actual free space? */ bf = d_ops->data_bestfree_p(bp->b_addr); smallest_bestfree = UINT_MAX; @@ -413,14 +424,18 @@ xfs_scrub_directory_data_bestfree( /* Spot check this free entry */ tag = be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)); - if (tag != ((char *)dup - (char *)bp->b_addr)) + if (tag != ((char *)dup - (char *)bp->b_addr)) { xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + goto out_buf; + } /* * Either this entry is a bestfree or it's smaller than * any of the bestfrees. */ xfs_scrub_directory_check_free_entry(sc, lblk, bf, dup); + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out_buf; /* Move on. */ newlen = be16_to_cpu(dup->length); @@ -546,6 +561,8 @@ xfs_scrub_directory_leaf1_bestfree( } if (leafhdr.stale != stale) xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk); + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; /* Check all the bestfree entries. */ for (i = 0; i < bestcount; i++, bestp++) { @@ -556,9 +573,11 @@ xfs_scrub_directory_leaf1_bestfree( i * args->geo->fsbcount, -1, &dbp); if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) - continue; + break; xfs_scrub_directory_check_freesp(sc, lblk, dbp, best); xfs_trans_brelse(sc->tp, dbp); + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; } out: return error; @@ -607,7 +626,7 @@ xfs_scrub_directory_free_bestfree( -1, &dbp); if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error)) - continue; + break; xfs_scrub_directory_check_freesp(sc, lblk, dbp, best); xfs_trans_brelse(sc->tp, dbp); } @@ -656,7 +675,7 @@ xfs_scrub_directory_blocks( /* Iterate all the data extents in the directory... */ found = xfs_iext_lookup_extent(sc->ip, ifp, lblk, &icur, &got); - while (found) { + while (found && !(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) { /* Block directories only have a single block at offset 0. */ if (is_block && (got.br_startoff > 0 || @@ -719,7 +738,7 @@ xfs_scrub_directory_blocks( /* Scan for free blocks */ lblk = free_lblk; found = xfs_iext_lookup_extent(sc->ip, ifp, lblk, &icur, &got); - while (found) { + while (found && !(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) { /* * Dirs can't have blocks mapped above 2^32. * Single-block dirs shouldn't even be here. diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c index 106ca4bd753f..00a834d3b56d 100644 --- a/fs/xfs/scrub/ialloc.c +++ b/fs/xfs/scrub/ialloc.c @@ -387,7 +387,8 @@ xfs_scrub_iallocbt_xref_rmap_btreeblks( int error; if (!sc->sa.ino_cur || !sc->sa.rmap_cur || - (xfs_sb_version_hasfinobt(&sc->mp->m_sb) && !sc->sa.fino_cur)) + (xfs_sb_version_hasfinobt(&sc->mp->m_sb) && !sc->sa.fino_cur) || + xfs_scrub_skip_xref(sc->sm)) return; /* Check that we saw as many inobt blocks as the rmap says. */ @@ -424,7 +425,7 @@ xfs_scrub_iallocbt_xref_rmap_inodes( xfs_filblks_t blocks; int error; - if (!sc->sa.rmap_cur) + if (!sc->sa.rmap_cur || xfs_scrub_skip_xref(sc->sm)) return; /* Check that we saw as many inode blocks as the rmap knows about. */ @@ -496,7 +497,7 @@ xfs_scrub_xref_inode_check( bool has_inodes; int error; - if (!(*icur)) + if (!(*icur) || xfs_scrub_skip_xref(sc->sm)) return; error = xfs_ialloc_has_inodes_at_extent(*icur, agbno, len, &has_inodes); diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index df14930e4fc5..0c696f7018de 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -55,7 +55,6 @@ xfs_scrub_setup_inode( struct xfs_scrub_context *sc, struct xfs_inode *ip) { - struct xfs_mount *mp = sc->mp; int error; /* @@ -68,7 +67,7 @@ xfs_scrub_setup_inode( break; case -EFSCORRUPTED: case -EFSBADCRC: - return xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp); + return xfs_scrub_trans_alloc(sc, 0); default: return error; } @@ -76,7 +75,7 @@ xfs_scrub_setup_inode( /* Got the inode, lock it and we're ready to go. */ sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; xfs_ilock(sc->ip, sc->ilock_flags); - error = xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp); + error = xfs_scrub_trans_alloc(sc, 0); if (error) goto out; sc->ilock_flags |= XFS_ILOCK_EXCL; @@ -449,7 +448,7 @@ xfs_scrub_inode_xref_finobt( int has_record; int error; - if (!sc->sa.fino_cur) + if (!sc->sa.fino_cur || xfs_scrub_skip_xref(sc->sm)) return; agino = XFS_INO_TO_AGINO(sc->mp, ino); @@ -492,6 +491,9 @@ xfs_scrub_inode_xref_bmap( xfs_filblks_t acount; int error; + if (xfs_scrub_skip_xref(sc->sm)) + return; + /* Walk all the extents to check nextents/naextents/nblocks. */ error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK, &nextents, &count); diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c index 1fb88c18d455..77c6b22c6bfd 100644 --- a/fs/xfs/scrub/parent.c +++ b/fs/xfs/scrub/parent.c @@ -147,6 +147,9 @@ xfs_scrub_parent_validate( *try_again = false; + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; + /* '..' must not point to ourselves. */ if (sc->ip->i_ino == dnum) { xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); @@ -211,7 +214,9 @@ xfs_scrub_parent_validate( */ xfs_iunlock(sc->ip, sc->ilock_flags); sc->ilock_flags = 0; - xfs_ilock(dp, XFS_IOLOCK_SHARED); + error = xfs_scrub_ilock_inverted(dp, XFS_IOLOCK_SHARED); + if (error) + goto out_rele; /* Go looking for our dentry. */ error = xfs_scrub_parent_count_parent_dentries(sc, dp, &nlink); @@ -220,8 +225,10 @@ xfs_scrub_parent_validate( /* Drop the parent lock, relock this inode. */ xfs_iunlock(dp, XFS_IOLOCK_SHARED); + error = xfs_scrub_ilock_inverted(sc->ip, XFS_IOLOCK_EXCL); + if (error) + goto out_rele; sc->ilock_flags = XFS_IOLOCK_EXCL; - xfs_ilock(sc->ip, sc->ilock_flags); /* * If we're an unlinked directory, the parent /won't/ have a link @@ -323,5 +330,13 @@ xfs_scrub_parent( if (try_again && tries == 20) xfs_scrub_set_incomplete(sc); out: + /* + * If we failed to lock the parent inode even after a retry, just mark + * this scrub incomplete and return. + */ + if (sc->try_harder && error == -EDEADLOCK) { + error = 0; + xfs_scrub_set_incomplete(sc); + } return error; } diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c index 6ba465e6c885..15ae4d23d6ac 100644 --- a/fs/xfs/scrub/quota.c +++ b/fs/xfs/scrub/quota.c @@ -66,25 +66,43 @@ xfs_scrub_setup_quota( struct xfs_inode *ip) { uint dqtype; + int error; + + if (!XFS_IS_QUOTA_RUNNING(sc->mp) || !XFS_IS_QUOTA_ON(sc->mp)) + return -ENOENT; dqtype = xfs_scrub_quota_to_dqtype(sc); if (dqtype == 0) return -EINVAL; + sc->has_quotaofflock = true; + mutex_lock(&sc->mp->m_quotainfo->qi_quotaofflock); if (!xfs_this_quota_on(sc->mp, dqtype)) return -ENOENT; + error = xfs_scrub_setup_fs(sc, ip); + if (error) + return error; + sc->ip = xfs_quota_inode(sc->mp, dqtype); + xfs_ilock(sc->ip, XFS_ILOCK_EXCL); + sc->ilock_flags = XFS_ILOCK_EXCL; return 0; } /* Quotas. */ +struct xfs_scrub_quota_info { + struct xfs_scrub_context *sc; + xfs_dqid_t last_id; +}; + /* Scrub the fields in an individual quota item. */ -STATIC void +STATIC int xfs_scrub_quota_item( - struct xfs_scrub_context *sc, - uint dqtype, struct xfs_dquot *dq, - xfs_dqid_t id) + uint dqtype, + void *priv) { + struct xfs_scrub_quota_info *sqi = priv; + struct xfs_scrub_context *sc = sqi->sc; struct xfs_mount *mp = sc->mp; struct xfs_disk_dquot *d = &dq->q_core; struct xfs_quotainfo *qi = mp->m_quotainfo; @@ -99,17 +117,18 @@ xfs_scrub_quota_item( unsigned long long icount; unsigned long long rcount; xfs_ino_t fs_icount; - - offset = id / qi->qi_dqperchunk; + xfs_dqid_t id = be32_to_cpu(d->d_id); /* - * We fed $id and DQNEXT into the xfs_qm_dqget call, which means - * that the actual dquot we got must either have the same id or - * the next higher id. + * Except for the root dquot, the actual dquot we got must either have + * the same or higher id as we saw before. */ - if (id > be32_to_cpu(d->d_id)) + offset = id / qi->qi_dqperchunk; + if (id && id <= sqi->last_id) xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); + sqi->last_id = id; + /* Did we get the dquot type we wanted? */ if (dqtype != (d->d_flags & XFS_DQ_ALLTYPES)) xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, offset); @@ -183,115 +202,85 @@ xfs_scrub_quota_item( xfs_scrub_fblock_set_warning(sc, XFS_DATA_FORK, offset); if (id != 0 && rhard != 0 && rcount > rhard) xfs_scrub_fblock_set_warning(sc, XFS_DATA_FORK, offset); + + return 0; } -/* Scrub all of a quota type's items. */ -int -xfs_scrub_quota( +/* Check the quota's data fork. */ +STATIC int +xfs_scrub_quota_data_fork( struct xfs_scrub_context *sc) { struct xfs_bmbt_irec irec = { 0 }; - struct xfs_mount *mp = sc->mp; - struct xfs_inode *ip; - struct xfs_quotainfo *qi = mp->m_quotainfo; - struct xfs_dquot *dq; + struct xfs_iext_cursor icur; + struct xfs_quotainfo *qi = sc->mp->m_quotainfo; + struct xfs_ifork *ifp; xfs_fileoff_t max_dqid_off; - xfs_fileoff_t off = 0; - xfs_dqid_t id = 0; - uint dqtype; - int nimaps; int error = 0; - if (!XFS_IS_QUOTA_RUNNING(mp) || !XFS_IS_QUOTA_ON(mp)) - return -ENOENT; - - mutex_lock(&qi->qi_quotaofflock); - dqtype = xfs_scrub_quota_to_dqtype(sc); - if (!xfs_this_quota_on(sc->mp, dqtype)) { - error = -ENOENT; - goto out_unlock_quota; - } - - /* Attach to the quota inode and set sc->ip so that reporting works. */ - ip = xfs_quota_inode(sc->mp, dqtype); - sc->ip = ip; + /* Invoke the fork scrubber. */ + error = xfs_scrub_metadata_inode_forks(sc); + if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + return error; - /* Look for problem extents. */ - xfs_ilock(ip, XFS_ILOCK_EXCL); - if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) { - xfs_scrub_ino_set_corrupt(sc, sc->ip->i_ino); - goto out_unlock_inode; - } + /* Check for data fork problems that apply only to quota files. */ max_dqid_off = ((xfs_dqid_t)-1) / qi->qi_dqperchunk; - while (1) { + ifp = XFS_IFORK_PTR(sc->ip, XFS_DATA_FORK); + for_each_xfs_iext(ifp, &icur, &irec) { if (xfs_scrub_should_terminate(sc, &error)) break; - - off = irec.br_startoff + irec.br_blockcount; - nimaps = 1; - error = xfs_bmapi_read(ip, off, -1, &irec, &nimaps, - XFS_BMAPI_ENTIRE); - if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, off, - &error)) - goto out_unlock_inode; - if (!nimaps) - break; - if (irec.br_startblock == HOLESTARTBLOCK) - continue; - - /* Check the extent record doesn't point to crap. */ - if (irec.br_startblock + irec.br_blockcount <= - irec.br_startblock) - xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, - irec.br_startoff); - if (!xfs_verify_fsbno(mp, irec.br_startblock) || - !xfs_verify_fsbno(mp, irec.br_startblock + - irec.br_blockcount - 1)) - xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, - irec.br_startoff); - /* - * Unwritten extents or blocks mapped above the highest + * delalloc extents or blocks mapped above the highest * quota id shouldn't happen. */ if (isnullstartblock(irec.br_startblock) || irec.br_startoff > max_dqid_off || - irec.br_startoff + irec.br_blockcount > max_dqid_off + 1) - xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, off); + irec.br_startoff + irec.br_blockcount - 1 > max_dqid_off) { + xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, + irec.br_startoff); + break; + } } - xfs_iunlock(ip, XFS_ILOCK_EXCL); - if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) - goto out; - /* Check all the quota items. */ - while (id < ((xfs_dqid_t)-1ULL)) { - if (xfs_scrub_should_terminate(sc, &error)) - break; + return error; +} - error = xfs_qm_dqget(mp, NULL, id, dqtype, XFS_QMOPT_DQNEXT, - &dq); - if (error == -ENOENT) - break; - if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, - id * qi->qi_dqperchunk, &error)) - break; +/* Scrub all of a quota type's items. */ +int +xfs_scrub_quota( + struct xfs_scrub_context *sc) +{ + struct xfs_scrub_quota_info sqi; + struct xfs_mount *mp = sc->mp; + struct xfs_quotainfo *qi = mp->m_quotainfo; + uint dqtype; + int error = 0; - xfs_scrub_quota_item(sc, dqtype, dq, id); + dqtype = xfs_scrub_quota_to_dqtype(sc); - id = be32_to_cpu(dq->q_core.d_id) + 1; - xfs_qm_dqput(dq); - if (!id) - break; - } + /* Look for problem extents. */ + error = xfs_scrub_quota_data_fork(sc); + if (error) + goto out; + if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) + goto out; + + /* + * Check all the quota items. Now that we've checked the quota inode + * data fork we have to drop ILOCK_EXCL to use the regular dquot + * functions. + */ + xfs_iunlock(sc->ip, sc->ilock_flags); + sc->ilock_flags = 0; + sqi.sc = sc; + sqi.last_id = 0; + error = xfs_qm_dqiterate(mp, dqtype, xfs_scrub_quota_item, &sqi); + sc->ilock_flags = XFS_ILOCK_EXCL; + xfs_ilock(sc->ip, sc->ilock_flags); + if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, + sqi.last_id * qi->qi_dqperchunk, &error)) + goto out; out: - /* We set sc->ip earlier, so make sure we clear it now. */ - sc->ip = NULL; -out_unlock_quota: - mutex_unlock(&qi->qi_quotaofflock); return error; - -out_unlock_inode: - xfs_iunlock(ip, XFS_ILOCK_EXCL); - goto out; } diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c index 400f1561cd3d..324a5f159145 100644 --- a/fs/xfs/scrub/refcount.c +++ b/fs/xfs/scrub/refcount.c @@ -150,7 +150,7 @@ xfs_scrub_refcountbt_rmap_check( * so we don't need insertion sort here. */ frag = kmem_alloc(sizeof(struct xfs_scrub_refcnt_frag), - KM_MAYFAIL | KM_NOFS); + KM_MAYFAIL); if (!frag) return -ENOMEM; memcpy(&frag->rm, rec, sizeof(frag->rm)); @@ -310,7 +310,7 @@ xfs_scrub_refcountbt_xref_rmap( struct xfs_scrub_refcnt_frag *n; int error; - if (!sc->sa.rmap_cur) + if (!sc->sa.rmap_cur || xfs_scrub_skip_xref(sc->sm)) return; /* Cross-reference with the rmapbt to confirm the refcount. */ @@ -404,7 +404,7 @@ xfs_scrub_refcount_xref_rmap( xfs_filblks_t blocks; int error; - if (!sc->sa.rmap_cur) + if (!sc->sa.rmap_cur || xfs_scrub_skip_xref(sc->sm)) return; /* Check that we saw as many refcbt blocks as the rmap knows about. */ @@ -460,7 +460,7 @@ xfs_scrub_xref_is_cow_staging( int has_refcount; int error; - if (!sc->sa.refc_cur) + if (!sc->sa.refc_cur || xfs_scrub_skip_xref(sc->sm)) return; /* Find the CoW staging extent. */ @@ -504,7 +504,7 @@ xfs_scrub_xref_is_not_shared( bool shared; int error; - if (!sc->sa.refc_cur) + if (!sc->sa.refc_cur || xfs_scrub_skip_xref(sc->sm)) return; error = xfs_refcount_has_record(sc->sa.refc_cur, agbno, len, &shared); diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c new file mode 100644 index 000000000000..e3e8fba1c99c --- /dev/null +++ b/fs/xfs/scrub/repair.c @@ -0,0 +1,1089 @@ +/* + * Copyright (C) 2018 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_btree.h" +#include "xfs_bit.h" +#include "xfs_log_format.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_inode.h" +#include "xfs_icache.h" +#include "xfs_alloc.h" +#include "xfs_alloc_btree.h" +#include "xfs_ialloc.h" +#include "xfs_ialloc_btree.h" +#include "xfs_rmap.h" +#include "xfs_rmap_btree.h" +#include "xfs_refcount.h" +#include "xfs_refcount_btree.h" +#include "xfs_extent_busy.h" +#include "xfs_ag_resv.h" +#include "xfs_trans_space.h" +#include "xfs_quota.h" +#include "scrub/xfs_scrub.h" +#include "scrub/scrub.h" +#include "scrub/common.h" +#include "scrub/trace.h" +#include "scrub/repair.h" + +/* + * Attempt to repair some metadata, if the metadata is corrupt and userspace + * told us to fix it. This function returns -EAGAIN to mean "re-run scrub", + * and will set *fixed to true if it thinks it repaired anything. + */ +int +xfs_repair_attempt( + struct xfs_inode *ip, + struct xfs_scrub_context *sc, + bool *fixed) +{ + int error = 0; + + trace_xfs_repair_attempt(ip, sc->sm, error); + + xfs_scrub_ag_btcur_free(&sc->sa); + + /* Repair whatever's broken. */ + ASSERT(sc->ops->repair); + error = sc->ops->repair(sc); + trace_xfs_repair_done(ip, sc->sm, error); + switch (error) { + case 0: + /* + * Repair succeeded. Commit the fixes and perform a second + * scrub so that we can tell userspace if we fixed the problem. + */ + sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; + *fixed = true; + return -EAGAIN; + case -EDEADLOCK: + case -EAGAIN: + /* Tell the caller to try again having grabbed all the locks. */ + if (!sc->try_harder) { + sc->try_harder = true; + return -EAGAIN; + } + /* + * We tried harder but still couldn't grab all the resources + * we needed to fix it. The corruption has not been fixed, + * so report back to userspace. + */ + return -EFSCORRUPTED; + default: + return error; + } +} + +/* + * Complain about unfixable problems in the filesystem. We don't log + * corruptions when IFLAG_REPAIR wasn't set on the assumption that the driver + * program is xfs_scrub, which will call back with IFLAG_REPAIR set if the + * administrator isn't running xfs_scrub in no-repairs mode. + * + * Use this helper function because _ratelimited silently declares a static + * structure to track rate limiting information. + */ +void +xfs_repair_failure( + struct xfs_mount *mp) +{ + xfs_alert_ratelimited(mp, +"Corruption not fixed during online repair. Unmount and run xfs_repair."); +} + +/* + * Repair probe -- userspace uses this to probe if we're willing to repair a + * given mountpoint. + */ +int +xfs_repair_probe( + struct xfs_scrub_context *sc) +{ + int error = 0; + + if (xfs_scrub_should_terminate(sc, &error)) + return error; + + return 0; +} + +/* + * Roll a transaction, keeping the AG headers locked and reinitializing + * the btree cursors. + */ +int +xfs_repair_roll_ag_trans( + struct xfs_scrub_context *sc) +{ + int error; + + /* Keep the AG header buffers locked so we can keep going. */ + xfs_trans_bhold(sc->tp, sc->sa.agi_bp); + xfs_trans_bhold(sc->tp, sc->sa.agf_bp); + xfs_trans_bhold(sc->tp, sc->sa.agfl_bp); + + /* Roll the transaction. */ + error = xfs_trans_roll(&sc->tp); + if (error) + goto out_release; + + /* Join AG headers to the new transaction. */ + xfs_trans_bjoin(sc->tp, sc->sa.agi_bp); + xfs_trans_bjoin(sc->tp, sc->sa.agf_bp); + xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp); + + return 0; + +out_release: + /* + * Rolling failed, so release the hold on the buffers. The + * buffers will be released during teardown on our way out + * of the kernel. + */ + xfs_trans_bhold_release(sc->tp, sc->sa.agi_bp); + xfs_trans_bhold_release(sc->tp, sc->sa.agf_bp); + xfs_trans_bhold_release(sc->tp, sc->sa.agfl_bp); + + return error; +} + +/* + * Does the given AG have enough space to rebuild a btree? Neither AG + * reservation can be critical, and we must have enough space (factoring + * in AG reservations) to construct a whole btree. + */ +bool +xfs_repair_ag_has_space( + struct xfs_perag *pag, + xfs_extlen_t nr_blocks, + enum xfs_ag_resv_type type) +{ + return !xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) && + !xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA) && + pag->pagf_freeblks > xfs_ag_resv_needed(pag, type) + nr_blocks; +} + +/* + * Figure out how many blocks to reserve for an AG repair. We calculate the + * worst case estimate for the number of blocks we'd need to rebuild one of + * any type of per-AG btree. + */ +xfs_extlen_t +xfs_repair_calc_ag_resblks( + struct xfs_scrub_context *sc) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_scrub_metadata *sm = sc->sm; + struct xfs_perag *pag; + struct xfs_buf *bp; + xfs_agino_t icount = 0; + xfs_extlen_t aglen = 0; + xfs_extlen_t usedlen; + xfs_extlen_t freelen; + xfs_extlen_t bnobt_sz; + xfs_extlen_t inobt_sz; + xfs_extlen_t rmapbt_sz; + xfs_extlen_t refcbt_sz; + int error; + + if (!(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)) + return 0; + + /* Use in-core counters if possible. */ + pag = xfs_perag_get(mp, sm->sm_agno); + if (pag->pagi_init) + icount = pag->pagi_count; + + /* + * Otherwise try to get the actual counters from disk; if not, make + * some worst case assumptions. + */ + if (icount == 0) { + error = xfs_ialloc_read_agi(mp, NULL, sm->sm_agno, &bp); + if (error) { + icount = mp->m_sb.sb_agblocks / mp->m_sb.sb_inopblock; + } else { + icount = pag->pagi_count; + xfs_buf_relse(bp); + } + } + + /* Now grab the block counters from the AGF. */ + error = xfs_alloc_read_agf(mp, NULL, sm->sm_agno, 0, &bp); + if (error) { + aglen = mp->m_sb.sb_agblocks; + freelen = aglen; + usedlen = aglen; + } else { + aglen = be32_to_cpu(XFS_BUF_TO_AGF(bp)->agf_length); + freelen = pag->pagf_freeblks; + usedlen = aglen - freelen; + xfs_buf_relse(bp); + } + xfs_perag_put(pag); + + trace_xfs_repair_calc_ag_resblks(mp, sm->sm_agno, icount, aglen, + freelen, usedlen); + + /* + * Figure out how many blocks we'd need worst case to rebuild + * each type of btree. Note that we can only rebuild the + * bnobt/cntbt or inobt/finobt as pairs. + */ + bnobt_sz = 2 * xfs_allocbt_calc_size(mp, freelen); + if (xfs_sb_version_hassparseinodes(&mp->m_sb)) + inobt_sz = xfs_iallocbt_calc_size(mp, icount / + XFS_INODES_PER_HOLEMASK_BIT); + else + inobt_sz = xfs_iallocbt_calc_size(mp, icount / + XFS_INODES_PER_CHUNK); + if (xfs_sb_version_hasfinobt(&mp->m_sb)) + inobt_sz *= 2; + if (xfs_sb_version_hasreflink(&mp->m_sb)) + refcbt_sz = xfs_refcountbt_calc_size(mp, usedlen); + else + refcbt_sz = 0; + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { + /* + * Guess how many blocks we need to rebuild the rmapbt. + * For non-reflink filesystems we can't have more records than + * used blocks. However, with reflink it's possible to have + * more than one rmap record per AG block. We don't know how + * many rmaps there could be in the AG, so we start off with + * what we hope is an generous over-estimation. + */ + if (xfs_sb_version_hasreflink(&mp->m_sb)) + rmapbt_sz = xfs_rmapbt_calc_size(mp, + (unsigned long long)aglen * 2); + else + rmapbt_sz = xfs_rmapbt_calc_size(mp, usedlen); + } else { + rmapbt_sz = 0; + } + + trace_xfs_repair_calc_ag_resblks_btsize(mp, sm->sm_agno, bnobt_sz, + inobt_sz, rmapbt_sz, refcbt_sz); + + return max(max(bnobt_sz, inobt_sz), max(rmapbt_sz, refcbt_sz)); +} + +/* Allocate a block in an AG. */ +int +xfs_repair_alloc_ag_block( + struct xfs_scrub_context *sc, + struct xfs_owner_info *oinfo, + xfs_fsblock_t *fsbno, + enum xfs_ag_resv_type resv) +{ + struct xfs_alloc_arg args = {0}; + xfs_agblock_t bno; + int error; + + switch (resv) { + case XFS_AG_RESV_AGFL: + case XFS_AG_RESV_RMAPBT: + error = xfs_alloc_get_freelist(sc->tp, sc->sa.agf_bp, &bno, 1); + if (error) + return error; + if (bno == NULLAGBLOCK) + return -ENOSPC; + xfs_extent_busy_reuse(sc->mp, sc->sa.agno, bno, + 1, false); + *fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.agno, bno); + if (resv == XFS_AG_RESV_RMAPBT) + xfs_ag_resv_rmapbt_alloc(sc->mp, sc->sa.agno); + return 0; + default: + break; + } + + args.tp = sc->tp; + args.mp = sc->mp; + args.oinfo = *oinfo; + args.fsbno = XFS_AGB_TO_FSB(args.mp, sc->sa.agno, 0); + args.minlen = 1; + args.maxlen = 1; + args.prod = 1; + args.type = XFS_ALLOCTYPE_THIS_AG; + args.resv = resv; + + error = xfs_alloc_vextent(&args); + if (error) + return error; + if (args.fsbno == NULLFSBLOCK) + return -ENOSPC; + ASSERT(args.len == 1); + *fsbno = args.fsbno; + + return 0; +} + +/* Initialize a new AG btree root block with zero entries. */ +int +xfs_repair_init_btblock( + struct xfs_scrub_context *sc, + xfs_fsblock_t fsb, + struct xfs_buf **bpp, + xfs_btnum_t btnum, + const struct xfs_buf_ops *ops) +{ + struct xfs_trans *tp = sc->tp; + struct xfs_mount *mp = sc->mp; + struct xfs_buf *bp; + + trace_xfs_repair_init_btblock(mp, XFS_FSB_TO_AGNO(mp, fsb), + XFS_FSB_TO_AGBNO(mp, fsb), btnum); + + ASSERT(XFS_FSB_TO_AGNO(mp, fsb) == sc->sa.agno); + bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, fsb), + XFS_FSB_TO_BB(mp, 1), 0); + xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); + xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.agno, 0); + xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF); + xfs_trans_log_buf(tp, bp, 0, bp->b_length); + bp->b_ops = ops; + *bpp = bp; + + return 0; +} + +/* + * Reconstructing per-AG Btrees + * + * When a space btree is corrupt, we don't bother trying to fix it. Instead, + * we scan secondary space metadata to derive the records that should be in + * the damaged btree, initialize a fresh btree root, and insert the records. + * Note that for rebuilding the rmapbt we scan all the primary data to + * generate the new records. + * + * However, that leaves the matter of removing all the metadata describing the + * old broken structure. For primary metadata we use the rmap data to collect + * every extent with a matching rmap owner (exlist); we then iterate all other + * metadata structures with the same rmap owner to collect the extents that + * cannot be removed (sublist). We then subtract sublist from exlist to + * derive the blocks that were used by the old btree. These blocks can be + * reaped. + * + * For rmapbt reconstructions we must use different tactics for extent + * collection. First we iterate all primary metadata (this excludes the old + * rmapbt, obviously) to generate new rmap records. The gaps in the rmap + * records are collected as exlist. The bnobt records are collected as + * sublist. As with the other btrees we subtract sublist from exlist, and the + * result (since the rmapbt lives in the free space) are the blocks from the + * old rmapbt. + */ + +/* Collect a dead btree extent for later disposal. */ +int +xfs_repair_collect_btree_extent( + struct xfs_scrub_context *sc, + struct xfs_repair_extent_list *exlist, + xfs_fsblock_t fsbno, + xfs_extlen_t len) +{ + struct xfs_repair_extent *rex; + + trace_xfs_repair_collect_btree_extent(sc->mp, + XFS_FSB_TO_AGNO(sc->mp, fsbno), + XFS_FSB_TO_AGBNO(sc->mp, fsbno), len); + + rex = kmem_alloc(sizeof(struct xfs_repair_extent), KM_MAYFAIL); + if (!rex) + return -ENOMEM; + + INIT_LIST_HEAD(&rex->list); + rex->fsbno = fsbno; + rex->len = len; + list_add_tail(&rex->list, &exlist->list); + + return 0; +} + +/* + * An error happened during the rebuild so the transaction will be cancelled. + * The fs will shut down, and the administrator has to unmount and run repair. + * Therefore, free all the memory associated with the list so we can die. + */ +void +xfs_repair_cancel_btree_extents( + struct xfs_scrub_context *sc, + struct xfs_repair_extent_list *exlist) +{ + struct xfs_repair_extent *rex; + struct xfs_repair_extent *n; + + for_each_xfs_repair_extent_safe(rex, n, exlist) { + list_del(&rex->list); + kmem_free(rex); + } +} + +/* Compare two btree extents. */ +static int +xfs_repair_btree_extent_cmp( + void *priv, + struct list_head *a, + struct list_head *b) +{ + struct xfs_repair_extent *ap; + struct xfs_repair_extent *bp; + + ap = container_of(a, struct xfs_repair_extent, list); + bp = container_of(b, struct xfs_repair_extent, list); + + if (ap->fsbno > bp->fsbno) + return 1; + if (ap->fsbno < bp->fsbno) + return -1; + return 0; +} + +/* + * Remove all the blocks mentioned in @sublist from the extents in @exlist. + * + * The intent is that callers will iterate the rmapbt for all of its records + * for a given owner to generate @exlist; and iterate all the blocks of the + * metadata structures that are not being rebuilt and have the same rmapbt + * owner to generate @sublist. This routine subtracts all the extents + * mentioned in sublist from all the extents linked in @exlist, which leaves + * @exlist as the list of blocks that are not accounted for, which we assume + * are the dead blocks of the old metadata structure. The blocks mentioned in + * @exlist can be reaped. + */ +#define LEFT_ALIGNED (1 << 0) +#define RIGHT_ALIGNED (1 << 1) +int +xfs_repair_subtract_extents( + struct xfs_scrub_context *sc, + struct xfs_repair_extent_list *exlist, + struct xfs_repair_extent_list *sublist) +{ + struct list_head *lp; + struct xfs_repair_extent *ex; + struct xfs_repair_extent *newex; + struct xfs_repair_extent *subex; + xfs_fsblock_t sub_fsb; + xfs_extlen_t sub_len; + int state; + int error = 0; + + if (list_empty(&exlist->list) || list_empty(&sublist->list)) + return 0; + ASSERT(!list_empty(&sublist->list)); + + list_sort(NULL, &exlist->list, xfs_repair_btree_extent_cmp); + list_sort(NULL, &sublist->list, xfs_repair_btree_extent_cmp); + + /* + * Now that we've sorted both lists, we iterate exlist once, rolling + * forward through sublist and/or exlist as necessary until we find an + * overlap or reach the end of either list. We do not reset lp to the + * head of exlist nor do we reset subex to the head of sublist. The + * list traversal is similar to merge sort, but we're deleting + * instead. In this manner we avoid O(n^2) operations. + */ + subex = list_first_entry(&sublist->list, struct xfs_repair_extent, + list); + lp = exlist->list.next; + while (lp != &exlist->list) { + ex = list_entry(lp, struct xfs_repair_extent, list); + + /* + * Advance subex and/or ex until we find a pair that + * intersect or we run out of extents. + */ + while (subex->fsbno + subex->len <= ex->fsbno) { + if (list_is_last(&subex->list, &sublist->list)) + goto out; + subex = list_next_entry(subex, list); + } + if (subex->fsbno >= ex->fsbno + ex->len) { + lp = lp->next; + continue; + } + + /* trim subex to fit the extent we have */ + sub_fsb = subex->fsbno; + sub_len = subex->len; + if (subex->fsbno < ex->fsbno) { + sub_len -= ex->fsbno - subex->fsbno; + sub_fsb = ex->fsbno; + } + if (sub_len > ex->len) + sub_len = ex->len; + + state = 0; + if (sub_fsb == ex->fsbno) + state |= LEFT_ALIGNED; + if (sub_fsb + sub_len == ex->fsbno + ex->len) + state |= RIGHT_ALIGNED; + switch (state) { + case LEFT_ALIGNED: + /* Coincides with only the left. */ + ex->fsbno += sub_len; + ex->len -= sub_len; + break; + case RIGHT_ALIGNED: + /* Coincides with only the right. */ + ex->len -= sub_len; + lp = lp->next; + break; + case LEFT_ALIGNED | RIGHT_ALIGNED: + /* Total overlap, just delete ex. */ + lp = lp->next; + list_del(&ex->list); + kmem_free(ex); + break; + case 0: + /* + * Deleting from the middle: add the new right extent + * and then shrink the left extent. + */ + newex = kmem_alloc(sizeof(struct xfs_repair_extent), + KM_MAYFAIL); + if (!newex) { + error = -ENOMEM; + goto out; + } + INIT_LIST_HEAD(&newex->list); + newex->fsbno = sub_fsb + sub_len; + newex->len = ex->fsbno + ex->len - newex->fsbno; + list_add(&newex->list, &ex->list); + ex->len = sub_fsb - ex->fsbno; + lp = lp->next; + break; + default: + ASSERT(0); + break; + } + } + +out: + return error; +} +#undef LEFT_ALIGNED +#undef RIGHT_ALIGNED + +/* + * Disposal of Blocks from Old per-AG Btrees + * + * Now that we've constructed a new btree to replace the damaged one, we want + * to dispose of the blocks that (we think) the old btree was using. + * Previously, we used the rmapbt to collect the extents (exlist) with the + * rmap owner corresponding to the tree we rebuilt, collected extents for any + * blocks with the same rmap owner that are owned by another data structure + * (sublist), and subtracted sublist from exlist. In theory the extents + * remaining in exlist are the old btree's blocks. + * + * Unfortunately, it's possible that the btree was crosslinked with other + * blocks on disk. The rmap data can tell us if there are multiple owners, so + * if the rmapbt says there is an owner of this block other than @oinfo, then + * the block is crosslinked. Remove the reverse mapping and continue. + * + * If there is one rmap record, we can free the block, which removes the + * reverse mapping but doesn't add the block to the free space. Our repair + * strategy is to hope the other metadata objects crosslinked on this block + * will be rebuilt (atop different blocks), thereby removing all the cross + * links. + * + * If there are no rmap records at all, we also free the block. If the btree + * being rebuilt lives in the free space (bnobt/cntbt/rmapbt) then there isn't + * supposed to be a rmap record and everything is ok. For other btrees there + * had to have been an rmap entry for the block to have ended up on @exlist, + * so if it's gone now there's something wrong and the fs will shut down. + * + * Note: If there are multiple rmap records with only the same rmap owner as + * the btree we're trying to rebuild and the block is indeed owned by another + * data structure with the same rmap owner, then the block will be in sublist + * and therefore doesn't need disposal. If there are multiple rmap records + * with only the same rmap owner but the block is not owned by something with + * the same rmap owner, the block will be freed. + * + * The caller is responsible for locking the AG headers for the entire rebuild + * operation so that nothing else can sneak in and change the AG state while + * we're not looking. We also assume that the caller already invalidated any + * buffers associated with @exlist. + */ + +/* + * Invalidate buffers for per-AG btree blocks we're dumping. This function + * is not intended for use with file data repairs; we have bunmapi for that. + */ +int +xfs_repair_invalidate_blocks( + struct xfs_scrub_context *sc, + struct xfs_repair_extent_list *exlist) +{ + struct xfs_repair_extent *rex; + struct xfs_repair_extent *n; + struct xfs_buf *bp; + xfs_fsblock_t fsbno; + xfs_agblock_t i; + + /* + * For each block in each extent, see if there's an incore buffer for + * exactly that block; if so, invalidate it. The buffer cache only + * lets us look for one buffer at a time, so we have to look one block + * at a time. Avoid invalidating AG headers and post-EOFS blocks + * because we never own those; and if we can't TRYLOCK the buffer we + * assume it's owned by someone else. + */ + for_each_xfs_repair_extent_safe(rex, n, exlist) { + for (fsbno = rex->fsbno, i = rex->len; i > 0; fsbno++, i--) { + /* Skip AG headers and post-EOFS blocks */ + if (!xfs_verify_fsbno(sc->mp, fsbno)) + continue; + bp = xfs_buf_incore(sc->mp->m_ddev_targp, + XFS_FSB_TO_DADDR(sc->mp, fsbno), + XFS_FSB_TO_BB(sc->mp, 1), XBF_TRYLOCK); + if (bp) { + xfs_trans_bjoin(sc->tp, bp); + xfs_trans_binval(sc->tp, bp); + } + } + } + + return 0; +} + +/* Ensure the freelist is the correct size. */ +int +xfs_repair_fix_freelist( + struct xfs_scrub_context *sc, + bool can_shrink) +{ + struct xfs_alloc_arg args = {0}; + + args.mp = sc->mp; + args.tp = sc->tp; + args.agno = sc->sa.agno; + args.alignment = 1; + args.pag = sc->sa.pag; + + return xfs_alloc_fix_freelist(&args, + can_shrink ? 0 : XFS_ALLOC_FLAG_NOSHRINK); +} + +/* + * Put a block back on the AGFL. + */ +STATIC int +xfs_repair_put_freelist( + struct xfs_scrub_context *sc, + xfs_agblock_t agbno) +{ + struct xfs_owner_info oinfo; + int error; + + /* Make sure there's space on the freelist. */ + error = xfs_repair_fix_freelist(sc, true); + if (error) + return error; + + /* + * Since we're "freeing" a lost block onto the AGFL, we have to + * create an rmap for the block prior to merging it or else other + * parts will break. + */ + xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG); + error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.agno, agbno, 1, + &oinfo); + if (error) + return error; + + /* Put the block on the AGFL. */ + error = xfs_alloc_put_freelist(sc->tp, sc->sa.agf_bp, sc->sa.agfl_bp, + agbno, 0); + if (error) + return error; + xfs_extent_busy_insert(sc->tp, sc->sa.agno, agbno, 1, + XFS_EXTENT_BUSY_SKIP_DISCARD); + + return 0; +} + +/* Dispose of a single metadata block. */ +STATIC int +xfs_repair_dispose_btree_block( + struct xfs_scrub_context *sc, + xfs_fsblock_t fsbno, + struct xfs_owner_info *oinfo, + enum xfs_ag_resv_type resv) +{ + struct xfs_btree_cur *cur; + struct xfs_buf *agf_bp = NULL; + xfs_agnumber_t agno; + xfs_agblock_t agbno; + bool has_other_rmap; + int error; + + agno = XFS_FSB_TO_AGNO(sc->mp, fsbno); + agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno); + + /* + * If we are repairing per-inode metadata, we need to read in the AGF + * buffer. Otherwise, we're repairing a per-AG structure, so reuse + * the AGF buffer that the setup functions already grabbed. + */ + if (sc->ip) { + error = xfs_alloc_read_agf(sc->mp, sc->tp, agno, 0, &agf_bp); + if (error) + return error; + if (!agf_bp) + return -ENOMEM; + } else { + agf_bp = sc->sa.agf_bp; + } + cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf_bp, agno); + + /* Can we find any other rmappings? */ + error = xfs_rmap_has_other_keys(cur, agbno, 1, oinfo, &has_other_rmap); + if (error) + goto out_cur; + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + + /* + * If there are other rmappings, this block is cross linked and must + * not be freed. Remove the reverse mapping and move on. Otherwise, + * we were the only owner of the block, so free the extent, which will + * also remove the rmap. + * + * XXX: XFS doesn't support detecting the case where a single block + * metadata structure is crosslinked with a multi-block structure + * because the buffer cache doesn't detect aliasing problems, so we + * can't fix 100% of crosslinking problems (yet). The verifiers will + * blow on writeout, the filesystem will shut down, and the admin gets + * to run xfs_repair. + */ + if (has_other_rmap) + error = xfs_rmap_free(sc->tp, agf_bp, agno, agbno, 1, oinfo); + else if (resv == XFS_AG_RESV_AGFL) + error = xfs_repair_put_freelist(sc, agbno); + else + error = xfs_free_extent(sc->tp, fsbno, 1, oinfo, resv); + if (agf_bp != sc->sa.agf_bp) + xfs_trans_brelse(sc->tp, agf_bp); + if (error) + return error; + + if (sc->ip) + return xfs_trans_roll_inode(&sc->tp, sc->ip); + return xfs_repair_roll_ag_trans(sc); + +out_cur: + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + if (agf_bp != sc->sa.agf_bp) + xfs_trans_brelse(sc->tp, agf_bp); + return error; +} + +/* Dispose of btree blocks from an old per-AG btree. */ +int +xfs_repair_reap_btree_extents( + struct xfs_scrub_context *sc, + struct xfs_repair_extent_list *exlist, + struct xfs_owner_info *oinfo, + enum xfs_ag_resv_type type) +{ + struct xfs_repair_extent *rex; + struct xfs_repair_extent *n; + int error = 0; + + ASSERT(xfs_sb_version_hasrmapbt(&sc->mp->m_sb)); + + /* Dispose of every block from the old btree. */ + for_each_xfs_repair_extent_safe(rex, n, exlist) { + ASSERT(sc->ip != NULL || + XFS_FSB_TO_AGNO(sc->mp, rex->fsbno) == sc->sa.agno); + + trace_xfs_repair_dispose_btree_extent(sc->mp, + XFS_FSB_TO_AGNO(sc->mp, rex->fsbno), + XFS_FSB_TO_AGBNO(sc->mp, rex->fsbno), rex->len); + + for (; rex->len > 0; rex->len--, rex->fsbno++) { + error = xfs_repair_dispose_btree_block(sc, rex->fsbno, + oinfo, type); + if (error) + goto out; + } + list_del(&rex->list); + kmem_free(rex); + } + +out: + xfs_repair_cancel_btree_extents(sc, exlist); + return error; +} + +/* + * Finding per-AG Btree Roots for AGF/AGI Reconstruction + * + * If the AGF or AGI become slightly corrupted, it may be necessary to rebuild + * the AG headers by using the rmap data to rummage through the AG looking for + * btree roots. This is not guaranteed to work if the AG is heavily damaged + * or the rmap data are corrupt. + * + * Callers of xfs_repair_find_ag_btree_roots must lock the AGF and AGFL + * buffers if the AGF is being rebuilt; or the AGF and AGI buffers if the + * AGI is being rebuilt. It must maintain these locks until it's safe for + * other threads to change the btrees' shapes. The caller provides + * information about the btrees to look for by passing in an array of + * xfs_repair_find_ag_btree with the (rmap owner, buf_ops, magic) fields set. + * The (root, height) fields will be set on return if anything is found. The + * last element of the array should have a NULL buf_ops to mark the end of the + * array. + * + * For every rmapbt record matching any of the rmap owners in btree_info, + * read each block referenced by the rmap record. If the block is a btree + * block from this filesystem matching any of the magic numbers and has a + * level higher than what we've already seen, remember the block and the + * height of the tree required to have such a block. When the call completes, + * we return the highest block we've found for each btree description; those + * should be the roots. + */ + +struct xfs_repair_findroot { + struct xfs_scrub_context *sc; + struct xfs_buf *agfl_bp; + struct xfs_agf *agf; + struct xfs_repair_find_ag_btree *btree_info; +}; + +/* See if our block is in the AGFL. */ +STATIC int +xfs_repair_findroot_agfl_walk( + struct xfs_mount *mp, + xfs_agblock_t bno, + void *priv) +{ + xfs_agblock_t *agbno = priv; + + return (*agbno == bno) ? XFS_BTREE_QUERY_RANGE_ABORT : 0; +} + +/* Does this block match the btree information passed in? */ +STATIC int +xfs_repair_findroot_block( + struct xfs_repair_findroot *ri, + struct xfs_repair_find_ag_btree *fab, + uint64_t owner, + xfs_agblock_t agbno, + bool *found_it) +{ + struct xfs_mount *mp = ri->sc->mp; + struct xfs_buf *bp; + struct xfs_btree_block *btblock; + xfs_daddr_t daddr; + int error; + + daddr = XFS_AGB_TO_DADDR(mp, ri->sc->sa.agno, agbno); + + /* + * Blocks in the AGFL have stale contents that might just happen to + * have a matching magic and uuid. We don't want to pull these blocks + * in as part of a tree root, so we have to filter out the AGFL stuff + * here. If the AGFL looks insane we'll just refuse to repair. + */ + if (owner == XFS_RMAP_OWN_AG) { + error = xfs_agfl_walk(mp, ri->agf, ri->agfl_bp, + xfs_repair_findroot_agfl_walk, &agbno); + if (error == XFS_BTREE_QUERY_RANGE_ABORT) + return 0; + if (error) + return error; + } + + error = xfs_trans_read_buf(mp, ri->sc->tp, mp->m_ddev_targp, daddr, + mp->m_bsize, 0, &bp, NULL); + if (error) + return error; + + /* + * Does this look like a block matching our fs and higher than any + * other block we've found so far? If so, reattach buffer verifiers + * so the AIL won't complain if the buffer is also dirty. + */ + btblock = XFS_BUF_TO_BLOCK(bp); + if (be32_to_cpu(btblock->bb_magic) != fab->magic) + goto out; + if (xfs_sb_version_hascrc(&mp->m_sb) && + !uuid_equal(&btblock->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) + goto out; + bp->b_ops = fab->buf_ops; + + /* Ignore this block if it's lower in the tree than we've seen. */ + if (fab->root != NULLAGBLOCK && + xfs_btree_get_level(btblock) < fab->height) + goto out; + + /* Make sure we pass the verifiers. */ + bp->b_ops->verify_read(bp); + if (bp->b_error) + goto out; + fab->root = agbno; + fab->height = xfs_btree_get_level(btblock) + 1; + *found_it = true; + + trace_xfs_repair_findroot_block(mp, ri->sc->sa.agno, agbno, + be32_to_cpu(btblock->bb_magic), fab->height - 1); +out: + xfs_trans_brelse(ri->sc->tp, bp); + return error; +} + +/* + * Do any of the blocks in this rmap record match one of the btrees we're + * looking for? + */ +STATIC int +xfs_repair_findroot_rmap( + struct xfs_btree_cur *cur, + struct xfs_rmap_irec *rec, + void *priv) +{ + struct xfs_repair_findroot *ri = priv; + struct xfs_repair_find_ag_btree *fab; + xfs_agblock_t b; + bool found_it; + int error = 0; + + /* Ignore anything that isn't AG metadata. */ + if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner)) + return 0; + + /* Otherwise scan each block + btree type. */ + for (b = 0; b < rec->rm_blockcount; b++) { + found_it = false; + for (fab = ri->btree_info; fab->buf_ops; fab++) { + if (rec->rm_owner != fab->rmap_owner) + continue; + error = xfs_repair_findroot_block(ri, fab, + rec->rm_owner, rec->rm_startblock + b, + &found_it); + if (error) + return error; + if (found_it) + break; + } + } + + return 0; +} + +/* Find the roots of the per-AG btrees described in btree_info. */ +int +xfs_repair_find_ag_btree_roots( + struct xfs_scrub_context *sc, + struct xfs_buf *agf_bp, + struct xfs_repair_find_ag_btree *btree_info, + struct xfs_buf *agfl_bp) +{ + struct xfs_mount *mp = sc->mp; + struct xfs_repair_findroot ri; + struct xfs_repair_find_ag_btree *fab; + struct xfs_btree_cur *cur; + int error; + + ASSERT(xfs_buf_islocked(agf_bp)); + ASSERT(agfl_bp == NULL || xfs_buf_islocked(agfl_bp)); + + ri.sc = sc; + ri.btree_info = btree_info; + ri.agf = XFS_BUF_TO_AGF(agf_bp); + ri.agfl_bp = agfl_bp; + for (fab = btree_info; fab->buf_ops; fab++) { + ASSERT(agfl_bp || fab->rmap_owner != XFS_RMAP_OWN_AG); + ASSERT(XFS_RMAP_NON_INODE_OWNER(fab->rmap_owner)); + fab->root = NULLAGBLOCK; + fab->height = 0; + } + + cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.agno); + error = xfs_rmap_query_all(cur, xfs_repair_findroot_rmap, &ri); + xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + + return error; +} + +/* Force a quotacheck the next time we mount. */ +void +xfs_repair_force_quotacheck( + struct xfs_scrub_context *sc, + uint dqtype) +{ + uint flag; + + flag = xfs_quota_chkd_flag(dqtype); + if (!(flag & sc->mp->m_qflags)) + return; + + sc->mp->m_qflags &= ~flag; + spin_lock(&sc->mp->m_sb_lock); + sc->mp->m_sb.sb_qflags &= ~flag; + spin_unlock(&sc->mp->m_sb_lock); + xfs_log_sb(sc->tp); +} + +/* + * Attach dquots to this inode, or schedule quotacheck to fix them. + * + * This function ensures that the appropriate dquots are attached to an inode. + * We cannot allow the dquot code to allocate an on-disk dquot block here + * because we're already in transaction context with the inode locked. The + * on-disk dquot should already exist anyway. If the quota code signals + * corruption or missing quota information, schedule quotacheck, which will + * repair corruptions in the quota metadata. + */ +int +xfs_repair_ino_dqattach( + struct xfs_scrub_context *sc) +{ + int error; + + error = xfs_qm_dqattach_locked(sc->ip, false); + switch (error) { + case -EFSBADCRC: + case -EFSCORRUPTED: + case -ENOENT: + xfs_err_ratelimited(sc->mp, +"inode %llu repair encountered quota error %d, quotacheck forced.", + (unsigned long long)sc->ip->i_ino, error); + if (XFS_IS_UQUOTA_ON(sc->mp) && !sc->ip->i_udquot) + xfs_repair_force_quotacheck(sc, XFS_DQ_USER); + if (XFS_IS_GQUOTA_ON(sc->mp) && !sc->ip->i_gdquot) + xfs_repair_force_quotacheck(sc, XFS_DQ_GROUP); + if (XFS_IS_PQUOTA_ON(sc->mp) && !sc->ip->i_pdquot) + xfs_repair_force_quotacheck(sc, XFS_DQ_PROJ); + /* fall through */ + case -ESRCH: + error = 0; + break; + default: + break; + } + + return error; +} diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h new file mode 100644 index 000000000000..f2b0895294db --- /dev/null +++ b/fs/xfs/scrub/repair.h @@ -0,0 +1,132 @@ +/* + * Copyright (C) 2018 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#ifndef __XFS_SCRUB_REPAIR_H__ +#define __XFS_SCRUB_REPAIR_H__ + +static inline int xfs_repair_notsupported(struct xfs_scrub_context *sc) +{ + return -EOPNOTSUPP; +} + +#ifdef CONFIG_XFS_ONLINE_REPAIR + +/* Repair helpers */ + +int xfs_repair_attempt(struct xfs_inode *ip, struct xfs_scrub_context *sc, + bool *fixed); +void xfs_repair_failure(struct xfs_mount *mp); +int xfs_repair_roll_ag_trans(struct xfs_scrub_context *sc); +bool xfs_repair_ag_has_space(struct xfs_perag *pag, xfs_extlen_t nr_blocks, + enum xfs_ag_resv_type type); +xfs_extlen_t xfs_repair_calc_ag_resblks(struct xfs_scrub_context *sc); +int xfs_repair_alloc_ag_block(struct xfs_scrub_context *sc, + struct xfs_owner_info *oinfo, xfs_fsblock_t *fsbno, + enum xfs_ag_resv_type resv); +int xfs_repair_init_btblock(struct xfs_scrub_context *sc, xfs_fsblock_t fsb, + struct xfs_buf **bpp, xfs_btnum_t btnum, + const struct xfs_buf_ops *ops); + +struct xfs_repair_extent { + struct list_head list; + xfs_fsblock_t fsbno; + xfs_extlen_t len; +}; + +struct xfs_repair_extent_list { + struct list_head list; +}; + +static inline void +xfs_repair_init_extent_list( + struct xfs_repair_extent_list *exlist) +{ + INIT_LIST_HEAD(&exlist->list); +} + +#define for_each_xfs_repair_extent_safe(rbe, n, exlist) \ + list_for_each_entry_safe((rbe), (n), &(exlist)->list, list) +int xfs_repair_collect_btree_extent(struct xfs_scrub_context *sc, + struct xfs_repair_extent_list *btlist, xfs_fsblock_t fsbno, + xfs_extlen_t len); +void xfs_repair_cancel_btree_extents(struct xfs_scrub_context *sc, + struct xfs_repair_extent_list *btlist); +int xfs_repair_subtract_extents(struct xfs_scrub_context *sc, + struct xfs_repair_extent_list *exlist, + struct xfs_repair_extent_list *sublist); +int xfs_repair_fix_freelist(struct xfs_scrub_context *sc, bool can_shrink); +int xfs_repair_invalidate_blocks(struct xfs_scrub_context *sc, + struct xfs_repair_extent_list *btlist); +int xfs_repair_reap_btree_extents(struct xfs_scrub_context *sc, + struct xfs_repair_extent_list *exlist, + struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type); + +struct xfs_repair_find_ag_btree { + /* in: rmap owner of the btree we're looking for */ + uint64_t rmap_owner; + + /* in: buffer ops */ + const struct xfs_buf_ops *buf_ops; + + /* in: magic number of the btree */ + uint32_t magic; + + /* out: the highest btree block found and the tree height */ + xfs_agblock_t root; + unsigned int height; +}; + +int xfs_repair_find_ag_btree_roots(struct xfs_scrub_context *sc, + struct xfs_buf *agf_bp, + struct xfs_repair_find_ag_btree *btree_info, + struct xfs_buf *agfl_bp); +void xfs_repair_force_quotacheck(struct xfs_scrub_context *sc, uint dqtype); +int xfs_repair_ino_dqattach(struct xfs_scrub_context *sc); + +/* Metadata repairers */ + +int xfs_repair_probe(struct xfs_scrub_context *sc); +int xfs_repair_superblock(struct xfs_scrub_context *sc); + +#else + +static inline int xfs_repair_attempt( + struct xfs_inode *ip, + struct xfs_scrub_context *sc, + bool *fixed) +{ + return -EOPNOTSUPP; +} + +static inline void xfs_repair_failure(struct xfs_mount *mp) {} + +static inline xfs_extlen_t +xfs_repair_calc_ag_resblks( + struct xfs_scrub_context *sc) +{ + ASSERT(!(sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)); + return 0; +} + +#define xfs_repair_probe xfs_repair_notsupported +#define xfs_repair_superblock xfs_repair_notsupported + +#endif /* CONFIG_XFS_ONLINE_REPAIR */ + +#endif /* __XFS_SCRUB_REPAIR_H__ */ diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c index 8f2a7c3ff455..b376a9a77c04 100644 --- a/fs/xfs/scrub/rmap.c +++ b/fs/xfs/scrub/rmap.c @@ -66,7 +66,7 @@ xfs_scrub_rmapbt_xref_refc( bool is_unwritten; int error; - if (!sc->sa.refc_cur) + if (!sc->sa.refc_cur || xfs_scrub_skip_xref(sc->sm)) return; non_inode = XFS_RMAP_NON_INODE_OWNER(irec->rm_owner); @@ -207,7 +207,7 @@ xfs_scrub_xref_check_owner( bool has_rmap; int error; - if (!sc->sa.rmap_cur) + if (!sc->sa.rmap_cur || xfs_scrub_skip_xref(sc->sm)) return; error = xfs_rmap_record_exists(sc->sa.rmap_cur, bno, len, oinfo, @@ -250,7 +250,7 @@ xfs_scrub_xref_has_no_owner( bool has_rmap; int error; - if (!sc->sa.rmap_cur) + if (!sc->sa.rmap_cur || xfs_scrub_skip_xref(sc->sm)) return; error = xfs_rmap_has_record(sc->sa.rmap_cur, bno, len, &has_rmap); diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index 39c41dfe08ee..40f462a11ea5 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -66,11 +66,15 @@ xfs_scrub_rtbitmap_rec( void *priv) { struct xfs_scrub_context *sc = priv; + xfs_rtblock_t startblock; + xfs_rtblock_t blockcount; - if (rec->ar_startblock + rec->ar_blockcount <= rec->ar_startblock || - !xfs_verify_rtbno(sc->mp, rec->ar_startblock) || - !xfs_verify_rtbno(sc->mp, rec->ar_startblock + - rec->ar_blockcount - 1)) + startblock = rec->ar_startext * tp->t_mountp->m_sb.sb_rextsize; + blockcount = rec->ar_extcount * tp->t_mountp->m_sb.sb_rextsize; + + if (startblock + blockcount <= startblock || + !xfs_verify_rtbno(sc->mp, startblock) || + !xfs_verify_rtbno(sc->mp, startblock + blockcount - 1)) xfs_scrub_fblock_set_corrupt(sc, XFS_DATA_FORK, 0); return 0; } @@ -82,6 +86,11 @@ xfs_scrub_rtbitmap( { int error; + /* Invoke the fork scrubber. */ + error = xfs_scrub_metadata_inode_forks(sc); + if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + return error; + error = xfs_rtalloc_query_all(sc->tp, xfs_scrub_rtbitmap_rec, sc); if (!xfs_scrub_fblock_process_error(sc, XFS_DATA_FORK, 0, &error)) goto out; @@ -95,8 +104,35 @@ int xfs_scrub_rtsummary( struct xfs_scrub_context *sc) { + struct xfs_inode *rsumip = sc->mp->m_rsumip; + struct xfs_inode *old_ip = sc->ip; + uint old_ilock_flags = sc->ilock_flags; + int error = 0; + + /* + * We ILOCK'd the rt bitmap ip in the setup routine, now lock the + * rt summary ip in compliance with the rt inode locking rules. + * + * Since we switch sc->ip to rsumip we have to save the old ilock + * flags so that we don't mix up the inode state that @sc tracks. + */ + sc->ip = rsumip; + sc->ilock_flags = XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM; + xfs_ilock(sc->ip, sc->ilock_flags); + + /* Invoke the fork scrubber. */ + error = xfs_scrub_metadata_inode_forks(sc); + if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) + goto out; + /* XXX: implement this some day */ - return -ENOENT; + xfs_scrub_set_incomplete(sc); +out: + /* Switch back to the rtbitmap inode and lock flags. */ + xfs_iunlock(sc->ip, sc->ilock_flags); + sc->ilock_flags = old_ilock_flags; + sc->ip = old_ip; + return error; } @@ -107,11 +143,23 @@ xfs_scrub_xref_is_used_rt_space( xfs_rtblock_t fsbno, xfs_extlen_t len) { + xfs_rtblock_t startext; + xfs_rtblock_t endext; + xfs_rtblock_t extcount; bool is_free; int error; + if (xfs_scrub_skip_xref(sc->sm)) + return; + + startext = fsbno; + endext = fsbno + len - 1; + do_div(startext, sc->mp->m_sb.sb_rextsize); + if (do_div(endext, sc->mp->m_sb.sb_rextsize)) + endext++; + extcount = endext - startext; xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); - error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, fsbno, len, + error = xfs_rtalloc_extent_is_free(sc->mp, sc->tp, startext, extcount, &is_free); if (!xfs_scrub_should_check_xref(sc, &error, NULL)) goto out_unlock; diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 26c75967a072..36db098ba583 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -42,11 +42,18 @@ #include "xfs_refcount_btree.h" #include "xfs_rmap.h" #include "xfs_rmap_btree.h" +#include "xfs_quota.h" +#include "xfs_qm.h" +#include "xfs_errortag.h" +#include "xfs_error.h" +#include "xfs_log.h" +#include "xfs_trans_priv.h" #include "scrub/xfs_scrub.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" #include "scrub/btree.h" +#include "scrub/repair.h" /* * Online Scrub and Repair @@ -120,6 +127,24 @@ * XCORRUPT flag; btree query function errors are noted by setting the * XFAIL flag and deleting the cursor to prevent further attempts to * cross-reference with a defective btree. + * + * If a piece of metadata proves corrupt or suboptimal, the userspace + * program can ask the kernel to apply some tender loving care (TLC) to + * the metadata object by setting the REPAIR flag and re-calling the + * scrub ioctl. "Corruption" is defined by metadata violating the + * on-disk specification; operations cannot continue if the violation is + * left untreated. It is possible for XFS to continue if an object is + * "suboptimal", however performance may be degraded. Repairs are + * usually performed by rebuilding the metadata entirely out of + * redundant metadata. Optimizing, on the other hand, can sometimes be + * done without rebuilding entire structures. + * + * Generally speaking, the repair code has the following code structure: + * Lock -> scrub -> repair -> commit -> re-lock -> re-scrub -> unlock. + * The first check helps us figure out if we need to rebuild or simply + * optimize the structure so that the rebuild knows what to do. The + * second check evaluates the completeness of the repair; that is what + * is reported to userspace. */ /* @@ -155,7 +180,10 @@ xfs_scrub_teardown( { xfs_scrub_ag_free(sc, &sc->sa); if (sc->tp) { - xfs_trans_cancel(sc->tp); + if (error == 0 && (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)) + error = xfs_trans_commit(sc->tp); + else + xfs_trans_cancel(sc->tp); sc->tp = NULL; } if (sc->ip) { @@ -166,6 +194,8 @@ xfs_scrub_teardown( iput(VFS_I(sc->ip)); sc->ip = NULL; } + if (sc->has_quotaofflock) + mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock); if (sc->buf) { kmem_free(sc->buf); sc->buf = NULL; @@ -180,126 +210,150 @@ static const struct xfs_scrub_meta_ops meta_scrub_ops[] = { .type = ST_NONE, .setup = xfs_scrub_setup_fs, .scrub = xfs_scrub_probe, + .repair = xfs_repair_probe, }, [XFS_SCRUB_TYPE_SB] = { /* superblock */ .type = ST_PERAG, .setup = xfs_scrub_setup_fs, .scrub = xfs_scrub_superblock, + .repair = xfs_repair_superblock, }, [XFS_SCRUB_TYPE_AGF] = { /* agf */ .type = ST_PERAG, .setup = xfs_scrub_setup_fs, .scrub = xfs_scrub_agf, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_AGFL]= { /* agfl */ .type = ST_PERAG, .setup = xfs_scrub_setup_fs, .scrub = xfs_scrub_agfl, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_AGI] = { /* agi */ .type = ST_PERAG, .setup = xfs_scrub_setup_fs, .scrub = xfs_scrub_agi, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_BNOBT] = { /* bnobt */ .type = ST_PERAG, .setup = xfs_scrub_setup_ag_allocbt, .scrub = xfs_scrub_bnobt, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_CNTBT] = { /* cntbt */ .type = ST_PERAG, .setup = xfs_scrub_setup_ag_allocbt, .scrub = xfs_scrub_cntbt, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_INOBT] = { /* inobt */ .type = ST_PERAG, .setup = xfs_scrub_setup_ag_iallocbt, .scrub = xfs_scrub_inobt, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_FINOBT] = { /* finobt */ .type = ST_PERAG, .setup = xfs_scrub_setup_ag_iallocbt, .scrub = xfs_scrub_finobt, .has = xfs_sb_version_hasfinobt, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_RMAPBT] = { /* rmapbt */ .type = ST_PERAG, .setup = xfs_scrub_setup_ag_rmapbt, .scrub = xfs_scrub_rmapbt, .has = xfs_sb_version_hasrmapbt, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_REFCNTBT] = { /* refcountbt */ .type = ST_PERAG, .setup = xfs_scrub_setup_ag_refcountbt, .scrub = xfs_scrub_refcountbt, .has = xfs_sb_version_hasreflink, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_INODE] = { /* inode record */ .type = ST_INODE, .setup = xfs_scrub_setup_inode, .scrub = xfs_scrub_inode, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_BMBTD] = { /* inode data fork */ .type = ST_INODE, .setup = xfs_scrub_setup_inode_bmap, .scrub = xfs_scrub_bmap_data, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_BMBTA] = { /* inode attr fork */ .type = ST_INODE, .setup = xfs_scrub_setup_inode_bmap, .scrub = xfs_scrub_bmap_attr, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_BMBTC] = { /* inode CoW fork */ .type = ST_INODE, .setup = xfs_scrub_setup_inode_bmap, .scrub = xfs_scrub_bmap_cow, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_DIR] = { /* directory */ .type = ST_INODE, .setup = xfs_scrub_setup_directory, .scrub = xfs_scrub_directory, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_XATTR] = { /* extended attributes */ .type = ST_INODE, .setup = xfs_scrub_setup_xattr, .scrub = xfs_scrub_xattr, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_SYMLINK] = { /* symbolic link */ .type = ST_INODE, .setup = xfs_scrub_setup_symlink, .scrub = xfs_scrub_symlink, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_PARENT] = { /* parent pointers */ .type = ST_INODE, .setup = xfs_scrub_setup_parent, .scrub = xfs_scrub_parent, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_RTBITMAP] = { /* realtime bitmap */ .type = ST_FS, .setup = xfs_scrub_setup_rt, .scrub = xfs_scrub_rtbitmap, .has = xfs_sb_version_hasrealtime, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */ .type = ST_FS, .setup = xfs_scrub_setup_rt, .scrub = xfs_scrub_rtsummary, .has = xfs_sb_version_hasrealtime, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_UQUOTA] = { /* user quota */ .type = ST_FS, .setup = xfs_scrub_setup_quota, .scrub = xfs_scrub_quota, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_GQUOTA] = { /* group quota */ .type = ST_FS, .setup = xfs_scrub_setup_quota, .scrub = xfs_scrub_quota, + .repair = xfs_repair_notsupported, }, [XFS_SCRUB_TYPE_PQUOTA] = { /* project quota */ .type = ST_FS, .setup = xfs_scrub_setup_quota, .scrub = xfs_scrub_quota, + .repair = xfs_repair_notsupported, }, }; @@ -379,15 +433,54 @@ xfs_scrub_validate_inputs( if (!xfs_sb_version_hasextflgbit(&mp->m_sb)) goto out; - /* We don't know how to repair anything yet. */ - if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) - goto out; + /* + * We only want to repair read-write v5+ filesystems. Defer the check + * for ops->repair until after our scrub confirms that we need to + * perform repairs so that we avoid failing due to not supporting + * repairing an object that doesn't need repairs. + */ + if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) { + error = -EOPNOTSUPP; + if (!xfs_sb_version_hascrc(&mp->m_sb)) + goto out; + + error = -EROFS; + if (mp->m_flags & XFS_MOUNT_RDONLY) + goto out; + } error = 0; out: return error; } +#ifdef CONFIG_XFS_ONLINE_REPAIR +static inline void xfs_scrub_postmortem(struct xfs_scrub_context *sc) +{ + /* + * Userspace asked us to repair something, we repaired it, rescanned + * it, and the rescan says it's still broken. Scream about this in + * the system logs. + */ + if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && + (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | + XFS_SCRUB_OFLAG_XCORRUPT))) + xfs_repair_failure(sc->mp); +} +#else +static inline void xfs_scrub_postmortem(struct xfs_scrub_context *sc) +{ + /* + * Userspace asked us to scrub something, it's broken, and we have no + * way of fixing it. Scream in the logs. + */ + if (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | + XFS_SCRUB_OFLAG_XCORRUPT)) + xfs_alert_ratelimited(sc->mp, + "Corruption detected during scrub."); +} +#endif /* CONFIG_XFS_ONLINE_REPAIR */ + /* Dispatch metadata scrubbing. */ int xfs_scrub_metadata( @@ -397,6 +490,7 @@ xfs_scrub_metadata( struct xfs_scrub_context sc; struct xfs_mount *mp = ip->i_mount; bool try_harder = false; + bool already_fixed = false; int error = 0; BUILD_BUG_ON(sizeof(meta_scrub_ops) != @@ -446,10 +540,44 @@ retry_op: } else if (error) goto out_teardown; - if (sc.sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | - XFS_SCRUB_OFLAG_XCORRUPT)) - xfs_alert_ratelimited(mp, "Corruption detected during scrub."); + if ((sc.sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && !already_fixed) { + bool needs_fix; + + /* Let debug users force us into the repair routines. */ + if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) + sc.sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; + + needs_fix = (sc.sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | + XFS_SCRUB_OFLAG_XCORRUPT | + XFS_SCRUB_OFLAG_PREEN)); + /* + * If userspace asked for a repair but it wasn't necessary, + * report that back to userspace. + */ + if (!needs_fix) { + sc.sm->sm_flags |= XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED; + goto out_nofix; + } + + /* + * If it's broken, userspace wants us to fix it, and we haven't + * already tried to fix it, then attempt a repair. + */ + error = xfs_repair_attempt(ip, &sc, &already_fixed); + if (error == -EAGAIN) { + if (sc.try_harder) + try_harder = true; + error = xfs_scrub_teardown(&sc, ip, 0); + if (error) { + xfs_repair_failure(mp); + goto out; + } + goto retry_op; + } + } +out_nofix: + xfs_scrub_postmortem(&sc); out_teardown: error = xfs_scrub_teardown(&sc, ip, error); out: diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h index 0d92af86f67a..636424d5e2ee 100644 --- a/fs/xfs/scrub/scrub.h +++ b/fs/xfs/scrub/scrub.h @@ -38,6 +38,9 @@ struct xfs_scrub_meta_ops { /* Examine metadata for errors. */ int (*scrub)(struct xfs_scrub_context *); + /* Repair or optimize the metadata. */ + int (*repair)(struct xfs_scrub_context *); + /* Decide if we even have this piece of metadata. */ bool (*has)(struct xfs_sb *); @@ -48,6 +51,7 @@ struct xfs_scrub_meta_ops { /* Buffer pointers and btree cursors for an entire AG. */ struct xfs_scrub_ag { xfs_agnumber_t agno; + struct xfs_perag *pag; /* AG btree roots */ struct xfs_buf *agf_bp; @@ -73,6 +77,7 @@ struct xfs_scrub_context { void *buf; uint ilock_flags; bool try_harder; + bool has_quotaofflock; /* State tracking for single-AG operations. */ struct xfs_scrub_ag sa; diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h index 5d2b1c241be5..794d56bb1af8 100644 --- a/fs/xfs/scrub/trace.h +++ b/fs/xfs/scrub/trace.h @@ -69,6 +69,8 @@ DEFINE_EVENT(xfs_scrub_class, name, \ DEFINE_SCRUB_EVENT(xfs_scrub_start); DEFINE_SCRUB_EVENT(xfs_scrub_done); DEFINE_SCRUB_EVENT(xfs_scrub_deadlock_retry); +DEFINE_SCRUB_EVENT(xfs_repair_attempt); +DEFINE_SCRUB_EVENT(xfs_repair_done); TRACE_EVENT(xfs_scrub_op_error, TP_PROTO(struct xfs_scrub_context *sc, xfs_agnumber_t agno, @@ -492,6 +494,262 @@ TRACE_EVENT(xfs_scrub_xref_error, __entry->ret_ip) ); +/* repair tracepoints */ +#if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) + +DECLARE_EVENT_CLASS(xfs_repair_extent_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t len), + TP_ARGS(mp, agno, agbno, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->len = len; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len) +); +#define DEFINE_REPAIR_EXTENT_EVENT(name) \ +DEFINE_EVENT(xfs_repair_extent_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + xfs_agblock_t agbno, xfs_extlen_t len), \ + TP_ARGS(mp, agno, agbno, len)) +DEFINE_REPAIR_EXTENT_EVENT(xfs_repair_dispose_btree_extent); +DEFINE_REPAIR_EXTENT_EVENT(xfs_repair_collect_btree_extent); +DEFINE_REPAIR_EXTENT_EVENT(xfs_repair_agfl_insert); + +DECLARE_EVENT_CLASS(xfs_repair_rmap_class, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t agbno, xfs_extlen_t len, + uint64_t owner, uint64_t offset, unsigned int flags), + TP_ARGS(mp, agno, agbno, len, owner, offset, flags), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, len) + __field(uint64_t, owner) + __field(uint64_t, offset) + __field(unsigned int, flags) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->len = len; + __entry->owner = owner; + __entry->offset = offset; + __entry->flags = flags; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u owner %lld offset %llu flags 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->len, + __entry->owner, + __entry->offset, + __entry->flags) +); +#define DEFINE_REPAIR_RMAP_EVENT(name) \ +DEFINE_EVENT(xfs_repair_rmap_class, name, \ + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \ + xfs_agblock_t agbno, xfs_extlen_t len, \ + uint64_t owner, uint64_t offset, unsigned int flags), \ + TP_ARGS(mp, agno, agbno, len, owner, offset, flags)) +DEFINE_REPAIR_RMAP_EVENT(xfs_repair_alloc_extent_fn); +DEFINE_REPAIR_RMAP_EVENT(xfs_repair_ialloc_extent_fn); +DEFINE_REPAIR_RMAP_EVENT(xfs_repair_rmap_extent_fn); +DEFINE_REPAIR_RMAP_EVENT(xfs_repair_bmap_extent_fn); + +TRACE_EVENT(xfs_repair_refcount_extent_fn, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + struct xfs_refcount_irec *irec), + TP_ARGS(mp, agno, irec), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, startblock) + __field(xfs_extlen_t, blockcount) + __field(xfs_nlink_t, refcount) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->startblock = irec->rc_startblock; + __entry->blockcount = irec->rc_blockcount; + __entry->refcount = irec->rc_refcount; + ), + TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->startblock, + __entry->blockcount, + __entry->refcount) +) + +TRACE_EVENT(xfs_repair_init_btblock, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, + xfs_btnum_t btnum), + TP_ARGS(mp, agno, agbno, btnum), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(uint32_t, btnum) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->btnum = btnum; + ), + TP_printk("dev %d:%d agno %u agbno %u btnum %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->btnum) +) +TRACE_EVENT(xfs_repair_findroot_block, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, + uint32_t magic, uint16_t level), + TP_ARGS(mp, agno, agbno, magic, level), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(uint32_t, magic) + __field(uint16_t, level) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agbno = agbno; + __entry->magic = magic; + __entry->level = level; + ), + TP_printk("dev %d:%d agno %u agbno %u magic 0x%x level %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->magic, + __entry->level) +) +TRACE_EVENT(xfs_repair_calc_ag_resblks, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agino_t icount, xfs_agblock_t aglen, xfs_agblock_t freelen, + xfs_agblock_t usedlen), + TP_ARGS(mp, agno, icount, aglen, freelen, usedlen), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, icount) + __field(xfs_agblock_t, aglen) + __field(xfs_agblock_t, freelen) + __field(xfs_agblock_t, usedlen) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->icount = icount; + __entry->aglen = aglen; + __entry->freelen = freelen; + __entry->usedlen = usedlen; + ), + TP_printk("dev %d:%d agno %d icount %u aglen %u freelen %u usedlen %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->icount, + __entry->aglen, + __entry->freelen, + __entry->usedlen) +) +TRACE_EVENT(xfs_repair_calc_ag_resblks_btsize, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agblock_t bnobt_sz, xfs_agblock_t inobt_sz, + xfs_agblock_t rmapbt_sz, xfs_agblock_t refcbt_sz), + TP_ARGS(mp, agno, bnobt_sz, inobt_sz, rmapbt_sz, refcbt_sz), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, bnobt_sz) + __field(xfs_agblock_t, inobt_sz) + __field(xfs_agblock_t, rmapbt_sz) + __field(xfs_agblock_t, refcbt_sz) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->bnobt_sz = bnobt_sz; + __entry->inobt_sz = inobt_sz; + __entry->rmapbt_sz = rmapbt_sz; + __entry->refcbt_sz = refcbt_sz; + ), + TP_printk("dev %d:%d agno %d bno %u ino %u rmap %u refcount %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->bnobt_sz, + __entry->inobt_sz, + __entry->rmapbt_sz, + __entry->refcbt_sz) +) +TRACE_EVENT(xfs_repair_reset_counters, + TP_PROTO(struct xfs_mount *mp), + TP_ARGS(mp), + TP_STRUCT__entry( + __field(dev_t, dev) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + ), + TP_printk("dev %d:%d", + MAJOR(__entry->dev), MINOR(__entry->dev)) +) + +TRACE_EVENT(xfs_repair_ialloc_insert, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, + xfs_agino_t startino, uint16_t holemask, uint8_t count, + uint8_t freecount, uint64_t freemask), + TP_ARGS(mp, agno, startino, holemask, count, freecount, freemask), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, startino) + __field(uint16_t, holemask) + __field(uint8_t, count) + __field(uint8_t, freecount) + __field(uint64_t, freemask) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->startino = startino; + __entry->holemask = holemask; + __entry->count = count; + __entry->freecount = freecount; + __entry->freemask = freemask; + ), + TP_printk("dev %d:%d agno %d startino %u holemask 0x%x count %u freecount %u freemask 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->startino, + __entry->holemask, + __entry->count, + __entry->freecount, + __entry->freemask) +) + +#endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */ + #endif /* _TRACE_XFS_SCRUB_TRACE_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 102463543db3..ca6903726689 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1378,10 +1378,9 @@ xfs_vm_bmap( struct address_space *mapping, sector_t block) { - struct inode *inode = (struct inode *)mapping->host; - struct xfs_inode *ip = XFS_I(inode); + struct xfs_inode *ip = XFS_I(mapping->host); - trace_xfs_vm_bmap(XFS_I(inode)); + trace_xfs_vm_bmap(ip); /* * The swap code (ab-)uses ->bmap to get a block mapping and then @@ -1394,9 +1393,7 @@ xfs_vm_bmap( */ if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip)) return 0; - - filemap_write_and_wait(mapping); - return generic_block_bmap(mapping, block, xfs_get_blocks); + return iomap_bmap(mapping, block, &xfs_iomap_ops); } STATIC int @@ -1475,6 +1472,16 @@ xfs_vm_set_page_dirty( return newly_dirty; } +static int +xfs_iomap_swapfile_activate( + struct swap_info_struct *sis, + struct file *swap_file, + sector_t *span) +{ + sis->bdev = xfs_find_bdev_for_inode(file_inode(swap_file)); + return iomap_swapfile_activate(sis, swap_file, span, &xfs_iomap_ops); +} + const struct address_space_operations xfs_address_space_operations = { .readpage = xfs_vm_readpage, .readpages = xfs_vm_readpages, @@ -1488,6 +1495,7 @@ const struct address_space_operations xfs_address_space_operations = { .migratepage = buffer_migrate_page, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, + .swap_activate = xfs_iomap_swapfile_activate, }; const struct address_space_operations xfs_dax_aops = { @@ -1495,4 +1503,5 @@ const struct address_space_operations xfs_dax_aops = { .direct_IO = noop_direct_IO, .set_page_dirty = noop_set_page_dirty, .invalidatepage = noop_invalidatepage, + .swap_activate = xfs_iomap_swapfile_activate, }; diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 2203465e63ea..618bb71535c8 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -160,7 +160,7 @@ STATIC void xfs_bui_item_unlock( struct xfs_log_item *lip) { - if (lip->li_flags & XFS_LI_ABORTED) + if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) xfs_bui_release(BUI_ITEM(lip)); } @@ -305,7 +305,7 @@ xfs_bud_item_unlock( { struct xfs_bud_log_item *budp = BUD_ITEM(lip); - if (lip->li_flags & XFS_LI_ABORTED) { + if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) { xfs_bui_release(budp->bud_buip); kmem_zone_free(xfs_bud_zone, budp); } diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 8cd8c412f52d..06badcbadeb4 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -848,7 +848,7 @@ xfs_free_eofblocks( /* * Attach the dquots to the inode up front. */ - error = xfs_qm_dqattach(ip, 0); + error = xfs_qm_dqattach(ip); if (error) return error; @@ -871,8 +871,8 @@ xfs_free_eofblocks( * contents of the file are flushed to disk then the files * may be full of holes (ie NULL files bug). */ - error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, - XFS_ISIZE(ip)); + error = xfs_itruncate_extents_flags(&tp, ip, XFS_DATA_FORK, + XFS_ISIZE(ip), XFS_BMAPI_NODISCARD); if (error) { /* * If we get an error at this point we simply don't @@ -918,7 +918,7 @@ xfs_alloc_file_space( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - error = xfs_qm_dqattach(ip, 0); + error = xfs_qm_dqattach(ip); if (error) return error; @@ -1169,7 +1169,7 @@ xfs_free_file_space( trace_xfs_free_file_space(ip); - error = xfs_qm_dqattach(ip, 0); + error = xfs_qm_dqattach(ip); if (error) return error; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 55661cbdb51b..5179ab9e3d6a 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -549,17 +549,31 @@ xfs_buf_hash_destroy( } /* - * Look up, and creates if absent, a lockable buffer for - * a given range of an inode. The buffer is returned - * locked. No I/O is implied by this call. + * Look up a buffer in the buffer cache and return it referenced and locked + * in @found_bp. + * + * If @new_bp is supplied and we have a lookup miss, insert @new_bp into the + * cache. + * + * If XBF_TRYLOCK is set in @flags, only try to lock the buffer and return + * -EAGAIN if we fail to lock it. + * + * Return values are: + * -EFSCORRUPTED if have been supplied with an invalid address + * -EAGAIN on trylock failure + * -ENOENT if we fail to find a match and @new_bp was NULL + * 0, with @found_bp: + * - @new_bp if we inserted it into the cache + * - the buffer we found and locked. */ -xfs_buf_t * -_xfs_buf_find( +static int +xfs_buf_find( struct xfs_buftarg *btp, struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, - xfs_buf_t *new_bp) + struct xfs_buf *new_bp, + struct xfs_buf **found_bp) { struct xfs_perag *pag; xfs_buf_t *bp; @@ -567,6 +581,8 @@ _xfs_buf_find( xfs_daddr_t eofs; int i; + *found_bp = NULL; + for (i = 0; i < nmaps; i++) cmap.bm_len += map[i].bm_len; @@ -580,16 +596,11 @@ _xfs_buf_find( */ eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks); if (cmap.bm_bn < 0 || cmap.bm_bn >= eofs) { - /* - * XXX (dgc): we should really be returning -EFSCORRUPTED here, - * but none of the higher level infrastructure supports - * returning a specific error on buffer lookup failures. - */ xfs_alert(btp->bt_mount, "%s: daddr 0x%llx out of range, EOFS 0x%llx", __func__, cmap.bm_bn, eofs); WARN_ON(1); - return NULL; + return -EFSCORRUPTED; } pag = xfs_perag_get(btp->bt_mount, @@ -604,19 +615,20 @@ _xfs_buf_find( } /* No match found */ - if (new_bp) { - /* the buffer keeps the perag reference until it is freed */ - new_bp->b_pag = pag; - rhashtable_insert_fast(&pag->pag_buf_hash, - &new_bp->b_rhash_head, - xfs_buf_hash_params); - spin_unlock(&pag->pag_buf_lock); - } else { + if (!new_bp) { XFS_STATS_INC(btp->bt_mount, xb_miss_locked); spin_unlock(&pag->pag_buf_lock); xfs_perag_put(pag); + return -ENOENT; } - return new_bp; + + /* the buffer keeps the perag reference until it is freed */ + new_bp->b_pag = pag; + rhashtable_insert_fast(&pag->pag_buf_hash, &new_bp->b_rhash_head, + xfs_buf_hash_params); + spin_unlock(&pag->pag_buf_lock); + *found_bp = new_bp; + return 0; found: spin_unlock(&pag->pag_buf_lock); @@ -626,7 +638,7 @@ found: if (flags & XBF_TRYLOCK) { xfs_buf_rele(bp); XFS_STATS_INC(btp->bt_mount, xb_busy_locked); - return NULL; + return -EAGAIN; } xfs_buf_lock(bp); XFS_STATS_INC(btp->bt_mount, xb_get_locked_waited); @@ -646,6 +658,24 @@ found: trace_xfs_buf_find(bp, flags, _RET_IP_); XFS_STATS_INC(btp->bt_mount, xb_get_locked); + *found_bp = bp; + return 0; +} + +struct xfs_buf * +xfs_buf_incore( + struct xfs_buftarg *target, + xfs_daddr_t blkno, + size_t numblks, + xfs_buf_flags_t flags) +{ + struct xfs_buf *bp; + int error; + DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); + + error = xfs_buf_find(target, &map, 1, flags, NULL, &bp); + if (error) + return NULL; return bp; } @@ -665,9 +695,27 @@ xfs_buf_get_map( struct xfs_buf *new_bp; int error = 0; - bp = _xfs_buf_find(target, map, nmaps, flags, NULL); - if (likely(bp)) + error = xfs_buf_find(target, map, nmaps, flags, NULL, &bp); + + switch (error) { + case 0: + /* cache hit */ goto found; + case -EAGAIN: + /* cache hit, trylock failure, caller handles failure */ + ASSERT(flags & XBF_TRYLOCK); + return NULL; + case -ENOENT: + /* cache miss, go for insert */ + break; + case -EFSCORRUPTED: + default: + /* + * None of the higher layers understand failure types + * yet, so return NULL to signal a fatal lookup error. + */ + return NULL; + } new_bp = _xfs_buf_alloc(target, map, nmaps, flags); if (unlikely(!new_bp)) @@ -679,8 +727,8 @@ xfs_buf_get_map( return NULL; } - bp = _xfs_buf_find(target, map, nmaps, flags, new_bp); - if (!bp) { + error = xfs_buf_find(target, map, nmaps, flags, new_bp, &bp); + if (error) { xfs_buf_free(new_bp); return NULL; } diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index edced162a674..f5f2b71c2fde 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -218,20 +218,9 @@ typedef struct xfs_buf { } xfs_buf_t; /* Finding and Reading Buffers */ -struct xfs_buf *_xfs_buf_find(struct xfs_buftarg *target, - struct xfs_buf_map *map, int nmaps, - xfs_buf_flags_t flags, struct xfs_buf *new_bp); - -static inline struct xfs_buf * -xfs_incore( - struct xfs_buftarg *target, - xfs_daddr_t blkno, - size_t numblks, - xfs_buf_flags_t flags) -{ - DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); - return _xfs_buf_find(target, &map, 1, flags, NULL); -} +struct xfs_buf *xfs_buf_incore(struct xfs_buftarg *target, + xfs_daddr_t blkno, size_t numblks, + xfs_buf_flags_t flags); struct xfs_buf *_xfs_buf_alloc(struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, @@ -358,6 +347,18 @@ extern void xfs_buf_terminate(void); void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref); +/* + * If the buffer is already on the LRU, do nothing. Otherwise set the buffer + * up with a reference count of 0 so it will be tossed from the cache when + * released. + */ +static inline void xfs_buf_oneshot(struct xfs_buf *bp) +{ + if (!list_empty(&bp->b_lru) || atomic_read(&bp->b_lru_ref) > 1) + return; + atomic_set(&bp->b_lru_ref, 0); +} + static inline int xfs_buf_ispinned(struct xfs_buf *bp) { return atomic_read(&bp->b_pin_count); diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 82ad270e390e..c2311379d1c3 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -438,7 +438,7 @@ xfs_buf_item_unpin( * xfs_trans_uncommit() will try to reference the * buffer which we no longer have a hold on. */ - if (lip->li_desc) + if (!list_empty(&lip->li_trans)) xfs_trans_del_item(lip); /* @@ -568,13 +568,15 @@ xfs_buf_item_unlock( { struct xfs_buf_log_item *bip = BUF_ITEM(lip); struct xfs_buf *bp = bip->bli_buf; - bool aborted = !!(lip->li_flags & XFS_LI_ABORTED); + bool aborted; bool hold = !!(bip->bli_flags & XFS_BLI_HOLD); bool dirty = !!(bip->bli_flags & XFS_BLI_DIRTY); #if defined(DEBUG) || defined(XFS_WARN) bool ordered = !!(bip->bli_flags & XFS_BLI_ORDERED); #endif + aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags); + /* Clear the buffer's association with this transaction. */ bp->b_transp = NULL; @@ -743,8 +745,10 @@ xfs_buf_item_init( * nothing to do here so return. */ ASSERT(bp->b_target->bt_mount == mp); - if (bip != NULL) { + if (bip) { ASSERT(bip->bli_item.li_type == XFS_LI_BUF); + ASSERT(!bp->b_transp); + ASSERT(bip->bli_buf == bp); return 0; } diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index a7daef9e16bf..2567391489bd 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -288,49 +288,43 @@ xfs_dquot_set_prealloc_limits(struct xfs_dquot *dqp) } /* - * Allocate a block and fill it with dquots. - * This is called when the bmapi finds a hole. + * Ensure that the given in-core dquot has a buffer on disk backing it, and + * return the buffer. This is called when the bmapi finds a hole. */ STATIC int -xfs_qm_dqalloc( - xfs_trans_t **tpp, - xfs_mount_t *mp, - xfs_dquot_t *dqp, - xfs_inode_t *quotip, - xfs_fileoff_t offset_fsb, - xfs_buf_t **O_bpp) +xfs_dquot_disk_alloc( + struct xfs_trans **tpp, + struct xfs_dquot *dqp, + struct xfs_buf **bpp) { - xfs_fsblock_t firstblock; - struct xfs_defer_ops dfops; - xfs_bmbt_irec_t map; - int nmaps, error; - xfs_buf_t *bp; - xfs_trans_t *tp = *tpp; - - ASSERT(tp != NULL); + struct xfs_bmbt_irec map; + struct xfs_defer_ops dfops; + struct xfs_mount *mp = (*tpp)->t_mountp; + struct xfs_buf *bp; + struct xfs_inode *quotip = xfs_quota_inode(mp, dqp->dq_flags); + xfs_fsblock_t firstblock; + int nmaps = 1; + int error; trace_xfs_dqalloc(dqp); - /* - * Initialize the bmap freelist prior to calling bmapi code. - */ xfs_defer_init(&dfops, &firstblock); xfs_ilock(quotip, XFS_ILOCK_EXCL); - /* - * Return if this type of quotas is turned off while we didn't - * have an inode lock - */ if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) { + /* + * Return if this type of quotas is turned off while we didn't + * have an inode lock + */ xfs_iunlock(quotip, XFS_ILOCK_EXCL); return -ESRCH; } - xfs_trans_ijoin(tp, quotip, XFS_ILOCK_EXCL); - nmaps = 1; - error = xfs_bmapi_write(tp, quotip, offset_fsb, - XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, - &firstblock, XFS_QM_DQALLOC_SPACE_RES(mp), - &map, &nmaps, &dfops); + /* Create the block mapping. */ + xfs_trans_ijoin(*tpp, quotip, XFS_ILOCK_EXCL); + error = xfs_bmapi_write(*tpp, quotip, dqp->q_fileoffset, + XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, + &firstblock, XFS_QM_DQALLOC_SPACE_RES(mp), + &map, &nmaps, &dfops); if (error) goto error0; ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB); @@ -344,10 +338,8 @@ xfs_qm_dqalloc( dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock); /* now we can just get the buffer (there's nothing to read yet) */ - bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, - dqp->q_blkno, - mp->m_quotainfo->qi_dqchunklen, - 0); + bp = xfs_trans_get_buf(*tpp, mp->m_ddev_targp, dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, 0); if (!bp) { error = -ENOMEM; goto error1; @@ -358,37 +350,45 @@ xfs_qm_dqalloc( * Make a chunk of dquots out of this buffer and log * the entire thing. */ - xfs_qm_init_dquot_blk(tp, mp, be32_to_cpu(dqp->q_core.d_id), + xfs_qm_init_dquot_blk(*tpp, mp, be32_to_cpu(dqp->q_core.d_id), dqp->dq_flags & XFS_DQ_ALLTYPES, bp); + xfs_buf_set_ref(bp, XFS_DQUOT_REF); /* - * xfs_defer_finish() may commit the current transaction and - * start a second transaction if the freelist is not empty. + * Hold the buffer and join it to the dfops so that we'll still own + * the buffer when we return to the caller. The buffer disposal on + * error must be paid attention to very carefully, as it has been + * broken since commit efa092f3d4c6 "[XFS] Fixes a bug in the quota + * code when allocating a new dquot record" in 2005, and the later + * conversion to xfs_defer_ops in commit 310a75a3c6c747 failed to keep + * the buffer locked across the _defer_finish call. We can now do + * this correctly with xfs_defer_bjoin. * - * Since we still want to modify this buffer, we need to - * ensure that the buffer is not released on commit of - * the first transaction and ensure the buffer is added to the - * second transaction. + * Above, we allocated a disk block for the dquot information and + * used get_buf to initialize the dquot. If the _defer_bjoin fails, + * the buffer is still locked to *tpp, so we must _bhold_release and + * then _trans_brelse the buffer. If the _defer_finish fails, the old + * transaction is gone but the new buffer is not joined or held to any + * transaction, so we must _buf_relse it. * - * If there is only one transaction then don't stop the buffer - * from being released when it commits later on. + * If everything succeeds, the caller of this function is returned a + * buffer that is locked and held to the transaction. The caller + * is responsible for unlocking any buffer passed back, either + * manually or by committing the transaction. */ - - xfs_trans_bhold(tp, bp); - + xfs_trans_bhold(*tpp, bp); + error = xfs_defer_bjoin(&dfops, bp); + if (error) { + xfs_trans_bhold_release(*tpp, bp); + xfs_trans_brelse(*tpp, bp); + goto error1; + } error = xfs_defer_finish(tpp, &dfops); - if (error) + if (error) { + xfs_buf_relse(bp); goto error1; - - /* Transaction was committed? */ - if (*tpp != tp) { - tp = *tpp; - xfs_trans_bjoin(tp, bp); - } else { - xfs_trans_bhold_release(tp, bp); } - - *O_bpp = bp; + *bpp = bp; return 0; error1: @@ -398,32 +398,24 @@ error0: } /* - * Maps a dquot to the buffer containing its on-disk version. - * This returns a ptr to the buffer containing the on-disk dquot - * in the bpp param, and a ptr to the on-disk dquot within that buffer + * Read in the in-core dquot's on-disk metadata and return the buffer. + * Returns ENOENT to signal a hole. */ STATIC int -xfs_qm_dqtobp( - xfs_trans_t **tpp, - xfs_dquot_t *dqp, - xfs_disk_dquot_t **O_ddpp, - xfs_buf_t **O_bpp, - uint flags) +xfs_dquot_disk_read( + struct xfs_mount *mp, + struct xfs_dquot *dqp, + struct xfs_buf **bpp) { struct xfs_bmbt_irec map; - int nmaps = 1, error; struct xfs_buf *bp; - struct xfs_inode *quotip; - struct xfs_mount *mp = dqp->q_mount; - xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id); - struct xfs_trans *tp = (tpp ? *tpp : NULL); + struct xfs_inode *quotip = xfs_quota_inode(mp, dqp->dq_flags); uint lock_mode; - - quotip = xfs_quota_inode(dqp->q_mount, dqp->dq_flags); - dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk; + int nmaps = 1; + int error; lock_mode = xfs_ilock_data_map_shared(quotip); - if (!xfs_this_quota_on(dqp->q_mount, dqp->dq_flags)) { + if (!xfs_this_quota_on(mp, dqp->dq_flags)) { /* * Return if this type of quotas is turned off while we * didn't have the quota inode lock. @@ -436,81 +428,48 @@ xfs_qm_dqtobp( * Find the block map; no allocations yet */ error = xfs_bmapi_read(quotip, dqp->q_fileoffset, - XFS_DQUOT_CLUSTER_SIZE_FSB, &map, &nmaps, 0); - + XFS_DQUOT_CLUSTER_SIZE_FSB, &map, &nmaps, 0); xfs_iunlock(quotip, lock_mode); if (error) return error; ASSERT(nmaps == 1); - ASSERT(map.br_blockcount == 1); + ASSERT(map.br_blockcount >= 1); + ASSERT(map.br_startblock != DELAYSTARTBLOCK); + if (map.br_startblock == HOLESTARTBLOCK) + return -ENOENT; + + trace_xfs_dqtobp_read(dqp); /* - * Offset of dquot in the (fixed sized) dquot chunk. + * store the blkno etc so that we don't have to do the + * mapping all the time */ - dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) * - sizeof(xfs_dqblk_t); - - ASSERT(map.br_startblock != DELAYSTARTBLOCK); - if (map.br_startblock == HOLESTARTBLOCK) { - /* - * We don't allocate unless we're asked to - */ - if (!(flags & XFS_QMOPT_DQALLOC)) - return -ENOENT; - - ASSERT(tp); - error = xfs_qm_dqalloc(tpp, mp, dqp, quotip, - dqp->q_fileoffset, &bp); - if (error) - return error; - tp = *tpp; - } else { - trace_xfs_dqtobp_read(dqp); + dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock); - /* - * store the blkno etc so that we don't have to do the - * mapping all the time - */ - dqp->q_blkno = XFS_FSB_TO_DADDR(mp, map.br_startblock); - - error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, - dqp->q_blkno, - mp->m_quotainfo->qi_dqchunklen, - 0, &bp, &xfs_dquot_buf_ops); - if (error) { - ASSERT(bp == NULL); - return error; - } + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, 0, &bp, + &xfs_dquot_buf_ops); + if (error) { + ASSERT(bp == NULL); + return error; } ASSERT(xfs_buf_islocked(bp)); - *O_bpp = bp; - *O_ddpp = bp->b_addr + dqp->q_bufoffset; + xfs_buf_set_ref(bp, XFS_DQUOT_REF); + *bpp = bp; return 0; } - -/* - * Read in the ondisk dquot using dqtobp() then copy it to an incore version, - * and release the buffer immediately. - * - * If XFS_QMOPT_DQALLOC is set, allocate a dquot on disk if it needed. - */ -int -xfs_qm_dqread( +/* Allocate and initialize everything we need for an incore dquot. */ +STATIC struct xfs_dquot * +xfs_dquot_alloc( struct xfs_mount *mp, xfs_dqid_t id, - uint type, - uint flags, - struct xfs_dquot **O_dqpp) + uint type) { struct xfs_dquot *dqp; - struct xfs_disk_dquot *ddqp; - struct xfs_buf *bp; - struct xfs_trans *tp = NULL; - int error; dqp = kmem_zone_zalloc(xfs_qm_dqzone, KM_SLEEP); @@ -520,6 +479,12 @@ xfs_qm_dqread( INIT_LIST_HEAD(&dqp->q_lru); mutex_init(&dqp->q_qlock); init_waitqueue_head(&dqp->q_pinwait); + dqp->q_fileoffset = (xfs_fileoff_t)id / mp->m_quotainfo->qi_dqperchunk; + /* + * Offset of dquot in the (fixed sized) dquot chunk. + */ + dqp->q_bufoffset = (id % mp->m_quotainfo->qi_dqperchunk) * + sizeof(xfs_dqblk_t); /* * Because we want to use a counting completion, complete @@ -548,35 +513,22 @@ xfs_qm_dqread( break; } - XFS_STATS_INC(mp, xs_qm_dquot); - - trace_xfs_dqread(dqp); + xfs_qm_dquot_logitem_init(dqp); - if (flags & XFS_QMOPT_DQALLOC) { - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_dqalloc, - XFS_QM_DQALLOC_SPACE_RES(mp), 0, 0, &tp); - if (error) - goto error0; - } + XFS_STATS_INC(mp, xs_qm_dquot); + return dqp; +} - /* - * get a pointer to the on-disk dquot and the buffer containing it - * dqp already knows its own type (GROUP/USER). - */ - error = xfs_qm_dqtobp(&tp, dqp, &ddqp, &bp, flags); - if (error) { - /* - * This can happen if quotas got turned off (ESRCH), - * or if the dquot didn't exist on disk and we ask to - * allocate (ENOENT). - */ - trace_xfs_dqread_fail(dqp); - goto error1; - } +/* Copy the in-core quota fields in from the on-disk buffer. */ +STATIC void +xfs_dquot_from_disk( + struct xfs_dquot *dqp, + struct xfs_buf *bp) +{ + struct xfs_disk_dquot *ddqp = bp->b_addr + dqp->q_bufoffset; /* copy everything from disk dquot to the incore dquot */ memcpy(&dqp->q_core, ddqp, sizeof(xfs_disk_dquot_t)); - xfs_qm_dquot_logitem_init(dqp); /* * Reservation counters are defined as reservation plus current usage @@ -588,40 +540,90 @@ xfs_qm_dqread( /* initialize the dquot speculative prealloc thresholds */ xfs_dquot_set_prealloc_limits(dqp); +} - /* Mark the buf so that this will stay incore a little longer */ - xfs_buf_set_ref(bp, XFS_DQUOT_REF); +/* Allocate and initialize the dquot buffer for this in-core dquot. */ +static int +xfs_qm_dqread_alloc( + struct xfs_mount *mp, + struct xfs_dquot *dqp, + struct xfs_buf **bpp) +{ + struct xfs_trans *tp; + struct xfs_buf *bp; + int error; + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_dqalloc, + XFS_QM_DQALLOC_SPACE_RES(mp), 0, 0, &tp); + if (error) + goto err; + + error = xfs_dquot_disk_alloc(&tp, dqp, &bp); + if (error) + goto err_cancel; + + error = xfs_trans_commit(tp); + if (error) { + /* + * Buffer was held to the transaction, so we have to unlock it + * manually here because we're not passing it back. + */ + xfs_buf_relse(bp); + goto err; + } + *bpp = bp; + return 0; + +err_cancel: + xfs_trans_cancel(tp); +err: + return error; +} + +/* + * Read in the ondisk dquot using dqtobp() then copy it to an incore version, + * and release the buffer immediately. If @can_alloc is true, fill any + * holes in the on-disk metadata. + */ +static int +xfs_qm_dqread( + struct xfs_mount *mp, + xfs_dqid_t id, + uint type, + bool can_alloc, + struct xfs_dquot **dqpp) +{ + struct xfs_dquot *dqp; + struct xfs_buf *bp; + int error; + + dqp = xfs_dquot_alloc(mp, id, type); + trace_xfs_dqread(dqp); + + /* Try to read the buffer, allocating if necessary. */ + error = xfs_dquot_disk_read(mp, dqp, &bp); + if (error == -ENOENT && can_alloc) + error = xfs_qm_dqread_alloc(mp, dqp, &bp); + if (error) + goto err; /* - * We got the buffer with a xfs_trans_read_buf() (in dqtobp()) - * So we need to release with xfs_trans_brelse(). - * The strategy here is identical to that of inodes; we lock - * the dquot in xfs_qm_dqget() before making it accessible to - * others. This is because dquots, like inodes, need a good level of - * concurrency, and we don't want to take locks on the entire buffers - * for dquot accesses. - * Note also that the dquot buffer may even be dirty at this point, if - * this particular dquot was repaired. We still aren't afraid to - * brelse it because we have the changes incore. + * At this point we should have a clean locked buffer. Copy the data + * to the incore dquot and release the buffer since the incore dquot + * has its own locking protocol so we needn't tie up the buffer any + * further. */ ASSERT(xfs_buf_islocked(bp)); - xfs_trans_brelse(tp, bp); + xfs_dquot_from_disk(dqp, bp); - if (tp) { - error = xfs_trans_commit(tp); - if (error) - goto error0; - } - - *O_dqpp = dqp; + xfs_buf_relse(bp); + *dqpp = dqp; return error; -error1: - if (tp) - xfs_trans_cancel(tp); -error0: +err: + trace_xfs_dqread_fail(dqp); xfs_qm_dqdestroy(dqp); - *O_dqpp = NULL; + *dqpp = NULL; return error; } @@ -679,77 +681,230 @@ xfs_dq_get_next_id( } /* - * Given the file system, inode OR id, and type (UDQUOT/GDQUOT), return a - * a locked dquot, doing an allocation (if requested) as needed. - * When both an inode and an id are given, the inode's id takes precedence. - * That is, if the id changes while we don't hold the ilock inside this - * function, the new dquot is returned, not necessarily the one requested - * in the id argument. + * Look up the dquot in the in-core cache. If found, the dquot is returned + * locked and ready to go. + */ +static struct xfs_dquot * +xfs_qm_dqget_cache_lookup( + struct xfs_mount *mp, + struct xfs_quotainfo *qi, + struct radix_tree_root *tree, + xfs_dqid_t id) +{ + struct xfs_dquot *dqp; + +restart: + mutex_lock(&qi->qi_tree_lock); + dqp = radix_tree_lookup(tree, id); + if (!dqp) { + mutex_unlock(&qi->qi_tree_lock); + XFS_STATS_INC(mp, xs_qm_dqcachemisses); + return NULL; + } + + xfs_dqlock(dqp); + if (dqp->dq_flags & XFS_DQ_FREEING) { + xfs_dqunlock(dqp); + mutex_unlock(&qi->qi_tree_lock); + trace_xfs_dqget_freeing(dqp); + delay(1); + goto restart; + } + + dqp->q_nrefs++; + mutex_unlock(&qi->qi_tree_lock); + + trace_xfs_dqget_hit(dqp); + XFS_STATS_INC(mp, xs_qm_dqcachehits); + return dqp; +} + +/* + * Try to insert a new dquot into the in-core cache. If an error occurs the + * caller should throw away the dquot and start over. Otherwise, the dquot + * is returned locked (and held by the cache) as if there had been a cache + * hit. + */ +static int +xfs_qm_dqget_cache_insert( + struct xfs_mount *mp, + struct xfs_quotainfo *qi, + struct radix_tree_root *tree, + xfs_dqid_t id, + struct xfs_dquot *dqp) +{ + int error; + + mutex_lock(&qi->qi_tree_lock); + error = radix_tree_insert(tree, id, dqp); + if (unlikely(error)) { + /* Duplicate found! Caller must try again. */ + WARN_ON(error != -EEXIST); + mutex_unlock(&qi->qi_tree_lock); + trace_xfs_dqget_dup(dqp); + return error; + } + + /* Return a locked dquot to the caller, with a reference taken. */ + xfs_dqlock(dqp); + dqp->q_nrefs = 1; + + qi->qi_dquots++; + mutex_unlock(&qi->qi_tree_lock); + + return 0; +} + +/* Check our input parameters. */ +static int +xfs_qm_dqget_checks( + struct xfs_mount *mp, + uint type) +{ + if (WARN_ON_ONCE(!XFS_IS_QUOTA_RUNNING(mp))) + return -ESRCH; + + switch (type) { + case XFS_DQ_USER: + if (!XFS_IS_UQUOTA_ON(mp)) + return -ESRCH; + return 0; + case XFS_DQ_GROUP: + if (!XFS_IS_GQUOTA_ON(mp)) + return -ESRCH; + return 0; + case XFS_DQ_PROJ: + if (!XFS_IS_PQUOTA_ON(mp)) + return -ESRCH; + return 0; + default: + WARN_ON_ONCE(0); + return -EINVAL; + } +} + +/* + * Given the file system, id, and type (UDQUOT/GDQUOT), return a a locked + * dquot, doing an allocation (if requested) as needed. */ int xfs_qm_dqget( - xfs_mount_t *mp, - xfs_inode_t *ip, /* locked inode (optional) */ - xfs_dqid_t id, /* uid/projid/gid depending on type */ - uint type, /* XFS_DQ_USER/XFS_DQ_PROJ/XFS_DQ_GROUP */ - uint flags, /* DQALLOC, DQSUSER, DQREPAIR, DOWARN */ - xfs_dquot_t **O_dqpp) /* OUT : locked incore dquot */ + struct xfs_mount *mp, + xfs_dqid_t id, + uint type, + bool can_alloc, + struct xfs_dquot **O_dqpp) { struct xfs_quotainfo *qi = mp->m_quotainfo; - struct radix_tree_root *tree = xfs_dquot_tree(qi, type); + struct radix_tree_root *tree = xfs_dquot_tree(qi, type); struct xfs_dquot *dqp; int error; - ASSERT(XFS_IS_QUOTA_RUNNING(mp)); - if ((! XFS_IS_UQUOTA_ON(mp) && type == XFS_DQ_USER) || - (! XFS_IS_PQUOTA_ON(mp) && type == XFS_DQ_PROJ) || - (! XFS_IS_GQUOTA_ON(mp) && type == XFS_DQ_GROUP)) { - return -ESRCH; + error = xfs_qm_dqget_checks(mp, type); + if (error) + return error; + +restart: + dqp = xfs_qm_dqget_cache_lookup(mp, qi, tree, id); + if (dqp) { + *O_dqpp = dqp; + return 0; } - ASSERT(type == XFS_DQ_USER || - type == XFS_DQ_PROJ || - type == XFS_DQ_GROUP); - if (ip) { - ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ASSERT(xfs_inode_dquot(ip, type) == NULL); + error = xfs_qm_dqread(mp, id, type, can_alloc, &dqp); + if (error) + return error; + + error = xfs_qm_dqget_cache_insert(mp, qi, tree, id, dqp); + if (error) { + /* + * Duplicate found. Just throw away the new dquot and start + * over. + */ + xfs_qm_dqdestroy(dqp); + XFS_STATS_INC(mp, xs_qm_dquot_dups); + goto restart; } -restart: - mutex_lock(&qi->qi_tree_lock); - dqp = radix_tree_lookup(tree, id); - if (dqp) { - xfs_dqlock(dqp); - if (dqp->dq_flags & XFS_DQ_FREEING) { - xfs_dqunlock(dqp); - mutex_unlock(&qi->qi_tree_lock); - trace_xfs_dqget_freeing(dqp); - delay(1); - goto restart; - } + trace_xfs_dqget_miss(dqp); + *O_dqpp = dqp; + return 0; +} - /* uninit / unused quota found in radix tree, keep looking */ - if (flags & XFS_QMOPT_DQNEXT) { - if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) { - xfs_dqunlock(dqp); - mutex_unlock(&qi->qi_tree_lock); - error = xfs_dq_get_next_id(mp, type, &id); - if (error) - return error; - goto restart; - } - } +/* + * Given a dquot id and type, read and initialize a dquot from the on-disk + * metadata. This function is only for use during quota initialization so + * it ignores the dquot cache assuming that the dquot shrinker isn't set up. + * The caller is responsible for _qm_dqdestroy'ing the returned dquot. + */ +int +xfs_qm_dqget_uncached( + struct xfs_mount *mp, + xfs_dqid_t id, + uint type, + struct xfs_dquot **dqpp) +{ + int error; - dqp->q_nrefs++; - mutex_unlock(&qi->qi_tree_lock); + error = xfs_qm_dqget_checks(mp, type); + if (error) + return error; + + return xfs_qm_dqread(mp, id, type, 0, dqpp); +} + +/* Return the quota id for a given inode and type. */ +xfs_dqid_t +xfs_qm_id_for_quotatype( + struct xfs_inode *ip, + uint type) +{ + switch (type) { + case XFS_DQ_USER: + return ip->i_d.di_uid; + case XFS_DQ_GROUP: + return ip->i_d.di_gid; + case XFS_DQ_PROJ: + return xfs_get_projid(ip); + } + ASSERT(0); + return 0; +} + +/* + * Return the dquot for a given inode and type. If @can_alloc is true, then + * allocate blocks if needed. The inode's ILOCK must be held and it must not + * have already had an inode attached. + */ +int +xfs_qm_dqget_inode( + struct xfs_inode *ip, + uint type, + bool can_alloc, + struct xfs_dquot **O_dqpp) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_quotainfo *qi = mp->m_quotainfo; + struct radix_tree_root *tree = xfs_dquot_tree(qi, type); + struct xfs_dquot *dqp; + xfs_dqid_t id; + int error; - trace_xfs_dqget_hit(dqp); - XFS_STATS_INC(mp, xs_qm_dqcachehits); + error = xfs_qm_dqget_checks(mp, type); + if (error) + return error; + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(xfs_inode_dquot(ip, type) == NULL); + + id = xfs_qm_id_for_quotatype(ip, type); + +restart: + dqp = xfs_qm_dqget_cache_lookup(mp, qi, tree, id); + if (dqp) { *O_dqpp = dqp; return 0; } - mutex_unlock(&qi->qi_tree_lock); - XFS_STATS_INC(mp, xs_qm_dqcachemisses); /* * Dquot cache miss. We don't want to keep the inode lock across @@ -758,87 +913,81 @@ restart: * lock here means dealing with a chown that can happen before * we re-acquire the lock. */ - if (ip) - xfs_iunlock(ip, XFS_ILOCK_EXCL); - - error = xfs_qm_dqread(mp, id, type, flags, &dqp); - - if (ip) - xfs_ilock(ip, XFS_ILOCK_EXCL); - - /* If we are asked to find next active id, keep looking */ - if (error == -ENOENT && (flags & XFS_QMOPT_DQNEXT)) { - error = xfs_dq_get_next_id(mp, type, &id); - if (!error) - goto restart; - } - + xfs_iunlock(ip, XFS_ILOCK_EXCL); + error = xfs_qm_dqread(mp, id, type, can_alloc, &dqp); + xfs_ilock(ip, XFS_ILOCK_EXCL); if (error) return error; - if (ip) { - /* - * A dquot could be attached to this inode by now, since - * we had dropped the ilock. - */ - if (xfs_this_quota_on(mp, type)) { - struct xfs_dquot *dqp1; - - dqp1 = xfs_inode_dquot(ip, type); - if (dqp1) { - xfs_qm_dqdestroy(dqp); - dqp = dqp1; - xfs_dqlock(dqp); - goto dqret; - } - } else { - /* inode stays locked on return */ + /* + * A dquot could be attached to this inode by now, since we had + * dropped the ilock. + */ + if (xfs_this_quota_on(mp, type)) { + struct xfs_dquot *dqp1; + + dqp1 = xfs_inode_dquot(ip, type); + if (dqp1) { xfs_qm_dqdestroy(dqp); - return -ESRCH; + dqp = dqp1; + xfs_dqlock(dqp); + goto dqret; } + } else { + /* inode stays locked on return */ + xfs_qm_dqdestroy(dqp); + return -ESRCH; } - mutex_lock(&qi->qi_tree_lock); - error = radix_tree_insert(tree, id, dqp); - if (unlikely(error)) { - WARN_ON(error != -EEXIST); - + error = xfs_qm_dqget_cache_insert(mp, qi, tree, id, dqp); + if (error) { /* * Duplicate found. Just throw away the new dquot and start * over. */ - mutex_unlock(&qi->qi_tree_lock); - trace_xfs_dqget_dup(dqp); xfs_qm_dqdestroy(dqp); XFS_STATS_INC(mp, xs_qm_dquot_dups); goto restart; } - /* - * We return a locked dquot to the caller, with a reference taken - */ - xfs_dqlock(dqp); - dqp->q_nrefs = 1; +dqret: + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + trace_xfs_dqget_miss(dqp); + *O_dqpp = dqp; + return 0; +} - qi->qi_dquots++; - mutex_unlock(&qi->qi_tree_lock); +/* + * Starting at @id and progressing upwards, look for an initialized incore + * dquot, lock it, and return it. + */ +int +xfs_qm_dqget_next( + struct xfs_mount *mp, + xfs_dqid_t id, + uint type, + struct xfs_dquot **dqpp) +{ + struct xfs_dquot *dqp; + int error = 0; - /* If we are asked to find next active id, keep looking */ - if (flags & XFS_QMOPT_DQNEXT) { - if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) { - xfs_qm_dqput(dqp); - error = xfs_dq_get_next_id(mp, type, &id); - if (error) - return error; - goto restart; + *dqpp = NULL; + for (; !error; error = xfs_dq_get_next_id(mp, type, &id)) { + error = xfs_qm_dqget(mp, id, type, false, &dqp); + if (error == -ENOENT) + continue; + else if (error != 0) + break; + + if (!XFS_IS_DQUOT_UNINITIALIZED(dqp)) { + *dqpp = dqp; + return 0; } + + xfs_qm_dqput(dqp); } - dqret: - ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL)); - trace_xfs_dqget_miss(dqp); - *O_dqpp = dqp; - return 0; + return error; } /* @@ -913,9 +1062,9 @@ xfs_qm_dqflush_done( * since it's cheaper, and then we recheck while * holding the lock before removing the dquot from the AIL. */ - if ((lip->li_flags & XFS_LI_IN_AIL) && + if (test_bit(XFS_LI_IN_AIL, &lip->li_flags) && ((lip->li_lsn == qip->qli_flush_lsn) || - (lip->li_flags & XFS_LI_FAILED))) { + test_bit(XFS_LI_FAILED, &lip->li_flags))) { /* xfs_trans_ail_delete() drops the AIL lock. */ spin_lock(&ailp->ail_lock); @@ -926,8 +1075,7 @@ xfs_qm_dqflush_done( * Clear the failed state since we are about to drop the * flush lock */ - if (lip->li_flags & XFS_LI_FAILED) - xfs_clear_li_failed(lip); + xfs_clear_li_failed(lip); spin_unlock(&ailp->ail_lock); } } @@ -953,6 +1101,7 @@ xfs_qm_dqflush( { struct xfs_mount *mp = dqp->q_mount; struct xfs_buf *bp; + struct xfs_dqblk *dqb; struct xfs_disk_dquot *ddqp; xfs_failaddr_t fa; int error; @@ -996,12 +1145,13 @@ xfs_qm_dqflush( /* * Calculate the location of the dquot inside the buffer. */ - ddqp = bp->b_addr + dqp->q_bufoffset; + dqb = bp->b_addr + dqp->q_bufoffset; + ddqp = &dqb->dd_diskdq; /* - * A simple sanity check in case we got a corrupted dquot.. + * A simple sanity check in case we got a corrupted dquot. */ - fa = xfs_dquot_verify(mp, &dqp->q_core, be32_to_cpu(ddqp->d_id), 0, 0); + fa = xfs_dqblk_verify(mp, dqb, be32_to_cpu(ddqp->d_id), 0); if (fa) { xfs_alert(mp, "corrupt dquot ID 0x%x in memory at %pS", be32_to_cpu(ddqp->d_id), fa); @@ -1032,8 +1182,6 @@ xfs_qm_dqflush( * of a dquot without an up-to-date CRC getting to disk. */ if (xfs_sb_version_hascrc(&mp->m_sb)) { - struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddqp; - dqb->dd_lsn = cpu_to_be64(dqp->q_logitem.qli_item.li_lsn); xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF); @@ -1119,3 +1267,35 @@ xfs_qm_exit(void) kmem_zone_destroy(xfs_qm_dqtrxzone); kmem_zone_destroy(xfs_qm_dqzone); } + +/* + * Iterate every dquot of a particular type. The caller must ensure that the + * particular quota type is active. iter_fn can return negative error codes, + * or XFS_BTREE_QUERY_RANGE_ABORT to indicate that it wants to stop iterating. + */ +int +xfs_qm_dqiterate( + struct xfs_mount *mp, + uint dqtype, + xfs_qm_dqiterate_fn iter_fn, + void *priv) +{ + struct xfs_dquot *dq; + xfs_dqid_t id = 0; + int error; + + do { + error = xfs_qm_dqget_next(mp, id, dqtype, &dq); + if (error == -ENOENT) + return 0; + if (error) + return error; + + error = iter_fn(dq, dqtype, priv); + id = be32_to_cpu(dq->q_core.d_id); + xfs_qm_dqput(dq); + id++; + } while (error == 0 && id != 0); + + return error; +} diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 2f536f33cd26..bdd6bd921528 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h @@ -160,8 +160,6 @@ static inline bool xfs_dquot_lowsp(struct xfs_dquot *dqp) #define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ) #define XFS_QM_ISGDQ(dqp) ((dqp)->dq_flags & XFS_DQ_GROUP) -extern int xfs_qm_dqread(struct xfs_mount *, xfs_dqid_t, uint, - uint, struct xfs_dquot **); extern void xfs_qm_dqdestroy(xfs_dquot_t *); extern int xfs_qm_dqflush(struct xfs_dquot *, struct xfs_buf **); extern void xfs_qm_dqunpin_wait(xfs_dquot_t *); @@ -169,8 +167,19 @@ extern void xfs_qm_adjust_dqtimers(xfs_mount_t *, xfs_disk_dquot_t *); extern void xfs_qm_adjust_dqlimits(struct xfs_mount *, struct xfs_dquot *); -extern int xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *, - xfs_dqid_t, uint, uint, xfs_dquot_t **); +extern xfs_dqid_t xfs_qm_id_for_quotatype(struct xfs_inode *ip, + uint type); +extern int xfs_qm_dqget(struct xfs_mount *mp, xfs_dqid_t id, + uint type, bool can_alloc, + struct xfs_dquot **dqpp); +extern int xfs_qm_dqget_inode(struct xfs_inode *ip, uint type, + bool can_alloc, + struct xfs_dquot **dqpp); +extern int xfs_qm_dqget_next(struct xfs_mount *mp, xfs_dqid_t id, + uint type, struct xfs_dquot **dqpp); +extern int xfs_qm_dqget_uncached(struct xfs_mount *mp, + xfs_dqid_t id, uint type, + struct xfs_dquot **dqpp); extern void xfs_qm_dqput(xfs_dquot_t *); extern void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *); @@ -185,4 +194,9 @@ static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) return dqp; } +typedef int (*xfs_qm_dqiterate_fn)(struct xfs_dquot *dq, uint dqtype, + void *priv); +int xfs_qm_dqiterate(struct xfs_mount *mp, uint dqtype, + xfs_qm_dqiterate_fn iter_fn, void *priv); + #endif /* __XFS_DQUOT_H__ */ diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c index 4b331e354da7..8eb7415474d6 100644 --- a/fs/xfs/xfs_dquot_item.c +++ b/fs/xfs/xfs_dquot_item.c @@ -173,7 +173,7 @@ xfs_qm_dquot_logitem_push( * The buffer containing this item failed to be written back * previously. Resubmit the buffer for IO */ - if (lip->li_flags & XFS_LI_FAILED) { + if (test_bit(XFS_LI_FAILED, &lip->li_flags)) { if (!xfs_buf_trylock(bp)) return XFS_ITEM_LOCKED; @@ -209,10 +209,7 @@ xfs_qm_dquot_logitem_push( spin_unlock(&lip->li_ailp->ail_lock); error = xfs_qm_dqflush(dqp, &bp); - if (error) { - xfs_warn(dqp->q_mount, "%s: push error %d on dqp "PTR_FMT, - __func__, error, dqp); - } else { + if (!error) { if (!xfs_buf_delwri_queue(bp, buffer_list)) rval = XFS_ITEM_FLUSHING; xfs_buf_relse(bp); diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index a63f5083f497..7975634cb8fe 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -61,6 +61,7 @@ static unsigned int xfs_errortag_random_default[] = { XFS_RANDOM_LOG_BAD_CRC, XFS_RANDOM_LOG_ITEM_PIN, XFS_RANDOM_BUF_LRU_REF, + XFS_RANDOM_FORCE_SCRUB_REPAIR, }; struct xfs_errortag_attr { @@ -167,6 +168,7 @@ XFS_ERRORTAG_ATTR_RW(drop_writes, XFS_ERRTAG_DROP_WRITES); XFS_ERRORTAG_ATTR_RW(log_bad_crc, XFS_ERRTAG_LOG_BAD_CRC); XFS_ERRORTAG_ATTR_RW(log_item_pin, XFS_ERRTAG_LOG_ITEM_PIN); XFS_ERRORTAG_ATTR_RW(buf_lru_ref, XFS_ERRTAG_BUF_LRU_REF); +XFS_ERRORTAG_ATTR_RW(force_repair, XFS_ERRTAG_FORCE_SCRUB_REPAIR); static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(noerror), @@ -201,6 +203,7 @@ static struct attribute *xfs_errortag_attrs[] = { XFS_ERRORTAG_ATTR_LIST(log_bad_crc), XFS_ERRORTAG_ATTR_LIST(log_item_pin), XFS_ERRORTAG_ATTR_LIST(buf_lru_ref), + XFS_ERRORTAG_ATTR_LIST(force_repair), NULL, }; diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index b5b1e567b9f4..a889b550979a 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -168,7 +168,7 @@ STATIC void xfs_efi_item_unlock( struct xfs_log_item *lip) { - if (lip->li_flags & XFS_LI_ABORTED) + if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) xfs_efi_release(EFI_ITEM(lip)); } @@ -402,7 +402,7 @@ xfs_efd_item_unlock( { struct xfs_efd_log_item *efdp = EFD_ITEM(lip); - if (lip->li_flags & XFS_LI_ABORTED) { + if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) { xfs_efi_release(efdp->efd_efip); xfs_efd_item_free(efdp); } @@ -542,7 +542,7 @@ xfs_efi_recover( for (i = 0; i < efip->efi_format.efi_nextents; i++) { extp = &efip->efi_format.efi_extents[i]; error = xfs_trans_free_extent(tp, efdp, extp->ext_start, - extp->ext_len, &oinfo); + extp->ext_len, &oinfo, false); if (error) goto abort_error; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index e70fb8ccecea..0e3fb8978344 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -414,6 +414,12 @@ xfs_dio_write_end_io( if (size <= 0) return size; + /* + * Capture amount written on completion as we can't reliably account + * for it on submission. + */ + XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size); + if (flags & IOMAP_DIO_COW) { error = xfs_reflink_end_cow(ip, offset, size); if (error) @@ -599,7 +605,16 @@ xfs_file_dax_write( } out: xfs_iunlock(ip, iolock); - return error ? error : ret; + if (error) + return error; + + if (ret > 0) { + XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret); + + /* Handle various SYNC-type writes */ + ret = generic_write_sync(iocb, ret); + } + return ret; } STATIC ssize_t @@ -669,6 +684,12 @@ write_retry: out: if (iolock) xfs_iunlock(ip, iolock); + + if (ret > 0) { + XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret); + /* Handle various SYNC-type writes */ + ret = generic_write_sync(iocb, ret); + } return ret; } @@ -693,8 +714,9 @@ xfs_file_write_iter( return -EIO; if (IS_DAX(inode)) - ret = xfs_file_dax_write(iocb, from); - else if (iocb->ki_flags & IOCB_DIRECT) { + return xfs_file_dax_write(iocb, from); + + if (iocb->ki_flags & IOCB_DIRECT) { /* * Allow a directio write to fall back to a buffered * write *only* in the case that we're doing a reflink @@ -702,20 +724,11 @@ xfs_file_write_iter( * allow an operation to fall back to buffered mode. */ ret = xfs_file_dio_aio_write(iocb, from); - if (ret == -EREMCHG) - goto buffered; - } else { -buffered: - ret = xfs_file_buffered_aio_write(iocb, from); + if (ret != -EREMCHG) + return ret; } - if (ret > 0) { - XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret); - - /* Handle various SYNC-type writes */ - ret = generic_write_sync(iocb, ret); - } - return ret; + return xfs_file_buffered_aio_write(iocb, from); } #define XFS_FALLOC_FL_SUPPORTED \ @@ -1007,7 +1020,7 @@ xfs_file_llseek( * page_lock (MM) * i_lock (XFS - extent map serialisation) */ -static int +static vm_fault_t __xfs_filemap_fault( struct vm_fault *vmf, enum page_entry_size pe_size, @@ -1015,7 +1028,7 @@ __xfs_filemap_fault( { struct inode *inode = file_inode(vmf->vma->vm_file); struct xfs_inode *ip = XFS_I(inode); - int ret; + vm_fault_t ret; trace_xfs_filemap_fault(ip, pe_size, write_fault); @@ -1044,7 +1057,7 @@ __xfs_filemap_fault( return ret; } -static int +static vm_fault_t xfs_filemap_fault( struct vm_fault *vmf) { @@ -1054,7 +1067,7 @@ xfs_filemap_fault( (vmf->flags & FAULT_FLAG_WRITE)); } -static int +static vm_fault_t xfs_filemap_huge_fault( struct vm_fault *vmf, enum page_entry_size pe_size) @@ -1067,7 +1080,7 @@ xfs_filemap_huge_fault( (vmf->flags & FAULT_FLAG_WRITE)); } -static int +static vm_fault_t xfs_filemap_page_mkwrite( struct vm_fault *vmf) { @@ -1079,7 +1092,7 @@ xfs_filemap_page_mkwrite( * on write faults. In reality, it needs to serialise against truncate and * prepare memory for writing so handle is as standard write fault. */ -static int +static vm_fault_t xfs_filemap_pfn_mkwrite( struct vm_fault *vmf) { diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 43cfc07996a4..0299febece9c 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -465,10 +465,9 @@ xfs_getfsmap_rtdev_rtbitmap_helper( struct xfs_rmap_irec irec; xfs_daddr_t rec_daddr; - rec_daddr = XFS_FSB_TO_BB(mp, rec->ar_startblock); - - irec.rm_startblock = rec->ar_startblock; - irec.rm_blockcount = rec->ar_blockcount; + irec.rm_startblock = rec->ar_startext * mp->m_sb.sb_rextsize; + rec_daddr = XFS_FSB_TO_BB(mp, irec.rm_startblock); + irec.rm_blockcount = rec->ar_extcount * mp->m_sb.sb_rextsize; irec.rm_owner = XFS_RMAP_OWN_NULL; /* "free" */ irec.rm_offset = 0; irec.rm_flags = 0; @@ -534,8 +533,11 @@ xfs_getfsmap_rtdev_rtbitmap_query( xfs_ilock(tp->t_mountp->m_rbmip, XFS_ILOCK_SHARED); - alow.ar_startblock = info->low.rm_startblock; - ahigh.ar_startblock = info->high.rm_startblock; + alow.ar_startext = info->low.rm_startblock; + ahigh.ar_startext = info->high.rm_startblock; + do_div(alow.ar_startext, tp->t_mountp->m_sb.sb_rextsize); + if (do_div(ahigh.ar_startext, tp->t_mountp->m_sb.sb_rextsize)) + ahigh.ar_startext++; error = xfs_rtalloc_query_range(tp, &alow, &ahigh, xfs_getfsmap_rtdev_rtbitmap_helper, info); if (error) diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 523792768080..bc7ef18da243 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -24,85 +24,42 @@ #include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_defer.h" -#include "xfs_da_format.h" -#include "xfs_da_btree.h" -#include "xfs_inode.h" #include "xfs_trans.h" -#include "xfs_inode_item.h" #include "xfs_error.h" #include "xfs_btree.h" -#include "xfs_alloc_btree.h" #include "xfs_alloc.h" -#include "xfs_rmap_btree.h" -#include "xfs_ialloc.h" #include "xfs_fsops.h" -#include "xfs_itable.h" #include "xfs_trans_space.h" #include "xfs_rtalloc.h" #include "xfs_trace.h" #include "xfs_log.h" -#include "xfs_filestream.h" -#include "xfs_rmap.h" +#include "xfs_ag.h" #include "xfs_ag_resv.h" /* - * File system operations + * growfs operations */ - -static struct xfs_buf * -xfs_growfs_get_hdr_buf( - struct xfs_mount *mp, - xfs_daddr_t blkno, - size_t numblks, - int flags, - const struct xfs_buf_ops *ops) -{ - struct xfs_buf *bp; - - bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, flags); - if (!bp) - return NULL; - - xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); - bp->b_bn = blkno; - bp->b_maps[0].bm_bn = blkno; - bp->b_ops = ops; - - return bp; -} - static int xfs_growfs_data_private( xfs_mount_t *mp, /* mount point for filesystem */ xfs_growfs_data_t *in) /* growfs data input struct */ { - xfs_agf_t *agf; - struct xfs_agfl *agfl; - xfs_agi_t *agi; - xfs_agnumber_t agno; - xfs_extlen_t agsize; - xfs_extlen_t tmpsize; - xfs_alloc_rec_t *arec; xfs_buf_t *bp; - int bucket; - int dpct; - int error, saved_error = 0; + int error; xfs_agnumber_t nagcount; xfs_agnumber_t nagimax = 0; xfs_rfsblock_t nb, nb_mod; xfs_rfsblock_t new; - xfs_rfsblock_t nfree; xfs_agnumber_t oagcount; - int pct; xfs_trans_t *tp; + LIST_HEAD (buffer_list); + struct aghdr_init_data id = {}; nb = in->newblocks; - pct = in->imaxpct; - if (nb < mp->m_sb.sb_dblocks || pct < 0 || pct > 100) + if (nb < mp->m_sb.sb_dblocks) return -EINVAL; if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb))) return error; - dpct = pct - mp->m_sb.sb_imax_pct; error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL); @@ -135,376 +92,45 @@ xfs_growfs_data_private( return error; /* - * Write new AG headers to disk. Non-transactional, but written - * synchronously so they are completed prior to the growfs transaction - * being logged. + * Write new AG headers to disk. Non-transactional, but need to be + * written and completed prior to the growfs transaction being logged. + * To do this, we use a delayed write buffer list and wait for + * submission and IO completion of the list as a whole. This allows the + * IO subsystem to merge all the AG headers in a single AG into a single + * IO and hide most of the latency of the IO from us. + * + * This also means that if we get an error whilst building the buffer + * list to write, we can cancel the entire list without having written + * anything. */ - nfree = 0; - for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) { - __be32 *agfl_bno; - - /* - * AG freespace header block - */ - bp = xfs_growfs_get_hdr_buf(mp, - XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, - &xfs_agf_buf_ops); - if (!bp) { - error = -ENOMEM; - goto error0; - } - - agf = XFS_BUF_TO_AGF(bp); - agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); - agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION); - agf->agf_seqno = cpu_to_be32(agno); - if (agno == nagcount - 1) - agsize = - nb - - (agno * (xfs_rfsblock_t)mp->m_sb.sb_agblocks); + INIT_LIST_HEAD(&id.buffer_list); + for (id.agno = nagcount - 1; + id.agno >= oagcount; + id.agno--, new -= id.agsize) { + + if (id.agno == nagcount - 1) + id.agsize = nb - + (id.agno * (xfs_rfsblock_t)mp->m_sb.sb_agblocks); else - agsize = mp->m_sb.sb_agblocks; - agf->agf_length = cpu_to_be32(agsize); - agf->agf_roots[XFS_BTNUM_BNOi] = cpu_to_be32(XFS_BNO_BLOCK(mp)); - agf->agf_roots[XFS_BTNUM_CNTi] = cpu_to_be32(XFS_CNT_BLOCK(mp)); - agf->agf_levels[XFS_BTNUM_BNOi] = cpu_to_be32(1); - agf->agf_levels[XFS_BTNUM_CNTi] = cpu_to_be32(1); - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { - agf->agf_roots[XFS_BTNUM_RMAPi] = - cpu_to_be32(XFS_RMAP_BLOCK(mp)); - agf->agf_levels[XFS_BTNUM_RMAPi] = cpu_to_be32(1); - agf->agf_rmap_blocks = cpu_to_be32(1); - } - - agf->agf_flfirst = cpu_to_be32(1); - agf->agf_fllast = 0; - agf->agf_flcount = 0; - tmpsize = agsize - mp->m_ag_prealloc_blocks; - agf->agf_freeblks = cpu_to_be32(tmpsize); - agf->agf_longest = cpu_to_be32(tmpsize); - if (xfs_sb_version_hascrc(&mp->m_sb)) - uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid); - if (xfs_sb_version_hasreflink(&mp->m_sb)) { - agf->agf_refcount_root = cpu_to_be32( - xfs_refc_block(mp)); - agf->agf_refcount_level = cpu_to_be32(1); - agf->agf_refcount_blocks = cpu_to_be32(1); - } - - error = xfs_bwrite(bp); - xfs_buf_relse(bp); - if (error) - goto error0; - - /* - * AG freelist header block - */ - bp = xfs_growfs_get_hdr_buf(mp, - XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, - &xfs_agfl_buf_ops); - if (!bp) { - error = -ENOMEM; - goto error0; - } - - agfl = XFS_BUF_TO_AGFL(bp); - if (xfs_sb_version_hascrc(&mp->m_sb)) { - agfl->agfl_magicnum = cpu_to_be32(XFS_AGFL_MAGIC); - agfl->agfl_seqno = cpu_to_be32(agno); - uuid_copy(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid); - } - - agfl_bno = XFS_BUF_TO_AGFL_BNO(mp, bp); - for (bucket = 0; bucket < xfs_agfl_size(mp); bucket++) - agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK); - - error = xfs_bwrite(bp); - xfs_buf_relse(bp); - if (error) - goto error0; - - /* - * AG inode header block - */ - bp = xfs_growfs_get_hdr_buf(mp, - XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), - XFS_FSS_TO_BB(mp, 1), 0, - &xfs_agi_buf_ops); - if (!bp) { - error = -ENOMEM; - goto error0; - } - - agi = XFS_BUF_TO_AGI(bp); - agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); - agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION); - agi->agi_seqno = cpu_to_be32(agno); - agi->agi_length = cpu_to_be32(agsize); - agi->agi_count = 0; - agi->agi_root = cpu_to_be32(XFS_IBT_BLOCK(mp)); - agi->agi_level = cpu_to_be32(1); - agi->agi_freecount = 0; - agi->agi_newino = cpu_to_be32(NULLAGINO); - agi->agi_dirino = cpu_to_be32(NULLAGINO); - if (xfs_sb_version_hascrc(&mp->m_sb)) - uuid_copy(&agi->agi_uuid, &mp->m_sb.sb_meta_uuid); - if (xfs_sb_version_hasfinobt(&mp->m_sb)) { - agi->agi_free_root = cpu_to_be32(XFS_FIBT_BLOCK(mp)); - agi->agi_free_level = cpu_to_be32(1); - } - for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) - agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); - - error = xfs_bwrite(bp); - xfs_buf_relse(bp); - if (error) - goto error0; + id.agsize = mp->m_sb.sb_agblocks; - /* - * BNO btree root block - */ - bp = xfs_growfs_get_hdr_buf(mp, - XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0, - &xfs_allocbt_buf_ops); - - if (!bp) { - error = -ENOMEM; - goto error0; - } - - xfs_btree_init_block(mp, bp, XFS_BTNUM_BNO, 0, 1, agno, 0); - - arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); - arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks); - arec->ar_blockcount = cpu_to_be32( - agsize - be32_to_cpu(arec->ar_startblock)); - - error = xfs_bwrite(bp); - xfs_buf_relse(bp); - if (error) - goto error0; - - /* - * CNT btree root block - */ - bp = xfs_growfs_get_hdr_buf(mp, - XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0, - &xfs_allocbt_buf_ops); - if (!bp) { - error = -ENOMEM; - goto error0; - } - - xfs_btree_init_block(mp, bp, XFS_BTNUM_CNT, 0, 1, agno, 0); - - arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); - arec->ar_startblock = cpu_to_be32(mp->m_ag_prealloc_blocks); - arec->ar_blockcount = cpu_to_be32( - agsize - be32_to_cpu(arec->ar_startblock)); - nfree += be32_to_cpu(arec->ar_blockcount); - - error = xfs_bwrite(bp); - xfs_buf_relse(bp); - if (error) - goto error0; - - /* RMAP btree root block */ - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) { - struct xfs_rmap_rec *rrec; - struct xfs_btree_block *block; - - bp = xfs_growfs_get_hdr_buf(mp, - XFS_AGB_TO_DADDR(mp, agno, XFS_RMAP_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0, - &xfs_rmapbt_buf_ops); - if (!bp) { - error = -ENOMEM; - goto error0; - } - - xfs_btree_init_block(mp, bp, XFS_BTNUM_RMAP, 0, 0, - agno, 0); - block = XFS_BUF_TO_BLOCK(bp); - - - /* - * mark the AG header regions as static metadata The BNO - * btree block is the first block after the headers, so - * it's location defines the size of region the static - * metadata consumes. - * - * Note: unlike mkfs, we never have to account for log - * space when growing the data regions - */ - rrec = XFS_RMAP_REC_ADDR(block, 1); - rrec->rm_startblock = 0; - rrec->rm_blockcount = cpu_to_be32(XFS_BNO_BLOCK(mp)); - rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_FS); - rrec->rm_offset = 0; - be16_add_cpu(&block->bb_numrecs, 1); - - /* account freespace btree root blocks */ - rrec = XFS_RMAP_REC_ADDR(block, 2); - rrec->rm_startblock = cpu_to_be32(XFS_BNO_BLOCK(mp)); - rrec->rm_blockcount = cpu_to_be32(2); - rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG); - rrec->rm_offset = 0; - be16_add_cpu(&block->bb_numrecs, 1); - - /* account inode btree root blocks */ - rrec = XFS_RMAP_REC_ADDR(block, 3); - rrec->rm_startblock = cpu_to_be32(XFS_IBT_BLOCK(mp)); - rrec->rm_blockcount = cpu_to_be32(XFS_RMAP_BLOCK(mp) - - XFS_IBT_BLOCK(mp)); - rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_INOBT); - rrec->rm_offset = 0; - be16_add_cpu(&block->bb_numrecs, 1); - - /* account for rmap btree root */ - rrec = XFS_RMAP_REC_ADDR(block, 4); - rrec->rm_startblock = cpu_to_be32(XFS_RMAP_BLOCK(mp)); - rrec->rm_blockcount = cpu_to_be32(1); - rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_AG); - rrec->rm_offset = 0; - be16_add_cpu(&block->bb_numrecs, 1); - - /* account for refc btree root */ - if (xfs_sb_version_hasreflink(&mp->m_sb)) { - rrec = XFS_RMAP_REC_ADDR(block, 5); - rrec->rm_startblock = cpu_to_be32( - xfs_refc_block(mp)); - rrec->rm_blockcount = cpu_to_be32(1); - rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_REFC); - rrec->rm_offset = 0; - be16_add_cpu(&block->bb_numrecs, 1); - } - - error = xfs_bwrite(bp); - xfs_buf_relse(bp); - if (error) - goto error0; - } - - /* - * INO btree root block - */ - bp = xfs_growfs_get_hdr_buf(mp, - XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0, - &xfs_inobt_buf_ops); - if (!bp) { - error = -ENOMEM; - goto error0; - } - - xfs_btree_init_block(mp, bp, XFS_BTNUM_INO , 0, 0, agno, 0); - - error = xfs_bwrite(bp); - xfs_buf_relse(bp); - if (error) - goto error0; - - /* - * FINO btree root block - */ - if (xfs_sb_version_hasfinobt(&mp->m_sb)) { - bp = xfs_growfs_get_hdr_buf(mp, - XFS_AGB_TO_DADDR(mp, agno, XFS_FIBT_BLOCK(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0, - &xfs_inobt_buf_ops); - if (!bp) { - error = -ENOMEM; - goto error0; - } - - xfs_btree_init_block(mp, bp, XFS_BTNUM_FINO, - 0, 0, agno, 0); - - error = xfs_bwrite(bp); - xfs_buf_relse(bp); - if (error) - goto error0; - } - - /* - * refcount btree root block - */ - if (xfs_sb_version_hasreflink(&mp->m_sb)) { - bp = xfs_growfs_get_hdr_buf(mp, - XFS_AGB_TO_DADDR(mp, agno, xfs_refc_block(mp)), - BTOBB(mp->m_sb.sb_blocksize), 0, - &xfs_refcountbt_buf_ops); - if (!bp) { - error = -ENOMEM; - goto error0; - } - - xfs_btree_init_block(mp, bp, XFS_BTNUM_REFC, - 0, 0, agno, 0); - - error = xfs_bwrite(bp); - xfs_buf_relse(bp); - if (error) - goto error0; - } - } - xfs_trans_agblocks_delta(tp, nfree); - /* - * There are new blocks in the old last a.g. - */ - if (new) { - struct xfs_owner_info oinfo; - - /* - * Change the agi length. - */ - error = xfs_ialloc_read_agi(mp, tp, agno, &bp); - if (error) { - goto error0; - } - ASSERT(bp); - agi = XFS_BUF_TO_AGI(bp); - be32_add_cpu(&agi->agi_length, new); - ASSERT(nagcount == oagcount || - be32_to_cpu(agi->agi_length) == mp->m_sb.sb_agblocks); - xfs_ialloc_log_agi(tp, bp, XFS_AGI_LENGTH); - /* - * Change agf length. - */ - error = xfs_alloc_read_agf(mp, tp, agno, 0, &bp); + error = xfs_ag_init_headers(mp, &id); if (error) { - goto error0; + xfs_buf_delwri_cancel(&id.buffer_list); + goto out_trans_cancel; } - ASSERT(bp); - agf = XFS_BUF_TO_AGF(bp); - be32_add_cpu(&agf->agf_length, new); - ASSERT(be32_to_cpu(agf->agf_length) == - be32_to_cpu(agi->agi_length)); + } + error = xfs_buf_delwri_submit(&id.buffer_list); + if (error) + goto out_trans_cancel; - xfs_alloc_log_agf(tp, bp, XFS_AGF_LENGTH); + xfs_trans_agblocks_delta(tp, id.nfree); - /* - * Free the new space. - * - * XFS_RMAP_OWN_NULL is used here to tell the rmap btree that - * this doesn't actually exist in the rmap btree. - */ - xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_NULL); - error = xfs_rmap_free(tp, bp, agno, - be32_to_cpu(agf->agf_length) - new, - new, &oinfo); - if (error) - goto error0; - error = xfs_free_extent(tp, - XFS_AGB_TO_FSB(mp, agno, - be32_to_cpu(agf->agf_length) - new), - new, &oinfo, XFS_AG_RESV_NONE); + /* If there are new blocks in the old last AG, extend it. */ + if (new) { + error = xfs_ag_extend_space(mp, tp, &id, new); if (error) - goto error0; + goto out_trans_cancel; } /* @@ -517,10 +143,8 @@ xfs_growfs_data_private( if (nb > mp->m_sb.sb_dblocks) xfs_trans_mod_sb(tp, XFS_TRANS_SB_DBLOCKS, nb - mp->m_sb.sb_dblocks); - if (nfree) - xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, nfree); - if (dpct) - xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct); + if (id.nfree) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, id.nfree); xfs_trans_set_sync(tp); error = xfs_trans_commit(tp); if (error) @@ -529,12 +153,6 @@ xfs_growfs_data_private( /* New allocation groups fully initialized, so update mount struct */ if (nagimax) mp->m_maxagi = nagimax; - if (mp->m_sb.sb_imax_pct) { - uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct; - do_div(icount, 100); - mp->m_maxicount = icount << mp->m_sb.sb_inopblog; - } else - mp->m_maxicount = 0; xfs_set_low_space_thresholds(mp); mp->m_alloc_set_aside = xfs_alloc_set_aside(mp); @@ -545,73 +163,24 @@ xfs_growfs_data_private( if (new) { struct xfs_perag *pag; - pag = xfs_perag_get(mp, agno); + pag = xfs_perag_get(mp, id.agno); error = xfs_ag_resv_free(pag); xfs_perag_put(pag); if (error) - goto out; + return error; } - /* Reserve AG metadata blocks. */ + /* + * Reserve AG metadata blocks. ENOSPC here does not mean there was a + * growfs failure, just that there still isn't space for new user data + * after the grow has been run. + */ error = xfs_fs_reserve_ag_blocks(mp); - if (error && error != -ENOSPC) - goto out; - - /* update secondary superblocks. */ - for (agno = 1; agno < nagcount; agno++) { + if (error == -ENOSPC) error = 0; - /* - * new secondary superblocks need to be zeroed, not read from - * disk as the contents of the new area we are growing into is - * completely unknown. - */ - if (agno < oagcount) { - error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, - XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), - XFS_FSS_TO_BB(mp, 1), 0, &bp, - &xfs_sb_buf_ops); - } else { - bp = xfs_trans_get_buf(NULL, mp->m_ddev_targp, - XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), - XFS_FSS_TO_BB(mp, 1), 0); - if (bp) { - bp->b_ops = &xfs_sb_buf_ops; - xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); - } else - error = -ENOMEM; - } - - /* - * If we get an error reading or writing alternate superblocks, - * continue. xfs_repair chooses the "best" superblock based - * on most matches; if we break early, we'll leave more - * superblocks un-updated than updated, and xfs_repair may - * pick them over the properly-updated primary. - */ - if (error) { - xfs_warn(mp, - "error %d reading secondary superblock for ag %d", - error, agno); - saved_error = error; - continue; - } - xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb); - - error = xfs_bwrite(bp); - xfs_buf_relse(bp); - if (error) { - xfs_warn(mp, - "write error %d updating secondary superblock for ag %d", - error, agno); - saved_error = error; - continue; - } - } - - out: - return saved_error ? saved_error : error; + return error; - error0: +out_trans_cancel: xfs_trans_cancel(tp); return error; } @@ -638,25 +207,71 @@ xfs_growfs_log_private( return -ENOSYS; } +static int +xfs_growfs_imaxpct( + struct xfs_mount *mp, + __u32 imaxpct) +{ + struct xfs_trans *tp; + int dpct; + int error; + + if (imaxpct > 100) + return -EINVAL; + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata, + XFS_GROWFS_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp); + if (error) + return error; + + dpct = imaxpct - mp->m_sb.sb_imax_pct; + xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct); + xfs_trans_set_sync(tp); + return xfs_trans_commit(tp); +} + /* * protected versions of growfs function acquire and release locks on the mount * point - exported through ioctls: XFS_IOC_FSGROWFSDATA, XFS_IOC_FSGROWFSLOG, * XFS_IOC_FSGROWFSRT */ - - int xfs_growfs_data( - xfs_mount_t *mp, - xfs_growfs_data_t *in) + struct xfs_mount *mp, + struct xfs_growfs_data *in) { - int error; + int error = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (!mutex_trylock(&mp->m_growlock)) return -EWOULDBLOCK; - error = xfs_growfs_data_private(mp, in); + + /* update imaxpct separately to the physical grow of the filesystem */ + if (in->imaxpct != mp->m_sb.sb_imax_pct) { + error = xfs_growfs_imaxpct(mp, in->imaxpct); + if (error) + goto out_error; + } + + if (in->newblocks != mp->m_sb.sb_dblocks) { + error = xfs_growfs_data_private(mp, in); + if (error) + goto out_error; + } + + /* Post growfs calculations needed to reflect new state in operations */ + if (mp->m_sb.sb_imax_pct) { + uint64_t icount = mp->m_sb.sb_dblocks * mp->m_sb.sb_imax_pct; + do_div(icount, 100); + mp->m_maxicount = icount << mp->m_sb.sb_inopblog; + } else + mp->m_maxicount = 0; + + /* Update secondary superblocks now the physical grow has completed */ + error = xfs_update_secondary_sbs(mp); + +out_error: /* * Increment the generation unconditionally, the error could be from * updating the secondary superblocks, in which case the new size diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index 3e1cc3001bcb..fdde17a2333c 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -47,6 +47,7 @@ xfs_param_t xfs_params = { struct xfs_globals xfs_globals = { .log_recovery_delay = 0, /* no delay by default */ + .mount_delay = 0, /* no delay by default */ #ifdef XFS_ASSERT_FATAL .bug_on_assert = true, /* assert failures BUG() */ #else diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 9a18f69f6e96..164350d91efc 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -107,7 +107,8 @@ xfs_inode_free_callback( xfs_idestroy_fork(ip, XFS_COW_FORK); if (ip->i_itemp) { - ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL)); + ASSERT(!test_bit(XFS_LI_IN_AIL, + &ip->i_itemp->ili_item.li_flags)); xfs_inode_item_destroy(ip); ip->i_itemp = NULL; } @@ -309,6 +310,46 @@ xfs_reinit_inode( } /* + * If we are allocating a new inode, then check what was returned is + * actually a free, empty inode. If we are not allocating an inode, + * then check we didn't find a free inode. + * + * Returns: + * 0 if the inode free state matches the lookup context + * -ENOENT if the inode is free and we are not allocating + * -EFSCORRUPTED if there is any state mismatch at all + */ +static int +xfs_iget_check_free_state( + struct xfs_inode *ip, + int flags) +{ + if (flags & XFS_IGET_CREATE) { + /* should be a free inode */ + if (VFS_I(ip)->i_mode != 0) { + xfs_warn(ip->i_mount, +"Corruption detected! Free inode 0x%llx not marked free! (mode 0x%x)", + ip->i_ino, VFS_I(ip)->i_mode); + return -EFSCORRUPTED; + } + + if (ip->i_d.di_nblocks != 0) { + xfs_warn(ip->i_mount, +"Corruption detected! Free inode 0x%llx has blocks allocated!", + ip->i_ino); + return -EFSCORRUPTED; + } + return 0; + } + + /* should be an allocated inode */ + if (VFS_I(ip)->i_mode == 0) + return -ENOENT; + + return 0; +} + +/* * Check the validity of the inode we just found it the cache */ static int @@ -357,12 +398,12 @@ xfs_iget_cache_hit( } /* - * If lookup is racing with unlink return an error immediately. + * Check the inode free state is valid. This also detects lookup + * racing with unlinks. */ - if (VFS_I(ip)->i_mode == 0 && !(flags & XFS_IGET_CREATE)) { - error = -ENOENT; + error = xfs_iget_check_free_state(ip, flags); + if (error) goto out_error; - } /* * If IRECLAIMABLE is set, we've torn down the VFS inode already. @@ -485,29 +526,12 @@ xfs_iget_cache_miss( /* - * If we are allocating a new inode, then check what was returned is - * actually a free, empty inode. If we are not allocating an inode, - * the check we didn't find a free inode. + * Check the inode free state is valid. This also detects lookup + * racing with unlinks. */ - if (flags & XFS_IGET_CREATE) { - if (VFS_I(ip)->i_mode != 0) { - xfs_warn(mp, -"Corruption detected! Free inode 0x%llx not marked free on disk", - ino); - error = -EFSCORRUPTED; - goto out_destroy; - } - if (ip->i_d.di_nblocks != 0) { - xfs_warn(mp, -"Corruption detected! Free inode 0x%llx has blocks allocated!", - ino); - error = -EFSCORRUPTED; - goto out_destroy; - } - } else if (VFS_I(ip)->i_mode == 0) { - error = -ENOENT; + error = xfs_iget_check_free_state(ip, flags); + if (error) goto out_destroy; - } /* * Preload the radix tree so we can insert safely under the @@ -1802,3 +1826,21 @@ xfs_inode_clear_cowblocks_tag( return __xfs_inode_clear_blocks_tag(ip, trace_xfs_perag_clear_cowblocks, XFS_ICI_COWBLOCKS_TAG); } + +/* Disable post-EOF and CoW block auto-reclamation. */ +void +xfs_icache_disable_reclaim( + struct xfs_mount *mp) +{ + cancel_delayed_work_sync(&mp->m_eofblocks_work); + cancel_delayed_work_sync(&mp->m_cowblocks_work); +} + +/* Enable post-EOF and CoW block auto-reclamation. */ +void +xfs_icache_enable_reclaim( + struct xfs_mount *mp) +{ + xfs_queue_eofblocks(mp); + xfs_queue_cowblocks(mp); +} diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index d4a77588eca1..d69a0f5a6a73 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -131,4 +131,7 @@ xfs_fs_eofblocks_from_user( int xfs_icache_inode_is_allocated(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, bool *inuse); +void xfs_icache_disable_reclaim(struct xfs_mount *mp); +void xfs_icache_enable_reclaim(struct xfs_mount *mp); + #endif diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c index 865ad1373e5e..5da9599156ed 100644 --- a/fs/xfs/xfs_icreate_item.c +++ b/fs/xfs/xfs_icreate_item.c @@ -91,7 +91,7 @@ xfs_icreate_item_unlock( { struct xfs_icreate_item *icp = ICR_ITEM(lip); - if (icp->ic_item.li_flags & XFS_LI_ABORTED) + if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) kmem_zone_free(xfs_icreate_zone, icp); return; } @@ -184,5 +184,5 @@ xfs_icreate_log( xfs_trans_add_item(tp, &icp->ic_item); tp->t_flags |= XFS_TRANS_DIRTY; - icp->ic_item.li_desc->lid_flags |= XFS_LID_DIRTY; + set_bit(XFS_LI_DIRTY, &icp->ic_item.li_flags); } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 2b70c8b4cee2..05207a64dd53 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -498,7 +498,7 @@ again: if (!try_lock) { for (j = (i - 1); j >= 0 && !try_lock; j--) { lp = (xfs_log_item_t *)ips[j]->i_itemp; - if (lp && (lp->li_flags & XFS_LI_IN_AIL)) + if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags)) try_lock++; } } @@ -598,7 +598,7 @@ xfs_lock_two_inodes( * and try again. */ lp = (xfs_log_item_t *)ip0->i_itemp; - if (lp && (lp->li_flags & XFS_LI_IN_AIL)) { + if (lp && test_bit(XFS_LI_IN_AIL, &lp->li_flags)) { if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(ip1_mode, 1))) { xfs_iunlock(ip0, ip0_mode); if ((++attempts % 5) == 0) @@ -791,6 +791,18 @@ xfs_ialloc( ASSERT(*ialloc_context == NULL); /* + * Protect against obviously corrupt allocation btree records. Later + * xfs_iget checks will catch re-allocation of other active in-memory + * and on-disk inodes. If we don't catch reallocating the parent inode + * here we will deadlock in xfs_iget() so we have to do these checks + * first. + */ + if ((pip && ino == pip->i_ino) || !xfs_verify_dir_ino(mp, ino)) { + xfs_alert(mp, "Allocated a known in-use inode 0x%llx!", ino); + return -EFSCORRUPTED; + } + + /* * Get the in-core inode with the lock held exclusively. * This is because we're setting fields here we need * to prevent others from looking at until we're done. @@ -1196,6 +1208,7 @@ xfs_create( unlock_dp_on_error = true; xfs_defer_init(&dfops, &first_block); + tp->t_agfl_dfops = &dfops; /* * Reserve disk quota and the inode. @@ -1411,11 +1424,11 @@ xfs_link( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - error = xfs_qm_dqattach(sip, 0); + error = xfs_qm_dqattach(sip); if (error) goto std_return; - error = xfs_qm_dqattach(tdp, 0); + error = xfs_qm_dqattach(tdp); if (error) goto std_return; @@ -1451,6 +1464,7 @@ xfs_link( } xfs_defer_init(&dfops, &first_block); + tp->t_agfl_dfops = &dfops; /* * Handle initial link state of O_TMPFILE inode @@ -1534,11 +1548,12 @@ xfs_itruncate_clear_reflink_flags( * dirty on error so that transactions can be easily aborted if possible. */ int -xfs_itruncate_extents( +xfs_itruncate_extents_flags( struct xfs_trans **tpp, struct xfs_inode *ip, int whichfork, - xfs_fsize_t new_size) + xfs_fsize_t new_size, + int flags) { struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp = *tpp; @@ -1561,6 +1576,8 @@ xfs_itruncate_extents( trace_xfs_itruncate_extents_start(ip, new_size); + flags |= xfs_bmapi_aflag(whichfork); + /* * Since it is possible for space to become allocated beyond * the end of the file (in a crash where the space is allocated @@ -1579,12 +1596,9 @@ xfs_itruncate_extents( unmap_len = last_block - first_unmap_block + 1; while (!done) { xfs_defer_init(&dfops, &first_block); - error = xfs_bunmapi(tp, ip, - first_unmap_block, unmap_len, - xfs_bmapi_aflag(whichfork), - XFS_ITRUNC_MAX_EXTENTS, - &first_block, &dfops, - &done); + error = xfs_bunmapi(tp, ip, first_unmap_block, unmap_len, flags, + XFS_ITRUNC_MAX_EXTENTS, &first_block, + &dfops, &done); if (error) goto out_bmap_cancel; @@ -1811,6 +1825,7 @@ xfs_inactive_ifree( xfs_trans_ijoin(tp, ip, 0); xfs_defer_init(&dfops, &first_block); + tp->t_agfl_dfops = &dfops; error = xfs_ifree(tp, ip, &dfops); if (error) { /* @@ -1911,7 +1926,7 @@ xfs_inactive( ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0)) truncate = 1; - error = xfs_qm_dqattach(ip, 0); + error = xfs_qm_dqattach(ip); if (error) return; @@ -2574,11 +2589,11 @@ xfs_remove( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - error = xfs_qm_dqattach(dp, 0); + error = xfs_qm_dqattach(dp); if (error) goto std_return; - error = xfs_qm_dqattach(ip, 0); + error = xfs_qm_dqattach(ip); if (error) goto std_return; @@ -2647,6 +2662,7 @@ xfs_remove( goto out_trans_cancel; xfs_defer_init(&dfops, &first_block); + tp->t_agfl_dfops = &dfops; error = xfs_dir_removename(tp, dp, name, ip->i_ino, &first_block, &dfops, resblks); if (error) { @@ -3014,6 +3030,7 @@ xfs_rename( } xfs_defer_init(&dfops, &first_block); + tp->t_agfl_dfops = &dfops; /* RENAME_EXCHANGE is unique from here on. */ if (flags & RENAME_EXCHANGE) diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 1eebc53df7d7..00fee6824745 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -415,8 +415,8 @@ uint xfs_ilock_attr_map_shared(struct xfs_inode *); uint xfs_ip2xflags(struct xfs_inode *); int xfs_ifree(struct xfs_trans *, xfs_inode_t *, struct xfs_defer_ops *); -int xfs_itruncate_extents(struct xfs_trans **, struct xfs_inode *, - int, xfs_fsize_t); +int xfs_itruncate_extents_flags(struct xfs_trans **, + struct xfs_inode *, int, xfs_fsize_t, int); void xfs_iext_realloc(xfs_inode_t *, int, int); void xfs_iunpin_wait(xfs_inode_t *); @@ -433,6 +433,16 @@ int xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t, xfs_nlink_t, dev_t, prid_t, struct xfs_inode **); +static inline int +xfs_itruncate_extents( + struct xfs_trans **tpp, + struct xfs_inode *ip, + int whichfork, + xfs_fsize_t new_size) +{ + return xfs_itruncate_extents_flags(tpp, ip, whichfork, new_size, 0); +} + /* from xfs_file.c */ enum xfs_prealloc_flags { XFS_PREALLOC_SET = (1 << 1), diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 34b91b789702..3e5b8574818e 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -518,7 +518,7 @@ xfs_inode_item_push( * The buffer containing this item failed to be written back * previously. Resubmit the buffer for IO. */ - if (lip->li_flags & XFS_LI_FAILED) { + if (test_bit(XFS_LI_FAILED, &lip->li_flags)) { if (!xfs_buf_trylock(bp)) return XFS_ITEM_LOCKED; @@ -729,14 +729,14 @@ xfs_iflush_done( */ iip = INODE_ITEM(blip); if ((iip->ili_logged && blip->li_lsn == iip->ili_flush_lsn) || - (blip->li_flags & XFS_LI_FAILED)) + test_bit(XFS_LI_FAILED, &blip->li_flags)) need_ail++; } /* make sure we capture the state of the initial inode. */ iip = INODE_ITEM(lip); if ((iip->ili_logged && lip->li_lsn == iip->ili_flush_lsn) || - lip->li_flags & XFS_LI_FAILED) + test_bit(XFS_LI_FAILED, &lip->li_flags)) need_ail++; /* @@ -803,7 +803,7 @@ xfs_iflush_abort( xfs_inode_log_item_t *iip = ip->i_itemp; if (iip) { - if (iip->ili_item.li_flags & XFS_LI_IN_AIL) { + if (test_bit(XFS_LI_IN_AIL, &iip->ili_item.li_flags)) { xfs_trans_ail_remove(&iip->ili_item, stale ? SHUTDOWN_LOG_IO_ERROR : SHUTDOWN_CORRUPT_INCORE); diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 89fb1eb80aae..5dd9e22b4a4c 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1103,7 +1103,8 @@ xfs_ioctl_setattr_dax_invalidate( if (fa->fsx_xflags & FS_XFLAG_DAX) { if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) return -EINVAL; - if (bdev_dax_supported(sb, sb->s_blocksize) < 0) + if (!bdev_dax_supported(xfs_find_bdev_for_inode(VFS_I(ip)), + sb->s_blocksize)) return -EINVAL; } @@ -1811,6 +1812,88 @@ xfs_ioc_swapext( return error; } +static int +xfs_ioc_getlabel( + struct xfs_mount *mp, + char __user *user_label) +{ + struct xfs_sb *sbp = &mp->m_sb; + char label[XFSLABEL_MAX + 1]; + + /* Paranoia */ + BUILD_BUG_ON(sizeof(sbp->sb_fname) > FSLABEL_MAX); + + spin_lock(&mp->m_sb_lock); + strncpy(label, sbp->sb_fname, sizeof(sbp->sb_fname)); + spin_unlock(&mp->m_sb_lock); + + /* xfs on-disk label is 12 chars, be sure we send a null to user */ + label[XFSLABEL_MAX] = '\0'; + if (copy_to_user(user_label, label, sizeof(sbp->sb_fname))) + return -EFAULT; + return 0; +} + +static int +xfs_ioc_setlabel( + struct file *filp, + struct xfs_mount *mp, + char __user *newlabel) +{ + struct xfs_sb *sbp = &mp->m_sb; + char label[XFSLABEL_MAX + 1]; + size_t len; + int error; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + /* + * The generic ioctl allows up to FSLABEL_MAX chars, but XFS is much + * smaller, at 12 bytes. We copy one more to be sure we find the + * (required) NULL character to test the incoming label length. + * NB: The on disk label doesn't need to be null terminated. + */ + if (copy_from_user(label, newlabel, XFSLABEL_MAX + 1)) + return -EFAULT; + len = strnlen(label, XFSLABEL_MAX + 1); + if (len > sizeof(sbp->sb_fname)) + return -EINVAL; + + error = mnt_want_write_file(filp); + if (error) + return error; + + spin_lock(&mp->m_sb_lock); + memset(sbp->sb_fname, 0, sizeof(sbp->sb_fname)); + strncpy(sbp->sb_fname, label, sizeof(sbp->sb_fname)); + spin_unlock(&mp->m_sb_lock); + + /* + * Now we do several things to satisfy userspace. + * In addition to normal logging of the primary superblock, we also + * immediately write these changes to sector zero for the primary, then + * update all backup supers (as xfs_db does for a label change), then + * invalidate the block device page cache. This is so that any prior + * buffered reads from userspace (i.e. from blkid) are invalidated, + * and userspace will see the newly-written label. + */ + error = xfs_sync_sb_buf(mp); + if (error) + goto out; + /* + * growfs also updates backup supers so lock against that. + */ + mutex_lock(&mp->m_growlock); + error = xfs_update_secondary_sbs(mp); + mutex_unlock(&mp->m_growlock); + + invalidate_bdev(mp->m_ddev_targp->bt_bdev); + +out: + mnt_drop_write_file(filp); + return error; +} + /* * Note: some of the ioctl's return positive numbers as a * byte count indicating success, such as readlink_by_handle. @@ -1834,6 +1917,10 @@ xfs_file_ioctl( switch (cmd) { case FITRIM: return xfs_ioc_trim(mp, arg); + case FS_IOC_GETFSLABEL: + return xfs_ioc_getlabel(mp, arg); + case FS_IOC_SETFSLABEL: + return xfs_ioc_setlabel(filp, mp, arg); case XFS_IOC_ALLOCSP: case XFS_IOC_FREESP: case XFS_IOC_RESVSP: diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 046469fcc1b8..c6ce6f9335b6 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -224,7 +224,7 @@ xfs_iomap_write_direct( * necessary and move on to transaction setup. */ xfs_iunlock(ip, lockmode); - error = xfs_qm_dqattach(ip, 0); + error = xfs_qm_dqattach(ip); if (error) return error; @@ -576,7 +576,7 @@ xfs_file_iomap_begin_delay( goto done; } - error = xfs_qm_dqattach_locked(ip, 0); + error = xfs_qm_dqattach_locked(ip, false); if (error) goto out_unlock; @@ -692,7 +692,7 @@ xfs_iomap_write_allocate( /* * Make sure that the dquots are there. */ - error = xfs_qm_dqattach(ip, 0); + error = xfs_qm_dqattach(ip); if (error) return error; @@ -946,8 +946,11 @@ error_on_bmapi_transaction: return error; } -static inline bool imap_needs_alloc(struct inode *inode, - struct xfs_bmbt_irec *imap, int nimaps) +static inline bool +imap_needs_alloc( + struct inode *inode, + struct xfs_bmbt_irec *imap, + int nimaps) { return !nimaps || imap->br_startblock == HOLESTARTBLOCK || @@ -955,31 +958,58 @@ static inline bool imap_needs_alloc(struct inode *inode, (IS_DAX(inode) && imap->br_state == XFS_EXT_UNWRITTEN); } -static inline bool needs_cow_for_zeroing(struct xfs_bmbt_irec *imap, int nimaps) +static inline bool +needs_cow_for_zeroing( + struct xfs_bmbt_irec *imap, + int nimaps) { return nimaps && imap->br_startblock != HOLESTARTBLOCK && imap->br_state != XFS_EXT_UNWRITTEN; } -static inline bool need_excl_ilock(struct xfs_inode *ip, unsigned flags) +static int +xfs_ilock_for_iomap( + struct xfs_inode *ip, + unsigned flags, + unsigned *lockmode) { + unsigned mode = XFS_ILOCK_SHARED; + /* * COW writes may allocate delalloc space or convert unwritten COW * extents, so we need to make sure to take the lock exclusively here. */ - if (xfs_is_reflink_inode(ip) && (flags & (IOMAP_WRITE | IOMAP_ZERO))) - return true; + if (xfs_is_reflink_inode(ip) && (flags & (IOMAP_WRITE | IOMAP_ZERO))) { + /* + * FIXME: It could still overwrite on unshared extents and not + * need allocation. + */ + if (flags & IOMAP_NOWAIT) + return -EAGAIN; + mode = XFS_ILOCK_EXCL; + } /* - * Extents not yet cached requires exclusive access, don't block. - * This is an opencoded xfs_ilock_data_map_shared() to cater for the + * Extents not yet cached requires exclusive access, don't block. This + * is an opencoded xfs_ilock_data_map_shared() call but with * non-blocking behaviour. */ - if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE && - !(ip->i_df.if_flags & XFS_IFEXTENTS)) - return true; - return false; + if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) { + if (flags & IOMAP_NOWAIT) + return -EAGAIN; + mode = XFS_ILOCK_EXCL; + } + + if (flags & IOMAP_NOWAIT) { + if (!xfs_ilock_nowait(ip, mode)) + return -EAGAIN; + } else { + xfs_ilock(ip, mode); + } + + *lockmode = mode; + return 0; } static int @@ -1007,19 +1037,15 @@ xfs_file_iomap_begin( return xfs_file_iomap_begin_delay(inode, offset, length, iomap); } - if (need_excl_ilock(ip, flags)) - lockmode = XFS_ILOCK_EXCL; - else - lockmode = XFS_ILOCK_SHARED; - - if (flags & IOMAP_NOWAIT) { - if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) - return -EAGAIN; - if (!xfs_ilock_nowait(ip, lockmode)) - return -EAGAIN; - } else { - xfs_ilock(ip, lockmode); - } + /* + * Lock the inode in the manner required for the specified operation and + * check for as many conditions that would result in blocking as + * possible. This removes most of the non-blocking checks from the + * mapping code below. + */ + error = xfs_ilock_for_iomap(ip, flags, &lockmode); + if (error) + return error; ASSERT(offset <= mp->m_super->s_maxbytes); if (offset > mp->m_super->s_maxbytes - length) @@ -1040,19 +1066,21 @@ xfs_file_iomap_begin( goto out_unlock; } - if (xfs_is_reflink_inode(ip) && - ((flags & IOMAP_WRITE) || - ((flags & IOMAP_ZERO) && needs_cow_for_zeroing(&imap, nimaps)))) { + /* Non-modifying mapping requested, so we are done */ + if (!(flags & (IOMAP_WRITE | IOMAP_ZERO))) + goto out_found; + + /* + * Break shared extents if necessary. Checks for non-blocking IO have + * been done up front, so we don't need to do them here. + */ + if (xfs_is_reflink_inode(ip)) { + /* if zeroing doesn't need COW allocation, then we are done. */ + if ((flags & IOMAP_ZERO) && + !needs_cow_for_zeroing(&imap, nimaps)) + goto out_found; + if (flags & IOMAP_DIRECT) { - /* - * A reflinked inode will result in CoW alloc. - * FIXME: It could still overwrite on unshared extents - * and not need allocation. - */ - if (flags & IOMAP_NOWAIT) { - error = -EAGAIN; - goto out_unlock; - } /* may drop and re-acquire the ilock */ error = xfs_reflink_allocate_cow(ip, &imap, &shared, &lockmode); @@ -1068,46 +1096,45 @@ xfs_file_iomap_begin( length = XFS_FSB_TO_B(mp, end_fsb) - offset; } - if ((flags & IOMAP_WRITE) && imap_needs_alloc(inode, &imap, nimaps)) { - /* - * If nowait is set bail since we are going to make - * allocations. - */ - if (flags & IOMAP_NOWAIT) { - error = -EAGAIN; - goto out_unlock; - } - /* - * We cap the maximum length we map here to MAX_WRITEBACK_PAGES - * pages to keep the chunks of work done where somewhat symmetric - * with the work writeback does. This is a completely arbitrary - * number pulled out of thin air as a best guess for initial - * testing. - * - * Note that the values needs to be less than 32-bits wide until - * the lower level functions are updated. - */ - length = min_t(loff_t, length, 1024 * PAGE_SIZE); - /* - * xfs_iomap_write_direct() expects the shared lock. It - * is unlocked on return. - */ - if (lockmode == XFS_ILOCK_EXCL) - xfs_ilock_demote(ip, lockmode); - error = xfs_iomap_write_direct(ip, offset, length, &imap, - nimaps); - if (error) - return error; + /* Don't need to allocate over holes when doing zeroing operations. */ + if (flags & IOMAP_ZERO) + goto out_found; - iomap->flags = IOMAP_F_NEW; - trace_xfs_iomap_alloc(ip, offset, length, 0, &imap); - } else { - ASSERT(nimaps); + if (!imap_needs_alloc(inode, &imap, nimaps)) + goto out_found; - xfs_iunlock(ip, lockmode); - trace_xfs_iomap_found(ip, offset, length, 0, &imap); + /* If nowait is set bail since we are going to make allocations. */ + if (flags & IOMAP_NOWAIT) { + error = -EAGAIN; + goto out_unlock; } + /* + * We cap the maximum length we map to a sane size to keep the chunks + * of work done where somewhat symmetric with the work writeback does. + * This is a completely arbitrary number pulled out of thin air as a + * best guess for initial testing. + * + * Note that the values needs to be less than 32-bits wide until the + * lower level functions are updated. + */ + length = min_t(loff_t, length, 1024 * PAGE_SIZE); + + /* + * xfs_iomap_write_direct() expects the shared lock. It is unlocked on + * return. + */ + if (lockmode == XFS_ILOCK_EXCL) + xfs_ilock_demote(ip, lockmode); + error = xfs_iomap_write_direct(ip, offset, length, &imap, + nimaps); + if (error) + return error; + + iomap->flags = IOMAP_F_NEW; + trace_xfs_iomap_alloc(ip, offset, length, 0, &imap); + +out_finish: if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) iomap->flags |= IOMAP_F_DIRTY; @@ -1117,6 +1144,13 @@ xfs_file_iomap_begin( if (shared) iomap->flags |= IOMAP_F_SHARED; return 0; + +out_found: + ASSERT(nimaps); + xfs_iunlock(ip, lockmode); + trace_xfs_iomap_found(ip, offset, length, 0, &imap); + goto out_finish; + out_unlock: xfs_iunlock(ip, lockmode); return error; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index df42e4cb4dc4..b0eb49bb4918 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -855,7 +855,7 @@ xfs_setattr_size( /* * Make sure that the dquots are attached to the inode. */ - error = xfs_qm_dqattach(ip, 0); + error = xfs_qm_dqattach(ip); if (error) return error; @@ -1195,6 +1195,30 @@ static const struct inode_operations xfs_inline_symlink_inode_operations = { .update_time = xfs_vn_update_time, }; +/* Figure out if this file actually supports DAX. */ +static bool +xfs_inode_supports_dax( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + + /* Only supported on non-reflinked files. */ + if (!S_ISREG(VFS_I(ip)->i_mode) || xfs_is_reflink_inode(ip)) + return false; + + /* DAX mount option or DAX iflag must be set. */ + if (!(mp->m_flags & XFS_MOUNT_DAX) && + !(ip->i_d.di_flags2 & XFS_DIFLAG2_DAX)) + return false; + + /* Block size must match page size */ + if (mp->m_sb.sb_blocksize != PAGE_SIZE) + return false; + + /* Device has to support DAX too. */ + return xfs_find_daxdev_for_inode(VFS_I(ip)) != NULL; +} + STATIC void xfs_diflags_to_iflags( struct inode *inode, @@ -1213,11 +1237,7 @@ xfs_diflags_to_iflags( inode->i_flags |= S_SYNC; if (flags & XFS_DIFLAG_NOATIME) inode->i_flags |= S_NOATIME; - if (S_ISREG(inode->i_mode) && - ip->i_mount->m_sb.sb_blocksize == PAGE_SIZE && - !xfs_is_reflink_inode(ip) && - (ip->i_mount->m_flags & XFS_MOUNT_DAX || - ip->i_d.di_flags2 & XFS_DIFLAG2_DAX)) + if (xfs_inode_supports_dax(ip)) inode->i_flags |= S_DAX; } diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 2fcd9ed5d075..c21039f27e39 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1047,6 +1047,7 @@ xfs_log_item_init( INIT_LIST_HEAD(&item->li_ail); INIT_LIST_HEAD(&item->li_cil); INIT_LIST_HEAD(&item->li_bio_list); + INIT_LIST_HEAD(&item->li_trans); } /* @@ -2110,10 +2111,10 @@ xlog_print_tic_res( */ void xlog_print_trans( - struct xfs_trans *tp) + struct xfs_trans *tp) { - struct xfs_mount *mp = tp->t_mountp; - struct xfs_log_item_desc *lidp; + struct xfs_mount *mp = tp->t_mountp; + struct xfs_log_item *lip; /* dump core transaction and ticket info */ xfs_warn(mp, "transaction summary:"); @@ -2124,15 +2125,14 @@ xlog_print_trans( xlog_print_tic_res(mp, tp->t_ticket); /* dump each log item */ - list_for_each_entry(lidp, &tp->t_items, lid_trans) { - struct xfs_log_item *lip = lidp->lid_item; + list_for_each_entry(lip, &tp->t_items, li_trans) { struct xfs_log_vec *lv = lip->li_lv; struct xfs_log_iovec *vec; int i; xfs_warn(mp, "log item: "); xfs_warn(mp, " type = 0x%x", lip->li_type); - xfs_warn(mp, " flags = 0x%x", lip->li_flags); + xfs_warn(mp, " flags = 0x%lx", lip->li_flags); if (!lv) continue; xfs_warn(mp, " niovecs = %d", lv->lv_niovecs); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 4668403b1741..c15687724728 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -141,10 +141,9 @@ xlog_cil_alloc_shadow_bufs( struct xlog *log, struct xfs_trans *tp) { - struct xfs_log_item_desc *lidp; + struct xfs_log_item *lip; - list_for_each_entry(lidp, &tp->t_items, lid_trans) { - struct xfs_log_item *lip = lidp->lid_item; + list_for_each_entry(lip, &tp->t_items, li_trans) { struct xfs_log_vec *lv; int niovecs = 0; int nbytes = 0; @@ -152,7 +151,7 @@ xlog_cil_alloc_shadow_bufs( bool ordered = false; /* Skip items which aren't dirty in this transaction. */ - if (!(lidp->lid_flags & XFS_LID_DIRTY)) + if (!test_bit(XFS_LI_DIRTY, &lip->li_flags)) continue; /* get number of vecs and size of data to be stored */ @@ -317,7 +316,7 @@ xlog_cil_insert_format_items( int *diff_len, int *diff_iovecs) { - struct xfs_log_item_desc *lidp; + struct xfs_log_item *lip; /* Bail out if we didn't find a log item. */ @@ -326,15 +325,14 @@ xlog_cil_insert_format_items( return; } - list_for_each_entry(lidp, &tp->t_items, lid_trans) { - struct xfs_log_item *lip = lidp->lid_item; + list_for_each_entry(lip, &tp->t_items, li_trans) { struct xfs_log_vec *lv; struct xfs_log_vec *old_lv = NULL; struct xfs_log_vec *shadow; bool ordered = false; /* Skip items which aren't dirty in this transaction. */ - if (!(lidp->lid_flags & XFS_LID_DIRTY)) + if (!test_bit(XFS_LI_DIRTY, &lip->li_flags)) continue; /* @@ -406,7 +404,7 @@ xlog_cil_insert_items( { struct xfs_cil *cil = log->l_cilp; struct xfs_cil_ctx *ctx = cil->xc_ctx; - struct xfs_log_item_desc *lidp; + struct xfs_log_item *lip; int len = 0; int diff_iovecs = 0; int iclog_space; @@ -479,11 +477,10 @@ xlog_cil_insert_items( * We do this here so we only need to take the CIL lock once during * the transaction commit. */ - list_for_each_entry(lidp, &tp->t_items, lid_trans) { - struct xfs_log_item *lip = lidp->lid_item; + list_for_each_entry(lip, &tp->t_items, li_trans) { /* Skip items which aren't dirty in this transaction. */ - if (!(lidp->lid_flags & XFS_LID_DIRTY)) + if (!test_bit(XFS_LI_DIRTY, &lip->li_flags)) continue; /* @@ -1013,6 +1010,7 @@ xfs_log_commit_cil( *commit_lsn = xc_commit_lsn; xfs_log_done(mp, tp->t_ticket, NULL, regrant); + tp->t_ticket = NULL; xfs_trans_unreserve_and_mod_sb(tp); /* diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 2b2383f1895e..06a09cb948b5 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2702,7 +2702,7 @@ xlog_recover_do_reg_buffer( goto next; } fa = xfs_dquot_verify(mp, item->ri_buf[i].i_addr, - -1, 0, 0); + -1, 0); if (fa) { xfs_alert(mp, "dquot corrupt at %pS trying to replay into block 0x%llx", @@ -3348,7 +3348,7 @@ xlog_recover_dquot_pass2( */ dq_f = item->ri_buf[0].i_addr; ASSERT(dq_f); - fa = xfs_dquot_verify(mp, recddq, dq_f->qlf_id, 0, 0); + fa = xfs_dquot_verify(mp, recddq, dq_f->qlf_id, 0); if (fa) { xfs_alert(mp, "corrupt dquot ID 0x%x in log at %pS", dq_f->qlf_id, fa); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index a901b86772f8..73ed8fec0328 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1072,9 +1072,7 @@ xfs_unmountfs( uint64_t resblks; int error; - cancel_delayed_work_sync(&mp->m_eofblocks_work); - cancel_delayed_work_sync(&mp->m_cowblocks_work); - + xfs_icache_disable_reclaim(mp); xfs_fs_unreserve_ag_blocks(mp); xfs_qm_unmount_quotas(mp); xfs_rtunmount_inodes(mp); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index ec39ae274c78..c3e014bfc848 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -161,10 +161,7 @@ xfs_qm_dqpurge( * to purge this dquot anyway, so we go ahead regardless. */ error = xfs_qm_dqflush(dqp, &bp); - if (error) { - xfs_warn(mp, "%s: dquot "PTR_FMT" flush failed", - __func__, dqp); - } else { + if (!error) { error = xfs_bwrite(bp); xfs_buf_relse(bp); } @@ -173,7 +170,7 @@ xfs_qm_dqpurge( ASSERT(atomic_read(&dqp->q_pincount) == 0); ASSERT(XFS_FORCED_SHUTDOWN(mp) || - !(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL)); + !test_bit(XFS_LI_IN_AIL, &dqp->q_logitem.qli_item.li_flags)); xfs_dqfunlock(dqp); xfs_dqunlock(dqp); @@ -265,7 +262,7 @@ xfs_qm_dqattach_one( xfs_inode_t *ip, xfs_dqid_t id, uint type, - uint doalloc, + bool doalloc, xfs_dquot_t **IO_idqpp) { xfs_dquot_t *dqp; @@ -291,7 +288,7 @@ xfs_qm_dqattach_one( * exist on disk and we didn't ask it to allocate; ESRCH if quotas got * turned off suddenly. */ - error = xfs_qm_dqget(ip->i_mount, ip, id, type, doalloc, &dqp); + error = xfs_qm_dqget_inode(ip, type, doalloc, &dqp); if (error) return error; @@ -326,14 +323,14 @@ xfs_qm_need_dqattach( /* * Given a locked inode, attach dquot(s) to it, taking U/G/P-QUOTAON * into account. - * If XFS_QMOPT_DQALLOC, the dquot(s) will be allocated if needed. + * If @doalloc is true, the dquot(s) will be allocated if needed. * Inode may get unlocked and relocked in here, and the caller must deal with * the consequences. */ int xfs_qm_dqattach_locked( xfs_inode_t *ip, - uint flags) + bool doalloc) { xfs_mount_t *mp = ip->i_mount; int error = 0; @@ -345,8 +342,7 @@ xfs_qm_dqattach_locked( if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) { error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER, - flags & XFS_QMOPT_DQALLOC, - &ip->i_udquot); + doalloc, &ip->i_udquot); if (error) goto done; ASSERT(ip->i_udquot); @@ -354,8 +350,7 @@ xfs_qm_dqattach_locked( if (XFS_IS_GQUOTA_ON(mp) && !ip->i_gdquot) { error = xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP, - flags & XFS_QMOPT_DQALLOC, - &ip->i_gdquot); + doalloc, &ip->i_gdquot); if (error) goto done; ASSERT(ip->i_gdquot); @@ -363,8 +358,7 @@ xfs_qm_dqattach_locked( if (XFS_IS_PQUOTA_ON(mp) && !ip->i_pdquot) { error = xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ, - flags & XFS_QMOPT_DQALLOC, - &ip->i_pdquot); + doalloc, &ip->i_pdquot); if (error) goto done; ASSERT(ip->i_pdquot); @@ -381,8 +375,7 @@ done: int xfs_qm_dqattach( - struct xfs_inode *ip, - uint flags) + struct xfs_inode *ip) { int error; @@ -390,7 +383,7 @@ xfs_qm_dqattach( return 0; xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_qm_dqattach_locked(ip, flags); + error = xfs_qm_dqattach_locked(ip, false); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; @@ -479,11 +472,8 @@ xfs_qm_dquot_isolate( spin_unlock(lru_lock); error = xfs_qm_dqflush(dqp, &bp); - if (error) { - xfs_warn(dqp->q_mount, "%s: dquot "PTR_FMT" flush failed", - __func__, dqp); + if (error) goto out_unlock_dirty; - } xfs_buf_delwri_queue(bp, &isol->buffers); xfs_buf_relse(bp); @@ -571,27 +561,88 @@ xfs_qm_set_defquota( { xfs_dquot_t *dqp; struct xfs_def_quota *defq; + struct xfs_disk_dquot *ddqp; int error; - error = xfs_qm_dqread(mp, 0, type, 0, &dqp); + error = xfs_qm_dqget_uncached(mp, 0, type, &dqp); + if (error) + return; - if (!error) { - xfs_disk_dquot_t *ddqp = &dqp->q_core; + ddqp = &dqp->q_core; + defq = xfs_get_defquota(dqp, qinf); - defq = xfs_get_defquota(dqp, qinf); + /* + * Timers and warnings have been already set, let's just set the + * default limits for this quota type + */ + defq->bhardlimit = be64_to_cpu(ddqp->d_blk_hardlimit); + defq->bsoftlimit = be64_to_cpu(ddqp->d_blk_softlimit); + defq->ihardlimit = be64_to_cpu(ddqp->d_ino_hardlimit); + defq->isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit); + defq->rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit); + defq->rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit); + xfs_qm_dqdestroy(dqp); +} - /* - * Timers and warnings have been already set, let's just set the - * default limits for this quota type - */ - defq->bhardlimit = be64_to_cpu(ddqp->d_blk_hardlimit); - defq->bsoftlimit = be64_to_cpu(ddqp->d_blk_softlimit); - defq->ihardlimit = be64_to_cpu(ddqp->d_ino_hardlimit); - defq->isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit); - defq->rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit); - defq->rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit); - xfs_qm_dqdestroy(dqp); - } +/* Initialize quota time limits from the root dquot. */ +static void +xfs_qm_init_timelimits( + struct xfs_mount *mp, + struct xfs_quotainfo *qinf) +{ + struct xfs_disk_dquot *ddqp; + struct xfs_dquot *dqp; + uint type; + int error; + + qinf->qi_btimelimit = XFS_QM_BTIMELIMIT; + qinf->qi_itimelimit = XFS_QM_ITIMELIMIT; + qinf->qi_rtbtimelimit = XFS_QM_RTBTIMELIMIT; + qinf->qi_bwarnlimit = XFS_QM_BWARNLIMIT; + qinf->qi_iwarnlimit = XFS_QM_IWARNLIMIT; + qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT; + + /* + * We try to get the limits from the superuser's limits fields. + * This is quite hacky, but it is standard quota practice. + * + * Since we may not have done a quotacheck by this point, just read + * the dquot without attaching it to any hashtables or lists. + * + * Timers and warnings are globally set by the first timer found in + * user/group/proj quota types, otherwise a default value is used. + * This should be split into different fields per quota type. + */ + if (XFS_IS_UQUOTA_RUNNING(mp)) + type = XFS_DQ_USER; + else if (XFS_IS_GQUOTA_RUNNING(mp)) + type = XFS_DQ_GROUP; + else + type = XFS_DQ_PROJ; + error = xfs_qm_dqget_uncached(mp, 0, type, &dqp); + if (error) + return; + + ddqp = &dqp->q_core; + /* + * The warnings and timers set the grace period given to + * a user or group before he or she can not perform any + * more writing. If it is zero, a default is used. + */ + if (ddqp->d_btimer) + qinf->qi_btimelimit = be32_to_cpu(ddqp->d_btimer); + if (ddqp->d_itimer) + qinf->qi_itimelimit = be32_to_cpu(ddqp->d_itimer); + if (ddqp->d_rtbtimer) + qinf->qi_rtbtimelimit = be32_to_cpu(ddqp->d_rtbtimer); + if (ddqp->d_bwarns) + qinf->qi_bwarnlimit = be16_to_cpu(ddqp->d_bwarns); + if (ddqp->d_iwarns) + qinf->qi_iwarnlimit = be16_to_cpu(ddqp->d_iwarns); + if (ddqp->d_rtbwarns) + qinf->qi_rtbwarnlimit = be16_to_cpu(ddqp->d_rtbwarns); + + xfs_qm_dqdestroy(dqp); } /* @@ -600,11 +651,10 @@ xfs_qm_set_defquota( */ STATIC int xfs_qm_init_quotainfo( - xfs_mount_t *mp) + struct xfs_mount *mp) { - xfs_quotainfo_t *qinf; - int error; - xfs_dquot_t *dqp; + struct xfs_quotainfo *qinf; + int error; ASSERT(XFS_IS_QUOTA_RUNNING(mp)); @@ -636,52 +686,7 @@ xfs_qm_init_quotainfo( mp->m_qflags |= (mp->m_sb.sb_qflags & XFS_ALL_QUOTA_CHKD); - /* - * We try to get the limits from the superuser's limits fields. - * This is quite hacky, but it is standard quota practice. - * - * Since we may not have done a quotacheck by this point, just read - * the dquot without attaching it to any hashtables or lists. - * - * Timers and warnings are globally set by the first timer found in - * user/group/proj quota types, otherwise a default value is used. - * This should be split into different fields per quota type. - */ - error = xfs_qm_dqread(mp, 0, - XFS_IS_UQUOTA_RUNNING(mp) ? XFS_DQ_USER : - (XFS_IS_GQUOTA_RUNNING(mp) ? XFS_DQ_GROUP : - XFS_DQ_PROJ), - 0, &dqp); - - if (!error) { - xfs_disk_dquot_t *ddqp = &dqp->q_core; - - /* - * The warnings and timers set the grace period given to - * a user or group before he or she can not perform any - * more writing. If it is zero, a default is used. - */ - qinf->qi_btimelimit = ddqp->d_btimer ? - be32_to_cpu(ddqp->d_btimer) : XFS_QM_BTIMELIMIT; - qinf->qi_itimelimit = ddqp->d_itimer ? - be32_to_cpu(ddqp->d_itimer) : XFS_QM_ITIMELIMIT; - qinf->qi_rtbtimelimit = ddqp->d_rtbtimer ? - be32_to_cpu(ddqp->d_rtbtimer) : XFS_QM_RTBTIMELIMIT; - qinf->qi_bwarnlimit = ddqp->d_bwarns ? - be16_to_cpu(ddqp->d_bwarns) : XFS_QM_BWARNLIMIT; - qinf->qi_iwarnlimit = ddqp->d_iwarns ? - be16_to_cpu(ddqp->d_iwarns) : XFS_QM_IWARNLIMIT; - qinf->qi_rtbwarnlimit = ddqp->d_rtbwarns ? - be16_to_cpu(ddqp->d_rtbwarns) : XFS_QM_RTBWARNLIMIT; - xfs_qm_dqdestroy(dqp); - } else { - qinf->qi_btimelimit = XFS_QM_BTIMELIMIT; - qinf->qi_itimelimit = XFS_QM_ITIMELIMIT; - qinf->qi_rtbtimelimit = XFS_QM_RTBTIMELIMIT; - qinf->qi_bwarnlimit = XFS_QM_BWARNLIMIT; - qinf->qi_iwarnlimit = XFS_QM_IWARNLIMIT; - qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT; - } + xfs_qm_init_timelimits(mp, qinf); if (XFS_IS_UQUOTA_RUNNING(mp)) xfs_qm_set_defquota(mp, XFS_DQ_USER, qinf); @@ -865,9 +870,9 @@ xfs_qm_reset_dqcounts( * find uninitialised dquot blks. See comment in * xfs_dquot_verify. */ - fa = xfs_dquot_verify(mp, ddq, id + j, type, 0); + fa = xfs_dqblk_verify(mp, &dqb[j], id + j, type); if (fa) - xfs_dquot_repair(mp, ddq, id + j, type); + xfs_dqblk_repair(mp, &dqb[j], id + j, type); /* * Reset type in case we are reusing group quota file for @@ -893,7 +898,7 @@ xfs_qm_reset_dqcounts( } STATIC int -xfs_qm_dqiter_bufs( +xfs_qm_reset_dqcounts_all( struct xfs_mount *mp, xfs_dqid_t firstid, xfs_fsblock_t bno, @@ -961,11 +966,11 @@ xfs_qm_dqiter_bufs( } /* - * Iterate over all allocated USR/GRP/PRJ dquots in the system, calling a - * caller supplied function for every chunk of dquots that we find. + * Iterate over all allocated dquot blocks in this quota inode, zeroing all + * counters for every chunk of dquots that we find. */ STATIC int -xfs_qm_dqiterate( +xfs_qm_reset_dqcounts_buf( struct xfs_mount *mp, struct xfs_inode *qip, uint flags, @@ -1041,7 +1046,7 @@ xfs_qm_dqiterate( * Iterate thru all the blks in the extent and * reset the counters of all the dquots inside them. */ - error = xfs_qm_dqiter_bufs(mp, firstid, + error = xfs_qm_reset_dqcounts_all(mp, firstid, map[i].br_startblock, map[i].br_blockcount, flags, buffer_list); @@ -1066,16 +1071,17 @@ out: STATIC int xfs_qm_quotacheck_dqadjust( struct xfs_inode *ip, - xfs_dqid_t id, uint type, xfs_qcnt_t nblks, xfs_qcnt_t rtblks) { struct xfs_mount *mp = ip->i_mount; struct xfs_dquot *dqp; + xfs_dqid_t id; int error; - error = xfs_qm_dqget(mp, ip, id, type, XFS_QMOPT_DQALLOC, &dqp); + id = xfs_qm_id_for_quotatype(ip, type); + error = xfs_qm_dqget(mp, id, type, true, &dqp); if (error) { /* * Shouldn't be able to turn off quotas here. @@ -1148,13 +1154,10 @@ xfs_qm_dqusage_adjust( } /* - * We don't _need_ to take the ilock EXCL. However, the xfs_qm_dqget - * interface expects the inode to be exclusively locked because that's - * the case in all other instances. It's OK that we do this because - * quotacheck is done only at mount time. + * We don't _need_ to take the ilock EXCL here because quotacheck runs + * at mount time and therefore nobody will be racing chown/chproj. */ - error = xfs_iget(mp, NULL, ino, XFS_IGET_DONTCACHE, XFS_ILOCK_EXCL, - &ip); + error = xfs_iget(mp, NULL, ino, XFS_IGET_DONTCACHE, 0, &ip); if (error) { *res = BULKSTAT_RV_NOTHING; return error; @@ -1189,33 +1192,31 @@ xfs_qm_dqusage_adjust( * and quotaoffs don't race. (Quotachecks happen at mount time only). */ if (XFS_IS_UQUOTA_ON(mp)) { - error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_uid, - XFS_DQ_USER, nblks, rtblks); + error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQ_USER, nblks, + rtblks); if (error) goto error0; } if (XFS_IS_GQUOTA_ON(mp)) { - error = xfs_qm_quotacheck_dqadjust(ip, ip->i_d.di_gid, - XFS_DQ_GROUP, nblks, rtblks); + error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQ_GROUP, nblks, + rtblks); if (error) goto error0; } if (XFS_IS_PQUOTA_ON(mp)) { - error = xfs_qm_quotacheck_dqadjust(ip, xfs_get_projid(ip), - XFS_DQ_PROJ, nblks, rtblks); + error = xfs_qm_quotacheck_dqadjust(ip, XFS_DQ_PROJ, nblks, + rtblks); if (error) goto error0; } - xfs_iunlock(ip, XFS_ILOCK_EXCL); IRELE(ip); *res = BULKSTAT_RV_DIDONE; return 0; error0: - xfs_iunlock(ip, XFS_ILOCK_EXCL); IRELE(ip); *res = BULKSTAT_RV_GIVEUP; return error; @@ -1247,9 +1248,8 @@ xfs_qm_flush_one( */ if (!xfs_dqflock_nowait(dqp)) { /* buf is pinned in-core by delwri list */ - DEFINE_SINGLE_BUF_MAP(map, dqp->q_blkno, - mp->m_quotainfo->qi_dqchunklen); - bp = _xfs_buf_find(mp->m_ddev_targp, &map, 1, 0, NULL); + bp = xfs_buf_incore(mp->m_ddev_targp, dqp->q_blkno, + mp->m_quotainfo->qi_dqchunklen, 0); if (!bp) { error = -EINVAL; goto out_unlock; @@ -1307,7 +1307,7 @@ xfs_qm_quotacheck( * We don't log our changes till later. */ if (uip) { - error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA, + error = xfs_qm_reset_dqcounts_buf(mp, uip, XFS_QMOPT_UQUOTA, &buffer_list); if (error) goto error_return; @@ -1315,7 +1315,7 @@ xfs_qm_quotacheck( } if (gip) { - error = xfs_qm_dqiterate(mp, gip, XFS_QMOPT_GQUOTA, + error = xfs_qm_reset_dqcounts_buf(mp, gip, XFS_QMOPT_GQUOTA, &buffer_list); if (error) goto error_return; @@ -1323,7 +1323,7 @@ xfs_qm_quotacheck( } if (pip) { - error = xfs_qm_dqiterate(mp, pip, XFS_QMOPT_PQUOTA, + error = xfs_qm_reset_dqcounts_buf(mp, pip, XFS_QMOPT_PQUOTA, &buffer_list); if (error) goto error_return; @@ -1675,7 +1675,7 @@ xfs_qm_vop_dqalloc( * if necessary. The dquot(s) will not be locked. */ if (XFS_NOT_DQATTACHED(mp, ip)) { - error = xfs_qm_dqattach_locked(ip, XFS_QMOPT_DQALLOC); + error = xfs_qm_dqattach_locked(ip, true); if (error) { xfs_iunlock(ip, lockflags); return error; @@ -1694,10 +1694,7 @@ xfs_qm_vop_dqalloc( * holding ilock. */ xfs_iunlock(ip, lockflags); - error = xfs_qm_dqget(mp, NULL, uid, - XFS_DQ_USER, - XFS_QMOPT_DQALLOC, - &uq); + error = xfs_qm_dqget(mp, uid, XFS_DQ_USER, true, &uq); if (error) { ASSERT(error != -ENOENT); return error; @@ -1720,10 +1717,7 @@ xfs_qm_vop_dqalloc( if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) { if (ip->i_d.di_gid != gid) { xfs_iunlock(ip, lockflags); - error = xfs_qm_dqget(mp, NULL, gid, - XFS_DQ_GROUP, - XFS_QMOPT_DQALLOC, - &gq); + error = xfs_qm_dqget(mp, gid, XFS_DQ_GROUP, true, &gq); if (error) { ASSERT(error != -ENOENT); goto error_rele; @@ -1739,10 +1733,8 @@ xfs_qm_vop_dqalloc( if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) { if (xfs_get_projid(ip) != prid) { xfs_iunlock(ip, lockflags); - error = xfs_qm_dqget(mp, NULL, (xfs_dqid_t)prid, - XFS_DQ_PROJ, - XFS_QMOPT_DQALLOC, - &pq); + error = xfs_qm_dqget(mp, (xfs_dqid_t)prid, XFS_DQ_PROJ, + true, &pq); if (error) { ASSERT(error != -ENOENT); goto error_rele; @@ -1933,7 +1925,7 @@ xfs_qm_vop_rename_dqattach( */ if (i == 0 || ip != i_tab[i-1]) { if (XFS_NOT_DQATTACHED(mp, ip)) { - error = xfs_qm_dqattach(ip, 0); + error = xfs_qm_dqattach(ip); if (error) return error; } diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 2975a822e9f0..e3129b280423 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -170,8 +170,10 @@ extern void xfs_qm_dqrele_all_inodes(struct xfs_mount *, uint); /* quota ops */ extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint); -extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t *, - uint, struct qc_dqblk *, uint); +extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t, + uint, struct qc_dqblk *); +extern int xfs_qm_scall_getquota_next(struct xfs_mount *, + xfs_dqid_t *, uint, struct qc_dqblk *); extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint, struct qc_dqblk *); extern int xfs_qm_scall_quotaon(struct xfs_mount *, uint); diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index 2be6d2735ca9..36b89e2c5eb9 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c @@ -72,7 +72,7 @@ xfs_qm_statvfs( xfs_mount_t *mp = ip->i_mount; xfs_dquot_t *dqp; - if (!xfs_qm_dqget(mp, NULL, xfs_get_projid(ip), XFS_DQ_PROJ, 0, &dqp)) { + if (!xfs_qm_dqget(mp, xfs_get_projid(ip), XFS_DQ_PROJ, false, &dqp)) { xfs_fill_statvfs_from_dquot(statp, dqp); xfs_qm_dqput(dqp); } diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 9cb5c381b01c..3e05d300b14e 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -425,7 +425,7 @@ xfs_qm_scall_setqlim( * a reference to the dquot, so it's safe to do this unlock/lock without * it being reclaimed in the mean time. */ - error = xfs_qm_dqget(mp, NULL, id, type, XFS_QMOPT_DQALLOC, &dqp); + error = xfs_qm_dqget(mp, id, type, true, &dqp); if (error) { ASSERT(error != -ENOENT); goto out_unlock; @@ -622,39 +622,14 @@ out: return error; } - -int -xfs_qm_scall_getquota( +/* Fill out the quota context. */ +static void +xfs_qm_scall_getquota_fill_qc( struct xfs_mount *mp, - xfs_dqid_t *id, uint type, - struct qc_dqblk *dst, - uint dqget_flags) + const struct xfs_dquot *dqp, + struct qc_dqblk *dst) { - struct xfs_dquot *dqp; - int error; - - /* - * Try to get the dquot. We don't want it allocated on disk, so - * we aren't passing the XFS_QMOPT_DOALLOC flag. If it doesn't - * exist, we'll get ENOENT back. - */ - error = xfs_qm_dqget(mp, NULL, *id, type, dqget_flags, &dqp); - if (error) - return error; - - /* - * If everything's NULL, this dquot doesn't quite exist as far as - * our utility programs are concerned. - */ - if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) { - error = -ENOENT; - goto out_put; - } - - /* Fill in the ID we actually read from disk */ - *id = be32_to_cpu(dqp->q_core.d_id); - memset(dst, 0, sizeof(*dst)); dst->d_spc_hardlimit = XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_blk_hardlimit)); @@ -696,7 +671,7 @@ xfs_qm_scall_getquota( if (((XFS_IS_UQUOTA_ENFORCED(mp) && type == XFS_DQ_USER) || (XFS_IS_GQUOTA_ENFORCED(mp) && type == XFS_DQ_GROUP) || (XFS_IS_PQUOTA_ENFORCED(mp) && type == XFS_DQ_PROJ)) && - *id != 0) { + dqp->q_core.d_id != 0) { if ((dst->d_space > dst->d_spc_softlimit) && (dst->d_spc_softlimit > 0)) { ASSERT(dst->d_spc_timer != 0); @@ -707,11 +682,69 @@ xfs_qm_scall_getquota( } } #endif +} + +/* Return the quota information for the dquot matching id. */ +int +xfs_qm_scall_getquota( + struct xfs_mount *mp, + xfs_dqid_t id, + uint type, + struct qc_dqblk *dst) +{ + struct xfs_dquot *dqp; + int error; + + /* + * Try to get the dquot. We don't want it allocated on disk, so don't + * set doalloc. If it doesn't exist, we'll get ENOENT back. + */ + error = xfs_qm_dqget(mp, id, type, false, &dqp); + if (error) + return error; + + /* + * If everything's NULL, this dquot doesn't quite exist as far as + * our utility programs are concerned. + */ + if (XFS_IS_DQUOT_UNINITIALIZED(dqp)) { + error = -ENOENT; + goto out_put; + } + + xfs_qm_scall_getquota_fill_qc(mp, type, dqp, dst); + out_put: xfs_qm_dqput(dqp); return error; } +/* + * Return the quota information for the first initialized dquot whose id + * is at least as high as id. + */ +int +xfs_qm_scall_getquota_next( + struct xfs_mount *mp, + xfs_dqid_t *id, + uint type, + struct qc_dqblk *dst) +{ + struct xfs_dquot *dqp; + int error; + + error = xfs_qm_dqget_next(mp, *id, type, &dqp); + if (error) + return error; + + /* Fill in the ID we actually read from disk */ + *id = be32_to_cpu(dqp->q_core.d_id); + + xfs_qm_scall_getquota_fill_qc(mp, type, dqp, dst); + + xfs_qm_dqput(dqp); + return error; +} STATIC int xfs_dqrele_inode( diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index ce6506adab7b..3edf52b14919 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -48,6 +48,22 @@ struct xfs_trans; (XFS_IS_PQUOTA_ON(mp) && \ (mp->m_sb.sb_qflags & XFS_PQUOTA_CHKD) == 0)) +static inline uint +xfs_quota_chkd_flag( + uint dqtype) +{ + switch (dqtype) { + case XFS_DQ_USER: + return XFS_UQUOTA_CHKD; + case XFS_DQ_GROUP: + return XFS_GQUOTA_CHKD; + case XFS_DQ_PROJ: + return XFS_PQUOTA_CHKD; + default: + return 0; + } +} + /* * The structure kept inside the xfs_trans_t keep track of dquot changes * within a transaction and apply them later. @@ -90,8 +106,8 @@ extern struct xfs_dquot *xfs_qm_vop_chown(struct xfs_trans *, extern int xfs_qm_vop_chown_reserve(struct xfs_trans *, struct xfs_inode *, struct xfs_dquot *, struct xfs_dquot *, struct xfs_dquot *, uint); -extern int xfs_qm_dqattach(struct xfs_inode *, uint); -extern int xfs_qm_dqattach_locked(struct xfs_inode *, uint); +extern int xfs_qm_dqattach(struct xfs_inode *); +extern int xfs_qm_dqattach_locked(struct xfs_inode *ip, bool doalloc); extern void xfs_qm_dqdetach(struct xfs_inode *); extern void xfs_qm_dqrele(struct xfs_dquot *); extern void xfs_qm_statvfs(struct xfs_inode *, struct kstatfs *); @@ -132,7 +148,7 @@ static inline int xfs_trans_reserve_quota_bydquots(struct xfs_trans *tp, #define xfs_qm_vop_rename_dqattach(it) (0) #define xfs_qm_vop_chown(tp, ip, old, new) (NULL) #define xfs_qm_vop_chown_reserve(tp, ip, u, g, p, fl) (0) -#define xfs_qm_dqattach(ip, fl) (0) +#define xfs_qm_dqattach(ip) (0) #define xfs_qm_dqattach_locked(ip, fl) (0) #define xfs_qm_dqdetach(ip) #define xfs_qm_dqrele(d) diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c index a65108594a07..c93fc913dffb 100644 --- a/fs/xfs/xfs_quotaops.c +++ b/fs/xfs/xfs_quotaops.c @@ -239,8 +239,7 @@ xfs_fs_get_dqblk( return -ESRCH; id = from_kqid(&init_user_ns, qid); - return xfs_qm_scall_getquota(mp, &id, - xfs_quota_type(qid.type), qdq, 0); + return xfs_qm_scall_getquota(mp, id, xfs_quota_type(qid.type), qdq); } /* Return quota info for active quota >= this qid */ @@ -260,9 +259,8 @@ xfs_fs_get_nextdqblk( return -ESRCH; id = from_kqid(&init_user_ns, *qid); - ret = xfs_qm_scall_getquota(mp, &id, - xfs_quota_type(qid->type), qdq, - XFS_QMOPT_DQNEXT); + ret = xfs_qm_scall_getquota_next(mp, &id, xfs_quota_type(qid->type), + qdq); if (ret) return ret; diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 15c9393dd7a7..e5866b714d5f 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -159,7 +159,7 @@ STATIC void xfs_cui_item_unlock( struct xfs_log_item *lip) { - if (lip->li_flags & XFS_LI_ABORTED) + if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) xfs_cui_release(CUI_ITEM(lip)); } @@ -310,7 +310,7 @@ xfs_cud_item_unlock( { struct xfs_cud_log_item *cudp = CUD_ITEM(lip); - if (lip->li_flags & XFS_LI_ABORTED) { + if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) { xfs_cui_release(cudp->cud_cuip); kmem_zone_free(xfs_cud_zone, cudp); } diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index cdbd342a5249..713e857d9ffa 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -305,7 +305,7 @@ xfs_reflink_reserve_cow( * Fork all the shared blocks from our write offset until the end of * the extent. */ - error = xfs_qm_dqattach_locked(ip, 0); + error = xfs_qm_dqattach_locked(ip, false); if (error) return error; @@ -431,7 +431,7 @@ retry: if (error) return error; - error = xfs_qm_dqattach_locked(ip, 0); + error = xfs_qm_dqattach_locked(ip, false); if (error) goto out; goto retry; @@ -552,6 +552,9 @@ xfs_reflink_trim_irec_to_next_cow( * * If cancel_real is true this function cancels all COW fork extents for the * inode; if cancel_real is false, real extents are not cleared. + * + * Caller must have already joined the inode to the current transaction. The + * inode will be joined to the transaction returned to the caller. */ int xfs_reflink_cancel_cow_blocks( @@ -592,7 +595,6 @@ xfs_reflink_cancel_cow_blocks( if (error) break; } else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) { - xfs_trans_ijoin(*tpp, ip, 0); xfs_defer_init(&dfops, &firstfsb); /* Free the CoW orphan record. */ @@ -1359,7 +1361,7 @@ xfs_reflink_remap_range( goto out_unlock; /* Attach dquots to dest inode before changing block map */ - ret = xfs_qm_dqattach(dest, 0); + ret = xfs_qm_dqattach(dest); if (ret) goto out_unlock; @@ -1551,7 +1553,12 @@ next: return 0; } -/* Clear the inode reflink flag if there are no shared extents. */ +/* + * Clear the inode reflink flag if there are no shared extents. + * + * The caller is responsible for joining the inode to the transaction passed in. + * The inode will be joined to the transaction that is returned to the caller. + */ int xfs_reflink_clear_inode_flag( struct xfs_inode *ip, @@ -1578,7 +1585,6 @@ xfs_reflink_clear_inode_flag( trace_xfs_reflink_unset_inode_flag(ip); ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK; xfs_inode_clear_cowblocks_tag(ip); - xfs_trans_ijoin(*tpp, ip, 0); xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE); return error; diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 06a07846c9b3..e5b5b3e7ef82 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -158,7 +158,7 @@ STATIC void xfs_rui_item_unlock( struct xfs_log_item *lip) { - if (lip->li_flags & XFS_LI_ABORTED) + if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) xfs_rui_release(RUI_ITEM(lip)); } @@ -331,7 +331,7 @@ xfs_rud_item_unlock( { struct xfs_rud_log_item *rudp = RUD_ITEM(lip); - if (lip->li_flags & XFS_LI_ABORTED) { + if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) { xfs_rui_release(rudp->rud_ruip); kmem_zone_free(xfs_rud_zone, rudp); } diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index dfee3c991155..52632ab727f7 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -23,9 +23,14 @@ struct xfs_mount; struct xfs_trans; +/* + * XXX: Most of the realtime allocation functions deal in units of realtime + * extents, not realtime blocks. This looks funny when paired with the type + * name and screams for a larger cleanup. + */ struct xfs_rtalloc_rec { - xfs_rtblock_t ar_startblock; - xfs_rtblock_t ar_blockcount; + xfs_rtblock_t ar_startext; + xfs_rtblock_t ar_extcount; }; typedef int (*xfs_rtalloc_query_range_fn)( diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index f643d76db516..ed67389f4948 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1372,7 +1372,6 @@ xfs_fs_remount( */ xfs_restore_resvblks(mp); xfs_log_work_queue(mp); - xfs_queue_eofblocks(mp); /* Recover any CoW blocks that never got remapped. */ error = xfs_reflink_recover_cow(mp); @@ -1382,7 +1381,7 @@ xfs_fs_remount( xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); return error; } - xfs_queue_cowblocks(mp); + xfs_icache_enable_reclaim(mp); /* Create the per-AG metadata reservation pool .*/ error = xfs_fs_reserve_ag_blocks(mp); @@ -1392,8 +1391,13 @@ xfs_fs_remount( /* rw -> ro */ if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & SB_RDONLY)) { + /* + * Cancel background eofb scanning so it cannot race with the + * final log force+buftarg wait and deadlock the remount. + */ + xfs_icache_disable_reclaim(mp); + /* Get rid of any leftover CoW reservations... */ - cancel_delayed_work_sync(&mp->m_cowblocks_work); error = xfs_icache_free_cowblocks(mp, NULL); if (error) { xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); @@ -1416,12 +1420,6 @@ xfs_fs_remount( */ xfs_save_resvblks(mp); - /* - * Cancel background eofb scanning so it cannot race with the - * final log force+buftarg wait and deadlock the remount. - */ - cancel_delayed_work_sync(&mp->m_eofblocks_work); - xfs_quiesce_attr(mp); mp->m_flags |= XFS_MOUNT_RDONLY; } @@ -1441,6 +1439,7 @@ xfs_fs_freeze( { struct xfs_mount *mp = XFS_M(sb); + xfs_icache_disable_reclaim(mp); xfs_save_resvblks(mp); xfs_quiesce_attr(mp); return xfs_sync_sb(mp, true); @@ -1454,6 +1453,7 @@ xfs_fs_unfreeze( xfs_restore_resvblks(mp); xfs_log_work_queue(mp); + xfs_icache_enable_reclaim(mp); return 0; } @@ -1635,6 +1635,17 @@ xfs_fs_fill_super( #endif sb->s_op = &xfs_super_operations; + /* + * Delay mount work if the debug hook is set. This is debug + * instrumention to coordinate simulation of xfs mount failures with + * VFS superblock operations + */ + if (xfs_globals.mount_delay) { + xfs_notice(mp, "Delaying mount for %d seconds.", + xfs_globals.mount_delay); + msleep(xfs_globals.mount_delay * 1000); + } + if (silent) flags |= XFS_MFSI_QUIET; @@ -1690,11 +1701,17 @@ xfs_fs_fill_super( sb->s_flags |= SB_I_VERSION; if (mp->m_flags & XFS_MOUNT_DAX) { + bool rtdev_is_dax = false, datadev_is_dax; + xfs_warn(mp, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); - error = bdev_dax_supported(sb, sb->s_blocksize); - if (error) { + datadev_is_dax = bdev_dax_supported(mp->m_ddev_targp->bt_bdev, + sb->s_blocksize); + if (mp->m_rtdev_targp) + rtdev_is_dax = bdev_dax_supported( + mp->m_rtdev_targp->bt_bdev, sb->s_blocksize); + if (!rtdev_is_dax && !datadev_is_dax) { xfs_alert(mp, "DAX unsupported by block device. Turning off DAX."); mp->m_flags &= ~XFS_MOUNT_DAX; @@ -1761,6 +1778,7 @@ xfs_fs_fill_super( out_close_devices: xfs_close_devices(mp); out_free_fsname: + sb->s_fs_info = NULL; xfs_free_fsname(mp); kfree(mp); out: @@ -1778,6 +1796,10 @@ xfs_fs_put_super( { struct xfs_mount *mp = XFS_M(sb); + /* if ->fill_super failed, we have no mount to tear down */ + if (!sb->s_fs_info) + return; + xfs_notice(mp, "Unmounting Filesystem"); xfs_filestream_unmount(mp); xfs_unmountfs(mp); @@ -1787,6 +1809,8 @@ xfs_fs_put_super( xfs_destroy_percpu_counters(mp); xfs_destroy_mount_workqueues(mp); xfs_close_devices(mp); + + sb->s_fs_info = NULL; xfs_free_fsname(mp); kfree(mp); } @@ -1806,6 +1830,9 @@ xfs_fs_nr_cached_objects( struct super_block *sb, struct shrink_control *sc) { + /* Paranoia: catch incorrect calls during mount setup or teardown */ + if (WARN_ON_ONCE(!sb->s_fs_info)) + return 0; return xfs_reclaim_inodes_count(XFS_M(sb)); } @@ -1879,11 +1906,6 @@ xfs_init_zones(void) if (!xfs_trans_zone) goto out_destroy_ifork_zone; - xfs_log_item_desc_zone = - kmem_zone_init(sizeof(struct xfs_log_item_desc), - "xfs_log_item_desc"); - if (!xfs_log_item_desc_zone) - goto out_destroy_trans_zone; /* * The size of the zone allocated buf log item is the maximum @@ -1893,7 +1915,7 @@ xfs_init_zones(void) xfs_buf_item_zone = kmem_zone_init(sizeof(struct xfs_buf_log_item), "xfs_buf_item"); if (!xfs_buf_item_zone) - goto out_destroy_log_item_desc_zone; + goto out_destroy_trans_zone; xfs_efd_zone = kmem_zone_init((sizeof(xfs_efd_log_item_t) + ((XFS_EFD_MAX_FAST_EXTENTS - 1) * @@ -1981,8 +2003,6 @@ xfs_init_zones(void) kmem_zone_destroy(xfs_efd_zone); out_destroy_buf_item_zone: kmem_zone_destroy(xfs_buf_item_zone); - out_destroy_log_item_desc_zone: - kmem_zone_destroy(xfs_log_item_desc_zone); out_destroy_trans_zone: kmem_zone_destroy(xfs_trans_zone); out_destroy_ifork_zone: @@ -2021,7 +2041,6 @@ xfs_destroy_zones(void) kmem_zone_destroy(xfs_efi_zone); kmem_zone_destroy(xfs_efd_zone); kmem_zone_destroy(xfs_buf_item_zone); - kmem_zone_destroy(xfs_log_item_desc_zone); kmem_zone_destroy(xfs_trans_zone); kmem_zone_destroy(xfs_ifork_zone); kmem_zone_destroy(xfs_da_state_zone); diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 5b66ac12913c..aed03da637d4 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -259,6 +259,7 @@ xfs_symlink( * bmapi or the directory create code. */ xfs_defer_init(&dfops, &first_block); + tp->t_agfl_dfops = &dfops; /* * Allocate an inode for the symlink. @@ -488,16 +489,11 @@ xfs_inactive_symlink_rmt( error = xfs_defer_finish(&tp, &dfops); if (error) goto error_bmap_cancel; - /* - * The first xact was committed, so add the inode to the new one. - * Mark it dirty so it will be logged and moved forward in the log as - * part of every commit. - */ - xfs_trans_ijoin(tp, ip, 0); - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + /* * Commit the transaction containing extent freeing and EFDs. */ + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); error = xfs_trans_commit(tp); if (error) { ASSERT(XFS_FORCED_SHUTDOWN(mp)); diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index 82afee005140..b53a33e69932 100644 --- a/fs/xfs/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h @@ -95,6 +95,7 @@ extern xfs_param_t xfs_params; struct xfs_globals { int log_recovery_delay; /* log recovery delay (secs) */ + int mount_delay; /* mount setup delay (secs) */ bool bug_on_assert; /* BUG() the kernel on assert failure */ }; extern struct xfs_globals xfs_globals; diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index 8b2ccc234f36..2d5cd2529f8e 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -165,9 +165,40 @@ log_recovery_delay_show( } XFS_SYSFS_ATTR_RW(log_recovery_delay); +STATIC ssize_t +mount_delay_store( + struct kobject *kobject, + const char *buf, + size_t count) +{ + int ret; + int val; + + ret = kstrtoint(buf, 0, &val); + if (ret) + return ret; + + if (val < 0 || val > 60) + return -EINVAL; + + xfs_globals.mount_delay = val; + + return count; +} + +STATIC ssize_t +mount_delay_show( + struct kobject *kobject, + char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.mount_delay); +} +XFS_SYSFS_ATTR_RW(mount_delay); + static struct attribute *xfs_dbg_attrs[] = { ATTR_LIST(bug_on_assert), ATTR_LIST(log_recovery_delay), + ATTR_LIST(mount_delay), NULL, }; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 8955254b900e..9d4c4ca24fe6 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -441,8 +441,7 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class, __field(unsigned, bli_recur) __field(int, bli_refcount) __field(unsigned, bli_flags) - __field(void *, li_desc) - __field(unsigned, li_flags) + __field(unsigned long, li_flags) ), TP_fast_assign( __entry->dev = bip->bli_buf->b_target->bt_dev; @@ -455,12 +454,11 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class, __entry->buf_hold = atomic_read(&bip->bli_buf->b_hold); __entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count); __entry->buf_lockval = bip->bli_buf->b_sema.count; - __entry->li_desc = bip->bli_item.li_desc; __entry->li_flags = bip->bli_item.li_flags; ), TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " "lock %d flags %s recur %d refcount %d bliflags %s " - "lidesc %p liflags %s", + "liflags %s", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->buf_bno, __entry->buf_len, @@ -471,7 +469,6 @@ DECLARE_EVENT_CLASS(xfs_buf_item_class, __entry->bli_recur, __entry->bli_refcount, __print_flags(__entry->bli_flags, "|", XFS_BLI_FLAGS), - __entry->li_desc, __print_flags(__entry->li_flags, "|", XFS_LI_FLAGS)) ) @@ -1018,7 +1015,7 @@ DECLARE_EVENT_CLASS(xfs_log_item_class, __field(dev_t, dev) __field(void *, lip) __field(uint, type) - __field(uint, flags) + __field(unsigned long, flags) __field(xfs_lsn_t, lsn) ), TP_fast_assign( @@ -1070,7 +1067,7 @@ DECLARE_EVENT_CLASS(xfs_ail_class, __field(dev_t, dev) __field(void *, lip) __field(uint, type) - __field(uint, flags) + __field(unsigned long, flags) __field(xfs_lsn_t, old_lsn) __field(xfs_lsn_t, new_lsn) ), @@ -1750,6 +1747,7 @@ DECLARE_EVENT_CLASS(xfs_attr_class, __field(int, namelen) __field(int, valuelen) __field(xfs_dahash_t, hashval) + __field(int, flags) __field(int, op_flags) ), TP_fast_assign( @@ -1760,10 +1758,11 @@ DECLARE_EVENT_CLASS(xfs_attr_class, __entry->namelen = args->namelen; __entry->valuelen = args->valuelen; __entry->hashval = args->hashval; + __entry->flags = args->flags; __entry->op_flags = args->op_flags; ), TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d valuelen %d " - "hashval 0x%x op_flags %s", + "hashval 0x%x flags %s op_flags %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->namelen, @@ -1771,6 +1770,7 @@ DECLARE_EVENT_CLASS(xfs_attr_class, __entry->namelen, __entry->valuelen, __entry->hashval, + __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS), __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS)) ) @@ -2243,30 +2243,35 @@ struct xfs_defer_pending; struct xfs_defer_ops; DECLARE_EVENT_CLASS(xfs_defer_class, - TP_PROTO(struct xfs_mount *mp, struct xfs_defer_ops *dop), - TP_ARGS(mp, dop), + TP_PROTO(struct xfs_mount *mp, struct xfs_defer_ops *dop, + unsigned long caller_ip), + TP_ARGS(mp, dop, caller_ip), TP_STRUCT__entry( __field(dev_t, dev) __field(void *, dop) __field(char, committed) __field(char, low) + __field(unsigned long, caller_ip) ), TP_fast_assign( __entry->dev = mp ? mp->m_super->s_dev : 0; __entry->dop = dop; __entry->committed = dop->dop_committed; __entry->low = dop->dop_low; + __entry->caller_ip = caller_ip; ), - TP_printk("dev %d:%d ops %p committed %d low %d", + TP_printk("dev %d:%d ops %p committed %d low %d, caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->dop, __entry->committed, - __entry->low) + __entry->low, + (char *)__entry->caller_ip) ) #define DEFINE_DEFER_EVENT(name) \ DEFINE_EVENT(xfs_defer_class, name, \ - TP_PROTO(struct xfs_mount *mp, struct xfs_defer_ops *dop), \ - TP_ARGS(mp, dop)) + TP_PROTO(struct xfs_mount *mp, struct xfs_defer_ops *dop, \ + unsigned long caller_ip), \ + TP_ARGS(mp, dop, caller_ip)) DECLARE_EVENT_CLASS(xfs_defer_error_class, TP_PROTO(struct xfs_mount *mp, struct xfs_defer_ops *dop, int error), @@ -2433,6 +2438,8 @@ DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort); #define DEFINE_BMAP_FREE_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_defer); DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_deferred); +DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_agfl_free_defer); +DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_agfl_free_deferred); /* rmap tracepoints */ DECLARE_EVENT_CLASS(xfs_rmap_class, @@ -3346,6 +3353,43 @@ TRACE_EVENT(xfs_trans_resv_calc, __entry->logflags) ); +DECLARE_EVENT_CLASS(xfs_trans_class, + TP_PROTO(struct xfs_trans *tp, unsigned long caller_ip), + TP_ARGS(tp, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(uint32_t, tid) + __field(uint32_t, flags) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = tp->t_mountp->m_super->s_dev; + __entry->tid = 0; + if (tp->t_ticket) + __entry->tid = tp->t_ticket->t_tid; + __entry->flags = tp->t_flags; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d trans %x flags 0x%x caller %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->tid, + __entry->flags, + (char *)__entry->caller_ip) +) + +#define DEFINE_TRANS_EVENT(name) \ +DEFINE_EVENT(xfs_trans_class, name, \ + TP_PROTO(struct xfs_trans *tp, unsigned long caller_ip), \ + TP_ARGS(tp, caller_ip)) +DEFINE_TRANS_EVENT(xfs_trans_alloc); +DEFINE_TRANS_EVENT(xfs_trans_cancel); +DEFINE_TRANS_EVENT(xfs_trans_commit); +DEFINE_TRANS_EVENT(xfs_trans_dup); +DEFINE_TRANS_EVENT(xfs_trans_free); +DEFINE_TRANS_EVENT(xfs_trans_roll); +DEFINE_TRANS_EVENT(xfs_trans_add_item); +DEFINE_TRANS_EVENT(xfs_trans_free_items); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index d6d8f9d129a7..fc7ba75b8b69 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -31,9 +31,9 @@ #include "xfs_log.h" #include "xfs_trace.h" #include "xfs_error.h" +#include "xfs_defer.h" kmem_zone_t *xfs_trans_zone; -kmem_zone_t *xfs_log_item_desc_zone; #if defined(CONFIG_TRACEPOINTS) static void @@ -79,6 +79,7 @@ xfs_trans_free( xfs_extent_busy_sort(&tp->t_busy); xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false); + trace_xfs_trans_free(tp, _RET_IP_); atomic_dec(&tp->t_mountp->m_active_trans); if (!(tp->t_flags & XFS_TRANS_NO_WRITECOUNT)) sb_end_intwrite(tp->t_mountp->m_super); @@ -94,11 +95,13 @@ xfs_trans_free( * blocks. Locks and log items, however, are no inherited. They must * be added to the new transaction explicitly. */ -STATIC xfs_trans_t * +STATIC struct xfs_trans * xfs_trans_dup( - xfs_trans_t *tp) + struct xfs_trans *tp) { - xfs_trans_t *ntp; + struct xfs_trans *ntp; + + trace_xfs_trans_dup(tp, _RET_IP_); ntp = kmem_zone_zalloc(xfs_trans_zone, KM_SLEEP); @@ -127,6 +130,7 @@ xfs_trans_dup( ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used; tp->t_rtx_res = tp->t_rtx_res_used; ntp->t_pflags = tp->t_pflags; + ntp->t_agfl_dfops = tp->t_agfl_dfops; xfs_trans_dup_dqinfo(tp, ntp); @@ -283,6 +287,8 @@ xfs_trans_alloc( return error; } + trace_xfs_trans_alloc(tp, _RET_IP_); + *tpp = tp; return 0; } @@ -727,73 +733,52 @@ out: return; } -/* - * Add the given log item to the transaction's list of log items. - * - * The log item will now point to its new descriptor with its li_desc field. - */ +/* Add the given log item to the transaction's list of log items. */ void xfs_trans_add_item( struct xfs_trans *tp, struct xfs_log_item *lip) { - struct xfs_log_item_desc *lidp; - ASSERT(lip->li_mountp == tp->t_mountp); ASSERT(lip->li_ailp == tp->t_mountp->m_ail); + ASSERT(list_empty(&lip->li_trans)); + ASSERT(!test_bit(XFS_LI_DIRTY, &lip->li_flags)); - lidp = kmem_zone_zalloc(xfs_log_item_desc_zone, KM_SLEEP | KM_NOFS); - - lidp->lid_item = lip; - lidp->lid_flags = 0; - list_add_tail(&lidp->lid_trans, &tp->t_items); - - lip->li_desc = lidp; -} - -STATIC void -xfs_trans_free_item_desc( - struct xfs_log_item_desc *lidp) -{ - list_del_init(&lidp->lid_trans); - kmem_zone_free(xfs_log_item_desc_zone, lidp); + list_add_tail(&lip->li_trans, &tp->t_items); + trace_xfs_trans_add_item(tp, _RET_IP_); } /* - * Unlink and free the given descriptor. + * Unlink the log item from the transaction. the log item is no longer + * considered dirty in this transaction, as the linked transaction has + * finished, either by abort or commit completion. */ void xfs_trans_del_item( struct xfs_log_item *lip) { - xfs_trans_free_item_desc(lip->li_desc); - lip->li_desc = NULL; + clear_bit(XFS_LI_DIRTY, &lip->li_flags); + list_del_init(&lip->li_trans); } -/* - * Unlock all of the items of a transaction and free all the descriptors - * of that transaction. - */ +/* Detach and unlock all of the items in a transaction */ void xfs_trans_free_items( struct xfs_trans *tp, xfs_lsn_t commit_lsn, bool abort) { - struct xfs_log_item_desc *lidp, *next; - - list_for_each_entry_safe(lidp, next, &tp->t_items, lid_trans) { - struct xfs_log_item *lip = lidp->lid_item; + struct xfs_log_item *lip, *next; - lip->li_desc = NULL; + trace_xfs_trans_free_items(tp, _RET_IP_); + list_for_each_entry_safe(lip, next, &tp->t_items, li_trans) { + xfs_trans_del_item(lip); if (commit_lsn != NULLCOMMITLSN) lip->li_ops->iop_committing(lip, commit_lsn); if (abort) - lip->li_flags |= XFS_LI_ABORTED; + set_bit(XFS_LI_ABORTED, &lip->li_flags); lip->li_ops->iop_unlock(lip); - - xfs_trans_free_item_desc(lidp); } } @@ -861,7 +846,7 @@ xfs_trans_committed_bulk( xfs_lsn_t item_lsn; if (aborted) - lip->li_flags |= XFS_LI_ABORTED; + set_bit(XFS_LI_ABORTED, &lip->li_flags); item_lsn = lip->li_ops->iop_committed(lip, commit_lsn); /* item_lsn of -1 means the item needs no further processing */ @@ -936,6 +921,11 @@ __xfs_trans_commit( int error = 0; int sync = tp->t_flags & XFS_TRANS_SYNC; + ASSERT(!tp->t_agfl_dfops || + !xfs_defer_has_unfinished_work(tp->t_agfl_dfops) || regrant); + + trace_xfs_trans_commit(tp, _RET_IP_); + /* * If there is nothing to be logged by the transaction, * then unlock all of the items associated with the @@ -991,6 +981,7 @@ out_unreserve: commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, regrant); if (commit_lsn == -1 && !error) error = -EIO; + tp->t_ticket = NULL; } current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); xfs_trans_free_items(tp, NULLCOMMITLSN, !!error); @@ -1022,6 +1013,8 @@ xfs_trans_cancel( struct xfs_mount *mp = tp->t_mountp; bool dirty = (tp->t_flags & XFS_TRANS_DIRTY); + trace_xfs_trans_cancel(tp, _RET_IP_); + /* * See if the caller is relying on us to shut down the * filesystem. This happens in paths where we detect @@ -1033,17 +1026,19 @@ xfs_trans_cancel( } #ifdef DEBUG if (!dirty && !XFS_FORCED_SHUTDOWN(mp)) { - struct xfs_log_item_desc *lidp; + struct xfs_log_item *lip; - list_for_each_entry(lidp, &tp->t_items, lid_trans) - ASSERT(!(lidp->lid_item->li_type == XFS_LI_EFD)); + list_for_each_entry(lip, &tp->t_items, li_trans) + ASSERT(!(lip->li_type == XFS_LI_EFD)); } #endif xfs_trans_unreserve_and_mod_sb(tp); xfs_trans_unreserve_and_mod_dquots(tp); - if (tp->t_ticket) + if (tp->t_ticket) { xfs_log_done(mp, tp->t_ticket, NULL, false); + tp->t_ticket = NULL; + } /* mark this thread as no longer being in a transaction */ current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); @@ -1067,6 +1062,8 @@ xfs_trans_roll( struct xfs_trans_res tres; int error; + trace_xfs_trans_roll(trans, _RET_IP_); + /* * Copy the critical parameters from one trans to the next. */ diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 9d542dfe0052..29706b8b3bd4 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -27,7 +27,6 @@ struct xfs_efi_log_item; struct xfs_inode; struct xfs_item_ops; struct xfs_log_iovec; -struct xfs_log_item_desc; struct xfs_mount; struct xfs_trans; struct xfs_trans_res; @@ -43,12 +42,12 @@ struct xfs_bud_log_item; typedef struct xfs_log_item { struct list_head li_ail; /* AIL pointers */ + struct list_head li_trans; /* transaction list */ xfs_lsn_t li_lsn; /* last on-disk lsn */ - struct xfs_log_item_desc *li_desc; /* ptr to current desc*/ struct xfs_mount *li_mountp; /* ptr to fs mount */ struct xfs_ail *li_ailp; /* ptr to AIL */ uint li_type; /* item type */ - uint li_flags; /* misc flags */ + unsigned long li_flags; /* misc flags */ struct xfs_buf *li_buf; /* real buffer pointer */ struct list_head li_bio_list; /* buffer item list */ void (*li_cb)(struct xfs_buf *, @@ -64,14 +63,21 @@ typedef struct xfs_log_item { xfs_lsn_t li_seq; /* CIL commit seq */ } xfs_log_item_t; -#define XFS_LI_IN_AIL 0x1 -#define XFS_LI_ABORTED 0x2 -#define XFS_LI_FAILED 0x4 +/* + * li_flags use the (set/test/clear)_bit atomic interfaces because updates can + * race with each other and we don't want to have to use the AIL lock to + * serialise all updates. + */ +#define XFS_LI_IN_AIL 0 +#define XFS_LI_ABORTED 1 +#define XFS_LI_FAILED 2 +#define XFS_LI_DIRTY 3 /* log item dirty in transaction */ #define XFS_LI_FLAGS \ - { XFS_LI_IN_AIL, "IN_AIL" }, \ - { XFS_LI_ABORTED, "ABORTED" }, \ - { XFS_LI_FAILED, "FAILED" } + { (1 << XFS_LI_IN_AIL), "IN_AIL" }, \ + { (1 << XFS_LI_ABORTED), "ABORTED" }, \ + { (1 << XFS_LI_FAILED), "FAILED" }, \ + { (1 << XFS_LI_DIRTY), "DIRTY" } struct xfs_item_ops { void (*iop_size)(xfs_log_item_t *, int *, int *); @@ -111,6 +117,7 @@ typedef struct xfs_trans { struct xlog_ticket *t_ticket; /* log mgr ticket */ struct xfs_mount *t_mountp; /* ptr to fs mount struct */ struct xfs_dquot_acct *t_dqinfo; /* acctg info for dquots */ + struct xfs_defer_ops *t_agfl_dfops; /* optional agfl fixup dfops */ unsigned int t_flags; /* misc flags */ int64_t t_icount_delta; /* superblock icount change */ int64_t t_ifree_delta; /* superblock ifree change */ @@ -228,7 +235,8 @@ struct xfs_efd_log_item *xfs_trans_get_efd(struct xfs_trans *, uint); int xfs_trans_free_extent(struct xfs_trans *, struct xfs_efd_log_item *, xfs_fsblock_t, - xfs_extlen_t, struct xfs_owner_info *); + xfs_extlen_t, struct xfs_owner_info *, + bool); int xfs_trans_commit(struct xfs_trans *); int xfs_trans_roll(struct xfs_trans **); int xfs_trans_roll_inode(struct xfs_trans **, struct xfs_inode *); @@ -242,7 +250,6 @@ void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp, struct xfs_buf *src_bp); extern kmem_zone_t *xfs_trans_zone; -extern kmem_zone_t *xfs_log_item_desc_zone; /* rmap updates */ enum xfs_rmap_intent_type; diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index d4a2445215e6..41e280ef1483 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -32,30 +32,51 @@ #ifdef DEBUG /* * Check that the list is sorted as it should be. + * + * Called with the ail lock held, but we don't want to assert fail with it + * held otherwise we'll lock everything up and won't be able to debug the + * cause. Hence we sample and check the state under the AIL lock and return if + * everything is fine, otherwise we drop the lock and run the ASSERT checks. + * Asserts may not be fatal, so pick the lock back up and continue onwards. */ STATIC void xfs_ail_check( - struct xfs_ail *ailp, - xfs_log_item_t *lip) + struct xfs_ail *ailp, + struct xfs_log_item *lip) { - xfs_log_item_t *prev_lip; + struct xfs_log_item *prev_lip; + struct xfs_log_item *next_lip; + xfs_lsn_t prev_lsn = NULLCOMMITLSN; + xfs_lsn_t next_lsn = NULLCOMMITLSN; + xfs_lsn_t lsn; + bool in_ail; + if (list_empty(&ailp->ail_head)) return; /* - * Check the next and previous entries are valid. + * Sample then check the next and previous entries are valid. */ - ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0); - prev_lip = list_entry(lip->li_ail.prev, xfs_log_item_t, li_ail); + in_ail = test_bit(XFS_LI_IN_AIL, &lip->li_flags); + prev_lip = list_entry(lip->li_ail.prev, struct xfs_log_item, li_ail); if (&prev_lip->li_ail != &ailp->ail_head) - ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0); - - prev_lip = list_entry(lip->li_ail.next, xfs_log_item_t, li_ail); - if (&prev_lip->li_ail != &ailp->ail_head) - ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0); + prev_lsn = prev_lip->li_lsn; + next_lip = list_entry(lip->li_ail.next, struct xfs_log_item, li_ail); + if (&next_lip->li_ail != &ailp->ail_head) + next_lsn = next_lip->li_lsn; + lsn = lip->li_lsn; + if (in_ail && + (prev_lsn == NULLCOMMITLSN || XFS_LSN_CMP(prev_lsn, lsn) <= 0) && + (next_lsn == NULLCOMMITLSN || XFS_LSN_CMP(next_lsn, lsn) >= 0)) + return; + spin_unlock(&ailp->ail_lock); + ASSERT(in_ail); + ASSERT(prev_lsn == NULLCOMMITLSN || XFS_LSN_CMP(prev_lsn, lsn) <= 0); + ASSERT(next_lsn == NULLCOMMITLSN || XFS_LSN_CMP(next_lsn, lsn) >= 0); + spin_lock(&ailp->ail_lock); } #else /* !DEBUG */ #define xfs_ail_check(a,l) @@ -684,7 +705,7 @@ xfs_trans_ail_update_bulk( for (i = 0; i < nr_items; i++) { struct xfs_log_item *lip = log_items[i]; - if (lip->li_flags & XFS_LI_IN_AIL) { + if (test_and_set_bit(XFS_LI_IN_AIL, &lip->li_flags)) { /* check if we really need to move the item */ if (XFS_LSN_CMP(lsn, lip->li_lsn) <= 0) continue; @@ -694,7 +715,6 @@ xfs_trans_ail_update_bulk( if (mlip == lip) mlip_changed = 1; } else { - lip->li_flags |= XFS_LI_IN_AIL; trace_xfs_ail_insert(lip, 0, lsn); } lip->li_lsn = lsn; @@ -725,7 +745,7 @@ xfs_ail_delete_one( trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn); xfs_ail_delete(ailp, lip); xfs_clear_li_failed(lip); - lip->li_flags &= ~XFS_LI_IN_AIL; + clear_bit(XFS_LI_IN_AIL, &lip->li_flags); lip->li_lsn = 0; return mlip == lip; @@ -761,7 +781,7 @@ xfs_trans_ail_delete( struct xfs_mount *mp = ailp->ail_mount; bool mlip_changed; - if (!(lip->li_flags & XFS_LI_IN_AIL)) { + if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) { spin_unlock(&ailp->ail_lock); if (!XFS_FORCED_SHUTDOWN(mp)) { xfs_alert_tag(mp, XFS_PTAG_AILDELETE, diff --git a/fs/xfs/xfs_trans_bmap.c b/fs/xfs/xfs_trans_bmap.c index 14543d93cd4b..230a21df4b12 100644 --- a/fs/xfs/xfs_trans_bmap.c +++ b/fs/xfs/xfs_trans_bmap.c @@ -79,7 +79,7 @@ xfs_trans_log_finish_bmap_update( * 2.) shuts down the filesystem */ tp->t_flags |= XFS_TRANS_DIRTY; - budp->bud_item.li_desc->lid_flags |= XFS_LID_DIRTY; + set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags); return error; } @@ -158,7 +158,7 @@ xfs_bmap_update_log_item( bmap = container_of(item, struct xfs_bmap_intent, bi_list); tp->t_flags |= XFS_TRANS_DIRTY; - buip->bui_item.li_desc->lid_flags |= XFS_LID_DIRTY; + set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags); /* * atomic_inc_return gives us the value after the increment; diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index a5d9dfc45d98..a8ddb4eed279 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -40,7 +40,7 @@ xfs_trans_buf_item_match( struct xfs_buf_map *map, int nmaps) { - struct xfs_log_item_desc *lidp; + struct xfs_log_item *lip; struct xfs_buf_log_item *blip; int len = 0; int i; @@ -48,8 +48,8 @@ xfs_trans_buf_item_match( for (i = 0; i < nmaps; i++) len += map[i].bm_len; - list_for_each_entry(lidp, &tp->t_items, lid_trans) { - blip = (struct xfs_buf_log_item *)lidp->lid_item; + list_for_each_entry(lip, &tp->t_items, li_trans) { + blip = (struct xfs_buf_log_item *)lip; if (blip->bli_item.li_type == XFS_LI_BUF && blip->bli_buf->b_target == target && XFS_BUF_ADDR(blip->bli_buf) == map[0].bm_bn && @@ -100,14 +100,10 @@ _xfs_trans_bjoin( atomic_inc(&bip->bli_refcount); /* - * Get a log_item_desc to point at the new item. + * Attach the item to the transaction so we can find it in + * xfs_trans_get_buf() and friends. */ xfs_trans_add_item(tp, &bip->bli_item); - - /* - * Initialize b_fsprivate2 so we can find it with incore_match() - * in xfs_trans_get_buf() and friends above. - */ bp->b_transp = tp; } @@ -391,7 +387,7 @@ xfs_trans_brelse( * If the buffer is dirty within this transaction, we can't * release it until we commit. */ - if (bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY) + if (test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags)) return; /* @@ -442,7 +438,7 @@ xfs_trans_brelse( ASSERT(bp->b_pincount == 0); ***/ ASSERT(atomic_read(&bip->bli_refcount) == 0); - ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL)); + ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags)); ASSERT(!(bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF)); xfs_buf_item_relse(bp); } @@ -542,7 +538,7 @@ xfs_trans_dirty_buf( bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED; tp->t_flags |= XFS_TRANS_DIRTY; - bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY; + set_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags); } /* @@ -626,7 +622,7 @@ xfs_trans_binval( ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_INODE_BUF)); ASSERT(!(bip->__bli_format.blf_flags & XFS_BLFT_MASK)); ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); - ASSERT(bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY); + ASSERT(test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags)); ASSERT(tp->t_flags & XFS_TRANS_DIRTY); return; } @@ -642,7 +638,7 @@ xfs_trans_binval( memset(bip->bli_formats[i].blf_data_map, 0, (bip->bli_formats[i].blf_map_size * sizeof(uint))); } - bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY; + set_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags); tp->t_flags |= XFS_TRANS_DIRTY; } diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index c3d547211d16..c381c02cca45 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -77,7 +77,7 @@ xfs_trans_log_dquot( ASSERT(XFS_DQ_IS_LOCKED(dqp)); tp->t_flags |= XFS_TRANS_DIRTY; - dqp->q_logitem.qli_item.li_desc->lid_flags |= XFS_LID_DIRTY; + set_bit(XFS_LI_DIRTY, &dqp->q_logitem.qli_item.li_flags); } /* @@ -879,7 +879,7 @@ xfs_trans_log_quotaoff_item( xfs_qoff_logitem_t *qlp) { tp->t_flags |= XFS_TRANS_DIRTY; - qlp->qql_item.li_desc->lid_flags |= XFS_LID_DIRTY; + set_bit(XFS_LI_DIRTY, &qlp->qql_item.li_flags); } STATIC void diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c index ab438647592a..2f44a08bdf65 100644 --- a/fs/xfs/xfs_trans_extfree.c +++ b/fs/xfs/xfs_trans_extfree.c @@ -68,7 +68,8 @@ xfs_trans_free_extent( struct xfs_efd_log_item *efdp, xfs_fsblock_t start_block, xfs_extlen_t ext_len, - struct xfs_owner_info *oinfo) + struct xfs_owner_info *oinfo, + bool skip_discard) { struct xfs_mount *mp = tp->t_mountp; uint next_extent; @@ -79,9 +80,8 @@ xfs_trans_free_extent( trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, ext_len); - error = xfs_free_extent(tp, start_block, ext_len, oinfo, - XFS_AG_RESV_NONE); - + error = __xfs_free_extent(tp, start_block, ext_len, + oinfo, XFS_AG_RESV_NONE, skip_discard); /* * Mark the transaction dirty, even on error. This ensures the * transaction is aborted, which: @@ -90,7 +90,7 @@ xfs_trans_free_extent( * 2.) shuts down the filesystem */ tp->t_flags |= XFS_TRANS_DIRTY; - efdp->efd_item.li_desc->lid_flags |= XFS_LID_DIRTY; + set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags); next_extent = efdp->efd_next_extent; ASSERT(next_extent < efdp->efd_format.efd_nextents); @@ -155,7 +155,7 @@ xfs_extent_free_log_item( free = container_of(item, struct xfs_extent_free_item, xefi_list); tp->t_flags |= XFS_TRANS_DIRTY; - efip->efi_item.li_desc->lid_flags |= XFS_LID_DIRTY; + set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags); /* * atomic_inc_return gives us the value after the increment; @@ -195,7 +195,7 @@ xfs_extent_free_finish_item( error = xfs_trans_free_extent(tp, done_item, free->xefi_startblock, free->xefi_blockcount, - &free->xefi_oinfo); + &free->xefi_oinfo, free->xefi_skip_discard); kmem_free(free); return error; } @@ -231,9 +231,79 @@ static const struct xfs_defer_op_type xfs_extent_free_defer_type = { .cancel_item = xfs_extent_free_cancel_item, }; +/* + * AGFL blocks are accounted differently in the reserve pools and are not + * inserted into the busy extent list. + */ +STATIC int +xfs_agfl_free_finish_item( + struct xfs_trans *tp, + struct xfs_defer_ops *dop, + struct list_head *item, + void *done_item, + void **state) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_efd_log_item *efdp = done_item; + struct xfs_extent_free_item *free; + struct xfs_extent *extp; + struct xfs_buf *agbp; + int error; + xfs_agnumber_t agno; + xfs_agblock_t agbno; + uint next_extent; + + free = container_of(item, struct xfs_extent_free_item, xefi_list); + ASSERT(free->xefi_blockcount == 1); + agno = XFS_FSB_TO_AGNO(mp, free->xefi_startblock); + agbno = XFS_FSB_TO_AGBNO(mp, free->xefi_startblock); + + trace_xfs_agfl_free_deferred(mp, agno, 0, agbno, free->xefi_blockcount); + + error = xfs_alloc_read_agf(mp, tp, agno, 0, &agbp); + if (!error) + error = xfs_free_agfl_block(tp, agno, agbno, agbp, + &free->xefi_oinfo); + + /* + * Mark the transaction dirty, even on error. This ensures the + * transaction is aborted, which: + * + * 1.) releases the EFI and frees the EFD + * 2.) shuts down the filesystem + */ + tp->t_flags |= XFS_TRANS_DIRTY; + set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags); + + next_extent = efdp->efd_next_extent; + ASSERT(next_extent < efdp->efd_format.efd_nextents); + extp = &(efdp->efd_format.efd_extents[next_extent]); + extp->ext_start = free->xefi_startblock; + extp->ext_len = free->xefi_blockcount; + efdp->efd_next_extent++; + + kmem_free(free); + return error; +} + + +/* sub-type with special handling for AGFL deferred frees */ +static const struct xfs_defer_op_type xfs_agfl_free_defer_type = { + .type = XFS_DEFER_OPS_TYPE_AGFL_FREE, + .max_items = XFS_EFI_MAX_FAST_EXTENTS, + .diff_items = xfs_extent_free_diff_items, + .create_intent = xfs_extent_free_create_intent, + .abort_intent = xfs_extent_free_abort_intent, + .log_item = xfs_extent_free_log_item, + .create_done = xfs_extent_free_create_done, + .finish_item = xfs_agfl_free_finish_item, + .cancel_item = xfs_extent_free_cancel_item, +}; + /* Register the deferred op type. */ void xfs_extent_free_init_defer_op(void) { xfs_defer_init_op_type(&xfs_extent_free_defer_type); + xfs_defer_init_op_type(&xfs_agfl_free_defer_type); } diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index 07cea592dc01..f7bd7960a90f 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -133,14 +133,13 @@ xfs_trans_log_inode( * set however, then go ahead and bump the i_version counter * unconditionally. */ - if (!(ip->i_itemp->ili_item.li_desc->lid_flags & XFS_LID_DIRTY) && + if (!test_and_set_bit(XFS_LI_DIRTY, &ip->i_itemp->ili_item.li_flags) && IS_I_VERSION(VFS_I(ip))) { if (inode_maybe_inc_iversion(VFS_I(ip), flags & XFS_ILOG_CORE)) flags |= XFS_ILOG_CORE; } tp->t_flags |= XFS_TRANS_DIRTY; - ip->i_itemp->ili_item.li_desc->lid_flags |= XFS_LID_DIRTY; /* * Always OR in the bits from the ili_last_fields field. diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index be24b0c8a332..9717ae74b36d 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -19,7 +19,6 @@ #define __XFS_TRANS_PRIV_H__ struct xfs_log_item; -struct xfs_log_item_desc; struct xfs_mount; struct xfs_trans; struct xfs_ail; @@ -119,7 +118,7 @@ xfs_trans_ail_remove( spin_lock(&ailp->ail_lock); /* xfs_trans_ail_delete() drops the AIL lock */ - if (lip->li_flags & XFS_LI_IN_AIL) + if (test_bit(XFS_LI_IN_AIL, &lip->li_flags)) xfs_trans_ail_delete(ailp, lip, shutdown_type); else spin_unlock(&ailp->ail_lock); @@ -171,11 +170,10 @@ xfs_clear_li_failed( { struct xfs_buf *bp = lip->li_buf; - ASSERT(lip->li_flags & XFS_LI_IN_AIL); + ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags)); lockdep_assert_held(&lip->li_ailp->ail_lock); - if (lip->li_flags & XFS_LI_FAILED) { - lip->li_flags &= ~XFS_LI_FAILED; + if (test_and_clear_bit(XFS_LI_FAILED, &lip->li_flags)) { lip->li_buf = NULL; xfs_buf_rele(bp); } @@ -188,9 +186,8 @@ xfs_set_li_failed( { lockdep_assert_held(&lip->li_ailp->ail_lock); - if (!(lip->li_flags & XFS_LI_FAILED)) { + if (!test_and_set_bit(XFS_LI_FAILED, &lip->li_flags)) { xfs_buf_hold(bp); - lip->li_flags |= XFS_LI_FAILED; lip->li_buf = bp; } } diff --git a/fs/xfs/xfs_trans_refcount.c b/fs/xfs/xfs_trans_refcount.c index 94c1877af834..c7f8e82f5bda 100644 --- a/fs/xfs/xfs_trans_refcount.c +++ b/fs/xfs/xfs_trans_refcount.c @@ -77,7 +77,7 @@ xfs_trans_log_finish_refcount_update( * 2.) shuts down the filesystem */ tp->t_flags |= XFS_TRANS_DIRTY; - cudp->cud_item.li_desc->lid_flags |= XFS_LID_DIRTY; + set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags); return error; } @@ -154,7 +154,7 @@ xfs_refcount_update_log_item( refc = container_of(item, struct xfs_refcount_intent, ri_list); tp->t_flags |= XFS_TRANS_DIRTY; - cuip->cui_item.li_desc->lid_flags |= XFS_LID_DIRTY; + set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags); /* * atomic_inc_return gives us the value after the increment; diff --git a/fs/xfs/xfs_trans_rmap.c b/fs/xfs/xfs_trans_rmap.c index 9b577beb43d7..5831ca0c270b 100644 --- a/fs/xfs/xfs_trans_rmap.c +++ b/fs/xfs/xfs_trans_rmap.c @@ -117,7 +117,7 @@ xfs_trans_log_finish_rmap_update( * 2.) shuts down the filesystem */ tp->t_flags |= XFS_TRANS_DIRTY; - rudp->rud_item.li_desc->lid_flags |= XFS_LID_DIRTY; + set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags); return error; } @@ -175,7 +175,7 @@ xfs_rmap_update_log_item( rmap = container_of(item, struct xfs_rmap_intent, ri_list); tp->t_flags |= XFS_TRANS_DIRTY; - ruip->rui_item.li_desc->lid_flags |= XFS_LID_DIRTY; + set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags); /* * atomic_inc_return gives us the value after the increment; diff --git a/include/linux/bio.h b/include/linux/bio.h index 810a8bee8f85..397a38aca182 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -127,6 +127,11 @@ static inline void *bio_data(struct bio *bio) return NULL; } +static inline bool bio_full(struct bio *bio) +{ + return bio->bi_vcnt >= bio->bi_max_vecs; +} + /* * will die */ @@ -474,6 +479,10 @@ void bio_chain(struct bio *, struct bio *); extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int); extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *, unsigned int, unsigned int); +bool __bio_try_merge_page(struct bio *bio, struct page *page, + unsigned int len, unsigned int off); +void __bio_add_page(struct bio *bio, struct page *page, + unsigned int len, unsigned int off); int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter); struct rq_map_data; extern struct bio *bio_map_user_iov(struct request_queue *, diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 894e5d125de6..96225a77c112 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -205,8 +205,6 @@ void write_boundary_block(struct block_device *bdev, sector_t bblock, unsigned blocksize); int bh_uptodate_or_lock(struct buffer_head *bh); int bh_submit_read(struct buffer_head *bh); -loff_t page_cache_seek_hole_data(struct inode *inode, loff_t offset, - loff_t length, int whence); extern int buffer_heads_over_limit; diff --git a/include/linux/dax.h b/include/linux/dax.h index f9eb22ad341e..c99692ddd4b5 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -64,10 +64,10 @@ static inline bool dax_write_cache_enabled(struct dax_device *dax_dev) struct writeback_control; int bdev_dax_pgoff(struct block_device *, sector_t, size_t, pgoff_t *pgoff); #if IS_ENABLED(CONFIG_FS_DAX) -int __bdev_dax_supported(struct super_block *sb, int blocksize); -static inline int bdev_dax_supported(struct super_block *sb, int blocksize) +bool __bdev_dax_supported(struct block_device *bdev, int blocksize); +static inline bool bdev_dax_supported(struct block_device *bdev, int blocksize) { - return __bdev_dax_supported(sb, blocksize); + return __bdev_dax_supported(bdev, blocksize); } static inline struct dax_device *fs_dax_get_by_host(const char *host) @@ -84,9 +84,10 @@ struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev); int dax_writeback_mapping_range(struct address_space *mapping, struct block_device *bdev, struct writeback_control *wbc); #else -static inline int bdev_dax_supported(struct super_block *sb, int blocksize) +static inline bool bdev_dax_supported(struct block_device *bdev, + int blocksize) { - return -EOPNOTSUPP; + return false; } static inline struct dax_device *fs_dax_get_by_host(const char *host) diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 19a07de28212..a044a824da85 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -4,6 +4,7 @@ #include <linux/types.h> +struct address_space; struct fiemap_extent_info; struct inode; struct iov_iter; @@ -18,6 +19,7 @@ struct vm_fault; #define IOMAP_DELALLOC 0x02 /* delayed allocation blocks */ #define IOMAP_MAPPED 0x03 /* blocks allocated at @addr */ #define IOMAP_UNWRITTEN 0x04 /* blocks allocated at @addr in unwritten state */ +#define IOMAP_INLINE 0x05 /* data inline in the inode */ /* * Flags for all iomap mappings: @@ -26,15 +28,19 @@ struct vm_fault; * written data and requires fdatasync to commit them to persistent storage. */ #define IOMAP_F_NEW 0x01 /* blocks have been newly allocated */ -#define IOMAP_F_BOUNDARY 0x02 /* mapping ends at metadata boundary */ -#define IOMAP_F_DIRTY 0x04 /* uncommitted metadata */ +#define IOMAP_F_DIRTY 0x02 /* uncommitted metadata */ /* * Flags that only need to be reported for IOMAP_REPORT requests: */ #define IOMAP_F_MERGED 0x10 /* contains multiple blocks/extents */ #define IOMAP_F_SHARED 0x20 /* block shared with another file */ -#define IOMAP_F_DATA_INLINE 0x40 /* data inline in the inode */ + +/* + * Flags from 0x1000 up are for file system specific usage: + */ +#define IOMAP_F_PRIVATE 0x1000 + /* * Magic value for addr: @@ -59,7 +65,7 @@ struct iomap { #define IOMAP_REPORT (1 << 2) /* report extent status, e.g. FIEMAP */ #define IOMAP_FAULT (1 << 3) /* mapping for page fault */ #define IOMAP_DIRECT (1 << 4) /* direct I/O */ -#define IOMAP_NOWAIT (1 << 5) /* Don't wait for writeback */ +#define IOMAP_NOWAIT (1 << 5) /* do not block */ struct iomap_ops { /* @@ -95,6 +101,8 @@ loff_t iomap_seek_hole(struct inode *inode, loff_t offset, const struct iomap_ops *ops); loff_t iomap_seek_data(struct inode *inode, loff_t offset, const struct iomap_ops *ops); +sector_t iomap_bmap(struct address_space *mapping, sector_t bno, + const struct iomap_ops *ops); /* * Flags for direct I/O ->end_io: @@ -106,4 +114,15 @@ typedef int (iomap_dio_end_io_t)(struct kiocb *iocb, ssize_t ret, ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops, iomap_dio_end_io_t end_io); +#ifdef CONFIG_SWAP +struct file; +struct swap_info_struct; + +int iomap_swapfile_activate(struct swap_info_struct *sis, + struct file *swap_file, sector_t *pagespan, + const struct iomap_ops *ops); +#else +# define iomap_swapfile_activate(sis, swapfile, pagespan, ops) (-EIO) +#endif /* CONFIG_SWAP */ + #endif /* LINUX_IOMAP_H */ diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index d2a8313fabd7..9d132f8f2df8 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -242,6 +242,8 @@ struct fsxattr { #define FICLONERANGE _IOW(0x94, 13, struct file_clone_range) #define FIDEDUPERANGE _IOWR(0x94, 54, struct file_dedupe_range) +#define FSLABEL_MAX 256 /* Max chars for the interface; each fs may differ */ + #define FS_IOC_GETFLAGS _IOR('f', 1, long) #define FS_IOC_SETFLAGS _IOW('f', 2, long) #define FS_IOC_GETVERSION _IOR('v', 1, long) @@ -251,8 +253,10 @@ struct fsxattr { #define FS_IOC32_SETFLAGS _IOW('f', 2, int) #define FS_IOC32_GETVERSION _IOR('v', 1, int) #define FS_IOC32_SETVERSION _IOW('v', 2, int) -#define FS_IOC_FSGETXATTR _IOR ('X', 31, struct fsxattr) -#define FS_IOC_FSSETXATTR _IOW ('X', 32, struct fsxattr) +#define FS_IOC_FSGETXATTR _IOR('X', 31, struct fsxattr) +#define FS_IOC_FSSETXATTR _IOW('X', 32, struct fsxattr) +#define FS_IOC_GETFSLABEL _IOR(0x94, 49, char[FSLABEL_MAX]) +#define FS_IOC_SETFSLABEL _IOW(0x94, 50, char[FSLABEL_MAX]) /* * File system encryption support diff --git a/mm/internal.h b/mm/internal.h index 502d14189794..9e3654d70289 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -53,7 +53,7 @@ void unmap_page_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, struct zap_details *details); -extern int __do_page_cache_readahead(struct address_space *mapping, +extern unsigned int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, pgoff_t offset, unsigned long nr_to_read, unsigned long lookahead_size); diff --git a/mm/readahead.c b/mm/readahead.c index 539bbb6c1fad..e273f0de3376 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -140,23 +140,23 @@ out: } /* - * __do_page_cache_readahead() actually reads a chunk of disk. It allocates all - * the pages first, then submits them all for I/O. This avoids the very bad + * __do_page_cache_readahead() actually reads a chunk of disk. It allocates + * the pages first, then submits them for I/O. This avoids the very bad * behaviour which would occur if page allocations are causing VM writeback. * We really don't want to intermingle reads and writes like that. * * Returns the number of pages requested, or the maximum amount of I/O allowed. */ -int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, - pgoff_t offset, unsigned long nr_to_read, - unsigned long lookahead_size) +unsigned int __do_page_cache_readahead(struct address_space *mapping, + struct file *filp, pgoff_t offset, unsigned long nr_to_read, + unsigned long lookahead_size) { struct inode *inode = mapping->host; struct page *page; unsigned long end_index; /* The last page we want to read */ LIST_HEAD(page_pool); int page_idx; - int ret = 0; + unsigned int nr_pages = 0; loff_t isize = i_size_read(inode); gfp_t gfp_mask = readahead_gfp_mask(mapping); @@ -177,8 +177,18 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, rcu_read_lock(); page = radix_tree_lookup(&mapping->i_pages, page_offset); rcu_read_unlock(); - if (page && !radix_tree_exceptional_entry(page)) + if (page && !radix_tree_exceptional_entry(page)) { + /* + * Page already present? Kick off the current batch of + * contiguous pages before continuing with the next + * batch. + */ + if (nr_pages) + read_pages(mapping, filp, &page_pool, nr_pages, + gfp_mask); + nr_pages = 0; continue; + } page = __page_cache_alloc(gfp_mask); if (!page) @@ -187,7 +197,7 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, list_add(&page->lru, &page_pool); if (page_idx == nr_to_read - lookahead_size) SetPageReadahead(page); - ret++; + nr_pages++; } /* @@ -195,11 +205,11 @@ int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, * uptodate then the caller will launch readpage again, and * will then handle the error. */ - if (ret) - read_pages(mapping, filp, &page_pool, ret, gfp_mask); + if (nr_pages) + read_pages(mapping, filp, &page_pool, nr_pages, gfp_mask); BUG_ON(!list_empty(&page_pool)); out: - return ret; + return nr_pages; } /* @@ -223,16 +233,11 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages); nr_to_read = min(nr_to_read, max_pages); while (nr_to_read) { - int err; - unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_SIZE; if (this_chunk > nr_to_read) this_chunk = nr_to_read; - err = __do_page_cache_readahead(mapping, filp, - offset, this_chunk, 0); - if (err < 0) - return err; + __do_page_cache_readahead(mapping, filp, offset, this_chunk, 0); offset += this_chunk; nr_to_read -= this_chunk; |