diff options
Diffstat (limited to 'fs')
-rw-r--r-- | fs/iomap.c | 84 | ||||
-rw-r--r-- | fs/xfs/Makefile | 1 | ||||
-rw-r--r-- | fs/xfs/libxfs/xfs_ag_resv.c | 325 | ||||
-rw-r--r-- | fs/xfs/libxfs/xfs_ag_resv.h | 35 | ||||
-rw-r--r-- | fs/xfs/libxfs/xfs_alloc.c | 112 | ||||
-rw-r--r-- | fs/xfs/libxfs/xfs_alloc.h | 8 | ||||
-rw-r--r-- | fs/xfs/libxfs/xfs_bmap.c | 6 | ||||
-rw-r--r-- | fs/xfs/libxfs/xfs_btree.c | 59 | ||||
-rw-r--r-- | fs/xfs/libxfs/xfs_btree.h | 28 | ||||
-rw-r--r-- | fs/xfs/libxfs/xfs_defer.c | 79 | ||||
-rw-r--r-- | fs/xfs/libxfs/xfs_ialloc_btree.c | 2 | ||||
-rw-r--r-- | fs/xfs/libxfs/xfs_log_format.h | 10 | ||||
-rw-r--r-- | fs/xfs/xfs_buf_item.c | 9 | ||||
-rw-r--r-- | fs/xfs/xfs_file.c | 3 | ||||
-rw-r--r-- | fs/xfs/xfs_filestream.c | 4 | ||||
-rw-r--r-- | fs/xfs/xfs_fsops.c | 2 | ||||
-rw-r--r-- | fs/xfs/xfs_mount.h | 44 | ||||
-rw-r--r-- | fs/xfs/xfs_rmap_item.c | 36 | ||||
-rw-r--r-- | fs/xfs/xfs_rmap_item.h | 8 | ||||
-rw-r--r-- | fs/xfs/xfs_super.c | 5 | ||||
-rw-r--r-- | fs/xfs/xfs_sysfs.c | 47 | ||||
-rw-r--r-- | fs/xfs/xfs_trace.h | 75 | ||||
-rw-r--r-- | fs/xfs/xfs_trans.c | 3 | ||||
-rw-r--r-- | fs/xfs/xfs_trans_extfree.c | 3 | ||||
-rw-r--r-- | fs/xfs/xfs_xattr.c | 1 |
25 files changed, 854 insertions, 135 deletions
diff --git a/fs/iomap.c b/fs/iomap.c index 706270f21b35..ec411a6b9edc 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -252,6 +252,88 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter, } EXPORT_SYMBOL_GPL(iomap_file_buffered_write); +static struct page * +__iomap_read_page(struct inode *inode, loff_t offset) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page; + + page = read_mapping_page(mapping, offset >> PAGE_SHIFT, NULL); + if (IS_ERR(page)) + return page; + if (!PageUptodate(page)) { + put_page(page); + return ERR_PTR(-EIO); + } + return page; +} + +static loff_t +iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data, + struct iomap *iomap) +{ + long status = 0; + ssize_t written = 0; + + do { + struct page *page, *rpage; + unsigned long offset; /* Offset into pagecache page */ + unsigned long bytes; /* Bytes to write to page */ + + offset = (pos & (PAGE_SIZE - 1)); + bytes = min_t(unsigned long, PAGE_SIZE - offset, length); + + rpage = __iomap_read_page(inode, pos); + if (IS_ERR(rpage)) + return PTR_ERR(rpage); + + status = iomap_write_begin(inode, pos, bytes, + AOP_FLAG_NOFS | AOP_FLAG_UNINTERRUPTIBLE, + &page, iomap); + put_page(rpage); + if (unlikely(status)) + return status; + + WARN_ON_ONCE(!PageUptodate(page)); + + status = iomap_write_end(inode, pos, bytes, bytes, page); + if (unlikely(status <= 0)) { + if (WARN_ON_ONCE(status == 0)) + return -EIO; + return status; + } + + cond_resched(); + + pos += status; + written += status; + length -= status; + + balance_dirty_pages_ratelimited(inode->i_mapping); + } while (length); + + return written; +} + +int +iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len, + struct iomap_ops *ops) +{ + loff_t ret; + + while (len) { + ret = iomap_apply(inode, pos, len, IOMAP_WRITE, ops, NULL, + iomap_dirty_actor); + if (ret <= 0) + return ret; + pos += ret; + len -= ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(iomap_file_dirty); + static int 
iomap_zero(struct inode *inode, loff_t pos, unsigned offset, unsigned bytes, struct iomap *iomap) { @@ -430,6 +512,8 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi, if (iomap->flags & IOMAP_F_MERGED) flags |= FIEMAP_EXTENT_MERGED; + if (iomap->flags & IOMAP_F_SHARED) + flags |= FIEMAP_EXTENT_SHARED; return fiemap_fill_next_extent(fi, iomap->offset, iomap->blkno != IOMAP_NULL_BLOCK ? iomap->blkno << 9: 0, diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index fc593c869493..584e87e11cb6 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -52,6 +52,7 @@ xfs-y += $(addprefix libxfs/, \ xfs_inode_fork.o \ xfs_inode_buf.o \ xfs_log_rlimit.o \ + xfs_ag_resv.o \ xfs_rmap.o \ xfs_rmap_btree.o \ xfs_sb.o \ diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c new file mode 100644 index 000000000000..e3ae0f2b4294 --- /dev/null +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -0,0 +1,325 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_alloc.h" +#include "xfs_error.h" +#include "xfs_trace.h" +#include "xfs_cksum.h" +#include "xfs_trans.h" +#include "xfs_bit.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_ag_resv.h" +#include "xfs_trans_space.h" +#include "xfs_rmap_btree.h" +#include "xfs_btree.h" + +/* + * Per-AG Block Reservations + * + * For some kinds of allocation group metadata structures, it is advantageous + * to reserve a small number of blocks in each AG so that future expansions of + * that data structure do not encounter ENOSPC because errors during a btree + * split cause the filesystem to go offline. + * + * Prior to the introduction of reflink, this wasn't an issue because the free + * space btrees maintain a reserve of space (the AGFL) to handle any expansion + * that may be necessary; and allocations of other metadata (inodes, BMBT, + * dir/attr) aren't restricted to a single AG. However, with reflink it is + * possible to allocate all the space in an AG, have subsequent reflink/CoW + * activity expand the refcount btree, and discover that there's no space left + * to handle that expansion. Since we can calculate the maximum size of the + * refcount btree, we can reserve space for it and avoid ENOSPC. + * + * Handling per-AG reservations consists of four changes to the allocator's + * behavior: First, because these reservations are always needed, we decrease + * the ag_max_usable counter to reflect the size of the AG after the reserved + * blocks are taken. Second, the reservations must be reflected in the + * fdblocks count to maintain proper accounting. Third, each AG must maintain + * its own reserved block counter so that we can calculate the amount of space + * that must remain free to maintain the reservations. 
Fourth, the "remaining + * reserved blocks" count must be used when calculating the length of the + * longest free extent in an AG and to clamp maxlen in the per-AG allocation + * functions. In other words, we maintain a virtual allocation via in-core + * accounting tricks so that we don't have to clean up after a crash. :) + * + * Reserved blocks can be managed by passing one of the enum xfs_ag_resv_type + * values via struct xfs_alloc_arg or directly to the xfs_free_extent + * function. It might seem a little funny to maintain a reservoir of blocks + * to feed another reservoir, but the AGFL only holds enough blocks to get + * through the next transaction. The per-AG reservation is to ensure (we + * hope) that each AG never runs out of blocks. Each data structure wanting + * to use the reservation system should update ask/used in xfs_ag_resv_init. + */ + +/* + * Are we critically low on blocks? For now we'll define that as the number + * of blocks we can get our hands on being less than 10% of what we reserved + * or less than some arbitrary number (maximum btree height). + */ +bool +xfs_ag_resv_critical( + struct xfs_perag *pag, + enum xfs_ag_resv_type type) +{ + xfs_extlen_t avail; + xfs_extlen_t orig; + + switch (type) { + case XFS_AG_RESV_METADATA: + avail = pag->pagf_freeblks - pag->pag_agfl_resv.ar_reserved; + orig = pag->pag_meta_resv.ar_asked; + break; + case XFS_AG_RESV_AGFL: + avail = pag->pagf_freeblks + pag->pagf_flcount - + pag->pag_meta_resv.ar_reserved; + orig = pag->pag_agfl_resv.ar_asked; + break; + default: + ASSERT(0); + return false; + } + + trace_xfs_ag_resv_critical(pag, type, avail); + + /* Critically low if less than 10% or max btree height remains. */ + return avail < orig / 10 || avail < XFS_BTREE_MAXLEVELS; +} + +/* + * How many blocks are reserved but not used, and therefore must not be + * allocated away? 
+ */ +xfs_extlen_t +xfs_ag_resv_needed( + struct xfs_perag *pag, + enum xfs_ag_resv_type type) +{ + xfs_extlen_t len; + + len = pag->pag_meta_resv.ar_reserved + pag->pag_agfl_resv.ar_reserved; + switch (type) { + case XFS_AG_RESV_METADATA: + case XFS_AG_RESV_AGFL: + len -= xfs_perag_resv(pag, type)->ar_reserved; + break; + case XFS_AG_RESV_NONE: + /* empty */ + break; + default: + ASSERT(0); + } + + trace_xfs_ag_resv_needed(pag, type, len); + + return len; +} + +/* Clean out a reservation */ +static int +__xfs_ag_resv_free( + struct xfs_perag *pag, + enum xfs_ag_resv_type type) +{ + struct xfs_ag_resv *resv; + xfs_extlen_t oldresv; + int error; + + trace_xfs_ag_resv_free(pag, type, 0); + + resv = xfs_perag_resv(pag, type); + pag->pag_mount->m_ag_max_usable += resv->ar_asked; + /* + * AGFL blocks are always considered "free", so whatever + * was reserved at mount time must be given back at umount. + */ + if (type == XFS_AG_RESV_AGFL) + oldresv = resv->ar_orig_reserved; + else + oldresv = resv->ar_reserved; + error = xfs_mod_fdblocks(pag->pag_mount, oldresv, true); + resv->ar_reserved = 0; + resv->ar_asked = 0; + + if (error) + trace_xfs_ag_resv_free_error(pag->pag_mount, pag->pag_agno, + error, _RET_IP_); + return error; +} + +/* Free a per-AG reservation. 
*/ +int +xfs_ag_resv_free( + struct xfs_perag *pag) +{ + int error; + int err2; + + error = __xfs_ag_resv_free(pag, XFS_AG_RESV_AGFL); + err2 = __xfs_ag_resv_free(pag, XFS_AG_RESV_METADATA); + if (err2 && !error) + error = err2; + return error; +} + +static int +__xfs_ag_resv_init( + struct xfs_perag *pag, + enum xfs_ag_resv_type type, + xfs_extlen_t ask, + xfs_extlen_t used) +{ + struct xfs_mount *mp = pag->pag_mount; + struct xfs_ag_resv *resv; + int error; + + resv = xfs_perag_resv(pag, type); + if (used > ask) + ask = used; + resv->ar_asked = ask; + resv->ar_reserved = resv->ar_orig_reserved = ask - used; + mp->m_ag_max_usable -= ask; + + trace_xfs_ag_resv_init(pag, type, ask); + + error = xfs_mod_fdblocks(mp, -(int64_t)resv->ar_reserved, true); + if (error) + trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno, + error, _RET_IP_); + + return error; +} + +/* Create a per-AG block reservation. */ +int +xfs_ag_resv_init( + struct xfs_perag *pag) +{ + xfs_extlen_t ask; + xfs_extlen_t used; + int error = 0; + + /* Create the metadata reservation. */ + if (pag->pag_meta_resv.ar_asked == 0) { + ask = used = 0; + + error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA, + ask, used); + if (error) + goto out; + } + + /* Create the AGFL metadata reservation */ + if (pag->pag_agfl_resv.ar_asked == 0) { + ask = used = 0; + + error = __xfs_ag_resv_init(pag, XFS_AG_RESV_AGFL, ask, used); + if (error) + goto out; + } + +out: + return error; +} + +/* Allocate a block from the reservation. */ +void +xfs_ag_resv_alloc_extent( + struct xfs_perag *pag, + enum xfs_ag_resv_type type, + struct xfs_alloc_arg *args) +{ + struct xfs_ag_resv *resv; + xfs_extlen_t len; + uint field; + + trace_xfs_ag_resv_alloc_extent(pag, type, args->len); + + switch (type) { + case XFS_AG_RESV_METADATA: + case XFS_AG_RESV_AGFL: + resv = xfs_perag_resv(pag, type); + break; + default: + ASSERT(0); + /* fall through */ + case XFS_AG_RESV_NONE: + field = args->wasdel ? 
XFS_TRANS_SB_RES_FDBLOCKS : + XFS_TRANS_SB_FDBLOCKS; + xfs_trans_mod_sb(args->tp, field, -(int64_t)args->len); + return; + } + + len = min_t(xfs_extlen_t, args->len, resv->ar_reserved); + resv->ar_reserved -= len; + if (type == XFS_AG_RESV_AGFL) + return; + /* Allocations of reserved blocks only need on-disk sb updates... */ + xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS, -(int64_t)len); + /* ...but non-reserved blocks need in-core and on-disk updates. */ + if (args->len > len) + xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_FDBLOCKS, + -((int64_t)args->len - len)); +} + +/* Free a block to the reservation. */ +void +xfs_ag_resv_free_extent( + struct xfs_perag *pag, + enum xfs_ag_resv_type type, + struct xfs_trans *tp, + xfs_extlen_t len) +{ + xfs_extlen_t leftover; + struct xfs_ag_resv *resv; + + trace_xfs_ag_resv_free_extent(pag, type, len); + + switch (type) { + case XFS_AG_RESV_METADATA: + case XFS_AG_RESV_AGFL: + resv = xfs_perag_resv(pag, type); + break; + default: + ASSERT(0); + /* fall through */ + case XFS_AG_RESV_NONE: + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (int64_t)len); + return; + } + + leftover = min_t(xfs_extlen_t, len, resv->ar_asked - resv->ar_reserved); + resv->ar_reserved += leftover; + if (type == XFS_AG_RESV_AGFL) + return; + /* Freeing into the reserved pool only requires on-disk update... */ + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, len); + /* ...but freeing beyond that requires in-core and on-disk update. */ + if (len > leftover) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len - leftover); +} diff --git a/fs/xfs/libxfs/xfs_ag_resv.h b/fs/xfs/libxfs/xfs_ag_resv.h new file mode 100644 index 000000000000..8d6c687deef3 --- /dev/null +++ b/fs/xfs/libxfs/xfs_ag_resv.h @@ -0,0 +1,35 @@ +/* + * Copyright (C) 2016 Oracle. All Rights Reserved. + * + * Author: Darrick J. 
Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#ifndef __XFS_AG_RESV_H__ +#define __XFS_AG_RESV_H__ + +int xfs_ag_resv_free(struct xfs_perag *pag); +int xfs_ag_resv_init(struct xfs_perag *pag); + +bool xfs_ag_resv_critical(struct xfs_perag *pag, enum xfs_ag_resv_type type); +xfs_extlen_t xfs_ag_resv_needed(struct xfs_perag *pag, + enum xfs_ag_resv_type type); + +void xfs_ag_resv_alloc_extent(struct xfs_perag *pag, enum xfs_ag_resv_type type, + struct xfs_alloc_arg *args); +void xfs_ag_resv_free_extent(struct xfs_perag *pag, enum xfs_ag_resv_type type, + struct xfs_trans *tp, xfs_extlen_t len); + +#endif /* __XFS_AG_RESV_H__ */ diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 05b5243d89f6..2620a86a756a 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -37,6 +37,7 @@ #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_log.h" +#include "xfs_ag_resv.h" struct workqueue_struct *xfs_alloc_wq; @@ -74,14 +75,8 @@ xfs_prealloc_blocks( * extents need to be actually allocated. To get around this, we explicitly set * aside a few blocks which will not be reserved in delayed allocation. 
* - * When rmap is disabled, we need to reserve 4 fsbs _per AG_ for the freelist - * and 4 more to handle a potential split of the file's bmap btree. - * - * When rmap is enabled, we must also be able to handle two rmap btree inserts - * to record both the file data extent and a new bmbt block. The bmbt block - * might not be in the same AG as the file data extent. In the worst case - * the bmap btree splits multiple levels and all the new blocks come from - * different AGs, so set aside enough to handle rmap btree splits in all AGs. + * We need to reserve 4 fsbs _per AG_ for the freelist and 4 more to handle a + * potential split of the file's bmap btree. */ unsigned int xfs_alloc_set_aside( @@ -90,8 +85,6 @@ xfs_alloc_set_aside( unsigned int blocks; blocks = 4 + (mp->m_sb.sb_agcount * XFS_ALLOC_AGFL_RESERVE); - if (xfs_sb_version_hasrmapbt(&mp->m_sb)) - blocks += mp->m_sb.sb_agcount * mp->m_rmap_maxlevels; return blocks; } @@ -680,12 +673,29 @@ xfs_alloc_ag_vextent( xfs_alloc_arg_t *args) /* argument structure for allocation */ { int error=0; + xfs_extlen_t reservation; + xfs_extlen_t oldmax; ASSERT(args->minlen > 0); ASSERT(args->maxlen > 0); ASSERT(args->minlen <= args->maxlen); ASSERT(args->mod < args->prod); ASSERT(args->alignment > 0); + + /* + * Clamp maxlen to the amount of free space minus any reservations + * that have been made. + */ + oldmax = args->maxlen; + reservation = xfs_ag_resv_needed(args->pag, args->resv); + if (args->maxlen > args->pag->pagf_freeblks - reservation) + args->maxlen = args->pag->pagf_freeblks - reservation; + if (args->maxlen == 0) { + args->agbno = NULLAGBLOCK; + args->maxlen = oldmax; + return 0; + } + /* * Branch to correct routine based on the type. 
*/ @@ -705,12 +715,14 @@ xfs_alloc_ag_vextent( /* NOTREACHED */ } + args->maxlen = oldmax; + if (error || args->agbno == NULLAGBLOCK) return error; ASSERT(args->len >= args->minlen); ASSERT(args->len <= args->maxlen); - ASSERT(!args->wasfromfl || !args->isfl); + ASSERT(!args->wasfromfl || args->resv != XFS_AG_RESV_AGFL); ASSERT(args->agbno % args->alignment == 0); /* if not file data, insert new block into the reverse map btree */ @@ -732,12 +744,7 @@ xfs_alloc_ag_vextent( args->agbno, args->len)); } - if (!args->isfl) { - xfs_trans_mod_sb(args->tp, args->wasdel ? - XFS_TRANS_SB_RES_FDBLOCKS : - XFS_TRANS_SB_FDBLOCKS, - -((long)(args->len))); - } + xfs_ag_resv_alloc_extent(args->pag, args->resv, args); XFS_STATS_INC(args->mp, xs_allocx); XFS_STATS_ADD(args->mp, xs_allocb, args->len); @@ -1583,6 +1590,7 @@ xfs_alloc_ag_vextent_small( int *stat) /* status: 0-freelist, 1-normal/none */ { struct xfs_owner_info oinfo; + struct xfs_perag *pag; int error; xfs_agblock_t fbno; xfs_extlen_t flen; @@ -1600,7 +1608,8 @@ xfs_alloc_ag_vextent_small( * to respect minleft even when pulling from the * freelist. */ - else if (args->minlen == 1 && args->alignment == 1 && !args->isfl && + else if (args->minlen == 1 && args->alignment == 1 && + args->resv != XFS_AG_RESV_AGFL && (be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_flcount) > args->minleft)) { error = xfs_alloc_get_freelist(args->tp, args->agbp, &fbno, 0); @@ -1629,13 +1638,18 @@ xfs_alloc_ag_vextent_small( /* * If we're feeding an AGFL block to something that * doesn't live in the free space, we need to clear - * out the OWN_AG rmap. + * out the OWN_AG rmap and add the block back to + * the AGFL per-AG reservation. 
*/ xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_AG); error = xfs_rmap_free(args->tp, args->agbp, args->agno, fbno, 1, &oinfo); if (error) goto error0; + pag = xfs_perag_get(args->mp, args->agno); + xfs_ag_resv_free_extent(pag, XFS_AG_RESV_AGFL, + args->tp, 1); + xfs_perag_put(pag); *stat = 0; return 0; @@ -1683,7 +1697,7 @@ xfs_free_ag_extent( xfs_agblock_t bno, xfs_extlen_t len, struct xfs_owner_info *oinfo, - int isfl) + enum xfs_ag_resv_type type) { xfs_btree_cur_t *bno_cur; /* cursor for by-block btree */ xfs_btree_cur_t *cnt_cur; /* cursor for by-size btree */ @@ -1911,21 +1925,22 @@ xfs_free_ag_extent( */ pag = xfs_perag_get(mp, agno); error = xfs_alloc_update_counters(tp, pag, agbp, len); + xfs_ag_resv_free_extent(pag, type, tp, len); xfs_perag_put(pag); if (error) goto error0; - if (!isfl) - xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (long)len); XFS_STATS_INC(mp, xs_freex); XFS_STATS_ADD(mp, xs_freeb, len); - trace_xfs_free_extent(mp, agno, bno, len, isfl, haveleft, haveright); + trace_xfs_free_extent(mp, agno, bno, len, type == XFS_AG_RESV_AGFL, + haveleft, haveright); return 0; error0: - trace_xfs_free_extent(mp, agno, bno, len, isfl, -1, -1); + trace_xfs_free_extent(mp, agno, bno, len, type == XFS_AG_RESV_AGFL, + -1, -1); if (bno_cur) xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); if (cnt_cur) @@ -1950,21 +1965,43 @@ xfs_alloc_compute_maxlevels( } /* - * Find the length of the longest extent in an AG. + * Find the length of the longest extent in an AG. The 'need' parameter + * specifies how much space we're going to need for the AGFL and the + * 'reserved' parameter tells us how many blocks in this AG are reserved for + * other callers. */ xfs_extlen_t xfs_alloc_longest_free_extent( struct xfs_mount *mp, struct xfs_perag *pag, - xfs_extlen_t need) + xfs_extlen_t need, + xfs_extlen_t reserved) { xfs_extlen_t delta = 0; + /* + * If the AGFL needs a recharge, we'll have to subtract that from the + * longest extent. 
+ */ if (need > pag->pagf_flcount) delta = need - pag->pagf_flcount; + /* + * If we cannot maintain others' reservations with space from the + * not-longest freesp extents, we'll have to subtract /that/ from + * the longest extent too. + */ + if (pag->pagf_freeblks - pag->pagf_longest < reserved) + delta += reserved - (pag->pagf_freeblks - pag->pagf_longest); + + /* + * If the longest extent is long enough to satisfy all the + * reservations and AGFL rules in place, we can return this extent. + */ if (pag->pagf_longest > delta) return pag->pagf_longest - delta; + + /* Otherwise, let the caller try for 1 block if there's space. */ return pag->pagf_flcount > 0 || pag->pagf_longest > 0; } @@ -2004,20 +2041,24 @@ xfs_alloc_space_available( { struct xfs_perag *pag = args->pag; xfs_extlen_t longest; + xfs_extlen_t reservation; /* blocks that are still reserved */ int available; if (flags & XFS_ALLOC_FLAG_FREEING) return true; + reservation = xfs_ag_resv_needed(pag, args->resv); + /* do we have enough contiguous free space for the allocation? */ - longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free); + longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free, + reservation); if ((args->minlen + args->alignment + args->minalignslop - 1) > longest) return false; - /* do have enough free space remaining for the allocation? */ + /* do we have enough free space remaining for the allocation? 
*/ available = (int)(pag->pagf_freeblks + pag->pagf_flcount - - min_free - args->total); - if (available < (int)args->minleft) + reservation - min_free - args->total); + if (available < (int)args->minleft || available <= 0) return false; return true; @@ -2124,7 +2165,7 @@ xfs_alloc_fix_freelist( if (error) goto out_agbp_relse; error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, - &targs.oinfo, 1); + &targs.oinfo, XFS_AG_RESV_AGFL); if (error) goto out_agbp_relse; bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0); @@ -2135,7 +2176,7 @@ xfs_alloc_fix_freelist( targs.mp = mp; targs.agbp = agbp; targs.agno = args->agno; - targs.alignment = targs.minlen = targs.prod = targs.isfl = 1; + targs.alignment = targs.minlen = targs.prod = 1; targs.type = XFS_ALLOCTYPE_THIS_AG; targs.pag = pag; error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp); @@ -2146,6 +2187,7 @@ xfs_alloc_fix_freelist( while (pag->pagf_flcount < need) { targs.agbno = 0; targs.maxlen = need - pag->pagf_flcount; + targs.resv = XFS_AG_RESV_AGFL; /* Allocate as many blocks as possible at once. 
*/ error = xfs_alloc_ag_vextent(&targs); @@ -2825,7 +2867,8 @@ xfs_free_extent( struct xfs_trans *tp, /* transaction pointer */ xfs_fsblock_t bno, /* starting block number of extent */ xfs_extlen_t len, /* length of extent */ - struct xfs_owner_info *oinfo) /* extent owner */ + struct xfs_owner_info *oinfo, /* extent owner */ + enum xfs_ag_resv_type type) /* block reservation type */ { struct xfs_mount *mp = tp->t_mountp; struct xfs_buf *agbp; @@ -2834,6 +2877,7 @@ xfs_free_extent( int error; ASSERT(len != 0); + ASSERT(type != XFS_AG_RESV_AGFL); if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_FREE_EXTENT, @@ -2851,7 +2895,7 @@ xfs_free_extent( agbno + len <= be32_to_cpu(XFS_BUF_TO_AGF(agbp)->agf_length), err); - error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, 0); + error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, type); if (error) goto err; diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 6fe2d6b7cfe9..f7c520193239 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -87,10 +87,10 @@ typedef struct xfs_alloc_arg { xfs_alloctype_t otype; /* original allocation type */ char wasdel; /* set if allocation was prev delayed */ char wasfromfl; /* set if allocation is from freelist */ - char isfl; /* set if is freelist blocks - !acctg */ char userdata; /* mask defining userdata treatment */ xfs_fsblock_t firstblock; /* io first block allocated */ struct xfs_owner_info oinfo; /* owner of blocks being allocated */ + enum xfs_ag_resv_type resv; /* block reservation to use */ } xfs_alloc_arg_t; /* @@ -106,7 +106,8 @@ unsigned int xfs_alloc_set_aside(struct xfs_mount *mp); unsigned int xfs_alloc_ag_max_usable(struct xfs_mount *mp); xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_mount *mp, - struct xfs_perag *pag, xfs_extlen_t need); + struct xfs_perag *pag, xfs_extlen_t need, + xfs_extlen_t reserved); unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp, struct xfs_perag *pag); @@ -184,7 +185,8 @@ 
xfs_free_extent( struct xfs_trans *tp, /* transaction pointer */ xfs_fsblock_t bno, /* starting block number of extent */ xfs_extlen_t len, /* length of extent */ - struct xfs_owner_info *oinfo);/* extent owner */ + struct xfs_owner_info *oinfo, /* extent owner */ + enum xfs_ag_resv_type type); /* block reservation type */ int /* error */ xfs_alloc_lookup_ge( diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 614803bc8a9f..6fd458674e56 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -47,6 +47,7 @@ #include "xfs_attr_leaf.h" #include "xfs_filestream.h" #include "xfs_rmap.h" +#include "xfs_ag_resv.h" kmem_zone_t *xfs_bmap_free_item_zone; @@ -3501,7 +3502,8 @@ xfs_bmap_longest_free_extent( } longest = xfs_alloc_longest_free_extent(mp, pag, - xfs_alloc_min_freelist(mp, pag)); + xfs_alloc_min_freelist(mp, pag), + xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE)); if (*blen < longest) *blen = longest; @@ -3781,7 +3783,7 @@ xfs_bmap_btalloc( } args.minleft = ap->minleft; args.wasdel = ap->wasdel; - args.isfl = 0; + args.resv = XFS_AG_RESV_NONE; args.userdata = ap->userdata; if (ap->userdata & XFS_ALLOC_USERDATA_ZERO) args.ip = ap->ip; diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 08569792fe20..aa1752f918b8 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -2070,7 +2070,7 @@ __xfs_btree_updkeys( struct xfs_buf *bp0, bool force_all) { - union xfs_btree_bigkey key; /* keys from current level */ + union xfs_btree_key key; /* keys from current level */ union xfs_btree_key *lkey; /* keys from the next level up */ union xfs_btree_key *hkey; union xfs_btree_key *nlkey; /* keys from the next level up */ @@ -2086,7 +2086,7 @@ __xfs_btree_updkeys( trace_xfs_btree_updkeys(cur, level, bp0); - lkey = (union xfs_btree_key *)&key; + lkey = &key; hkey = xfs_btree_high_key_from_key(cur, lkey); xfs_btree_get_keys(cur, block, lkey); for (level++; level < cur->bc_nlevels; level++) { @@ -3226,7 +3226,7 @@ 
xfs_btree_insrec( struct xfs_buf *bp; /* buffer for block */ union xfs_btree_ptr nptr; /* new block ptr */ struct xfs_btree_cur *ncur; /* new btree cursor */ - union xfs_btree_bigkey nkey; /* new block key */ + union xfs_btree_key nkey; /* new block key */ union xfs_btree_key *lkey; int optr; /* old key/record index */ int ptr; /* key/record index */ @@ -3241,7 +3241,7 @@ xfs_btree_insrec( XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, &rec); ncur = NULL; - lkey = (union xfs_btree_key *)&nkey; + lkey = &nkey; /* * If we have an external root pointer, and we've made it to the @@ -3444,14 +3444,14 @@ xfs_btree_insert( union xfs_btree_ptr nptr; /* new block number (split result) */ struct xfs_btree_cur *ncur; /* new cursor (split result) */ struct xfs_btree_cur *pcur; /* previous level's cursor */ - union xfs_btree_bigkey bkey; /* key of block to insert */ + union xfs_btree_key bkey; /* key of block to insert */ union xfs_btree_key *key; union xfs_btree_rec rec; /* record to insert */ level = 0; ncur = NULL; pcur = cur; - key = (union xfs_btree_key *)&bkey; + key = &bkey; xfs_btree_set_ptr_null(cur, &nptr); @@ -4797,3 +4797,50 @@ xfs_btree_query_range( return xfs_btree_overlapped_query_range(cur, &low_key, &high_key, fn, priv); } + +/* + * Calculate the number of blocks needed to store a given number of records + * in a short-format (per-AG metadata) btree. + */ +xfs_extlen_t +xfs_btree_calc_size( + struct xfs_mount *mp, + uint *limits, + unsigned long long len) +{ + int level; + int maxrecs; + xfs_extlen_t rval; + + maxrecs = limits[0]; + for (level = 0, rval = 0; len > 1; level++) { + len += maxrecs - 1; + do_div(len, maxrecs); + maxrecs = limits[1]; + rval += len; + } + return rval; +} + +int +xfs_btree_count_blocks_helper( + struct xfs_btree_cur *cur, + int level, + void *data) +{ + xfs_extlen_t *blocks = data; + (*blocks)++; + + return 0; +} + +/* Count the blocks in a btree and return the result in *blocks. 
*/ +int +xfs_btree_count_blocks( + struct xfs_btree_cur *cur, + xfs_extlen_t *blocks) +{ + *blocks = 0; + return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper, + blocks); +} diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 04d0865e5e6d..3f8556a5c2ad 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -37,30 +37,18 @@ union xfs_btree_ptr { __be64 l; /* long form ptr */ }; -union xfs_btree_key { - struct xfs_bmbt_key bmbt; - xfs_bmdr_key_t bmbr; /* bmbt root block */ - xfs_alloc_key_t alloc; - struct xfs_inobt_key inobt; - struct xfs_rmap_key rmap; -}; - /* - * In-core key that holds both low and high keys for overlapped btrees. - * The two keys are packed next to each other on disk, so do the same - * in memory. Preserve the existing xfs_btree_key as a single key to - * avoid the mental model breakage that would happen if we passed a - * bigkey into a function that operates on a single key. + * The in-core btree key. Overlapping btrees actually store two keys + * per pointer, so we reserve enough memory to hold both. The __*bigkey + * items should never be accessed directly. 
*/ -union xfs_btree_bigkey { +union xfs_btree_key { struct xfs_bmbt_key bmbt; xfs_bmdr_key_t bmbr; /* bmbt root block */ xfs_alloc_key_t alloc; struct xfs_inobt_key inobt; - struct { - struct xfs_rmap_key rmap; - struct xfs_rmap_key rmap_hi; - }; + struct xfs_rmap_key rmap; + struct xfs_rmap_key __rmap_bigkey[2]; }; union xfs_btree_rec { @@ -513,6 +501,8 @@ bool xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp); bool xfs_btree_sblock_verify(struct xfs_buf *bp, unsigned int max_recs); uint xfs_btree_compute_maxlevels(struct xfs_mount *mp, uint *limits, unsigned long len); +xfs_extlen_t xfs_btree_calc_size(struct xfs_mount *mp, uint *limits, + unsigned long long len); /* return codes */ #define XFS_BTREE_QUERY_RANGE_CONTINUE 0 /* keep iterating */ @@ -529,4 +519,6 @@ typedef int (*xfs_btree_visit_blocks_fn)(struct xfs_btree_cur *cur, int level, int xfs_btree_visit_blocks(struct xfs_btree_cur *cur, xfs_btree_visit_blocks_fn fn, void *data); +int xfs_btree_count_blocks(struct xfs_btree_cur *cur, xfs_extlen_t *blocks); + #endif /* __XFS_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index c221d0ecd52e..613c5cf19436 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -81,6 +81,10 @@ * - For each work item attached to the log intent item, * * Perform the described action. * * Attach the work item to the log done item. + * * If the result of doing the work was -EAGAIN, ->finish work + * wants a new transaction. See the "Requesting a Fresh + * Transaction while Finishing Deferred Work" section below for + * details. * * The key here is that we must log an intent item for all pending * work items every time we roll the transaction, and that we must log @@ -88,6 +92,34 @@ * we can perform complex remapping operations, chaining intent items * as needed. 
* + Requesting a Fresh Transaction while Finishing Deferred Work + * + If ->finish_item decides that it needs a fresh transaction to + * finish the work, it must ask its caller (xfs_defer_finish) for a + * continuation. The most likely cause of this circumstance is the + * refcount adjust functions deciding that they've logged enough items + * to be at risk of exceeding the transaction reservation. + * + * To get a fresh transaction, we want to log the existing log done + * item to prevent the log intent item from replaying, immediately log + * a new log intent item with the unfinished work items, roll the + * transaction, and re-call ->finish_item wherever it left off. The + * log done item and the new log intent item must be in the same + * transaction or atomicity cannot be guaranteed; defer_finish ensures + * that this happens. + * + * This requires some coordination between ->finish_item and + * defer_finish. Upon deciding to request a new transaction, + * ->finish_item should update the current work item to reflect the + * unfinished work. Next, it should reset the log done item's list + * count to the number of items finished, and return -EAGAIN. + * defer_finish sees the -EAGAIN, logs the new log intent item + * with the remaining work items, and leaves the xfs_defer_pending + * item at the head of the dop_work queue. Then it rolls the + * transaction and picks up processing where it left off. It is + * required that ->finish_item be careful to leave enough + * transaction reservation to fit the new log intent item. 
+ * This is an example of remapping the extent (E, E+B) into file X at * offset A and dealing with the extent (C, C+B) already being mapped * there: @@ -104,21 +136,26 @@ * | Intent to add rmap (X, E, A, B) | * +-------------------------------------------------+ * | Reduce refcount for extent (C, B) | t2 - * | Done reducing refcount for extent (C, B) | + * | Done reducing refcount for extent (C, 9) | + * | Intent to reduce refcount for extent (C+9, B-9) | + * | (ran out of space after 9 refcount updates) | + * +-------------------------------------------------+ + * | Reduce refcount for extent (C+9, B-9) | t3 + * | Done reducing refcount for extent (C+9, B-9) | * | Increase refcount for extent (E, B) | * | Done increasing refcount for extent (E, B) | * | Intent to free extent (C, B) | * | Intent to free extent (F, 1) (refcountbt block) | * | Intent to remove rmap (F, 1, REFC) | * +-------------------------------------------------+ - * | Remove rmap (X, C, A, B) | t3 + * | Remove rmap (X, C, A, B) | t4 * | Done removing rmap (X, C, A, B) | * | Add rmap (X, E, A, B) | * | Done adding rmap (X, E, A, B) | * | Remove rmap (F, 1, REFC) | * | Done removing rmap (F, 1, REFC) | * +-------------------------------------------------+ - * | Free extent (C, B) | t4 + * | Free extent (C, B) | t5 * | Done freeing extent (C, B) | * | Free extent (D, 1) | * | Done freeing extent (D, 1) | @@ -141,6 +178,9 @@ * - Intent to free extent (C, B) * - Intent to free extent (F, 1) (refcountbt block) * - Intent to remove rmap (F, 1, REFC) + * + * Note that the continuation requested between t2 and t3 is likely to + * reoccur. */ static const struct xfs_defer_op_type *defer_op_types[XFS_DEFER_OPS_TYPE_MAX]; @@ -323,7 +363,16 @@ xfs_defer_finish( dfp->dfp_count--; error = dfp->dfp_type->finish_item(*tp, dop, li, dfp->dfp_done, &state); - if (error) { + if (error == -EAGAIN) { + /* + * Caller wants a fresh transaction; + * put the work item back on the list + * and jump out. 
+ */ + list_add(li, &dfp->dfp_work); + dfp->dfp_count++; + break; + } else if (error) { /* * Clean up after ourselves and jump out. * xfs_defer_cancel will take care of freeing @@ -335,9 +384,25 @@ xfs_defer_finish( goto out; } } - /* Done with the dfp, free it. */ - list_del(&dfp->dfp_list); - kmem_free(dfp); + if (error == -EAGAIN) { + /* + * Caller wants a fresh transaction, so log a + * new log intent item to replace the old one + * and roll the transaction. See "Requesting + * a Fresh Transaction while Finishing + * Deferred Work" above. + */ + dfp->dfp_intent = dfp->dfp_type->create_intent(*tp, + dfp->dfp_count); + dfp->dfp_done = NULL; + list_for_each(li, &dfp->dfp_work) + dfp->dfp_type->log_item(*tp, dfp->dfp_intent, + li); + } else { + /* Done with the dfp, free it. */ + list_del(&dfp->dfp_list); + kmem_free(dfp); + } if (cleanup_fn) cleanup_fn(*tp, state, error); diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 31ca2208c03d..eab68ae2e011 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -132,7 +132,7 @@ xfs_inobt_free_block( xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_INOBT); return xfs_free_extent(cur->bc_tp, XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp)), 1, - &oinfo); + &oinfo, XFS_AG_RESV_NONE); } STATIC int diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index a6eed43fa7cd..fc5eef85d61e 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -647,9 +647,17 @@ struct xfs_rui_log_format { __uint16_t rui_size; /* size of this item */ __uint32_t rui_nextents; /* # extents to free */ __uint64_t rui_id; /* rui identifier */ - struct xfs_map_extent rui_extents[1]; /* array of extents to rmap */ + struct xfs_map_extent rui_extents[]; /* array of extents to rmap */ }; +static inline size_t +xfs_rui_log_format_sizeof( + unsigned int nr) +{ + return sizeof(struct xfs_rui_log_format) + + nr * sizeof(struct xfs_map_extent); +} + /* * 
This is the structure used to lay out an rud log item in the * log. The rud_extents array is a variable size array whose diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index e455f9098d49..2975cb2319f4 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -865,7 +865,7 @@ xfs_buf_item_log_segment( */ if (bit) { end_bit = MIN(bit + bits_to_set, (uint)NBWORD); - mask = ((1 << (end_bit - bit)) - 1) << bit; + mask = ((1U << (end_bit - bit)) - 1) << bit; *wordp |= mask; wordp++; bits_set = end_bit - bit; @@ -888,7 +888,7 @@ xfs_buf_item_log_segment( */ end_bit = bits_to_set - bits_set; if (end_bit) { - mask = (1 << end_bit) - 1; + mask = (1U << end_bit) - 1; *wordp |= mask; } } @@ -1095,7 +1095,8 @@ xfs_buf_iodone_callback_error( bp->b_last_error != bp->b_error) { bp->b_flags |= (XBF_WRITE | XBF_DONE | XBF_WRITE_FAIL); bp->b_last_error = bp->b_error; - if (cfg->retry_timeout && !bp->b_first_retry_time) + if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER && + !bp->b_first_retry_time) bp->b_first_retry_time = jiffies; xfs_buf_ioerror(bp, 0); @@ -1111,7 +1112,7 @@ xfs_buf_iodone_callback_error( if (cfg->max_retries != XFS_ERR_RETRY_FOREVER && ++bp->b_retries > cfg->max_retries) goto permanent_error; - if (cfg->retry_timeout && + if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER && time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time)) goto permanent_error; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index e612a0233710..b927ea9abe33 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -269,6 +269,8 @@ xfs_file_dio_aio_read( return -EINVAL; } + file_accessed(iocb->ki_filp); + /* * Locking is a bit tricky here. 
If we take an exclusive lock for direct * IO, we effectively serialise all new concurrent read IO to this file @@ -323,7 +325,6 @@ xfs_file_dio_aio_read( } xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); - file_accessed(iocb->ki_filp); return ret; } diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index 4a33a3304369..c8005fdaaa8a 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -30,6 +30,7 @@ #include "xfs_mru_cache.h" #include "xfs_filestream.h" #include "xfs_trace.h" +#include "xfs_ag_resv.h" struct xfs_fstrm_item { struct xfs_mru_cache_elem mru; @@ -198,7 +199,8 @@ xfs_filestream_pick_ag( } longest = xfs_alloc_longest_free_extent(mp, pag, - xfs_alloc_min_freelist(mp, pag)); + xfs_alloc_min_freelist(mp, pag), + xfs_ag_resv_needed(pag, XFS_AG_RESV_NONE)); if (((minlen && longest >= minlen) || (!minlen && pag->pagf_freeblks >= minfree)) && (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) || diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 0b7f986745c1..94ac06f3d908 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -553,7 +553,7 @@ xfs_growfs_data_private( error = xfs_free_extent(tp, XFS_AGB_TO_FSB(mp, agno, be32_to_cpu(agf->agf_length) - new), - new, &oinfo); + new, &oinfo, XFS_AG_RESV_NONE); if (error) goto error0; } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index b36676cde103..041d9493e798 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -57,10 +57,16 @@ enum { #define XFS_ERR_RETRY_FOREVER -1 +/* + * Although retry_timeout is in jiffies which is normally an unsigned long, + * we limit the retry timeout to 86400 seconds, or one day. So even a + * signed 32-bit long is sufficient for a HZ value up to 24855. Making it + * signed lets us store the special "-1" value, meaning retry forever. 
+ */ struct xfs_error_cfg { struct xfs_kobj kobj; int max_retries; - unsigned long retry_timeout; /* in jiffies, 0 = no timeout */ + long retry_timeout; /* in jiffies, -1 = infinite */ }; typedef struct xfs_mount { @@ -325,6 +331,22 @@ xfs_mp_fail_writes(struct xfs_mount *mp) } #endif +/* per-AG block reservation data structures*/ +enum xfs_ag_resv_type { + XFS_AG_RESV_NONE = 0, + XFS_AG_RESV_METADATA, + XFS_AG_RESV_AGFL, +}; + +struct xfs_ag_resv { + /* number of blocks originally reserved here */ + xfs_extlen_t ar_orig_reserved; + /* number of blocks reserved here */ + xfs_extlen_t ar_reserved; + /* number of blocks originally asked for */ + xfs_extlen_t ar_asked; +}; + /* * Per-ag incore structure, copies of information in agf and agi, to improve the * performance of allocation group selection. @@ -372,8 +394,28 @@ typedef struct xfs_perag { /* for rcu-safe freeing */ struct rcu_head rcu_head; int pagb_count; /* pagb slots in use */ + + /* Blocks reserved for all kinds of metadata. */ + struct xfs_ag_resv pag_meta_resv; + /* Blocks reserved for just AGFL-based metadata. */ + struct xfs_ag_resv pag_agfl_resv; } xfs_perag_t; +static inline struct xfs_ag_resv * +xfs_perag_resv( + struct xfs_perag *pag, + enum xfs_ag_resv_type type) +{ + switch (type) { + case XFS_AG_RESV_METADATA: + return &pag->pag_meta_resv; + case XFS_AG_RESV_AGFL: + return &pag->pag_agfl_resv; + default: + return NULL; + } +} + extern void xfs_uuid_table_free(void); extern int xfs_log_sbcount(xfs_mount_t *); extern __uint64_t xfs_default_resblks(xfs_mount_t *mp); diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 2500f28689d5..0432a459871c 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -51,28 +51,16 @@ xfs_rui_item_free( kmem_zone_free(xfs_rui_zone, ruip); } -/* - * This returns the number of iovecs needed to log the given rui item. - * We only need 1 iovec for an rui item. It just logs the rui_log_format - * structure. 
- */ -static inline int -xfs_rui_item_sizeof( - struct xfs_rui_log_item *ruip) -{ - return sizeof(struct xfs_rui_log_format) + - (ruip->rui_format.rui_nextents - 1) * - sizeof(struct xfs_map_extent); -} - STATIC void xfs_rui_item_size( struct xfs_log_item *lip, int *nvecs, int *nbytes) { + struct xfs_rui_log_item *ruip = RUI_ITEM(lip); + *nvecs += 1; - *nbytes += xfs_rui_item_sizeof(RUI_ITEM(lip)); + *nbytes += xfs_rui_log_format_sizeof(ruip->rui_format.rui_nextents); } /* @@ -97,7 +85,7 @@ xfs_rui_item_format( ruip->rui_format.rui_size = 1; xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_RUI_FORMAT, &ruip->rui_format, - xfs_rui_item_sizeof(ruip)); + xfs_rui_log_format_sizeof(ruip->rui_format.rui_nextents)); } /* @@ -205,16 +193,12 @@ xfs_rui_init( { struct xfs_rui_log_item *ruip; - uint size; ASSERT(nextents > 0); - if (nextents > XFS_RUI_MAX_FAST_EXTENTS) { - size = (uint)(sizeof(struct xfs_rui_log_item) + - ((nextents - 1) * sizeof(struct xfs_map_extent))); - ruip = kmem_zalloc(size, KM_SLEEP); - } else { + if (nextents > XFS_RUI_MAX_FAST_EXTENTS) + ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), KM_SLEEP); + else ruip = kmem_zone_zalloc(xfs_rui_zone, KM_SLEEP); - } xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops); ruip->rui_format.rui_nextents = nextents; @@ -239,14 +223,12 @@ xfs_rui_copy_format( uint len; src_rui_fmt = buf->i_addr; - len = sizeof(struct xfs_rui_log_format) + - (src_rui_fmt->rui_nextents - 1) * - sizeof(struct xfs_map_extent); + len = xfs_rui_log_format_sizeof(src_rui_fmt->rui_nextents); if (buf->i_len != len) return -EFSCORRUPTED; - memcpy((char *)dst_rui_fmt, (char *)src_rui_fmt, len); + memcpy(dst_rui_fmt, src_rui_fmt, len); return 0; } diff --git a/fs/xfs/xfs_rmap_item.h b/fs/xfs/xfs_rmap_item.h index aefcc3a318a5..340c968e1f9c 100644 --- a/fs/xfs/xfs_rmap_item.h +++ b/fs/xfs/xfs_rmap_item.h @@ -70,6 +70,14 @@ struct xfs_rui_log_item { struct xfs_rui_log_format rui_format; }; +static inline size_t 
+xfs_rui_log_item_sizeof( + unsigned int nr) +{ + return offsetof(struct xfs_rui_log_item, rui_format) + + xfs_rui_log_format_sizeof(nr); +} + /* * This is the "rmap update done" log item. It is used to log the fact that * some rmapbt updates mentioned in an earlier rui item have been performed. diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index fd6be45b3a1e..340975392e91 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -1782,9 +1782,8 @@ xfs_init_zones(void) if (!xfs_rud_zone) goto out_destroy_icreate_zone; - xfs_rui_zone = kmem_zone_init((sizeof(struct xfs_rui_log_item) + - ((XFS_RUI_MAX_FAST_EXTENTS - 1) * - sizeof(struct xfs_map_extent))), + xfs_rui_zone = kmem_zone_init( + xfs_rui_log_item_sizeof(XFS_RUI_MAX_FAST_EXTENTS), "xfs_rui_item"); if (!xfs_rui_zone) goto out_destroy_rud_zone; diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index 79cfd3fc5324..5f8d55d29a11 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -393,9 +393,15 @@ max_retries_show( struct kobject *kobject, char *buf) { + int retries; struct xfs_error_cfg *cfg = to_error_cfg(kobject); - return snprintf(buf, PAGE_SIZE, "%d\n", cfg->max_retries); + if (cfg->retry_timeout == XFS_ERR_RETRY_FOREVER) + retries = -1; + else + retries = cfg->max_retries; + + return snprintf(buf, PAGE_SIZE, "%d\n", retries); } static ssize_t @@ -415,7 +421,10 @@ max_retries_store( if (val < -1) return -EINVAL; - cfg->max_retries = val; + if (val == -1) + cfg->retry_timeout = XFS_ERR_RETRY_FOREVER; + else + cfg->max_retries = val; return count; } XFS_SYSFS_ATTR_RW(max_retries); @@ -425,10 +434,15 @@ retry_timeout_seconds_show( struct kobject *kobject, char *buf) { + int timeout; struct xfs_error_cfg *cfg = to_error_cfg(kobject); - return snprintf(buf, PAGE_SIZE, "%ld\n", - jiffies_to_msecs(cfg->retry_timeout) / MSEC_PER_SEC); + if (cfg->retry_timeout == XFS_ERR_RETRY_FOREVER) + timeout = -1; + else + timeout = jiffies_to_msecs(cfg->retry_timeout) / MSEC_PER_SEC; + + return snprintf(buf, 
PAGE_SIZE, "%d\n", timeout); } static ssize_t @@ -445,11 +459,16 @@ retry_timeout_seconds_store( if (ret) return ret; - /* 1 day timeout maximum */ - if (val < 0 || val > 86400) + /* 1 day timeout maximum, -1 means infinite */ + if (val < -1 || val > 86400) return -EINVAL; - cfg->retry_timeout = msecs_to_jiffies(val * MSEC_PER_SEC); + if (val == -1) + cfg->retry_timeout = XFS_ERR_RETRY_FOREVER; + else { + cfg->retry_timeout = msecs_to_jiffies(val * MSEC_PER_SEC); + ASSERT(msecs_to_jiffies(val * MSEC_PER_SEC) < LONG_MAX); + } return count; } XFS_SYSFS_ATTR_RW(retry_timeout_seconds); @@ -519,18 +538,19 @@ struct xfs_error_init { static const struct xfs_error_init xfs_error_meta_init[XFS_ERR_ERRNO_MAX] = { { .name = "default", .max_retries = XFS_ERR_RETRY_FOREVER, - .retry_timeout = 0, + .retry_timeout = XFS_ERR_RETRY_FOREVER, }, { .name = "EIO", .max_retries = XFS_ERR_RETRY_FOREVER, - .retry_timeout = 0, + .retry_timeout = XFS_ERR_RETRY_FOREVER, }, { .name = "ENOSPC", .max_retries = XFS_ERR_RETRY_FOREVER, - .retry_timeout = 0, + .retry_timeout = XFS_ERR_RETRY_FOREVER, }, { .name = "ENODEV", - .max_retries = 0, + .max_retries = 0, /* We can't recover from devices disappearing */ + .retry_timeout = 0, }, }; @@ -561,7 +581,10 @@ xfs_error_sysfs_init_class( goto out_error; cfg->max_retries = init[i].max_retries; - cfg->retry_timeout = msecs_to_jiffies( + if (init[i].retry_timeout == XFS_ERR_RETRY_FOREVER) + cfg->retry_timeout = XFS_ERR_RETRY_FOREVER; + else + cfg->retry_timeout = msecs_to_jiffies( init[i].retry_timeout * MSEC_PER_SEC); } return 0; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index d303a665dba9..c2a875fcf26e 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1570,14 +1570,15 @@ TRACE_EVENT(xfs_agf, TRACE_EVENT(xfs_free_extent, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, - xfs_extlen_t len, bool isfl, int haveleft, int haveright), - TP_ARGS(mp, agno, agbno, len, isfl, haveleft, haveright), + xfs_extlen_t len, 
enum xfs_ag_resv_type resv, int haveleft, + int haveright), + TP_ARGS(mp, agno, agbno, len, resv, haveleft, haveright), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, agbno) __field(xfs_extlen_t, len) - __field(int, isfl) + __field(int, resv) __field(int, haveleft) __field(int, haveright) ), @@ -1586,16 +1587,16 @@ TRACE_EVENT(xfs_free_extent, __entry->agno = agno; __entry->agbno = agbno; __entry->len = len; - __entry->isfl = isfl; + __entry->resv = resv; __entry->haveleft = haveleft; __entry->haveright = haveright; ), - TP_printk("dev %d:%d agno %u agbno %u len %u isfl %d %s", + TP_printk("dev %d:%d agno %u agbno %u len %u resv %d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, __entry->len, - __entry->isfl, + __entry->resv, __entry->haveleft ? (__entry->haveright ? "both" : "left") : (__entry->haveright ? "right" : "none")) @@ -1622,7 +1623,7 @@ DECLARE_EVENT_CLASS(xfs_alloc_class, __field(short, otype) __field(char, wasdel) __field(char, wasfromfl) - __field(char, isfl) + __field(int, resv) __field(char, userdata) __field(xfs_fsblock_t, firstblock) ), @@ -1643,13 +1644,13 @@ DECLARE_EVENT_CLASS(xfs_alloc_class, __entry->otype = args->otype; __entry->wasdel = args->wasdel; __entry->wasfromfl = args->wasfromfl; - __entry->isfl = args->isfl; + __entry->resv = args->resv; __entry->userdata = args->userdata; __entry->firstblock = args->firstblock; ), TP_printk("dev %d:%d agno %u agbno %u minlen %u maxlen %u mod %u " "prod %u minleft %u total %u alignment %u minalignslop %u " - "len %u type %s otype %s wasdel %d wasfromfl %d isfl %d " + "len %u type %s otype %s wasdel %d wasfromfl %d resv %d " "userdata %d firstblock 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, @@ -1667,7 +1668,7 @@ DECLARE_EVENT_CLASS(xfs_alloc_class, __print_symbolic(__entry->otype, XFS_ALLOC_TYPES), __entry->wasdel, __entry->wasfromfl, - __entry->isfl, + __entry->resv, __entry->userdata, (unsigned long 
long)__entry->firstblock) ) @@ -2558,6 +2559,60 @@ DEFINE_RMAPBT_EVENT(xfs_rmap_lookup_le_range_result); DEFINE_RMAPBT_EVENT(xfs_rmap_find_right_neighbor_result); DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_result); +/* per-AG reservation */ +DECLARE_EVENT_CLASS(xfs_ag_resv_class, + TP_PROTO(struct xfs_perag *pag, enum xfs_ag_resv_type resv, + xfs_extlen_t len), + TP_ARGS(pag, resv, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(int, resv) + __field(xfs_extlen_t, freeblks) + __field(xfs_extlen_t, flcount) + __field(xfs_extlen_t, reserved) + __field(xfs_extlen_t, asked) + __field(xfs_extlen_t, len) + ), + TP_fast_assign( + struct xfs_ag_resv *r = xfs_perag_resv(pag, resv); + + __entry->dev = pag->pag_mount->m_super->s_dev; + __entry->agno = pag->pag_agno; + __entry->resv = resv; + __entry->freeblks = pag->pagf_freeblks; + __entry->flcount = pag->pagf_flcount; + __entry->reserved = r ? r->ar_reserved : 0; + __entry->asked = r ? r->ar_asked : 0; + __entry->len = len; + ), + TP_printk("dev %d:%d agno %u resv %d freeblks %u flcount %u resv %u ask %u len %u\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->resv, + __entry->freeblks, + __entry->flcount, + __entry->reserved, + __entry->asked, + __entry->len) +) +#define DEFINE_AG_RESV_EVENT(name) \ +DEFINE_EVENT(xfs_ag_resv_class, name, \ + TP_PROTO(struct xfs_perag *pag, enum xfs_ag_resv_type type, \ + xfs_extlen_t len), \ + TP_ARGS(pag, type, len)) + +/* per-AG reservation tracepoints */ +DEFINE_AG_RESV_EVENT(xfs_ag_resv_init); +DEFINE_AG_RESV_EVENT(xfs_ag_resv_free); +DEFINE_AG_RESV_EVENT(xfs_ag_resv_alloc_extent); +DEFINE_AG_RESV_EVENT(xfs_ag_resv_free_extent); +DEFINE_AG_RESV_EVENT(xfs_ag_resv_critical); +DEFINE_AG_RESV_EVENT(xfs_ag_resv_needed); + +DEFINE_AG_ERROR_EVENT(xfs_ag_resv_free_error); +DEFINE_AG_ERROR_EVENT(xfs_ag_resv_init_error); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_trans.c 
b/fs/xfs/xfs_trans.c index 5f3d33d16e67..70f42ea86dfb 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -217,7 +217,7 @@ undo_log: undo_blocks: if (blocks > 0) { - xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd); + xfs_mod_fdblocks(tp->t_mountp, (int64_t)blocks, rsvd); tp->t_blk_res = 0; } @@ -318,7 +318,6 @@ xfs_trans_mod_sb( * in-core superblock's counter. This should only * be applied to the on-disk superblock. */ - ASSERT(delta < 0); tp->t_res_fdblocks_delta += delta; if (xfs_sb_version_haslazysbcount(&mp->m_sb)) flags &= ~XFS_TRANS_SB_DIRTY; diff --git a/fs/xfs/xfs_trans_extfree.c b/fs/xfs/xfs_trans_extfree.c index 459ddec137a4..ab438647592a 100644 --- a/fs/xfs/xfs_trans_extfree.c +++ b/fs/xfs/xfs_trans_extfree.c @@ -79,7 +79,8 @@ xfs_trans_free_extent( trace_xfs_bmap_free_deferred(tp->t_mountp, agno, 0, agbno, ext_len); - error = xfs_free_extent(tp, start_block, ext_len, oinfo); + error = xfs_free_extent(tp, start_block, ext_len, oinfo, + XFS_AG_RESV_NONE); /* * Mark the transaction dirty, even on error. This ensures the diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index ea62245fee26..62900938f26d 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -147,6 +147,7 @@ __xfs_xattr_put_listent( arraytop = context->count + prefix_len + namelen + 1; if (arraytop > context->firstu) { context->count = -1; /* insufficient space */ + context->seen_enough = 1; return 0; } offset = (char *)context->alist + context->count; |